电信运营商客户流失风险分析
一、选题背景:
关于用户留存有这样一个观点,如果将用户流失率降低5%,公司利润将提升25%-85%。如今高居不下的获客成本让电信运营商遭遇“天花板”,甚至陷入获客难的窘境。随着市场饱和度上升,电信运营商亟待解决增加用户黏性,延长用户生命周期的问题。因此,电信用户流失分析与预测至关重要。
数据集来自kesci中的“电信运营商客户数据集”
二、理解数据:
根据介绍,该数据集有21个字段,共7043条记录。每条记录包含了唯一客户的特征。我们目标就是发现前20列特征和最后一列客户是否流失特征之间的关系。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the telco customer churn dataset from a local CSV file.
# NOTE(review): hard-coded absolute path — consider a relative path or CLI argument.
csv_data = r'C:/Users/wade z shao/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_data)
print(data)
数据共有7043行,主要包含用户id、套餐及使用情况,以及简单的个人信息。
“TotalCharges”(总消费额)存在数据问题,采用强制转换,将“TotalCharges”(总消费额)转换为浮点型数据。
# Inspect 'TotalCharges': it is read as object dtype because some rows
# contain blank strings instead of numbers.
print(data['TotalCharges'])

# Force-convert to float; blanks become NaN. Neither astype(float)
# (raises on blanks) nor Series.convert_objects (removed from pandas)
# works — pd.to_numeric with errors='coerce' is the supported way.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
尝试直接用 data['TotalCharges'].astype(float) 转换会报错(列中存在空字符串,不可行)。改用 customerDF['TotalCharges'] = customerDF['TotalCharges'].convert_objects(convert_numeric=True) 时同样报错:AttributeError: 'Series' object has no attribute 'convert_objects'(该方法已在新版 pandas 中移除)。正确做法是使用 pd.to_numeric(data['TotalCharges'], errors='coerce') 进行强制转换,空值会变为 NaN。
查看数据的描述统计信息,根据一般经验,所有数据正常。
# Descriptive statistics for the numeric columns.
# print() is required here: outside a notebook a bare expression shows nothing.
print(data.describe())
可视化分析
# Churn ratio pie chart. Label each wedge and show percentages so the
# figure is readable — the unlabeled version conveyed nothing.
churn_counts = data['Churn'].value_counts()
plt.pie(churn_counts,
        labels=churn_counts.index,
        autopct='%1.2f%%',
        explode=(0.1, 0))
plt.title('Churn(Yes/No) Ratio')
plt.show()
用户属性分析
def barplot_percentage(feature, orient='v', axis_name="percentage of customers"):
    """Bar plot of the churn share of all customers, grouped by `feature`.

    The original version only printed the grouped ratios and showed an
    empty figure; this actually draws the bar plot it is named after.
    """
    ratios = (data.groupby(feature)["Churn"]
                  .value_counts()
                  .div(len(data))          # fraction of ALL customers
                  .rename(axis_name)
                  .reset_index())
    print(ratios)
    sns.barplot(x=feature, y=axis_name, hue='Churn', data=ratios)
    plt.show()

barplot_percentage("SeniorCitizen")
barplot_percentage('gender')
# Churn share broken down by Partner / Dependents.
# The original showed an empty figure (the seaborn calls were commented
# out with placeholder column names); reset_index makes the grouped
# ratios plottable columns.
fig, axis = plt.subplots(1, 2, figsize=(12, 4))

gp_partner = (data.groupby('Partner')["Churn"].value_counts() / len(data)) \
    .rename('ratio').reset_index()
sns.barplot(x='Partner', y='ratio', hue='Churn', data=gp_partner, ax=axis[0])

gp_dependents = (data.groupby('Dependents')["Churn"].value_counts() / len(data)) \
    .rename('ratio').reset_index()
sns.barplot(x='Dependents', y='ratio', hue='Churn', data=gp_dependents, ax=axis[1])

plt.show()
import seaborn

# Set the global font size BEFORE drawing so both axes are affected
# (the original updated rcParams after the first subplot was drawn).
plt.rcParams.update({'font.size': 20})

fig, axis = plt.subplots(1, 2, figsize=(12, 4))
axis[0].set_title("Has Partner")
axis[1].set_title("Has Dependents")
axis_y = "percentage of customers"

def _churn_ratio(col):
    # Share of all customers per (col, Churn) combination; rename the
    # counts column so it does not collide with the 'Churn' index level.
    g = (data.groupby(col)["Churn"].value_counts() / len(data)).to_frame()
    g.rename(columns={"Churn": axis_y}, inplace=True)
    g.reset_index(inplace=True)
    return g

# Plot Partner column
ax1 = seaborn.barplot(x='Partner', y=axis_y, hue='Churn',
                      data=_churn_ratio('Partner'), ax=axis[0])
ax1.legend(fontsize=10)

# Plot Dependents column
ax2 = seaborn.barplot(x='Dependents', y=axis_y, hue='Churn',
                      data=_churn_ratio('Dependents'), ax=axis[1])
ax2.legend(fontsize=10)

plt.show()
# Kernel density estimation
def kdeplot(feature, xlabel):
    """Overlay KDE curves of `feature` for churned vs retained customers."""
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {0}".format(feature))
    # shade must be a bool; the original passed the string 'True'
    # (truthy, so it happened to work, but only by accident).
    seaborn.kdeplot(data[data['Churn'] == 'No'][feature].dropna(),
                    color='navy', label='Churn: No', shade=True)
    seaborn.kdeplot(data[data['Churn'] == 'Yes'][feature].dropna(),
                    color='orange', label='Churn: Yes', shade=True)
    plt.xlabel(xlabel)
    plt.rcParams.update({'font.size': 20})
    plt.legend(fontsize=10)

kdeplot('tenure', 'tenure')
plt.show()
用户流失预测
数据清洗
# Keep customerID aside (a unique identifier, useless as a model feature),
# then remove it from the working frame.
customer_id = data['customerID']
print(customer_id.head())
data.drop(['customerID'], axis=1, inplace=True)
观察数据类型可知,除了“tenure”、“MonthlyCharges”、“TotalCharges”是连续特征外,其它都是离散特征。
# Categorical columns: every object-dtype column, plus SeniorCitizen
# (stored as 0/1 int but semantically categorical).
cateCols = [c for c in data.columns if data[c].dtype == 'object' or c == 'SeniorCitizen']
dfCate = data[cateCols].copy()
# print() is needed: a bare head(3) displays nothing outside a notebook.
print(dfCate.head(3))
# Feature encoding: label-encode binary columns, one-hot encode the rest.
for col in cateCols:
    if dfCate[col].nunique() == 2:
        dfCate[col] = pd.factorize(dfCate[col])[0]
    else:
        dfCate = pd.get_dummies(dfCate, columns=[col])

# Re-attach the continuous features. Assign Series (data['x']) rather than
# single-column DataFrames (data[['x']]). TotalCharges is coerced to float
# here because it is read from the CSV as object dtype (blank strings);
# without this the later corr() call cannot use it.
dfCate['tenure'] = data['tenure']
dfCate['MonthlyCharges'] = data['MonthlyCharges']
dfCate['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Correlation of every encoded feature with Churn, strongest first.
plt.figure(figsize=(16, 8))
churn_corr = dfCate.corr()['Churn'].sort_values(ascending=False)
churn_corr.plot(kind='bar')
plt.show()
# Feature selection: drop columns judged uninformative.
# NOTE(review): presumably chosen from the correlation plot above — the
# '*_No internet service' dummies duplicate the InternetService column.
dropFea = [
    'gender', 'PhoneService',
    'OnlineSecurity_No internet service', 'OnlineBackup_No internet service',
    'DeviceProtection_No internet service', 'TechSupport_No internet service',
    'StreamingTV_No internet service', 'StreamingMovies_No internet service',
]
dfCate.drop(dropFea, inplace=True, axis=1)

# Label vector, and the full column list (features plus the Churn label).
target = dfCate['Churn'].values
columns = dfCate.columns.tolist()
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the Churn label.
columns.remove('Churn')
features = dfCate[columns].values

# 70/30 split. random_state=1 makes the split reproducible; stratify=target
# keeps the churn class ratio identical in both sets.
train_x, test_x, train_y, test_y = train_test_split(
    features, target, test_size=0.30, stratify=target, random_state=1)
构建模型
# These estimators were used without ever being imported — add the imports.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers.
classifiers = [
    SVC(random_state=1, kernel='rbf'),
    DecisionTreeClassifier(random_state=1, criterion='gini'),
    RandomForestClassifier(random_state=1, criterion='gini'),
    KNeighborsClassifier(metric='minkowski'),
    AdaBoostClassifier(random_state=1),
]

# Pipeline step names — must match the prefixes used in the param grids.
classifier_names = [
    'svc',
    'decisiontreeclassifier',
    'randomforestclassifier',
    'kneighborsclassifier',
    'adaboostclassifier',
]

# GridSearchCV expects keys of the form "<step name>__<param name>".
classifier_param_grid = [
    {'svc__C': [0.1], 'svc__gamma': [0.01]},
    {'decisiontreeclassifier__max_depth': [6, 9, 11]},
    {'randomforestclassifier__n_estimators': range(1, 11)},
    {'kneighborsclassifier__n_neighbors': [4, 6, 8]},
    {'adaboostclassifier__n_estimators': [70, 80, 90]},
]
模型参数调优和评估
对各分类器进行参数调优和评估,最终得到使用 AdaBoostClassifier(n_estimators=80) 的效果最好。
# GridSearchCV / Pipeline / accuracy_score were used without being imported.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, score='accuracy'):
    """Tune `pipeline` with GridSearchCV and report the test-set accuracy.

    Returns a dict with the test predictions and their accuracy.
    Note: the old default score='accuracy_score' is not a valid sklearn
    scoring identifier and would raise; 'accuracy' is the correct name.
    """
    response = {}
    gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring=score)
    # Fit finds the best params / best CV score and refits on the full set.
    search = gridsearch.fit(train_x, train_y)
    print("GridSearch 最优参数:", search.best_params_)
    print("GridSearch 最优分数: %0.4lf" % search.best_score_)
    # Predict on the held-out test set using the refitted best estimator.
    predict_y = gridsearch.predict(test_x)
    print(" 准确率 %0.4lf" % accuracy_score(test_y, predict_y))
    response['predict_y'] = predict_y
    response['accuracy_score'] = accuracy_score(test_y, predict_y)
    return response

for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
    # StandardScaler / PCA steps were left commented out in the original;
    # the pipeline holds only the estimator itself.
    pipeline = Pipeline([
        (model_name, model)
    ])
    result = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y,
                               model_param_grid, score='accuracy')
代码汇总
"""Consolidated script: telco customer churn EDA + churn prediction.

Folds in every fix applied to the individual cells above: TotalCharges
coercion, labeled pie chart, working bar plots, boolean shade flags,
missing sklearn imports, and the valid 'accuracy' scoring name.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

# ---------- Load ----------
csv_data = r'C:/Users/wade z shao/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_data)
print(data)

# 'TotalCharges' is read as object dtype (blank strings); coerce to float,
# blanks become NaN. astype(float) / convert_objects both fail here.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
print(data['TotalCharges'])
print(data.describe())

# ---------- EDA ----------
churn_counts = data['Churn'].value_counts()
plt.pie(churn_counts, labels=churn_counts.index, autopct='%1.2f%%', explode=(0.1, 0))
plt.title('Churn(Yes/No) Ratio')
plt.show()

axis_y = "percentage of customers"

def churn_ratio(feature):
    """Share of all customers per (feature, Churn) combination, flat frame."""
    g = (data.groupby(feature)["Churn"].value_counts() / len(data)).to_frame()
    g.rename(columns={"Churn": axis_y}, inplace=True)
    g.reset_index(inplace=True)
    return g

def barplot_percentage(feature):
    """Bar plot of churn share grouped by `feature` (also prints the table)."""
    ratios = churn_ratio(feature)
    print(ratios)
    sns.barplot(x=feature, y=axis_y, hue='Churn', data=ratios)
    plt.show()

barplot_percentage("SeniorCitizen")
barplot_percentage('gender')

# Set font size before drawing so both axes are affected.
plt.rcParams.update({'font.size': 20})
fig, axis = plt.subplots(1, 2, figsize=(12, 4))
axis[0].set_title("Has Partner")
axis[1].set_title("Has Dependents")
ax1 = sns.barplot(x='Partner', y=axis_y, hue='Churn',
                  data=churn_ratio('Partner'), ax=axis[0])
ax1.legend(fontsize=10)
ax2 = sns.barplot(x='Dependents', y=axis_y, hue='Churn',
                  data=churn_ratio('Dependents'), ax=axis[1])
ax2.legend(fontsize=10)
plt.show()

# Kernel density estimation
def kdeplot(feature, xlabel):
    """Overlay KDE curves of `feature` for churned vs retained customers."""
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {0}".format(feature))
    # shade must be a bool, not the string 'True'.
    sns.kdeplot(data[data['Churn'] == 'No'][feature].dropna(),
                color='navy', label='Churn: No', shade=True)
    sns.kdeplot(data[data['Churn'] == 'Yes'][feature].dropna(),
                color='orange', label='Churn: Yes', shade=True)
    plt.xlabel(xlabel)
    plt.legend(fontsize=10)

kdeplot('tenure', 'tenure')
plt.show()

# ---------- Cleaning & encoding ----------
customer_id = data['customerID']          # unique id, useless as a feature
print(customer_id.head())
data.drop(['customerID'], axis=1, inplace=True)

# Categorical columns: object dtype plus SeniorCitizen (0/1 int).
cateCols = [c for c in data.columns if data[c].dtype == 'object' or c == 'SeniorCitizen']
dfCate = data[cateCols].copy()
print(dfCate.head(3))

# Label-encode binary columns, one-hot encode the rest.
for col in cateCols:
    if dfCate[col].nunique() == 2:
        dfCate[col] = pd.factorize(dfCate[col])[0]
    else:
        dfCate = pd.get_dummies(dfCate, columns=[col])

# Re-attach the continuous features (already numeric after the coercion).
dfCate['tenure'] = data['tenure']
dfCate['MonthlyCharges'] = data['MonthlyCharges']
dfCate['TotalCharges'] = data['TotalCharges']

# Correlation of every feature with Churn, strongest first.
plt.figure(figsize=(16, 8))
dfCate.corr()['Churn'].sort_values(ascending=False).plot(kind='bar')
plt.show()

# Feature selection: drop uninformative / redundant columns.
dropFea = [
    'gender', 'PhoneService',
    'OnlineSecurity_No internet service', 'OnlineBackup_No internet service',
    'DeviceProtection_No internet service', 'TechSupport_No internet service',
    'StreamingTV_No internet service', 'StreamingMovies_No internet service',
]
dfCate.drop(dropFea, inplace=True, axis=1)

# Label vector and feature matrix.
target = dfCate['Churn'].values
columns = dfCate.columns.tolist()
columns.remove('Churn')
features = dfCate[columns].values

# 70/30 split, reproducible and stratified on the churn label.
train_x, test_x, train_y, test_y = train_test_split(
    features, target, test_size=0.30, stratify=target, random_state=1)

# ---------- Models ----------
classifiers = [
    SVC(random_state=1, kernel='rbf'),
    DecisionTreeClassifier(random_state=1, criterion='gini'),
    RandomForestClassifier(random_state=1, criterion='gini'),
    KNeighborsClassifier(metric='minkowski'),
    AdaBoostClassifier(random_state=1),
]
classifier_names = [
    'svc',
    'decisiontreeclassifier',
    'randomforestclassifier',
    'kneighborsclassifier',
    'adaboostclassifier',
]
# GridSearchCV expects keys of the form "<step name>__<param name>".
classifier_param_grid = [
    {'svc__C': [0.1], 'svc__gamma': [0.01]},
    {'decisiontreeclassifier__max_depth': [6, 9, 11]},
    {'randomforestclassifier__n_estimators': range(1, 11)},
    {'kneighborsclassifier__n_neighbors': [4, 6, 8]},
    {'adaboostclassifier__n_estimators': [70, 80, 90]},
]

def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, score='accuracy'):
    """Tune `pipeline` with GridSearchCV and report the test-set accuracy.

    Note: 'accuracy' (not 'accuracy_score') is the valid sklearn scorer name.
    """
    response = {}
    gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring=score)
    search = gridsearch.fit(train_x, train_y)
    print("GridSearch 最优参数:", search.best_params_)
    print("GridSearch 最优分数: %0.4lf" % search.best_score_)
    predict_y = gridsearch.predict(test_x)
    print(" 准确率 %0.4lf" % accuracy_score(test_y, predict_y))
    response['predict_y'] = predict_y
    response['accuracy_score'] = accuracy_score(test_y, predict_y)
    return response

for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
    pipeline = Pipeline([
        (model_name, model)
    ])
    result = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y,
                               model_param_grid, score='accuracy')
三、总结
用户属性:老年用户,未婚用户,无亲属用户更容易流失;
服务属性:在网时长小于半年,有电话服务,光纤用户/光纤用户附加流媒体电视、电影服务,无互联网增值服务;
合同属性:签订的合同期较短,采用电子支票支付,是电子账单,月租费约70-110元的客户容易流失;
其它属性对用户流失影响较小,以上特征保持独立。