一、选题背景:

       关于用户留存有这样一个观点,如果将用户流失率降低5%,公司利润将提升25%-85%。如今高居不下的获客成本让电信运营商遭遇“天花板”,甚至陷入获客难的窘境。随着市场饱和度上升,电信运营商亟待解决增加用户黏性,延长用户生命周期的问题。因此,电信用户流失分析与预测至关重要。
数据集来自kesci中的“电信运营商客户数据集”

二、理解数据:

        根据介绍,该数据集有21个字段,共7043条记录。每条记录包含了唯一客户的特征。我们目标就是发现前20列特征和最后一列客户是否流失特征之间的关系。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Telco customer churn dataset (21 columns, 7043 rows).
# csv_data = 'https://github.com/aprial/growth-workshop/blob/master/data/churn.csv'
csv_data = 'C:/Users/wade z shao/Downloads/' + 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_data)
print(data)

 

 

 

 

数据共有7043行,主要包含用户id、套餐及使用情况,以及简单的个人信息。

 

“TotalCharges”(总消费额)存在数据问题,采用强制转换,将“TotalCharges”(总消费额)转换为浮点型数据。

1 print(data[\'TotalCharges\'])

 

 

 

尝试直接转换 data['TotalCharges'].astype(float),但因该列存在非数值内容而报错,不可行。

 

第一遍执行 customerDF['TotalCharges']=customerDF['TotalCharges'].convert_objects(convert_numeric=True) 的时候报错:

AttributeError: 'Series' object has no attribute 'convert_objects'

(convert_objects 已在新版 pandas 中移除,应改用 pd.to_numeric(customerDF['TotalCharges'], errors='coerce') 实现同样的强制转换。)

 

查看数据的描述统计信息,根据一般经验,所有数据正常。

# Descriptive statistics for the numeric columns.
data.describe()

 

可视化分析

# Pie chart of churn vs. non-churn counts; explode pulls out the first slice.
# plt.rcParams['figure.figsize'] = 6, 6
plt.pie(data['Churn'].value_counts(), explode=(0.1, 0))
# plt.pie(data['Churn'].value_counts(), labels=data['Churn'].value_counts().index, autopct='%1.2f%%', explode=(0.1, 0))
# plt.title('Churn(Yes/No) Ratio')
plt.show()

 

 

 

用户属性分析

churn_data = data['Churn'].value_counts().to_frame()

def barplot_percentage(feature, orient='v', axis_name="percentage of customers"):
    """Print the churn ratio per value of `feature`.

    NOTE(review): despite the name, no bar plot is drawn here — `ratios`,
    `orient` and `axis_name` are unused; kept so the interface is unchanged.
    """
    ratios = pd.DataFrame()
    g = (data.groupby(feature)["Churn"].value_counts() / len(data)).to_frame()
    print(g)
    plt.show()

barplot_percentage("SeniorCitizen")
barplot_percentage('gender')

 

 

 

fig, axis = plt.subplots(1, 2, figsize=(12, 4))

# Churn ratio grouped by Partner; the bar plot itself is commented out.
gp_partner = (data.groupby('Partner')["Churn"].value_counts() / len(data)).to_frame()
# axis1 = seaborn.barplot(x='1', y='1', data=gp_partner)

# Same for Dependents.
gp_dependents = (data.groupby('Dependents')["Churn"].value_counts() / len(data)).to_frame()
# axis2 = seaborn.barplot(x='2', y='2', data=gp_dependents)

plt.show()

 

 

 

 1 import seaborn
 2 fig, axis = plt.subplots(1, 2, figsize=(12,4))
 3 axis[0].set_title("Has Partner")
 4 axis[1].set_title("Has Dependents")
 5 axis_y = "percentage of customers"
 6 
 7 # Plot Partner column
 8 gp_partner = (data.groupby(\'Partner\')["Churn"].value_counts()/len(data)).to_frame()
 9 gp_partner.rename(columns={"Churn": axis_y}, inplace=True)
10 gp_partner.reset_index(inplace=True)
11 ax1 = seaborn.barplot(x=\'Partner\', y= axis_y, hue=\'Churn\', data=gp_partner, ax=axis[0])
12 ax1.legend(fontsize=10)
13 #ax1.set_xlabel(\'伴侣\')
14 
15 
16 # Plot Dependents column
17 gp_dep = (data.groupby(\'Dependents\')["Churn"].value_counts()/len(data)).to_frame()
18 #print(gp_dep)
19 gp_dep.rename(columns={"Churn": axis_y} , inplace=True)
20 #print(gp_dep)
21 gp_dep.reset_index(inplace=True)
22 #print(gp_dep)
23 
24 ax2 = seaborn.barplot(x=\'Dependents\', y= axis_y, hue=\'Churn\', data=gp_dep, ax=axis[1])
25 #ax2.set_xlabel(\'家属\')
26 
27 
28 #设置字体大小
29 plt.rcParams.update({\'font.size\': 20})
30 ax2.legend(fontsize=10)
31 
32 #设置
33 plt.show()

 

 

 

 

 

 1 # Kernel density estimaton核密度估计
 2 def kdeplot(feature,xlabel):
 3     plt.figure(figsize=(9, 4))
 4     plt.title("KDE for {0}".format(feature))
 5     ax0 = seaborn.kdeplot(data[data[\'Churn\'] == \'No\'][feature].dropna(), color= \'navy\', label= \'Churn: No\', shade=\'True\')
 6     ax1 = seaborn.kdeplot(data[data[\'Churn\'] == \'Yes\'][feature].dropna(), color= \'orange\', label= \'Churn: Yes\',shade=\'True\')
 7     plt.xlabel(xlabel)
 8     #设置字体大小
 9     plt.rcParams.update({\'font.size\': 20})
10     plt.legend(fontsize=10)
11 kdeplot(\'tenure\',\'tenure\')
12 plt.show()

 

用户流失预测

数据清洗

# Keep the customer ids aside before the column is dropped from `data`.
customer_id = data['customerID']
print(customer_id.head())

 

 

 

 

1 data.drop([\'customerID\'],axis=1,inplace=True)

 

观察数据类型,发现大多除了“tenure”、“MonthlyCharges”、“TotalCharges”是连续特征,其它都是离散特征。

# Categorical columns: all object-dtype columns plus SeniorCitizen
# (stored as 0/1 int but categorical in meaning).
cateCols = [c for c in data.columns if data[c].dtype == 'object' or c == 'SeniorCitizen']
dfCate = data[cateCols].copy()
dfCate.head(3)

 

 

 

 

# Feature encoding: label-encode binary columns, one-hot encode the rest.
for col in cateCols:
    if dfCate[col].nunique() == 2:
        dfCate[col] = pd.factorize(dfCate[col])[0]
    else:
        dfCate = pd.get_dummies(dfCate, columns=[col])
# Add the continuous features back (plain Series assignment, not data[['x']]).
dfCate['tenure'] = data['tenure']
dfCate['MonthlyCharges'] = data['MonthlyCharges']
dfCate['TotalCharges'] = data['TotalCharges']
# Correlation of every encoded feature with the Churn label, as a bar chart.
plt.figure(figsize=(16, 8))
dfCate.corr()['Churn'].sort_values(ascending=False).plot(kind='bar')
plt.show()
 1 # 特征选择
 2 dropFea = [\'gender\',\'PhoneService\',
 3            \'OnlineSecurity_No internet service\', \'OnlineBackup_No internet service\',
 4            \'DeviceProtection_No internet service\', \'TechSupport_No internet service\',
 5            \'StreamingTV_No internet service\', \'StreamingMovies_No internet service\',
 6            #\'OnlineSecurity_No\', \'OnlineBackup_No\',
 7            #\'DeviceProtection_No\',\'TechSupport_No\',
 8            #\'StreamingTV_No\', \'StreamingMovies_No\',
 9            ]
10 dfCate.drop(dropFea, inplace=True, axis =1) 
11 #最后一列是作为标识
12 target = dfCate[\'Churn\'].values
13 #列表:特征和1个标识
14 columns = dfCate.columns.tolist()
 1 # 构造训练数据集和测试数据集。
 2 # 列表:特征
 3 columns.remove(\'Churn\')
 4 # 含有特征的DataFrame
 5 features = dfCate[columns].values
 6 # 30% 作为测试集,其余作为训练集
 7 # random_state = 1表示重复试验随机得到的数据集始终不变
 8 # stratify = target 表示按标识的类别,作为训练数据集、测试数据集内部的分配比例
 9 from sklearn.model_selection import train_test_split
10 train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.30, stratify = target, random_state = 1)

构建模型

 1 # 构造各种分类器
 2 classifiers = [
 3     SVC(random_state = 1, kernel = \'rbf\'),    
 4     DecisionTreeClassifier(random_state = 1, criterion = \'gini\'),
 5     RandomForestClassifier(random_state = 1, criterion = \'gini\'),
 6     KNeighborsClassifier(metric = \'minkowski\'),
 7     AdaBoostClassifier(random_state = 1),   
 8 ]
 9 # 分类器名称
10 classifier_names = [
11             \'svc\', 
12             \'decisiontreeclassifier\',
13             \'randomforestclassifier\',
14             \'kneighborsclassifier\',
15             \'adaboostclassifier\',
16 ]
17 # 分类器参数
18 #注意分类器的参数,字典键的格式,GridSearchCV对调优的参数格式是"分类器名"+"__"+"参数名"
19 classifier_param_grid = [
20             {\'svc__C\':[0.1], \'svc__gamma\':[0.01]},
21             {\'decisiontreeclassifier__max_depth\':[6,9,11]},
22             {\'randomforestclassifier__n_estimators\':range(1,11)} ,
23             {\'kneighborsclassifier__n_neighbors\':[4,6,8]},
24             {\'adaboostclassifier__n_estimators\':[70,80,90]}
25 ]

 

模型参数调优和评估

对分类器进行参数调优和评估,最后得到使用 AdaBoostClassifier(n_estimators=80) 效果最好。

 1 # 对具体的分类器进行 GridSearchCV 参数调优
 2 def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, score = \'accuracy_score\'):
 3     response = {}
 4     gridsearch = GridSearchCV(estimator = pipeline, param_grid = param_grid, cv=3, scoring = score)
 5     # 寻找最优的参数 和最优的准确率分数
 6     search = gridsearch.fit(train_x, train_y)
 7     print("GridSearch 最优参数:", search.best_params_)
 8     print("GridSearch 最优分数: %0.4lf" %search.best_score_)
 9     #采用predict函数(特征是测试数据集)来预测标识,预测使用的参数是上一步得到的最优参数
10     predict_y = gridsearch.predict(test_x)
11     print(" 准确率 %0.4lf" %accuracy_score(test_y, predict_y))
12     response[\'predict_y\'] = predict_y
13     response[\'accuracy_score\'] = accuracy_score(test_y,predict_y)
14     return response
15  
16 for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
17     #采用 StandardScaler 方法对数据规范化:均值为0,方差为1的正态分布
18     pipeline = Pipeline([
19             #(\'scaler\', StandardScaler()),
20             #(\'pca\',PCA),
21             (model_name, model)
22     ])
23     result = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, model_param_grid , score = \'accuracy\')

代码汇总

  1 import pandas as pd
  2 import numpy as np
  3 import matplotlib.pyplot as plt
  4 import seaborn as sns
  5 #导入数据集文件
  6 
  7 
  8 # csv_data=\'https://github.com/aprial/growth-workshop/blob/master/data/churn.csv\'
  9 csv_data=\'C:/Users/wade z shao/Downloads/\'+\'WA_Fn-UseC_-Telco-Customer-Churn.csv\'
 10 data=pd.read_csv(csv_data)
 11 print(data)
 12 print(data[\'TotalCharges\'])
 13 
 14 # 获取数据类型的描述统计信息
 15 data.describe()
 16 
 17 
 18 # plt.rcParams[\'figure.figsize\']=6,6
 19 plt.pie(data[\'Churn\'].value_counts(),explode=(0.1,0))
 20 # plt.pie(data[\'Churn\'].value_counts(),labels=data[\'Churn\'].value_counts().index,autopct=\'%1.2f%%\',explode=(0.1,0))
 21 # plt.title(\'Churn(Yes/No) Ratio\')
 22 plt.show()
 23 churn_data=data[\'Churn\'].value_counts().to_frame()
 24 def barplot_percentage(feature,orient=\'v\',axis_name="percentage of customers"):
 25     ratios=pd.DataFrame()
 26     g=(data.groupby(feature)["Churn"].value_counts()/len(data)).to_frame()
 27     print(g)
 28     plt.show()
 29 barplot_percentage("SeniorCitizen") 
 30 barplot_percentage(\'gender\')
 31 import seaborn
 32 
 33 fig,axis=plt.subplots(1,2,figsize=(12,4))
 34 
 35 gp_partner=(data.groupby(\'Partner\')["Churn"].value_counts()/len(data)).to_frame()
 36 # axis1=seaborn.barplot(x=\'1\',y=\'1\',data=gp_partner)
 37 
 38 gp_dependents=(data.groupby(\'Dependents\')["Churn"].value_counts()/len(data)).to_frame()
 39 # axis2=seaborn.barplot(x=\'2\',y=\'2\',data=gp_dependents)
 40 
 41 plt.show()
 42 import seaborn
 43 fig, axis = plt.subplots(1, 2, figsize=(12,4))
 44 axis[0].set_title("Has Partner")
 45 axis[1].set_title("Has Dependents")
 46 axis_y = "percentage of customers"
 47 
 48 # Plot Partner column
 49 gp_partner = (data.groupby(\'Partner\')["Churn"].value_counts()/len(data)).to_frame()
 50 gp_partner.rename(columns={"Churn": axis_y}, inplace=True)
 51 gp_partner.reset_index(inplace=True)
 52 ax1 = seaborn.barplot(x=\'Partner\', y= axis_y, hue=\'Churn\', data=gp_partner, ax=axis[0])
 53 ax1.legend(fontsize=10)
 54 #ax1.set_xlabel(\'伴侣\')
 55 
 56 
 57 # Plot Dependents column
 58 gp_dep = (data.groupby(\'Dependents\')["Churn"].value_counts()/len(data)).to_frame()
 59 #print(gp_dep)
 60 gp_dep.rename(columns={"Churn": axis_y} , inplace=True)
 61 #print(gp_dep)
 62 gp_dep.reset_index(inplace=True)
 63 #print(gp_dep)
 64 
 65 ax2 = seaborn.barplot(x=\'Dependents\', y= axis_y, hue=\'Churn\', data=gp_dep, ax=axis[1])
 66 #ax2.set_xlabel(\'家属\')
 67 
 68 
 69 
 70 
 71 #设置字体大小
 72 plt.rcParams.update({\'font.size\': 20})
 73 ax2.legend(fontsize=10)
 74 
 75 
 76 
 77 #设置
 78 plt.show()
 79 # Kernel density estimaton核密度估计
 80 def kdeplot(feature,xlabel):
 81     plt.figure(figsize=(9, 4))
 82     plt.title("KDE for {0}".format(feature))
 83     ax0 = seaborn.kdeplot(data[data[\'Churn\'] == \'No\'][feature].dropna(), color= \'navy\', label= \'Churn: No\', shade=\'True\')
 84     ax1 = seaborn.kdeplot(data[data[\'Churn\'] == \'Yes\'][feature].dropna(), color= \'orange\', label= \'Churn: Yes\',shade=\'True\')
 85     plt.xlabel(xlabel)
 86     
 87 
 88 #设置字体大小
 89     plt.rcParams.update({\'font.size\': 20})
 90     plt.legend(fontsize=10)
 91 kdeplot(\'tenure\',\'tenure\')
 92 plt.show()
 93 customer_id=data[\'customerID\']
 94 print(customer_id.head())
 95 data.drop([\'customerID\'],axis=1,inplace=True)
 96 cateCols = [c for c in data.columns if data[c].dtype == \'object\' or c == \'SeniorCitizen\']
 97 dfCate = data[cateCols].copy()
 98 dfCate.head(3)
 99 
100 
101 #进行特征编码。
102 for col in cateCols:
103     if dfCate[col].nunique() == 2:
104         dfCate[col] = pd.factorize(dfCate[col])[0]
105     else:
106         dfCate = pd.get_dummies(dfCate, columns=[col])
107 dfCate[\'tenure\']=data[[\'tenure\']]
108 dfCate[\'MonthlyCharges\']=data[[\'MonthlyCharges\']]
109 dfCate[\'TotalCharges\']=data[[\'TotalCharges\']]
110 
111 
112 #查看关联关系
113 plt.figure(figsize=(16,8))
114 dfCate.corr()[\'Churn\'].sort_values(ascending=False).plot(kind=\'bar\')
115 plt.show()
116 
117 
118 # 特征选择
119 dropFea = [\'gender\',\'PhoneService\',
120            \'OnlineSecurity_No internet service\', \'OnlineBackup_No internet service\',
121            \'DeviceProtection_No internet service\', \'TechSupport_No internet service\',
122            \'StreamingTV_No internet service\', \'StreamingMovies_No internet service\',
123            #\'OnlineSecurity_No\', \'OnlineBackup_No\',
124            #\'DeviceProtection_No\',\'TechSupport_No\',
125            #\'StreamingTV_No\', \'StreamingMovies_No\',
126            ]
127 dfCate.drop(dropFea, inplace=True, axis =1) 
128 
129 
130 #最后一列是作为标识
131 target = dfCate[\'Churn\'].values
132 
133 #列表:特征和1个标识
134 columns = dfCate.columns.tolist()
135 # 构造训练数据集和测试数据集。
136 
137 # 列表:特征
138 columns.remove(\'Churn\')
139 
140 
141 # 含有特征的DataFrame
142 features = dfCate[columns].values
143 
144 
145 # 30% 作为测试集,其余作为训练集
146 
147 # random_state = 1表示重复试验随机得到的数据集始终不变
148 
149 
150 # stratify = target 表示按标识的类别,作为训练数据集、测试数据集内部的分配比例
151 from sklearn.model_selection import train_test_split
152 train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.30, stratify = target, random_state = 1)
153 
154 
155 # 构造各种分类器
156 classifiers = [
157     SVC(random_state = 1, kernel = \'rbf\'),    
158     DecisionTreeClassifier(random_state = 1, criterion = \'gini\'),
159     RandomForestClassifier(random_state = 1, criterion = \'gini\'),
160     KNeighborsClassifier(metric = \'minkowski\'),
161     AdaBoostClassifier(random_state = 1),   
162 ]
163 
164 
165 
166 # 分类器名称
167 classifier_names = [
168             \'svc\', 
169             \'decisiontreeclassifier\',
170             \'randomforestclassifier\',
171             \'kneighborsclassifier\',
172             \'adaboostclassifier\',
173 ]
174 
175 
176 # 分类器参数
177 
178 
179 
180 #注意分类器的参数,字典键的格式,GridSearchCV对调优的参数格式是"分类器名"+"__"+"参数名"
181 classifier_param_grid = [
182             {\'svc__C\':[0.1], \'svc__gamma\':[0.01]},
183             {\'decisiontreeclassifier__max_depth\':[6,9,11]},
184             {\'randomforestclassifier__n_estimators\':range(1,11)} ,
185             {\'kneighborsclassifier__n_neighbors\':[4,6,8]},
186             {\'adaboostclassifier__n_estimators\':[70,80,90]}
187 ]
188 
189 
190 # 对具体的分类器进行 GridSearchCV 参数调优
191 def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, score = \'accuracy_score\'):
192     response = {}
193     gridsearch = GridSearchCV(estimator = pipeline, param_grid = param_grid, cv=3, scoring = score)
194     # 寻找最优的参数 和最优的准确率分数
195     search = gridsearch.fit(train_x, train_y)
196     print("GridSearch 最优参数:", search.best_params_)
197     print("GridSearch 最优分数: %0.4lf" %search.best_score_)
198     #采用predict函数(特征是测试数据集)来预测标识,预测使用的参数是上一步得到的最优参数
199     predict_y = gridsearch.predict(test_x)
200     print(" 准确率 %0.4lf" %accuracy_score(test_y, predict_y))
201     response[\'predict_y\'] = predict_y
202     response[\'accuracy_score\'] = accuracy_score(test_y,predict_y)
203     return response
204  
205 for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
206     #采用 StandardScaler 方法对数据规范化:均值为0,方差为1的正态分布
207     pipeline = Pipeline([
208             #(\'scaler\', StandardScaler()),
209             #(\'pca\',PCA),
210             (model_name, model)
211     ])
212     result = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, model_param_grid , score = \'accuracy\')

 

 

 

 

三、总结

用户属性:老年用户,未婚用户,无亲属用户更容易流失;
服务属性:在网时长小于半年,有电话服务,光纤用户/光纤用户附加流媒体电视、电影服务,无互联网增值服务;
合同属性:签订的合同期较短,采用电子支票支付,是电子账单,月租费约70-110元的客户容易流失;
其它属性对用户流失影响较小,以上特征保持独立。

 

版权声明:本文为yingjiahui原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/yingjiahui/p/14925134.html