scikit-learn Machine Learning (4): Classifying with a Decision Tree, Plotting the Tree, and Comparing with a Random Forest
The data come from the UCI Pima Indians Diabetes dataset: 768 samples, 8 numeric features, and a binary outcome (1 = diabetes, 0 = no diabetes).
Loading the data
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib
# Use a font that can render CJK labels and keep the minus sign readable
matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values[:, :]
y = data[:, 8]     # the 9th column is the outcome label
X = data[:, :8]    # the first 8 columns are the features
X_train, X_test, y_train, y_test = train_test_split(X, y)
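The code above assumes the CSV already has a header row (the column names are reused later when plotting the tree). If you downloaded the raw UCI file, which has no header, you can supply the names yourself; a minimal sketch, with commonly used column names assumed:

import pandas as pd

# Assumed column names for a raw, headerless copy of the UCI file
columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
           "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
data_set = pd.read_csv('pima-indians-diabetes.csv', header=None, names=columns)
print(data_set.shape)                       # expected: (768, 9)
print(data_set["Outcome"].value_counts())   # class balance of the outcome label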
Building a decision tree and fine-tuning it with grid search
# In[1] Fine-tune the model with grid search
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
    'clf__max_depth': (3, 5, 10, 15, 20, 25, 30, 35, 40),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}
# GridSearchCV systematically tries every combination in the parameter grid and
# picks the best one by cross-validation.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)

# Read the best parameters off the best estimator found by the search
best_parameters = grid_search.best_estimator_.get_params()
print("Best F1 score:", grid_search.best_score_)
print('Best parameters:')
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
Best F1 score: 0.5573515325670498
Best parameters:
    clf__max_depth: 5
    clf__min_samples_leaf: 1
    clf__min_samples_split: 2
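Besides reading values off best_estimator_.get_params(), the searcher also exposes best_params_, and because refit=True by default the best pipeline is retrained on the whole training set and can be used directly. A short sketch:

# best_params_ contains only the grid keys, which is often more convenient
print(grid_search.best_params_)   # e.g. {'clf__max_depth': 5, ...}

# With the default refit=True, the refitted best pipeline is available
best_model = grid_search.best_estimator_
print(best_model.score(X_test, y_test))   # mean accuracy on the test set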
Evaluating the model
# In[2] Predict on the test set and print the evaluation report
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

         0.0       0.74      0.89      0.81       124
         1.0       0.67      0.43      0.52        68
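The report shows that recall on the positive class (diabetes) is low. To see exactly where the misclassifications happen, a confusion matrix can complement the report; a minimal sketch:

from sklearn.metrics import confusion_matrix

# Rows are true classes (0, 1), columns are predicted classes
print(confusion_matrix(y_test, predictions))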
Plotting the decision tree
# In[3] Plot the tree
from sklearn import tree

feature_name = data_set.columns.values.tolist()[:-1]  # feature column names
DT = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                 min_samples_split=2, min_samples_leaf=5)
DT.fit(X_train, y_train)

'''
# Method 1
import pydotplus
from sklearn.externals.six import StringIO   # removed in newer scikit-learn; use io.StringIO instead
dot_data = StringIO()
tree.export_graphviz(DT, out_file=dot_data, feature_names=feature_name,
                     class_names=["no diabetes", "diabetes"],
                     filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("Tree.pdf")
print('Visible tree plot saved as pdf.')
'''

# Method 2
import graphviz
# DT is the fitted classifier; export_graphviz must be called after fit(),
# otherwise it raises an error.
# class_names are listed in ascending class order: class 0 = no diabetes, class 1 = diabetes.
dot_data = tree.export_graphviz(DT, out_file=None, feature_names=feature_name,
                                class_names=["no diabetes", "diabetes"])
graph = graphviz.Source(dot_data)
# Writes tree2.pdf in the current directory
graph.render("tree2")
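If graphviz is not installed, newer scikit-learn versions (0.21 and later) can draw the same tree with matplotlib via tree.plot_tree; a minimal sketch using the fitted DT from above:

import matplotlib.pyplot as plt
from sklearn import tree

fig, ax = plt.subplots(figsize=(20, 10))
tree.plot_tree(DT, feature_names=feature_name,
               class_names=["no diabetes", "diabetes"], filled=True, ax=ax)
fig.savefig("tree_matplotlib.png", dpi=150)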
Random forest
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values[:, :]
y = data[:, 8]
X = data[:, :8]
X_train, X_test, y_train, y_test = train_test_split(X, y)

# A small forest of 10 trees; random_state fixes the forest's randomness
RF = RandomForestClassifier(n_estimators=10, random_state=11)
RF.fit(X_train, y_train)
predictions = RF.predict(X_test)
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

         0.0       0.82      0.91      0.86       126
         1.0       0.78      0.61      0.68        66

   micro avg       0.81      0.81      0.81       192
   macro avg       0.80      0.76      0.77       192
weighted avg       0.80      0.81      0.80       192
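The ensemble clearly improves both precision and recall over the single tree. A random forest also exposes feature_importances_, which shows which of the eight inputs drive the prediction; a short sketch using the fitted RF from above:

import pandas as pd

feature_name = data_set.columns.values.tolist()[:-1]
importances = pd.Series(RF.feature_importances_, index=feature_name)
print(importances.sort_values(ascending=False))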
Copyright notice: this is an original article by caiyishuai, released under the CC 4.0 BY-SA license; please include a link to the original source and this notice when reposting.