决策树(Decision Trees)
- 简介
- 基本思想
先从X轴观察,在X = 3时,样本点有一次明显的“突变”,我们以X = 3作为一次决策,进行一次划分:
再从Y轴观察,两类样本点在Y = 4 和Y = 2处可以进行划分,进而进行两次划分:
- 熵(entropy)
熵的定义:它是一系列样本中的不纯度的测量值(measure of impurity in a bunch of examples)
熵描述了数据的混乱程度,熵越大,混乱程度越高,也就是纯度越低;反之,熵越小,混乱程度越低,纯度越高。 熵的计算公式如下所示:$H(S) = -\sum_{i} p_i \log_2 p_i$,其中 $p_i$ 表示样本集 $S$ 中属于第 $i$ 类的样本所占的比例。
- 信息增益
- 偏差(bias)与方差(variance)
- 代码实现
环境:macOS Mojave 10.14.3
Python 3.7.0
使用库:scikit-learn 0.19.2
>>> from sklearn import tree
>>> X = [[0, 0], [1, 1]]  # two training samples
>>> Y = [0, 1]  # the label of each sample, in the same order as X
>>> clf = tree.DecisionTreeClassifier()  # create the decision tree classifier
>>> clf = clf.fit(X, Y)  # train it on the two samples
>>> clf.predict([[2., 2.]])  # predict the label of a new point
array([1])  # the new point is classified as label 1
Main.py 主程序
# Entry script: fit the decision tree on the terrain data, draw its decision
# boundary over the test points and print the accuracy on the test split.
import sys

# class_vis is imported before matplotlib.pyplot on purpose: the class_vis
# listing selects the matplotlib backend when it is imported.
# NOTE(review): output_image is not defined in the class_vis.py listing shown
# in this document -- confirm the real module provides it.
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl

from classifyDT import classify

features_train, labels_train, features_test, labels_test = makeTerrainData()

# The model fitting itself lives in classify(), in the file 'classifyDT.py'.
clf = classify(features_train, labels_train)

#### grader code, do not modify below this line

# Writes the decision-boundary picture (see prettyPicture for the file name).
prettyPicture(clf, features_test, labels_test)

accuracy = clf.score(features_test, labels_test)
# Left disabled, as in the original listing:
# output_image("test.png", "png", open("test.png", "rb").read())
print(accuracy)

# Accuracy under the name the exercise asks to be filled in.
acc = accuracy
classifyDT.py 决策树分类
def classify(features_train, labels_train):
    """Fit a decision tree on the training data and return the fitted model.

    Args:
        features_train: Training samples, passed straight to ``fit``.
        labels_train: One label per training sample, passed straight to ``fit``.

    Returns:
        The trained ``sklearn.tree.DecisionTreeClassifier``.
    """
    # Imported inside the function, exactly where the original exercise put it.
    from sklearn.tree import DecisionTreeClassifier

    # random_state=0 fixes the estimator's seed; fit() hands back the
    # estimator itself, so the trained classifier can be returned directly.
    return DecisionTreeClassifier(random_state=0).fit(features_train, labels_train)
prep_terrain_data.py 生成训练点
import random


def makeTerrainData(n_points=1000):
    """Build the deterministic toy terrain dataset and split it 75/25.

    Every point has two features, ``[grade, bumpiness]``, each drawn
    uniformly from [0, 1).  Its label is
    ``round(grade * bumpiness + 0.3 + 0.1 * error)`` with a third uniform
    draw ``error``, and it is forced to ``1.0`` whenever either feature
    exceeds 0.8.  Labels therefore compare equal to 0 or 1; forced labels
    are stored as the float ``1.0``, the rest as the ints from ``round``.

    Args:
        n_points: Total number of points to generate.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of lists.  The first
        ``int(0.75 * n_points)`` points form the training split and the
        remainder the test split.
    """
    # Re-seeding the module-level generator on every call is what makes the
    # dataset identical from run to run.
    random.seed(42)

    # Draw each column in full before starting the next one: the order of
    # the draws determines the values, so it must not be interleaved.
    grade = [random.random() for _ in range(n_points)]
    bumpy = [random.random() for _ in range(n_points)]
    error = [random.random() for _ in range(n_points)]

    y = [round(g * b + 0.3 + 0.1 * e) for g, b, e in zip(grade, bumpy, error)]
    # Steep or very bumpy terrain is always labelled 1, whatever the noise.
    for ii, (g, b) in enumerate(zip(grade, bumpy)):
        if g > 0.8 or b > 0.8:
            y[ii] = 1.0

    ### split into train/test sets
    X = [[g, b] for g, b in zip(grade, bumpy)]
    split = int(0.75 * n_points)

    return X[:split], y[:split], X[split:], y[split:]
class_vis.py 绘图与保存图像
import warnings
warnings.filterwarnings("ignore")

import matplotlib
# Select the non-interactive 'agg' backend before pyplot is imported.
matplotlib.use('agg')

import matplotlib.pyplot as plt
import pylab as pl
import numpy as np


def prettyPicture(clf, X_test, y_test):
    """Draw the classifier's decision regions with the test points on top.

    The classifier is evaluated on a regular grid over [0, 1) x [0, 1) with
    a step of 0.01, and the predictions are shown as a colour mesh.  Test
    points whose label equals 0 are drawn in blue ("fast") and those whose
    label equals 1 in red ("slow").  The figure is saved as "test.png".

    Args:
        clf: A fitted classifier exposing ``predict``.
        X_test: Test points; column 0 is plotted on the x axis and column 1
            on the y axis.
        y_test: One label per test point.

    Returns:
        None.
    """
    x_min, x_max = 0.0, 1.0
    y_min, y_max = 0.0, 1.0

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    h = .01  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)

    # Plot also the test points, split by label.
    grade_sig = [point[0] for point, label in zip(X_test, y_test) if label == 0]
    bumpy_sig = [point[1] for point, label in zip(X_test, y_test) if label == 0]
    grade_bkg = [point[0] for point, label in zip(X_test, y_test) if label == 1]
    bumpy_bkg = [point[1] for point, label in zip(X_test, y_test) if label == 1]

    plt.scatter(grade_sig, bumpy_sig, color="b", label="fast")
    plt.scatter(grade_bkg, bumpy_bkg, color="r", label="slow")
    plt.legend()
    # The x axis carries the grade values and the y axis the bumpiness
    # values, so the labels follow that order (the original had them swapped).
    plt.xlabel("grade")
    plt.ylabel("bumpiness")

    plt.savefig("test.png")
- 决策树的参数
acc_min_samples.py 不同 min_samples_split 取值下的准确率对比
# Compare two decision trees that differ only in min_samples_split (2 vs 50):
# print both test accuracies and plot the decision boundary of one of them.
import sys

# class_vis stays ahead of the pyplot import, as in the original listing,
# because the class_vis listing selects the matplotlib backend on import.
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()

########################## DECISION TREE #################################

from sklearn.tree import DecisionTreeClassifier

# One tree per setting, fitted in the same order as the original script
# (the min_samples_split=2 tree first, then the min_samples_split=50 tree).
clf1 = DecisionTreeClassifier(min_samples_split=2)
clf2 = DecisionTreeClassifier(min_samples_split=50)
clf1 = clf1.fit(features_train, labels_train)
clf2 = clf2.fit(features_train, labels_train)

# Accuracy of each tree on the held-out test split.
acc_min_samples_split_2 = clf1.score(features_test, labels_test)
acc_min_samples_split_50 = clf2.score(features_test, labels_test)

print(acc_min_samples_split_2)
print(acc_min_samples_split_50)

# Plot exactly one of the two classifiers; swap the comment to see the other.
prettyPicture(clf1, features_test, labels_test)
# prettyPicture(clf2, features_test, labels_test)
上图,min_samples_split分别为2 和50
- 决策树的优点与缺点