决策树(Decision Tree)SkLearn
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- from sklearn.feature_extraction import DictVectorizer
- import csv
- from sklearn import tree
- from sklearn import preprocessing
- from sklearn.externals.six import StringIO
- # Read in the csv file and put features into list of dict and list of class label
- allElectronicsData = open(r\'AllElectronics.csv\', \'rb\')
- reader = csv.reader(allElectronicsData)
- headers = reader.next()
- print(headers)
- featureList = []
- labelList = []
- for row in reader:
- labelList.append(row[len(row)-1]) # 取得每一行最后一个值 标签
- rowDict = {} #取得每一行的值 包含有字典的list
- for i in range(1, len(row)-1):
- rowDict[headers[i]] = row[i]
- featureList.append(rowDict)
- print(featureList)
- # Vetorize features
- vec = DictVectorizer()#将字典转换成00100的形式(1000)
- dummyX = vec.fit_transform(featureList) .toarray()
- print("dummyX: " + str(dummyX))
- print(vec.get_feature_names())
- print("labelList: " + str(labelList))
- # vectorize class labels
- lb = preprocessing.LabelBinarizer()#将标签转换成0,1
- dummyY = lb.fit_transform(labelList)
- print("dummyY: " + str(dummyY))
- # Using decision tree for classification
- # clf = tree.DecisionTreeClassifier()
- clf = tree.DecisionTreeClassifier(criterion=\'entropy\')#信息熵
- clf = clf.fit(dummyX, dummyY)
- print("clf: " + str(clf))
- # Visualize model
- with open("allElectronicInformationGainOri.dot", \'w\') as f:
- f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
- oneRowX = dummyX[0, :]
- print("oneRowX: " + str(oneRowX))
- newRowX = oneRowX
- newRowX[0] = 1
- newRowX[2] = 0
- print("newRowX: " + str(newRowX))
- predictedY = clf.predict(newRowX)
- print("predictedY: " + str(predictedY))