#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train an entropy-based decision tree on AllElectronics.csv.

The script reads the CSV file, vectorizes the feature columns and the class
label, fits a ``DecisionTreeClassifier``, exports the fitted tree to a
Graphviz ``.dot`` file and predicts the class of one modified sample row.
Intermediate results are printed along the way.
"""
import csv

from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

try:
    from sklearn.externals.six import StringIO
except ImportError:
    # Newer scikit-learn releases no longer ship sklearn.externals.six.
    from io import StringIO

# Read in the csv file and put features into a list of dicts and the class
# labels into a list. Text mode with newline='' is what the csv module expects
# on Python 3; the context manager closes the file once all rows are consumed.
with open(r'AllElectronics.csv', newline='', encoding='utf-8') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    headers = next(reader, None)
    if headers is None:
        raise ValueError("AllElectronics.csv is empty: no header row found")

    print(headers)

    featureList = []
    labelList = []

    for row in reader:
        if not row:
            # csv.reader yields an empty list for blank lines; nothing to learn from.
            continue
        # The last value of every row is the class label.
        labelList.append(row[-1])
        # One dict per row, mapping header name -> value. The first column and
        # the trailing label column are deliberately left out of the features.
        rowDict = dict(zip(headers[1:], row[1:-1]))
        featureList.append(rowDict)

print(featureList)

# Vectorize features: turn each dict into a 0/1 indicator vector (e.g. 0 0 1 0 0).
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()

# get_feature_names() is unavailable in newer scikit-learn releases, which
# provide get_feature_names_out() instead; support both and keep plain strings.
if hasattr(vec, 'get_feature_names_out'):
    featureNames = [str(name) for name in vec.get_feature_names_out()]
else:
    featureNames = vec.get_feature_names()

print("dummyX: " + str(dummyX))
print(featureNames)

print("labelList: " + str(labelList))

# Vectorize class labels: convert them to 0/1.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))

# Using decision tree for classification, split on information entropy.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf: " + str(clf))

# Visualize model: write the fitted tree in Graphviz format.
with open("allElectronicInformationGainOri.dot", 'w') as f:
    tree.export_graphviz(clf, feature_names=featureNames, out_file=f)

oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))

# Copy before editing: dummyX[0, :] is a view, so writing through a plain
# alias would also change oneRowX and the training matrix itself.
newRowX = oneRowX.copy()
newRowX[0] = 1
newRowX[2] = 0
print("newRowX: " + str(newRowX))

# predict() takes a 2-D (n_samples, n_features) array, so pass the single
# sample as a one-row matrix.
predictedY = clf.predict(newRowX.reshape(1, -1))
print("predictedY: " + str(predictedY))

  

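# 版权声明:本文为wlc297984368原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# (Copyright notice: this is an original article by wlc297984368, released under the CC 4.0 BY-SA license; reposts must include a link to the original source and this notice.)
# 本文链接:https://www.cnblogs.com/wlc297984368/p/7462684.html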