KNN —— K近邻算法(K-Nearest Neighbors)

思想极度简单

应用数学知识少 (近乎为零)

效果好(缺点?)

可以解释机器学习算法使用过程中很多细节问题

更完整的刻画机器学习应用的流程

 

  1. import numpy as np
  2. import matplotlib.pyplot as plt
  # Build our own kNN step by step on a tiny hand-made data set.
  # Lines starting with "# ->" record the value the expression evaluates to.
  from collections import Counter
  from math import sqrt

  # --- Create a simple test case -------------------------------------------
  # Ten 2-D samples: the first five belong to class 0, the last five to class 1.
  raw_data_X = [
      [3.393533211, 2.331273381],
      [3.110073483, 1.781539638],
      [1.343808831, 3.368360954],
      [3.582294042, 4.679179110],
      [2.280362439, 2.866990263],
      [7.423436942, 4.696522875],
      [5.745051997, 3.533989803],
      [9.172168622, 2.511101045],
      [7.792783481, 3.424088941],
      [7.939820817, 0.791637231],
  ]
  raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

  X_train = np.array(raw_data_X)
  y_train = np.array(raw_data_y)
  # X_train
  # -> array([[ 3.39353321, 2.33127338],
  #           [ 3.11007348, 1.78153964],
  #           [ 1.34380883, 3.36836095],
  #           [ 3.58229404, 4.67917911],
  #           [ 2.28036244, 2.86699026],
  #           [ 7.42343694, 4.69652288],
  #           [ 5.745052  , 3.5339898 ],
  #           [ 9.17216862, 2.51110105],
  #           [ 7.79278348, 3.42408894],
  #           [ 7.93982082, 0.79163723]])
  # y_train
  # -> array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

  # Query point to classify; the distances recorded below correspond to
  # this value.
  x = np.array([8.093607318, 3.365731514])

  # --- Step 1: Euclidean distance from x to every training sample ----------
  # Explicit-loop version.
  distances = []
  for x_train in X_train:
      d = sqrt(np.sum((x_train - x) ** 2))
      distances.append(d)
  # distances
  # -> [4.812566907609877,
  #     5.229270827235305,
  #     6.749798999160064,
  #     4.6986266144110695,
  #     5.83460014556857,
  #     1.4900114024329525,
  #     2.354574897431513,
  #     1.3761132675144652,
  #     0.3064319992975,
  #     2.5786840957478887]

  # Same computation as a list comprehension; it yields the same ten values.
  distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]

  # --- Step 2: indices of the training samples, nearest first --------------
  nearest = np.argsort(distances)
  # nearest
  # -> array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2])

  # --- Step 3: labels of the k nearest samples -----------------------------
  k = 6
  topK_y = [y_train[neighbor] for neighbor in nearest[:k]]
  # topK_y
  # -> [1, 1, 1, 1, 1, 0]

  # --- Step 4: majority vote -----------------------------------------------
  votes = Counter(topK_y)
  # votes
  # -> Counter({0: 1, 1: 5})
  # votes.most_common(1)
  # -> [(1, 5)]
  predict_y = votes.most_common(1)[0][0]
  # predict_y
  # -> 1

 二、scikit-learn 中的机器学习算法封装
KNN/KNNN.py

  1. import numpy as np
  2. from math import sqrt
  3. from collections import Counter
  class KNNClassifier:
      """k-nearest-neighbours classifier (Euclidean distance, majority vote).

      ``fit`` only stores the training data; all distance computation
      happens in ``predict``.
      """

      def __init__(self, k):
          """Initialise the classifier.

          Args:
              k: number of nearest neighbours that vote; must be >= 1.
          """
          assert k >= 1, "k must be valid"
          self.k = k
          self._X_train = None
          self._y_train = None

      def fit(self, X_train, y_train):
          """Store the training set ``X_train`` / ``y_train``.

          Args:
              X_train: array-like of shape (n_samples, n_features).
              y_train: array-like of n_samples labels.

          Returns:
              self, so that calls can be chained.
          """
          # Accept plain lists as well as ndarrays.
          X_train = np.asarray(X_train)
          y_train = np.asarray(y_train)
          assert X_train.shape[0] == y_train.shape[0], \
              "the size of X_train must be equal to the size of y_train"
          assert self.k <= X_train.shape[0], \
              "the size of X_train must be at least k."
          self._X_train = X_train
          self._y_train = y_train
          return self

      def predict(self, X_predict):
          """Predict a label for every row of ``X_predict``.

          Args:
              X_predict: array-like of shape (n_queries, n_features).

          Returns:
              ndarray holding one predicted label per query row.
          """
          assert self._X_train is not None and self._y_train is not None, \
              "must fit before predict!"
          X_predict = np.asarray(X_predict)
          assert X_predict.shape[1] == self._X_train.shape[1], \
              "the feature number of X_predict must be equal to X_train"
          y_predict = [self._predict(x) for x in X_predict]
          return np.array(y_predict)

      def _predict(self, x):
          """Return the predicted label for a single sample ``x``."""
          assert x.shape[0] == self._X_train.shape[1], \
              "the feature number of x must be equal to X_train"
          # One vectorised pass over the training matrix instead of a
          # Python-level loop over its rows.
          distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
          # A stable sort keeps equidistant samples in training order, so
          # ties are resolved deterministically.
          nearest = np.argsort(distances, kind="stable")
          topK_y = [self._y_train[i] for i in nearest[:self.k]]
          votes = Counter(topK_y)
          # On a vote tie, most_common returns the label seen first,
          # i.e. the one belonging to the nearer neighbour.
          return votes.most_common(1)[0][0]

      def __repr__(self):
          return "KNN(k=%d)" % self.k

kNN_function/KNN.py

  1. import numpy as np
  2. from math import sqrt
  3. from collections import Counter
  def kNN_classify(k, X_train, y_train, x):
      """Classify one sample ``x`` by majority vote of its k nearest neighbours.

      Args:
          k: number of neighbours that vote; 1 <= k <= number of samples.
          X_train: array-like of shape (n_samples, n_features).
          y_train: array-like of n_samples labels.
          x: array-like of n_features values, the sample to classify.

      Returns:
          The most common label among the k training samples closest to
          ``x`` in Euclidean distance.
      """
      # Accept plain lists as well as ndarrays.
      X_train = np.asarray(X_train)
      y_train = np.asarray(y_train)
      x = np.asarray(x)
      assert 1 <= k <= X_train.shape[0], "k must be valid"
      assert X_train.shape[0] == y_train.shape[0], \
          "the size of X_train must equal to the size of y_train"
      assert X_train.shape[1] == x.shape[0], \
          "the feature number of x must be equal to X_train"

      # One vectorised pass over the training matrix instead of a
      # Python-level loop over its rows.
      distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
      # A stable sort keeps equidistant samples in training order, so ties
      # are resolved deterministically.
      nearest = np.argsort(distances, kind="stable")
      topK_y = [y_train[i] for i in nearest[:k]]
      votes = Counter(topK_y)
      return votes.most_common(1)[0][0]

 

三、训练数据集、测试数据集

判断机器学习算法的性能

 

 playML/KNN.py

  1. import numpy as np
  2. from math import sqrt
  3. from collections import Counter
  class KNNClassifier:
      """k-nearest-neighbours classifier (Euclidean distance, majority vote).

      ``fit`` only stores the training data; all distance computation
      happens in ``predict``.
      """

      def __init__(self, k):
          """Initialise the classifier.

          Args:
              k: number of nearest neighbours that vote; must be >= 1.
          """
          assert k >= 1, "k must be valid"
          self.k = k
          self._X_train = None
          self._y_train = None

      def fit(self, X_train, y_train):
          """Store the training set ``X_train`` / ``y_train``.

          Args:
              X_train: array-like of shape (n_samples, n_features).
              y_train: array-like of n_samples labels.

          Returns:
              self, so that calls can be chained.
          """
          # Accept plain lists as well as ndarrays.
          X_train = np.asarray(X_train)
          y_train = np.asarray(y_train)
          assert X_train.shape[0] == y_train.shape[0], \
              "the size of X_train must be equal to the size of y_train"
          assert self.k <= X_train.shape[0], \
              "the size of X_train must be at least k."
          self._X_train = X_train
          self._y_train = y_train
          return self

      def predict(self, X_predict):
          """Predict a label for every row of ``X_predict``.

          Args:
              X_predict: array-like of shape (n_queries, n_features).

          Returns:
              ndarray holding one predicted label per query row.
          """
          assert self._X_train is not None and self._y_train is not None, \
              "must fit before predict!"
          X_predict = np.asarray(X_predict)
          assert X_predict.shape[1] == self._X_train.shape[1], \
              "the feature number of X_predict must be equal to X_train"
          y_predict = [self._predict(x) for x in X_predict]
          return np.array(y_predict)

      def _predict(self, x):
          """Return the predicted label for a single sample ``x``."""
          assert x.shape[0] == self._X_train.shape[1], \
              "the feature number of x must be equal to X_train"
          # One vectorised pass over the training matrix instead of a
          # Python-level loop over its rows.
          distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
          # A stable sort keeps equidistant samples in training order, so
          # ties are resolved deterministically.
          nearest = np.argsort(distances, kind="stable")
          topK_y = [self._y_train[i] for i in nearest[:self.k]]
          votes = Counter(topK_y)
          # On a vote tie, most_common returns the label seen first,
          # i.e. the one belonging to the nearer neighbour.
          return votes.most_common(1)[0][0]

      def __repr__(self):
          return "KNN(k=%d)" % self.k

 

 playML/model_selection.py

  1. import numpy as np
  def train_test_split(X, y, test_ratio=0.2, seed=None):
      """Randomly split X and y into X_train, X_test, y_train, y_test.

      Args:
          X: array-like of samples, indexed along its first axis.
          y: array-like of labels with the same first-axis length as X.
          test_ratio: fraction of the samples placed in the test set,
              between 0.0 and 1.0 inclusive; the test size is rounded down.
          seed: optional seed. When given (including 0) NumPy's global
              random generator is re-seeded with it, which makes the split
              reproducible.

      Returns:
          The tuple (X_train, X_test, y_train, y_test).
      """
      # Accept plain lists as well as ndarrays; index-array selection below
      # requires ndarrays.
      X = np.asarray(X)
      y = np.asarray(y)
      assert X.shape[0] == y.shape[0], \
          "the size of X must be equal to the size of y"
      assert 0.0 <= test_ratio <= 1.0, \
          "test_ratio must be valid"

      # Compare against None explicitly: a plain truthiness test would
      # silently ignore seed=0.
      if seed is not None:
          np.random.seed(seed)

      shuffled_indexes = np.random.permutation(len(X))
      test_size = int(len(X) * test_ratio)
      test_indexes = shuffled_indexes[:test_size]
      train_indexes = shuffled_indexes[test_size:]

      X_train = X[train_indexes]
      y_train = y[train_indexes]
      X_test = X[test_indexes]
      y_test = y[test_indexes]
      return X_train, X_test, y_train, y_test

playML/__init__.py

 

 

版权声明:本文为zhangtaotqy原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/zhangtaotqy/p/9533425.html