Implementing the KNN Algorithm
First, create a demo dataset.
import numpy as np
import matplotlib.pyplot as plt

# Training data and the corresponding class labels
def createDataSet():
    group = np.array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4], [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]])
    labels = np.array(['A', 'A', 'B', 'B', 'A', 'B'])
    return group, labels

if __name__ == '__main__':
    group, labels = createDataSet()
    plt.scatter(group[labels == 'A', 0], group[labels == 'A', 1], color='r', marker='*')
    # points of class A are drawn as red stars
    plt.scatter(group[labels == 'B', 0], group[labels == 'B', 1], color='g', marker='+')
    # points of class B are drawn as green plus signs
    plt.show()
Code walkthrough:
- createDataSet builds the training set and its class labels; group is the two-dimensional training data, whose columns supply the x-axis and y-axis values
- labels holds the class label of each training sample
- Matplotlib's scatter function draws the points as a scatter plot, as illustrated below
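The scatter calls rely on NumPy boolean-mask indexing: labels == 'A' yields a boolean array that selects only the rows of group belonging to class A. A minimal sketch of the mechanism (the toy values here are for illustration only):

import numpy as np

group = np.array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4]])
labels = np.array(['A', 'A', 'B'])

mask = labels == 'A'     # array([ True,  True, False])
print(group[mask, 0])    # x coordinates of class-A points: [1.  1.2]
print(group[mask, 1])    # y coordinates of class-A points: [2.  0.1]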
A KNN classifier in Python based on Euclidean distance
import operator

def KNN_classify(k, dis, X_train, y_train, X_test):
    assert dis == 'E' or dis == 'M', 'dis must be E or M; E for Euclidean distance, M for Manhattan distance'
    num_test = X_test.shape[0]
    labellist = []
    if dis == 'E':
        for i in range(num_test):
            # Euclidean distance from the i-th test point to every training point
            distances = np.sqrt(np.sum((X_train - np.tile(X_test[i], (X_train.shape[0], 1))) ** 2, axis=1))
            nearest_k = np.argsort(distances)  # indices sorted by ascending distance
            topK = nearest_k[:k]               # the k nearest neighbors
            classCount = {}
            for j in topK:                     # count the votes for each class
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
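Incidentally, the np.tile call is not strictly necessary: NumPy broadcasting subtracts a test point from every training row directly. A small equivalent sketch (same result, assuming a 2-D X_train as above):

import numpy as np

def euclidean_distances(X_train, x):
    diff = X_train - x                       # broadcasting: x is subtracted from every row
    return np.sqrt(np.sum(diff ** 2, axis=1))

X_train = np.array([[1.0, 2.0], [1.2, 0.1]])
print(euclidean_distances(X_train, np.array([1.0, 2.1])))  # [0.1        2.00997512]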
Testing the KNN algorithm
Note that the test samples must be passed in as a NumPy array. If a plain Python list is passed instead, the call fails, because a list has no shape attribute.
if __name__ == '__main__':
    group, labels = createDataSet()
    y_test_pred = KNN_classify(1, 'E', group, labels, np.array([[1.0, 2.1], [0.4, 2.0]]))
    print(y_test_pred)
    # two test points: the first is classified as A, the second as B
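To see the NumPy requirement in action, compare the two calls below (an illustrative snippet; the exact error text may vary across Python versions):

test_points = [[1.0, 2.1], [0.4, 2.0]]

# KNN_classify(1, 'E', group, labels, test_points)
# -> AttributeError: 'list' object has no attribute 'shape'

y_pred = KNN_classify(1, 'E', group, labels, np.array(test_points))  # works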
Complete code
import operator
import numpy as np
import matplotlib.pyplot as plt

# Training data and the corresponding class labels
def createDataSet():
    group = np.array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4], [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]])
    labels = np.array(['A', 'A', 'B', 'B', 'A', 'B'])
    return group, labels

if __name__ == '__main__':
    group, labels = createDataSet()
    plt.scatter(group[labels == 'A', 0], group[labels == 'A', 1], color='r', marker='*')
    # points of class A are drawn as red stars
    plt.scatter(group[labels == 'B', 0], group[labels == 'B', 1], color='g', marker='+')
    # points of class B are drawn as green plus signs
    plt.show()
def KNN_classify(k, dis, X_train, y_train, X_test):
    assert dis == 'E' or dis == 'M', 'dis must be E or M; E for Euclidean distance, M for Manhattan distance'
    num_test = X_test.shape[0]
    labellist = []
    if dis == 'E':
        for i in range(num_test):
            distances = np.sqrt(np.sum((X_train - np.tile(X_test[i], (X_train.shape[0], 1))) ** 2, axis=1))
            nearest_k = np.argsort(distances)  # sort distances ascending and return the indices
            topK = nearest_k[:k]               # the k nearest neighbors
            classCount = {}
            for j in topK:                     # count the votes for each class
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
    else:
        for i in range(num_test):
            # Manhattan distance: sum of absolute coordinate-wise differences
            distances = np.sum(np.abs(X_train - np.tile(X_test[i], (X_train.shape[0], 1))), axis=1)
            nearest_k = np.argsort(distances)
            topK = nearest_k[:k]
            classCount = {}
            for j in topK:
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
if __name__ == '__main__':
    group, labels = createDataSet()
    y_test_pred = KNN_classify(1, 'E', group, labels, np.array([[1.0, 2.1], [0.4, 2.0]]))
    print(y_test_pred)
    # ['A' 'B']
KNN in practice
Classifying MNIST with KNN
The MNIST dataset is a classic and widely used benchmark (roughly the "Hello World!" of image processing), so we can download and read it directly through the PyTorch framework.
Be aware that running the code in this section takes quite a long time.
import torch
from torch.utils.data import DataLoader
import torchvision.datasets as dsets
import torchvision.transforms as transforms

batch_size = 100

# MNIST dataset
train_dataset = dsets.MNIST(root='/ml/pymnist',  # root directory for the data
                            train=True,          # use the training split
                            transform=None,      # no preprocessing applied
                            download=True)       # download the images if absent
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False, transform=None, download=True)

# Load the data
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)  # shuffled
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())
"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
"""
Classification accuracy of KNN on the MNIST dataset
Comparison method: compare the two images pixel by pixel and add up all the differences. If two images are identical, the total difference is 0; the larger the total, the more the two images differ.
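A minimal sketch of this pixel-wise (L1 / Manhattan) comparison on two tiny made-up 2x2 "images" (the pixel values are arbitrary):

import numpy as np

img_a = np.array([[0, 255], [128, 64]], dtype=np.float64)
img_b = np.array([[0, 250], [120, 64]], dtype=np.float64)

l1_distance = np.sum(np.abs(img_a - img_b))  # |0-0| + |255-250| + |128-120| + |64-64|
print(l1_distance)  # 13.0

The complete program below applies exactly this comparison (dis='M') to the raw MNIST pixels: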
import operator
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as dsets
def KNN_classify(k, dis, X_train, y_train, X_test):
    assert dis == 'E' or dis == 'M', 'dis must be E or M; E for Euclidean distance, M for Manhattan distance'
    num_test = X_test.shape[0]
    labellist = []
    if dis == 'E':
        for i in range(num_test):
            distances = np.sqrt(np.sum((X_train - np.tile(X_test[i], (X_train.shape[0], 1))) ** 2, axis=1))
            nearest_k = np.argsort(distances)  # sort distances ascending and return the indices
            topK = nearest_k[:k]               # the k nearest neighbors
            classCount = {}
            for j in topK:                     # count the votes for each class
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
    else:
        for i in range(num_test):
            # Manhattan distance: sum of absolute per-pixel differences
            distances = np.sum(np.abs(X_train - np.tile(X_test[i], (X_train.shape[0], 1))), axis=1)
            nearest_k = np.argsort(distances)
            topK = nearest_k[:k]
            classCount = {}
            for j in topK:
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
batch_size = 100

# MNIST dataset
train_dataset = dsets.MNIST(root='/ml/pymnist',  # root directory for the data
                            train=True,          # use the training split
                            transform=None,      # no preprocessing applied
                            download=True)       # download the images if absent
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False, transform=None, download=True)

# Load the data
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)  # shuffled
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())

digit = train_loader.dataset.train_data[0]  # the first training image
plt.imshow(digit, cmap=plt.cm.binary)
plt.show()
print(train_loader.dataset.train_labels[0])  # its label
def getXmean(X_train):
    X_train = np.reshape(X_train, (X_train.shape[0], -1))  # flatten each image from 2-D to 1-D
    mean_image = np.mean(X_train, axis=0)  # per-pixel mean over all training images
    return mean_image

def centralized(X_train, mean_image):
    X_train = np.reshape(X_train, (X_train.shape[0], -1))  # flatten each image from 2-D to 1-D
    X_train = X_train.astype(np.float64)
    X_train -= mean_image  # subtract the mean image: zero-centering
    return X_train
if __name__ == '__main__':
    X_train = train_loader.dataset.train_data.numpy()
    # mean_image = getXmean(X_train)
    # X_train = centralized(X_train, mean_image)
    X_train = X_train.reshape(X_train.shape[0], 28 * 28)
    y_train = train_loader.dataset.train_labels.numpy()
    X_test = test_loader.dataset.test_data[:1000].numpy()
    X_test = X_test.reshape(X_test.shape[0], 28 * 28)
    y_test = test_loader.dataset.test_labels[:1000].numpy()
    num_test = y_test.shape[0]
    y_test_pred = KNN_classify(5, 'M', X_train, y_train, X_test)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d /%d correct => accuracy: %f' % (num_correct, num_test, accuracy))
"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
tensor(5)
Got 368 /1000 correct => accuracy: 0.368000
"""
Normalization (zero-centering)
Here every sample is zero-centered by subtracting the per-pixel mean image computed on the training set; combined with Euclidean distance, this raises the accuracy substantially (see the output at the end of the listing).
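A tiny sketch of what getXmean and centralized compute, using a made-up dataset of two 2x2 "images" (illustrative values only):

import numpy as np

X = np.array([[[0, 2], [4, 6]],
              [[2, 4], [6, 8]]], dtype=np.float64)  # two 2x2 "images"

X_flat = X.reshape(X.shape[0], -1)   # flatten each image to a row vector
mean_image = X_flat.mean(axis=0)     # per-pixel mean: [1. 3. 5. 7.]
X_centered = X_flat - mean_image     # each pixel now has zero mean across samples
print(X_centered)
# [[-1. -1. -1. -1.]
#  [ 1.  1.  1.  1.]]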
import operator
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as dsets
def KNN_classify(k, dis, X_train, y_train, X_test):
    assert dis == 'E' or dis == 'M', 'dis must be E or M; E for Euclidean distance, M for Manhattan distance'
    num_test = X_test.shape[0]
    labellist = []
    if dis == 'E':
        for i in range(num_test):
            distances = np.sqrt(np.sum((X_train - np.tile(X_test[i], (X_train.shape[0], 1))) ** 2, axis=1))
            nearest_k = np.argsort(distances)  # sort distances ascending and return the indices
            topK = nearest_k[:k]               # the k nearest neighbors
            classCount = {}
            for j in topK:                     # count the votes for each class
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
    else:
        for i in range(num_test):
            # Manhattan distance: sum of absolute per-pixel differences
            distances = np.sum(np.abs(X_train - np.tile(X_test[i], (X_train.shape[0], 1))), axis=1)
            nearest_k = np.argsort(distances)
            topK = nearest_k[:k]
            classCount = {}
            for j in topK:
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
batch_size = 100

# MNIST dataset
train_dataset = dsets.MNIST(root='/ml/pymnist',  # root directory for the data
                            train=True,          # use the training split
                            transform=None,      # no preprocessing applied
                            download=True)       # download the images if absent
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False, transform=None, download=True)

# Load the data
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)  # shuffled
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())

digit = train_loader.dataset.train_data[0]  # the first training image
plt.imshow(digit, cmap=plt.cm.binary)
plt.show()
print(train_loader.dataset.train_labels[0])  # its label
def getXmean(X_train):
    X_train = np.reshape(X_train, (X_train.shape[0], -1))  # flatten each image from 2-D to 1-D
    mean_image = np.mean(X_train, axis=0)  # per-pixel mean over all training images
    return mean_image

def centralized(X_train, mean_image):
    X_train = np.reshape(X_train, (X_train.shape[0], -1))  # flatten each image from 2-D to 1-D
    X_train = X_train.astype(np.float64)
    X_train -= mean_image  # subtract the mean image: zero-centering
    return X_train
if __name__ == '__main__':
    X_train = train_loader.dataset.train_data.numpy()
    mean_image = getXmean(X_train)
    X_train = centralized(X_train, mean_image)  # centralized also flattens to (n, 784)
    y_train = train_loader.dataset.train_labels.numpy()
    X_test = test_loader.dataset.test_data[:1000].numpy()
    X_test = centralized(X_test, mean_image)  # center the test set with the TRAINING mean
    y_test = test_loader.dataset.test_labels[:1000].numpy()
    num_test = y_test.shape[0]
    y_test_pred = KNN_classify(5, 'E', X_train, y_train, X_test)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
tensor(5)
Got 963 / 1000 correct => accuracy: 0.963000
"""