【PyTorch】KNN实战之MNIST数据分类与归一化处理

KNN的算法实现

首先创建一个演示数据集

import numpy as np
import matplotlib.pyplot as plt


# 给出训练数据以及对应的类别

def createDataSet():
    """Build the small two-class demo training set.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 6x2 float array of
        points and ``labels`` is the matching array of 'A'/'B' class tags.
    """
    # Each entry pairs one 2-D point with the class it belongs to.
    samples = (
        ((1.0, 2.0), 'A'),
        ((1.2, 0.1), 'A'),
        ((0.1, 1.4), 'B'),
        ((0.3, 3.5), 'B'),
        ((1.1, 1.0), 'A'),
        ((0.5, 1.5), 'B'),
    )
    group = np.array([point for point, _ in samples])
    labels = np.array([tag for _, tag in samples])
    return group, labels


if __name__ == '__main__':
    group, labels = createDataSet()
    # Draw one scatter series per class: A as red '*' (star) markers,
    # B as green '+' (plus) markers.
    for tag, colour, shape in (('A', 'r', '*'), ('B', 'g', '+')):
        members = group[labels == tag]
        plt.scatter(members[:, 0], members[:, 1], color=colour, marker=shape)
    plt.show()

代码介绍:

  • createDataSet用于创建训练数据集及其对应的类别;group是二维训练数据,每一行的两个分量分别对应x轴和y轴坐标
  • labels对应的是训练集的标签
  • 使用Matplotlib绘制图形,scatter绘制散点图

[图片缺失:原文此处的外链图片转存失败(应为上面代码绘制出的散点图),请运行上面的代码查看图形]

Python基于欧氏距离实现KNN分类器

def KNN_classify(k, dis, X_train, x_train, Y_test):
    """Classify every test sample by majority vote of its k nearest neighbours.

    Args:
        k: Number of nearest training samples that take part in the vote.
        dis: Distance metric, 'E' for Euclidean or 'M' for Manhattan.
        X_train: NumPy array of training samples, one row per sample.
        x_train: Training labels, indexed in step with the rows of X_train.
        Y_test: NumPy array of test samples, one row per sample.

    Returns:
        A NumPy array with one predicted label per row of Y_test.

    Raises:
        AssertionError: If dis is neither 'E' nor 'M' (not checked when
            Python runs with -O, which strips assert statements).
    """
    # Imported here so the function does not depend on a module-level import.
    from collections import Counter

    assert dis == 'E' or dis == 'M', 'dis must E or M,E代表欧氏距离,M代表曼哈顿距离'
    num_test = Y_test.shape[0]
    # Work in float64: subtracting unsigned integer arrays (such as uint8
    # pixel data) wraps around instead of going negative, which corrupts
    # the distances.
    train = X_train.astype(np.float64, copy=False)
    test = Y_test.astype(np.float64, copy=False)

    labellist = []
    for row in range(num_test):
        diff = train - test[row]  # broadcasting compares the sample with every training row
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]  # indices of the k smallest distances
        # Count the labels of the k neighbours; on a tie the label seen
        # first (i.e. belonging to the closer neighbour) wins.
        votes = Counter(x_train[idx] for idx in topK)
        labellist.append(votes.most_common(1)[0][0])
    # Return once, after every test sample has been classified.
    return np.array(labellist)

测试KNN算法

需要注意的是,输入测试集时需要先将其转换为NumPy数组(np.array),否则程序会报错:传入的参数是list类型,没有shape属性

if __name__ == '__main__':
    group, labels = createDataSet()
    # Two query points: the first is expected to be classified as A,
    # the second as B.
    queries = np.array([[1.0, 2.1], [0.4, 2.0]])
    y_test_pred = KNN_classify(1, 'E', group, labels, queries)
    print(y_test_pred)

完整代码

import operator

import numpy as np
import matplotlib.pyplot as plt


# 给出训练数据以及对应的类别

def createDataSet():
    """Build the small two-class demo training set.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 6x2 float array of
        points and ``labels`` is the matching array of 'A'/'B' class tags.
    """
    # Each entry pairs one 2-D point with the class it belongs to.
    samples = (
        ((1.0, 2.0), 'A'),
        ((1.2, 0.1), 'A'),
        ((0.1, 1.4), 'B'),
        ((0.3, 3.5), 'B'),
        ((1.1, 1.0), 'A'),
        ((0.5, 1.5), 'B'),
    )
    group = np.array([point for point, _ in samples])
    labels = np.array([tag for _, tag in samples])
    return group, labels


if __name__ == '__main__':
    group, labels = createDataSet()
    # Draw one scatter series per class: A as red '*' (star) markers,
    # B as green '+' (plus) markers.
    for tag, colour, shape in (('A', 'r', '*'), ('B', 'g', '+')):
        members = group[labels == tag]
        plt.scatter(members[:, 0], members[:, 1], color=colour, marker=shape)
    plt.show()


def KNN_classify(k, dis, X_train, x_train, Y_test):
    """Classify every test sample by majority vote of its k nearest neighbours.

    Args:
        k: Number of nearest training samples that take part in the vote.
        dis: Distance metric, 'E' for Euclidean or 'M' for Manhattan.
        X_train: NumPy array of training samples, one row per sample.
        x_train: Training labels, indexed in step with the rows of X_train.
        Y_test: NumPy array of test samples, one row per sample.

    Returns:
        A NumPy array with one predicted label per row of Y_test.

    Raises:
        AssertionError: If dis is neither 'E' nor 'M' (not checked when
            Python runs with -O, which strips assert statements).
    """
    # Imported here so the function does not depend on a module-level import.
    from collections import Counter

    assert dis == 'E' or dis == 'M', 'dis must E or M,E为欧氏距离,M为曼哈顿距离'
    num_test = Y_test.shape[0]
    # Work in float64: subtracting unsigned integer arrays (such as uint8
    # pixel data) wraps around instead of going negative, which corrupts
    # the distances.
    train = X_train.astype(np.float64, copy=False)
    test = Y_test.astype(np.float64, copy=False)

    label_list = []
    for row in range(num_test):
        diff = train - test[row]  # broadcasting compares the sample with every training row
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]  # indices of the k smallest distances
        # Count the labels of the k neighbours; on a tie the label seen
        # first (i.e. belonging to the closer neighbour) wins.
        votes = Counter(x_train[idx] for idx in topK)
        label_list.append(votes.most_common(1)[0][0])
    return np.array(label_list)


if __name__ == '__main__':
    group, labels = createDataSet()
    # Classify two query points with their single nearest neighbour
    # under the Euclidean metric.
    queries = np.array([[1.0, 2.1], [0.4, 2.0]])
    y_test_pred = KNN_classify(1, 'E', group, labels, queries)
    print(y_test_pred)

#  ['A' 'B']

KNN实战

KNN实现MNIST数据分类

MNIST数据集是一个很经典的且很常用的数据集(类似图像处理中的“Hello World!”),它是一个基本的数据集,因此我们可以直接使用PyTorch框架进行数据下载与读取

执行代码的过程是一个比较长的过程

import torch
from torch.utils.data import DataLoader
import torchvision.datasets as dsets
import torchvision.transforms as transforms

batch_size = 100
# MNIST dataset
train_dataset = dsets.MNIST(root='/ml/pymnist',  # root directory where the data is stored
                            train=True,  # select the training split
                            transform=None,  # no preprocessing transform is applied
                            download=True)  # ask torchvision to download the data
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False, transform=None, download=True)
# Wrap the datasets in loaders that shuffle and serve batch_size samples at a time.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# The test loader must wrap the held-out test split, not the training split.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

# Report the tensor sizes of both splits.
print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())

"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
"""

MNIST数据集在KNN算法下的分类准确度

比较方法:逐像素比较,最后将所有像素差值的绝对值相加。如果两张图片完全相同,则差异值为0;差异值越大,图片之间的差异越大。

import operator
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as dsets
import matplotlib.pyplot as plt
import numpy as np

# 给出训练数据以及对应的类别
from KNN import train_loader, test_loader


def createDataSet():
    """Build the small two-class demo training set.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 6x2 float array of
        points and ``labels`` is the matching array of 'A'/'B' class tags.
    """
    # Each entry pairs one 2-D point with the class it belongs to.
    samples = (
        ((1.0, 2.0), 'A'),
        ((1.2, 0.1), 'A'),
        ((0.1, 1.4), 'B'),
        ((0.3, 3.5), 'B'),
        ((1.1, 1.0), 'A'),
        ((0.5, 1.5), 'B'),
    )
    group = np.array([point for point, _ in samples])
    labels = np.array([tag for _, tag in samples])
    return group, labels


if __name__ == '__main__':
    group, labels = createDataSet()
    # Draw one scatter series per class: A as red '*' (star) markers,
    # B as green '+' (plus) markers.
    for tag, colour, shape in (('A', 'r', '*'), ('B', 'g', '+')):
        members = group[labels == tag]
        plt.scatter(members[:, 0], members[:, 1], color=colour, marker=shape)
    plt.show()


def KNN_classify(k, dis, X_train, x_train, Y_test):
    """Classify every test sample by majority vote of its k nearest neighbours.

    Args:
        k: Number of nearest training samples that take part in the vote.
        dis: Distance metric, 'E' for Euclidean or 'M' for Manhattan.
        X_train: NumPy array of training samples, one row per sample.
        x_train: Training labels, indexed in step with the rows of X_train.
        Y_test: NumPy array of test samples, one row per sample.

    Returns:
        A NumPy array with one predicted label per row of Y_test.

    Raises:
        AssertionError: If dis is neither 'E' nor 'M' (not checked when
            Python runs with -O, which strips assert statements).
    """
    # Imported here so the function does not depend on a module-level import.
    from collections import Counter

    assert dis == 'E' or dis == 'M', 'dis must E or M,E为欧氏距离,M为曼哈顿距离'
    num_test = Y_test.shape[0]
    # Work in float64: subtracting unsigned integer arrays (such as uint8
    # pixel data) wraps around instead of going negative, which corrupts
    # the distances.
    train = X_train.astype(np.float64, copy=False)
    test = Y_test.astype(np.float64, copy=False)

    label_list = []
    for row in range(num_test):
        diff = train - test[row]  # broadcasting compares the sample with every training row
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]  # indices of the k smallest distances
        # Count the labels of the k neighbours; on a tie the label seen
        # first (i.e. belonging to the closer neighbour) wins.
        votes = Counter(x_train[idx] for idx in topK)
        label_list.append(votes.most_common(1)[0][0])
    return np.array(label_list)


batch_size = 100

# Load both MNIST splits from /ml/pymnist; download=True asks torchvision
# to fetch the data, and transform=None applies no preprocessing.
train_dataset = dsets.MNIST(root='/ml/pymnist', train=True, transform=None, download=True)
test_dataset = dsets.MNIST(
    root='/ml/pymnist',
    train=False,
    transform=None,
    download=True,
)

# Shuffling loaders that serve batch_size samples at a time.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Report the tensor sizes of both splits.
print("train_data:", train_dataset.train_data.shape)
print("train_labels:", train_dataset.train_labels.shape)
print("test_data:", test_dataset.test_data.shape)
print("test_labels:", test_dataset.test_labels.shape)

# Preview the first training image and print its label.
digit = train_dataset.train_data[0]
plt.imshow(digit, cmap=plt.cm.binary)
plt.show()
print(train_dataset.train_labels[0])


def getXmean(X_train):
    """Compute the mean image of a training set.

    Args:
        X_train: Array whose first axis indexes the samples.

    Returns:
        A 1-D array holding, for every flattened pixel position, the
        average over all samples.
    """
    sample_count = X_train.shape[0]
    # Flatten each sample into a single row before averaging over the samples.
    flattened = np.reshape(X_train, (sample_count, -1))
    return flattened.mean(axis=0)


def centralized(X_train, mean_image):
    """Flatten the samples and subtract the mean image from each of them.

    Args:
        X_train: Array whose first axis indexes the samples.
        mean_image: Per-pixel mean to subtract from every flattened sample.

    Returns:
        A new float64 array with one zero-centred row per sample.
    """
    # astype() yields a float64 copy, so the caller's array is left untouched.
    flat = np.reshape(X_train, (X_train.shape[0], -1)).astype(np.float64)
    np.subtract(flat, mean_image, out=flat)  # in-place subtraction on the copy
    return flat


if __name__ == '__main__':
    train_set, test_set = train_loader.dataset, test_loader.dataset
    pixels_per_image = 28 * 28

    # Training data: every 28x28 image is flattened into one row.
    # Mean-centering is not applied in this run.
    X_train = train_set.train_data.numpy()
    X_train = X_train.reshape(X_train.shape[0], pixels_per_image)
    y_train = train_set.train_labels.numpy()

    # Only the first 1000 test images are evaluated.
    X_test = test_set.test_data[:1000].numpy()
    X_test = X_test.reshape(X_test.shape[0], pixels_per_image)
    y_test = test_set.test_labels[:1000].numpy()

    # 5-nearest-neighbour vote with the Manhattan metric.
    y_test_pred = KNN_classify(5, 'M', X_train, y_train, X_test)
    num_test = y_test.shape[0]
    num_correct = (y_test_pred == y_test).sum()
    accuracy = float(num_correct) / num_test
    print('Got %d /%d correct => accuracy: %f' % (num_correct, num_test, accuracy))
"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
tensor(5)
Got 368 /1000 correct => accuracy: 0.368000
"""

归一化(零均值化)

import operator
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as dsets
import matplotlib.pyplot as plt
import numpy as np

# 给出训练数据以及对应的类别
from KNN import train_loader, test_loader


def createDataSet():
    """Build the small two-class demo training set.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 6x2 float array of
        points and ``labels`` is the matching array of 'A'/'B' class tags.
    """
    # Each entry pairs one 2-D point with the class it belongs to.
    samples = (
        ((1.0, 2.0), 'A'),
        ((1.2, 0.1), 'A'),
        ((0.1, 1.4), 'B'),
        ((0.3, 3.5), 'B'),
        ((1.1, 1.0), 'A'),
        ((0.5, 1.5), 'B'),
    )
    group = np.array([point for point, _ in samples])
    labels = np.array([tag for _, tag in samples])
    return group, labels


if __name__ == '__main__':
    group, labels = createDataSet()
    # Draw one scatter series per class: A as red '*' (star) markers,
    # B as green '+' (plus) markers.
    for tag, colour, shape in (('A', 'r', '*'), ('B', 'g', '+')):
        members = group[labels == tag]
        plt.scatter(members[:, 0], members[:, 1], color=colour, marker=shape)
    plt.show()


def KNN_classify(k, dis, X_train, x_train, Y_test):
    """Classify every test sample by majority vote of its k nearest neighbours.

    Args:
        k: Number of nearest training samples that take part in the vote.
        dis: Distance metric, 'E' for Euclidean or 'M' for Manhattan.
        X_train: NumPy array of training samples, one row per sample.
        x_train: Training labels, indexed in step with the rows of X_train.
        Y_test: NumPy array of test samples, one row per sample.

    Returns:
        A NumPy array with one predicted label per row of Y_test.

    Raises:
        AssertionError: If dis is neither 'E' nor 'M' (not checked when
            Python runs with -O, which strips assert statements).
    """
    # Imported here so the function does not depend on a module-level import.
    from collections import Counter

    assert dis == 'E' or dis == 'M', 'dis must E or M,E为欧氏距离,M为曼哈顿距离'
    num_test = Y_test.shape[0]
    # Work in float64: subtracting unsigned integer arrays (such as uint8
    # pixel data) wraps around instead of going negative, which corrupts
    # the distances.
    train = X_train.astype(np.float64, copy=False)
    test = Y_test.astype(np.float64, copy=False)

    label_list = []
    for row in range(num_test):
        diff = train - test[row]  # broadcasting compares the sample with every training row
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]  # indices of the k smallest distances
        # Count the labels of the k neighbours; on a tie the label seen
        # first (i.e. belonging to the closer neighbour) wins.
        votes = Counter(x_train[idx] for idx in topK)
        label_list.append(votes.most_common(1)[0][0])
    return np.array(label_list)


batch_size = 100

# Load both MNIST splits from /ml/pymnist; download=True asks torchvision
# to fetch the data, and transform=None applies no preprocessing.
train_dataset = dsets.MNIST(root='/ml/pymnist', train=True, transform=None, download=True)
test_dataset = dsets.MNIST(
    root='/ml/pymnist',
    train=False,
    transform=None,
    download=True,
)

# Shuffling loaders that serve batch_size samples at a time.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Report the tensor sizes of both splits.
print("train_data:", train_dataset.train_data.shape)
print("train_labels:", train_dataset.train_labels.shape)
print("test_data:", test_dataset.test_data.shape)
print("test_labels:", test_dataset.test_labels.shape)

# Preview the first training image and print its label.
digit = train_dataset.train_data[0]
plt.imshow(digit, cmap=plt.cm.binary)
plt.show()
print(train_dataset.train_labels[0])


def getXmean(X_train):
    """Compute the mean image of a training set.

    Args:
        X_train: Array whose first axis indexes the samples.

    Returns:
        A 1-D array holding, for every flattened pixel position, the
        average over all samples.
    """
    sample_count = X_train.shape[0]
    # Flatten each sample into a single row before averaging over the samples.
    flattened = np.reshape(X_train, (sample_count, -1))
    return flattened.mean(axis=0)


def centralized(X_train, mean_image):
    """Flatten the samples and subtract the mean image from each of them.

    Args:
        X_train: Array whose first axis indexes the samples.
        mean_image: Per-pixel mean to subtract from every flattened sample.

    Returns:
        A new float64 array with one zero-centred row per sample.
    """
    # astype() yields a float64 copy, so the caller's array is left untouched.
    flat = np.reshape(X_train, (X_train.shape[0], -1)).astype(np.float64)
    np.subtract(flat, mean_image, out=flat)  # in-place subtraction on the copy
    return flat


if __name__ == '__main__':
    train_set, test_set = train_loader.dataset, test_loader.dataset

    # Training data: compute the mean image, then flatten and zero-centre
    # every sample (centralized() also converts the pixels to float64).
    X_train = train_set.train_data.numpy()
    mean_image = getXmean(X_train)
    X_train = centralized(X_train, mean_image)
    y_train = train_set.train_labels.numpy()

    # The first 1000 test images are centred with the training mean.
    X_test = centralized(test_set.test_data[:1000].numpy(), mean_image)
    y_test = test_set.test_labels[:1000].numpy()

    # 5-nearest-neighbour vote with the Euclidean metric.
    y_test_pred = KNN_classify(5, 'E', X_train, y_train, X_test)
    num_test = y_test.shape[0]
    num_correct = (y_test_pred == y_test).sum()
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

    """
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
tensor(5)
Got 963 / 1000 correct => accuracy: 0.963000
    """

1.复利积累,量变引起质变。比如读书,运动,正确的认知
2.正向循环,变成更好的自己,优秀是一种习惯。比如早起读书写作一小时,运动,冥想,看电影听音乐。
3.成长性思维,享受不断成长的过程。
4.长期主义,比如慢慢变富。
5.沸水效应,类似习惯培养,集中培养一个习惯固定好了,再培养下一个习惯。比如阅读专注,运动。
6.吸引效应,积极的心态去面对问题。
7.以终为始,决策定位。比如定好目标再分解到每天去做。
8.二八定律,重要的事情只有两成,做好这两成,比如阅读只有两成是重点,读书是为了改变行动,重点在落到行动力上。
9.不同的人看法不同,由认知层次决定。
10.终身成长,走出舒适圈,不要给自己设限。

人从来不是一成不变的,生活会追着你一而再的脱胎换骨。只要愿意,你就永远有机会成为一个不断前行着的人,成为那个自己喜欢着的模样。

文章出处登录后可见!

已经登录?立即刷新

共计人评分,平均

到目前为止还没有投票!成为第一位评论此文章。

(0)
青葱年少的头像青葱年少普通用户
上一篇 2022年3月18日 下午9:20
下一篇 2022年3月18日 下午9:39

相关推荐