目录
(一)示例
(二)使用scikit-learn包实现
编辑:
校对:
版本:
Jiangnan
Jiangnan
python3
示例
使用K-近邻算法改进约会网站的配对效果,我们将进行下面几个过程:
(1) 准备数据:从文本文件中读取数据
(2) 分析数据:使用matplotlib 画散点图
(3) 准备数据:归一化处理
(4) 测试算法:判断分类效果
from numpy import * #将函数库中的所有模块导入
import operator #导入运算符模块
# Read the dating data file into a feature matrix and a label vector.
def file2matrix(filename):
    """Parse a tab-separated data file into features and labels.

    Each line holds three numeric feature columns followed by an integer
    class label.

    Args:
        filename: path to the text file to read.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array of features and
        a list of int labels, in file order.
    """
    # Read once under a context manager: the original opened the file twice
    # (once just to count lines) and never closed either handle.
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Min-max normalization of the feature matrix.
def autoNorm(dataSet):
    """Scale every column of dataSet linearly into [0, 1].

    Args:
        dataSet: (n, m) numeric array.

    Returns:
        (normDataSet, ranges, minVals): the normalized array, the per-column
        value range (max - min), and the per-column minimums — the latter two
        are needed to normalize future query points the same way.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting replaces the original tile() copies; the dead
    # zeros(shape(dataSet)) allocation that was immediately overwritten
    # is removed.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
#分类函数
def classify0(inX, dataSet, labels, k):
dataSetSize = dataSet.shape[0]
diffMat = tile(inX, (dataSetSize,1)) - dataSet
sqDiffMat = diffMat**2
sqDistances = sqDiffMat.sum(axis=1)
distances = sqDistances**0.5
sortedDistIndicies = distances.argsort()
classCount={}
for i in range(k):
voteIlabel = labels[sortedDistIndicies[i]]
classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
# Hold-out evaluation of the classifier.
def datingClassTest():
    """Evaluate the dating classifier with a 50% hold-out split.

    The first half of 'datingTestSet2.txt' serves as queries and the second
    half as training data; each prediction and the overall error rate are
    printed.
    """
    hoRatio = 0.50  # fraction of rows held out as the test set
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    total = normMat.shape[0]
    numTestVecs = int(total * hoRatio)
    # Training portion is fixed for the whole loop, so slice it once.
    trainMat = normMat[numTestVecs:total, :]
    trainLabels = datingLabels[numTestVecs:total]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
# Scatter plot of the second vs. third feature column.
import matplotlib
import matplotlib.pyplot as plt

# Load the data explicitly: returnMat is local to file2matrix, so running
# this snippet at module level as originally written raised a NameError.
returnMat, classLabelVector = file2matrix('datingTestSet2.txt')

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(returnMat[:, 1], returnMat[:, 2])
plt.show()
复制
复制
使用scikit-learn包实现
我们可以用上面的代码读取数据和归一化之后用下面的代码进行预测和测试。
代码:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Load the data explicitly: returnMat / classLabelVector are local to
# file2matrix, so the snippet as originally written raised a NameError.
returnMat, classLabelVector = file2matrix('datingTestSet2.txt')

np.random.seed(0)  # fixed seed so the shuffled split is reproducible
indices = np.random.permutation(len(returnMat))
# The last 10 shuffled samples form the test set; the rest train the model.
x_train = returnMat[indices[:-10]]
y_train = np.array(classLabelVector)[indices[:-10]]
x_test = returnMat[indices[-10:]]
y_test = np.array(classLabelVector)[indices[-10:]]

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
# Predicted labels for the held-out test samples.
y_predict = knn.predict(x_test)
# Per-class probability estimates for each test sample.
probility = knn.predict_proba(x_test)
# Indices of the 5 training points nearest to the last test sample.
neighborpoint = knn.kneighbors([x_test[-1]], 5, False)
# Mean accuracy of the classifier on the test set.
score = knn.score(x_test, y_test, sample_weight=None)

print('y_predict = ')
print(y_predict)
# Print the true labels alongside for easy comparison.
print('y_test = ')
print(y_test)
print('Accuracy:', score)
print('neighborpoint of last test sample:', neighborpoint)
print('probility:')
print(probility)
复制
思考——学而不思则罔
(1)当数据含有非常多的特征时,如何运用sklearn 中的K-近邻算法快速地得到分类结果?
(2)如何更加合理地确定K的取值?

理解编程语言,探索数据奥秘
每日练习|干货分享|新闻资讯|公益平台。
每天学习一点点,你将会见到全新的自己。

长按识别二维码关注