TensorFlow can also be used to implement KNN
demo1
step1: Load the data
import pandas as pd
import sklearn.datasets as datasets
import tensorflow as tf
data_boston = datasets.load_boston()
df = pd.DataFrame(data_boston.data, columns=data_boston.feature_names).drop(['ZN', 'CHAS', 'RAD'], axis=1)
X, Y = df.values, data_boston.target.reshape(-1, 1)
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)
num_featurs=X.shape[1]
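KNN is distance based, so features with very different numeric ranges (for example TAX versus NOX in this dataset) can dominate the L1 distance. An optional min-max scaling sketch, not part of the original code, using sklearn's MinMaxScaler:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()                  # rescale each feature to [0, 1]
X_train = scaler.fit_transform(X_train)  # fit the ranges on the training split only
X_test = scaler.transform(X_test)        # reuse the same ranges for the test split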
step2: Declare placeholders
sess=tf.Session()
k=4
batch_size=20
x_train=tf.placeholder(shape=[None,num_featurs],dtype=tf.float32)
x_test=tf.placeholder(shape=[None,num_featurs],dtype=tf.float32)
y_train=tf.placeholder(shape=[None,1],dtype=tf.float32)
y_test=tf.placeholder(shape=[None,1],dtype=tf.float32)
distance=tf.reduce_sum(tf.abs(x_train-tf.expand_dims(x_test,1)),axis=2)
# Returns a distance matrix (relying on TF's broadcasting rules).
# The L1 norm is used here; to switch to the L2 norm, just replace tf.abs with tf.square.
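About the shapes: tf.expand_dims(x_test, 1) has shape (n_test, 1, num_featurs) and broadcasts against x_train of shape (n_train, num_featurs) to (n_test, n_train, num_featurs); summing over axis=2 gives the (n_test, n_train) distance matrix. A minimal sketch of the L2 variant under the same placeholders:
# Euclidean (L2) distance with the same broadcasting; tf.sqrt is optional because
# a monotonic transform does not change the neighbour ranking.
distance_l2 = tf.sqrt(tf.reduce_sum(tf.square(x_train - tf.expand_dims(x_test, 1)), axis=2))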
step3: Prediction stage
top_k_xvals, top_k_indices = tf.nn.top_k(-distance, k=k)
top_k_yvals = tf.gather(y_train, top_k_indices)
prediction=tf.reduce_mean(top_k_yvals,axis=1)
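tf.nn.top_k returns the largest entries, so applying it to -distance selects the k smallest distances; averaging the gathered targets gives the plain (unweighted) KNN prediction. A tiny NumPy check of the same logic on toy data (a sketch, independent of the graph above):
import numpy as np
toy_dist = np.array([[0.1, 0.5, 0.3, 0.9]])       # distances from 1 test point to 4 train points
toy_y = np.array([[1.0], [2.0], [3.0], [4.0]])    # training targets
nearest = np.argsort(toy_dist, axis=1)[:, :2]     # indices of the 2 nearest neighbours -> [[0, 2]]
print(toy_y[nearest].mean(axis=1))                # [[2.0]], the mean of targets 1.0 and 3.0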
The code below is from Nick McClure's 《TensorFlow机器学习实战指南》 and additionally weights the neighbours by distance.
It contains one error: the weights come out proportional to the distance (the correct algorithm should make them inversely proportional), although taking the plain reciprocal would cause a division-by-zero error. A corrected sketch follows after the book's code.
top_k_xvals, top_k_indices = tf.nn.top_k(-distance, k=k)
x_sums = tf.expand_dims(tf.reduce_sum(top_k_xvals, 1), 1)
x_sums_repeated = tf.matmul(x_sums, tf.ones((1, k), dtype=tf.float32))
x_val_weights = tf.expand_dims(tf.div(top_k_xvals, x_sums_repeated), 1)
# Mainly to weight the neighbours by distance
top_k_yvals = tf.gather(y_train, top_k_indices)
prediction = tf.squeeze(tf.matmul(x_val_weights, top_k_yvals), squeeze_dims=[1])
# mse = tf.div(tf.square(prediction - y_test), batch_size)
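A sketch of one possible fix (not the book's code): add a small epsilon to the non-negative distances before taking the reciprocal, so the weights are inversely proportional to distance without any division by zero, then normalise each row to sum to 1.
top_k_neg, top_k_indices = tf.nn.top_k(-distance, k=k)
top_k_dists = -top_k_neg                                   # back to non-negative distances
inv_dists = 1.0 / (top_k_dists + 1e-8)                     # epsilon avoids division by zero
weights = inv_dists / tf.reduce_sum(inv_dists, axis=1, keep_dims=True)  # rows sum to 1
weights = tf.expand_dims(weights, 1)                       # shape (n_test, 1, k)
top_k_yvals = tf.gather(y_train, top_k_indices)            # shape (n_test, k, 1)
prediction_weighted = tf.squeeze(tf.matmul(weights, top_k_yvals), axis=[1])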
step4: Predict and evaluate the results
Y_test_predict=sess.run(prediction,feed_dict={x_train:X_train,x_test:X_test,y_train:Y_train})
from sklearn.metrics import r2_score
r2_score(Y_test,Y_test_predict)
# For comparison, an sklearn implementation
from sklearn.neighbors import KNeighborsRegressor
knr=KNeighborsRegressor(n_neighbors=k)
knr.fit(X_train,Y_train)
Y_test_predict1=knr.predict(X_test)
r2_score(Y_test,Y_test_predict1)
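For the distance-weighted variant discussed above, sklearn already supports inverse-distance weights (a sketch on the same split):
knr_w = KNeighborsRegressor(n_neighbors=k, weights='distance')  # inverse-distance weighting
knr_w.fit(X_train, Y_train)
r2_score(Y_test, knr_w.predict(X_test))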
demo2: the MNIST dataset
Only a few changes are needed (because y is one-hot encoded).
step1: Read the data
Because memory is limited, the data is subsampled here (if your memory is big enough, you can keep the full training set and only subsample the test set). A batched-prediction sketch is shown after the step3 code below.
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)
train_size = 1000
test_size = 100
rand_train_indices = np.random.choice(len(mnist.train.images), train_size)
rand_test_indices = np.random.choice(len(mnist.test.images), test_size)
X_train = mnist.train.images[rand_train_indices]
Y_train = mnist.train.labels[rand_train_indices]
X_test = mnist.test.images[rand_test_indices]
Y_test = mnist.test.labels[rand_test_indices]
num_featurs = X_train.shape[1]
step2: Declare placeholders
sess=tf.Session()
k=4
x_train=tf.placeholder(shape=[None,num_featurs],dtype=tf.float32)
x_test=tf.placeholder(shape=[None,num_featurs],dtype=tf.float32)
y_train=tf.placeholder(shape=[None,10],dtype=tf.float32)
y_test=tf.placeholder(shape=[None,10],dtype=tf.float32)
distance=tf.reduce_sum(tf.abs(x_train-tf.expand_dims(x_test,1)),axis=2)
top_k_xvals, top_k_indices = tf.nn.top_k(-distance, k=k)
top_k_yvals = tf.gather(y_train, top_k_indices)
prediction=tf.reduce_mean(top_k_yvals,axis=1)
step3: Prediction stage
Y_test_predict=sess.run(prediction,feed_dict={x_train:X_train,x_test:X_test,y_train:Y_train})
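Before the reduction, the broadcast tensor has shape (n_test, n_train, num_featurs), which is where the memory pressure mentioned in step1 comes from. A sketch of batched prediction that keeps memory bounded, assuming the placeholders above (the batch size is chosen arbitrarily):
batch_size = 50
preds = []
for start in range(0, len(X_test), batch_size):
    batch = X_test[start:start + batch_size]
    preds.append(sess.run(prediction,
                          feed_dict={x_train: X_train, x_test: batch, y_train: Y_train}))
Y_test_predict = np.concatenate(preds, axis=0)  # same result as the single run above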
step4: Evaluate the results
Convert the one-hot vectors back to plain class indices, then count the correct predictions.
# This can also be done with TF: tf.argmax(prediction, dimension=1)
predict = [i.argmax() for i in Y_test_predict]
true = [i.argmax() for i in Y_test]
correct_count, total_count = 0, 0
for i in range(len(predict)):
    if predict[i] == true[i]:
        correct_count += 1
    total_count += 1
correct_count/total_count
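The same accuracy can be computed in one vectorised line (a sketch using NumPy and the arrays defined above):
accuracy = np.mean(np.argmax(Y_test_predict, axis=1) == np.argmax(Y_test, axis=1))
print(accuracy)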
References
Nick McClure:《TensorFlow机器学习实战指南》 机械工业出版社
Ian Goodfellow:《深度学习》 人民邮电出版社
王琛等:《深度学习原理与TensorFlow实战》 电子工业出版社
李嘉璇:《TensorFlow技术解析与实战》 人民邮电出版社
黄文坚:《TensorFlow实战》 电子工业出版社
郑泽宇等:《TensorFlow实战Google深度学习框架》 电子工业出版社