%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the iris data set: 150 samples x 4 numeric features.
iris = datasets.load_iris()
data=iris.data
# pd.set_option('display.max_columns', 10)
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn import cluster
from sklearn.cluster import KMeans,AgglomerativeClustering
# Project the 4 features onto 2 whitened principal components.
pca_scaled_data=PCA(n_components=2,whiten=True).fit_transform(data)
# 2-factor factor-analysis projection (computed for comparison; not reused below).
fa_scaled_data=FactorAnalysis(n_components=2).fit_transform(data)
# Agglomerative (hierarchical) clustering with Ward linkage into 3 clusters.
ward = AgglomerativeClustering(n_clusters=3, linkage='ward', compute_full_tree=True)
ward.fit(iris.data)
使用scipy中层次聚类函数:
# Hierarchical clustering with SciPy, plotted as a dendrogram.
import scipy.cluster.hierarchy as sch
disMat = sch.distance.pdist(pca_scaled_data, 'euclidean') # condensed pairwise distance matrix
Z = sch.linkage(disMat, method='ward') # hierarchical clustering (Ward linkage)
P = sch.dendrogram(Z) # render the clustering hierarchy as a dendrogram
# plt.savefig('plot_dendrogram1.png') # optionally save the figure
建模:
# K-means on the 2-D PCA projection; n_init=15 restarts to avoid poor local optima.
kmeans = KMeans(n_clusters=3, n_init=15)
kmeans.fit(pca_scaled_data)
labels = kmeans.labels_            # cluster assigned to each training sample
kmeans.predict(pca_scaled_data)    # cluster assignment for (new) samples
kmeans.cluster_centers_            # coordinates of the 3 cluster centers

import seaborn as sns

k = 3  # number of clusters
n = 4  # number of features (the original comment mislabeled this "2 features")
df = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])

# Grid of density plots: one row per cluster, one column per feature,
# columns sharing the x-range so distributions are comparable.
fig, ax = plt.subplots(k, n, sharex=True)
for i in range(k):
    for j in range(n):
        # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11;
        # sns.kdeplot is the modern equivalent of hist=False.
        sns.distplot(df.loc[labels == i, df.columns[j]], hist=False, ax=ax[i, j])

# Parallel-coordinates view: one subplot per cluster, one red line per sample.
fig2, ax2 = plt.subplots(3, 1, sharex=True, sharey=True)
for i in range(3):
    data_temp = df.loc[labels == i, :]
    m_temp, n_temp = data_temp.shape
    for j in range(m_temp):
        ax2[i].plot(list(data_temp.iloc[j, :]), color='r')
plt.xticks(range(n_temp), data_temp.columns)
plt.show()
因为没有标准化,所以图看起来不太好
# Internal validity (silhouette on the clustered feature space) and external
# validity (adjusted Rand index against the true species labels).
from sklearn.metrics import silhouette_score, adjusted_rand_score
# Fixed typo in the output label: 'Sihouette' -> 'Silhouette'.
print('Silhouette:', silhouette_score(pca_scaled_data, kmeans.labels_, metric='euclidean'))
print('ARI:', adjusted_rand_score(iris.target, kmeans.labels_))

# Chain PCA (1 component) and K-means into a single estimator.
from sklearn.pipeline import Pipeline
steps = [('pca', PCA(n_components=1)), ('km', KMeans(n_clusters=3))]
pl = Pipeline(steps=steps)
pl.fit(iris.data)
# Score the silhouette in the PCA-transformed space the clusterer actually saw.
print('Silhouette: ', silhouette_score(pl.named_steps['pca'].transform(iris.data), pl.predict(iris.data)))
print('ARI: ', adjusted_rand_score(iris.target, pl.predict(iris.data)))
Standardization(标准化)是按列(对每个特征)进行,Normalization(归一化)是按行(对每个样本)进行
# Normalizer rescales each ROW (sample) to unit norm, unlike standardization,
# which operates per column (feature).
from sklearn.preprocessing import Normalizer
# Consistency fix: the 'km' step was a list ['km', KMeans(...)]; sklearn
# tolerates that, but (name, estimator) tuples are the documented form and
# match the pipeline at the start of this file.
steps2 = [('Normalize', Normalizer()), ('km', KMeans(n_clusters=3))]
pl2 = Pipeline(steps=steps2)
pl2.fit(iris.data)
# Silhouette is computed in the normalized space K-means actually clustered.
print('Silhouette: ', silhouette_score(pl2.named_steps['Normalize'].transform(iris.data), pl2.predict(iris.data)))
print('ARI: ', adjusted_rand_score(iris.target, pl2.predict(iris.data)))
因为iris数据集各特征的量纲差异并不大,所以归一化后聚类效果并没有变得更好
# Normalize (per-row L2) -> PCA(1 component) -> K-means, as one pipeline.
# Consistency fix: the 'km' step was a list; use a (name, estimator) tuple
# like the other steps.
steps3 = [('Normalize', Normalizer()), ('pca', PCA(n_components=1)), ('km', KMeans(n_clusters=3))]
pl3 = Pipeline(steps=steps3)
pl3.fit(iris.data)
# Silhouette must be computed in the space K-means clustered:
# apply the fitted Normalizer, then the fitted PCA, before scoring.
print('Silhouette: ', silhouette_score(
    pl3.named_steps['pca'].transform(pl3.named_steps['Normalize'].transform(iris.data)),
    pl3.predict(iris.data)))
print('ARI: ', adjusted_rand_score(iris.target, pl3.predict(iris.data)))
from sklearn.mixture import GaussianMixture
# Re-project WITHOUT whitening for the GMM demo (overwrites the earlier
# whitened projection).
pca_scaled_data = PCA(n_components=2).fit_transform(data)
gm = GaussianMixture(n_components=3, n_init=3)
gm.fit(pca_scaled_data)
gm.predict(pca_scaled_data)        # hard cluster assignments
gm.predict_proba(pca_scaled_data)  # soft (posterior) assignments
gm.bic(pca_scaled_data)            # Bayesian information criterion

# Hoisted: the original re-ran gm.predict() three times per loop iteration.
gm_labels = gm.predict(pca_scaled_data)
print('Silhouette: ',
      silhouette_score(pca_scaled_data, gm_labels))
print('ARI: ',
      adjusted_rand_score(iris.target, gm_labels))

# Scatter each cluster with its own marker. Loop variable renamed from
# 'cluster' to 'cluster_id' -- the original shadowed the sklearn `cluster`
# module imported at the top of the file.
for cluster_id, marker in zip(range(3), ['x', 'o', '+']):
    in_cluster = gm_labels == cluster_id
    plt.scatter(pca_scaled_data[in_cluster, 0], pca_scaled_data[in_cluster, 1], marker=marker)

# Overlay negative log-likelihood contours of the fitted mixture.
x = np.linspace(-4., 5.)
y = np.linspace(-1.5, 2.)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -gm.score_samples(XX)
Z = Z.reshape(X.shape)
plt.contour(X, Y, Z, levels=np.logspace(0, 2, 20), alpha=0.3)
plt.show()
聚类个数的选择
# Model selection: compute the BIC for every combination of covariance
# structure and cluster count, then plot one curve per covariance type.
cv_types = ['spherical', 'tied', 'diag', 'full']
ks = range(1, 7)
cv_bic = {
    cov: {
        n_comp: GaussianMixture(n_components=n_comp,
                                covariance_type=cov).fit(iris.data).bic(iris.data)
        for n_comp in ks
    }
    for cov in cv_types
}
candidates = pd.DataFrame(cv_bic)
candidates.plot(style=['--', '-', '-.', ':'])
plt.show()
# Density-based clustering; DBSCAN chooses the number of clusters itself.
from sklearn.cluster import DBSCAN
pca_scaled_data = PCA(n_components=2).fit_transform(data)
dbscan = DBSCAN(eps=0.3, min_samples=6)
dbscan.fit(pca_scaled_data)
dbscan.labels_  # label -1 marks noise/outlier points
from sklearn.metrics import adjusted_rand_score
print('ARI:', adjusted_rand_score(iris.target, dbscan.labels_))
# ARI restricted to the points DBSCAN actually assigned (label > -1).
mask = dbscan.labels_ > -1
print('ARI: ', adjusted_rand_score(iris.target[mask], dbscan.labels_[mask]))
plt.figure(figsize=[4, 3])
# Loop variable renamed from 'cluster' to 'cluster_id' -- the original
# shadowed the sklearn `cluster` module imported at the top of the file.
for cluster_id, marker in zip(range(-1, 3), ['^', 'x', 'o', '.']):
    x_axis = pca_scaled_data[:, 0][dbscan.labels_ == cluster_id]
    y_axis = pca_scaled_data[:, 1][dbscan.labels_ == cluster_id]
    plt.scatter(x_axis, y_axis, marker=marker)
plt.show()