import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis, FastICA
from sklearn import preprocessing
import matplotlib.pyplot as plt
import os
%matplotlib inline
pd.set_option('display.max_columns', None)
# Load the Breast Cancer Wisconsin dataset; '?' marks missing values and column 0 (the sample ID) is used as the index.
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
                 na_values='?', header=None, index_col=0)
df = df.loc[:, :5]          # keep the first five feature columns
df = df.dropna(how='any')   # drop rows with any missing value
df.head()
g = sns.PairGrid(df)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_diag(sns.kdeplot, lw=3, legend=False);
Compute the correlation coefficient matrix
df.corr(method='pearson')
Standardization is usually required before PCA; it is skipped here.
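If you do standardize, a minimal sketch using the already imported preprocessing module could look like this (df_scaled is only an illustrative name; the cells below keep working on the unscaled df):
# Scale each feature to zero mean and unit variance so that no single feature dominates the components.
scaler = preprocessing.StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)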
First, inspect the explained variance ratio of the principal components
pca = PCA(n_components=5, whiten=True)
newData = pca.fit_transform(df)
pca.explained_variance_ratio_
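As a quick check, the cumulative explained variance can also be plotted (a minimal sketch using only the libraries imported above):
# Plot the cumulative explained variance to see how many components are worth keeping.
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()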
The output shows that the data can be reduced to 3 dimensions.
pca = PCA(n_components=3, whiten=True)
pca.fit(df)
pca.explained_variance_        # eigenvalues of each component
pca.explained_variance_ratio_  # proportion of variance explained by each component
pca.components_                # loadings: one row per component, one column per original feature
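To make the loadings easier to read, they can be wrapped in a labelled DataFrame (a sketch; the row labels 'f1'..'f3' simply match the score columns used below):
# One row per principal component, one column per original feature.
loadings = pd.DataFrame(pca.components_, index=['f1', 'f2', 'f3'], columns=df.columns)
loadings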
score=pca.transform(df)
Join the component scores with the original data
score = pd.DataFrame(score, index=df.index, columns=['f1', 'f2', 'f3'])  # align scores with the original index
data_new = df.join(score)           # append the component scores as new columns
data_new.sort_values('f1').head()   # rank samples by the first principal component
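To eyeball the reduced space, the first two scores can be scattered against each other (a sketch using matplotlib only):
# Scatter the samples in the plane of the first two principal components.
plt.scatter(data_new['f1'], data_new['f2'], s=10)
plt.xlabel('f1')
plt.ylabel('f2')
plt.show()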