import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis, FastICA
from sklearn import preprocessing
import matplotlib.pyplot as plt
import os
%matplotlib inline
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the breast-cancer-wisconsin data file from the UCI repository.
# The file has no header row, '?' is read as a missing value, and the first
# file column becomes the index.  Only the columns labelled up to 5 are kept
# (label-based slice), and rows with any missing value are then dropped.
df = (
    pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
        na_values='?',
        header=None,
        index_col=0,
    )
    .loc[:, :5]
    .dropna(how='any')
)
df.head()
# Pairwise overview of the columns kept in df: one grid cell per column pair.
g = sns.PairGrid(df)
# Upper triangle: scatter plots.
g.map_upper(plt.scatter)
# Lower triangle: kernel-density plots drawn with the "Blues_d" colormap.
g.map_lower(sns.kdeplot, cmap="Blues_d")
# Diagonal: kernel-density plot per column, line width 3, no legend.
# NOTE(review): the trailing ';' presumably suppresses the echoed return
# value when this runs as a notebook cell -- confirm before removing it.
g.map_diag(sns.kdeplot, lw=3, legend=False);
# Pearson correlation matrix of the same columns.
df.corr(method='pearson')
# Fit a 4-component, whitened PCA on the selected columns and keep the
# projected samples in `newData`.
# `fit_transform` is used rather than `fit`: `fit` returns the estimator
# itself, so assigning its result to `newData` would store the PCA object
# instead of the transformed data.
pca = PCA(n_components=4, whiten=True)
newData = pca.fit_transform(df)

# Share of the total variance explained by each of the retained components.
pca.explained_variance_ratio_
# 降维到2维 (reduce the data to 2 dimensions)
from sklearn.decomposition import FactorAnalysis

# Build a 2-component factor-analysis model and fit it on the selected
# columns in one step (`fit` returns the fitted estimator).
fa = FactorAnalysis(n_components=2).fit(df)

# Inspect the fitted component matrix and the transformed samples.
fa.components_
fa.transform(df)

# Wrap the two transformed columns in a DataFrame that shares df's index,
# so they can be joined back onto the original rows.
newData = pd.DataFrame(
    fa.transform(df),
    index=df.index,
    columns=['f1', 'f2'],
)
df.join(newData).head()