import pandas as pd
import sklearn.datasets as datasets
# Load the UCI servo data set.  header=None tells pandas the file carries no
# header row, so the five column names are supplied explicitly.
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/servo/servo.data',
    header=None,
    names=['motor', 'screw', 'pgain', 'vgain', 'target'],
)
df.head()  # peek at the first rows
df.describe(include='all')  # summary statistics for every column, numeric or not
df.plot.box()  # box plots
import statsmodels.api as sm

# Alternatively, use DescrStatsW to get descriptive statistics of 'target'.
d1 = sm.stats.DescrStatsW(df['target'])
d1.tconfint_mean(alpha=0.05)  # t-based confidence interval for the mean, alpha=0.05
%matplotlib inline
import seaborn as sns
from scipy import stats
sns.distplot(df.loc[:,'target'], kde=True, fit=stats.norm) # Histograph
import statsmodels.api as sm
from matplotlib import pyplot as plt

# Q-Q plot of 'target'; fit=True estimates the reference distribution's
# parameters from the data and line='45' draws the 45-degree reference line.
fig = sm.qqplot(df['target'], line='45', fit=True)
import pandas as pd
import scipy

# Try the normality tests on data that is normal by construction:
# 4000 draws from the standard normal distribution N(0, 1).
rv = stats.norm(loc=0, scale=1)
df_rv = pd.DataFrame({'data': rv.rvs(size=4000)})
sample = df_rv['data']

# Each test returns the test statistic and the p-value.
print('Jarque-Bera test:', stats.jarque_bera(sample))
print('Shapiro-Wilk test:', stats.shapiro(sample))
stats.kstest(rvs=sample, cdf='norm')
# One-sample t-test of H0: mean(target) == 1.15.  The result is formatted as
# (t statistic, p-value, degrees of freedom).
ttest_mean_res = d1.ttest_mean(1.15)
't-statistic=%6.4f, p-value=%6.4f, df=%s' % ttest_mean_res
p-value > 0.05,说明在 0.05 的显著性水平下不能拒绝原假设(H0:均值 = 1.15),即没有足够证据表明 target 的均值不等于 1.15(注意:这并不等于证明均值就是 1.15)。
先做个描述性统计
# Descriptive statistics of 'target' within each motor type.
target_by_motor = df.groupby('motor')['target']
target_by_motor.describe()
挑选前两类电机(motor)做 T 检验;做 T 检验之前,先用 Levene 检验判断这两组的方差是否齐性
# Collect the 'target' values of each motor type (groupby yields the groups in
# sorted key order) and keep the first three as df1, df2, df3.  Slicing a list
# instead of unpacking a fixed number of (key, group) pairs means this no
# longer raises when the data has a different number of motor types; at least
# three groups are still required.
motor_targets = [targets for _, targets in df.groupby('motor')['target']]
df1, df2, df3 = motor_targets[:3]

# Levene's test for equality of variances between the first two groups,
# centred on the median.
leveneTestRes = stats.levene(df1, df2, center='median')
print('w-value=%6.4f, p-value=%6.4f'
      % (leveneTestRes.statistic, leveneTestRes.pvalue))
p-value < 0.05,说明拒绝原假设,认为方差不齐
p-value > 0.05,说明无法拒绝原假设,认为方差齐性
# Two-sample t-test on the first two motor groups without assuming equal
# variances (usevar='unequal'); pass usevar='pooled' for the equal-variance form.
# SciPy alternative: stats.ttest_ind(df1, df2, equal_var=True)
ttest_ind_res = sm.stats.ttest_ind(df1, df2, usevar='unequal')
'statistic=%6.4f, pvalue=%6.4f, df=%6.4f' % ttest_ind_res
pvalue < 0.05 说明拒绝原假设,认为两组均值不相等;pvalue > 0.05 说明无法拒绝原假设,不能认为两组均值存在显著差异。
参考文档:http://www.statsmodels.org/stable/generated/statsmodels.stats.weightstats.CompareMeans.ttest_ind.html
# One-way ANOVA of 'target' across motor types.
# Build one Series of 'target' values per motor type, in order of first
# appearance in the data, and pass them all to f_oneway.
motor_target_groups = [
    df.loc[df['motor'] == motor, 'target'] for motor in df['motor'].unique()
]
stats.f_oneway(*motor_target_groups)
# Analysis of variance computed from fitted OLS regression models.
from statsmodels.formula.api import ols

# Single factor: motor.
motor_model = ols('target ~ C(motor)', data=df).fit()
sm.stats.anova_lm(motor_model)

# Two factors, main effects only: motor and screw.
main_effects_model = ols('target ~ C(motor) + C(screw)', data=df).fit()
sm.stats.anova_lm(main_effects_model)

# Two factors plus their interaction.
ana = ols('target ~ C(motor) + C(screw) +C(motor)*C(screw)', data=df).fit()
sm.stats.anova_lm(ana)
散点图
import seaborn as sns

# Joint plot of target (y) against vgain (x), drawn as a scatter plot.
sns.jointplot(data=df, x='vgain', y='target', kind='scatter')
相关性分析:method 参数可选 "pearson"、"spearman" 和 "kendall"
# Pearson correlation matrix of pgain and target.
# For the other coefficients pass method='spearman' or method='kendall'.
df[['pgain', 'target']].corr(method='pearson')
# Contingency table: number of observations for every motor/screw combination.
# pd.crosstab reports 0 for a combination that never occurs, whereas counting
# through pivot_table leaves NaN there, which turns the chi-square result
# into NaN.
cross_table = pd.crosstab(df['motor'], df['screw'])
cross_table

# Chi-square test of independence between motor and screw.  The result is
# unpacked explicitly so each format field is matched to a named value.
chisq, p_value, dof, expected_freq = stats.chi2_contingency(cross_table)
print('chisq = %6.4f\n p-value = %6.4f\n dof = %i\n expected_freq = %s'
      % (chisq, p_value, dof, expected_freq))