大图, 下载
数据源:http://archive.ics.uci.edu/ml/datasets/Auto+MPG
%matplotlib inline
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
df=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',sep='\s+',na_values='?',
header=None,names=['mpg','cylinders','displacement','horsepower','weight','acceleration','model_year','origin','car_name'])
df.head()
df=df.dropna(how='any')
df.describe()
散点图
plt.plot(df.loc[:,'displacement'],df.loc[:,'mpg'],'.')
plt.show()
df.corr(method='pearson')
lm_s = ols('mpg ~ displacement', data=df).fit()
lm_s.params
Predict-在原始数据集上得到预测值和残差
lm_s.summary()
pd.DataFrame([lm_s.predict(df), lm_s.resid], index=['predict', 'resid']).T.head()
plt.plot(df.displacement,df.mpg,'.')
plt.plot(df.displacement,lm_s.predict(df.displacement))
plt.show()
lm_m = ols('mpg ~ cylinders + displacement + horsepower + weight', data=df).fit()
lm_m.summary()
'''forward select'''
def forward_select(data, response):
remaining = set(data.columns)
remaining.remove(response)
selected = []
current_score, best_new_score = float('inf'), float('inf')
while remaining:
aic_with_candidates=[]
for candidate in remaining:
formula = "{} ~ {}".format(response,' + '.join(selected + [candidate]))
aic = ols(formula=formula, data=data).fit().aic
aic_with_candidates.append((aic, candidate))
aic_with_candidates.sort(reverse=True)
best_new_score, best_candidate=aic_with_candidates.pop()
if current_score > best_new_score:
remaining.remove(best_candidate)
selected.append(best_candidate)
current_score = best_new_score
print ('aic is {},continuing!'.format(current_score))
else:
print ('forward selection over!')
break
formula = "{} ~ {} ".format(response,' + '.join(selected))
print('final formula is {}'.format(formula))
model = ols(formula=formula, data=data).fit()
return(model)
lm_m = forward_select(data=df, response='mpg')
print(lm_m.rsquared)
ana1 = ols('mpg ~ displacement', data=df).fit()
plt.plot(df.displacement,lm_s.resid,'.')
plt.show()
遇到异方差情况,教科书上会介绍使用加权最小二乘法,但是实际上最常用的是对被解释变量取对数
df2=df.copy()
df2['mpg_ln']=np.log(df2['mpg'])
df2['displacement_ln']=np.log(df2['displacement'])
ana2 = ols('mpg_ln ~ displacement', data=df2).fit()
plt.plot(df2.displacement,ana2.resid,'.')
plt.show()
ana3 = ols('mpg_ln ~ displacement_ln', data=df2).fit()
plt.plot(df2.displacement_ln,ana2.resid,'.')
plt.show()
寻找最优的模型
r_sq = {'mpg~displacement':ana1.rsquared, 'ln(mpg)~displacement':ana2.rsquared, 'ln(mpg)~ln(displacement)':ana3.rsquared}
print(r_sq)
df2['resid']=ana3.resid
df2['resid_t'] = (df2['resid'] - df2['resid'].mean()) / df2['resid'].std()
Find outlier:
df2[abs(df2['resid_t']) > 2].head(5)
Drop outlier
df2 = df2[abs(df2['resid_t']) <= 2].copy()
ana4 = ols('mpg_ln ~ displacement_ln', df2).fit()
plt.plot(df2.displacement_ln,ana4.resid,'.')
plt.show()
ana4.rsquared
statemodels包提供了更多强影响点判断指标
from statsmodels.stats.outliers_influence import OLSInfluence
OLSInfluence(ana3).summary_frame().head()
用以上的方法,增加变量,做多元回归
df2['horsepower_ln']=np.log(df2['horsepower'])
df2['weight_ln']=np.log(df2['weight'])
ana5=ols('mpg_ln ~ displacement_ln + horsepower_ln + weight_ln + acceleration + model_year ',data=df2).fit()
ana5.summary()
ana5.bse # The standard errors of the parameter estimates
The function "statsmodels.stats.outliers_influence.variance_inflation_factor" uses "OLS" to fit data, and it will generates a wrong rsquared. So define it ourselves!
def vif(df, col_i):
cols = list(df.columns)
cols.remove(col_i)
cols_noti = cols
formula = col_i + '~' + '+'.join(cols_noti)
r2 = ols(formula, df).fit().rsquared
return 1. / (1. - r2)
dfog = df2[['displacement_ln', 'horsepower_ln', 'weight_ln','acceleration','model_year']]
for i in dfog.columns:
print(i, '\t', vif(df=dfog, col_i=i))
displacement_ln与weight_ln具有共线性,可剔除
这里用比率去制造一个新feature
df2['d_w_ratio'] = df2['weight']/df2['displacement']
exog1 = df2[['d_w_ratio','horsepower_ln','acceleration','model_year']]
for i in exog1.columns:
print(i, '\t', vif(df=exog1, col_i=i))
ana6 = ols('mpg_ln ~ d_w_ratio + horsepower_ln + acceleration + model_year',data=df2).fit()
ana6.summary()
var_select = df2[['mpg_ln','d_w_ratio','cylinders','acceleration','model_year']]
ana7 = forward_select(data=var_select, response='mpg_ln')
print(ana7.rsquared)
formula8 = '''
mpg_ln ~ weight_ln + model_year + d_w_ratio +
C(origin)
'''
ana8 = ols(formula8, df2).fit()
ana8.summary()
formula9 = '''
mpg_ln ~ weight_ln + model_year + d_w_ratio +
C(origin):C(cylinders)
'''
ana9 = ols(formula9, df2).fit()
ana9.summary()
lmr = ols('mpg ~ displacement+ horsepower+ weight+model_year ',
data=df).fit_regularized(alpha=1, L1_wt=0)
lmr.summary()
lmrl = ols('mpg ~ displacement+ horsepower+ weight+model_year ',
data=df).fit_regularized(alpha=1, L1_wt=1)
lmrl.summary()
from sklearn.linear_model import RidgeCV
X = df2[['displacement', 'horsepower', 'weight', 'model_year']]
y = df2[['mpg_ln']]
# alphas = np.logspace(-3, 1, 20, base=10)
alphas=np.linspace(0.1,100,200)
rcv = RidgeCV(alphas=alphas, store_cv_values=True) # Search the min MSE by CV
rcv.fit(X, y)
print('The best alpha is {}'.format(rcv.alpha_))
print('The r-square is {}'.format(rcv.score(X,y))) # Default score is rsquared
rcv.predict(X)[:5]
cv_values = rcv.cv_values_[:, 0, :]
cv_mean = cv_values.mean(axis=0)
cv_std = cv_values.std(axis=0)
ub = cv_mean + 0.0001*cv_std
lb = cv_mean - 0.0001*cv_std
# plt.semilogx(alphas, cv_mean, label='mean_score')
plt.plot(alphas, cv_mean, label='mean_score')
plt.fill_between(alphas, lb, ub, alpha=0.2)
plt.xlabel("$\\alpha$")
plt.ylabel("mean squared errors")
plt.legend(loc="best")
plt.show()
rcv.coef_
from sklearn.linear_model import Ridge
ridge = Ridge()
coefs = []
for alpha in alphas:
ridge.set_params(alpha=alpha)
ridge.fit(X, y)
coefs.append(ridge.coef_)
coefs
ax = plt.gca()
ax.plot(alphas, coefs)
# ax.set_xscale('log')
# plt.xlabel('alpha')
# plt.ylabel('weights')
# plt.title('Ridge coefficients as a function of the regularization')
# plt.axis('tight')
plt.show()