%matplotlib inline
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
pd.set_option('display.max_columns', 8)
from sklearn import datasets
dataset = datasets.load_boston()  # note: load_boston was removed in scikit-learn 1.2; this requires an older version
X=dataset.data
y=dataset.target
df=pd.DataFrame(dataset.data,columns=dataset.feature_names)
df.loc[:,'target']=dataset.target
df.head()
df.corr()
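# A quick way (a sketch) to surface the strongest predictors: sort the
# correlations with the target by absolute value.
df.corr()['target'].drop('target').abs().sort_values(ascending=False)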
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X=scaler.fit_transform(X)
from sklearn.linear_model import LinearRegression
regr=LinearRegression().fit(X,y)
regr.score(X,y)
regr.predict(X)
regr.coef_  # coefficients of the fitted model
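# The score above is computed in-sample; a held-out split (a sketch, with an
# assumed 70/30 split) gives a more honest estimate of generalization:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
print(LinearRegression().fit(X_tr, y_tr).score(X_te, y_te))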
df1=df.iloc[:,:-1]
def vif(df, col_i):
    """Variance inflation factor: regress col_i on all remaining columns."""
    cols = list(df.columns)
    cols.remove(col_i)
    formula = col_i + ' ~ ' + ' + '.join(cols)
    r2 = ols(formula, df).fit().rsquared
    return 1. / (1. - r2)
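# Usage sketch: compute the VIF for every predictor in df1; values above
# roughly 10 are commonly read as a sign of multicollinearity.
for col in df1.columns:
    print('{:>10}: {:.2f}'.format(col, vif(df1, col)))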
lm = ols('target ~ '+'+'.join(dataset.feature_names),data=df).fit()
lm.summary()
from sklearn.linear_model import Ridge
ridge = Ridge()
alphas = np.logspace(-2, 5, 1000, base=10)
coefs = []
scores=[]
for alpha in alphas:
    ridge.set_params(alpha=alpha)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
    scores.append(ridge.score(X, y))
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
plt.figure()
plt.semilogx(alphas, scores)  # training R^2 across the alpha grid
plt.xlabel('alpha')
plt.show()
ridge.set_params(alpha=4.67)  # refit at a fixed, hand-picked alpha
ridge.fit(X, y)
ridge.predict(X)
ridge.coef_
ridge.score(X, y)
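# Side by side (a sketch): ridge shrinks the OLS coefficients toward zero,
# most visibly for the correlated predictors.
pd.DataFrame({'OLS': regr.coef_, 'ridge': ridge.coef_}, index=dataset.feature_names)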
# statsmodels elastic net: L1_wt=0 gives ridge regression, L1_wt=1 gives lasso
lmr = ols('target ~ '+'+'.join(dataset.feature_names), data=df).fit_regularized(alpha=1, L1_wt=0)
# fit_regularized returns RegularizedResults, which has no summary();
# inspect the penalized coefficients directly:
lmr.params
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X = scaler.fit_transform(df.iloc[:,:-1])
y = df.loc[:,'target']
from sklearn.linear_model import RidgeCV
alphas = np.logspace(-2, 5, 1000, base=10)
# Search the min MSE by CV
rcv = RidgeCV(alphas=alphas, store_cv_values=True)
rcv.fit(X, y)
rcv.predict(X)
print('The best alpha is {}'.format(rcv.alpha_))
print('The r-square is {}'.format(rcv.score(X, y)))
# Default score is rsquared
rcv.coef_
cv_values = rcv.cv_values_
# With the default cv=None, RidgeCV uses efficient leave-one-out CV, so the
# shape is (n_samples, n_alphas): one squared error per sample per alpha.
n_samples, n_alphas = cv_values.shape
cv_mean = cv_values.mean(axis=0)
cv_std = cv_values.std(axis=0)
ub = cv_mean + cv_std / np.sqrt(n_samples)  # mean plus one standard error
lb = cv_mean - cv_std / np.sqrt(n_samples)  # mean minus one standard error
plt.semilogx(alphas, cv_mean, label='mean_score')
plt.fill_between(alphas, lb, ub, alpha=0.2)
plt.xlabel(r"$\alpha$")
plt.ylabel("mean squared errors")
plt.legend(loc="best")
plt.show()
# In this plot, the alpha at the lowest point of the mean CV error curve is best.
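# An alternative pick (a sketch of the "one-standard-error" rule, not part of
# the original analysis): take the largest alpha whose mean CV error is still
# within one standard error of the minimum, favoring a simpler model.
i_best = cv_mean.argmin()
within_1se = np.where(cv_mean <= cv_mean[i_best] + cv_std[i_best] / np.sqrt(n_samples))[0]
print('1-SE rule alpha: {}'.format(alphas[within_1se.max()]))  # alphas are ascending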
# The same model with L1_wt=1, i.e. the lasso penalty (again, no summary()):
lmr1 = ols('target ~ '+'+'.join(dataset.feature_names), data=df).fit_regularized(alpha=1, L1_wt=1)
lmr1.params
from sklearn.linear_model import LassoCV
lasso_alphas = np.logspace(-3, 2, 100, base=10)
lcv = LassoCV(alphas=lasso_alphas, cv=10) # Search the min MSE by CV
lcv.fit(X, y)
print('The best alpha is {}'.format(lcv.alpha_))
print('The r-square is {}'.format(lcv.score(X, y)))
# Default score is rsquared
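# Lasso performs variable selection; a quick look (a sketch) at which features
# survive at the CV-selected alpha:
for name, coef in zip(dataset.feature_names, lcv.coef_):
    print('{:>10}: {: .4f}{}'.format(name, coef, '  (dropped)' if coef == 0 else ''))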
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso_coefs = []
for alpha in lasso_alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(X, y)
    lasso_coefs.append(lasso.coef_)
ax = plt.gca()
ax.plot(lasso_alphas, lasso_coefs)
ax.set_xscale('log')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
from sklearn.linear_model import ElasticNet  # there is also ElasticNetCV
regr = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False,
                  precompute=False, max_iter=1000, copy_X=True, tol=0.0001,
                  warm_start=False, positive=False, random_state=None, selection='cyclic')
# alpha: overall regularization strength
# l1_ratio: the mixing parameter rho (0 = pure ridge penalty, 1 = pure lasso penalty)
# fit_intercept: whether to fit an intercept term
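# A minimal ElasticNetCV sketch (assumed settings: the lasso alpha grid from
# above and a small grid of l1_ratio values), cross-validating both knobs:
from sklearn.linear_model import ElasticNetCV
encv = ElasticNetCV(alphas=lasso_alphas, l1_ratio=[.1, .5, .9], cv=10)
encv.fit(X, y)
print('The best alpha is {}'.format(encv.alpha_))
print('The best l1_ratio is {}'.format(encv.l1_ratio_))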