import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pylab as plt
df = pd.read_csv('http://www.guofei.site/StatisticsBlog/origin_files/timeseries/AirPassengers.csv',header=0,encoding='utf-8', index_col=0)
df.index.name='date'
df.index = pd.to_datetime(df.index) # 将字符串索引转换成时间索引
ts = df['NumPassengers'] # 生成pd.Series对象
ts.head()
def draw_ts(timeSeries):
f = plt.figure(facecolor='white')
timeSeries.plot(color='blue')
plt.show()
draw_ts(ts)
# 移动平均图
def draw_trend(timeSeries, size):
f = plt.figure(facecolor='white')
# 对size个数据进行移动平均
rol_mean = timeSeries.rolling(window=size).mean()
# 对size个数据进行加权移动平均
rol_weighted_mean = timeSeries.ewm( span=size).mean()
timeSeries.plot(color='blue', label='Original')
rol_mean.plot(color='red', label='Rolling Mean')
rol_weighted_mean.plot(color='black', label='Weighted Rolling Mean')
plt.legend(loc='best')
plt.title('Rolling Mean')
plt.show()
draw_trend(ts, size=5)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# 自相关和偏相关图
def draw_acf_pacf(ts, lags=31):
f,ax=plt.subplots(2,1)
plot_acf(ts, lags=lags, ax=ax[0])
plot_pacf(ts, lags=lags, ax=ax[1])
plt.subplots_adjust(0,0,1,1,0,0)
plt.show()
draw_acf_pacf(ts)
from statsmodels.tsa.arima_process import arma_generate_sample
ts_rv=arma_generate_sample(ar=[0.3,-0.3,0.2],ma=[1],nsample=1000)
draw_acf_pacf(ts_rv)
AR(p) | MA(q) | ARMA(p,q) | |
---|---|---|---|
ACF | 拖尾 | q期后截尾 | 拖尾 |
PACF | p期后截尾 | 拖尾 | 拖尾 |
from statsmodels.tsa.stattools import adfuller
'''
Unit Root Test
The null hypothesis of the Augmented Dickey-Fuller is that there is a unit
root, with the alternative that there is no unit root. That is to say the
bigger the p-value the more reason we assert that there is a unit root
'''
def testStationarity(ts):
dftest = adfuller(ts)
# 对上述函数求得的值进行语义描述
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
return dfoutput
testStationarity(ts)
# p-value>0.05,证明序列非平稳
H0:存在单位根,此时非平稳
案例汇中p较大,因此不能拒绝原假设。认为时间序列不平稳
ts_log = np.log(ts)
draw_ts(ts_log)
draw_trend(ts_log, 12)
diff_12 = ts_log.diff(12)
diff_12.dropna(inplace=True)
diff_12_1 = diff_12.diff(1)
diff_12_1.dropna(inplace=True)
testStationarity(diff_12_1)
因此使用diff_12_1来做ARMA
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(ts_log, model="additive")
# model="additive" 加法模型
# model='multiplicative' 乘法模型
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
f=plt.figure()
ax1 = f.add_subplot(311)
trend.plot()
ax2 = f.add_subplot(312)
seasonal.plot()
ax3 = f.add_subplot(313)
residual.plot()
plt.subplots_adjust(0,0,1,1,0,0)
plt.show()
(我们选用移动平均和差分法生成的平稳数据)
rol_mean = ts_log.rolling(window=12).mean()
rol_mean.dropna(inplace=True)
ts_diff_1 = rol_mean.diff(1)
ts_diff_1.dropna(inplace=True)
testStationarity(ts_diff_1)
不显著,因此再次diff
ts_diff_2 = ts_diff_1.diff(1)
ts_diff_2.dropna(inplace=True)
testStationarity(ts_diff_2)
from statsmodels.tsa.arima_model import ARMA
model = ARMA(ts_diff_2, order=(1, 1)) #order=(1,1,1)就是ARIMA
result_arma = model.fit( disp=-1, method='css')
result_arma.aic
result_arma.arparams
result_arma.maparams
predict_ts=result_arma.predict()
predict_ts = result_arma.predict()
# 一阶差分还原
diff_shift_ts = ts_diff_1.shift(1)
diff_recover_1 = predict_ts.add(diff_shift_ts)
# 再次一阶差分还原
rol_shift_ts = rol_mean.shift(1)
diff_recover = diff_recover_1.add(rol_shift_ts)
# 移动平均还原
rol_sum = ts_log.rolling(window=11).sum()
rol_recover = diff_recover*12 - rol_sum.shift(1)
# 对数还原
log_recover = np.exp(rol_recover)
log_recover.dropna(inplace=True)
ts = ts[log_recover.index] # 过滤没有预测的记录
plt.figure(facecolor='white')
log_recover.plot(color='blue', label='Predict')
ts.plot(color='red', label='Original')
plt.legend(loc='best')
plt.title('RMSE: %.4f'% np.sqrt(sum((log_recover-ts)**2)/ts.size))
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('ARIMA([0.6],1,0).csv', index_col=0)
ts = df['x']
ts.plot()
plt.show()
# 自相关和偏相关图
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
f,ax=plt.subplots(2,1)
plot_acf(ts, lags=51, ax=ax[0])
plot_pacf(ts, lags=51, ax=ax[1])
plt.subplots_adjust(0,0,1,1,0,0)
plt.show()
from statsmodels.tsa.stattools import adfuller
dftest=adfuller(ts, autolag='AIC') # 返回统计量、p-value 等
dftest
# p-value>0.05, 说明ts非平稳
ts_diff=ts.diff(1)[1:]
adfuller(ts_diff, autolag='AIC')
# p-value<0.05,说明序列平稳,可以使用统计时序模型进行建模
# 自相关和偏相关图
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
f,ax=plt.subplots(2,1)
plot_acf(ts_diff, lags=51, ax=ax[0])
plot_pacf(ts_diff, lags=51, ax=ax[1])
plt.subplots_adjust(0,0,1,1,0,0)
plt.show()
import statsmodels.tsa.api as smt
arima_1 = smt.ARIMA(df,order=(1,1,0)).fit()
print(arima_1.summary())
from statsmodels.stats.diagnostic import acorr_ljungbox
lags=5
rdtest = acorr_ljungbox(arima_1.resid,lags=lags)
pd.DataFrame({'test statistic':rdtest[0],'p-value':rdtest[1]},index=np.arange(lags)+1)
# p-value>0.05,不能拒绝原假设,认为是白噪声序列
arima_1.plot_predict(plot_insample=True,dynamic=False)
df_predict=arima_1.predict(start=1,end=998,exog=None,typ='levels',dynamic=False)
# 因为做了一阶差分,因此start=1,
# typ='levels' 表示预测差分以前的ts
# typ='linear' 猜测是无视了差分(不确定)
arima_1.forecast(steps=10,alpha=0.05)
arima_1.plot_predict(start=10,end=1020,plot_insample=True)
df_forecast=arima_1.forecast(steps=3,alpha=0.05)
pd.concat([pd.DataFrame(df_forecast[0], columns=['点估计']),
pd.DataFrame(df_forecast[1], columns=['标准差']),
pd.DataFrame(df_forecast[2], columns=['区间估计下限(95%)','区间估计下限(95%)'])],
axis=1)
df_forecast=arima_1.forecast(steps=3,alpha=0.05)
pd.DataFrame({'点估计':df_forecast[0],'标准差':df_forecast[1],
'区间估计下限(95%)':df_forecast[2][:,0],
'区间估计上限(95%)':df_forecast[2][:,1]})
# 这个模型会更细节一些
call_arima_final = smt.SARIMAX(ts, order=(1,1,1),seasonal_order=(0,1,0,52)).fit()
call_arima_final.summary()
call_arima_final.forecast(3)