Pythonによる時系列分析の基礎

https://logics-of-blue.com/python-time-series-analysis/

In [2]:
# 基本のライブラリを読み込む
import numpy as np
import pandas as pd
from scipy import stats

# グラフ描画
from matplotlib import pylab as plt
import seaborn as sns
%matplotlib inline
sns.set()

# グラフを横長にする
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

# 統計モデル
import statsmodels.api as sm

# ワーニングを表示させない
import warnings
warnings.filterwarnings('ignore')

時系列データの読み込み

In [3]:
# 普通にデータを読み込む
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/AirPassengers.html
dataNormal = pd.read_csv('AirPassengers.csv')
dataNormal.head()
Out[3]:
Month #Passengers
0 1949-01 112
1 1949-02 118
2 1949-03 132
3 1949-04 129
4 1949-05 121
In [4]:
# 日付形式で読み込む(dtype=floatで読み込まないと、あとでARIMAモデル推定時にエラーとなる)
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m')
data_old_method = pd.read_csv('AirPassengers.csv', index_col='Month', date_parser=dateparse, dtype='float')
data_old_method.head()
Out[4]:
#Passengers
Month
1949-01-01 112.0
1949-02-01 118.0
1949-03-01 132.0
1949-04-01 129.0
1949-05-01 121.0
In [5]:
# 2018年7月7日追記
# 以下のコードの方が読み込みは簡単
data = pd.read_csv('AirPassengers.csv', 
                      index_col='Month', 
                      parse_dates=True, 
                      dtype='float')
data.head()
Out[5]:
#Passengers
Month
1949-01-01 112.0
1949-02-01 118.0
1949-03-01 132.0
1949-04-01 129.0
1949-05-01 121.0
In [6]:
# 日付形式にする
ts = data['#Passengers'] 
ts.head()
Out[6]:
Month
1949-01-01    112.0
1949-02-01    118.0
1949-03-01    132.0
1949-04-01    129.0
1949-05-01    121.0
Name: #Passengers, dtype: float64

時系列データの取り扱い

In [7]:
# プロット
plt.plot(ts)
Out[7]:
[<matplotlib.lines.Line2D at 0x237ce0dcc18>]
In [8]:
# データの取得方法その1
ts['1949-01-01']
Out[8]:
112.0
In [9]:
# データの取得方法その2
from datetime import datetime
ts[datetime(1949,1, 1)]
Out[9]:
112.0
In [10]:
# 1949年のデータをすべて取ってくる
ts['1949']
Out[10]:
Month
1949-01-01    112.0
1949-02-01    118.0
1949-03-01    132.0
1949-04-01    129.0
1949-05-01    121.0
1949-06-01    135.0
1949-07-01    148.0
1949-08-01    148.0
1949-09-01    136.0
1949-10-01    119.0
1949-11-01    104.0
1949-12-01    118.0
Name: #Passengers, dtype: float64
In [11]:
# シフト
ts.shift().head()
Out[11]:
Month
1949-01-01      NaN
1949-02-01    112.0
1949-03-01    118.0
1949-04-01    132.0
1949-05-01    129.0
Name: #Passengers, dtype: float64
In [12]:
# 差分系列
diff = ts - ts.shift()
diff.head()
Out[12]:
Month
1949-01-01     NaN
1949-02-01     6.0
1949-03-01    14.0
1949-04-01    -3.0
1949-05-01    -8.0
Name: #Passengers, dtype: float64
In [13]:
# 2018年7月7日追記
# こちらの方が簡単
ts.diff().head()
Out[13]:
Month
1949-01-01     NaN
1949-02-01     6.0
1949-03-01    14.0
1949-04-01    -3.0
1949-05-01    -8.0
Name: #Passengers, dtype: float64
In [14]:
# 対数差分系列
logDiff = np.log(ts) - np.log(ts.shift())

# NaNを取り除てから表示
logDiff.dropna().head()
Out[14]:
Month
1949-02-01    0.052186
1949-03-01    0.112117
1949-04-01   -0.022990
1949-05-01   -0.064022
1949-06-01    0.109484
Name: #Passengers, dtype: float64

自己相関係数の推定

In [15]:
# 自己相関を求める
ts_acf = sm.tsa.stattools.acf(ts, nlags=40)
ts_acf
Out[15]:
array([ 1.        ,  0.94804734,  0.87557484,  0.80668116,  0.75262542,
        0.71376997,  0.6817336 ,  0.66290439,  0.65561048,  0.67094833,
        0.70271992,  0.74324019,  0.76039504,  0.71266087,  0.64634228,
        0.58592342,  0.53795519,  0.49974753,  0.46873401,  0.44987066,
        0.4416288 ,  0.45722376,  0.48248203,  0.51712699,  0.53218983,
        0.49397569,  0.43772134,  0.3876029 ,  0.34802503,  0.31498388,
        0.28849682,  0.27080187,  0.26429011,  0.27679934,  0.2985215 ,
        0.32558712,  0.3370236 ,  0.30333486,  0.25397708,  0.21065534,
        0.17217092])
In [16]:
# 偏自己相関
ts_pacf = sm.tsa.stattools.pacf(ts, nlags=40, method='ols')
ts_pacf
Out[16]:
array([ 1.        ,  0.95893198, -0.32983096,  0.2018249 ,  0.14500798,
        0.25848232, -0.02690283,  0.20433019,  0.15607896,  0.56860841,
        0.29256358,  0.8402143 ,  0.61268285, -0.66597616, -0.38463943,
        0.0787466 , -0.02663483, -0.05805221, -0.04350748,  0.27732556,
       -0.04046447,  0.13739883,  0.3859958 ,  0.24203808, -0.04912986,
       -0.19599778, -0.15443575,  0.04484465,  0.18371541, -0.0906113 ,
       -0.06202938,  0.34827092,  0.09899499, -0.08396793,  0.36328898,
       -0.17956662,  0.15839435,  0.06376775, -0.27503705,  0.2707607 ,
        0.32002003])
In [17]:
#  自己相関のグラフ
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(ts, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(ts, lags=40, ax=ax2)

ARIMAモデルの推定

In [18]:
# たぶん和分過程なので、差分をとる
diff = ts - ts.shift()
diff = diff.dropna()
diff.head()
Out[18]:
Month
1949-02-01     6.0
1949-03-01    14.0
1949-04-01    -3.0
1949-05-01    -8.0
1949-06-01    14.0
Name: #Passengers, dtype: float64
In [19]:
# 差分系列のグラフ
plt.plot(diff)
Out[19]:
[<matplotlib.lines.Line2D at 0x237ce5f4780>]
In [20]:
# 差分系列への自動ARMA推定関数の実行
resDiff = sm.tsa.arma_order_select_ic(diff, ic='aic', trend='nc')
resDiff
Out[20]:
{'aic':              0            1            2
 0          NaN  1397.257791  1397.093436
 1  1401.852641  1412.615224  1385.496795
 2  1396.587654  1378.338024  1353.175669
 3  1395.021214  1379.614000  1351.138701
 4  1388.216680  1379.616584  1373.560615, 'aic_min_order': (3, 2)}
In [21]:
# P-3, q=2 が最善となったので、それをモデル化
from statsmodels.tsa.arima_model import ARIMA
ARIMA_3_1_2 = ARIMA(ts, order=(3, 1, 2)).fit(dist=False)
ARIMA_3_1_2.params
Out[21]:
const                  2.673484
ar.L1.D.#Passengers    0.261979
ar.L2.D.#Passengers    0.367849
ar.L3.D.#Passengers   -0.363463
ma.L1.D.#Passengers   -0.075008
ma.L2.D.#Passengers   -0.924821
dtype: float64
In [22]:
# 残差のチェック
# SARIMAじゃないので、周期性が残ってしまっている。。。
resid = ARIMA_3_1_2.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

SARIMAモデルの推定

In [23]:
# SARIMAモデルを「決め打ち」で推定する
import statsmodels.api as sm

SARIMA_3_1_2_111 = sm.tsa.SARIMAX(ts, order=(3,1,2), seasonal_order=(1,1,1,12)).fit(method='bfgs', maxiter=300)
print(SARIMA_3_1_2_111.summary())
Warning: Desired error not necessarily achieved due to precision loss.
         Current function value: 3.511414
         Iterations: 7
         Function evaluations: 55
         Gradient evaluations: 43
                                 Statespace Model Results                                 
==========================================================================================
Dep. Variable:                        #Passengers   No. Observations:                  144
Model:             SARIMAX(3, 1, 2)x(1, 1, 1, 12)   Log Likelihood                -505.644
Date:                            Sat, 07 Jul 2018   AIC                           1027.287
Time:                                    10:33:16   BIC                           1051.046
Sample:                                01-01-1949   HQIC                          1036.941
                                     - 12-01-1960                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.4409      1.357      0.325      0.745      -2.218       3.100
ar.L2          0.0507      0.663      0.076      0.939      -1.249       1.351
ar.L3         -0.0873      0.259     -0.337      0.736      -0.596       0.421
ma.L1         -0.7838      1.356     -0.578      0.563      -3.442       1.874
ma.L2          0.0613      1.153      0.053      0.958      -2.198       2.321
ar.S.L12      -0.1699      0.969     -0.175      0.861      -2.068       1.728
ma.S.L12       0.0476      1.003      0.048      0.962      -1.917       2.013
sigma2       137.6133     17.590      7.823      0.000     103.137     172.089
===================================================================================
Ljung-Box (Q):                       54.47   Jarque-Bera (JB):                 4.71
Prob(Q):                              0.06   Prob(JB):                         0.09
Heteroskedasticity (H):               2.80   Skew:                             0.08
Prob(H) (two-sided):                  0.00   Kurtosis:                         3.91
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [24]:
# 残差のチェック
residSARIMA = SARIMA_3_1_2_111.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residSARIMA.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residSARIMA, lags=40, ax=ax2)
In [25]:
# 予測
pred = SARIMA_3_1_2_111.predict('1960-01-01', '1961-12-01')
print(pred)
1960-01-01    420.364131
1960-02-01    398.838840
1960-03-01    453.338125
1960-04-01    420.523949
1960-05-01    472.223706
1960-06-01    529.449243
1960-07-01    602.833858
1960-08-01    626.121889
1960-09-01    513.956313
1960-10-01    453.777408
1960-11-01    415.123863
1960-12-01    440.413298
1961-01-01    448.989212
1961-02-01    427.307245
1961-03-01    461.384314
1961-04-01    497.595829
1961-05-01    510.265920
1961-06-01    571.708245
1961-07-01    657.388219
1961-08-01    644.623095
1961-09-01    546.890641
1961-10-01    498.720577
1961-11-01    430.927496
1961-12-01    473.146432
Freq: MS, dtype: float64
In [26]:
# 実データと予測結果の図示
plt.plot(ts)
plt.plot(pred, "r")
Out[26]:
[<matplotlib.lines.Line2D at 0x237ce52d630>]

総当たり法によるSARIMAモデル次数の決定

In [27]:
# 総当たりで、AICが最小となるSARIMAの次数を探す
max_p = 3
max_q = 3
max_d = 1
max_sp = 1
max_sq = 1
max_sd = 1

pattern = max_p*(max_q + 1)*(max_d + 1)*(max_sp + 1)*(max_sq + 1)*(max_sd + 1)

modelSelection = pd.DataFrame(index=range(pattern), columns=["model", "aic"])
pattern
Out[27]:
192
In [28]:
# 自動SARIMA選択
num = 0

for p in range(1, max_p + 1):
    for d in range(0, max_d + 1):
        for q in range(0, max_q + 1):
            for sp in range(0, max_sp + 1):
                for sd in range(0, max_sd + 1):
                    for sq in range(0, max_sq + 1):
                        sarima = sm.tsa.SARIMAX(
                            ts, order=(p,d,q), 
                            seasonal_order=(sp,sd,sq,12), 
                            enforce_stationarity = False, 
                            enforce_invertibility = False
                        ).fit(method='bfgs', maxiter=300, disp=False)
                        modelSelection.ix[num]["model"] = "order=(" + str(p) + ","+ str(d) + ","+ str(q) + "), season=("+ str(sp) + ","+ str(sd) + "," + str(sq) + ")"
                        modelSelection.ix[num]["aic"] = sarima.aic
                        num = num + 1
In [29]:
modelSelection
Out[29]:
model aic
0 order=(1,0,0), season=(0,0,0) 1415.91
1 order=(1,0,0), season=(0,0,1) 1205.39
2 order=(1,0,0), season=(0,1,0) 1029.98
3 order=(1,0,0), season=(0,1,1) 944.385
4 order=(1,0,0), season=(1,0,0) 1017.32
5 order=(1,0,0), season=(1,0,1) 1007.03
6 order=(1,0,0), season=(1,1,0) 944.044
7 order=(1,0,0), season=(1,1,1) 945.44
8 order=(1,0,1), season=(0,0,0) 1390.45
9 order=(1,0,1), season=(0,0,1) 1192.29
10 order=(1,0,1), season=(0,1,0) 1014.25
11 order=(1,0,1), season=(0,1,1) 929.433
12 order=(1,0,1), season=(1,0,0) 1009.59
13 order=(1,0,1), season=(1,0,1) 989.176
14 order=(1,0,1), season=(1,1,0) 935.816
15 order=(1,0,1), season=(1,1,1) 935.915
16 order=(1,0,2), season=(0,0,0) 1381.52
17 order=(1,0,2), season=(0,0,1) 1185.89
18 order=(1,0,2), season=(0,1,0) 1009.29
19 order=(1,0,2), season=(0,1,1) 923.304
20 order=(1,0,2), season=(1,0,0) 1010.71
21 order=(1,0,2), season=(1,0,1) 984.278
22 order=(1,0,2), season=(1,1,0) 937.696
23 order=(1,0,2), season=(1,1,1) 929.569
24 order=(1,0,3), season=(0,0,0) 1354.88
25 order=(1,0,3), season=(0,0,1) 1161.8
26 order=(1,0,3), season=(0,1,0) 1000.8
27 order=(1,0,3), season=(0,1,1) 915.052
28 order=(1,0,3), season=(1,0,0) 1011.19
29 order=(1,0,3), season=(1,0,1) 979.4
... ... ...
162 order=(3,1,0), season=(0,1,0) 1003
163 order=(3,1,0), season=(0,1,1) 931.842
164 order=(3,1,0), season=(1,0,0) 997.193
165 order=(3,1,0), season=(1,0,1) 983.289
166 order=(3,1,0), season=(1,1,0) 916.573
167 order=(3,1,0), season=(1,1,1) 919.056
168 order=(3,1,1), season=(0,0,0) 1353.66
169 order=(3,1,1), season=(0,0,1) 1167.2
170 order=(3,1,1), season=(0,1,0) 997.603
171 order=(3,1,1), season=(0,1,1) 918.466
172 order=(3,1,1), season=(1,0,0) 993.436
173 order=(3,1,1), season=(1,0,1) 983.971
174 order=(3,1,1), season=(1,1,0) 911.376
175 order=(3,1,1), season=(1,1,1) 913.492
176 order=(3,1,2), season=(0,0,0) 1352
177 order=(3,1,2), season=(0,0,1) 1142.78
178 order=(3,1,2), season=(0,1,0) 999.602
179 order=(3,1,2), season=(0,1,1) 913.457
180 order=(3,1,2), season=(1,0,0) 994.036
181 order=(3,1,2), season=(1,0,1) 987.185
182 order=(3,1,2), season=(1,1,0) 913.265
183 order=(3,1,2), season=(1,1,1) 914.041
184 order=(3,1,3), season=(0,0,0) 1337.23
185 order=(3,1,3), season=(0,0,1) 1134.12
186 order=(3,1,3), season=(0,1,0) 988.935
187 order=(3,1,3), season=(0,1,1) 898.105
188 order=(3,1,3), season=(1,0,0) 992.115
189 order=(3,1,3), season=(1,0,1) 979.579
190 order=(3,1,3), season=(1,1,0) 910.008
191 order=(3,1,3), season=(1,1,1) 903.239

192 rows × 2 columns

In [30]:
# AIC最小モデル
modelSelection[modelSelection.aic == min(modelSelection.aic)]
Out[30]:
model aic
187 order=(3,1,3), season=(0,1,1) 898.105
In [31]:
# 参考:次数がすべて0だとエラーになる 
# sarima = sm.tsa.SARIMAX(ts, order=(0,0,0), seasonal_order=(0,0,0,12), enforce_stationarity = False).fit()
In [32]:
bestSARIMA = sm.tsa.SARIMAX(ts, order=(3,1,3), seasonal_order=(0,1,1,12), enforce_stationarity = False, enforce_invertibility = False).fit()
In [33]:
print(bestSARIMA.summary())
                                 Statespace Model Results                                 
==========================================================================================
Dep. Variable:                        #Passengers   No. Observations:                  144
Model:             SARIMAX(3, 1, 3)x(0, 1, 1, 12)   Log Likelihood                -441.052
Date:                            Sat, 07 Jul 2018   AIC                            898.105
Time:                                    10:35:56   BIC                            921.863
Sample:                                01-01-1949   HQIC                           907.759
                                     - 12-01-1960                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.2231      0.097     -2.302      0.021      -0.413      -0.033
ar.L2         -0.1642      0.108     -1.515      0.130      -0.377       0.048
ar.L3          0.7244      0.094      7.704      0.000       0.540       0.909
ma.L1         -0.0837    111.130     -0.001      0.999    -217.895     217.728
ma.L2          0.1221    139.158      0.001      0.999    -272.622     272.866
ma.L3         -0.9797    243.017     -0.004      0.997    -477.285     475.325
ma.S.L12      -0.1583      0.118     -1.337      0.181      -0.390       0.074
sigma2       119.6719   2.97e+04      0.004      0.997   -5.81e+04    5.83e+04
===================================================================================
Ljung-Box (Q):                       36.68   Jarque-Bera (JB):                 4.39
Prob(Q):                              0.62   Prob(JB):                         0.11
Heteroskedasticity (H):               1.87   Skew:                             0.16
Prob(H) (two-sided):                  0.06   Kurtosis:                         3.90
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [34]:
# 残差のチェック
residSARIMA = bestSARIMA.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residSARIMA, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residSARIMA, lags=40, ax=ax2)
In [35]:
# 予測
bestPred = bestSARIMA.predict('1960-01-01', '1961-12-01')
# 実データと予測結果の図示
plt.plot(ts)
plt.plot(bestPred, "r")
Out[35]:
[<matplotlib.lines.Line2D at 0x237cf0c2668>]