time_series
ARIMA and SARIMA
hayleyhell
2023. 1. 10. 16:36
In [9]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
In [10]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.tools import diff
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults
!pip install pmdarima
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.api import VAR
from statsmodels.tools.eval_measures import rmse
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: pmdarima in /usr/local/lib/python3.8/dist-packages (2.0.2) Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (0.29.32) Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.0.2) Requirement already satisfied: urllib3 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.24.3) Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.3.5) Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.7.3) Requirement already satisfied: numpy>=1.21.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.21.6) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.2.0) Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (57.4.0) Requirement already satisfied: statsmodels>=0.13.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (0.13.5) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.19->pmdarima) (2022.7) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.19->pmdarima) (2.8.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.22->pmdarima) (3.1.0) Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.8/dist-packages (from statsmodels>=0.13.2->pmdarima) (21.3) Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.8/dist-packages (from statsmodels>=0.13.2->pmdarima) (0.5.3) Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.8/dist-packages (from packaging>=21.3->statsmodels>=0.13.2->pmdarima) (3.0.9) Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from patsy>=0.5.2->statsmodels>=0.13.2->pmdarima) (1.15.0)
In [11]:
df1 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/DailyTotalFemaleBirths.csv', index_col='Date', parse_dates=True)
df1.index.freq = "D"
df1 = df1[:120]
In [12]:
df2 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/TradeInventories.csv', index_col='Date', parse_dates=True)
df2.index.freq = "MS"
ARIMA¶
In [13]:
df2.plot(figsize=(12,8));
In [15]:
result = seasonal_decompose(df2['Inventories'], model='add')
result.plot();
계절성 범위가 매우 작다 = 계절성 모델을 사용하지 않는다.
In [35]:
stepwise_fit = auto_arima(df2['Inventories'], start_p=0, start_q=0, max_p=2, max_q=2,
seasonal=False, trace=True)
stepwise_fit.summary()
Performing stepwise search to minimize aic ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=5348.037, Time=0.06 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=5399.843, Time=0.09 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=5350.241, Time=0.17 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=5409.217, Time=0.06 sec ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=5378.835, Time=0.69 sec Best model: ARIMA(0,1,0)(0,0,0)[0] intercept Total fit time: 1.104 seconds
Out[35]:
| Dep. Variable: | y | No. Observations: | 264 |
|---|---|---|---|
| Model: | SARIMAX(0, 1, 0) | Log Likelihood | -2672.018 |
| Date: | Tue, 10 Jan 2023 | AIC | 5348.037 |
| Time: | 06:51:59 | BIC | 5355.181 |
| Sample: | 01-01-1997 | HQIC | 5350.908 |
| - 12-01-2018 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 3258.3802 | 470.991 | 6.918 | 0.000 | 2335.255 | 4181.506 |
| sigma2 | 3.91e+07 | 2.95e+06 | 13.250 | 0.000 | 3.33e+07 | 4.49e+07 |
| Ljung-Box (L1) (Q): | 82.61 | Jarque-Bera (JB): | 100.74 |
|---|---|---|---|
| Prob(Q): | 0.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.86 | Skew: | -1.15 |
| Prob(H) (two-sided): | 0.48 | Kurtosis: | 4.98 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [17]:
df2['Diff_1'] = diff(df2['Inventories'], k_diff=1)
In [18]:
def adf_test(series, title=''):
print(f'Augmented Dickey-Fuller Test: {title}')
result = adfuller(series.dropna(), autolag='AIC')
labels = ['ADF Test Statistic', 'p-value', '# Lags Used', '# Observations']
out = pd.Series(result[:4], index = labels)
for key, val in result[4].items():
out[f'critical value ({key})'] = val
print(out.to_string()) # .to_string() removes the line "dtype: float64"
if result[1] <= 0.05:
print('\nSTATIONARY')
else:
print('\nNON STATIONARY')
In [19]:
adf_test(df2['Diff_1'])
Augmented Dickey-Fuller Test: ADF Test Statistic -3.412249 p-value 0.010548 # Lags Used 4.000000 # Observations 258.000000 critical value (1%) -3.455953 critical value (5%) -2.872809 critical value (10%) -2.572775 STATIONARY
TRAIN our ARIMA¶
In [25]:
len(df2) - 252
Out[25]:
12
In [26]:
train = df2.iloc[:252]
test = df2.iloc[252:]
In [36]:
model = ARIMA(train['Inventories'], order=(1,1,1))
results = model.fit()
results.summary()
Out[36]:
| Dep. Variable: | Inventories | No. Observations: | 252 |
|---|---|---|---|
| Model: | ARIMA(1, 1, 1) | Log Likelihood | -2550.306 |
| Date: | Tue, 10 Jan 2023 | AIC | 5106.611 |
| Time: | 06:52:48 | BIC | 5117.188 |
| Sample: | 01-01-1997 | HQIC | 5110.868 |
| - 12-01-2017 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ar.L1 | 0.9999 | 0.002 | 536.009 | 0.000 | 0.996 | 1.004 |
| ma.L1 | -0.9992 | 0.007 | -149.506 | 0.000 | -1.012 | -0.986 |
| sigma2 | 3.917e+07 | 2.53e-12 | 1.55e+19 | 0.000 | 3.92e+07 | 3.92e+07 |
| Ljung-Box (L1) (Q): | 87.25 | Jarque-Bera (JB): | 99.18 |
|---|---|---|---|
| Prob(Q): | 0.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.76 | Skew: | -1.17 |
| Prob(H) (two-sided): | 0.21 | Kurtosis: | 5.00 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.74e+34. Standard errors may be unstable.
In [37]:
start = len(train)
end = len(train) + len(test) - 1
predictions = results.predict(start=start, end=end, typ='levels').rename('ARIMA (1,1,1) Predictions')
typ='linear'는 1로 차분화된 데이터를 반환한다.
typ='levels'은 원래 내생 변수의 수준을 예측한다.
=> 차분항이 있는 모델인 경우 타입을 levels로 지정한다.
In [30]:
predictions
Out[30]:
2018-01-01 2103751.0 2018-02-01 2103751.0 2018-03-01 2103751.0 2018-04-01 2103751.0 2018-05-01 2103751.0 2018-06-01 2103751.0 2018-07-01 2103751.0 2018-08-01 2103751.0 2018-09-01 2103751.0 2018-10-01 2103751.0 2018-11-01 2103751.0 2018-12-01 2103751.0 Freq: MS, Name: ARIMA (0,1,0) Predictions, dtype: float64
In [31]:
test
Out[31]:
| Inventories | Diff_1 | |
|---|---|---|
| Date | ||
| 2018-01-01 | 2110158 | 6407.0 |
| 2018-02-01 | 2118199 | 8041.0 |
| 2018-03-01 | 2112427 | -5772.0 |
| 2018-04-01 | 2112276 | -151.0 |
| 2018-05-01 | 2111835 | -441.0 |
| 2018-06-01 | 2109298 | -2537.0 |
| 2018-07-01 | 2119618 | 10320.0 |
| 2018-08-01 | 2127170 | 7552.0 |
| 2018-09-01 | 2134172 | 7002.0 |
| 2018-10-01 | 2144639 | 10467.0 |
| 2018-11-01 | 2143001 | -1638.0 |
| 2018-12-01 | 2158115 | 15114.0 |
In [38]:
test['Inventories'].plot(legend=True, figsize=(12,8))
predictions.plot(legend=True);
In [39]:
error = rmse(test['Inventories'], predictions)
error
Out[39]:
7965.762521145892
In [40]:
test['Inventories'].mean()
Out[40]:
2125075.6666666665
In [41]:
predictions.mean()
Out[41]:
2124219.9454642516
FORECAST¶
In [43]:
model = ARIMA(df2['Inventories'], order=(1,1,1))
results = model.fit()
fcast = results.predict(start=len(df2), end=len(df2)+11, typ='levels').rename('ARIMA (1,1,1) FORECAST')
In [44]:
df2['Inventories'].plot(legend=True, figsize=(12, 8))
fcast.plot(legend=True);
SARIMA¶
In [116]:
df = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/co2_mm_mlo.csv')
In [117]:
df['date'] = pd.to_datetime({'year':df['year'], 'month':df['month'], 'day':1})
df = df.set_index('date')
df.index.freq = "MS"
In [118]:
df.head()
Out[118]:
| year | month | decimal_date | average | interpolated | |
|---|---|---|---|---|---|
| date | |||||
| 1958-03-01 | 1958 | 3 | 1958.208 | 315.71 | 315.71 |
| 1958-04-01 | 1958 | 4 | 1958.292 | 317.45 | 317.45 |
| 1958-05-01 | 1958 | 5 | 1958.375 | 317.50 | 317.50 |
| 1958-06-01 | 1958 | 6 | 1958.458 | NaN | 317.10 |
| 1958-07-01 | 1958 | 7 | 1958.542 | 315.86 | 315.86 |
In [119]:
df['interpolated'].plot(figsize=(12,8));
In [120]:
result = seasonal_decompose(df['interpolated'], model='add')
result.plot();
In [121]:
result.seasonal.plot(figsize=(12,5));
In [122]:
auto_arima(df['interpolated'], seasonal=True, m=12, trace=True).summary()
Performing stepwise search to minimize aic ARIMA(2,1,2)(1,0,1)[12] intercept : AIC=468.894, Time=4.71 sec ARIMA(0,1,0)(0,0,0)[12] intercept : AIC=2369.532, Time=0.07 sec ARIMA(1,1,0)(1,0,0)[12] intercept : AIC=inf, Time=1.29 sec ARIMA(0,1,1)(0,0,1)[12] intercept : AIC=1614.808, Time=0.97 sec ARIMA(0,1,0)(0,0,0)[12] : AIC=2375.248, Time=0.06 sec ARIMA(2,1,2)(0,0,1)[12] intercept : AIC=1101.992, Time=4.31 sec ARIMA(2,1,2)(1,0,0)[12] intercept : AIC=770.633, Time=3.98 sec ARIMA(2,1,2)(2,0,1)[12] intercept : AIC=502.192, Time=12.13 sec ARIMA(2,1,2)(1,0,2)[12] intercept : AIC=538.444, Time=10.89 sec ARIMA(2,1,2)(0,0,0)[12] intercept : AIC=1440.918, Time=0.85 sec ARIMA(2,1,2)(0,0,2)[12] intercept : AIC=inf, Time=10.29 sec ARIMA(2,1,2)(2,0,0)[12] intercept : AIC=611.679, Time=11.98 sec ARIMA(2,1,2)(2,0,2)[12] intercept : AIC=inf, Time=14.18 sec ARIMA(1,1,2)(1,0,1)[12] intercept : AIC=454.400, Time=4.65 sec ARIMA(1,1,2)(0,0,1)[12] intercept : AIC=1491.576, Time=1.21 sec ARIMA(1,1,2)(1,0,0)[12] intercept : AIC=771.800, Time=3.54 sec ARIMA(1,1,2)(2,0,1)[12] intercept : AIC=466.549, Time=9.10 sec ARIMA(1,1,2)(1,0,2)[12] intercept : AIC=476.379, Time=11.31 sec ARIMA(1,1,2)(0,0,0)[12] intercept : AIC=1715.503, Time=0.34 sec ARIMA(1,1,2)(0,0,2)[12] intercept : AIC=1348.445, Time=3.07 sec ARIMA(1,1,2)(2,0,0)[12] intercept : AIC=617.600, Time=8.15 sec ARIMA(1,1,2)(2,0,2)[12] intercept : AIC=inf, Time=13.05 sec ARIMA(0,1,2)(1,0,1)[12] intercept : AIC=428.392, Time=3.91 sec ARIMA(0,1,2)(0,0,1)[12] intercept : AIC=1506.152, Time=0.89 sec ARIMA(0,1,2)(1,0,0)[12] intercept : AIC=791.261, Time=1.29 sec ARIMA(0,1,2)(2,0,1)[12] intercept : AIC=437.664, Time=10.38 sec ARIMA(0,1,2)(1,0,2)[12] intercept : AIC=477.549, Time=8.60 sec ARIMA(0,1,2)(0,0,0)[12] intercept : AIC=1762.294, Time=0.23 sec ARIMA(0,1,2)(0,0,2)[12] intercept : AIC=1349.347, Time=2.49 sec ARIMA(0,1,2)(2,0,0)[12] intercept : AIC=617.552, Time=4.12 sec ARIMA(0,1,2)(2,0,2)[12] intercept : AIC=inf, Time=17.28 sec ARIMA(0,1,1)(1,0,1)[12] intercept : AIC=428.751, Time=3.39 sec ARIMA(0,1,3)(1,0,1)[12] intercept : AIC=442.138, Time=4.55 sec ARIMA(1,1,1)(1,0,1)[12] intercept : AIC=460.443, Time=3.25 sec ARIMA(1,1,3)(1,0,1)[12] intercept : AIC=466.972, Time=4.16 sec ARIMA(0,1,2)(1,0,1)[12] : AIC=426.285, Time=2.44 sec ARIMA(0,1,2)(0,0,1)[12] : AIC=1507.230, Time=0.30 sec ARIMA(0,1,2)(1,0,0)[12] : AIC=789.744, Time=0.48 sec ARIMA(0,1,2)(2,0,1)[12] : AIC=428.274, Time=4.17 sec ARIMA(0,1,2)(1,0,2)[12] : AIC=428.327, Time=7.56 sec ARIMA(0,1,2)(0,0,0)[12] : AIC=1763.938, Time=0.12 sec ARIMA(0,1,2)(0,0,2)[12] : AIC=1350.023, Time=1.36 sec ARIMA(0,1,2)(2,0,0)[12] : AIC=615.890, Time=1.17 sec ARIMA(0,1,2)(2,0,2)[12] : AIC=inf, Time=6.35 sec ARIMA(0,1,1)(1,0,1)[12] : AIC=426.894, Time=2.11 sec ARIMA(1,1,2)(1,0,1)[12] : AIC=442.139, Time=3.55 sec ARIMA(0,1,3)(1,0,1)[12] : AIC=423.398, Time=3.47 sec ARIMA(0,1,3)(0,0,1)[12] : AIC=1473.391, Time=0.56 sec ARIMA(0,1,3)(1,0,0)[12] : AIC=784.597, Time=0.50 sec ARIMA(0,1,3)(2,0,1)[12] : AIC=425.390, Time=7.06 sec ARIMA(0,1,3)(1,0,2)[12] : AIC=425.360, Time=7.96 sec ARIMA(0,1,3)(0,0,0)[12] : AIC=1665.331, Time=0.17 sec ARIMA(0,1,3)(0,0,2)[12] : AIC=1344.996, Time=1.65 sec ARIMA(0,1,3)(2,0,0)[12] : AIC=613.245, Time=1.37 sec ARIMA(0,1,3)(2,0,2)[12] : AIC=inf, Time=9.48 sec ARIMA(1,1,3)(1,0,1)[12] : AIC=433.783, Time=4.07 sec ARIMA(0,1,4)(1,0,1)[12] : AIC=424.953, Time=4.09 sec ARIMA(1,1,4)(1,0,1)[12] : AIC=426.747, Time=3.90 sec Best model: ARIMA(0,1,3)(1,0,1)[12] Total fit time: 268.646 seconds
Out[122]:
| Dep. Variable: | y | No. Observations: | 729 |
|---|---|---|---|
| Model: | SARIMAX(0, 1, 3)x(1, 0, [1], 12) | Log Likelihood | -205.699 |
| Date: | Tue, 10 Jan 2023 | AIC | 423.398 |
| Time: | 07:30:53 | BIC | 450.939 |
| Sample: | 03-01-1958 | HQIC | 434.025 |
| - 11-01-2018 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ma.L1 | -0.3564 | 0.033 | -10.674 | 0.000 | -0.422 | -0.291 |
| ma.L2 | -0.0221 | 0.036 | -0.610 | 0.542 | -0.093 | 0.049 |
| ma.L3 | -0.0860 | 0.030 | -2.884 | 0.004 | -0.144 | -0.028 |
| ar.S.L12 | 0.9996 | 0.000 | 2938.338 | 0.000 | 0.999 | 1.000 |
| ma.S.L12 | -0.8655 | 0.021 | -40.501 | 0.000 | -0.907 | -0.824 |
| sigma2 | 0.0956 | 0.004 | 21.680 | 0.000 | 0.087 | 0.104 |
| Ljung-Box (L1) (Q): | 0.07 | Jarque-Bera (JB): | 4.00 |
|---|---|---|---|
| Prob(Q): | 0.79 | Prob(JB): | 0.14 |
| Heteroskedasticity (H): | 1.13 | Skew: | 0.00 |
| Prob(H) (two-sided): | 0.34 | Kurtosis: | 3.36 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
TRAIN our SARIMA¶
In [57]:
len(df) -12
Out[57]:
717
In [123]:
train = df.iloc[:717]
test = df.iloc[717:]
In [124]:
model = SARIMAX(train['interpolated'], order=(0,1,3), seasonal_order=(1,0,1,12))
results = model.fit()
results.summary()
Out[124]:
| Dep. Variable: | interpolated | No. Observations: | 717 |
|---|---|---|---|
| Model: | SARIMAX(0, 1, 3)x(1, 0, [1], 12) | Log Likelihood | -201.188 |
| Date: | Tue, 10 Jan 2023 | AIC | 414.376 |
| Time: | 07:34:21 | BIC | 441.818 |
| Sample: | 03-01-1958 | HQIC | 424.973 |
| - 11-01-2017 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ma.L1 | -0.3541 | 0.035 | -10.032 | 0.000 | -0.423 | -0.285 |
| ma.L2 | -0.0246 | 0.032 | -0.758 | 0.449 | -0.088 | 0.039 |
| ma.L3 | -0.0870 | 0.027 | -3.190 | 0.001 | -0.140 | -0.034 |
| ar.S.L12 | 0.9996 | 0.000 | 3080.426 | 0.000 | 0.999 | 1.000 |
| ma.S.L12 | -0.8662 | 0.022 | -38.579 | 0.000 | -0.910 | -0.822 |
| sigma2 | 0.0951 | 0.005 | 20.382 | 0.000 | 0.086 | 0.104 |
| Ljung-Box (L1) (Q): | 0.08 | Jarque-Bera (JB): | 4.35 |
|---|---|---|---|
| Prob(Q): | 0.78 | Prob(JB): | 0.11 |
| Heteroskedasticity (H): | 1.15 | Skew: | 0.02 |
| Prob(H) (two-sided): | 0.28 | Kurtosis: | 3.38 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [125]:
start = len(train)
end = len(train) + len(test) - 1
In [126]:
predictions = results.predict(start, end, typ='levels').rename('SARIMA Predictions')
In [127]:
test['interpolated'].plot(legend=True, figsize=(12,8))
predictions.plot(legend=True);
In [63]:
error = rmse(test['interpolated'], predictions)
error
Out[63]:
0.3583670792390456
In [64]:
test['interpolated'].mean()
Out[64]:
408.3333333333333
In [65]:
predictions.mean()
Out[65]:
408.4378349126259
FORECAST¶
In [66]:
model = SARIMAX(df['interpolated'], order=(0,1,3), seasonal_order=(1,0,1,12))
results = model.fit()
In [67]:
fcast = results.predict(len(df), len(df)+11, typ='levels').rename('SARIMA FORECAST')
In [70]:
df['interpolated'].plot(legend=True, figsize=(12,8))
fcast.plot(legend=True);
Exogenous Variables SARIMAX¶
In [71]:
df = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/RestaurantVisitors.csv', index_col='date', parse_dates=True)
df.index.freq = "D"
In [72]:
df.head()
Out[72]:
| weekday | holiday | holiday_name | rest1 | rest2 | rest3 | rest4 | total | |
|---|---|---|---|---|---|---|---|---|
| date | ||||||||
| 2016-01-01 | Friday | 1 | New Year's Day | 65.0 | 25.0 | 67.0 | 139.0 | 296.0 |
| 2016-01-02 | Saturday | 0 | na | 24.0 | 39.0 | 43.0 | 85.0 | 191.0 |
| 2016-01-03 | Sunday | 0 | na | 24.0 | 31.0 | 66.0 | 81.0 | 202.0 |
| 2016-01-04 | Monday | 0 | na | 23.0 | 18.0 | 32.0 | 32.0 | 105.0 |
| 2016-01-05 | Tuesday | 0 | na | 2.0 | 15.0 | 38.0 | 43.0 | 98.0 |
In [73]:
df1 = df.dropna()
In [74]:
print(df.shape)
print(df1.shape)
(517, 8) (478, 8)
In [78]:
cols = ['rest1', 'rest2', 'rest3', 'rest4', 'total']
for col in cols:
df1[col] = df[col].astype(int, errors='ignore')
In [79]:
df1.head()
Out[79]:
| weekday | holiday | holiday_name | rest1 | rest2 | rest3 | rest4 | total | |
|---|---|---|---|---|---|---|---|---|
| date | ||||||||
| 2016-01-01 | Friday | 1 | New Year's Day | 65.0 | 25.0 | 67.0 | 139.0 | 296.0 |
| 2016-01-02 | Saturday | 0 | na | 24.0 | 39.0 | 43.0 | 85.0 | 191.0 |
| 2016-01-03 | Sunday | 0 | na | 24.0 | 31.0 | 66.0 | 81.0 | 202.0 |
| 2016-01-04 | Monday | 0 | na | 23.0 | 18.0 | 32.0 | 32.0 | 105.0 |
| 2016-01-05 | Tuesday | 0 | na | 2.0 | 15.0 | 38.0 | 43.0 | 98.0 |
In [80]:
df1['total'].plot(figsize=(16,5))
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7d5a268d00>
In [81]:
ax = df1['total'].plot(figsize=(16,5))
for day in df1.query('holiday==1').index:
ax.axvline(x=day, color='k', alpha=0.5);
휴일이 방문자 수에 영향을 미칠까?
시각적으로 외생변수가 식당의 방문자 수를 예측하게 될지는 불분명하다.
In [82]:
result = seasonal_decompose(df1['total'])
result.plot();
In [83]:
result.seasonal.plot(figsize=(18,5));
이 데이터셋의 계절성은 주간을 기준으로 한다.
TRAIN our SARIMAX¶
In [84]:
len(df1)
Out[84]:
478
In [85]:
train = df1.iloc[:436]
test = df1.iloc[436:]
In [86]:
auto_arima(df1['total'], seasonal=True, m=7, trace=True).summary()
Performing stepwise search to minimize aic ARIMA(2,0,2)(1,0,1)[7] intercept : AIC=inf, Time=2.49 sec ARIMA(0,0,0)(0,0,0)[7] intercept : AIC=5269.484, Time=0.03 sec ARIMA(1,0,0)(1,0,0)[7] intercept : AIC=4916.749, Time=0.75 sec ARIMA(0,0,1)(0,0,1)[7] intercept : AIC=5049.644, Time=0.52 sec ARIMA(0,0,0)(0,0,0)[7] : AIC=6126.084, Time=0.02 sec ARIMA(1,0,0)(0,0,0)[7] intercept : AIC=5200.790, Time=0.12 sec ARIMA(1,0,0)(2,0,0)[7] intercept : AIC=4845.442, Time=2.23 sec ARIMA(1,0,0)(2,0,1)[7] intercept : AIC=inf, Time=nan sec ARIMA(1,0,0)(1,0,1)[7] intercept : AIC=4827.220, Time=1.29 sec ARIMA(1,0,0)(0,0,1)[7] intercept : AIC=5058.642, Time=0.56 sec ARIMA(1,0,0)(1,0,2)[7] intercept : AIC=4944.867, Time=2.70 sec ARIMA(1,0,0)(0,0,2)[7] intercept : AIC=4982.776, Time=1.67 sec ARIMA(1,0,0)(2,0,2)[7] intercept : AIC=inf, Time=7.85 sec ARIMA(0,0,0)(1,0,1)[7] intercept : AIC=4762.608, Time=2.90 sec ARIMA(0,0,0)(0,0,1)[7] intercept : AIC=5093.130, Time=0.31 sec ARIMA(0,0,0)(1,0,0)[7] intercept : AIC=4926.360, Time=0.58 sec ARIMA(0,0,0)(2,0,1)[7] intercept : AIC=inf, Time=2.66 sec ARIMA(0,0,0)(1,0,2)[7] intercept : AIC=4928.453, Time=2.20 sec ARIMA(0,0,0)(0,0,2)[7] intercept : AIC=5010.582, Time=0.82 sec ARIMA(0,0,0)(2,0,0)[7] intercept : AIC=4859.638, Time=2.21 sec ARIMA(0,0,0)(2,0,2)[7] intercept : AIC=inf, Time=2.94 sec ARIMA(0,0,1)(1,0,1)[7] intercept : AIC=inf, Time=1.52 sec ARIMA(1,0,1)(1,0,1)[7] intercept : AIC=inf, Time=1.70 sec ARIMA(0,0,0)(1,0,1)[7] : AIC=inf, Time=0.32 sec Best model: ARIMA(0,0,0)(1,0,1)[7] intercept Total fit time: 40.929 seconds
Out[86]:
| Dep. Variable: | y | No. Observations: | 478 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, [1], 7) | Log Likelihood | -2377.304 |
| Date: | Tue, 10 Jan 2023 | AIC | 4762.608 |
| Time: | 07:11:28 | BIC | 4779.286 |
| Sample: | 01-01-2016 | HQIC | 4769.165 |
| - 04-22-2017 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 3.1467 | 1.357 | 2.318 | 0.020 | 0.486 | 5.807 |
| ar.S.L7 | 0.9754 | 0.010 | 96.753 | 0.000 | 0.956 | 0.995 |
| ma.S.L7 | -0.7716 | 0.048 | -16.214 | 0.000 | -0.865 | -0.678 |
| sigma2 | 1209.5554 | 70.835 | 17.076 | 0.000 | 1070.722 | 1348.388 |
| Ljung-Box (L1) (Q): | 14.25 | Jarque-Bera (JB): | 66.94 |
|---|---|---|---|
| Prob(Q): | 0.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.87 | Skew: | 0.74 |
| Prob(H) (two-sided): | 0.36 | Kurtosis: | 4.08 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [17]:
# (0,0,0)(1,0,1)[7]
In [87]:
# ValueError : non-invertible starting MA parameters found
model = SARIMAX(train['total'], order=(0,0,0), seasonal_order=(1,0,1,7),
enforce_invertibility=False)
In [88]:
results = model.fit()
results.summary()
Out[88]:
| Dep. Variable: | total | No. Observations: | 436 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, [1], 7) | Log Likelihood | -2165.369 |
| Date: | Tue, 10 Jan 2023 | AIC | 4336.738 |
| Time: | 07:13:02 | BIC | 4348.970 |
| Sample: | 01-01-2016 | HQIC | 4341.565 |
| - 03-11-2017 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ar.S.L7 | 0.9999 | 9.57e-05 | 1.05e+04 | 0.000 | 1.000 | 1.000 |
| ma.S.L7 | -0.9384 | 0.024 | -39.208 | 0.000 | -0.985 | -0.891 |
| sigma2 | 1111.7971 | 58.740 | 18.927 | 0.000 | 996.669 | 1226.925 |
| Ljung-Box (L1) (Q): | 15.40 | Jarque-Bera (JB): | 83.57 |
|---|---|---|---|
| Prob(Q): | 0.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.96 | Skew: | 0.72 |
| Prob(H) (two-sided): | 0.81 | Kurtosis: | 4.59 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [89]:
start = len(train)
end = len(train) + len(test) -1
In [90]:
predictions = results.predict(start, end).rename('SARIMA Model')
In [92]:
test['total'].plot(legend=True, figsize=(15,8))
predictions.plot(legend=True);
In [93]:
ax = test['total'].plot(legend=True, figsize=(15,8))
predictions.plot(legend=True)
for day in test.query('holiday==1').index:
ax.axvline(x=day, color='k', alpha=0.9);
만일 휴일을 외생 변수로 추가한다면, 모델이 개선되는지 확인해보자.
In [94]:
error = rmse(test['total'], predictions)
error
Out[94]:
31.91232897229996
In [95]:
test['total'].mean()
Out[95]:
134.26190476190476
In [96]:
predictions.mean()
Out[96]:
133.08491155988327
In [97]:
auto_arima(df1['total'], exogenous=df1[['holiday']], seasonal=True, m=7, trace=True).summary()
Performing stepwise search to minimize aic ARIMA(2,0,2)(1,0,1)[7] intercept : AIC=inf, Time=2.32 sec ARIMA(0,0,0)(0,0,0)[7] intercept : AIC=5269.484, Time=0.02 sec ARIMA(1,0,0)(1,0,0)[7] intercept : AIC=4916.749, Time=0.70 sec ARIMA(0,0,1)(0,0,1)[7] intercept : AIC=5049.644, Time=0.46 sec ARIMA(0,0,0)(0,0,0)[7] : AIC=6126.084, Time=0.02 sec ARIMA(1,0,0)(0,0,0)[7] intercept : AIC=5200.790, Time=0.13 sec ARIMA(1,0,0)(2,0,0)[7] intercept : AIC=4845.442, Time=1.97 sec ARIMA(1,0,0)(2,0,1)[7] intercept : AIC=inf, Time=nan sec ARIMA(1,0,0)(1,0,1)[7] intercept : AIC=4827.220, Time=2.05 sec ARIMA(1,0,0)(0,0,1)[7] intercept : AIC=5058.642, Time=1.20 sec ARIMA(1,0,0)(1,0,2)[7] intercept : AIC=4944.867, Time=2.98 sec ARIMA(1,0,0)(0,0,2)[7] intercept : AIC=4982.776, Time=1.24 sec ARIMA(1,0,0)(2,0,2)[7] intercept : AIC=inf, Time=3.44 sec ARIMA(0,0,0)(1,0,1)[7] intercept : AIC=4762.608, Time=1.00 sec ARIMA(0,0,0)(0,0,1)[7] intercept : AIC=5093.130, Time=0.20 sec ARIMA(0,0,0)(1,0,0)[7] intercept : AIC=4926.360, Time=0.43 sec ARIMA(0,0,0)(2,0,1)[7] intercept : AIC=inf, Time=2.44 sec ARIMA(0,0,0)(1,0,2)[7] intercept : AIC=4928.453, Time=2.02 sec ARIMA(0,0,0)(0,0,2)[7] intercept : AIC=5010.582, Time=0.76 sec ARIMA(0,0,0)(2,0,0)[7] intercept : AIC=4859.638, Time=1.93 sec ARIMA(0,0,0)(2,0,2)[7] intercept : AIC=inf, Time=2.58 sec ARIMA(0,0,1)(1,0,1)[7] intercept : AIC=inf, Time=1.30 sec ARIMA(1,0,1)(1,0,1)[7] intercept : AIC=inf, Time=1.47 sec ARIMA(0,0,0)(1,0,1)[7] : AIC=inf, Time=0.32 sec Best model: ARIMA(0,0,0)(1,0,1)[7] intercept Total fit time: 33.727 seconds
Out[97]:
| Dep. Variable: | y | No. Observations: | 478 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, [1], 7) | Log Likelihood | -2377.304 |
| Date: | Tue, 10 Jan 2023 | AIC | 4762.608 |
| Time: | 07:17:58 | BIC | 4779.286 |
| Sample: | 01-01-2016 | HQIC | 4769.165 |
| - 04-22-2017 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 3.1467 | 1.357 | 2.318 | 0.020 | 0.486 | 5.807 |
| ar.S.L7 | 0.9754 | 0.010 | 96.753 | 0.000 | 0.956 | 0.995 |
| ma.S.L7 | -0.7716 | 0.048 | -16.214 | 0.000 | -0.865 | -0.678 |
| sigma2 | 1209.5554 | 70.835 | 17.076 | 0.000 | 1070.722 | 1348.388 |
| Ljung-Box (L1) (Q): | 14.25 | Jarque-Bera (JB): | 66.94 |
|---|---|---|---|
| Prob(Q): | 0.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.87 | Skew: | 0.74 |
| Prob(H) (two-sided): | 0.36 | Kurtosis: | 4.08 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Statsmodel의 외생 변수를 사용할 때는 두 세트의 대괄호를 사용한다. = 데이터프레임
In [27]:
#(0,0,0)(1,0,1)[7]
In [98]:
model = SARIMAX(train['total'],exog=train[['holiday']],order=(1,0,0),seasonal_order=(1,0,1,7),enforce_invertibility=False)
results = model.fit()
results.summary()
Out[98]:
| Dep. Variable: | total | No. Observations: | 436 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, 0)x(1, 0, [1], 7) | Log Likelihood | -2089.208 |
| Date: | Tue, 10 Jan 2023 | AIC | 4188.417 |
| Time: | 07:18:23 | BIC | 4208.805 |
| Sample: | 01-01-2016 | HQIC | 4196.463 |
| - 03-11-2017 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| holiday | 68.9355 | 3.773 | 18.271 | 0.000 | 61.541 | 76.330 |
| ar.L1 | 0.2101 | 0.044 | 4.763 | 0.000 | 0.124 | 0.297 |
| ar.S.L7 | 1.0000 | 5.78e-05 | 1.73e+04 | 0.000 | 1.000 | 1.000 |
| ma.S.L7 | -0.9581 | 0.022 | -43.532 | 0.000 | -1.001 | -0.915 |
| sigma2 | 779.3161 | 44.867 | 17.370 | 0.000 | 691.379 | 867.253 |
| Ljung-Box (L1) (Q): | 0.19 | Jarque-Bera (JB): | 20.47 |
|---|---|---|---|
| Prob(Q): | 0.66 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.98 | Skew: | 0.22 |
| Prob(H) (two-sided): | 0.88 | Kurtosis: | 3.96 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [99]:
start = len(train)
end = len(train) + len(test) - 1
In [100]:
predictions = results.predict(start, end, exog=test[['holiday']]).rename('SARIMAX with Exog')
In [101]:
ax =predictions.plot(figsize=(12,8), legend=True)
test['total'].plot(legend=True)
for day in test.query('holiday==1').index:
ax.axvline(x=day, color='black', alpha=0.9);
In [102]:
error = rmse(test['total'], predictions)
error
Out[102]:
22.929753286303306
외생변수를 추가해서 에러를 줄였다.
FORECAST¶
In [104]:
model = SARIMAX(df1['total'], exog=df1['holiday'], order=(0,0,0), seasonal_order=(1,0,1,7), enforce_invertibility=False)
In [106]:
results = model.fit()
results.summary()
Out[106]:
| Dep. Variable: | total | No. Observations: | 478 |
|---|---|---|---|
| Model: | SARIMAX(1, 0, [1], 7) | Log Likelihood | -2290.989 |
| Date: | Tue, 10 Jan 2023 | AIC | 4589.978 |
| Time: | 07:20:00 | BIC | 4606.656 |
| Sample: | 01-01-2016 | HQIC | 4596.535 |
| - 04-22-2017 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| holiday | 70.1652 | 3.837 | 18.286 | 0.000 | 62.644 | 77.686 |
| ar.S.L7 | 1.0000 | 2.76e-05 | 3.62e+04 | 0.000 | 1.000 | 1.000 |
| ma.S.L7 | -1.0300 | 0.024 | -43.225 | 0.000 | -1.077 | -0.983 |
| sigma2 | 734.4745 | 47.087 | 15.598 | 0.000 | 642.185 | 826.764 |
| Ljung-Box (L1) (Q): | 12.55 | Jarque-Bera (JB): | 22.58 |
|---|---|---|---|
| Prob(Q): | 0.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.91 | Skew: | 0.22 |
| Prob(H) (two-sided): | 0.55 | Kurtosis: | 3.97 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [107]:
exog_forecast = df[478:][['holiday']]
In [112]:
len(df) - len(df1)
Out[112]:
39
In [113]:
fcast = results.predict(len(df1), len(df1)+39-1, exog=exog_forecast).rename('FINAL SARIMAX FORECAST')
In [114]:
df1['total'].plot(figsize=(15,8), legend=True)
fcast.plot(legend=True);
In [115]:
ax = df1['total'].loc['2017-01-01':].plot(figsize=(15,8), legend=True)
fcast.plot(legend=True)
for x in df.query('holiday==1').index:
ax.axvline(x=x, color='k', alpha=0.8);