import numpy as np 
import pandas as pd 

import warnings
warnings.filterwarnings("ignore")

# Load the airline passenger CSV, using the 'Month' column as a parsed datetime index.
df = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/airline_passengers.csv', index_col='Month', parse_dates=True)


# Drop rows that contain missing values, modifying df in place.
df.dropna(inplace=True)


# Tag the index with a month-start ('MS') frequency, then display the index.
df.index.freq = 'MS'
df.index

DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',
               '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',
               '1949-09-01', '1949-10-01',
               ...
               '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',
               '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',
               '1960-11-01', '1960-12-01'],
              dtype='datetime64[ns]', name='Month', length=144, freq='MS')


# Plot every column of the frame.
df.plot();


from statsmodels.tsa.seasonal import seasonal_decompose


# Seasonal decomposition of the passenger series with an additive model,
# plotted on an 8x8 inch figure.
result = seasonal_decompose(df['Thousands of Passengers'], model='add')
fig = result.plot()
fig.set_size_inches(8,8);


# Same decomposition with a multiplicative model; `result` and `fig` are overwritten.
result = seasonal_decompose(df['Thousands of Passengers'], model='multiplicative')
fig = result.plot()
fig.set_size_inches(8,8);


# Simple moving averages: rolling means over 6-row and 12-row windows,
# stored as new columns next to the raw series.
df['6-month-SMA'] = df['Thousands of Passengers'].rolling(window=6).mean()
df['12-month-SMA'] = df['Thousands of Passengers'].rolling(window=12).mean()


# Inspect the first rows; the SMA columns are NaN until a full window is available.
df.head()


# Plot the raw series together with both moving averages.
df.plot();


# Exponentially weighted moving average with span=12 (pandas' default weighting settings).
df['EWMA12'] = df['Thousands of Passengers'].ewm(span=12).mean()


# Compare the raw series, the 12-row SMA and the EWMA on one 12x8 figure.
df[['Thousands of Passengers', '12-month-SMA', 'EWMA12']].plot(figsize=(12, 8));


from statsmodels.tsa.holtwinters import SimpleExpSmoothing 

# Convert the span into a smoothing factor: alpha = 2 / (span + 1).
span = 12 
alpha = 2/(span+1)


# Recompute EWMA12 with the explicit alpha and adjust=False.
# NOTE: this overwrites the EWMA12 column created earlier with span=12.
df['EWMA12'] = df['Thousands of Passengers'].ewm(alpha=alpha, adjust=False).mean()
df.head()


# Simple exponential smoothing with the same alpha; optimized=False keeps the
# supplied smoothing_level instead of estimating it.
model = SimpleExpSmoothing(df['Thousands of Passengers'])
fitted_model = model.fit(smoothing_level=alpha, optimized=False) 
# Store the fitted values shifted one row earlier (shift(-1)) as SES12.
df['SES12'] = fitted_model.fittedvalues.shift(-1)


from statsmodels.tsa.holtwinters import ExponentialSmoothing


# Double exponential smoothing with an additive trend; fit() is called with no
# arguments and the fitted values are shifted one row earlier before being stored.
df['DESadd12'] = ExponentialSmoothing(df['Thousands of Passengers'], trend='add').fit().fittedvalues.shift(-1)


# Compare the first 24 rows of the raw series, the EWMA and the additive DES.
df[['Thousands of Passengers', 'EWMA12', 'DESadd12']].iloc[:24].plot(figsize=(12,6));


# Same model with a multiplicative trend.
df['DESmul12'] = ExponentialSmoothing(df['Thousands of Passengers'], trend='mul').fit().fittedvalues.shift(-1)


# Compare additive and multiplicative DES over the first 24 rows.
df[['Thousands of Passengers', 'DESadd12', 'DESmul12']].iloc[:24].plot(figsize=(12,6));


# Triple exponential smoothing: additive trend and additive seasonality with a
# 12-period season. Unlike the DES columns, the fitted values are not shifted.
df['TESadd12'] = ExponentialSmoothing(df['Thousands of Passengers'], trend='add', seasonal='add', seasonal_periods=12).fit().fittedvalues
df.head()


# Triple exponential smoothing with multiplicative trend and seasonality.
df['TESmul12'] = ExponentialSmoothing(df['Thousands of Passengers'], trend='mul', seasonal='mul', seasonal_periods=12).fit().fittedvalues
df.head()


# Plot the raw series against both triple-smoothing fits.
df[['Thousands of Passengers', 'TESadd12', 'TESmul12']].plot(figsize=(12,6));


# Positional split: the first 108 rows are the training set, the remaining rows the test set.
train_data = df.iloc[:108]
test_data = df.iloc[108:]


# Check the shape of the test split.
test_data.shape

(36, 9)


# Fit a Holt-Winters model (multiplicative trend and seasonality, 12-period
# season) on the training data only. This rebinds `fitted_model`.
fitted_model = ExponentialSmoothing(train_data['Thousands of Passengers'], trend='mul', seasonal='mul', seasonal_periods=12).fit()


# Forecast 36 steps past the end of the training data and name the series.
test_predictions = fitted_model.forecast(36).rename('HW Forecast')


# Plot the training data, the test data and the forecast with legend labels.
train_data['Thousands of Passengers'].plot(legend=True, label='TRAIN')
test_data['Thousands of Passengers'].plot(legend=True, label='TEST', figsize=(12,8))
test_predictions.plot(legend=True, label='PREDICTION');


# Same plot, with the x-axis limited to 1958-01-01 through 1961-01-01.
train_data['Thousands of Passengers'].plot(legend=True, label='TRAIN')
test_data['Thousands of Passengers'].plot(legend=True, label='TEST', figsize=(12,8))
test_predictions.plot(legend=True, label='PREDICTION', xlim=['1958-01-01', '1961-01-01']);


from sklearn.metrics import mean_squared_error, mean_absolute_error


# Root mean squared error of the forecast against the test values.
np.sqrt(mean_squared_error(test_data['Thousands of Passengers'], test_predictions))

59.379221423818684


# Mean of the test values, shown to put the size of the RMSE in context.
test_data['Thousands of Passengers'].mean()

428.5


# Refit the same Holt-Winters configuration on the full series.
final_model = ExponentialSmoothing(df['Thousands of Passengers'], trend='mul', seasonal='mul', seasonal_periods=12).fit()


# Forecast 36 steps beyond the end of the data.
forecast_predictions = final_model.forecast(36).rename('FORECAST')


# Plot the observed series followed by the forecast.
df['Thousands of Passengers'].plot(figsize=(12,8), legend=True)
forecast_predictions.plot(legend=True);


# Load the sample series; the first CSV column becomes the parsed datetime index.
df2 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/samples.csv', index_col=0, parse_dates=True)
df2.head()


# Column 'a', plotted with a fixed 0-100 y-range.
df2['a'].plot(ylim=[0, 100], title='STATIONARY DATA');


# Column 'b', plotted with the same y-range.
df2['b'].plot(ylim=[0, 100], title='NON-STATIONARY DATA');


# Column 'c', plotted with a 0-10000 y-range.
df2['c'].plot(ylim=[0, 10000], title='MORE NON-STATIONARY DATA');


# First-order difference computed by hand: each value minus the previous one.
# The first row has no predecessor, so it is NaN.
df2['d1b'] = df2['b'] - df2['b'].shift(1)
df2[['b', 'd1b']].head()


from statsmodels.tsa.statespace.tools import diff 

# The same first-order difference via the statsmodels helper.
diff(df2['b'], k_diff=1)

1950-02-01    -5.0
1950-03-01    -5.0
1950-04-01    -2.0
1950-05-01    -2.0
1950-06-01     3.0
              ... 
1959-08-01     3.0
1959-09-01     4.0
1959-10-01    -7.0
1959-11-01    17.0
1959-12-01   -14.0
Name: b, Length: 119, dtype: float64


# Plot the first-order difference of column 'b'.
diff(df2['b'], k_diff=1).plot(title='FIRST ORDER DIFFERENCE');


# Second-order difference of column 'c': pandas diff() applied twice.
df2['d2c'] = df2['c'].diff().diff()

df2['d2c'].plot(title='SECOND ORDER DIFFERENCE');


# Non-stationary example: monthly airline passengers, month-start frequency.
df1 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/airline_passengers.csv', index_col='Month', parse_dates=True)
df1.index.freq = 'MS'

# Stationary example: daily female births, daily frequency.
# NOTE: this rebinds df2, replacing the samples frame loaded earlier.
df2 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/DailyTotalFemaleBirths.csv', index_col='Date', parse_dates=True)
df2.index.freq = 'D'


from pandas.plotting import lag_plot


# Lag plot of the passenger series.
lag_plot(df1['Thousands of Passengers']);


# Lag plot of the births series.
lag_plot(df2['Births']);


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


# Non-stationary data (airline passengers): autocorrelation up to lag 40.
plot_acf(df1, lags=40);


# Stationary data (daily births): autocorrelation up to lag 40.
plot_acf(df2, lags=40);


# Stationary data (daily births): partial autocorrelation up to lag 40.
plot_pacf(df2, lags=40);


# NOTE(review): `AR` has been deprecated in favour of `AutoReg` in later
# statsmodels releases; confirm the installed version still provides it.
from statsmodels.tsa.ar_model import AR, ARResults


# Load the US population estimates with 'DATE' as a parsed datetime index
# and a month-start frequency. This rebinds `df`.
df = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/uspopulation.csv', index_col='DATE', parse_dates=True)
df.index.freq = 'MS'


df.head()


# Plot the population estimate on a 12x5 figure.
df['PopEst'].plot(figsize=(12,5));


# Number of rows in the series.
len(df)

96


# Positional split: the first 84 rows for training, the remaining rows for testing.
train = df.iloc[:84]
test = df.iloc[84:]


# Confirm the sizes of both splits.
len(train), len(test)

(84, 12)


# Autoregressive model on the training data.
model = AR(train['PopEst'])


# Fit with the lag order capped at 1, i.e. an AR(1) model.
AR1fit = model.fit(maxlag=1)


AR1fit.aic # information criterion (AIC) of the fitted model

6.410771237032229


AR1fit.k_ar # lag order k of the fitted model

1


# Fitted coefficients: the constant and the lag-1 weight.
AR1fit.params

const        284.913797
L1.PopEst      0.999686
dtype: float64


# Positional prediction range: the first index after the training data through
# the last index of the test data (inclusive, hence the -1).
start = len(train)
end = len(train) + len(test) - 1 
predictions1 = AR1fit.predict(start=start, end=end, dynamic=False).rename('AR (1) Predictions')


# Plot the test data against the AR(1) predictions.
test.plot(figsize=(12,8), legend=True)
predictions1.plot(legend=True);


# Refit with the lag order capped at 2, i.e. an AR(2) model.
model = AR(train['PopEst']) 
AR2fit = model.fit(maxlag=2)


AR2fit.params

const        137.368305
L1.PopEst      1.853490
L2.PopEst     -0.853836
dtype: float64


# AR(2) predictions over the same start/end range.
predictions2 = AR2fit.predict(start, end).rename('AR (2) Predictions')


# Plot the test data against both sets of predictions.
test.plot(figsize=(12,8), legend=True)
predictions1.plot(legend=True)
predictions2.plot(legend=True);


# Fit without a fixed maxlag and let the 't-stat' criterion choose the lag order.
model = AR(train['PopEst']) 
ARfit = model.fit(ic='t-stat') # ic: criterion used to select the lag order


ARfit.params

const        82.309677
L1.PopEst     2.437997
L2.PopEst    -2.302100
L3.PopEst     1.565427
L4.PopEst    -1.431211
L5.PopEst     1.125022
L6.PopEst    -0.919494
L7.PopEst     0.963694
L8.PopEst    -0.439511
dtype: float64


# Predictions from the automatically selected model; it is labelled AR (8)
# because the fitted parameters shown for it list eight lags.
predictions8 = ARfit.predict(start, end).rename('AR (8) Predictions')


from sklearn.metrics import mean_squared_error


# Report the mean squared error of each AR forecast against the test data.
labels = ['AR1', 'AR2', 'AR8']
preds = [predictions1, predictions2, predictions8]

for label, pred in zip(labels, preds):
    # Plain MSE is reported here; wrap the call in np.sqrt() to get RMSE instead.
    error = mean_squared_error(test['PopEst'], pred)
    print(f'{label} MSE was : {error}')

AR1 MSE was : 17449.71423587912
AR2 MSE was : 2713.2585540102214
AR8 MSE was : 186.97053754548145


# Plot the test data against all three sets of predictions.
test.plot(figsize=(12,8), legend=True)
predictions1.plot(legend=True)
predictions2.plot(legend=True)
predictions8.plot(legend=True);


# Refit on the full series, again letting the 't-stat' criterion pick the lag order.
# This rebinds `model` and `ARfit`.
model = AR(df['PopEst'])

ARfit = model.fit(ic='t-stat')


ARfit.params

const         84.885175
L1.PopEst      2.296674
L2.PopEst     -2.109518
L3.PopEst      1.429221
L4.PopEst     -1.259837
L5.PopEst      1.093852
L6.PopEst     -0.985774
L7.PopEst      1.066295
L8.PopEst     -0.858709
L9.PopEst      0.826672
L10.PopEst    -1.074975
L11.PopEst     1.034535
L12.PopEst    -0.458679
dtype: float64


# Forecast 12 steps past the end of the data: positions len(df) through len(df)+11.
forcasted_values = ARfit.predict(start=len(df), end=len(df)+11).rename('Forecast')


# Plot the observed series followed by the forecast.
df['PopEst'].plot(figsize=(12,8), legend=True)
forcasted_values.plot(legend=True);


# Reload the airline passengers (monthly) and daily births (daily) data sets.
df1 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/airline_passengers.csv', index_col='Month', parse_dates=True)
df1.index.freq = 'MS'

df2 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/DailyTotalFemaleBirths.csv', index_col='Date', parse_dates=True)
df2.index.freq = "D"


df1.plot();


from statsmodels.tsa.stattools import adfuller


# Augmented Dickey-Fuller test on the passenger series; the raw result tuple is displayed.
adfuller(df1['Thousands of Passengers'])

(0.8153688792060472,
 0.991880243437641,
 13,
 130,
 {'1%': -3.4816817173418295,
  '5%': -2.8840418343195267,
  '10%': -2.578770059171598},
 996.6929308390189)


# Run the test again and label the first four entries of the result.
dftest = adfuller(df1['Thousands of Passengers'])
dfout = pd.Series(dftest[:4], index=['ADF Test Statistic', 'p-value', '# Lags Used', '# Observations'])


# Entry 4 is a mapping of critical values; append each one under a labelled key.
for key, val in dftest[4].items(): 
  dfout[f'critical value ({key})'] = val


dfout

ADF Test Statistic        0.815369
p-value                   0.991880
# Lags Used              13.000000
# Observations          130.000000
critical value (1%)      -3.481682
critical value (5%)      -2.884042
critical value (10%)     -2.578770
dtype: float64


def adf_test(series, title=''):
    """Run an Augmented Dickey-Fuller test and print a labelled report.

    Args:
        series: Object with a ``dropna()`` method (a pandas Series in this
            file's calls); missing values are dropped before testing.
        title: Optional text appended to the report's header line.

    Prints the test statistic, p-value, lag count, observation count and
    the critical values, then ``STATIONARY`` when the p-value is at most
    0.05 and ``NON STATIONARY`` otherwise. Returns None.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    adf_result = adfuller(series.dropna(), autolag='AIC')

    report = pd.Series(
        adf_result[:4],
        index=['ADF Test Statistic', 'p-value', '# Lags Used', '# Observations'],
    )
    # Entry 4 of the result maps each significance level to its critical value.
    for level, critical in adf_result[4].items():
        report[f'critical value ({level})'] = critical

    # to_string() removes the trailing "dtype: float64" line from the printout.
    print(report.to_string())

    p_value = adf_result[1]
    print('\nSTATIONARY' if p_value <= 0.05 else '\nNON STATIONARY')


# Print the ADF report for the airline passenger series.
adf_test(df1['Thousands of Passengers'])

Augmented Dickey-Fuller Test: 
ADF Test Statistic        0.815369
p-value                   0.991880
# Lags Used              13.000000
# Observations          130.000000
critical value (1%)      -3.481682
critical value (5%)      -2.884042
critical value (10%)     -2.578770

NON STATIONARY


# Plot the daily births series.
df2.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f8085a79310>


# Print the ADF report for the daily births series.
adf_test(df2['Births'])

Augmented Dickey-Fuller Test: 
ADF Test Statistic       -4.808291
p-value                   0.000052
# Lags Used               6.000000
# Observations          358.000000
critical value (1%)      -3.448749
critical value (5%)      -2.869647
critical value (10%)     -2.571089

STATIONARY


# Load the sample series again (first column as parsed datetime index) with a
# month-start frequency.
df3 = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/samples.csv', index_col=0, parse_dates=True)
df3.index.freq = 'MS'


# Plot columns 'a' and 'd' together.
df3[['a', 'd']].plot(figsize=(12, 8))

<matplotlib.axes._subplots.AxesSubplot at 0x7f80852c4610>


# Plot 'a' from its third row onward next to 'd' shifted forward by two rows.
df3['a'].iloc[2:].plot(figsize=(12, 8), legend=True) 
df3['d'].shift(2).plot(legend=True);


from statsmodels.tsa.stattools import grangercausalitytests


# Granger causality tests on columns 'a' and 'd' for lags 1 through 3.
# Per the statsmodels documentation, the test asks whether the second column
# ('d') Granger-causes the first ('a').
grangercausalitytests(df3[['a', 'd']], maxlag=3);

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.7051  , p=0.1942  , df_denom=116, df_num=1
ssr based chi2 test:   chi2=1.7492  , p=0.1860  , df=1
likelihood ratio test: chi2=1.7365  , p=0.1876  , df=1
parameter F test:         F=1.7051  , p=0.1942  , df_denom=116, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=286.0339, p=0.0000  , df_denom=113, df_num=2
ssr based chi2 test:   chi2=597.3806, p=0.0000  , df=2
likelihood ratio test: chi2=212.6514, p=0.0000  , df=2
parameter F test:         F=286.0339, p=0.0000  , df_denom=113, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=188.7446, p=0.0000  , df_denom=110, df_num=3
ssr based chi2 test:   chi2=602.2669, p=0.0000  , df=3
likelihood ratio test: chi2=212.4789, p=0.0000  , df=3
parameter F test:         F=188.7446, p=0.0000  , df_denom=110, df_num=3


# Same tests on columns 'b' and 'd' for lags 1 through 3.
grangercausalitytests(df3[['b', 'd']], maxlag=3);

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.5225  , p=0.2197  , df_denom=116, df_num=1
ssr based chi2 test:   chi2=1.5619  , p=0.2114  , df=1
likelihood ratio test: chi2=1.5517  , p=0.2129  , df=1
parameter F test:         F=1.5225  , p=0.2197  , df_denom=116, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.4350  , p=0.6483  , df_denom=113, df_num=2
ssr based chi2 test:   chi2=0.9086  , p=0.6349  , df=2
likelihood ratio test: chi2=0.9051  , p=0.6360  , df=2
parameter F test:         F=0.4350  , p=0.6483  , df_denom=113, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.5333  , p=0.6604  , df_denom=110, df_num=3
ssr based chi2 test:   chi2=1.7018  , p=0.6365  , df=3
likelihood ratio test: chi2=1.6895  , p=0.6393  , df=3
parameter F test:         F=0.5333  , p=0.6604  , df_denom=110, df_num=3


# Fix the NumPy random seed so the random frame below is reproducible.
np.random.seed(42)

# 50 rows x 2 columns of random integers from 20 up to (not including) 30.
# This rebinds `df`.
df = pd.DataFrame(np.random.randint(20,30, (50,2)), columns=['test', 'predictions'])


df.head()


from statsmodels.tools.eval_measures import mse, rmse, meanabs


# Root mean squared error between the two random columns.
rmse(df['test'], df['predictions'])

4.125530268947253


# Mean of the 'test' column, shown to put the size of the RMSE in context.
df['test'].mean()

25.14


# Reload the airline passenger data with a month-start frequency. This rebinds `df`.
df = pd.read_csv('/content/drive/MyDrive/UDEMY_TSA_FINAL/Data/airline_passengers.csv', index_col='Month', parse_dates=True)
df.index.freq = 'MS'


from statsmodels.graphics.tsaplots import month_plot, quarter_plot


# Seasonal plot of the series grouped by month.
month_plot(df['Thousands of Passengers']);


# Resample to quarterly means, then draw the seasonal plot grouped by quarter.
dfq = df['Thousands of Passengers'].resample(rule='Q').mean()
quarter_plot(dfq);

	Thousands of Passengers	6-month-SMA	12-month-SMA
Month
1949-01-01	112	NaN	NaN
1949-02-01	118	NaN	NaN
1949-03-01	132	NaN	NaN
1949-04-01	129	NaN	NaN
1949-05-01	121	NaN	NaN

	Thousands of Passengers	6-month-SMA	12-month-SMA	EWMA12
Month
1949-01-01	112	NaN	NaN	112.000000
1949-02-01	118	NaN	NaN	112.923077
1949-03-01	132	NaN	NaN	115.857988
1949-04-01	129	NaN	NaN	117.879836
1949-05-01	121	NaN	NaN	118.359861

	Thousands of Passengers	6-month-SMA	12-month-SMA	EWMA12	SES12	DESadd12	DESmul12	TESadd12
Month
1949-01-01	112	NaN	NaN	112.000000	112.000000	113.474828	114.978251	112.001172
1949-02-01	118	NaN	NaN	112.923077	112.923077	119.464366	121.191659	120.168193
1949-03-01	132	NaN	NaN	115.857988	115.857988	133.477561	135.802180	134.698694
1949-04-01	129	NaN	NaN	117.879836	117.879836	130.543312	132.657709	131.376310
1949-05-01	121	NaN	NaN	118.359861	118.359861	122.528126	124.213566	124.628035

	Thousands of Passengers	6-month-SMA	12-month-SMA	EWMA12	SES12	DESadd12	DESmul12	TESadd12	TESmul12
Month
1949-01-01	112	NaN	NaN	112.000000	112.000000	113.474828	114.978251	112.001172	111.597879
1949-02-01	118	NaN	NaN	112.923077	112.923077	119.464366	121.191659	120.168193	118.844235
1949-03-01	132	NaN	NaN	115.857988	115.857988	133.477561	135.802180	134.698694	133.334951
1949-04-01	129	NaN	NaN	117.879836	117.879836	130.543312	132.657709	131.376310	127.901291
1949-05-01	121	NaN	NaN	118.359861	118.359861	122.528126	124.213566	124.628035	120.978657

	PopEst
DATE
2011-01-01	311037
2011-02-01	311189
2011-03-01	311351
2011-04-01	311522
2011-05-01	311699

hayley

Statsmodels를 이용한 시계열 분석

ETS¶

MA¶

SMA / Simple Moving Average¶

EWMA / Exponentially Weighted Moving Average¶

Holt-Winters Method¶

Simple Exponential Smoothing / Exponentially Weighted Moving Average¶

Double Exponential Smoothing / Holt's Method¶

Triple Exponential Smoothing / Holt-Winters Method¶

Forecasting¶

Stationarity¶

Differencing¶

ACF and PACF¶

Autocorrelation Function¶

Partial Autocorrelation Function¶

AutoRegression Model¶

Descriptive Statistics and Tests¶

Augmented Dickey-Fuller Test¶

Granger Causality Tests¶

Evaluating forecast accuracy¶

Exposing Seasonality with Month and Quarter Plots¶

'time_series' 카테고리의 다른 글

+ Recent posts

티스토리툴바

	a	b	c	d
1950-01-01	36	27	0	67
1950-02-01	58	22	3	31
1950-03-01	61	17	5	67
1950-04-01	37	15	8	47
1950-05-01	66	13	8	62

	b	d1b
1950-01-01	27	NaN
1950-02-01	22	-5.0
1950-03-01	17	-5.0
1950-04-01	15	-2.0
1950-05-01	13	-2.0

	test	predictions
0	26	23
1	27	24
2	26	29
3	22	26
4	27	24