# 단계 1: 폰트 설치
import matplotlib.font_manager as fm
!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()
# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)
# Step 3: configure the Korean font
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm
# Minus-sign display issue: use the ASCII hyphen instead of the Unicode minus.
mpl.rcParams['axes.unicode_minus'] = False
# Korean font setup
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
# Register the font file with the running font manager. `fm._rebuild()` is a
# private helper that newer matplotlib releases no longer provide, so it is
# only used as a fallback on versions that lack `addfont`.
if hasattr(fm.fontManager, 'addfont'):
    fm.fontManager.addfont(path)
elif hasattr(fm, '_rebuild'):
    fm._rebuild()
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
# Use the retina figure format so text renders sharply.
# `IPython.display.set_matplotlib_formats` is deprecated; prefer the
# `matplotlib_inline` implementation when it is installed.
try:
    from matplotlib_inline.backend_inline import set_matplotlib_formats
except ImportError:
    from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
# Install the Kaggle command-line client.
!pip install kaggle
# Open Colab's upload dialog; the recorded output shows kaggle.json being uploaded here.
from google.colab import files
files.upload()
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: kaggle in /usr/local/lib/python3.8/dist-packages (1.5.12) Requirement already satisfied: python-slugify in /usr/local/lib/python3.8/dist-packages (from kaggle) (7.0.0) Requirement already satisfied: urllib3 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.24.3) Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from kaggle) (4.64.1) Requirement already satisfied: certifi in /usr/local/lib/python3.8/dist-packages (from kaggle) (2022.12.7) Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.25.1) Requirement already satisfied: python-dateutil in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.8.2) Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.15.0) Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.8/dist-packages (from python-slugify->kaggle) (1.3) Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (4.0.0) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (2.10)
Saving kaggle.json to kaggle.json
{'kaggle.json': b'{"username":"minaahayley","key":"<REDACTED>"}'}
!mkdir -p ~/.kaggle #create the ~/.kaggle folder
!cp kaggle.json ~/.kaggle #copy kaggle.json into the ~/.kaggle folder
!chmod 600 ~/.kaggle/kaggle.json #owner-only access, avoids the permission warning
# Check that the credentials file is in place.
# `ls` lists the files found at the given path.
%ls ~/.kaggle
kaggle.json
!kaggle competitions download -c competitive-data-science-predict-future-sales
!unzip competitive-data-science-predict-future-sales.zip
Downloading competitive-data-science-predict-future-sales.zip to /content 33% 5.00M/15.1M [00:00<00:00, 40.2MB/s] 100% 15.1M/15.1M [00:00<00:00, 84.1MB/s] Archive: competitive-data-science-predict-future-sales.zip inflating: item_categories.csv inflating: items.csv inflating: sales_train.csv inflating: sample_submission.csv inflating: shops.csv inflating: test.csv
!mkdir -p ~/.kaggle/competitions/predict-future-sales #create folder name Kaggle
!cp item_categories.csv ~/.kaggle/competitions/predict-future-sales #copy into folder Kaggle
!cp items.csv ~/.kaggle/competitions/predict-future-sales
!cp sales_train.csv ~/.kaggle/competitions/predict-future-sales
!cp sample_submission.csv ~/.kaggle/competitions/predict-future-sales
!cp shops.csv ~/.kaggle/competitions/predict-future-sales
!cp test.csv ~/.kaggle/competitions/predict-future-sales
#다운로드가 제대로 되었는지 확인한다.
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다.
%ls ~/.kaggle/competitions/predict-future-sales
item_categories.csv sales_train.csv shops.csv items.csv sample_submission.csv test.csv
import pandas as pd
import numpy as np
# TIME SERIES
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults
!pip install pmdarima
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.tools import diff
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tools.eval_measures import rmse
# settings
import warnings
warnings.filterwarnings('ignore')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: pmdarima in /usr/local/lib/python3.8/dist-packages (2.0.2) Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.3.5) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.2.0) Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (57.4.0) Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.0.2) Requirement already satisfied: urllib3 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.24.3) Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.7.3) Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (0.29.32) Requirement already satisfied: numpy>=1.21.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.21.6) Requirement already satisfied: statsmodels>=0.13.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (0.13.5) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.19->pmdarima) (2022.7) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.19->pmdarima) (2.8.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.22->pmdarima) (3.1.0) Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.8/dist-packages (from statsmodels>=0.13.2->pmdarima) (0.5.3) Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.8/dist-packages (from statsmodels>=0.13.2->pmdarima) (21.3) Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.8/dist-packages (from 
packaging>=21.3->statsmodels>=0.13.2->pmdarima) (3.0.9) Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from patsy>=0.5.2->statsmodels>=0.13.2->pmdarima) (1.15.0)
# Load every competition table from the local Kaggle data folder.
data_dir = '~/.kaggle/competitions/predict-future-sales'
train = pd.read_csv(f'{data_dir}/sales_train.csv')
item_cat = pd.read_csv(f'{data_dir}/item_categories.csv')
item = pd.read_csv(f'{data_dir}/items.csv')
sub = pd.read_csv(f'{data_dir}/sample_submission.csv')
shops = pd.read_csv(f'{data_dir}/shops.csv')
test = pd.read_csv(f'{data_dir}/test.csv')
Preprocessing
# sales_train.csv: the training set, daily history from January 2013 to October 2015.
# test.csv: the test set; sales for these shops and products are forecast for November 2015.
# Columns that identify a duplicated sales record (item_price is not part of the key).
subset = [
    'date',
    'date_block_num',
    'shop_id',
    'item_id',
    'item_cnt_day',
]
# Count how many rows repeat an earlier row on those columns.
is_duplicate = train.duplicated(subset=subset)
is_duplicate.value_counts()
False 2935825 True 24 dtype: int64
# Keep only the first occurrence of each duplicated record.
train = train.drop_duplicates(subset=subset)
train.shape
(2935825, 6)
# Keep only the sales rows whose shop and item both appear in the test set.
test_shops = test.shop_id.unique()
test_items = test.item_id.unique()
in_test_shops = train.shop_id.isin(test_shops)
in_test_items = train.item_id.isin(test_items)
train = train[in_test_shops & in_test_items]
train.shape
(1224429, 6)
# Display one randomly chosen row of the filtered training data.
train.sample()
date | date_block_num | shop_id | item_id | item_price | item_cnt_day | |
---|---|---|---|---|---|---|
2429801 | 28.02.2015 | 25 | 57 | 1856 | 1199.0 | 1.0 |
from itertools import product

# Build the full (month, shop, item) grid.
# First the (month, shop) pairs for the 34 monthly blocks ...
month_shop_pairs = list(product(np.arange(34), test_shops))
block_shop_combi = pd.DataFrame(month_shop_pairs, columns=['date_block_num', 'shop_id'])
# ... then the (shop, item) pairs ...
shop_item_pairs = list(product(test_shops, test_items))
shop_item_combi = pd.DataFrame(shop_item_pairs, columns=['shop_id', 'item_id'])
# ... and finally join the two on shop_id.
all_combi = pd.merge(block_shop_combi, shop_item_combi, how='inner', on='shop_id')
all_combi.head()
date_block_num | shop_id | item_id | |
---|---|---|---|
0 | 0 | 5 | 5037 |
1 | 0 | 5 | 5320 |
2 | 0 | 5 | 5233 |
3 | 0 | 5 | 5232 |
4 | 0 | 5 | 5268 |
# Attach the daily sales rows to the grid. The left join keeps combinations
# that have no sales; their sales columns come out as NaN.
grid_keys = ['date_block_num', 'shop_id', 'item_id']
train_base = pd.merge(all_combi, train, how='left', on=grid_keys)
train_base.head()
date_block_num | shop_id | item_id | date | item_price | item_cnt_day | |
---|---|---|---|---|---|---|
0 | 0 | 5 | 5037 | NaN | NaN | NaN |
1 | 0 | 5 | 5320 | NaN | NaN | NaN |
2 | 0 | 5 | 5233 | NaN | NaN | NaN |
3 | 0 | 5 | 5232 | NaN | NaN | NaN |
4 | 0 | 5 | 5268 | NaN | NaN | NaN |
# Replace the NaNs left by the join with 0.
train_base = train_base.fillna(0)
# Sum the daily counts for each (shop, item, month).
monthly_keys = ['shop_id', 'item_id', 'date_block_num']
train_monthly = train_base.groupby(monthly_keys)['item_cnt_day'].sum()
train_monthly.head()
shop_id item_id date_block_num 2 30 0 0.0 1 0.0 2 1.0 3 0.0 4 0.0 Name: item_cnt_day, dtype: float64
#train_monthly = train_monthly.unstack(level=-1).T
#dates=pd.date_range(start = '2013-01-01',end='2015-10-01', freq = 'MS')
#train_monthly.index=dates
# train_monthly = train_monthly.reset_index().melt(id_vars='index')
#train_monthly.head()
shop_id | 2 | ... | 59 | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
item_id | 30 | 31 | 32 | 33 | 38 | 42 | 45 | 51 | 53 | 57 | ... | 22118 | 22137 | 22139 | 22145 | 22154 | 22162 | 22163 | 22164 | 22166 | 22167 |
2013-01-01 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2013-02-01 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2013-03-01 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2013-04-01 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2013-05-01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 214200 columns
# Move the (shop_id, item_id, date_block_num) index levels back into columns.
train_monthly = train_monthly.reset_index(drop=False)
train_monthly.head()
shop_id | item_id | date_block_num | item_cnt_day | |
---|---|---|---|---|
0 | 2 | 30 | 0 | 0.0 |
1 | 2 | 30 | 1 | 0.0 |
2 | 2 | 30 | 2 | 1.0 |
3 | 2 | 30 | 3 | 0.0 |
4 | 2 | 30 | 4 | 0.0 |
# Display the largest month index present in the aggregated data.
train_monthly.date_block_num.max()
33
# The summed daily count is now a monthly count, so rename the column to match.
train_monthly = train_monthly.rename(columns={'item_cnt_day': 'item_cnt_month'})
# Preview of test without its ID column; the result is not assigned, so test is unchanged.
test.drop(columns='ID')
shop_id | item_id | |
---|---|---|
0 | 5 | 5037 |
1 | 5 | 5320 |
2 | 5 | 5233 |
3 | 5 | 5232 |
4 | 5 | 5268 |
... | ... | ... |
214195 | 45 | 18454 |
214196 | 45 | 16188 |
214197 | 45 | 15757 |
214198 | 45 | 19648 |
214199 | 45 | 969 |
214200 rows × 2 columns
# Tag every test row with month index 34 and keep only the key columns.
test = test.assign(date_block_num=34)[['date_block_num', 'shop_id', 'item_id']]
test.head()
date_block_num | shop_id | item_id | |
---|---|---|---|
0 | 34 | 5 | 5037 |
1 | 34 | 5 | 5320 |
2 | 34 | 5 | 5233 |
3 | 34 | 5 | 5232 |
4 | 34 | 5 | 5268 |
# Use the first space-separated token of the category name as the item group,
# encoded as integer category codes.
item_grp = item_cat['item_category_name'].astype(str).str.split(' ').str[0]
item_cat['item_group'] = pd.Categorical(item_grp).codes
# Attach the group code to each item through its category id.
category_groups = item_cat[['item_category_id', 'item_group']]
item = item.merge(category_groups, how='left', on='item_category_id')
item.head()
item_name | item_id | item_category_id | item_group | |
---|---|---|---|---|
0 | ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D | 0 | 40 | 7 |
1 | !ABBYY FineReader 12 Professional Edition Full... | 1 | 76 | 11 |
2 | ***В ЛУЧАХ СЛАВЫ (UNV) D | 2 | 40 | 7 |
3 | ***ГОЛУБАЯ ВОЛНА (Univ) D | 3 | 40 | 7 |
4 | ***КОРОБКА (СТЕКЛО) D | 4 | 40 | 7 |
# Strip any '!' from the shop name and use its first space-separated token as
# the city, encoded as integer category codes.
city = shops.shop_name.apply(lambda name: name.replace('!', '').split(' ')[0])
shops['city'] = pd.Categorical(city).codes
shops.head()
shop_name | shop_id | city | |
---|---|---|---|
0 | !Якутск Орджоникидзе, 56 фран | 0 | 29 |
1 | !Якутск ТЦ "Центральный" фран | 1 | 29 |
2 | Адыгея ТЦ "Мега" | 2 | 0 |
3 | Балашиха ТРК "Октябрь-Киномир" | 3 | 1 |
4 | Волжский ТЦ "Волга Молл" | 4 | 2 |
# Add the item_group and city features to both the training and the test frame.
item_features = item[['item_id', 'item_group']]
shop_features = shops[['shop_id', 'city']]
train_monthly = (
    train_monthly
    .merge(item_features, how='left', on='item_id')
    .merge(shop_features, how='left', on='shop_id')
)
test = (
    test
    .merge(item_features, how='left', on='item_id')
    .merge(shop_features, how='left', on='shop_id')
)
train_monthly.head()
shop_id | item_id | date_block_num | item_cnt_month | item_group | city | |
---|---|---|---|---|---|---|
0 | 2 | 30 | 0 | 0.0 | 7 | 0 |
1 | 2 | 30 | 1 | 0.0 | 7 | 0 |
2 | 2 | 30 | 2 | 1.0 | 7 | 0 |
3 | 2 | 30 | 3 | 0.0 | 7 | 0 |
4 | 2 | 30 | 4 | 0.0 | 7 | 0 |
# Display the first rows of the test frame after the feature merges.
test.head()
date_block_num | shop_id | item_id | item_group | city | |
---|---|---|---|---|---|
0 | 34 | 5 | 5037 | 5 | 3 |
1 | 34 | 5 | 5320 | 9 | 3 |
2 | 34 | 5 | 5233 | 5 | 3 |
3 | 34 | 5 | 5232 | 5 | 3 |
4 | 34 | 5 | 5268 | 5 | 3 |
# Model frames: the raw shop/item identifiers are dropped. `drop` returns a
# new frame, so train_monthly and test are left unchanged.
id_columns = ['shop_id', 'item_id']
df_train = train_monthly.drop(columns=id_columns)
df_test = test.drop(columns=id_columns)
for frame in (df_train, df_test):
    display(frame.head())
date_block_num | item_cnt_month | item_group | city | |
---|---|---|---|---|
0 | 0 | 0.0 | 7 | 0 |
1 | 1 | 0.0 | 7 | 0 |
2 | 2 | 1.0 | 7 | 0 |
3 | 3 | 0.0 | 7 | 0 |
4 | 4 | 0.0 | 7 | 0 |
date_block_num | item_group | city | |
---|---|---|---|
0 | 34 | 5 | 3 |
1 | 34 | 9 | 3 |
2 | 34 | 5 | 3 |
3 | 34 | 5 | 3 |
4 | 34 | 5 | 3 |
# Split the training frame into the target column and the feature matrix.
target_column = 'item_cnt_month'
y_train = df_train[target_column]
X_train = df_train.drop(columns=target_column)
X_train.head()
date_block_num | item_group | city | |
---|---|---|---|
0 | 0 | 7 | 0 |
1 | 1 | 7 | 0 |
2 | 2 | 7 | 0 |
3 | 3 | 7 | 0 |
4 | 4 | 7 | 0 |
# The test feature matrix is a copy of df_test, which has no target column.
X_test = df_test.copy()
X_test.head()
date_block_num | item_group | city | |
---|---|---|---|
0 | 34 | 5 | 3 |
1 | 34 | 9 | 3 |
2 | 34 | 5 | 3 |
3 | 34 | 5 | 3 |
4 | 34 | 5 | 3 |
from sklearn import linear_model, preprocessing
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

# LightGBM hyper-parameters shared by every fold.
params = {
    'learning_rate': 0.05,
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 64,
    'verbose': 1,
    'random_state': 42,
    # Fraction of the rows sampled for each iteration; usually lowered to
    # speed up training or reduce overfitting. 1 means every row is used.
    'bagging_fraction': 1,
    # Fraction of the features randomly selected for each tree (0.8 would
    # pick 80% of them per iteration). This is not limited to random-forest
    # boosting. 1 means every feature is used.
    'feature_fraction': 1,
}

# Cross-validation grouped by month: all rows of one date_block_num land in
# the same fold, so a month is never split between training and validation.
folds = GroupKFold(n_splits=6)
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])

for trn_, val_ in folds.split(X_train, y_train, X_train['date_block_num']):
    # trn_/val_ are positional indices, so select positionally on both the
    # features and the target.
    trn_x, trn_y = X_train.iloc[trn_], y_train.iloc[trn_]
    val_x, val_y = X_train.iloc[val_], y_train.iloc[val_]

    reg = lgb.LGBMRegressor(**params, n_estimators=3000)
    # NOTE(review): recent LightGBM releases appear to replace the
    # `early_stopping_rounds`/`verbose` fit arguments with callbacks.
    # Confirm against the installed version before upgrading.
    reg.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], early_stopping_rounds=50, verbose=500)

    # Out-of-fold predictions for the held-out months.
    oof_preds[val_] = reg.predict(val_x.values, num_iteration=reg.best_iteration_)
    # The test prediction is the average over the fold models.
    sub_preds += reg.predict(X_test.values, num_iteration=reg.best_iteration_) / folds.n_splits
Training until validation scores don't improve for 50 rounds. Early stopping, best iteration is: [79] valid_0's rmse: 3.96099 Training until validation scores don't improve for 50 rounds. Early stopping, best iteration is: [28] valid_0's rmse: 3.29469 Training until validation scores don't improve for 50 rounds. Early stopping, best iteration is: [160] valid_0's rmse: 2.63286 Training until validation scores don't improve for 50 rounds. Early stopping, best iteration is: [66] valid_0's rmse: 2.2012 Training until validation scores don't improve for 50 rounds. Early stopping, best iteration is: [51] valid_0's rmse: 3.19721 Training until validation scores don't improve for 50 rounds. [500] valid_0's rmse: 4.09475 Early stopping, best iteration is: [565] valid_0's rmse: 4.09427
# `mean_squared_error` is not imported anywhere above, so import it here.
from sklearn.metrics import mean_squared_error

pred_cnt = sub_preds
# Out-of-fold RMSE, with the predictions clipped to [0, 20].
print('RMSE:', np.sqrt(mean_squared_error(y_train, oof_preds.clip(0., 20.))))
RMSE: 3.2962126297373016
# Reload the original test file to get the ID column back, then pair each ID
# with its prediction clipped to [0, 20].
test = pd.read_csv('~/.kaggle/competitions/predict-future-sales/test.csv')
clipped_preds = pred_cnt.clip(0., 20.)
result = test[['ID']].assign(item_cnt_month=clipped_preds)
result.to_csv('submission.csv', index=False)
10122/16100
0.628695652173913
'kaggle' 카테고리의 다른 글
[Kaggle][필사] Customer Lifetime Value Prediction (0) | 2023.01.05 |
---|---|
[Kaggle][필사] Wallmart_Sales_Top_3___EDA_Feature_Engineering (0) | 2022.12.28 |
[Kaggle][필사] Future Sales Prediction: playground (0) | 2022.12.26 |
bus passenger prediction (0) | 2022.12.23 |
The Boston Housing Dataset (0) | 2022.11.26 |