# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()


# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)


# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild()

#레티나 디스플레이로 폰트가 선명하게 표시되도록 합니다.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')


!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.8/dist-packages (1.5.12)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.8/dist-packages (from kaggle) (7.0.0)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: certifi in /usr/local/lib/python3.8/dist-packages (from kaggle) (2022.12.7)
Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.25.1)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.8/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (4.0.0)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (2.10)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"2b9a4ddc90c33aadf0ac55ad9e6919df"}'}


!mkdir -p ~/.kaggle #create folder name Kaggle
!cp kaggle.json ~/.kaggle #copy kaggle.jason into folder Kaggle
!chmod 600 ~/.kaggle/kaggle.json #ignore Permission Warning


#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle

kaggle.json


!kaggle competitions download -c competitive-data-science-predict-future-sales

!unzip competitive-data-science-predict-future-sales.zip

Downloading competitive-data-science-predict-future-sales.zip to /content
 33% 5.00M/15.1M [00:00<00:00, 40.2MB/s]
100% 15.1M/15.1M [00:00<00:00, 84.1MB/s]
Archive:  competitive-data-science-predict-future-sales.zip
  inflating: item_categories.csv     
  inflating: items.csv               
  inflating: sales_train.csv         
  inflating: sample_submission.csv   
  inflating: shops.csv               
  inflating: test.csv


!mkdir -p ~/.kaggle/competitions/predict-future-sales #create folder name Kaggle
!cp item_categories.csv ~/.kaggle/competitions/predict-future-sales #copy into folder Kaggle
!cp items.csv ~/.kaggle/competitions/predict-future-sales
!cp sales_train.csv ~/.kaggle/competitions/predict-future-sales
!cp sample_submission.csv ~/.kaggle/competitions/predict-future-sales
!cp shops.csv ~/.kaggle/competitions/predict-future-sales
!cp test.csv ~/.kaggle/competitions/predict-future-sales


#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle/competitions/predict-future-sales

item_categories.csv  sales_train.csv        shops.csv
items.csv            sample_submission.csv  test.csv


import pandas as pd 
import numpy as np 

# TIME SERIES   
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults 
!pip install pmdarima
from pmdarima import auto_arima 
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.api import VAR 
from statsmodels.tsa.stattools import adfuller 
from statsmodels.tsa.statespace.tools import diff 
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 
from statsmodels.tools.eval_measures import rmse 

# settings 
import warnings 
warnings.filterwarnings('ignore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: pmdarima in /usr/local/lib/python3.8/dist-packages (2.0.2)
Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.3.5)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.2.0)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (57.4.0)
Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.0.2)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.24.3)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.7.3)
Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (0.29.32)
Requirement already satisfied: numpy>=1.21.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (1.21.6)
Requirement already satisfied: statsmodels>=0.13.2 in /usr/local/lib/python3.8/dist-packages (from pmdarima) (0.13.5)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.19->pmdarima) (2022.7)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.19->pmdarima) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.22->pmdarima) (3.1.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.8/dist-packages (from statsmodels>=0.13.2->pmdarima) (0.5.3)
Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.8/dist-packages (from statsmodels>=0.13.2->pmdarima) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.8/dist-packages (from packaging>=21.3->statsmodels>=0.13.2->pmdarima) (3.0.9)
Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from patsy>=0.5.2->statsmodels>=0.13.2->pmdarima) (1.15.0)


train = pd.read_csv('~/.kaggle/competitions/predict-future-sales/sales_train.csv')
item_cat = pd.read_csv('~/.kaggle/competitions/predict-future-sales/item_categories.csv')
item = pd.read_csv('~/.kaggle/competitions/predict-future-sales/items.csv') 
sub = pd.read_csv('~/.kaggle/competitions/predict-future-sales/sample_submission.csv')
shops = pd.read_csv('~/.kaggle/competitions/predict-future-sales/shops.csv')
test = pd.read_csv('~/.kaggle/competitions/predict-future-sales/test.csv')


# sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
# test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.


# drop duplicates 
subset = ['date', 'date_block_num', 'shop_id', 'item_id', 'item_cnt_day']
train.duplicated(subset=subset).value_counts()

False    2935825
True          24
dtype: int64


train.drop_duplicates(subset=subset, inplace=True)
train.shape

(2935825, 6)


# drop shops & items not in test data 
test_shops = test.shop_id.unique()
test_items = test.item_id.unique()

train = train[train.shop_id.isin(test_shops)]
train = train[train.item_id.isin(test_items)]
train.shape

(1224429, 6)


train.sample()


from itertools import product

# create all combinations 
block_shop_combi = pd.DataFrame(list(product(np.arange(34), test_shops)), columns=['date_block_num', 'shop_id'])
shop_item_combi = pd.DataFrame(list(product(test_shops, test_items)), columns=['shop_id', 'item_id'])
all_combi = block_shop_combi.merge(shop_item_combi, on='shop_id', how='inner')
all_combi.head()


# group by monthly 
train_base = all_combi.merge(train, on=['date_block_num', 'shop_id', 'item_id'], how='left')
train_base.head()


train_base.fillna(0, inplace=True)


# summary count by month 
train_monthly = train_base.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].sum()


train_monthly.head()

shop_id  item_id  date_block_num
2        30       0                 0.0
                  1                 0.0
                  2                 1.0
                  3                 0.0
                  4                 0.0
Name: item_cnt_day, dtype: float64


#train_monthly = train_monthly.unstack(level=-1).T
#dates=pd.date_range(start = '2013-01-01',end='2015-10-01', freq = 'MS')
#train_monthly.index=dates
# train_monthly = train_monthly.reset_index().melt(id_vars='index')
#train_monthly.head()


train_monthly = train_monthly.reset_index()


train_monthly.head()


train_monthly.date_block_num.max()

33


train_monthly.columns = ['shop_id', 'item_id', 'date_block_num', 'item_cnt_month']


test.drop('ID', axis=1)


test['date_block_num'] = 34


test = test[['date_block_num', 'shop_id','item_id']]
test.head()


item_grp = item_cat['item_category_name'].apply(lambda x : str(x).split(' ')[0])
item_cat['item_group'] = pd.Categorical(item_grp).codes
item = pd.merge(item, item_cat.loc[:,['item_category_id', 'item_group']], on='item_category_id', how='left')

item.head()


city = shops.shop_name.apply(lambda x : str.replace(x, '!', '')).apply(lambda x : x.split(' ')[0])
shops['city'] = pd.Categorical(city).codes 
shops.head()


train_monthly = train_monthly.merge(item[['item_id', 'item_group']], how='left', on='item_id')


test = test.merge(item[['item_id', 'item_group']], how='left', on='item_id')


train_monthly = train_monthly.merge(shops[['shop_id', 'city']], how='left', on='shop_id')
test = test.merge(shops[['shop_id', 'city']], how='left', on='shop_id')


train_monthly.head()


test.head()


df_train = train_monthly.copy()
df_train = df_train.drop(['shop_id', 'item_id'], axis=1)


df_test = test.copy()
df_test = df_test.drop(['shop_id', 'item_id'], axis=1)


display(df_train.head())
display(df_test.head())


X_train = df_train.drop('item_cnt_month', axis=1)
y_train = df_train['item_cnt_month'] 
X_train.head()


X_test = df_test.copy()
X_test.head()


from sklearn import linear_model, preprocessing 
from sklearn.model_selection import GroupKFold 
import lightgbm as lgb 

params = {'learning_rate' : 0.05,
          'objective' : 'regression',
          'metric' : 'rmse',
          'num_leaves':64, 
          'verbose' : 1,
          'random_state': 42,
          'bagging_fraction':1, # 각각 iteration 에 사용되는 데이터의 일부를 지정하는데, 일반적으로 훈련 속도를 높이거나, 과적합을 피할때 사용한다.
          'feature_fraction':1} # used when your boosting is random forest. 
          # 즉, 랜덤 포레스트 일 경우에 사용. 
          # 0.8 feature fraction은 LightGBM이 Tree를 만들 때 각각 iteration 반복에서 파라미터 중에서 80%를 랜덤으로 선택.

folds = GroupKFold(n_splits=6)
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])

for fold_, (trn_, val_) in enumerate(folds.split(X_train, y_train, X_train['date_block_num'])):
  trn_x, trn_y = X_train.iloc[trn_], y_train[trn_]
  val_x, val_y = X_train.iloc[val_], y_train[val_]

  reg = lgb.LGBMRegressor(**params, n_estimators=3000)
  reg.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], early_stopping_rounds=50, verbose=500) 

  oof_preds[val_] = reg.predict(val_x.values, num_iteration=reg.best_iteration_) 
  sub_preds += reg.predict(X_test.values, num_iteration=reg.best_iteration_) / folds.n_splits

Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[79]	valid_0's rmse: 3.96099
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[28]	valid_0's rmse: 3.29469
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[160]	valid_0's rmse: 2.63286
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[66]	valid_0's rmse: 2.2012
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[51]	valid_0's rmse: 3.19721
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's rmse: 4.09475
Early stopping, best iteration is:
[565]	valid_0's rmse: 4.09427


pred_cnt = sub_preds


print('RMSE:', np.sqrt(mean_squared_error(y_train, oof_preds.clip(0.,20.))))

RMSE: 3.2962126297373016


test = pd.read_csv('~/.kaggle/competitions/predict-future-sales/test.csv')


result = pd.DataFrame({
    'ID' : test['ID'], 
    'item_cnt_month' : pred_cnt.clip(0., 20.)
})
result.to_csv('submission.csv', index=False)


10122/16100

0.628695652173913


# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()


# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)


# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild()

#레티나 디스플레이로 폰트가 선명하게 표시되도록 합니다.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')


from __future__ import division

from datetime import datetime, timedelta,date
import pandas as pd
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans


import plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go

import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')


tx_data = pd.read_csv('/content/drive/MyDrive/data/customer_segmentation.csv', encoding='cp1252')


tx_data.head()


tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])


tx_data['InvoiceYearMonth'] = tx_data['InvoiceDate'].apply(lambda x : 100*x.year + x.month)


tx_data.describe()


tx_data['Country'].value_counts()

United Kingdom          495478
Germany                   9495
France                    8557
EIRE                      8196
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               2002
Portugal                  1519
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Unspecified                446
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
Israel                     297
USA                        291
Hong Kong                  288
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58
Lebanon                     45
Lithuania                   35
Brazil                      32
Czech Republic              30
Bahrain                     19
Saudi Arabia                10
Name: Country, dtype: int64


#we will be using only UK data
tx_uk = tx_data.query("Country == 'United Kingdom'").reset_index(drop=True)


#create a generic user dataframe to keep CustomerID and new segmentation scores 
tx_user = pd.DataFrame(tx_data['CustomerID'].unique())
tx_user.columns = ['CustomerID']
tx_user.head()


#get the max purchase date for each customer and create a dataframe with it
tx_max_purchase = tx_uk.groupby('CustomerID')['InvoiceDate'].max().reset_index()
tx_max_purchase.columns = ['CustomerID', 'MaxPurchaseData']
tx_max_purchase.head()


# Compare the last transaction of dataset with last transaction dates of the individual customer IDs.
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseData'].max() - tx_max_purchase['MaxPurchaseData']).dt.days 
tx_max_purchase.head()


# merge this dataframe to our new user dataframe 
tx_user = pd.merge(tx_user, tx_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
tx_user.head()


from sklearn.cluster import KMeans 

sse = {} # error 

tx_recency = tx_user[['Recency']]

for k in range(1, 10): 
  kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency)
  tx_recency['clusters'] = kmeans.labels_ 
  sse[k] = kmeans.inertia_ # 군집의 응집도 값이 작을수록 군집화가 잘 되었다고 평가 

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()


#build 4 clusters for recency and add it to dataframe
kmeans = KMeans(n_clusters=4)
tx_user['RecencyCluster'] = kmeans.fit_predict(tx_user[['Recency']])


tx_user.head()


tx_user.groupby('RecencyCluster')['Recency'].describe()


# function for ordering cluster numbers 

def order_cluster(cluster_field_name, target_field_name, df, ascending): 
  new_cluster_field_name = 'new_' + cluster_field_name
  df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
  df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
  df_new['index'] = df_new.index 
  df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
  df_final = df_final.drop(cluster_field_name, axis=1)
  df_final = df_final.rename(columns={'index' : cluster_field_name})
  return df_final


tx_user = order_cluster('RecencyCluster', 'Recency', tx_user, False)
tx_user


tx_user.groupby('RecencyCluster')['Recency'].describe()


#get order counts for each user and create a dataframe with it 
tx_frequency = tx_uk.groupby('CustomerID')['InvoiceDate'].count().reset_index()
tx_frequency.columns = ['CustomerID', 'Frequency']
tx_frequency.head()


# add this data to our main dataframe 
tx_user = tx_user.merge(tx_frequency, on ='CustomerID')
tx_user.head()


from sklearn.cluster import KMeans 

sse = {} 

tx_frequency = tx_user[['Frequency']]

for k in range(1, 10): 
  kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_frequency)  
  tx_frequency['clusters'] = kmeans.labels_ 
  sse[k] = kmeans.inertia_ 

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel('Number of cluster')
plt.show()


# Applying K-Means 
Kmeans = KMeans(n_clusters=4)
tx_user['FrequencyCluster'] = kmeans.fit_predict(tx_user[['Frequency']])


tx_user.groupby('FrequencyCluster')['Frequency'].describe()


# order the frequency cluster 
tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)
tx_user.groupby('FrequencyCluster')['Frequency'].describe()


# calculate revenue for each customer 
tx_uk['Revenue'] = tx_uk['UnitPrice'] * tx_uk['Quantity']
tx_revenue = tx_uk.groupby('CustomerID').Revenue.sum().reset_index()
tx_revenue.head()


# merge it with our main dataframe 
tx_user = tx_user.merge(tx_revenue, on='CustomerID')
tx_user.head()


from sklearn.cluster import KMeans

sse={} # error

tx_monetary = tx_user[['Revenue']]
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_monetary)
    tx_monetary["clusters"] = kmeans.labels_ 
    sse[k] = kmeans.inertia_ 

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()


#apply clustering
kmeans = KMeans(n_clusters=4)
tx_user['RevenueCluster'] = kmeans.fit_predict(tx_user[['Revenue']])

#order the cluster numbers
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)

#show details of the dataframe
tx_user.groupby('RevenueCluster')['Revenue'].describe()


# calculate overall score and use mean() to see details 

tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
tx_user.groupby('OverallScore')['Recency', 'Frequency', 'Revenue'].mean()


tx_user['Segment'] = 'Low-Value'


tx_user.loc[tx_user['OverallScore']>2, 'Segment'] = 'Mid-Value'
tx_user.loc[tx_user['OverallScore']>4, 'Segment'] = 'High-Value'


tx_uk['InvoiceDate'].describe()

count                  495478
unique                  21220
top       2011-10-31 14:41:00
freq                     1114
first     2010-12-01 08:26:00
last      2011-12-09 12:49:00
Name: InvoiceDate, dtype: object


tx_uk.InvoiceDate = pd.to_datetime(tx_uk.InvoiceDate).dt.date
tx_3m = tx_uk[(tx_uk.InvoiceDate < date(2011, 6, 1)) & (tx_uk.InvoiceDate >= date(2011, 3, 1))].reset_index()
tx_3m.InvoiceDate # 3 months time

0        2011-03-01
1        2011-03-01
2        2011-03-01
3        2011-03-01
4        2011-03-01
            ...    
95188    2011-05-31
95189    2011-05-31
95190    2011-05-31
95191    2011-05-31
95192    2011-05-31
Name: InvoiceDate, Length: 95193, dtype: object


tx_uk.InvoiceDate = pd.to_datetime(tx_uk.InvoiceDate).dt.date
tx_6m = tx_uk[(tx_uk.InvoiceDate >= date(2011, 6, 1)) & (tx_uk.InvoiceDate < date(2011, 12, 1))].reset_index()
tx_6m.InvoiceDate # 6 months time

0         2011-06-01
1         2011-06-01
2         2011-06-01
3         2011-06-01
4         2011-06-01
             ...    
278961    2011-11-30
278962    2011-11-30
278963    2011-11-30
278964    2011-11-30
278965    2011-11-30
Name: InvoiceDate, Length: 278966, dtype: object


# calculte revenue and create a new dataframe for it 
tx_6m['Revenue'] = tx_6m['UnitPrice'] * tx_6m['Quantity']
tx_user_6m = tx_6m.groupby('CustomerID')['Revenue'].sum().reset_index()
tx_user_6m.columns = ['CustomerID', 'm6_Revenue']


tx_user_6m


# plot LTV histogram 
sns.distplot(tx_user_6m['m6_Revenue'], kde=False)
plt.title('6m Revenue');


tx_merge = tx_user.merge(tx_user_6m, on='CustomerID', how='left')
tx_merge = tx_merge.fillna(0)  #Only people who are in the timeline of tx_user_6m
tx_merge.head()


tx_graph = tx_merge.query('m6_Revenue < 50000') #because max values are ending at 50,000 as seen in graph above

sns.scatterplot(data=tx_graph.query('Segment == "Low-Value"'), x='OverallScore', y='m6_Revenue', label='Low')
sns.scatterplot(data=tx_graph.query('Segment == "Mid-Value"'), x='OverallScore', y='m6_Revenue', label='Mid')
sns.scatterplot(data=tx_graph.query('Segment == "High-Value"'), x='OverallScore', y='m6_Revenue', label='High')

plt.title('LTV')
plt.xlabel('RFM Score')
plt.ylabel('6m LTV')
plt.legend(bbox_to_anchor=(1.02, 1.02));


# remove outliers 
tx_merge = tx_merge[tx_merge['m6_Revenue'] < tx_merge['m6_Revenue'].quantile(0.99)]


tx_merge.head()


# creating 3 clusters 
kmeans = KMeans(n_clusters=3) 
tx_merge['LTVCluster'] = kmeans.fit_predict(tx_merge[['m6_Revenue']])
tx_merge.head()


#order cluster number based on LTV
tx_merge = order_cluster('LTVCluster', 'm6_Revenue', tx_merge, True)

# creating a new cluster dataframe 
tx_cluster = tx_merge.copy()


tx_cluster.groupby('LTVCluster')['m6_Revenue'].describe()


tx_cluster.head()


# convert categorical columns to numerical 
tx_class = pd.get_dummies(tx_cluster)
tx_class.head()


# calculate and show correlations 
corr_matrix = tx_class.corr()
corr_matrix['LTVCluster'].sort_values(ascending=False)

LTVCluster            1.000000
m6_Revenue            0.878888
Revenue               0.776532
OverallScore          0.633204
FrequencyCluster      0.630540
RevenueCluster        0.607540
Segment_High-Value    0.603418
Frequency             0.567484
RecencyCluster        0.355218
Segment_Mid-Value    -0.026781
CustomerID           -0.027210
Recency              -0.346594
Segment_Low-Value    -0.403302
Name: LTVCluster, dtype: float64


# create X and y, X will be feature set and y is the label - LTV 
X = tx_class.drop(['LTVCluster', 'm6_Revenue'], axis=1)
y = tx_class['LTVCluster'] 

# split training and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=56)


#XGBoost Multiclassification Model 
ltv_xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_jobs=-1)
ltv_xgb_model.fit(X_train, y_train) 

print('Accuracy of XGB classfier on training set: {:.2f}'.format(ltv_xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classfier on test set: {:.2f}'.format(ltv_xgb_model.score(X_test, y_test)))

y_pred = ltv_xgb_model.predict(X_test)

Accuracy of XGB classfier on training set: 0.96
Accuracy of XGB classfier on test set: 0.90


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       146
           1       0.82      0.74      0.78        43
           2       0.57      0.57      0.57         7

    accuracy                           0.90       196
   macro avg       0.78      0.76      0.77       196
weighted avg       0.90      0.90      0.90       196


y_test

1212    0
1584    0
954     0
3723    1
336     0
       ..
1994    0
2198    0
45      0
3681    1
3172    1
Name: LTVCluster, Length: 196, dtype: int64


y_pred

array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 1, 2, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 2, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1])



# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()



# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)



# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild()

#레티나 디스플레이로 폰트가 선명하게 표시되도록 합니다.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')



import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()

from sklearn import model_selection
from sklearn import metrics, ensemble, linear_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')



!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.8/dist-packages (1.5.12)
Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: certifi in /usr/local/lib/python3.8/dist-packages (from kaggle) (2022.12.7)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.8/dist-packages (from kaggle) (7.0.0)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.8/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (3.0.4)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"2b9a4ddc90c33aadf0ac55ad9e6919df"}'}



!mkdir -p ~/.kaggle #create folder name Kaggle
!cp kaggle.json ~/.kaggle #copy kaggle.jason into folder Kaggle
!chmod 600 ~/.kaggle/kaggle.json #ignore Permission Warning 

%ls ~/.kaggle

kaggle.json



!kaggle competitions download -c walmart-recruiting-store-sales-forecasting

!unzip walmart-recruiting-store-sales-forecasting.zip

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 160MB/s]
Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip



!unzip features.csv.zip    
!unzip sampleSubmission.csv.zip  
!unzip test.csv.zip
!unzip train.csv.zip

Archive:  features.csv.zip
  inflating: features.csv            
Archive:  sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  train.csv.zip
  inflating: train.csv



!mkdir -p ~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting #create folder name Kaggle
!cp features.csv ~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting #copy into folder Kaggle 
!cp sampleSubmission.csv  ~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting
!cp stores.csv  ~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting
!cp test.csv  ~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting
!cp train.csv  ~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting

%ls ~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting

features.csv  sampleSubmission.csv  stores.csv  test.csv  train.csv



features = pd.read_csv('~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting/features.csv')
train = pd.read_csv('~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting/train.csv')
stores = pd.read_csv('~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting/stores.csv')
test = pd.read_csv('~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting/test.csv')
sample_submission = pd.read_csv('~/.kaggle/competitions/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv')



feature_store = features.merge(stores, on='Store', how='inner')
train_df = train.merge(feature_store, on=['Store', 'Date', 'IsHoliday'], how='left')
train_df = train_df.reset_index(drop=True)

test_df = test.merge(feature_store, on=['Store', 'Date', 'IsHoliday'], how='left')
test_df = test_df.reset_index(drop=True)



print(train_df.shape)
print(test_df.shape)

(421570, 16)
(115064, 15)



train_df.describe().T



# Converting date column to datetime 
feature_store['Date'] = pd.to_datetime(feature_store['Date'])
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

# Adding some basic datetime features 
feature_store['Day'] = feature_store['Date'].dt.day 
feature_store['Week'] = feature_store['Date'].dt.week
feature_store['Month'] = feature_store['Date'].dt.month
feature_store['Year'] = feature_store['Date'].dt.year



train.sample()



feature_store.sample()



train_df = train.merge(feature_store, how='left', on=['Store', 'Date', 'IsHoliday']).reset_index(drop=True)
test_df = test.merge(feature_store, how='left', on=['Store', 'Date', 'IsHoliday']).reset_index(drop=True)



df_weeks = train_df.groupby('Week').sum()



sns.lineplot(data=df_weeks, x=df_weeks.index, y='Weekly_Sales')
plt.title('Sales over weeks')

Text(0.5, 1.0, 'Sales over weeks')



sns.lineplot(x = df_weeks.index, y=df_weeks['MarkDown1'], label='MarkDown1')
sns.lineplot(x = df_weeks.index, y=df_weeks['MarkDown2'], label='MarkDown2')
sns.lineplot(x = df_weeks.index, y=df_weeks['MarkDown3'], label='MarkDown3')
sns.lineplot(x = df_weeks.index, y=df_weeks['MarkDown4'], label='MarkDown4')
sns.lineplot(x = df_weeks.index, y=df_weeks['MarkDown5'], label='MarkDown5')
sns.lineplot(data=df_weeks, x=df_weeks.index, y='Weekly_Sales', label='Weekly Sales')

<matplotlib.axes._subplots.AxesSubplot at 0x7faafcd979d0>



weekly_sales = train_df.groupby(['Year', 'Week'], as_index=False).agg({'Weekly_Sales': ['mean', 'median']})
weekly_sales2010 = train_df.loc[train_df['Year']==2010].groupby('Week').agg({'Weekly_Sales': ['mean', 'median']})
weekly_sales2011 = train_df.loc[train_df['Year']==2011].groupby('Week').agg({'Weekly_Sales': ['mean', 'median']})
weekly_sales2012 = train_df.loc[train_df['Year']==2012].groupby('Week').agg({'Weekly_Sales': ['mean', 'median']})



sns.lineplot(data=weekly_sales2010, x=weekly_sales2010['Weekly_Sales']['mean'].index, y=weekly_sales2010['Weekly_Sales']['mean'])
sns.lineplot(data=weekly_sales2011, x=weekly_sales2011['Weekly_Sales']['mean'].index, y=weekly_sales2011['Weekly_Sales']['mean'])
sns.lineplot(data=weekly_sales2012, x=weekly_sales2012['Weekly_Sales']['mean'].index, y=weekly_sales2012['Weekly_Sales']['mean'])

<matplotlib.axes._subplots.AxesSubplot at 0x7faafc987820>



# Converting the temperature to celsius for a better interpretation
train_df['Temperature'] = train_df['Temperature'].apply(lambda x : (x-32)/1.8)
test_df['Temperature'] = test_df['Temperature'].apply(lambda x : (x-32)/1.8)



train_plt = train_df.sample(frac=0.20)



print(train_df.shape)
print(train_plt.shape)

(421570, 20)
(84314, 20)



fig = plt.figure(figsize=(14, 5))
ax1 = fig.add_subplot(1, 2, 1) # 서브플롯들을 2 x 1 배열로 배치 그중 첫번째
ax2 = fig.add_subplot(1, 2, 2)  # 서브플롯들을 2 x 1 배열로 배치 그중 두번째

# Temperature
sns.regplot(data=train_plt, x='Temperature', y='Weekly_Sales', ax=ax1) 
ax1.set_title('Temparature and Sales by holiday')

sns.histplot(data=train_plt, x='Temperature', hue='IsHoliday', ax=ax2) 
ax2.set_title('Temparature and Sales by holiday')

plt.show();



fig = plt.figure(figsize=(14, 5))
ax1 = fig.add_subplot(1, 2, 1) # 서브플롯들을 2 x 1 배열로 배치 그중 첫번째
ax2 = fig.add_subplot(1, 2, 2)  # 서브플롯들을 2 x 1 배열로 배치 그중 두번째

# Fuel_Price
sns.regplot(data=train_plt, x='Fuel_Price', y='Weekly_Sales', ax=ax1) 
ax1.set_title('Fuel_Price and Sales by holiday')

sns.histplot(data=train_plt, x='Fuel_Price', hue='IsHoliday', ax=ax2) 
ax2.set_title('Fuel_Price and Sales by holiday')

plt.show();



fig = plt.figure(figsize=(14, 5))
ax1 = fig.add_subplot(1, 2, 1) # 서브플롯들을 2 x 1 배열로 배치 그중 첫번째
ax2 = fig.add_subplot(1, 2, 2)  # 서브플롯들을 2 x 1 배열로 배치 그중 두번째

# CPI
sns.regplot(data=train_plt, x='CPI', y='Weekly_Sales', ax=ax1) 
ax1.set_title('CPI and Sales by holiday')

sns.histplot(data=train_plt, x='CPI', hue='IsHoliday', ax=ax2) 
ax2.set_title('CPI and Sales by holiday')

plt.show();



fig = plt.figure(figsize=(14, 5))
ax1 = fig.add_subplot(1, 2, 1) # 서브플롯들을 2 x 1 배열로 배치 그중 첫번째
ax2 = fig.add_subplot(1, 2, 2)  # 서브플롯들을 2 x 1 배열로 배치 그중 두번째

# Unemployment
sns.regplot(data=train_plt, x='Unemployment', y='Weekly_Sales', ax=ax1) 
ax1.set_title('Unemployment and Sales by holiday')

sns.histplot(data=train_plt, x='Unemployment', hue='IsHoliday', ax=ax2) 
ax2.set_title('Unemployment and Sales by holiday')

plt.show();



sizes = train_plt.groupby('Size')['Weekly_Sales'].mean().reset_index()

sns.lineplot(data=sizes, x=sizes.index, y=sizes.Weekly_Sales) 
plt.title('Store size and Sales');



store_type = stores[['Type', 'Size']]
sns.boxplot(data=store_type, x='Type', y='Size')

<matplotlib.axes._subplots.AxesSubplot at 0x7faafc29ad60>



dept = train_plt.groupby('Dept')['Weekly_Sales'].mean().reset_index() 

sns.barplot(data=dept, x='Dept', y='Weekly_Sales')
plt.title('Department and sales');



plt.figure(figsize=(16, 16))

sns.heatmap(train_df.corr(), 
            linewidths=0.1,
            square=True,
            annot=True,
            fmt='.2f',
            cmap='Blues')

<matplotlib.axes._subplots.AxesSubplot at 0x7faafc0cfd30>



weekly_sales_corr = pd.DataFrame(train_df.corr()['Weekly_Sales']).sort_values(by='Weekly_Sales', ascending=False)

plt.figure(figsize=(8,4))
sns.barplot(data=weekly_sales_corr, x=weekly_sales_corr.index, y='Weekly_Sales')
plt.xticks(rotation=70)
plt.title('Feature Correlation with Sales');



data_train = train_df.copy()
data_test = test_df.copy()



data_train['Days_to_Thanksgiving'] = (pd.to_datetime(train_df['Year'].astype(str)+"-11-24", format='%Y-%m-%d') - train_df['Date']).dt.days.astype(int)

data_train['Days_to_Christmas'] = (pd.to_datetime(train_df['Year'].astype(str)+"-12-24", format='%Y-%m-%d') - train_df['Date']).dt.days.astype(int)



data_test['Days_to_Thanksgiving'] = (pd.to_datetime(test_df['Year'].astype(str)+"-11-24", format='%Y-%m-%d') - test_df['Date']).dt.days.astype(int)

data_test['Days_to_Christmas'] = (pd.to_datetime(test_df['Year'].astype(str)+"-12-24", format='%Y-%m-%d') - test_df['Date']).dt.days.astype(int)



data_train['SuperBowlWeek'] = train_df['Week'].apply(lambda x : 1 if x == 6 else 0)
data_train['LaborDay'] = train_df['Week'].apply(lambda x : 1 if x == 36 else 0)
data_train['Thanksgiving'] = train_df['Week'].apply(lambda x : 1 if x == 47 else 0)
data_train['Christmas'] = train_df['Week'].apply(lambda x : 1 if x == 52 else 0)



data_test['SuperBowlWeek'] = test_df['Week'].apply(lambda x : 1 if x == 6 else 0)
data_test['LaborDay'] = test_df['Week'].apply(lambda x : 1 if x == 36 else 0)
data_test['Thanksgiving'] = test_df['Week'].apply(lambda x : 1 if x == 47 else 0)
data_test['Christmas'] = test_df['Week'].apply(lambda x : 1 if x == 52 else 0)



data_train['MarkDownSum'] = train_df['MarkDown1'] + train_df['MarkDown2'] + train_df['MarkDown3'] + train_df['MarkDown4'] + train_df['MarkDown5']    
data_test['MarkDownSum'] = test_df['MarkDown1'] + test_df['MarkDown2'] + test_df['MarkDown3'] + test_df['MarkDown4'] + test_df['MarkDown5']



data_train.isna().sum()[data_train.isna().sum() > 0]

MarkDown1      270889
MarkDown2      310322
MarkDown3      284479
MarkDown4      286603
MarkDown5      270138
MarkDownSum    324514
dtype: int64



data_test.isna().sum()[data_test.isna().sum() > 0]

MarkDown1         149
MarkDown2       28627
MarkDown3        9829
MarkDown4       12888
CPI             38162
Unemployment    38162
MarkDownSum     37457
dtype: int64



data_train.fillna(0, inplace=True)



data_test['CPI'].fillna(data_test['CPI'].mean(), inplace=True)
data_test['Unemployment'].fillna(data_test['Unemployment'].mean(), inplace=True)



data_test.fillna(0, inplace=True)



data_train['IsHoliday'] = data_train['IsHoliday'].apply(lambda x : 1 if x == True else 0)
data_test['IsHoliday'] = data_test['IsHoliday'].apply(lambda x : 1 if x == True else 0)



data_train['Type'] = data_train['Type'].apply(lambda x: 1 if x =='A' else (2 if x =='B' else 3))
data_test['Type'] = data_test['Type'].apply(lambda x: 1 if x =='A' else (2 if x =='B' else 3))



features = [feature for feature in data_train.columns if feature not in ('Date', 'Weekly_Sales') ]



X = data_train[features].copy()
y = data_train.Weekly_Sales.copy()



data_sample = data_train.copy().sample(frac=.25)
X_sample = data_sample[features].copy()
y_sample = data_sample.Weekly_Sales.copy()



X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X_sample, y_sample, random_state=0, test_size=0.15)



feat_model = xgb.XGBRegressor(random_state=0, objective='reg:squarederror')
feat_model.fit(X_train, y_train)

XGBRegressor(objective='reg:squarederror')



!pip install eli5
import eli5
from eli5.sklearn import PermutationImportance 

perm = PermutationImportance(feat_model, random_state=1)
perm.fit(X_valid, y_valid)

feature_weights = eli5.show_weights(perm, top=len(X_train.columns), feature_names= X_valid.columns.tolist())

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: eli5 in /usr/local/lib/python3.8/dist-packages (0.13.0)
Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.8/dist-packages (from eli5) (0.8.10)
Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from eli5) (1.7.3)
Requirement already satisfied: jinja2>=3.0.0 in /usr/local/lib/python3.8/dist-packages (from eli5) (3.1.2)
Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from eli5) (1.15.0)
Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.8/dist-packages (from eli5) (1.21.6)
Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.8/dist-packages (from eli5) (1.0.2)
Requirement already satisfied: attrs>17.1.0 in /usr/local/lib/python3.8/dist-packages (from eli5) (22.1.0)
Requirement already satisfied: graphviz in /usr/local/lib/python3.8/dist-packages (from eli5) (0.10.1)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.8/dist-packages (from jinja2>=3.0.0->eli5) (2.0.1)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.20->eli5) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.20->eli5) (3.1.0)



feature_weights



s = pd.Series(dict(zip(X_valid.columns.to_list(), perm.feature_importances_))).sort_values(ascending=False)
s.plot(kind='bar');



# Eval metric for the competition 
def WMAE(dataset, real, predict): 
  weights = datasets.IsHoliday.apply(lambda x : 5 if x else 1)
  return np.round(np.sum(weights*abs(real-predict))/np.sum(weights),2)



models = {
    'LGBM' : lgb.LGBMRegressor(random_state=0),
    'XGBoost' : xgb.XGBRegressor(random_state=0, objective ='reg:squarederror'), 
    'CatBoost' : cb.CatBoostRegressor(random_state=0, verbose=False), 
    'HGBR' : HistGradientBoostingRegressor(random_state=0), 
    'ExtraTr' : ensemble.ExtraTreesRegressor(bootstrap = True, random_state=0),
    'RandomF' : ensemble.RandomForestRegressor(random_state=0)
}



def model_evaluation (name, model, models, X_train, y_train, X_valid, y_valid): 

  rmses = []

  for i in range(len(models)): 

    # Model fit
    model.fit(X_train, y_train)
    
    # Model predict 
    y_preds = model.predict(X_valid)
    
    # RMSE 
    rmse = np.sqrt(np.mean( (y_valid-y_preds)**2))
    rmses.append(rmse) 
  
  return np.mean(rmses)



for name, model in models.items(): 
  result = model_evaluation(name, model, models, X_train, y_train, X_valid, y_valid)

  print(name + ' Valid RMSE : {:.4f}'.format(result))

LGBMValid RMSE : 6637.5050
XGBoostValid RMSE : 11147.8071
CatBoostValid RMSE : 5577.6563
HGBRValid RMSE : 6828.7690
ExtraTrValid RMSE : 4530.0808
RandomFValid RMSE : 4450.2748



X = data_train[features].copy()
y = data_train.Weekly_Sales.copy()



X_baseline = X[['Dept', 'Size','Store', 'Week', 'IsHoliday', 'Type', 'Year', 'Day']].copy()



X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X_baseline, y, random_state=0, test_size=0.1)



rf = ensemble.RandomForestRegressor(n_estimators=60,
                                    max_depth=25,
                                    min_samples_split=3,
                                    min_samples_leaf=1)

rf.fit(X_train.values, y_train.values)

RandomForestRegressor(max_depth=25, min_samples_split=3, n_estimators=60)



test = data_test[['Dept', 'Size','Store',  'Week', 'IsHoliday', 'Type', 'Year', 'Day']].copy()
predict_rf = rf.predict(test)



sample_submission['Weekly_Sales'] = predict_rf
sample_submission.to_csv('submission.csv', index=False)



# 2719 
58/688 * 100

8.430232558139535



etr = ensemble.ExtraTreesRegressor(n_estimators=50, bootstrap=True, random_state=0) 
etr.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=True, n_estimators=50, random_state=0)



predict_etr = etr.predict(test)



avg_preds = (predict_rf + predict_etr) / 2



sample_submission['Weekly_Sales'] = avg_preds 
sample_submission.to_csv('submission.csv', index=False)



#2672 
42/688 * 100

6.104651162790697



# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()



# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)



# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild()

#레티나 디스플레이로 폰트가 선명하게 표시되도록 합니다.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')



import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

from glob import glob
import os, random, time, gc, warnings

from tqdm import tqdm_notebook

import lightgbm as lgbm
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

!pip install catboost
from catboost import CatBoostRegressor
from sklearn.feature_selection import RFECV


from sklearn.cluster import KMeans

from datetime import datetime

from math import sqrt

import folium
from folium import Marker, Icon, CircleMarker

!pip install pdpbox
from pdpbox import pdp, info_plots

warnings.filterwarnings('ignore')

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: catboost in /usr/local/lib/python3.8/dist-packages (1.1.1)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.8/dist-packages (from catboost) (3.1.1)
Requirement already satisfied: graphviz in /usr/local/lib/python3.8/dist-packages (from catboost) (0.10.1)
Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from catboost) (1.7.3)
Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.8/dist-packages (from catboost) (1.21.6)
Requirement already satisfied: plotly in /usr/local/lib/python3.8/dist-packages (from catboost) (5.5.0)
Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from catboost) (1.15.0)
Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.8/dist-packages (from catboost) (1.3.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.24.0->catboost) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.24.0->catboost) (2022.6)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (0.11.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (1.4.4)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from plotly->catboost) (8.1.0)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: pdpbox in /usr/local/lib/python3.8/dist-packages (0.2.1)
Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from pdpbox) (5.4.8)
Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.7.3)
Requirement already satisfied: matplotlib==3.1.1 in /usr/local/lib/python3.8/dist-packages (from pdpbox) (3.1.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.21.6)
Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.2.0)
Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.3.5)
Requirement already satisfied: sklearn in /usr/local/lib/python3.8/dist-packages (from pdpbox) (0.0.post1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (0.11.0)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (1.4.4)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (3.0.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.1->matplotlib==3.1.1->pdpbox) (1.15.0)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->pdpbox) (2022.6)



!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.8/dist-packages (1.5.12)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.8/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: certifi in /usr/local/lib/python3.8/dist-packages (from kaggle) (2022.12.7)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.8/dist-packages (from kaggle) (7.0.0)
Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.8/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->kaggle) (2.10)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"2b9a4ddc90c33aadf0ac55ad9e6919df"}'}



!mkdir -p ~/.kaggle #create folder name Kaggle
!cp kaggle.json ~/.kaggle #copy kaggle.jason into folder Kaggle
!chmod 600 ~/.kaggle/kaggle.json #ignore Permission Warning 

%ls ~/.kaggle

kaggle.json



!kaggle competitions download -c competitive-data-science-predict-future-sales

!unzip competitive-data-science-predict-future-sales.zip

Downloading competitive-data-science-predict-future-sales.zip to /content
 73% 11.0M/15.1M [00:00<00:00, 19.6MB/s]
100% 15.1M/15.1M [00:00<00:00, 16.1MB/s]
Archive:  competitive-data-science-predict-future-sales.zip
  inflating: item_categories.csv     
  inflating: items.csv               
  inflating: sales_train.csv         
  inflating: sample_submission.csv   
  inflating: shops.csv               
  inflating: test.csv



!mkdir -p ~/.kaggle/competitions/predict-future-sales #create folder name Kaggle
!cp item_categories.csv ~/.kaggle/competitions/predict-future-sales #copy into folder Kaggle
!cp items.csv ~/.kaggle/competitions/predict-future-sales
!cp sales_train.csv ~/.kaggle/competitions/predict-future-sales
!cp sample_submission.csv ~/.kaggle/competitions/predict-future-sales
!cp shops.csv ~/.kaggle/competitions/predict-future-sales
!cp test.csv ~/.kaggle/competitions/predict-future-sales

%ls ~/.kaggle/competitions/predict-future-sales

item_categories.csv  sales_train.csv        shops.csv
items.csv            sample_submission.csv  test.csv



item_cats = pd.read_csv('~/.kaggle/competitions/predict-future-sales/item_categories.csv')
train = pd.read_csv('~/.kaggle/competitions/predict-future-sales/sales_train.csv')
shops = pd.read_csv('~/.kaggle/competitions/predict-future-sales/shops.csv')
items = pd.read_csv('~/.kaggle/competitions/predict-future-sales/items.csv')
sample_submission = pd.read_csv('~/.kaggle/competitions/predict-future-sales/sample_submission.csv')
test = pd.read_csv('~/.kaggle/competitions/predict-future-sales/test.csv')



print('Train-set : {}'.format(train.shape))
print('Test-set : {}'.format(test.shape))

Train-set : (2935849, 6)
Test-set : (214200, 3)



train.date_block_num.max()

33



pd.options.display.float_format = '{:.5f}'.format

train.item_cnt_day.describe()

count   2935849.00000
mean          1.24264
std           2.61883
min         -22.00000
25%           1.00000
50%           1.00000
75%           1.00000
max        2169.00000
Name: item_cnt_day, dtype: float64



# 상위 25개 
train.item_cnt_day.nlargest(25).values

array([2169., 1000.,  669.,  637.,  624.,  539.,  533.,  512.,  508.,
        504.,  502.,  501.,  500.,  500.,  480.,  412.,  405.,  401.,
        401.,  343.,  325.,  313.,  313.,  300.,  299.])



display(train.head())
display(test.head())



# test data에는 있지만 train data에는 없는 아이템 
test_only = test[~test['item_id'].isin(train['item_id'].unique())]['item_id'].unique()
print('test only items:', len(test_only))

test only items: 363



# drop duplicates
subset = ['date','date_block_num','shop_id','item_id','item_cnt_day']
print(train.duplicated(subset=subset).value_counts())
train.drop_duplicates(subset=subset, inplace=True)

False    2935825
True          24
dtype: int64



# drop shops & items not in test data 
test_shops = test.shop_id.unique()
test_items = test.item_id.unique()
train = train[train.shop_id.isin(test_shops)]
train = train[train.item_id.isin(test_items)]

train.shape

(1224429, 6)



train.sample()



from itertools import product 

# create all combinations 
block_shop_combi = pd.DataFrame(list(product(np.arange(34), test_shops)), columns=['date_block_num', 'shop_id'])
shop_item_combi = pd.DataFrame(list(product(test_shops, test_items)), columns=['shop_id', 'item_id'])
all_combi = block_shop_combi.merge(shop_item_combi, on='shop_id', how='inner')
print(len(all_combi), 34 * len(test_shops) * len(test_items) )

# group by monthly 
train_base = all_combi.merge(train, on=['date_block_num', 'shop_id', 'item_id'], how='left')
train_base['item_cnt_day'].fillna(0, inplace=True)
train_grp = train_base.groupby(['date_block_num', 'shop_id', 'item_id'])

7282800 7282800



# summary count by month 
train_monthly = pd.DataFrame(train_grp.agg({'item_cnt_day':['sum', 'count']})).reset_index()
train_monthly.columns = ['date_block_num', 'shop_id', 'item_id', 'item_cnt', 'item_order']
print(train_monthly[['item_cnt', 'item_order']].describe())

# trim count 
train_monthly['item_cnt'].clip(0, 20, inplace=True)

train_monthly.head()

           item_cnt    item_order
count  7.282800e+06  7.282800e+06
mean   2.221345e-01  1.085718e+00
std    3.324564e+00  7.254517e-01
min   -4.000000e+00  1.000000e+00
25%    0.000000e+00  1.000000e+00
50%    0.000000e+00  1.000000e+00
75%    0.000000e+00  1.000000e+00
max    2.253000e+03  3.100000e+01



display(items.head())
display(item_cats.head())



# pickup first category name
item_grp = item_cats['item_category_name'].apply(lambda x : str(x).split(' ')[0])
item_cats['item_group'] = pd.Categorical(item_grp).codes
items = pd.merge(items, item_cats.loc[:,['item_category_id', 'item_group']], on='item_category_id', how='left')

items.head()



city = shops.shop_name.apply(lambda x : str.replace(x, '!', '')).apply(lambda x : x.split(' ')[0])
shops['city'] = pd.Categorical(city).codes 
shops.head()



# By shop, item 
grp = train_monthly.groupby(['shop_id', 'item_id'])
train_shop = grp.agg({'item_cnt':['mean', 'median', 'std'], 'item_order':'mean'}).reset_index()
train_shop.columns = ['shop_id', 'item_id', 'cnt_mean_shop', 'cnt_med_shop', 'cnt_std_shop', 'order_mean_shop']
print(train_shop[['cnt_mean_shop', 'cnt_med_shop', 'cnt_std_shop']].describe())

train_shop.head()

       cnt_mean_shop   cnt_med_shop   cnt_std_shop
count  214200.000000  214200.000000  214200.000000
mean        0.187741       0.054169       0.380878
std         0.608329       0.508886       0.773115
min         0.000000       0.000000       0.000000
25%         0.000000       0.000000       0.000000
50%         0.029412       0.000000       0.171499
75%         0.147059       0.000000       0.430562
max        20.000000      20.000000      10.054796



# By shop, item_group 
train_cat_monthly = pd.merge(train_monthly, items, on='item_id', how='left')
grp = train_cat_monthly.groupby(['shop_id', 'item_group'])
train_shop_cat = grp.agg({'item_cnt': 'mean'}).reset_index()
train_shop_cat.columns = ['shop_id', 'item_group', 'cnt_mean_shop_cat']
print(train_shop_cat.loc[:, ['cnt_mean_shop_cat']].describe())

train_shop_cat.head()

       cnt_mean_shop_cat
count         546.000000
mean            0.924991
std             2.172233
min             0.000000
25%             0.029441
50%             0.149099
75%             0.467216
max            13.382353



# By month,shop,item At previous
train_prev = train_monthly.copy()
train_prev['date_block_num'] = train_prev['date_block_num'] + 1 
train_prev.columns = ['date_block_num','shop_id','item_id','cnt_prev','order_prev']

for i in [2, 12]: 
  train_prev_n = train_monthly.copy()
  train_prev_n['date_block_num'] = train_prev_n['date_block_num'] + i 
  train_prev_n.columns = ['date_block_num', 'shop_id', 'item_id', 'cnt_prev' + str(i), 'order_prev'+str(i)]
  train_prev = pd.merge(train_prev, train_prev_n, on =['date_block_num', 'shop_id', 'item_id'], how='left')

train_prev.head()



# By month, shop, item_group At previous 
grp = pd.merge(train_prev, items, on='item_id', how='left').groupby(['date_block_num', 'shop_id', 'item_group'])
train_cat_prev = grp['cnt_prev'].mean().reset_index()
train_cat_prev = train_cat_prev.rename(columns={'cnt_prev':'cnt_prev_cat'})
print(train_cat_prev.loc[:, ['cnt_prev_cat']].describe())

train_cat_prev.head()

       cnt_prev_cat
count  18564.000000
mean       0.924991
std        2.963806
min        0.000000
25%        0.000000
50%        0.095969
75%        0.396336
max       20.000000



train_piv = train_monthly.pivot_table(index=['shop_id', 'item_id'], columns='date_block_num', values='item_cnt', aggfunc=np.sum, fill_value=0)
train_piv = train_piv.reset_index()
train_piv.head()



# Moving Average Convergence Divergence : 2개의 지수이평선 간의 거리를 수치화 
# MACD = 12일 EMA - 26일 EMA / 단기 EMA - 장기 EMA 
col = np.arange(34)
pivT = train_piv[col].T
ema_s = pivT.ewm(span=4).mean().T
ema_m = pivT.ewm(span=12).mean().T
ema_l = pivT.ewm(span=26).mean().T 
macd = ema_s - ema_l 
sig = macd.ewm(span=9).mean() # 시그널선 : MACD의 9일 지수이평선 

ema_list = [] 
for c in col: 
  sub_ema = pd.concat([train_piv.loc[:,['shop_id','item_id']], 
                      pd.DataFrame(ema_s.loc[:, c]).rename(columns={c:'cnt_ema_s_prev'}),
                      pd.DataFrame(ema_m.loc[:, c]).rename(columns={c:'cnt_ema_m_prev'}),
                      pd.DataFrame(ema_l.loc[:, c]).rename(columns={c:'cnt_ema_l_prev'}),
                      pd.DataFrame(macd.loc[:, c]).rename(columns={c:'cnt_macd_prev'}),
                      pd.DataFrame(sig.loc[:, c]).rename(columns={c:'cnt_sig_prev'})], axis=1) 
  sub_ema['date_block_num'] = c+1
  ema_list.append(sub_ema)

train_ema_prev = pd.concat(ema_list) 
train_ema_prev.head()



fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 4)) 

train_monthly.groupby('date_block_num')['item_cnt'].sum().reset_index().plot(ax=ax[0])
train_cat_monthly.pivot_table(index=['date_block_num'], columns='item_group', values='item_cnt', aggfunc=np.sum, fill_value=0).plot(ax=ax[1], legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x7f9233428fd0>



# Price mean by month, shop, item 
train_price = train_grp['item_price'].mean().reset_index() 
price = train_price[~train_price['item_price'].isnull()]

# last price by shop, item 
last_price = price.drop_duplicates(subset=['shop_id', 'item_id'], keep='last').drop('date_block_num', axis=1)

# null price by shop, item 
uitem = price['item_id'].unique()
pred_price_set = test[~test['item_id'].isin(uitem)].drop('ID', axis=1)

print(pred_price_set.shape)
pred_price_set.head()

(16128, 2)



from sklearn import ensemble, metrics 

if len(pred_price_set) > 0: 
  train_price_set = pd.merge(price, items, on='item_id', how='inner') 
  pred_price_set = pd.merge(pred_price_set, items, on='item_id', how='inner').drop('item_name', axis=1) 
  
  reg = ensemble.ExtraTreesRegressor(n_estimators=25, n_jobs=-1, max_depth=15, random_state=42)
  reg.fit(train_price_set[pred_price_set.columns], train_price_set['item_price'])
  pred_price_set['item_price'] = reg.predict(pred_price_set)

test_price = pd.concat([last_price, pred_price_set], join='inner')
test_price.head()



price_max = price.groupby('item_id')['item_price'].max().reset_index()
price_max.rename(columns={'item_price': 'item_max_price'}, inplace=True) 
price_max.head()



train_price_a = pd.merge(price, price_max, on='item_id', how='left')
train_price_a['discount_rate'] = 1 - (train_price_a['item_price']/train_price_a['item_max_price']) 
train_price_a.drop('item_max_price', axis=1, inplace=True)
train_price_a.head()



test_price_a = pd.merge(test_price, price_max, on='item_id', how='left')
test_price_a.loc[test_price_a['item_max_price'].isnull(), 'item_max_price'] = test_price_a['item_price'] 
test_price_a['discount_rate'] = 1 - ( test_price_a['item_price']/test_price_a['item_max_price'])
test_price_a.drop('item_max_price', axis=1, inplace=True)
test_price_a.head()



# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                #if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                #    df[col] = df[col].astype(np.float16)
                #el
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
            #df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df



def mergeFeature(df): 
  df = pd.merge(df, items, on='item_id', how='left').drop('item_group', axis=1)
  df = pd.merge(df, item_cats, on='item_category_id', how='left')
  df = pd.merge(df, shops, on='shop_id', how='left')
  df = pd.merge(df, train_shop, on=['shop_id', 'item_id'], how='left')
  df = pd.merge(df, train_shop_cat, on=['shop_id', 'item_group'], how='left')
  df = pd.merge(df, train_prev, on=['date_block_num','shop_id','item_id'], how='left') 
  df = pd.merge(df, train_cat_prev, on=['date_block_num','shop_id','item_group'], how='left')
  df = pd.merge(df, train_ema_prev, on=['date_block_num','shop_id','item_id'], how='left')
  
  df['month'] = df['date_block_num'] % 12 
  days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
  df['days'] = df['month'].map(days).astype(np.int8)

  df.drop(['shop_id', 'shop_name', 'item_id', 'item_name', 'item_category_id', 'item_category_name', 'item_group'], axis=1, inplace=True)
  df.fillna(0.0, inplace=True)
  return reduce_mem_usage(df)



display(train_monthly.sample())
display(train_price_a.sample() )



# 전 기간의 모든 데이터셋을 사용하지 않고, 초반의 12개월 후의 데이터만 사용 
# 왜냐하면 training feature에는 12개월 전의 값들(order_prev12)이 들어있다 
train_set = train_monthly[train_monthly['date_block_num'] >= 12]
train_set = pd.merge(train_set, train_price_a, on=['date_block_num', 'shop_id', 'item_id'], how='left')
train_set = mergeFeature(train_set)

Memory usage of dataframe is 871.85 MB --> 404.47 MB (Decreased by 53.6%)



train_set = train_set.join(pd.DataFrame(train_set.pop('item_order'))) # move to last column
X_train = train_set.drop('item_cnt', axis=1)
y_train = train_set['item_cnt'].clip(0., 20.)
X_train.head()



test_set = test.copy()
test_set['date_block_num'] = 34 

test_set = pd.merge(test_set, test_price_a, on=['shop_id', 'item_id'], how='left')
test_set = mergeFeature(test_set)

Memory usage of dataframe is 38.00 MB --> 18.18 MB (Decreased by 52.2%)



test_set.sample()



test_set['item_order'] = test_set['cnt_ema_s_prev'] #order_prev
test_set.loc[test_set['item_order']==0, 'item_order'] = 1 

X_test = test_set.drop('ID', axis=1)
X_test.head()



assert(X_train.columns.isin(X_test.columns).all())



from sklearn import linear_model, preprocessing 
from sklearn.model_selection import GroupKFold 
import lightgbm as lgb 

params = {'learning_rate' : 0.05,
          'objective' : 'regression',
          'metric' : 'rmse',
          'num_leaves':64, 
          'verbose' : 1,
          'random_state': 42,
          'bagging_fraction':1, # 각각 iteration 에 사용되는 데이터의 일부를 지정하는데, 일반적으로 훈련 속도를 높이거나, 과적합을 피할때 사용한다.
          'feature_fraction':1} # used when your boosting is random forest. 
          # 즉, 랜덤 포레스트 일 경우에 사용. 
          # 0.8 feature fraction은 LightGBM이 Tree를 만들 때 각각 iteration 반복에서 파라미터 중에서 80%를 랜덤으로 선택.

folds = GroupKFold(n_splits=6)
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])

for fold_, (trn_, val_) in enumerate(folds.split(X_train, y_train, X_train['date_block_num'])):
  trn_x, trn_y = X_train.iloc[trn_], y_train[trn_]
  val_x, val_y = X_train.iloc[val_], y_train[val_]

  reg = lgb.LGBMRegressor(**params, n_estimators=3000)
  reg.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], early_stopping_rounds=50, verbose=500) 

  oof_preds[val_] = reg.predict(val_x.values, num_iteration=reg.best_iteration_) 
  sub_preds += reg.predict(X_test.values, num_iteration=reg.best_iteration_) / folds.n_splits

Training until validation scores don't improve for 50 rounds.
[500]	valid_0's rmse: 0.209647
[1000]	valid_0's rmse: 0.207681
Early stopping, best iteration is:
[1018]	valid_0's rmse: 0.207633
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's rmse: 0.206711
[1000]	valid_0's rmse: 0.205003
Early stopping, best iteration is:
[1294]	valid_0's rmse: 0.20464
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's rmse: 0.208489
[1000]	valid_0's rmse: 0.205751
Early stopping, best iteration is:
[1275]	valid_0's rmse: 0.20516
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's rmse: 0.186031
Early stopping, best iteration is:
[607]	valid_0's rmse: 0.185691
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's rmse: 0.15861
Early stopping, best iteration is:
[463]	valid_0's rmse: 0.158591
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's rmse: 0.186485
Early stopping, best iteration is:
[547]	valid_0's rmse: 0.18632



pred_cnt = sub_preds



print('RMSE:', np.sqrt(metrics.mean_squared_error(y_train, oof_preds.clip(0.,20.))))

RMSE: 0.19348729089786726



plt.figure(figsize=(24,5))

# lgbm의 Feature Importances를 barplot으로 그린다.
plt.bar(X_train.columns, reg.feature_importances_[np.argsort(reg.feature_importances_)][::-1])

# y=0인 horizental한 선을 그린다.
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=70)
plt.title("Coef of light GBM Model");



result = pd.DataFrame({
    'ID' : test['ID'], 
    'item_cnt_month' : pred_cnt.clip(0., 20.)
})
result.to_csv('submission.csv', index=False)


# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()


# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)


# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild()

#레티나 디스플레이로 폰트가 선명하게 표시되도록 합니다.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

from glob import glob
import os, random, time, gc, warnings

from tqdm import tqdm_notebook

import lightgbm as lgbm
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

!pip install catboost
from catboost import CatBoostRegressor
from sklearn.feature_selection import RFECV


from sklearn.cluster import KMeans

from datetime import datetime

from math import sqrt

import folium
from folium import Marker, Icon, CircleMarker

!pip install pdpbox
from pdpbox import pdp, info_plots

warnings.filterwarnings('ignore')

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: catboost in /usr/local/lib/python3.8/dist-packages (1.1.1)
Requirement already satisfied: plotly in /usr/local/lib/python3.8/dist-packages (from catboost) (5.5.0)
Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from catboost) (1.7.3)
Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.8/dist-packages (from catboost) (1.3.5)
Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from catboost) (1.15.0)
Requirement already satisfied: graphviz in /usr/local/lib/python3.8/dist-packages (from catboost) (0.10.1)
Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.8/dist-packages (from catboost) (1.21.6)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.8/dist-packages (from catboost) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.24.0->catboost) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.24.0->catboost) (2022.6)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (1.4.4)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (0.11.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from plotly->catboost) (8.1.0)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: pdpbox in /usr/local/lib/python3.8/dist-packages (0.2.1)
Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.2.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.21.6)
Requirement already satisfied: matplotlib==3.1.1 in /usr/local/lib/python3.8/dist-packages (from pdpbox) (3.1.1)
Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from pdpbox) (5.4.8)
Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.3.5)
Requirement already satisfied: sklearn in /usr/local/lib/python3.8/dist-packages (from pdpbox) (0.0.post1)
Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.7.3)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (1.4.4)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (2.8.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (0.11.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.1->matplotlib==3.1.1->pdpbox) (1.15.0)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->pdpbox) (2022.6)


df_bus = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/bus_bts.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/submission_sample.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/test.csv')
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/train.csv')


# Train/Test-set을 각각 체크해봅니다.
display(train.head(3))
display(test.head(3))


print('Train-set : {}'.format(train.shape))
print('Test-set : {}'.format(test.shape))

Train-set : (415423, 21)
Test-set : (228170, 20)


# Train-set에만 있는 칼럼은?

train.columns.difference(test.columns)

Index(['18~20_ride'], dtype='object')


# Train-set의 id는?
print(train['id'].agg(['min', 'max']) )
print('Size : {}'.format(len(train)))

min         0
max    415422
Name: id, dtype: int64
Size : 415423


# Test-set의 id는?
print(test['id'].agg(['min', 'max']) )
print('Size : {}'.format(len(test)))

min    415423
max    643592
Name: id, dtype: int64
Size : 228170


# Train-set의 date는?
print(train['date'].agg(['min', 'max']) )
print('Size : {}'.format(len(train)))

min    2019-09-01
max    2019-09-30
Name: date, dtype: object
Size : 415423


# Test-set의 date는?
print(test['date'].agg(['min', 'max']) )
print('Size : {}'.format(len(test)))

min    2019-10-01
max    2019-10-16
Name: date, dtype: object
Size : 228170


# Train/Test의 date 분포는?
plt.figure(figsize = (12,8))

pd.to_datetime(train['date']).value_counts().sort_index().plot(color='b', lw=2, label='train')
pd.to_datetime(test['date']).value_counts().sort_index().plot(color='r', lw=2, label='test')

plt.legend()
plt.xlabel("date")
plt.ylabel("# of rows")
plt.title("Distribution of date in Train/Test-set");

# 평균적으로 탑승객이 어떤 식으로 분포되어있는지 살펴보자.
# 굉장히 떨어지는 지점이 특별한 공휴일은 아닌지 살펴보자.


display(train.head(3))
display(test.head(3))


# Train/Test-set의 고유한 bus_route_id를 구함.
train_bus_route_id_set = set(train['bus_route_id'])
test_bus_route_id_set  = set(test['bus_route_id'])


# Train/Test-set 고유한 bus_route의 개수를 구함.
print('Train-set에 있는 고유한 bus_route의 개수 : {}'.format( len( train_bus_route_id_set) ))
print('Test-set에 있는 고유한 bus_route의 개수 : {}'.format( len( test_bus_route_id_set) ))

Train-set에 있는 고유한 bus_route의 개수 : 613
Test-set에 있는 고유한 bus_route의 개수 : 601


# Train/Test-set 모두에 포함되어있는 bus_route를 구함.
common_bus_route_id = train_bus_route_id_set.intersection( test_bus_route_id_set )
print('Train/Test-set에 공통으로 포함되어 있는 bus_route 개수 : {}'.format( len(common_bus_route_id) ))

Train/Test-set에 공통으로 포함되어 있는 bus_route 개수 : 0


# Train-set에만 있는 bus_route를 구함. 
only_train_bus_route = train_bus_route_id_set.difference( test_bus_route_id_set ) 
print( 'Train-set에만 있는 bus_route 개수 : {}'.format (len( only_train_bus_route) ))

only_test_bus_route = test_bus_route_id_set.difference( train_bus_route_id_set ) 
print ('Test-set에만 있는 bus_route 개수 : {}'.format(len( only_test_bus_route)))

Train-set에만 있는 bus_route 개수 : 613
Test-set에만 있는 bus_route 개수 : 601


# 탑승 관련 columns & 하차 관련 columns
ride_columns = [col for col in test.columns if '_ride' in col] + ['bus_route_id','date']
take_off_columns = [col for col in test.columns if '_takeoff' in col] + ['bus_route_id','date']


# 두 경우의 탑승 관련 columns 비교
plt.figure(figsize=(12,5))
test[test['bus_route_id'].isin( only_test_bus_route)][ride_columns].groupby(['date', 'bus_route_id'])['8~9_ride'].sum().groupby('date').mean().plot(color='b', label='only in Test-set')
test[test['bus_route_id'].isin( common_bus_route_id)][ride_columns].groupby(['date', 'bus_route_id'])['8~9_ride'].sum().groupby('date').mean().plot(color='r', label='Both in Train/Test-set')
plt.legend()
plt.title('Average number of passengers\n bus_route_id only in Test-set VS bus_route_id both in Train/Test-set');


# Missing Values
msno.matrix(train)

<matplotlib.axes._subplots.AxesSubplot at 0x7fd86b120580>


print("Train-set")
display(train.isnull().sum())

print('=' * 80)

print("Test-set")
display(test.isnull().sum())

Train-set

id               0
date             0
bus_route_id     0
in_out           0
station_code     0
station_name     0
latitude         0
longitude        0
6~7_ride         0
7~8_ride         0
8~9_ride         0
9~10_ride        0
10~11_ride       0
11~12_ride       0
6~7_takeoff      0
7~8_takeoff      0
8~9_takeoff      0
9~10_takeoff     0
10~11_takeoff    0
11~12_takeoff    0
18~20_ride       0
dtype: int64

================================================================================
Test-set

id               0
date             0
bus_route_id     0
in_out           0
station_code     0
station_name     0
latitude         0
longitude        0
6~7_ride         0
7~8_ride         0
8~9_ride         0
9~10_ride        0
10~11_ride       0
11~12_ride       0
6~7_takeoff      0
7~8_takeoff      0
8~9_takeoff      0
9~10_takeoff     0
10~11_takeoff    0
11~12_takeoff    0
dtype: int64


# Target Variable의 분포를 살펴보자
target_col = '18~20_ride'
train[ target_col].value_counts().sort_index()

0.0      296528
1.0       44268
2.0       23752
3.0       13560
4.0        8630
5.0        5911
6.0        4291
7.0        3152
8.0        2288
9.0        1865
10.0       1409
11.0       1183
12.0        996
13.0        807
14.0        660
15.0        597
16.0        508
17.0        450
18.0        408
19.0        394
20.0        331
21.0        261
22.0        234
23.0        218
24.0        198
25.0        168
26.0        194
27.0        150
28.0        144
29.0        132
30.0        113
31.0         94
32.0         80
33.0         79
34.0         77
35.0         78
36.0         58
37.0         61
38.0         64
39.0         53
40.0         37
41.0         51
42.0         43
43.0         41
44.0         52
45.0         37
46.0         32
47.0         35
48.0         21
49.0         28
50.0         26
51.0         24
52.0         21
53.0         24
54.0         22
55.0         22
56.0         20
57.0         17
58.0         21
59.0         13
60.0         16
61.0         20
62.0         13
63.0         11
64.0         14
65.0         17
66.0         15
67.0          6
68.0          6
69.0          5
70.0          9
71.0          7
72.0          7
73.0         10
74.0         10
75.0          6
76.0          7
77.0          6
78.0          7
79.0          5
80.0          9
81.0          5
82.0          7
83.0         10
84.0          5
85.0          6
86.0          4
87.0          6
88.0          6
89.0          4
90.0          1
91.0          6
92.0          5
93.0          3
95.0          4
96.0          4
97.0          4
98.0          2
99.0          1
100.0         2
101.0         5
102.0         1
103.0         2
104.0         2
105.0         3
106.0         1
107.0         2
108.0         2
109.0         2
111.0         1
112.0         2
113.0         1
114.0         1
115.0         3
116.0         3
118.0         1
119.0         2
120.0         1
122.0         3
123.0         1
124.0         1
125.0         1
126.0         2
127.0         3
128.0         2
129.0         1
131.0         1
132.0         1
134.0         2
135.0         1
136.0         1
138.0         1
143.0         1
146.0         1
149.0         1
150.0         2
152.0         2
155.0         1
157.0         1
158.0         1
161.0         1
162.0         2
164.0         2
166.0         1
167.0         1
168.0         1
169.0         2
170.0         1
173.0         1
175.0         1
180.0         1
181.0         2
184.0         1
185.0         2
198.0         1
200.0         1
201.0         1
202.0         1
209.0         1
213.0         1
215.0         1
218.0         1
226.0         1
227.0         1
229.0         1
240.0         1
241.0         1
245.0         1
265.0         1
272.0         1
Name: 18~20_ride, dtype: int64


# Dist-plot을 그려보도록 한다.
# --> (1) 0이 굉장히 많다. 
# --> (2) right-skewed된 형태이며, 값이 매우 큰 outlier들이 존재한다.
sns.distplot( train[target_col] );


# log1p transformation을 적용해봐도 정규분포에 근사한 모양을 보이지 않는다.
sns.distplot( np.log1p( train[target_col] ) );


# 탑승 관련 columns & 하차 관련 columns
ride_columns = [col for col in test.columns if '_ride' in col]
take_off_columns = [col for col in test.columns if '_takeoff' in col]


# Train-set의 승차관련 칼럼들의 rowsum 
display( train[train[target_col]==0][ride_columns].sum(axis=1).agg(['min', 'max']))
print('=' * 80)

# Train-set의 하차관련 칼럼들의 rowsum 
display( train[train[target_col]==0][ take_off_columns].sum(axis=1).agg(['min', 'max']))
print('=' * 80)

# Train-set의 승하차관련 칼럼들의 rowsum 
display( train[train[target_col]==0][ride_columns+ take_off_columns].sum(axis=1).agg(['min', 'max']))

min      0.0
max    138.0
dtype: float64

================================================================================

min      0.0
max    165.0
dtype: float64

================================================================================

min      1.0
max    283.0
dtype: float64


# (1)의 경우에는 어떤 것들이 있나 예시를 통해 살펴보도록 하자
# 하나의 station_name에 여러 개의 station_code가 기록되어 있는 경우는 어떤 상황인가?
multiple_station_name = train.groupby('station_name')['station_code'].nunique()
multiple_station_name = multiple_station_name[multiple_station_name>=7]
print(multiple_station_name)

df_sample = train[train['station_name'].isin(multiple_station_name.index)][['station_code','station_name','latitude','longitude']]
df_sample = df_sample.drop_duplicates().reset_index(drop=True) # drop=True, 기존 인덱스가 컬럼으로 더해지는 것을 방지 
df_sample

station_name
203     7
1001    8
Name: station_code, dtype: int64


def generateMap(default_location=[33.35098, 126.79807], default_zoom_start=10):
    base_map = folium.Map(location=default_location, 
                          control_scale=True, 
                          zoom_start=default_zoom_start)
    
    # 여러 개의 정거장에 대해서 Icon 생성하기
    for row in df_sample.itertuples():
        station_code, station_name, latitude, longitude = row[1:]
        
        # Create Icon
        if station_name == '금악리':
            icon = Icon(color='red',icon='station')
        else:
            icon = Icon(color='blue',icon='station')
                
        # Add Marker
        Marker(location=[ latitude , longitude], 
               popup=f'station_code : {station_code} station_name : {station_name}',
               icon = icon).add_to(base_map)
        
    
    base_map.save('하나의 station_name에 여러개의 station_code.html')
    return base_map

generateMap()


display( train.groupby('station_name')['station_code'].nunique().value_counts() )

2    1308
1     540
3      60
4      45
5       4
6       2
7       1
8       1
Name: station_code, dtype: int64


# station_name를 기준으로 삼는다면?
# --> 하나의 station_name가 여러 개의 latitude, longitude를 갖는 것으로 보임
display(train.groupby('station_name')['latitude'].nunique().value_counts())
print('='*80)
display(train.groupby('station_name')['longitude'].nunique().value_counts())

2    1283
1     566
3      59
4      45
5       4
6       2
7       1
8       1
Name: latitude, dtype: int64

================================================================================

2    1291
1     560
3      58
4      45
5       3
7       2
6       2
Name: longitude, dtype: int64


# station_code를 기준으로 삼는다면?
# --> station_code에는 1개의  station_name이 매핑되어있음.
display(train.groupby('station_code')['station_name'].nunique().value_counts())

1    3563
Name: station_name, dtype: int64


# station_code를 기준으로 삼는다면?
# --> station_code는 latitude, longitude와 1대1 관계를 만족함
display( train.groupby('station_code')['latitude'].nunique().value_counts() )
display( train.groupby('station_code')['longitude'].nunique().value_counts() )

1    3563
Name: latitude, dtype: int64

1    3563
Name: longitude, dtype: int64


# station_code를 기준으로 삼는다면?
# --> station_code는 in_out와 1대1 관계를 만족함
display( train.groupby('station_code')['in_out'].nunique().value_counts() )

1    3563
Name: in_out, dtype: int64


# date, bus_route_id, station_code이 특정 날짜에 몇번 등장했는지 재확인하기 
display( train.groupby(['date','bus_route_id','station_code']).size().value_counts() )
print('='* 80)
print(f'Train-set size : {len(train)}')

1    415423
dtype: int64

================================================================================
Train-set size : 415423


# local_train/local_test를 만든다.
local_train = train[train['date']<='2019-09-24'].reset_index(drop=True)
local_test  = train[train['date']>'2019-09-24'].reset_index(drop=True)


display(local_train.head(1))
display(local_test.head(1))


# categorical variable인 'bus_route_id','in_out','station_code','station_name' 에 대해선 label_encoding을 적용해주고,
# numeric variable들에 대해선 있는 그대로 학습을 시켜보도록 한다.
lbl = LabelEncoder()

# Implement Label Encoding 
cat_cols = ['bus_route_id','in_out','station_code','station_name']
for col in tqdm_notebook( cat_cols ):
    # local_train과 local_test를 concat하여 temp_df에 저장
    temp_df = pd.concat([ local_train[[col]], local_test[[col]] ] , axis=0)
    
    # Label-Encoding을 fitting함
    lbl.fit( temp_df[col] )
    
    # local_train/local_test에 label_encoding한 값을 대입함
    local_train[col] = lbl.transform(local_train[col])
    local_test[col] = lbl.transform(local_test[col])


display(local_train.head(1))
display(local_test.head(1))


# 모델에 쓰일 parameter 정의하기
n_splits= 5
NUM_BOOST_ROUND = 100000
SEED = 1993
lgbm_param = {'objective':'rmse',
              'boosting_type': 'gbdt', # 가장 많이 쓰는 게 그라디언트 부스팅 트리 
              'random_state':1993,
              'learning_rate':0.3, # 모델 학습을 얼마나 빠르게 진행할거냐 
              'subsample':0.7, # 전체 학습 데이터 중 몇 퍼센트 데이터를 사용할 것인가? 서브 샘플링을 낮추면 오버피팅을 방지 할 수 있다
              'tree_learner': 'serial', # 트리 학습기 
              'colsample_bytree':0.78, # 여러 개의 칼럼 중 몇 퍼센트를 사용할 것인가? 컬럼의 개수가 1000개, 2000개 되면 과감하게 0.3, 0.2
              'early_stopping_rounds':50, # train set의 loss는 지속적으로 감소하지만, validation의 loss가 하락하지 않을 경우 50 iteration이 넘었을 때 학습 중단 
              'subsample_freq': 1,
              'reg_lambda':7, # 정규화, 보통 1,2,3, 오버피팅을 방지하고자 할때 값을 높게 설정 
              'reg_alpha': 5, # 정규화, 보통 1,2,3, 오버피팅을 방지하고자 할때 값을 높게 설정 
              'num_leaves': 96,
              'seed' : SEED
            }


# 제거해야 하는 columns 정의 
drop_cols = ['id', 'date', target_col]

# local_train/local_test에 대한 label 정의
local_train_label = local_train[target_col]
local_test_label  = local_test[target_col]


# local_train/local_test의 예측값을 저장하기 위한 OOF 만들기 & CV를 저장할 list 정의 
oof_train = np.zeros((local_train.shape[0],))
oof_test = np.zeros((local_test.shape[0],))

cv_list = []

# Kfold 정의
kfolds = KFold(n_splits=n_splits
               , random_state=1993, shuffle=True) # 여러 개의 모델을 만들어 앙상블할때 random_state를 똑같이 지정해야 함 

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = local_train, y = local_train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = local_train.iloc[trn_ind].drop(drop_cols, 1), local_train_label[trn_ind]
    X_valid , y_valid = local_train.iloc[val_ind].drop(drop_cols, 1), local_train_label[val_ind]
    
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100) # 몇번당 학습의 결과를 화면에 내보낼 것인가 
    
    # local_valid/local_test에 대한 예측
    valid_pred = model.predict(X_valid)
    test_pred  = model.predict( local_test.drop(drop_cols,1) )
    
    # CV를 저장
    cv_list.append( sqrt( mean_squared_error( y_valid, valid_pred )  ) )
    
    # OOF에 예측값을 저장
    oof_train[val_ind] = valid_pred
    oof_test += test_pred/n_splits # 학습이 진행 될 때마다 1/5만큼만 더해라 
    print('='*80)
    
print(f"<LOCAL_TRAIN> OVERALL RMSE : {sqrt( mean_squared_error( local_train_label, oof_train ) )}")
print(f"<LOCAL_TEST>  OVERALL RMSE : {sqrt( mean_squared_error( local_test_label, oof_test ) )}")

Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34013	valid's rmse: 2.76295
[200]	train's rmse: 2.10389	valid's rmse: 2.74295
[300]	train's rmse: 1.95722	valid's rmse: 2.72343
[400]	train's rmse: 1.84886	valid's rmse: 2.71699
Early stopping, best iteration is:
[445]	train's rmse: 1.80723	valid's rmse: 2.71068
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34328	valid's rmse: 2.73387
[200]	train's rmse: 2.10399	valid's rmse: 2.71437
[300]	train's rmse: 1.96205	valid's rmse: 2.70933
[400]	train's rmse: 1.85441	valid's rmse: 2.70514
Early stopping, best iteration is:
[357]	train's rmse: 1.89796	valid's rmse: 2.70329
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32837	valid's rmse: 2.91084
[200]	train's rmse: 2.09175	valid's rmse: 2.8965
Early stopping, best iteration is:
[192]	train's rmse: 2.10446	valid's rmse: 2.89016
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.35792	valid's rmse: 2.6745
[200]	train's rmse: 2.11241	valid's rmse: 2.64986
Early stopping, best iteration is:
[223]	train's rmse: 2.07327	valid's rmse: 2.64049
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32428	valid's rmse: 2.88111
[200]	train's rmse: 2.0944	valid's rmse: 2.85047
[300]	train's rmse: 1.94905	valid's rmse: 2.8433
Early stopping, best iteration is:
[318]	train's rmse: 1.92777	valid's rmse: 2.8406
================================================================================
<LOCAL_TRAIN> OVERALL RMSE : 2.7586152832265656
<LOCAL_TEST>  OVERALL RMSE : 2.6149846987553675


# 실제값과 예측값의 분포 비교 
fig, axes = plt.subplots(1, 2, figsize=(20,8), sharex=True, sharey= True)

# y=x를 그리기 위하여 
x_range = np.linspace(0, 300, 1000) # 0에서 300까지 1000개의 요소로 만들어라 

# <SUBPLOT 1> : local_train에 대한 예측/실제값 비교
axes[0].scatter( local_train_label, oof_train )
axes[0].set_xlabel("Prediction")
axes[0].set_ylabel("Real")
axes[0].plot(x_range, x_range, color='r')

# <SUBPLOT 2> : local_test에 대한 예측/실제값 비교
axes[1].scatter( local_test_label, oof_test )
axes[1].set_xlabel("Prediction")
axes[1].set_ylabel("Real")
axes[1].plot(x_range, x_range, color='r')

plt.suptitle('Comparison between Prediction VS Real');


# 실제값 vs 예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( oof_train, color='r' , label='Prediction for Local-Train')
sns.distplot( local_train_label, color='b', label='Real' )
plt.legend()
plt.title("Comparing the real vs prediction in Local-Train");


min(oof_train) # 0보다 작은 수에 대해서는 0으로 보정

-16.972225679224387


min(local_train_label) # 최소 승객수 0

0.0


# 실제값 vs 예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( oof_test, color='r' , label='Prediction for Local-Test')
sns.distplot( local_test_label, color='b', label='Real' )
plt.legend()
plt.title("Comparing the real vs prediction in Local-Test");


del local_train, local_test, local_train_label, local_test_label; 
gc.collect();


# train_label 정의
train_label = train[target_col]


# categorical variable에 대해서는 Label-Encoding을 수행 
# --> One-Hot Encoding가 바람직하다고 생각되나 메모리 문제로 실행할 수 없음.
lbl = LabelEncoder()

# Implement Label Encoding 
cat_cols = ['bus_route_id','in_out','station_code','station_name']
for col in tqdm_notebook( cat_cols ):
    
    # Label-Encoding을 fitting함
    lbl.fit( train[col] )
    
    # local_train/local_test에 label_encoding한 값을 대입함
    train[col] = lbl.transform(train[col])


# 각 모델에 대한 oof 정의 
ridge_oof_train = np.zeros((train.shape[0]))
lasso_oof_train = np.zeros((train.shape[0]))
dt_oof_train = np.zeros((train.shape[0]))
rf_oof_train = np.zeros((train.shape[0]))
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의 
Kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True) 

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # (1) Ridge
    print("---TRAINING RIDGE---")
    ridge = Ridge(random_state = 1993)
    
    ridge.fit(X_train, y_train)
    
    ridge_valid_pred = ridge.predict(X_valid)
    ridge_oof_train[val_ind] = ridge_valid_pred
    
    # (2) Lasso
    print("---TRAINING LASSO---")
    lasso = Lasso(random_state = 1993)
    
    lasso.fit(X_train, y_train)
    
    lasso_valid_pred = lasso.predict(X_valid)
    lasso_oof_train[val_ind] = lasso_valid_pred
    
    # (3) Decision Tree
    print("---TRAINING DECISION TREE---")
    dt = DecisionTreeRegressor(random_state=231)
    
    dt.fit(X_train, y_train)
    
    dt_valid_pred = dt.predict(X_valid)
    dt_oof_train[val_ind] = dt_valid_pred
    
    
    # (4) Random Forest
    print("---TRAINING RANDOM FOREST---")
    rf = RandomForestRegressor(random_state=231, n_estimators=20 )
    
    rf.fit(X_train, y_train)
    
    rf_valid_pred = rf.predict(X_valid)
    rf_oof_train[val_ind] = rf_valid_pred
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 0)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Ridge> OVERALL RMSE         : {sqrt( mean_squared_error( train_label, ridge_oof_train ) )}")
print(f"<Lasso> OVERALL RMSE         : {sqrt( mean_squared_error( train_label, lasso_oof_train ) )}")
print(f"<Decision-Tree> OVERALL RMSE : {sqrt( mean_squared_error( train_label, dt_oof_train ) )}")
print(f"<Random-Forest> OVERALL RMSE : {sqrt( mean_squared_error( train_label, rf_oof_train ) )}")
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
<Ridge> OVERALL RMSE         : 3.5256174413470043
<Lasso> OVERALL RMSE         : 3.6478537865113023
<Decision-Tree> OVERALL RMSE : 4.174571185391608
<Random-Forest> OVERALL RMSE : 2.922049207507851
<Light-GBM> OVERALL RMSE     : 2.7837265002333376


plt.figure(figsize=(24,5))

# Ridge의 Coef를 barplot으로 그린다. 
plt.bar( train.drop(drop_cols,1).columns, ridge.coef_   )

# y=0인 horizental한 선을 그린다. 
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title('Coef of Ridge Model');


plt.figure(figsize=(24,5))

# lasso의 Coef를 barplot으로 그린다.
plt.bar( train.drop(drop_cols,1).columns,  lasso.coef_ )

# y=0인 horizental한 선을 그린다.
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title("Coef of lasso Model");


# 상관관계를 살펴보도록 하자.
train.corr()[target_col].sort_values()

bus_route_id    -0.137364
station_code    -0.034048
longitude       -0.021368
id              -0.000544
in_out           0.022116
station_name     0.054216
latitude         0.079261
6~7_takeoff      0.178353
7~8_takeoff      0.219430
6~7_ride         0.262173
8~9_takeoff      0.274360
10~11_takeoff    0.290691
9~10_takeoff     0.295875
11~12_takeoff    0.313540
7~8_ride         0.371751
8~9_ride         0.445316
9~10_ride        0.494085
10~11_ride       0.512666
11~12_ride       0.569747
18~20_ride       1.000000
Name: 18~20_ride, dtype: float64


plt.figure(figsize=(24,5))

# Random Forest의 Feature Importances를 barplot으로 그린다. 
plt.bar( train.drop(drop_cols, 1).columns, rf.feature_importances_ )

# y=0인 horizental한 선을 그린다.
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title("Coef of Random Forest Model");


plt.figure(figsize=(24,5))

# lgbm의 Feature Importances를 barplot으로 그린다.
plt.bar( train.drop(drop_cols,1).columns,  model.feature_importance() )

# y=0인 horizental한 선을 그린다.
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title("Coef of light GBM Model");


# 전체 데이터를 사용할 시 너무 많은 시간이 소요되어 일부 샘플만 사용하도록 하겠습니다.
sample = train.drop(drop_cols,1).sample(1000)


# PDP Plot For 11~12_ride
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), #컬럼의 이름 
    feature='11~12_ride')

fig, axes = pdp.pdp_plot(pdp_, '11~12_ride')

# 해당 피처가 변함에 따라 타겟이 어떻게 변하는가 
# 11~12시 탑승 승객수가 많은 곳일수록 퇴근 시간에 타는 승객수가 많을 것이다. 
# 폭이 작을수록 예측값에 대한 편차가 적다. = 신뢰도가 높다


# PDP Plot For 8~9_takeoff
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), feature='8~9_takeoff'
)
fig, axes = pdp.pdp_plot(pdp_, '8~9_takeoff')


# PDP Plot For latitude
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), feature='latitude'
)
fig, axes = pdp.pdp_plot(pdp_, 'latitude')


# PDP Plot For longitude
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), feature='longitude'
)
fig, axes = pdp.pdp_plot(pdp_, 'longitude')


# Interactive PDP Plot For latitude,longitude
pdp_ = pdp.pdp_interact(
    model= model, dataset=sample, model_features=list(sample), features=['latitude','longitude']
)

fig, axes = pdp.pdp_interact_plot(pdp_interact_out=pdp_,
                                  feature_names=['longitude', 'latitude'],
                                  plot_type='grid',
                                  x_quantile=True,
                                  plot_pdp=False)


# 모델에 쓰일 parameter 정의하기
n_splits= 5
NUM_BOOST_ROUND = 100000 # boosting을 얼마나 돌릴지 지정한다 
SEED = 1993
lgbm_param = {'objective':'rmse',
              'boosting_type': 'gbdt',
              'random_state':1993,
              'learning_rate':0.1,
              'subsample':0.7,
              'tree_learner': 'serial',
              'colsample_bytree':0.78,
              'early_stopping_rounds':50,
              'subsample_freq': 1,
              'reg_lambda':7,
              'reg_alpha': 5,
              'num_leaves': 96,
              'seed' : SEED
            }


df_bus = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/bus_bts.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/submission_sample.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/test.csv')
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/train.csv')


# categorical variable에 대해서는 Label-Encoding을 수행 
# --> One-Hot Encoding가 바람직하다고 생각되나 메모리 문제로 실행할 수 없음.
lbl = LabelEncoder()

# Implement Label Encoding 
cat_cols = ['bus_route_id','in_out','station_code','station_name']
for col in tqdm_notebook( cat_cols ):
    
    # Label-Encoding을 fitting함
    lbl.fit( train[[col]].append(test[[col]]) )
    
    # train/test label_encoding한 값을 대입함
    train[col] = lbl.transform(train[col])
    test[col] = lbl.transform(test[col])


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.30574	valid's rmse: 2.74915
[200]	train's rmse: 2.05878	valid's rmse: 2.70246
[300]	train's rmse: 1.90955	valid's rmse: 2.66699
[400]	train's rmse: 1.80317	valid's rmse: 2.65386
[500]	train's rmse: 1.72525	valid's rmse: 2.63958
[600]	train's rmse: 1.66145	valid's rmse: 2.63267
[700]	train's rmse: 1.60627	valid's rmse: 2.62778
Early stopping, best iteration is:
[660]	train's rmse: 1.62635	valid's rmse: 2.62755
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32874	valid's rmse: 2.66488
[200]	train's rmse: 2.08707	valid's rmse: 2.63891
[300]	train's rmse: 1.94474	valid's rmse: 2.62898
[400]	train's rmse: 1.83968	valid's rmse: 2.61132
[500]	train's rmse: 1.75473	valid's rmse: 2.60217
[600]	train's rmse: 1.68747	valid's rmse: 2.59209
[700]	train's rmse: 1.63126	valid's rmse: 2.58362
[800]	train's rmse: 1.58192	valid's rmse: 2.57711
[900]	train's rmse: 1.53579	valid's rmse: 2.57242
[1000]	train's rmse: 1.49653	valid's rmse: 2.56804
Early stopping, best iteration is:
[1046]	train's rmse: 1.47885	valid's rmse: 2.5658
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.30812	valid's rmse: 2.79685
[200]	train's rmse: 2.06095	valid's rmse: 2.78083
Early stopping, best iteration is:
[187]	train's rmse: 2.08647	valid's rmse: 2.77592
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.30582	valid's rmse: 2.7847
[200]	train's rmse: 2.05827	valid's rmse: 2.74054
[300]	train's rmse: 1.91722	valid's rmse: 2.72011
[400]	train's rmse: 1.81145	valid's rmse: 2.70567
[500]	train's rmse: 1.72925	valid's rmse: 2.69213
[600]	train's rmse: 1.66309	valid's rmse: 2.68586
[700]	train's rmse: 1.60667	valid's rmse: 2.6818
[800]	train's rmse: 1.55937	valid's rmse: 2.68089
[900]	train's rmse: 1.51643	valid's rmse: 2.67663
Early stopping, best iteration is:
[895]	train's rmse: 1.51852	valid's rmse: 2.67584
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.29918	valid's rmse: 2.86883
[200]	train's rmse: 2.05735	valid's rmse: 2.82333
[300]	train's rmse: 1.91679	valid's rmse: 2.81182
[400]	train's rmse: 1.81536	valid's rmse: 2.79259
[500]	train's rmse: 1.73252	valid's rmse: 2.77651
[600]	train's rmse: 1.66839	valid's rmse: 2.77229
[700]	train's rmse: 1.61136	valid's rmse: 2.76939
[800]	train's rmse: 1.56388	valid's rmse: 2.76638
[900]	train's rmse: 1.52321	valid's rmse: 2.76538
Early stopping, best iteration is:
[886]	train's rmse: 1.52821	valid's rmse: 2.7637
================================================================================
<Light-GBM> OVERALL RMSE     : 2.6829542351234412


# 승하차 간격을 2시간 간격으로 설정할 수는 없는가? (3시간으로 설정해도 ok -> 결국 실험의 영역)
dawn_ride_cols, dawn_takeoff_cols = ['6~7_ride','7~8_ride'], ['6~7_takeoff','7~8_takeoff']
morning_ride_cols, morning_takeoff_cols = ['8~9_ride','9~10_ride'], ['8~9_takeoff','9~10_takeoff']
noon_ride_cols, noon_takeoff_cols = ['10~11_ride','11~12_ride'], ['10~11_takeoff','11~12_takeoff']

def modify_terms(df): 
  # ride columns
  df['dawn_ride'] = df[dawn_ride_cols].sum(axis=1)
  df['morning_ride'] = df[morning_ride_cols].sum(axis=1)
  df['noon_ride'] = df[noon_ride_cols].sum(axis=1)

  # takeoff columns
  df['dawn_takeoff'] = df[dawn_takeoff_cols].sum(axis=1)
  df['morning_takeoff'] = df[morning_takeoff_cols].sum(axis=1)
  df['noon_takeoff'] = df[noon_takeoff_cols].sum(axis=1)

  # drop columns 
  drop_cols = dawn_ride_cols + morning_ride_cols + noon_ride_cols + dawn_takeoff_cols + morning_takeoff_cols + noon_takeoff_cols
  df = df.drop(drop_cols, 1)

  return df 

train = modify_terms(train)
test = modify_terms(test)


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32718	valid's rmse: 2.75328
Early stopping, best iteration is:
[140]	train's rmse: 2.21752	valid's rmse: 2.74061
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.37316	valid's rmse: 2.63744
[200]	train's rmse: 2.14616	valid's rmse: 2.60199
[300]	train's rmse: 2.01191	valid's rmse: 2.59023
[400]	train's rmse: 1.92035	valid's rmse: 2.58024
[500]	train's rmse: 1.84405	valid's rmse: 2.57767
[600]	train's rmse: 1.77875	valid's rmse: 2.57152
Early stopping, best iteration is:
[565]	train's rmse: 1.79968	valid's rmse: 2.56977
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34121	valid's rmse: 2.75993
[200]	train's rmse: 2.11551	valid's rmse: 2.71367
[300]	train's rmse: 1.98132	valid's rmse: 2.70279
Early stopping, best iteration is:
[347]	train's rmse: 1.93213	valid's rmse: 2.69721
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34294	valid's rmse: 2.72779
[200]	train's rmse: 2.12502	valid's rmse: 2.69412
[300]	train's rmse: 1.99061	valid's rmse: 2.67572
[400]	train's rmse: 1.89143	valid's rmse: 2.66737
[500]	train's rmse: 1.81195	valid's rmse: 2.66091
[600]	train's rmse: 1.74383	valid's rmse: 2.65133
[700]	train's rmse: 1.68879	valid's rmse: 2.64864
Early stopping, best iteration is:
[744]	train's rmse: 1.66847	valid's rmse: 2.64618
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32519	valid's rmse: 2.80855
[200]	train's rmse: 2.10711	valid's rmse: 2.77664
[300]	train's rmse: 1.97337	valid's rmse: 2.76544
Early stopping, best iteration is:
[293]	train's rmse: 1.98019	valid's rmse: 2.76211
================================================================================
<Light-GBM> OVERALL RMSE     : 2.6840671696237


# 피쳐 중요도 확인 
data = { 'col' : model.feature_name(), 
         'imp' : model.feature_importance()}

df_imp = pd.DataFrame(data).sort_values(by='imp', ascending=False).reset_index(drop=True)
df_imp


train['date'] = pd.to_datetime(train['date'].astype(str))
test['date'] = pd.to_datetime(test['date'].astype(str))


# 요일 정보 추가 
train['weekday'] = train['date'].dt.weekday
test['weekday'] = test['date'].dt.weekday

# 공휴일 정보 추가 
# -> EDA 필요 
holidays = [datetime(2019, 9 ,12), datetime(2019, 9, 13), datetime(2019, 9 ,14), datetime(2019, 10,3), datetime(2019, 10,9) ]
train['is_holiday'] = train['date'].apply(lambda x : x in holidays).astype(np.int8)
test['is_holiday'] = test['date'].apply(lambda x : x in holidays).astype(np.int8)


# Mean Encoding
# (1) 일자별로 dawn, morning, noon에 각각 몇몇의 승객이 탑승하였는가
# (2) 일자별로 dawn, morning, noon에 각각 몇몇의 승객이 하차하였는가
# - 기준 :
# - (1) bus_route_id
# - (2) bus_route_id , station_code
# - (3) station_code

# (1) bus_route_id 기준

# 탑승
train['avg_dawn_ride_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['dawn_ride'].transform('mean')
train['avg_morning_ride_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['morning_ride'].transform('mean')
train['avg_noon_ride_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['noon_ride'].transform('mean')

test['avg_dawn_ride_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['dawn_ride'].transform('mean')
test['avg_morning_ride_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['morning_ride'].transform('mean')
test['avg_noon_ride_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['noon_ride'].transform('mean')

# 하차 
train['avg_dawn_takeoff_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['dawn_takeoff'].transform('mean')
train['avg_morning_takeoff_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['morning_takeoff'].transform('mean')
train['avg_noon_takeoff_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['noon_takeoff'].transform('mean')

test['avg_dawn_takeoff_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['dawn_takeoff'].transform('mean')
test['avg_morning_takeoff_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['morning_takeoff'].transform('mean')
test['avg_noon_takeoff_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['noon_takeoff'].transform('mean')

# (2) bus_route_id, station_code 기준
# train['avg_dawn_ride_bus_route_id_station_code'] = train.groupby(['date','bus_route_id','station_code'])['dawn_ride'].transform('mean') 
# train['avg_morning_ride_bus_route_id_station_code'] = train.groupby(['date','bus_route_id','station_code'])['morning_ride'].transform('mean') 
# train['avg_noon_ride_bus_route_id_station_code'] = train.groupby(['date','bus_route_id','station_code'])['noon_ride'].transform('mean') 

# test['avg_dawn_ride_bus_route_id_station_code'] = test.groupby(['date','bus_route_id','station_code'])['dawn_ride'].transform('mean') 
# test['avg_morning_ride_bus_route_id_station_code'] = test.groupby(['date','bus_route_id','station_code'])['morning_ride'].transform('mean') 
# test['avg_noon_ride_bus_route_id_station_code'] = test.groupby(['date','bus_route_id','station_code'])['noon_ride'].transform('mean') 

# (3) station_code 기준
# train['avg_dawn_ride_station_code'] = train.groupby(['date','station_code'])['dawn_ride'].transform('mean') 
# train['avg_morning_ride_bus_station_code'] = train.groupby(['date','station_code'])['morning_ride'].transform('mean') 
# train['avg_noon_ride_station_code'] = train.groupby(['date','station_code'])['noon_ride'].transform('mean') 

# test['avg_dawn_ride_station_code'] = test.groupby(['date','station_code'])['dawn_ride'].transform('mean') 
# test['avg_morning_ride_bus_station_code'] = test.groupby(['date','station_code'])['morning_ride'].transform('mean') 
# test['avg_noon_ride_station_code'] = test.groupby(['date','station_code'])['noon_ride'].transform('mean')


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train # 승객에 대한 정보를 y_train 값으로 임시로 넣어줌 
    d = X_train.groupby(['station_code'])['label'].mean().to_dict() # 정류장별로 승객이 평균 몇 명 탔는지 
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.00336	valid's rmse: 2.52675
[200]	train's rmse: 1.78187	valid's rmse: 2.49772
[300]	train's rmse: 1.64639	valid's rmse: 2.47723
Early stopping, best iteration is:
[321]	train's rmse: 1.62216	valid's rmse: 2.4738
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.04421	valid's rmse: 2.32358
[200]	train's rmse: 1.8193	valid's rmse: 2.28644
[300]	train's rmse: 1.67692	valid's rmse: 2.26913
[400]	train's rmse: 1.57436	valid's rmse: 2.2538
[500]	train's rmse: 1.49306	valid's rmse: 2.24171
[600]	train's rmse: 1.42323	valid's rmse: 2.23465
[700]	train's rmse: 1.35851	valid's rmse: 2.23254
[800]	train's rmse: 1.30305	valid's rmse: 2.22767
Early stopping, best iteration is:
[814]	train's rmse: 1.29499	valid's rmse: 2.22617
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.01583	valid's rmse: 2.50339
[200]	train's rmse: 1.78231	valid's rmse: 2.45207
[300]	train's rmse: 1.64337	valid's rmse: 2.43175
[400]	train's rmse: 1.54414	valid's rmse: 2.42236
[500]	train's rmse: 1.46167	valid's rmse: 2.41846
[600]	train's rmse: 1.39314	valid's rmse: 2.41276
[700]	train's rmse: 1.33451	valid's rmse: 2.40836
[800]	train's rmse: 1.28564	valid's rmse: 2.40459
[900]	train's rmse: 1.23967	valid's rmse: 2.40028
[1000]	train's rmse: 1.19984	valid's rmse: 2.39861
[1100]	train's rmse: 1.16337	valid's rmse: 2.39758
[1200]	train's rmse: 1.1295	valid's rmse: 2.39471
[1300]	train's rmse: 1.09942	valid's rmse: 2.39336
Early stopping, best iteration is:
[1272]	train's rmse: 1.10728	valid's rmse: 2.39298
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.0207	valid's rmse: 2.53581
[200]	train's rmse: 1.79294	valid's rmse: 2.48558
[300]	train's rmse: 1.64926	valid's rmse: 2.4619
[400]	train's rmse: 1.54676	valid's rmse: 2.44907
[500]	train's rmse: 1.46669	valid's rmse: 2.44035
[600]	train's rmse: 1.39789	valid's rmse: 2.43475
[700]	train's rmse: 1.33807	valid's rmse: 2.43057
[800]	train's rmse: 1.28824	valid's rmse: 2.42792
[900]	train's rmse: 1.24086	valid's rmse: 2.42301
[1000]	train's rmse: 1.20028	valid's rmse: 2.41925
[1100]	train's rmse: 1.16345	valid's rmse: 2.41493
Early stopping, best iteration is:
[1113]	train's rmse: 1.15848	valid's rmse: 2.41409
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.01455	valid's rmse: 2.55595
[200]	train's rmse: 1.77955	valid's rmse: 2.50323
[300]	train's rmse: 1.644	valid's rmse: 2.47732
[400]	train's rmse: 1.54051	valid's rmse: 2.46135
[500]	train's rmse: 1.45935	valid's rmse: 2.45535
[600]	train's rmse: 1.39068	valid's rmse: 2.44851
[700]	train's rmse: 1.32873	valid's rmse: 2.44447
[800]	train's rmse: 1.27854	valid's rmse: 2.43948
Early stopping, best iteration is:
[837]	train's rmse: 1.26044	valid's rmse: 2.43763
================================================================================
<Light-GBM> OVERALL RMSE     : 2.3904698945031


# 날씨 정보 
df_weather = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/jeju_weather_dataset', encoding='cp949')
df_weather = df_weather[['일시', '강수량(mm)']]
df_weather.columns = ['date', 'precipitation']

# date의 type을 string에서 datetime으로 변환
df_weather['date'] = pd.to_datetime( df_weather['date'] )

# 대회 기간에 해당하는 데이터만 사용하도록 함
df_weather = df_weather[(df_weather['date']>='2019-08-31 00:00:00')&(df_weather['date']<='2019-10-16 23:00:00')].reset_index(drop=True)

# 대회 규정상 해당 날짜의 15시까지 정보만 사용할 수 있음
df_weather['hour'] = df_weather['date'].dt.hour
df_weather['date'] = df_weather['date'].dt.date

# 전날의 강수량을 정보를 대입할 때 사용
df_prevday_weather = df_weather.groupby('date')['precipitation'].sum().reset_index()
df_prevday_weather.columns = ['prev_date', 'prevday_precipitation']

# 해당 날짜의 강수량을 구함 
df_weather = df_weather[df_weather['hour']<=15].reset_index(drop=True)

# 00~15시까지의 강수량을 피쳐로 사용 
df_weather = df_weather.groupby('date')['precipitation'].sum().reset_index()

# Train/Test-set과 join하기 위하여 column의 타입을 datetime으로 변환한다. 
df_prevday_weather['prev_date'] = pd.to_datetime( df_prevday_weather['prev_date'] )
df_weather['date'] = pd.to_datetime( df_weather['date'] )


# 전날짜에 대하여 Train/Test-set과 강수량 정보를 join

# Train/Test-set에 대하여 전날을 구함
train['prev_date'] = train['date'] - pd.Timedelta('1 day')
test['prev_date'] = test['date'] - pd.Timedelta('1 day')

train = pd.merge(train, df_prevday_weather , on ='prev_date',  how ='left')
test = pd.merge(test, df_prevday_weather , on ='prev_date',how ='left')

# prev_date 칼럼은 삭제해줌
train = train.drop('prev_date',1)
test = test.drop('prev_date',1)

# 해당 날짜에 대하여 Train/Test-set과 강수량 정보를 join
train = pd.merge( train, df_weather , on ='date', how='left')
test = pd.merge( test, df_weather , on ='date', how='left')


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.98892	valid's rmse: 2.50777
[200]	train's rmse: 1.75765	valid's rmse: 2.47466
[300]	train's rmse: 1.61907	valid's rmse: 2.46216
[400]	train's rmse: 1.52395	valid's rmse: 2.45348
[500]	train's rmse: 1.44409	valid's rmse: 2.45138
[600]	train's rmse: 1.37676	valid's rmse: 2.44578
[700]	train's rmse: 1.32027	valid's rmse: 2.44222
[800]	train's rmse: 1.26823	valid's rmse: 2.43823
[900]	train's rmse: 1.22221	valid's rmse: 2.43711
[1000]	train's rmse: 1.18108	valid's rmse: 2.43461
Early stopping, best iteration is:
[1022]	train's rmse: 1.17186	valid's rmse: 2.43394
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.02505	valid's rmse: 2.31029
[200]	train's rmse: 1.7867	valid's rmse: 2.26391
[300]	train's rmse: 1.64805	valid's rmse: 2.24587
[400]	train's rmse: 1.54749	valid's rmse: 2.23398
[500]	train's rmse: 1.4668	valid's rmse: 2.22306
[600]	train's rmse: 1.4016	valid's rmse: 2.21615
[700]	train's rmse: 1.34121	valid's rmse: 2.21269
Early stopping, best iteration is:
[696]	train's rmse: 1.34286	valid's rmse: 2.21179
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.99473	valid's rmse: 2.46048
[200]	train's rmse: 1.75938	valid's rmse: 2.40683
[300]	train's rmse: 1.62116	valid's rmse: 2.39111
[400]	train's rmse: 1.52165	valid's rmse: 2.37788
[500]	train's rmse: 1.43592	valid's rmse: 2.3626
[600]	train's rmse: 1.36932	valid's rmse: 2.35485
[700]	train's rmse: 1.31183	valid's rmse: 2.35108
[800]	train's rmse: 1.26333	valid's rmse: 2.34833
[900]	train's rmse: 1.21817	valid's rmse: 2.34526
Early stopping, best iteration is:
[938]	train's rmse: 1.20349	valid's rmse: 2.3439
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.00644	valid's rmse: 2.51324
[200]	train's rmse: 1.77555	valid's rmse: 2.46179
[300]	train's rmse: 1.6328	valid's rmse: 2.43791
[400]	train's rmse: 1.52822	valid's rmse: 2.41681
[500]	train's rmse: 1.44884	valid's rmse: 2.40623
[600]	train's rmse: 1.38006	valid's rmse: 2.39611
[700]	train's rmse: 1.32129	valid's rmse: 2.3922
[800]	train's rmse: 1.27157	valid's rmse: 2.3895
[900]	train's rmse: 1.22409	valid's rmse: 2.38567
[1000]	train's rmse: 1.18314	valid's rmse: 2.38441
Early stopping, best iteration is:
[1011]	train's rmse: 1.17847	valid's rmse: 2.38343
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.97821	valid's rmse: 2.50554
[200]	train's rmse: 1.75319	valid's rmse: 2.45653
[300]	train's rmse: 1.61256	valid's rmse: 2.43048
[400]	train's rmse: 1.51372	valid's rmse: 2.41741
[500]	train's rmse: 1.43713	valid's rmse: 2.41027
Early stopping, best iteration is:
[547]	train's rmse: 1.40401	valid's rmse: 2.40453
================================================================================
<Light-GBM> OVERALL RMSE     : 2.356796981864371


train.head()


# 해당 딕셔너리에 bus_route_id 별 정차 순서를 구하도록 함
bus_route_sequence = {}

# 모든 bus_route_id 수집 
combined = train.append(test, ignore_index=True)
all_bus_route_ids = set(combined['bus_route_id'])

for bus_route_id in tqdm_notebook( all_bus_route_ids ) :
    # bus_route_id별 station_code를 오름차순으로 순서매김함
    df_bus_route = combined[combined['bus_route_id']==bus_route_id]
    sorted_station_codes = np.unique(df_bus_route['station_code'])
    
    # dictionary에 해당 정류장이 몇번째 정차 정류장인지 기입
    bus_route_sequence[bus_route_id] = {station_code: ind for ind, station_code in enumerate( list(sorted_station_codes) )}


# 몇 번째 정류장인지를 피쳐로 생성 
train['nth_station'] = train[['bus_route_id', 'station_code']].apply(lambda x: bus_route_sequence.get(x[0]).get(x[1]), axis=1)
test['nth_station'] = test[['bus_route_id', 'station_code']].apply(lambda x: bus_route_sequence.get(x[0]).get(x[1]), axis=1)


# 해당 bus_route_id에는 몇 개의 정류장이 있는지 
bus_route_id_total_station_count_dict = combined.groupby('bus_route_id')['station_code'].nunique().to_dict()

train['bus_route_id_total_station_count'] = train['bus_route_id'].apply(lambda x : bus_route_id_total_station_count_dict.get(x))
test['bus_route_id_total_station_count'] = test['bus_route_id'].apply(lambda x : bus_route_id_total_station_count_dict.get(x))


# 뒤에서부터 몇 번째 정류장인지 
train['nth_station_backward'] = train['nth_station'] - train['bus_route_id_total_station_count']
test['nth_station_backward'] = test['nth_station'] - test['bus_route_id_total_station_count']


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75289	valid's rmse: 2.35176
Early stopping, best iteration is:
[137]	train's rmse: 1.64643	valid's rmse: 2.33721
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.79312	valid's rmse: 2.13689
[200]	train's rmse: 1.55592	valid's rmse: 2.11103
Early stopping, best iteration is:
[242]	train's rmse: 1.48991	valid's rmse: 2.10712
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75385	valid's rmse: 2.30077
[200]	train's rmse: 1.52184	valid's rmse: 2.27475
[300]	train's rmse: 1.39039	valid's rmse: 2.26805
Early stopping, best iteration is:
[298]	train's rmse: 1.39264	valid's rmse: 2.26774
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75804	valid's rmse: 2.33279
[200]	train's rmse: 1.52563	valid's rmse: 2.29234
Early stopping, best iteration is:
[246]	train's rmse: 1.45673	valid's rmse: 2.2902
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.76114	valid's rmse: 2.33683
[200]	train's rmse: 1.52142	valid's rmse: 2.30265
[300]	train's rmse: 1.38623	valid's rmse: 2.29356
Early stopping, best iteration is:
[308]	train's rmse: 1.37807	valid's rmse: 2.29245
================================================================================
<Light-GBM> OVERALL RMSE     : 2.260331683533115


# 중복되지 않는 위경도 값들을 수집함
combined = train[['latitude', 'longitude']].append(test[['latitude', 'longitude']])
combined = combined.drop_duplicates()

# kmeans를 통하여 군집화 
kmeans = KMeans(n_clusters = int(sqrt(len(combined))), random_state=1993)
kmeans.fit(combined)

train['station_code_kmeans'] = kmeans.predict(train[['latitude', 'longitude']])
test['station_code_kmeans'] = kmeans.predict(test[['latitude', 'longitude']])


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code', 'station_code_kmeans'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.74036	valid's rmse: 2.34768
Early stopping, best iteration is:
[139]	train's rmse: 1.63143	valid's rmse: 2.33906
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.77957	valid's rmse: 2.15445
[200]	train's rmse: 1.54419	valid's rmse: 2.14345
[300]	train's rmse: 1.40646	valid's rmse: 2.13406
Early stopping, best iteration is:
[311]	train's rmse: 1.39474	valid's rmse: 2.13285
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75307	valid's rmse: 2.30793
[200]	train's rmse: 1.52079	valid's rmse: 2.27812
Early stopping, best iteration is:
[192]	train's rmse: 1.53371	valid's rmse: 2.27501
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75665	valid's rmse: 2.30983
[200]	train's rmse: 1.52562	valid's rmse: 2.26954
[300]	train's rmse: 1.38872	valid's rmse: 2.26113
[400]	train's rmse: 1.29594	valid's rmse: 2.25728
[500]	train's rmse: 1.22455	valid's rmse: 2.25507
Early stopping, best iteration is:
[469]	train's rmse: 1.24543	valid's rmse: 2.25452
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.74818	valid's rmse: 2.35159
[200]	train's rmse: 1.5146	valid's rmse: 2.32137
[300]	train's rmse: 1.37806	valid's rmse: 2.30962
[400]	train's rmse: 1.28658	valid's rmse: 2.30433
Early stopping, best iteration is:
[401]	train's rmse: 1.2859	valid's rmse: 2.30412
================================================================================
<Light-GBM> OVERALL RMSE     : 2.2622008933054794


lgbm_param = {'objective': 'rmse',
             'boosting_type': 'gbdt',
             'random_state': 1993,
             'learning_rate': 0.1,
             'subsample': 0.7,
             'tree_learner': 'serial',
             'colsample_bytree': 0.78,
#              'early_stopping_rounds': 50,
             'subsample_freq': 1,
             'reg_lambda': 7,
             'reg_alpha': 5,
             'num_leaves': 96,
             'seed': 1993}


reg_model = lgbm.LGBMRegressor(**lgbm_param)

# RFE(Recursive Feature Elimination)
rfe = RFECV(estimator=reg_model, step=1, # 한 번 학습이 진행될 때마다 몇 개의 피처를 제거하고 싶으냐? 큰 숫자를 선택하면 학습시간 단축 
            cv=KFold(n_splits=5, shuffle=True, random_state=231), scoring='neg_mean_squared_error', verbose=2)
rfe.fit(train.drop(drop_cols,1), train_label)

Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.

RFECV(cv=KFold(n_splits=5, random_state=231, shuffle=True),
      estimator=LGBMRegressor(colsample_bytree=0.78, num_leaves=96,
                              objective='rmse', random_state=1993, reg_alpha=5,
                              reg_lambda=7, seed=1993, subsample=0.7,
                              subsample_freq=1, tree_learner='serial'),
      scoring='neg_mean_squared_error', verbose=2)


df_rank = pd.DataFrame(data = {'col' : list(train.drop(drop_cols,1)), 'imp' : rfe.ranking_}) # 랭킹이 1에 가까울수록 의미 있는 피처다 
use_cols = list(df_rank[df_rank['imp'] == 1]['col'])


lgbm_param = {'objective': 'rmse',
             'boosting_type': 'gbdt',
             'random_state': 1993,
             'learning_rate': 0.1,
             'subsample': 0.7,
             'tree_learner': 'serial',
             'colsample_bytree': 0.78,
             'early_stopping_rounds': 50,
             'subsample_freq': 1,
             'reg_lambda': 7,
             'reg_alpha': 5,
             'num_leaves': 96,
             'seed': 1993}


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train[use_cols], y_train)
    dvalid = lgbm.Dataset(X_valid[use_cols], y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code', 'station_code_kmeans'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid[use_cols])
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75777	valid's rmse: 2.35688
[200]	train's rmse: 1.52646	valid's rmse: 2.34582
[300]	train's rmse: 1.39089	valid's rmse: 2.33528
Early stopping, best iteration is:
[302]	train's rmse: 1.38892	valid's rmse: 2.33496
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.7989	valid's rmse: 2.16992
[200]	train's rmse: 1.56383	valid's rmse: 2.15231
Early stopping, best iteration is:
[240]	train's rmse: 1.50043	valid's rmse: 2.14869
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.76513	valid's rmse: 2.32654
[200]	train's rmse: 1.53325	valid's rmse: 2.29655
[300]	train's rmse: 1.40288	valid's rmse: 2.28792
[400]	train's rmse: 1.31201	valid's rmse: 2.28522
[500]	train's rmse: 1.24289	valid's rmse: 2.28305
[600]	train's rmse: 1.18573	valid's rmse: 2.27828
Early stopping, best iteration is:
[639]	train's rmse: 1.16609	valid's rmse: 2.27801
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.77643	valid's rmse: 2.32143
[200]	train's rmse: 1.53992	valid's rmse: 2.27169
[300]	train's rmse: 1.40421	valid's rmse: 2.25443
[400]	train's rmse: 1.30902	valid's rmse: 2.24979
[500]	train's rmse: 1.23751	valid's rmse: 2.2473
[600]	train's rmse: 1.1799	valid's rmse: 2.24384
[700]	train's rmse: 1.13299	valid's rmse: 2.24388
Early stopping, best iteration is:
[651]	train's rmse: 1.15509	valid's rmse: 2.24292
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.76481	valid's rmse: 2.34269
[200]	train's rmse: 1.52873	valid's rmse: 2.30365
[300]	train's rmse: 1.39461	valid's rmse: 2.29107
Early stopping, best iteration is:
[308]	train's rmse: 1.38632	valid's rmse: 2.29066
================================================================================
<Light-GBM> OVERALL RMSE     : 2.259915030786093


# 모델에 쓰일 parameter 정의하기
n_splits= 5
NUM_BOOST_ROUND = 100000
SEED = 1993
lgbm_param = {'objective':'rmse',
              'boosting_type': 'gbdt',
              'random_state':1993,
              'learning_rate':0.01,
              'subsample':0.7,
              'tree_learner': 'serial',
              'colsample_bytree':0.68,
              'early_stopping_rounds':50,
              'subsample_freq': 1,
              'reg_lambda':7,
              'reg_alpha': 5,
              'num_leaves': 96,
              'seed' : SEED
            }

n_rounds = 100000
cat_params = {
        'n_estimators': n_rounds,
        'learning_rate': 0.08, # 0.005로 하면 폴드당 2시간 걸림 
        'eval_metric': 'RMSE', 
        'loss_function': 'RMSE',
        'random_seed': 42, 
        'metric_period': 500, # 메트릭스 평가 주기 
        'od_wait': 500, # 오버피팅을 방지 할 때 
        'task_type': 'GPU', # 무조건 GPU를 사용하는게 좋음. settings > accelerator를 GPU로 설정 
       'l2_leaf_reg' : 3, # 얼마나 많이 regularization을 줄것인가. 2~3
        'depth': 8, # 보통 5~10 정도 
    }


target_col = '18~20_ride'
drop_cols = ['date','id',target_col]
train_label = train[target_col]


# 형식을 맞춰주기 위해서 Test-set에 '18~20_ride' columns을 만들어줌
test[target_col] = np.NaN


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))
lgbm_oof_test = np.zeros((test.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)


# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    test['station_code_te'] = test['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code', 'station_code_kmeans'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
    lgbm_test_pred = model.predict(test.drop(drop_cols, 1))
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    lgbm_oof_test += lgbm_test_pred/ n_splits
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.02832	valid's rmse: 3.07656
[200]	train's rmse: 2.44893	valid's rmse: 2.61618
[300]	train's rmse: 2.20433	valid's rmse: 2.459
[400]	train's rmse: 2.07505	valid's rmse: 2.40168
[500]	train's rmse: 1.98787	valid's rmse: 2.37225
[600]	train's rmse: 1.92107	valid's rmse: 2.35463
[700]	train's rmse: 1.86691	valid's rmse: 2.34302
[800]	train's rmse: 1.82279	valid's rmse: 2.33718
[900]	train's rmse: 1.78382	valid's rmse: 2.33224
[1000]	train's rmse: 1.74746	valid's rmse: 2.32641
[1100]	train's rmse: 1.71501	valid's rmse: 2.32323
[1200]	train's rmse: 1.68576	valid's rmse: 2.31939
[1300]	train's rmse: 1.65859	valid's rmse: 2.31672
[1400]	train's rmse: 1.63323	valid's rmse: 2.31355
[1500]	train's rmse: 1.61027	valid's rmse: 2.31166
[1600]	train's rmse: 1.58846	valid's rmse: 2.31103
[1700]	train's rmse: 1.56825	valid's rmse: 2.30873
[1800]	train's rmse: 1.54876	valid's rmse: 2.30801
[1900]	train's rmse: 1.53052	valid's rmse: 2.30615
Early stopping, best iteration is:
[1901]	train's rmse: 1.53036	valid's rmse: 2.30614
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.0602	valid's rmse: 2.92058
[200]	train's rmse: 2.49002	valid's rmse: 2.44412
[300]	train's rmse: 2.2498	valid's rmse: 2.28688
[400]	train's rmse: 2.11524	valid's rmse: 2.21899
[500]	train's rmse: 2.02692	valid's rmse: 2.18198
[600]	train's rmse: 1.96029	valid's rmse: 2.16117
[700]	train's rmse: 1.90338	valid's rmse: 2.14572
[800]	train's rmse: 1.85608	valid's rmse: 2.13595
[900]	train's rmse: 1.814	valid's rmse: 2.12795
[1000]	train's rmse: 1.77657	valid's rmse: 2.12116
[1100]	train's rmse: 1.74299	valid's rmse: 2.11643
[1200]	train's rmse: 1.7123	valid's rmse: 2.11336
[1300]	train's rmse: 1.68407	valid's rmse: 2.10895
[1400]	train's rmse: 1.65797	valid's rmse: 2.106
[1500]	train's rmse: 1.6331	valid's rmse: 2.10387
[1600]	train's rmse: 1.61059	valid's rmse: 2.10236
[1700]	train's rmse: 1.5889	valid's rmse: 2.10005
[1800]	train's rmse: 1.5693	valid's rmse: 2.09801
[1900]	train's rmse: 1.55018	valid's rmse: 2.09565
[2000]	train's rmse: 1.53166	valid's rmse: 2.09422
[2100]	train's rmse: 1.51406	valid's rmse: 2.09279
[2200]	train's rmse: 1.49808	valid's rmse: 2.09074
[2300]	train's rmse: 1.48272	valid's rmse: 2.08948
[2400]	train's rmse: 1.46826	valid's rmse: 2.08842
[2500]	train's rmse: 1.45394	valid's rmse: 2.08739
[2600]	train's rmse: 1.44087	valid's rmse: 2.08656
Early stopping, best iteration is:
[2591]	train's rmse: 1.44206	valid's rmse: 2.08648
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.02987	valid's rmse: 3.11315
[200]	train's rmse: 2.45429	valid's rmse: 2.63787
[300]	train's rmse: 2.21383	valid's rmse: 2.46554
[400]	train's rmse: 2.08319	valid's rmse: 2.38673
[500]	train's rmse: 1.99474	valid's rmse: 2.34121
[600]	train's rmse: 1.92878	valid's rmse: 2.31435
[700]	train's rmse: 1.87421	valid's rmse: 2.2968
[800]	train's rmse: 1.82899	valid's rmse: 2.28744
[900]	train's rmse: 1.78902	valid's rmse: 2.27901
[1000]	train's rmse: 1.75319	valid's rmse: 2.2725
[1100]	train's rmse: 1.72081	valid's rmse: 2.26646
[1200]	train's rmse: 1.69108	valid's rmse: 2.26111
[1300]	train's rmse: 1.66418	valid's rmse: 2.257
[1400]	train's rmse: 1.63893	valid's rmse: 2.25445
[1500]	train's rmse: 1.61546	valid's rmse: 2.25114
[1600]	train's rmse: 1.59346	valid's rmse: 2.24926
[1700]	train's rmse: 1.57329	valid's rmse: 2.24786
[1800]	train's rmse: 1.554	valid's rmse: 2.24547
[1900]	train's rmse: 1.53555	valid's rmse: 2.24353
[2000]	train's rmse: 1.51814	valid's rmse: 2.24104
[2100]	train's rmse: 1.50178	valid's rmse: 2.23935
[2200]	train's rmse: 1.48642	valid's rmse: 2.23717
Early stopping, best iteration is:
[2226]	train's rmse: 1.48243	valid's rmse: 2.23657
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.01285	valid's rmse: 3.20948
[200]	train's rmse: 2.44406	valid's rmse: 2.7019
[300]	train's rmse: 2.20952	valid's rmse: 2.5168
[400]	train's rmse: 2.0783	valid's rmse: 2.43012
[500]	train's rmse: 1.99352	valid's rmse: 2.38218
[600]	train's rmse: 1.92875	valid's rmse: 2.34991
[700]	train's rmse: 1.8763	valid's rmse: 2.33044
[800]	train's rmse: 1.83172	valid's rmse: 2.31512
[900]	train's rmse: 1.79225	valid's rmse: 2.30426
[1000]	train's rmse: 1.75681	valid's rmse: 2.29425
[1100]	train's rmse: 1.72582	valid's rmse: 2.28926
[1200]	train's rmse: 1.69662	valid's rmse: 2.28237
[1300]	train's rmse: 1.66927	valid's rmse: 2.27698
[1400]	train's rmse: 1.64447	valid's rmse: 2.27193
[1500]	train's rmse: 1.62098	valid's rmse: 2.26841
[1600]	train's rmse: 1.59909	valid's rmse: 2.26439
[1700]	train's rmse: 1.57884	valid's rmse: 2.26032
[1800]	train's rmse: 1.55961	valid's rmse: 2.25677
[1900]	train's rmse: 1.54096	valid's rmse: 2.25487
[2000]	train's rmse: 1.5236	valid's rmse: 2.25187
[2100]	train's rmse: 1.50699	valid's rmse: 2.24896
[2200]	train's rmse: 1.49103	valid's rmse: 2.24703
[2300]	train's rmse: 1.47617	valid's rmse: 2.24501
[2400]	train's rmse: 1.46181	valid's rmse: 2.24327
[2500]	train's rmse: 1.44781	valid's rmse: 2.24184
[2600]	train's rmse: 1.43446	valid's rmse: 2.24091
[2700]	train's rmse: 1.42182	valid's rmse: 2.23904
[2800]	train's rmse: 1.4097	valid's rmse: 2.2378
[2900]	train's rmse: 1.39779	valid's rmse: 2.23601
[3000]	train's rmse: 1.38649	valid's rmse: 2.23537
[3100]	train's rmse: 1.37548	valid's rmse: 2.23405
[3200]	train's rmse: 1.36494	valid's rmse: 2.23236
[3300]	train's rmse: 1.35466	valid's rmse: 2.23181
Early stopping, best iteration is:
[3304]	train's rmse: 1.35426	valid's rmse: 2.23171
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.99377	valid's rmse: 3.23045
[200]	train's rmse: 2.43337	valid's rmse: 2.71394
[300]	train's rmse: 2.19939	valid's rmse: 2.53204
[400]	train's rmse: 2.07129	valid's rmse: 2.44952
[500]	train's rmse: 1.986	valid's rmse: 2.39908
[600]	train's rmse: 1.92161	valid's rmse: 2.37141
[700]	train's rmse: 1.86878	valid's rmse: 2.35128
[800]	train's rmse: 1.82458	valid's rmse: 2.33858
[900]	train's rmse: 1.78543	valid's rmse: 2.32872
[1000]	train's rmse: 1.74897	valid's rmse: 2.31906
[1100]	train's rmse: 1.71658	valid's rmse: 2.31134
[1200]	train's rmse: 1.68727	valid's rmse: 2.30474
[1300]	train's rmse: 1.65982	valid's rmse: 2.29863
[1400]	train's rmse: 1.63429	valid's rmse: 2.29387
[1500]	train's rmse: 1.61017	valid's rmse: 2.28881
[1600]	train's rmse: 1.58838	valid's rmse: 2.28518
[1700]	train's rmse: 1.56794	valid's rmse: 2.28271
[1800]	train's rmse: 1.54872	valid's rmse: 2.27911
[1900]	train's rmse: 1.53063	valid's rmse: 2.27727
[2000]	train's rmse: 1.51323	valid's rmse: 2.27499
[2100]	train's rmse: 1.49665	valid's rmse: 2.27284
[2200]	train's rmse: 1.48094	valid's rmse: 2.27168
[2300]	train's rmse: 1.46596	valid's rmse: 2.26997
[2400]	train's rmse: 1.45187	valid's rmse: 2.26905
[2500]	train's rmse: 1.43816	valid's rmse: 2.26857
[2600]	train's rmse: 1.42495	valid's rmse: 2.26721
[2700]	train's rmse: 1.41254	valid's rmse: 2.26589
[2800]	train's rmse: 1.40113	valid's rmse: 2.26527
[2900]	train's rmse: 1.38953	valid's rmse: 2.26442
[3000]	train's rmse: 1.37847	valid's rmse: 2.26289
[3100]	train's rmse: 1.36795	valid's rmse: 2.26221
[3200]	train's rmse: 1.35745	valid's rmse: 2.26139
[3300]	train's rmse: 1.34749	valid's rmse: 2.26099
[3400]	train's rmse: 1.3378	valid's rmse: 2.26073
[3500]	train's rmse: 1.32838	valid's rmse: 2.26008
Early stopping, best iteration is:
[3483]	train's rmse: 1.32989	valid's rmse: 2.25993
================================================================================
<Light-GBM> OVERALL RMSE     : 2.2253883326990787


# 각 모델에 대한 oof 정의
cat_oof_train = np.zeros((train.shape[0]))
cat_oof_test = np.zeros((test.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)


# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    test['station_code_te'] = test['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    # (5) CATBOOST
    print("---TRAINING CATBOOST---")
    
    # model 정의&학습
    model = CatBoostRegressor(**cat_params)
    
    model.fit( X_train, y_train, eval_set = (X_valid, y_valid), 
              cat_features  = ['bus_route_id','station_code', 'station_code_kmeans'],
              use_best_model=True,
              verbose=True)
    
    # local_valid/local_test에 대한 예측
    cat_valid_pred = model.predict(X_valid)
    cat_test_pred = model.predict(test.drop(drop_cols, 1))
        
    cat_oof_train[val_ind] = cat_valid_pred
    cat_oof_test += cat_test_pred/ n_splits
    print('='*80)
    
print(f"<CATBOOST> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, cat_oof_train ) )}")

---TRAINING CATBOOST---
0:	learn: 4.5380824	test: 4.4306551	best: 4.4306551 (0)	total: 51.4ms	remaining: 1h 25m 37s
500:	learn: 1.9458630	test: 2.4234457	best: 2.4234457 (500)	total: 18.4s	remaining: 1h 51s
1000:	learn: 1.7374907	test: 2.3683804	best: 2.3683399 (999)	total: 35.7s	remaining: 58m 52s
1500:	learn: 1.6057637	test: 2.3434642	best: 2.3433077 (1490)	total: 55.4s	remaining: 1h 33s
2000:	learn: 1.5104954	test: 2.3334114	best: 2.3331079 (1992)	total: 1m 14s	remaining: 1h 1m 3s
2500:	learn: 1.4356077	test: 2.3268342	best: 2.3268342 (2500)	total: 1m 33s	remaining: 1h 38s
3000:	learn: 1.3728601	test: 2.3220920	best: 2.3220725 (2999)	total: 1m 51s	remaining: 59m 49s
3500:	learn: 1.3176372	test: 2.3196005	best: 2.3196005 (3500)	total: 2m 14s	remaining: 1h 1m 47s
4000:	learn: 1.2679179	test: 2.3186669	best: 2.3179452 (3758)	total: 2m 33s	remaining: 1h 1m 21s
4500:	learn: 1.2233283	test: 2.3157834	best: 2.3156541 (4486)	total: 2m 51s	remaining: 1h 40s
5000:	learn: 1.1817428	test: 2.3139396	best: 2.3137003 (4955)	total: 3m 9s	remaining: 1h
5500:	learn: 1.1435710	test: 2.3125356	best: 2.3124823 (5331)	total: 3m 27s	remaining: 59m 31s
6000:	learn: 1.1105757	test: 2.3122229	best: 2.3115961 (5675)	total: 3m 46s	remaining: 59m 2s
bestTest = 2.311596107
bestIteration = 5675
Shrink model to first 5676 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.5606310	test: 4.3299385	best: 4.3299385 (0)	total: 72.8ms	remaining: 2h 1m 16s
500:	learn: 1.9502860	test: 2.2283864	best: 2.2278009 (499)	total: 17.1s	remaining: 56m 42s
1000:	learn: 1.7326403	test: 2.1649495	best: 2.1649495 (1000)	total: 35.6s	remaining: 58m 38s
1500:	learn: 1.6062223	test: 2.1456226	best: 2.1456226 (1500)	total: 53s	remaining: 58m
2000:	learn: 1.5094705	test: 2.1333419	best: 2.1333354 (1997)	total: 1m 13s	remaining: 59m 56s
2500:	learn: 1.4362050	test: 2.1259394	best: 2.1258195 (2485)	total: 1m 31s	remaining: 59m 12s
3000:	learn: 1.3701407	test: 2.1206459	best: 2.1206111 (2986)	total: 1m 48s	remaining: 58m 39s
3500:	learn: 1.3168496	test: 2.1159085	best: 2.1158653 (3456)	total: 2m 6s	remaining: 58m 16s
4000:	learn: 1.2666220	test: 2.1130611	best: 2.1130342 (3988)	total: 2m 24s	remaining: 57m 58s
4500:	learn: 1.2218785	test: 2.1104105	best: 2.1102724 (4473)	total: 2m 43s	remaining: 57m 40s
5000:	learn: 1.1821355	test: 2.1087003	best: 2.1085766 (4982)	total: 3m 1s	remaining: 57m 24s
5500:	learn: 1.1451785	test: 2.1069246	best: 2.1069234 (5499)	total: 3m 19s	remaining: 57m 9s
6000:	learn: 1.1098986	test: 2.1061286	best: 2.1060720 (5992)	total: 3m 38s	remaining: 56m 58s
6500:	learn: 1.0784733	test: 2.1053291	best: 2.1052393 (6461)	total: 3m 57s	remaining: 56m 49s
7000:	learn: 1.0492986	test: 2.1046939	best: 2.1045145 (6882)	total: 4m 15s	remaining: 56m 39s
7500:	learn: 1.0222655	test: 2.1049285	best: 2.1044806 (7229)	total: 4m 37s	remaining: 57m
bestTest = 2.104480622
bestIteration = 7229
Shrink model to first 7230 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.5317548	test: 4.4867464	best: 4.4867464 (0)	total: 62.3ms	remaining: 1h 43m 51s
500:	learn: 1.9579903	test: 2.3418965	best: 2.3418965 (500)	total: 17.4s	remaining: 57m 43s
1000:	learn: 1.7426254	test: 2.2779696	best: 2.2779324 (996)	total: 34.7s	remaining: 57m 13s
1500:	learn: 1.6108901	test: 2.2524855	best: 2.2523982 (1499)	total: 52.1s	remaining: 56m 56s
2000:	learn: 1.5137670	test: 2.2393605	best: 2.2393192 (1996)	total: 1m 9s	remaining: 56m 48s
2500:	learn: 1.4350160	test: 2.2303720	best: 2.2303720 (2500)	total: 1m 27s	remaining: 56m 40s
3000:	learn: 1.3697180	test: 2.2240323	best: 2.2238934 (2992)	total: 1m 44s	remaining: 56m 28s
3500:	learn: 1.3135203	test: 2.2190919	best: 2.2190630 (3499)	total: 2m 2s	remaining: 56m 25s
4000:	learn: 1.2640887	test: 2.2157533	best: 2.2155969 (3836)	total: 2m 20s	remaining: 56m 19s
4500:	learn: 1.2188958	test: 2.2143293	best: 2.2142076 (4473)	total: 2m 41s	remaining: 57m 1s
5000:	learn: 1.1769693	test: 2.2127681	best: 2.2126931 (4987)	total: 2m 59s	remaining: 56m 46s
5500:	learn: 1.1393745	test: 2.2111695	best: 2.2111040 (5492)	total: 3m 17s	remaining: 56m 35s
6000:	learn: 1.1054532	test: 2.2103737	best: 2.2101124 (5927)	total: 3m 36s	remaining: 56m 25s
6500:	learn: 1.0748606	test: 2.2093511	best: 2.2092954 (6496)	total: 3m 54s	remaining: 56m 12s
7000:	learn: 1.0458828	test: 2.2086489	best: 2.2085531 (6989)	total: 4m 13s	remaining: 56m 2s
bestTest = 2.208553133
bestIteration = 6989
Shrink model to first 6990 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.4939595	test: 4.6255407	best: 4.6255407 (0)	total: 50.5ms	remaining: 1h 24m 5s
500:	learn: 1.9440616	test: 2.3928196	best: 2.3926411 (499)	total: 17.5s	remaining: 57m 47s
1000:	learn: 1.7339401	test: 2.3241139	best: 2.3241139 (1000)	total: 37.5s	remaining: 1h 1m 52s
1500:	learn: 1.5988560	test: 2.2907101	best: 2.2907101 (1500)	total: 55s	remaining: 1h 11s
2000:	learn: 1.5021932	test: 2.2749802	best: 2.2749802 (2000)	total: 1m 12s	remaining: 59m 13s
2500:	learn: 1.4257824	test: 2.2665642	best: 2.2665642 (2500)	total: 1m 30s	remaining: 58m 40s
3000:	learn: 1.3612599	test: 2.2616051	best: 2.2615572 (2999)	total: 1m 48s	remaining: 58m 17s
3500:	learn: 1.3022347	test: 2.2583020	best: 2.2582232 (3498)	total: 2m 6s	remaining: 57m 58s
4000:	learn: 1.2524294	test: 2.2567844	best: 2.2567844 (4000)	total: 2m 24s	remaining: 57m 43s
4500:	learn: 1.2057899	test: 2.2533363	best: 2.2530857 (4429)	total: 2m 42s	remaining: 57m 27s
5000:	learn: 1.1652112	test: 2.2519719	best: 2.2517978 (4940)	total: 3m	remaining: 57m 12s
5500:	learn: 1.1289520	test: 2.2511565	best: 2.2509896 (5419)	total: 3m 19s	remaining: 57m 1s
6000:	learn: 1.0966734	test: 2.2506204	best: 2.2502090 (5943)	total: 3m 37s	remaining: 56m 51s
6500:	learn: 1.0648663	test: 2.2491292	best: 2.2490359 (6351)	total: 3m 58s	remaining: 57m 13s
7000:	learn: 1.0356337	test: 2.2488041	best: 2.2486141 (6918)	total: 4m 17s	remaining: 56m 58s
7500:	learn: 1.0097624	test: 2.2483700	best: 2.2478296 (7255)	total: 4m 36s	remaining: 56m 45s
bestTest = 2.247829629
bestIteration = 7255
Shrink model to first 7256 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.4609746	test: 4.7316398	best: 4.7316398 (0)	total: 119ms	remaining: 3h 17m 53s
500:	learn: 1.9311151	test: 2.4198401	best: 2.4198401 (500)	total: 21.1s	remaining: 1h 9m 49s
1000:	learn: 1.7206497	test: 2.3558822	best: 2.3558822 (1000)	total: 38.5s	remaining: 1h 3m 23s
1500:	learn: 1.5928112	test: 2.3273654	best: 2.3273654 (1500)	total: 55.9s	remaining: 1h 1m 7s
2000:	learn: 1.4977208	test: 2.3166917	best: 2.3166917 (2000)	total: 1m 13s	remaining: 59m 51s
2500:	learn: 1.4205029	test: 2.3103078	best: 2.3103078 (2500)	total: 1m 30s	remaining: 59m 4s
3000:	learn: 1.3558684	test: 2.3049002	best: 2.3047658 (2995)	total: 1m 51s	remaining: 59m 57s
3500:	learn: 1.3017111	test: 2.3025570	best: 2.3024489 (3487)	total: 2m 9s	remaining: 59m 24s
4000:	learn: 1.2525947	test: 2.3015429	best: 2.3015032 (3999)	total: 2m 27s	remaining: 58m 55s
4500:	learn: 1.2069589	test: 2.3015803	best: 2.3011106 (4316)	total: 2m 45s	remaining: 58m 29s
bestTest = 2.301110649
bestIteration = 4316
Shrink model to first 4317 iterations.
================================================================================
<CATBOOST> OVERALL RMSE     : 2.235972462865775


# 제출 파일 만들기
ensemble_pred = 0.5 * ( lgbm_oof_test+ cat_oof_test )
sample_submission[target_col] = np.clip( ensemble_pred, 0 , max(ensemble_pred) )


# Train-set의 실제값과 예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( train_label, color='r' , label='real')
sns.distplot( 0.5*(lgbm_oof_train + cat_oof_train), color='b', label='prediction' )
plt.legend()
plt.title("Real Vs Prediction");


# Train-set/Test-set의  예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( 0.5*(lgbm_oof_train + cat_oof_train), color='r' , label='Train')
sns.distplot( ensemble_pred, color='b', label='Test' )
plt.legend()
plt.title("Prediction for Train/Test-set");


from IPython.display import FileLink

sample_submission.to_csv('lgbm_catboost_ensemble.csv', index=False)


FileLink('lgbm_catboost_ensemble.csv')


import multiprocessing 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings 
warnings.filterwarnings('ignore', category=RuntimeWarning)


column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

data = pd.read_csv('https://raw.githubusercontent.com/minaahayley/kaggle/main/The%20Boston%20Housing%20Dataset/housing.csv', header=None, delimiter=r"\s+", names=column_names)


data.head(5)


data.shape

(506, 14)


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


data.describe()


fig, axs = plt.subplots(figsize=(24, 10), ncols=7, nrows=2)
features = data.columns

for i, feature in enumerate(features): 
  row = int(i/7)
  col = i%7
  sns.boxplot(y=feature, data=data, ax=axs[row][col])


data = data[~(data['CRIM'] >= 60)]
data = data[~(data['ZN'] >= 60)]
data = data[~(data['B'] <= 200)]
data = data[~(data['MEDV'] >= 50)]


data.hist(figsize=(20, 20), bins=50)
plt.show()


plt.figure(figsize=(16, 16))

sns.heatmap(data.corr(), 
            linewidths=0.1,
            square=True,
            annot=True,
            fmt='.2f',
            cmap='Blues')

<matplotlib.axes._subplots.AxesSubplot at 0x7f478c821150>


fig, axs = plt.subplots(figsize=(20, 20), ncols=4, nrows=4)
features = data.drop('MEDV', axis=1).describe().columns

for i, feature in enumerate(features): 
  row = int(i/4)
  col = i%4
  sns.regplot(y='MEDV', x=feature, data=data, ax=axs[row][col])


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  
from sklearn.preprocessing import StandardScaler, PolynomialFeatures 
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor


X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)

print('학습 데이터 점수 : {}'.format(lr_reg.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(lr_reg.score(X_test, y_test)))

학습 데이터 점수 : 0.7692273559306411
평가 데이터 점수 : 0.7601485300868639


estimator = make_pipeline(
    StandardScaler(),
    LinearRegression()
) 

# cross_val_score는 자체적으로 교차 검증하기 때문에 train_test_split() 필요 없음!
nsme = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nsme 
rmse = np.sqrt(mse)

print('NMSE scores: {}'.format(nsme))
print('NSME scores mean: {}'.format(nsme.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))

NMSE scores: [ -7.90343922 -13.43802524 -29.71478895 -26.44629372 -20.4599371 ]
NSME scores mean: -19.59249684665766
RMSE scores : [2.81130561 3.66579122 5.45112731 5.14259601 4.5232662 ]
RMSE scores mean: 4.318817268412426


X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


ridge_reg = Ridge(alpha=0.2)
ridge_reg.fit(X_train, y_train)

print('학습 데이터 점수 : {}'.format(ridge_reg.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(ridge_reg.score(X_test, y_test)))

학습 데이터 점수 : 0.7974029332754818
평가 데이터 점수 : 0.5967890252716412


estimator = make_pipeline(
    StandardScaler(),
    Ridge(alpha=0.2)
) 

# cross_val_score는 자체적으로 교차 검증하기 때문에 train_test_split() 필요 없음!
nsme = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nsme 
rmse = np.sqrt(mse)

print('NMSE scores: {}'.format(nsme))
print('NSME scores mean: {}'.format(nsme.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))

NMSE scores: [ -7.89999042 -13.43927207 -29.75215989 -26.43017164 -20.39878673]
NSME scores mean: -19.58407615161176
RMSE scores : [2.81069216 3.66596128 5.45455405 5.14102827 4.5165016 ]
RMSE scores mean: 4.317747471315019


# Lets try polinomial regression with L2 with degree for the best fit
pipe = Pipeline([('scaler', StandardScaler()),
                ('poly', PolynomialFeatures()),
                ('model', Ridge())
                 ])

param_grid = [{
    'model__alpha':  [1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1.0, 5.0, 10.0],
    'poly__degree' : [2, 3, 4, 5]}]

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('poly', PolynomialFeatures()),
                                       ('model', Ridge())]),
             n_jobs=2,
             param_grid=[{'model__alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5,
                                           1.0, 5.0, 10.0],
                          'poly__degree': [2, 3, 4, 5]}],
             scoring='neg_mean_squared_error', verbose=True)


print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))

{'model__alpha': 10.0, 'poly__degree': 2}
GridSearchCV best score: -16.240601902637287


model = gs.best_estimator_
model.fit(X_train, y_train)

print('학습 데이터 점수 : {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(model.score(X_test, y_test)))

학습 데이터 점수 : 0.935980322533816
평가 데이터 점수 : 0.8149177039789857


X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


rf_reg = RandomForestRegressor(n_estimators=500)
rf_reg.fit(X_train, y_train)
print(rf_reg.__class__.__name__)
print('학습 데이터 점수 : {}'.format(rf_reg.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(rf_reg.score(X_test, y_test)))

gbm_reg = GradientBoostingRegressor(n_estimators=500)
gbm_reg.fit(X_train.values, y_train.values)
print(gbm_reg.__class__.__name__)
print('학습 데이터 점수 : {}'.format(gbm_reg.score(X_train.values, y_train.values)))
print('평가 데이터 점수 : {}'.format(gbm_reg.score(X_test.values, y_test.values)))

xgb_reg = XGBRegressor(n_estimators=500)
xgb_reg.fit(X_train, y_train)
print(xgb_reg.__class__.__name__)
print('학습 데이터 점수 : {}'.format(xgb_reg.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(xgb_reg.score(X_test, y_test)))

lgbm_reg = LGBMRegressor(n_estimators=500)
lgbm_reg.fit(X_train, y_train)
print(lgbm_reg.__class__.__name__)
print('학습 데이터 점수 : {}'.format(lgbm_reg.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(lgbm_reg.score(X_test, y_test)))

RandomForestRegressor
학습 데이터 점수 : 0.9774266949165727
평가 데이터 점수 : 0.9090184770195461
GradientBoostingRegressor
학습 데이터 점수 : 0.9994073221592501
평가 데이터 점수 : 0.903574495447218
[00:25:56] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor
학습 데이터 점수 : 0.9986843588761848
평가 데이터 점수 : 0.8960593693909321
LGBMRegressor
학습 데이터 점수 : 0.9983850695994405
평가 데이터 점수 : 0.9021560029616011


estimator = make_pipeline(
    StandardScaler(),
    GradientBoostingRegressor(n_estimators=500)
) 

# cross_val_score는 자체적으로 교차 검증하기 때문에 train_test_split() 필요 없음!
nsme = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nsme 
rmse = np.sqrt(mse)

print('NMSE scores: {}'.format(nsme))
print('NSME scores mean: {}'.format(nsme.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))

NMSE scores: [ -9.6284338   -9.21428619 -19.89510365 -12.6626706  -12.47762058]
NSME scores mean: -12.775622963814307
RMSE scores : [3.10297177 3.03550427 4.46039277 3.55846464 3.53236756]
RMSE scores mean: 3.537940202297387


pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingRegressor())
])

param_grid = [{'model__n_estimators': [100, 200],
               'model__learning_rate': [0.1, 0.05, 0.02],
               'model__max_depth':[2, 4, 6],
               'model__min_samples_leaf':[3, 5, 9]}]

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', GradientBoostingRegressor())]),
             n_jobs=2,
             param_grid=[{'model__learning_rate': [0.1, 0.05, 0.02],
                          'model__max_depth': [2, 4, 6],
                          'model__min_samples_leaf': [3, 5, 9],
                          'model__n_estimators': [100, 200]}],
             scoring='neg_mean_squared_error', verbose=True)


print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))

{'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__min_samples_leaf': 3, 'model__n_estimators': 200}
GridSearchCV best score: -11.952847265529417


model = gs.best_estimator_
model.fit(X_train, y_train)

print('학습 데이터 점수 : {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(model.score(X_test, y_test)))

학습 데이터 점수 : 0.989011848031319
평가 데이터 점수 : 0.901290256127608


X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


model = KNeighborsRegressor()
model.fit(X_train, y_train)

print('학습 데이터 점수 : {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(model.score(X_test, y_test)))

학습 데이터 점수 : 0.6849552608478233
평가 데이터 점수 : 0.5742663848300056


estimator = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
) 

# cross_val_score는 자체적으로 교차 검증하기 때문에 train_test_split() 필요 없음!
nsme = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nsme 
rmse = np.sqrt(mse)

print('NMSE scores: {}'.format(nsme))
print('NSME scores mean: {}'.format(nsme.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))

NMSE scores: [-13.30320482 -17.21877108 -33.15777831 -10.2794878  -17.13891707]
NSME scores mean: -18.21963181898325
RMSE scores : [3.64735587 4.14955071 5.75827911 3.20616403 4.13991752]
RMSE scores mean: 4.180253447759364


pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsRegressor())
])

param_grid = [{'model__n_neighbors': [3, 5, 7],
               'model__weights': ['uniform', 'distance'],
               'model__algorithm': ['ball_tree', 'kd_tree', 'brute']}]

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', KNeighborsRegressor())]),
             n_jobs=2,
             param_grid=[{'model__algorithm': ['ball_tree', 'kd_tree', 'brute'],
                          'model__n_neighbors': [3, 5, 7],
                          'model__weights': ['uniform', 'distance']}],
             scoring='neg_mean_squared_error', verbose=True)


print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))

{'model__algorithm': 'ball_tree', 'model__n_neighbors': 5, 'model__weights': 'distance'}
GridSearchCV best score: -17.805454212220518


model = gs.best_estimator_
model.fit(X_train, y_train)

print('학습 데이터 점수 : {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(model.score(X_test, y_test)))

학습 데이터 점수 : 1.0
평가 데이터 점수 : 0.7639422096383729


X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


svr = SVR()
svr.fit(X_train, y_train)

print('SVR 학습 데이터 점수 : {}'.format(svr.score(X_train, y_train)))
print('SVR 평가 데이터 점수 : {}'.format(svr.score(X_test, y_test)))

SVR 학습 데이터 점수 : 0.24123466543839422
SVR 평가 데이터 점수 : 0.25537782133048303


param_grid = [{'kernel': ['rbf', 'polynomial', 'sigmoid', 'linear']}]

gs = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits

/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py:972: UserWarning: One or more of the test scores are non-finite: [-45.01455364          nan -62.30917282 -20.83770115]
  category=UserWarning,

GridSearchCV(cv=5, estimator=SVR(), n_jobs=2,
             param_grid=[{'kernel': ['rbf', 'polynomial', 'sigmoid',
                                     'linear']}],
             scoring='neg_mean_squared_error', verbose=True)


print(gs.best_estimator_)
print(gs.best_params_)

SVR(kernel='linear')
{'kernel': 'linear'}


linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train)

print('Linear SVR 학습 데이터 점수 : {}'.format(linear_svr.score(X_train, y_train)))
print('Linear SVR 평가 데이터 점수 : {}'.format(linear_svr.score(X_test, y_test)))

Linear SVR 학습 데이터 점수 : 0.7742112255059793
Linear SVR 평가 데이터 점수 : 0.6662563800418307


linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train_scale, y_train)

print('Linear SVR 학습 데이터 점수 : {}'.format(linear_svr.score(X_train_scale, y_train)))
print('Linear SVR 평가 데이터 점수 : {}'.format(linear_svr.score(X_test_scale, y_test)))

Linear SVR 학습 데이터 점수 : 0.780745552242202
Linear SVR 평가 데이터 점수 : 0.6506520061702693


estimator = make_pipeline(
    StandardScaler(),
    SVR(kernel='linear')
) 

# cross_val_score는 자체적으로 교차 검증하기 때문에 train_test_split() 필요 없음!
nsme = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nsme 
rmse = np.sqrt(mse)

print('NMSE scores: {}'.format(nsme))
print('NSME scores mean: {}'.format(nsme.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))

NMSE scores: [ -6.54959765 -12.3597567  -42.4239171  -24.02037228 -19.43607983]
NSME scores mean: -20.957944712932324
RMSE scores : [2.55921817 3.51564456 6.5133645  4.90105828 4.40863696]
RMSE scores mean: 4.379584495005061


pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVR(kernel='linear'))
])

param_grid = [{
    'model__gamma' : ['scale', 'auto'],
    'model__C' : [1.0, 0.1, 0.01],
    'model__epsilon' : [1.0, 0.1, 0.01]
}]

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', SVR(kernel='linear'))]),
             n_jobs=2,
             param_grid=[{'model__C': [1.0, 0.1, 0.01],
                          'model__epsilon': [1.0, 0.1, 0.01],
                          'model__gamma': ['scale', 'auto']}],
             scoring='neg_mean_squared_error', verbose=True)


print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))

{'model__C': 0.1, 'model__epsilon': 1.0, 'model__gamma': 'scale'}
GridSearchCV best score: -20.282059924605562


model = gs.best_estimator_
model.fit(X_train, y_train)

print('학습 데이터 점수 : {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수 : {}'.format(model.score(X_test, y_test)))

학습 데이터 점수 : 0.766644936233691
평가 데이터 점수 : 0.670167100161414


import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


submission = pd.read_csv('https://raw.githubusercontent.com/minaahayley/kaggle/main/customer-churn-prediction-2020/data/sampleSubmission.csv')
test = pd.read_csv('https://raw.githubusercontent.com/minaahayley/kaggle/main/customer-churn-prediction-2020/data/test.csv')
train = pd.read_csv('https://raw.githubusercontent.com/minaahayley/kaggle/main/customer-churn-prediction-2020/data/train.csv')


train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4250 non-null   object 
 1   account_length                 4250 non-null   int64  
 2   area_code                      4250 non-null   object 
 3   international_plan             4250 non-null   object 
 4   voice_mail_plan                4250 non-null   object 
 5   number_vmail_messages          4250 non-null   int64  
 6   total_day_minutes              4250 non-null   float64
 7   total_day_calls                4250 non-null   int64  
 8   total_day_charge               4250 non-null   float64
 9   total_eve_minutes              4250 non-null   float64
 10  total_eve_calls                4250 non-null   int64  
 11  total_eve_charge               4250 non-null   float64
 12  total_night_minutes            4250 non-null   float64
 13  total_night_calls              4250 non-null   int64  
 14  total_night_charge             4250 non-null   float64
 15  total_intl_minutes             4250 non-null   float64
 16  total_intl_calls               4250 non-null   int64  
 17  total_intl_charge              4250 non-null   float64
 18  number_customer_service_calls  4250 non-null   int64  
 19  churn                          4250 non-null   object 
dtypes: float64(8), int64(7), object(5)
memory usage: 664.2+ KB


test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             750 non-null    int64  
 1   state                          750 non-null    object 
 2   account_length                 750 non-null    int64  
 3   area_code                      750 non-null    object 
 4   international_plan             750 non-null    object 
 5   voice_mail_plan                750 non-null    object 
 6   number_vmail_messages          750 non-null    int64  
 7   total_day_minutes              750 non-null    float64
 8   total_day_calls                750 non-null    int64  
 9   total_day_charge               750 non-null    float64
 10  total_eve_minutes              750 non-null    float64
 11  total_eve_calls                750 non-null    int64  
 12  total_eve_charge               750 non-null    float64
 13  total_night_minutes            750 non-null    float64
 14  total_night_calls              750 non-null    int64  
 15  total_night_charge             750 non-null    float64
 16  total_intl_minutes             750 non-null    float64
 17  total_intl_calls               750 non-null    int64  
 18  total_intl_charge              750 non-null    float64
 19  number_customer_service_calls  750 non-null    int64  
dtypes: float64(8), int64(8), object(4)
memory usage: 117.3+ KB


plt.figure(figsize=(10,10))

# target을 제외한 변수끼리의 상관관계 
corr_data = train.drop('churn', axis=1)

sns.heatmap(corr_data.corr()
			      , linewidths = 0.1
            , square = True
            , annot = True
            , fmt = '.2f'
            , cmap='Blues')

<matplotlib.axes._subplots.AxesSubplot at 0x7fc1974d4810>


# Churn 비율 살펴보기 
train['churn'].value_counts(normalize=True)

no     0.859294
yes    0.140706
Name: churn, dtype: float64


fig, axs = plt.subplots(figsize=(20, 10), ncols=5, nrows=3)
features = train.describe().columns 

for i, feature in enumerate(features): 
    row = int(i/5)
    col = i%5
    sns.barplot(x='churn', y=feature, data=train, ax=axs[row][col])


train.hist(figsize=(20, 20), bins=50)
plt.show()


from scipy.stats import skew 

# 숫자형 피처의 컬럼 index 추출 
feature_index = train.dtypes[train.dtypes != 'object'].index 
 
skew_features = train[feature_index].apply(lambda x : skew(x))

# skew(왜곡) 정도가 1이상인 컬럼만 추출 
skew_features_top = skew_features[skew_features > 1].sort_values(ascending=False)
skew_features_top

number_vmail_messages            1.372606
total_intl_calls                 1.359642
number_customer_service_calls    1.082309
dtype: float64


# 로그 변환 
train[skew_features_top.index] = np.log1p(train[skew_features_top.index])
test[skew_features_top.index] = np.log1p(test[skew_features_top.index])


train.hist(figsize=(20, 20), bins=50)
plt.show()


# 레이블 인코딩 
train['state'] = pd.Categorical(train['state']).codes 
train['area_code'] = pd.Categorical(train['area_code']).codes 
train['international_plan'] = pd.Categorical(train['international_plan']).codes 
train['voice_mail_plan'] = pd.Categorical(train['voice_mail_plan']).codes 

test['state'] = pd.Categorical(test['state']).codes 
test['area_code'] = pd.Categorical(test['area_code']).codes 
test['international_plan'] = pd.Categorical(test['international_plan']).codes 
test['voice_mail_plan'] = pd.Categorical(test['voice_mail_plan']).codes


# Churn 컬럼을 숫자로 바꾸기 
train['churn'] = train['churn'].apply(lambda x : 1 if x == 'yes' else 0)


train['churn'].value_counts()

0    3652
1     598
Name: churn, dtype: int64


#submission을 위해 test_id는 별도로 저장
test_id = test['id']


# X, y 설정 
X = train.drop('churn', axis=1)
y = train['churn']


# 학습, 훈련 데이터 분할 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=1000)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)


from sklearn.metrics import accuracy_score

print(model.__class__.__name__,': training set에서의 성능')
y_train_pred = rf_clf.predict(X_train)
print(accuracy_score(y_train, y_train_pred))

print(model.__class__.__name__,': test set에서의 성능')
y_test_pred = rf_clf.predict(X_test)
print(accuracy_score(y_test, y_test_pred))

LogisticRegression : training set에서의 성능
1.0
LogisticRegression : test set에서의 성능
0.9564705882352941


# 최종 예측 
y_test_pred = rf_clf.predict(test.drop('id', axis=1))


# 제출 파일 생성 
df = pd.DataFrame()
df['id'] = test_id
df['churn'] = pd.Series(y_test_pred)
df['churn'] = df['churn'].apply(lambda x : 'yes' if x == 1 else 'no')
df.to_csv('submission.csv', index=False)


def get_model_predict(model, X_train, X_test, y_train, y_test): 
  model.fit(X_train, y_train)
  
  print('\n', model.__class__.__name__)
  print('training set에서의 성능')
  y_train_pred = model.predict(X_train)
  print(accuracy_score(y_train, y_train_pred))

  print('test set에서의 성능')
  y_test_pred = model.predict(X_test)
  print(accuracy_score(y_test, y_test_pred))


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier 
from sklearn.linear_model import LogisticRegression

# 평가 수행 
rf_clf = RandomForestClassifier(n_estimators=200)
gbm_clf = GradientBoostingClassifier(n_estimators=200)
xgb_clf = XGBClassifier(n_estimators=200, learning_rate=0.05, colsample_bytree=0.5, subsample=0.8)
lgbm_clf = LGBMClassifier(n_estimators=200, learning_rate=0.05, num_leaves=4,
                          subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=-1)
lr_reg = LogisticRegression(solver='liblinear', max_iter=200)

for model in [rf_clf, gbm_clf, xgb_clf, lgbm_clf, lr_reg]: 
    # XGBoost의 경우 DataFrame이 입력될 경우 버전에 따라 오류 발생 가능. naddray로 변환 
    get_model_predict(model, X_train.values, X_test.values, y_train.values, y_test.values)

 RandomForestClassifier
training set에서의 성능
1.0
test set에서의 성능
0.9564705882352941

 GradientBoostingClassifier
training set에서의 성능
0.9779411764705882
test set에서의 성능
0.96

 XGBClassifier
training set에서의 성능
0.9658823529411765
test set에서의 성능
0.9458823529411765

 LGBMClassifier
training set에서의 성능
0.9473529411764706
test set에서의 성능
0.9341176470588235

 LogisticRegression
training set에서의 성능
0.875
test set에서의 성능
0.8576470588235294


# 최종 예측 
y_test_pred = xgb_clf.predict(test.drop('id', axis=1).values)


# 제출 파일 생성 
df = pd.DataFrame()
df['id'] = test_id
df['churn'] = pd.Series(y_test_pred)
df['churn'] = df['churn'].apply(lambda x : 'yes' if x == 1 else 'no')
df.to_csv('submission.csv', index=False)


!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2022.9.24)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (6.1.2)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"3af569f6a8c028e6233a6d29ce2439d3"}'}


ls -1ha kaggle.json

kaggle.json


!mkdir -p ~/.kaggle #create folder name Kaggle
!cp kaggle.json ~/.kaggle #copy kaggle.jason into folder Kaggle
!chmod 600 ~/.kaggle/kaggle.json #ignore Permission Warning 

#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle

kaggle.json


#Copy API command 후 데이터셋 다운로드하기  
!kaggle competitions download -c house-prices-advanced-regression-techniques

#파일 압축 풀기
!unzip house-prices-advanced-regression-techniques.zip

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 50.0MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv


!mkdir -p ~/.kaggle/competitions/house-prices-advanced-regression-techniques #create folder name Kaggle
!cp sample_submission.csv ~/.kaggle/competitions/house-prices-advanced-regression-techniques #copy into folder Kaggle
!cp test.csv ~/.kaggle/competitions/house-prices-advanced-regression-techniques
!cp train.csv ~/.kaggle/competitions/house-prices-advanced-regression-techniques

#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle/competitions/house-prices-advanced-regression-techniques

sample_submission.csv  test.csv  train.csv


import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

import warnings 
warnings.filterwarnings('ignore', category=RuntimeWarning)


submission = pd.read_csv('~/.kaggle/competitions/house-prices-advanced-regression-techniques/sample_submission.csv')
test = pd.read_csv('~/.kaggle/competitions/house-prices-advanced-regression-techniques/test.csv')
train = pd.read_csv('~/.kaggle/competitions/house-prices-advanced-regression-techniques/train.csv')


submission.sample()


# train 원본 데이터 저장 
train_org = train.copy()


# Null이 너무 많은 컬럼과 불필요한 컬럼 삭제 
train.drop(['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1, inplace=True)


# 숫자형 Null 컬럼은 평균값으로 대체 
train.fillna(train.mean(), inplace=True)

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


# Null 값이 있는 피처명과 타입 추출 
train.isnull().sum()[train.isnull().sum()>0].index

Index(['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond'],
      dtype='object')


# y 값의 분포 
train['SalePrice'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fde5c1eb310>


# 로그 변환 
train['SalePrice'] = np.log1p(train['SalePrice'])
train['SalePrice'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fde5c214b90>


from scipy.stats import skew 

# 숫자형 피처의 컬럼 index 추출 
feature_index = train.dtypes[train.dtypes != 'object'].index 
 
skew_features = train[feature_index].apply(lambda x : skew(x))

# skew(왜곡) 정도가 1이상인 컬럼만 추출 
skew_features_top = skew_features[skew_features > 1].sort_values(ascending=False)
skew_features_top

MiscVal          24.451640
PoolArea         14.813135
LotArea          12.195142
3SsnPorch        10.293752
LowQualFinSF      9.002080
KitchenAbvGr      4.483784
BsmtFinSF2        4.250888
ScreenPorch       4.117977
BsmtHalfBath      4.099186
EnclosedPorch     3.086696
MasVnrArea        2.673661
LotFrontage       2.382499
OpenPorchSF       2.361912
BsmtFinSF1        1.683771
WoodDeckSF        1.539792
TotalBsmtSF       1.522688
MSSubClass        1.406210
1stFlrSF          1.375342
GrLivArea         1.365156
dtype: float64


# 왜곡 정도가 높은 피처를 로그 변환
train[skew_features_top.index] = np.log1p(train[skew_features_top.index])


# 문자형 피처는 원 핫 인코딩으로 변환 
train = pd.get_dummies(train)
train.shape

(1460, 271)


# 원 핫 인코딩 후 Null 값 없음 
train.isnull().sum()[train.isnull().sum()>0].index

Index([], dtype='object')


# 회귀 계수 시각화 
def visualize_coefficient(models): 
    fig, axs = plt.subplots(figsize=(24,10), nrows=1, ncols=3)
    fig.tight_layout()

    for i, model in enumerate(models): 
        # 상위 10개, 하위 10개 회귀 계수를 구하고, 이를 판다스 concat으로 결합 
        coef = pd.Series(model.coef_, index=X.columns)
        coef_high = coef.sort_values(ascending=False).head(10)
        coef_low = coef.sort_values(ascending=False).tail(10)
        coef_concat = pd.concat([coef_high, coef_low])

        sns.barplot(x=coef_concat.values, y=coef_concat.index, ax=axs[i])


from sklearn.linear_model import LinearRegression, Ridge, Lasso 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error 

X = train.drop('SalePrice', axis=1) 
y = np.log1p(train['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=156)

# LinearRegression, Ridge, Lasso 학습, 예측, 평가 
lr_reg = LinearRegression()
ridge_reg = Ridge()
lasso_reg = Lasso()

lr_reg.fit(X_train, y_train)
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
visualize_coefficient(models)


plt.scatter(x=train_org['GrLivArea'], y=train_org['SalePrice'])

<matplotlib.collections.PathCollection at 0x7fde571b4250>


# GrLivArea, SalePrice 모두 로그 변환됐으므로 이를 반영한 조건 생성 
outlier_index = train[(train['GrLivArea'] > np.log1p(4000)) & (train['SalePrice'] > np.log1p(50000))].index

# 이상치 데이터 삭제 
train.drop(outlier_index, axis=0, inplace=True)


# 로그 변환 된 rmse를 측정하는 함수 생성 
def get_rmse(model):
    pred = model.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    print(model.__class__.__name__, '로그 변환된 rmse:', np.round(rmse,3))
    return rmse


# 여러 모델의 rmse 값을 반환 
def get_rmses(models):
    rmses = []
    for model in models:
        rmse = get_rmse(model)
        rmses.append(rmse)
    return rmses


from sklearn.linear_model import LinearRegression, Ridge, Lasso 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error 

X = train.drop('SalePrice', axis=1) 
y = train['SalePrice'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=156)

# LinearRegression, Ridge, Lasso 학습, 예측, 평가 
lr_reg = LinearRegression()
ridge_reg = Ridge()
lasso_reg = Lasso()

lr_reg.fit(X_train, y_train)
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)

LinearRegression 로그 변환된 rmse: 0.132
Ridge 로그 변환된 rmse: 0.125
Lasso 로그 변환된 rmse: 0.258

[0.13195207393230912, 0.12459727425511735, 0.2575777799483299]


from sklearn.model_selection import GridSearchCV 

def best_params(model, params): 
    grid_model = GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', cv=5)
    grid_model.fit(X, y)
    rmse = np.sqrt(-1 * grid_model.best_score_) 
    print('{0} 5 CV 시 최적 평균 rmse: {1}, 최적 평균 alpha: {2}'.format(model.__class__.__name__, np.round(rmse, 4), grid_model.best_params_))

ridge_params = {'alpha' : [0.05, 0.1, 1, 5, 8, 10, 12, 15, 20]}
lasso_params = {'alpha' : [0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1, 5, 10]}

best_params(ridge_reg, ridge_params)
best_params(lasso_reg, lasso_params)

Ridge 5 CV 시 최적 평균 rmse: 0.1125, 최적 평균 alpha: {'alpha': 8}
Lasso 5 CV 시 최적 평균 rmse: 0.1121, 최적 평균 alpha: {'alpha': 0.001}


# 앞의 최적화 alpha 값으로 학습, 예측, 평가 
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=8)
lasso_reg = Lasso(alpha=0.001)

lr_reg.fit(X_train, y_train)
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)

LinearRegression 로그 변환된 rmse: 0.132
Ridge 로그 변환된 rmse: 0.117
Lasso 로그 변환된 rmse: 0.116

[0.13195207393230912, 0.11736730851679039, 0.1158658441187506]


# 피처 중요도 시각화 
models = [lr_reg, ridge_reg, lasso_reg]
visualize_coefficient(models)


from sklearn.ensemble import GradientBoostingRegressor 
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor 

gbm_reg = GradientBoostingRegressor(n_estimators=1000)
xgb_reg = XGBRegressor(n_estimators=1000, learning_rate=0.05, colsample_bytree=0.5, subsample=0.8)
lgbm_reg = LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=4, 
                         subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=-1)

gbm_reg.fit(X_train, y_train)
xgb_reg.fit(X_train, y_train)
lgbm_reg.fit(X_train, y_train)

models = [gbm_reg, xgb_reg, lgbm_reg]
get_rmses(models)

[07:42:56] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
GradientBoostingRegressor 로그 변환된 rmse: 0.124
XGBRegressor 로그 변환된 rmse: 0.116
LGBMRegressor 로그 변환된 rmse: 0.118

[0.12434554514384757, 0.11569128150068062, 0.11806601740834839]


# 개별 모델의 학습 
ridge_reg = Ridge(alpha=8)
lasso_reg = Lasso(alpha=0.001)
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)

# 개별 모델 예측 
ridge_pred = ridge_reg.predict(X_test)
lasso_pred = lasso_reg.predict(X_test)

# 개별 모델 예측값 혼합으로 최종 예측값 도출 
pred = 0.4 * ridge_pred + 0.6 * lasso_pred 

# 최종 혼합 모델의 rmse 값 출력 
mse = mean_squared_error(y_test, pred) 
rmse = np.sqrt(mse)
print('로그 변환된 rmse:', np.round(rmse,3))

로그 변환된 rmse: 0.115


# 개별 모델의 학습 
xgb_reg = XGBRegressor(n_estimators=1000, learning_rate=0.05, colsample_bytree=0.5, subsample=0.8)
lgbm_reg = LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=4, 
                         subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=-1)
xgb_reg.fit(X_train, y_train)
lgbm_reg.fit(X_train, y_train)

# 개별 모델 예측 
xgb_pred = xgb_reg.predict(X_test)
lgbm_pred = lgbm_reg.predict(X_test)

# 개별 모델 예측값 혼합으로 최종 예측값 도출 
pred = 0.5 * xgb_pred + 0.5 * lgbm_pred 

# 최종 혼합 모델의 rmse 값 출력 
mse = mean_squared_error(y_test, pred) 
rmse = np.sqrt(mse)
print('로그 변환된 rmse:', np.round(rmse,3))

[07:44:06] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
로그 변환된 rmse: 0.114


!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (6.1.2)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2022.9.24)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"3af569f6a8c028e6233a6d29ce2439d3"}'}


ls -1ha kaggle.json

kaggle.json


!mkdir -p ~/.kaggle #create folder name Kaggle
!cp kaggle.json ~/.kaggle #copy kaggle.jason into folder Kaggle
!chmod 600 ~/.kaggle/kaggle.json #ignore Permission Warning 

#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle

kaggle.json


#Copy API command 후 데이터셋 다운로드하기  
!kaggle competitions download -c bike-sharing-demand

#파일 압축 풀기
!unzip bike-sharing-demand.zip

Downloading bike-sharing-demand.zip to /content
  0% 0.00/189k [00:00<?, ?B/s]
100% 189k/189k [00:00<00:00, 58.0MB/s]
Archive:  bike-sharing-demand.zip
  inflating: sampleSubmission.csv    
  inflating: test.csv                
  inflating: train.csv


!mkdir -p ~/.kaggle/competitions/bike-sharing-demand #create folder name Kaggle
!cp sampleSubmission.csv ~/.kaggle/competitions/bike-sharing-demand #copy into folder Kaggle
!cp test.csv ~/.kaggle/competitions/bike-sharing-demand
!cp train.csv ~/.kaggle/competitions/bike-sharing-demand

#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle/competitions/bike-sharing-demand

sampleSubmission.csv  test.csv  train.csv


import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

import warnings 
warnings.filterwarnings('ignore', category=RuntimeWarning)


submission = pd.read_csv('~/.kaggle/competitions/bike-sharing-demand/sampleSubmission.csv')
test = pd.read_csv('~/.kaggle/competitions/bike-sharing-demand/test.csv')
train = pd.read_csv('~/.kaggle/competitions/bike-sharing-demand/train.csv')


# datetime 전처리 
train['datetime'] = pd.to_datetime(train['datetime'])
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['hour'] = train['datetime'].dt.hour
train['dayofweek'] = train['datetime'].dt.dayofweek

test['datetime'] = pd.to_datetime(test['datetime'])
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['hour'] = test['datetime'].dt.hour
test['dayofweek'] = test['datetime'].dt.dayofweek


sns.distplot(train['count'])

/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x7f5a769d8610>


fig, axs = plt.subplots(figsize=(16,8), ncols=4, nrows=2)
cat_features = ['season', 'workingday', 'weather', 'year', 'month', 'day', 'hour', 'dayofweek']

for i, feature in enumerate(cat_features):
    row = int(i/4)
    col = i%4
    sns.barplot(x=feature, y='count', data=train, ax=axs[row][col])


fig, axs = plt.subplots(figsize=(16,8), ncols=4, nrows=2)
cat_features = ['season', 'workingday', 'weather', 'year', 'month', 'day', 'hour', 'dayofweek']

for i, feature in enumerate(cat_features):
    row = int(i/4)
    col = i%4
    sns.barplot(x=feature, y='count', data=train, ax=axs[row][col], hue='year', ci=None)


# submission을 위한 datetime 
test_datetime = test['datetime']


# 필요없는 피처 제거 
train.drop(['datetime', 'temp', 'casual', 'registered','holiday'], axis=1, inplace=True)
test.drop(['datetime', 'temp','holiday'], axis=1, inplace=True)


# 원 핫 인코딩 
train = pd.get_dummies(train, columns=['season', 'workingday', 'weather', 'year', 'month', 'hour', 'dayofweek'])
test = pd.get_dummies(test, columns=['season', 'workingday', 'weather', 'year', 'month', 'hour', 'dayofweek'])


train.columns

Index(['atemp', 'humidity', 'windspeed', 'count', 'day', 'season_1',
       'season_2', 'season_3', 'season_4', 'workingday_0', 'workingday_1',
       'weather_1', 'weather_2', 'weather_3', 'weather_4', 'year_2011',
       'year_2012', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'dayofweek_0',
       'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4',
       'dayofweek_5', 'dayofweek_6'],
      dtype='object')


test.columns

Index(['atemp', 'humidity', 'windspeed', 'day', 'season_1', 'season_2',
       'season_3', 'season_4', 'workingday_0', 'workingday_1', 'weather_1',
       'weather_2', 'weather_3', 'weather_4', 'year_2011', 'year_2012',
       'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6',
       'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'dayofweek_0',
       'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4',
       'dayofweek_5', 'dayofweek_6'],
      dtype='object')


train['count'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f5a76a47310>


from sklearn.metrics import mean_squared_error

# log 값 변환 시 NaN 등의 이슈로 log()가 아닌 log1p()를 이용해 RMSLE 계산 
def rmsle(y, pred) : 
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y-log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    rmsle = round(rmsle,3)
    return rmsle


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso 

# X, y 설정 
X = train.drop('count', axis=1)
y = train['count']
y_log = np.log1p(y) 

# 로그 변환된 y_log를 반영해 학습/테스트 데이터 세트 분할 
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.3, random_state=0)

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train) 
y_test_pred = lr_reg.predict(X_test)

# 다시 expm1을 이용해 원래 스케일로 변환 
y_test_exp = np.expm1(y_test)
y_test_pred_exp = np.expm1(y_test_pred)

rmsle(y_test_exp, y_test_pred_exp)

0.585


def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False): 

    model.fit(X_train, y_train) 
    y_test_pred = model.predict(X_test)

    # 다시 expm1을 이용해 원래 스케일로 변환 
    y_test_exp = np.expm1(y_test)
    y_test_pred_exp = np.expm1(y_test_pred)

    print(model.__class__.__name__)
    print(rmsle(y_test_exp, y_test_pred_exp))
  

# 모델별로 평가 수행 
lr_reg = LinearRegression() 
ridge_reg = Ridge(alpha=10)
lasso_reg = Lasso(alpha=0.01)

for model in [lr_reg, ridge_reg, lasso_reg]:
    get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=True)

LinearRegression
0.585
Ridge
0.586
Lasso
0.632


# 회귀 트리 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor

# 평가 수행 
rf_reg = RandomForestRegressor(n_estimators=500)
gbm_reg = GradientBoostingRegressor(n_estimators=500)
xgb_reg = XGBRegressor(n_estimators=500)
lgbm_reg = LGBMRegressor(n_estimators=500)

for model in [rf_reg, gbm_reg, xgb_reg, lgbm_reg]:
    # XGBoost의 경우 DataFrame이 입력될 경우 버전에 따라 오류 발생 가능. ndarray로 변환 
    get_model_predict(model, X_train.values, X_test.values, y_train.values, y_test.values, is_expm1=True)

RandomForestRegressor
0.34
GradientBoostingRegressor
0.334
[06:40:32] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor
0.32
LGBMRegressor
0.294


# 가장 예측 성능이 좋은 lgbm regressor를 이용
X = train.drop('count', axis=1)
y = train['count']
y_log = np.log1p(y)

lgbm_reg.fit(X, y_log) 
pred = lgbm_reg.predict(test)
pred_exp = np.expm1(pred) # 다시 expm1을 이용해 원래 스케일로 변환 
pred_exp

array([ 12.13747607,   4.87622752,   3.37225367, ..., 124.15231696,
        84.58941302,  41.02087479])


# 제출 파일 생성 
submission = pd.DataFrame()
submission['datetime'] = test_datetime
submission['count'] = pred_exp 
submission


submission.to_csv('submission.csv', index=False)


222/3242*100

6.847624922887106


# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()


# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)


# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild()

#레티나 디스플레이로 폰트가 선명하게 표시되도록 합니다.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')


!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (6.1.2)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2022.9.24)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"3af569f6a8c028e6233a6d29ce2439d3"}'}


ls -1ha kaggle.json

kaggle.json


!mkdir -p ~/.kaggle #create folder name Kaggle
!cp kaggle.json ~/.kaggle #copy kaggle.jason into folder Kaggle
!chmod 600 ~/.kaggle/kaggle.json #ignore Permission Warning


#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle

kaggle.json


#Copy API command 후 데이터셋 다운로드하기  
!kaggle competitions download -c titanic

#파일 압축 풀기
!unzip titanic.zip

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 20.0MB/s]
Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv


!mkdir -p ~/.kaggle/competitions/titanic #create folder name Kaggle
!cp gender_submission.csv ~/.kaggle/competitions/titanic #copy into folder Kaggle
!cp test.csv ~/.kaggle/competitions/titanic
!cp train.csv ~/.kaggle/competitions/titanic


#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle/competitions/titanic

gender_submission.csv  test.csv  train.csv


import pandas as pd


submission = pd.read_csv('~/.kaggle/competitions/titanic/gender_submission.csv')
test = pd.read_csv('~/.kaggle/competitions/titanic/test.csv')
train = pd.read_csv('~/.kaggle/competitions/titanic/train.csv')


train.head()


train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Age, Cabin, Embarked, Fare의 null 값을 채워준다. 
train['Age'].fillna(train['Age'].mean(),inplace=True)
train['Cabin'].fillna('N', inplace=True)
train['Embarked'].fillna('N', inplace=True)
train['Fare'].fillna(0, inplace=True)

test['Age'].fillna(test['Age'].mean(),inplace=True)
test['Cabin'].fillna('N', inplace=True)
test['Embarked'].fillna('N', inplace=True)
test['Fare'].fillna(0, inplace=True)


train['Age'].value_counts()

29.699118    177
24.000000     30
22.000000     27
18.000000     26
28.000000     25
            ... 
36.500000      1
55.500000      1
0.920000       1
23.500000      1
74.000000      1
Name: Age, Length: 89, dtype: int64


train['Cabin'].value_counts()

N              687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: Cabin, Length: 148, dtype: int64


train['Embarked'].value_counts()

S    644
C    168
Q     77
N      2
Name: Embarked, dtype: int64


# Cabin 선실 번호의 경우 앞자리만 데이터 전처리를 해줘야 한다. 
train['Cabin'] = train['Cabin'].str[:1]
test['Cabin'] = test['Cabin'].str[:1]


print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


import seaborn as sns


sns.barplot(x='Sex', y='Survived', data=train)

<matplotlib.axes._subplots.AxesSubplot at 0x7f8e9f02ce90>


sns.barplot(x='Pclass', y='Survived', data=train)

<matplotlib.axes._subplots.AxesSubplot at 0x7f8e9f066110>


sns.barplot(x='Pclass', y='Survived', hue='Sex', data=train)

<matplotlib.axes._subplots.AxesSubplot at 0x7f8e9ef7b2d0>


train['Age'].describe()

count    891.000000
mean      29.699118
std       13.002015
min        0.420000
25%       22.000000
50%       29.699118
75%       35.000000
max       80.000000
Name: Age, dtype: float64


train['Age_cat'] = train['Age'].apply(lambda x : 'Over 45' if x >= 45 else '35-44' if x >=35 else '25-34' if x >= 25 else '15-24' if x >= 15 else '5-14' if x >=5 else 'Under 5')
train['Age_cat'].value_counts()

25-34      378
15-24      200
35-44      120
Over 45    115
Under 5     40
5-14        38
Name: Age_cat, dtype: int64


# X축의 값을 순차적으로 표시 
order = ['Under 5', '5-14', '15-24', '25-34', '35-44', 'Over 45']

plt.figure(figsize=(10,6))
sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=train, order=order, ci=None)

<matplotlib.axes._subplots.AxesSubplot at 0x7f8e9ef8ae50>


# submission을 위해 passengerId는 별도로 저장 
test_passengerId = test['PassengerId']


# 피처 제거 
train.drop(['Name','Age_cat', 'PassengerId', 'Ticket'], axis=1, inplace=True)
test.drop(['Name', 'PassengerId', 'Ticket'], axis=1, inplace=True)


from sklearn.preprocessing import LabelEncoder 

def encode_features(df):
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features: 
      le = LabelEncoder()
      le = le.fit(df[feature])
      df[feature] = le.transform(df[feature])

    return df


train = encode_features(train)
test = encode_features(test)


from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 

# 입력변수 X, 목표변수 y 설정 
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11)

# DecisionTreeClassifier, LogisticRegression 학습, 예측, 평가 
dt_clf = DecisionTreeClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear')

dt_clf.fit(X_train, y_train)
lr_clf.fit(X_train, y_train)

dt_predict = dt_clf.predict(X_val)
print('결정 트리 정확도: {0:.4f}'.format(accuracy_score(y_val, dt_predict)))
lr_predict = lr_clf.predict(X_val)
print('로지스틱 회귀 정확도: {0:.4f}'.format(accuracy_score(y_val, lr_predict)))

결정 트리 정확도: 0.7877
로지스틱 회귀 정확도: 0.8659


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier 

# 입력변수 X, 목표변수 y 설정 
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11)

# 앙상블 모델 학습, 예측, 평가
rf_clf = RandomForestClassifier(n_estimators=1000)
gbm_clf = GradientBoostingClassifier(n_estimators=1000)
xgb_clf = XGBClassifier(n_estimators=1000, learning_rate=0.05, colsample_bytree=0.5, subsample=0.8)
lgbm_clf = LGBMClassifier(n_estimators=1000, learning_rate=0.05, num_leaves=4,
                          subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=-1)

rf_clf.fit(X_train, y_train)
gbm_clf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)
lgbm_clf.fit(X_train, y_train)

rf_predict = rf_clf.predict(X_val)
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy_score(y_val, rf_predict)))
gbm_predict = gbm_clf.predict(X_val)
print('GBM 정확도: {0:.4f}'.format(accuracy_score(y_val, gbm_predict)))
xgb_predict = rf_clf.predict(X_val)
print('XGBoost 정확도: {0:.4f}'.format(accuracy_score(y_val, xgb_predict)))
lgbm_predict = rf_clf.predict(X_val)
print('LightGBM 정확도: {0:.4f}'.format(accuracy_score(y_val, lgbm_predict)))

랜덤 포레스트 정확도: 0.8547
GBM 정확도: 0.8380
XGBoost 정확도: 0.8547
LightGBM 정확도: 0.8547


from sklearn.model_selection import cross_val_score 

scores = cross_val_score(dt_clf, X_train, y_train, cv=5)


import numpy as np

print('평균 정확도: {0:.4f}'.format((np.mean(scores))))

평균 정확도: 0.7711


from sklearn.model_selection import GridSearchCV

parameters = {'max_depth': [2,3,5,10],
              'min_samples_split': [2,3,5], # 노드를 분할하기 위한 최소한의 샘플 데이터 수 
              'min_samples_leaf':[1,5,8]} # 말단 노드가 되기 위한 최소한의 샘플 데이터 수 

grid_dt_clf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dt_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=11),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')


# 최적 하이퍼 파라미터 
grid_dt_clf.best_params_

{'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}


# 최고 정확도
grid_dt_clf.best_score_

0.7991825076332119


# 최적 하이퍼 파라미터로 학습된 Estimator로 예측, 평가 
estimator = grid_dt_clf.best_estimator_ 
y_val_predict = estimator.predict(X_val)
accuracy = accuracy_score(y_val, y_val_predict)

accuracy

0.8715083798882681


# 최종 예측 
predict = estimator.predict(test)
predict[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])


submission.head()


# 새로운 데이터프레임을 생성하고, 거기에 PassengerId와 예측한 결과값인 predict를 넣어준다
submission = pd.DataFrame()
submission['PassengerId'] = test_passengerId
submission['Survived'] = predict 

submission.head()


submission.to_csv('submission.csv', index=False)


1957/14654 # 상위 13%

0.13354715436058415

[Kaggle][필사] Customer Lifetime Value Prediction (0)	2023.01.05
[Kaggle][필사] Wallmart_Sales_Top_3___EDA_Feature_Engineering (0)	2022.12.28
[Kaggle][필사] Future Sales Prediction: playground (0)	2022.12.26
bus passenger prediction (0)	2022.12.23
The Boston Housing Dataset (0)	2022.11.26

	InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	12/1/2010 8:26	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	12/1/2010 8:26	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	12/1/2010 8:26	3.39	17850.0	United Kingdom

	Quantity	UnitPrice	CustomerID	InvoiceYearMonth
count	541909.000000	541909.000000	406829.000000	541909.000000
mean	9.552250	4.611114	15287.690570	201099.713989
std	218.081158	96.759853	1713.600303	25.788703
min	-80995.000000	-11062.060000	12346.000000	201012.000000
25%	1.000000	1.250000	13953.000000	201103.000000
50%	3.000000	2.080000	15152.000000	201107.000000
75%	10.000000	4.130000	16791.000000	201110.000000
max	80995.000000	38970.000000	18287.000000	201112.000000

	CustomerID
0	17850.0
1	13047.0
2	12583.0
3	13748.0
4	15100.0

	CustomerID	MaxPurchaseData
0	12346.0	2011-01-18 10:17:00
1	12747.0	2011-12-07 14:34:00
2	12748.0	2011-12-09 12:20:00
3	12749.0	2011-12-06 09:56:00
4	12820.0	2011-12-06 15:12:00

	shop_id	item_id	date	item_price	item_cnt_day
0	5	5037	NaN	NaN	NaN
1	5	5320	NaN	NaN	NaN
2	5	5233	NaN	NaN	NaN
3	5	5232	NaN	NaN	NaN
4	5	5268	NaN	NaN	NaN

shop_id	2										...	59
item_id	30	31	32	33	38	42	45	51	53	57	...	22118	22137	22139	22145	22154	22162	22163	22164	22166	22167
2013-01-01	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0
2013-02-01	0.0	4.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2013-03-01	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2013-04-01	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2013-05-01	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	item_name	item_id	item_category_id	item_group
0	! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D	0	40	7
1	!ABBYY FineReader 12 Professional Edition Full...	1	76	11
2	***В ЛУЧАХ СЛАВЫ (UNV) D	2	40	7
3	***ГОЛУБАЯ ВОЛНА (Univ) D	3	40	7
4	***КОРОБКА (СТЕКЛО) D	4	40	7

	shop_name	shop_id	city
0	!Якутск Орджоникидзе, 56 фран	0	29
1	!Якутск ТЦ "Центральный" фран	1	29
2	Адыгея ТЦ "Мега"	2	0
3	Балашиха ТРК "Октябрь-Киномир"	3	1
4	Волжский ТЦ "Волга Молл"	4	2

	count	mean	std	min	25%	50%	75%	max
RecencyCluster
0	1950.0	17.488205	13.237058	0.0	6.00	16.0	28.00	47.0
1	478.0	304.393305	41.183489	245.0	266.25	300.0	336.00	373.0
2	568.0	184.625000	31.753602	132.0	156.75	184.0	211.25	244.0
3	954.0	77.679245	22.850898	48.0	59.00	72.5	93.00	131.0

	CustomerID	Recency	RecencyCluster
0	17850.0	301	0
1	15100.0	329	0
2	18074.0	373	0
3	16250.0	260	0
4	13747.0	373	0
...	...	...	...
3945	15942.0	133	1
3946	14143.0	133	1
3947	16147.0	133	1
3948	15149.0	133	1
3949	15776.0	132	1

	count	mean	std	min	25%	50%	75%	max
FrequencyCluster
0	2399.0	23.549812	15.869986	1.0	10.0	21.0	35.0	59.0
1	2.0	4885.000000	343.653896	4642.0	4763.5	4885.0	5006.5	5128.0
2	181.0	344.629834	55.844437	270.0	297.0	329.0	392.0	463.0
3	5.0	2089.400000	516.483591	1640.0	1677.0	1857.0	2491.0	2782.0
4	1.0	7983.000000	NaN	7983.0	7983.0	7983.0	7983.0	7983.0
5	67.0	585.761194	91.042130	468.0	504.0	563.0	665.0	803.0
6	369.0	193.135501	34.891650	145.0	163.0	188.0	219.0	269.0
7	909.0	94.443344	23.983069	60.0	73.0	91.0	113.0	144.0
8	17.0	1084.823529	159.934063	872.0	977.0	1076.0	1160.0	1508.0

	CustomerID	Revenue
0	12346.0	0.00
1	12747.0	4196.01
2	12748.0	29072.10
3	12749.0	3868.20
4	12820.0	942.34

	CustomerID	Recency	RecencyCluster	Frequency	FrequencyCluster	Revenue
0	17850.0	301	0	312	3	5288.63
1	14688.0	7	3	359	3	5107.38
2	16029.0	38	3	274	3	50992.61
3	13767.0	1	3	399	3	16945.71
4	15513.0	30	3	314	3	14520.08

	count	mean	std	min	25%	50%	75%	max
RevenueCluster
0	3687.0	907.254414	921.910820	-4287.63	263.115	572.56	1258.220	4314.72
1	234.0	7760.699530	3637.173671	4330.67	5161.485	6549.38	9142.305	21535.90
2	27.0	43070.445185	15939.249588	25748.35	28865.490	36351.42	53489.790	88125.38
3	2.0	221960.330000	48759.481478	187482.17	204721.250	221960.33	239199.410	256438.49

	shop_id	item_id
0	5	5037
1	5	5320
2	5	5233
3	5	5232
4	5	5268
...	...	...
214195	45	18454
214196	45	16188
214197	45	15757
214198	45	19648
214199	45	969

	Recency	Frequency	Revenue
OverallScore
0	304.554545	16.736364	274.571659
1	193.902778	24.160714	381.332044
2	90.514628	32.034574	668.233235
3	34.285990	43.710145	828.620204
4	23.754934	103.190789	1589.534803
5	17.870748	185.908163	3161.098027
6	14.095588	292.205882	4655.580882
7	10.920354	401.946903	7614.822920
8	7.717949	582.076923	14393.304615
9	6.947368	916.421053	37605.533158
10	0.500000	1649.000000	33814.445000
11	1.333333	1996.000000	55889.220000
12	1.500000	4885.000000	43096.505000
13	1.000000	7983.000000	40340.780000

	CustomerID	m6_Revenue
0	12747.0	1666.11
1	12748.0	18679.01
2	12749.0	2323.04
3	12820.0	561.53
4	12822.0	918.98
...	...	...
3162	18278.0	173.90
3163	18281.0	80.82
3164	18282.0	98.76
3165	18283.0	1351.83
3166	18287.0	1072.00

	CustomerID	Recency	RecencyCluster	Frequency	FrequencyCluster	Revenue	RevenueCluster	OverallScore	Segment	m6_Revenue
0	17850.0	301	0	312	3	5288.63	1	4	Mid-Value	0.00
1	14688.0	7	3	359	3	5107.38	1	7	High-Value	1702.06
4	14849.0	21	3	392	3	7904.28	1	7	High-Value	5498.07
5	13468.0	1	3	306	3	5656.75	1	7	High-Value	1813.09
6	15601.0	10	3	414	3	6745.36	1	7	High-Value	2003.97

	count	mean	std	min	25%	50%	75%	max
LTVCluster
0	2981.0	282.036328	286.627690	-4287.63	0.0000	231.60	461.4900	958.46
1	777.0	1635.282355	555.078601	961.56	1168.7600	1509.58	1977.9500	3147.77
2	152.0	4685.321382	1340.544718	3166.40	3563.4725	4278.37	5531.1075	8432.68

	CustomerID	Recency	RecencyCluster	Frequency	FrequencyCluster	Revenue	RevenueCluster	OverallScore	Segment_Low-Value	Segment_Mid-Value
0	17850.0	301	0	312	3	5288.63	1	4	0	1
1	15032.0	255	0	55	0	4464.10	1	1	1	0
2	16000.0	2	3	9	0	12393.70	1	4	0	1
3	15749.0	234	1	15	0	21535.90	1	2	1	0
4	13093.0	266	0	170	2	7741.47	1	3	0	1

[Kaggle][자체_풀이]_Predict_Future_Sales (0)	2023.01.17
[Kaggle][필사] Wallmart_Sales_Top_3___EDA_Feature_Engineering (0)	2022.12.28
[Kaggle][필사] Future Sales Prediction: playground (0)	2022.12.26
bus passenger prediction (0)	2022.12.23
The Boston Housing Dataset (0)	2022.11.26

	count	mean	std	min	25%	50%	75%	max
Store	421570.0	22.200546	12.785297	1.000	11.000000	22.00000	33.000000	45.000000
Dept	421570.0	44.260317	30.492054	1.000	18.000000	37.00000	74.000000	99.000000
Weekly_Sales	421570.0	15981.258123	22711.183519	-4988.940	2079.650000	7612.03000	20205.852500	693099.360000
Temperature	421570.0	60.090059	18.447931	-2.060	46.680000	62.09000	74.280000	100.140000
Fuel_Price	421570.0	3.361027	0.458515	2.472	2.933000	3.45200	3.738000	4.468000
MarkDown1	150681.0	7246.420196	8291.221345	0.270	2240.270000	5347.45000	9210.900000	88646.760000
MarkDown2	111248.0	3334.628621	9475.357325	-265.760	41.600000	192.00000	1926.940000	104519.540000
MarkDown3	137091.0	1439.421384	9623.078290	-29.100	5.080000	24.60000	103.990000	141630.610000
MarkDown4	134967.0	3383.168256	6292.384031	0.220	504.220000	1481.31000	3595.040000	67474.850000
MarkDown5	151432.0	4628.975079	5962.887455	135.160	1878.440000	3359.45000	5563.800000	108519.280000
CPI	421570.0	171.201947	39.159276	126.064	132.022667	182.31878	212.416993	227.232807
Unemployment	421570.0	7.960289	1.863296	3.879	6.891000	7.86600	8.572000	14.313000
Size	421570.0	136727.915739	60980.583328	34875.000	93638.000000	140167.00000	202505.000000	219622.000000

Weight	Feature
1.1548 ± 0.0174	Dept
0.3394 ± 0.0157	Size
0.0415 ± 0.0025	Store
0.0184 ± 0.0039	Thanksgiving
0.0098 ± 0.0008	CPI
0.0081 ± 0.0009	Week
0.0042 ± 0.0002	Type
0.0028 ± 0.0005	Days_to_Thanksgiving
0.0013 ± 0.0002	MarkDown3
0.0011 ± 0.0002	IsHoliday
0.0010 ± 0.0002	Unemployment
0.0003 ± 0.0001	MarkDown5
0.0002 ± 0.0001	Temperature
0.0002 ± 0.0000	Day
0.0001 ± 0.0000	MarkDown4
0.0001 ± 0.0000	MarkDownSum
0 ± 0.0000	Month
0 ± 0.0000	SuperBowlWeek
0 ± 0.0000	MarkDown2
0 ± 0.0000	MarkDown1
0 ± 0.0000	Fuel_Price
0 ± 0.0000	Christmas
0 ± 0.0000	Days_to_Christmas
0 ± 0.0000	LaborDay
0 ± 0.0000	Year

	date	shop_id	item_id	item_price	item_cnt_day
0	02.01.2013	59	22154	999.00	1.0
1	03.01.2013	25	2552	899.00	1.0
2	05.01.2013	25	2552	899.00	-1.0
3	06.01.2013	25	2554	1709.05	1.0
4	15.01.2013	25	2555	1099.00	1.0

	item_category_name	item_category_id
0	PC - Гарнитуры/Наушники	0
1	Аксессуары - PS2	1
2	Аксессуары - PS3	2
3	Аксессуары - PS4	3
4	Аксессуары - PSP	4

	shop_id	item_id	cnt_mean_shop	cnt_std_shop	order_mean_shop
0	2	30	0.117647	0.327035	1.000000
1	2	31	0.235294	0.740959	1.088235
2	2	32	0.323529	0.638207	1.088235
3	2	33	0.323529	0.534880	1.029412
4	2	38	0.000000	0.000000	1.000000

	date_block_num	shop_id	item_id	cnt_prev	order_prev	cnt_prev2	order_prev2	cnt_prev12	order_prev12
0	1	2	30	0.0	1	NaN	NaN	NaN	NaN
1	1	2	31	0.0	1	NaN	NaN	NaN	NaN
2	1	2	32	0.0	1	NaN	NaN	NaN	NaN
3	1	2	33	1.0	1	NaN	NaN	NaN	NaN
4	1	2	38	0.0	1	NaN	NaN	NaN	NaN

	date_block_num	shop_id	item_group	cnt_prev_cat
0	1	2	0	0.000000
1	1	2	1	0.280000
2	1	2	3	0.000000
3	1	2	4	0.000000
4	1	2	5	0.100186

kaggle

Preprocessing¶

'kaggle' 카테고리의 다른 글

Feature Engineering¶

Recency¶

Assigning a recency score¶

Ordering clusters¶

Frequency¶

Frequency clusters¶

Revenue¶

Revenue clusters¶

Overall Score based on RFM Clustering¶

Customer Lifetime Value¶

Feature engineering¶

Machine Learning Model for Customer Lifetime Value Prediction¶

Final Clusters for Customer Lifetime Value¶

'kaggle' 카테고리의 다른 글

Load Dataset¶

Data loading¶

EDA¶

Markdowns relationship with sales¶

Mean sales across the years¶

Other features analysis¶

Feature engineering¶

Holidays¶

Markdowns¶

Preprocessing¶

missing values¶

encoding categorical data¶

Feature Selection¶

Modeling¶

Baseline predictions¶

Simple Blend¶

'kaggle' 카테고리의 다른 글

Load Dataset¶

Preprocessing¶

Aggregate¶

Feature creation¶

아이템 그룹 Feature 생성¶

도시 Feature 생성¶

수치 대표 값들로 Feature 생성¶

지난 달의 값들을 Feature로 생성¶

이동평균값과 MACD, Signal Feature 생성¶

비어있는 가격 예측¶

할인율 계산¶

Data preparation¶

Predict¶

모델 학습에 있어 중요한 피쳐는 무엇인가?!¶

Submit¶

'kaggle' 카테고리의 다른 글

Load Dataset¶

데이터 이해하기¶

1. 데이터의 사이즈는?¶

2. Train/Test는 어떻게 분리되어 있는가?¶

1. id에 차이가 있는가?¶

2. date에 차이가 있는가?¶

3. 정류장에 차이가 있는가?¶

3. Missing Value는 존재하는가?¶

4. Target Variable의 분포는?¶

1. Target Variable이 0인 데이터는 어떤 특징을 가지고 있는가?¶

5. 데이터의 특이한/주목해야할 부분은?¶

1. 같은 정류장 이름이 여러 번 나오는 경우?¶

1.1 고유한 정거장의 기준은?¶

station_name를 기준으로 삼는다면?¶

station_code는 어떨까?¶

Validation 전략¶

LightGBM을 통하여 모델링 하기¶

BASELINE MODEL 만들기¶

모델 학습에 있어 중요한 피쳐는 무엇인가?!¶

linear model¶

Tree-based Model¶

PDP PLOT¶

엔티티 개념을 활용한 피쳐 생성¶

날짜¶

버스 노선¶

정류장¶

Feature 선택/제거¶

Test-set 값 예측 (Ensemble)¶

제출 파일 만들기¶

'kaggle' 카테고리의 다른 글

	date_block_num	shop_id	item_id	cnt_prev	order_prev	cnt_prev2	order_prev2	cnt_prev12	order_prev12
0	1	2	30	0.0	1	NaN	NaN	NaN	NaN
1	1	2	31	0.0	1	NaN	NaN	NaN	NaN
2	1	2	32	0.0	1	NaN	NaN	NaN	NaN
3	1	2	33	1.0	1	NaN	NaN	NaN	NaN
4	1	2	38	0.0	1	NaN	NaN	NaN	NaN

date_block_num	shop_id	item_id	0	1	2	3	5	10	11	12	14	15	16	18	20	21	23	25	26	27	28	30	31	32	33
0	2	30	0	0	1	0	1	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	0	0
1	2	31	0	4	1	1	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1
2	2	32	0	0	0	0	0	0	0	1	1	0	0	1	2	2	2	0	1	0	0	0	1	0	0
3	2	33	1	0	0	0	0	2	1	1	0	0	0	0	1	0	0	1	0	1	1	1	0	1	0
4	2	38	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

	shop_id	item_id	cnt_ema_s_prev	cnt_ema_m_prev	cnt_ema_l_prev	date_block_num
0	2	30	0.0	0.0	0.0	1
1	2	31	0.0	0.0	0.0	1
2	2	32	0.0	0.0	0.0	1
3	2	33	1.0	1.0	1.0	1
4	2	38	0.0	0.0	0.0	1

	shop_id	item_id	item_price
209	2	1204	299.0
211	2	1224	399.0
219	2	1247	299.0
826	2	3656	299.0
1064	2	4531	279.0

	shop_id	item_id	item_price	discount_rate
0	2	33	499.0	0.000000
1	2	482	3300.0	0.000000
2	2	491	600.0	0.000000
3	2	839	3300.0	0.000000
4	2	1007	449.0	0.625521

	date_block_num	item_price	discount_rate	city	cnt_mean_shop	cnt_std_shop	order_mean_shop	cnt_mean_shop_cat	cnt_prev	order_prev	cnt_prev2	order_prev2	cnt_prev12	order_prev12	cnt_prev_cat	cnt_ema_s_prev	cnt_ema_m_prev	cnt_ema_l_prev	cnt_macd_prev	cnt_sig_prev	month	days	item_order
0	34	749.500000	0.711620	3	0.382353	0.739152	1.117647	0.274392	0.0	1	1.0	1.0	1.0	1.0	0.377551	0.856217	0.870737	0.673720	0.182497	-0.211641	10	30	0.856217
1	34	304.310272	0.000000	3	0.000000	0.000000	1.000000	0.093711	0.0	1	0.0	1.0	0.0	1.0	0.103647	0.000000	0.000000	0.000000	0.000000	-0.216406	10	30	1.000000
2	34	1199.000000	0.000000	3	0.294118	0.798841	1.117647	0.274392	1.0	1	3.0	3.0	0.0	1.0	0.377551	1.460992	1.015923	0.651031	0.809961	0.146857	10	30	1.460992
3	34	599.000000	0.500417	3	0.029412	0.171499	1.000000	0.274392	0.0	1	0.0	1.0	0.0	1.0	0.377551	0.144000	0.110528	0.068511	0.075489	-0.018919	10	30	0.144000
4	34	2221.775391	0.000000	3	0.000000	0.000000	1.000000	0.274392	0.0	1	0.0	1.0	0.0	1.0	0.377551	0.000000	0.000000	0.000000	0.000000	-0.033769	10	30	1.000000

	id	date	in_out	station_code	station_name	latitude	longitude	6~7_ride	7~8_ride	8~9_ride	9~10_ride	10~11_ride	11~12_ride	18~20_ride
0	0	2019-09-01	1	321	1481	33.48990	126.49373	0.0	1.0	2.0	5.0	2.0	6.0	0.0
1	1	2019-09-01	1	334	1822	33.48944	126.48508	1.0	4.0	4.0	2.0	5.0	6.0	5.0
2	2	2019-09-01	1	407	1406	33.48181	126.47352	1.0	1.0	0.0	2.0	0.0	0.0	2.0

	id	date	bus_route_id	in_out	station_code	station_name	latitude	longitude	6~7_ride	7~8_ride	8~9_ride	9~10_ride	10~11_ride	11~12_ride	11~12_takeoff
0	415423	2019-10-01	4270000	시외	344	제주썬호텔	33.48990	126.49373	4.0	4.0	7.0	2.0	9.0	1.0	1.0
1	415424	2019-10-01	4270000	시외	357	한라병원	33.48944	126.48508	1.0	6.0	6.0	1.0	8.0	11.0	0.0
2	415425	2019-10-01	4270000	시외	432	정존마을	33.48181	126.47352	2.0	4.0	2.0	2.0	2.0	1.0	0.0

	station_code	station_name	latitude	longitude
0	341	1001	33.51912	126.63493
1	787	1001	33.51037	126.63498
2	789	1001	33.51734	126.63563
3	340	1001	33.51957	126.63523
4	873	203	33.36658	126.28713
5	867	203	33.35763	126.29492
6	2703	1001	33.51493	126.63479
7	2704	1001	33.51500	126.63504
8	661	203	33.35743	126.29466
9	786	1001	33.51007	126.63523
10	874	203	33.36667	126.28690
11	788	1001	33.51695	126.63535
12	872	203	33.36255	126.29223
13	870	203	33.36098	126.29807
14	869	203	33.36053	126.29786

	col	imp
0	bus_route_id	3934
1	noon_ride	3515
2	morning_ride	3322
3	longitude	2773
4	latitude	2764
5	station_name	2747
6	dawn_ride	2726
7	station_code	2702
8	morning_takeoff	2291
9	noon_takeoff	2101
10	dawn_takeoff	1295
11	in_out	40