# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild()


# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)


# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild()

#레티나 디스플레이로 폰트가 선명하게 표시되도록 합니다.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

from glob import glob
import os, random, time, gc, warnings

from tqdm import tqdm_notebook

import lightgbm as lgbm
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

!pip install catboost
from catboost import CatBoostRegressor
from sklearn.feature_selection import RFECV


from sklearn.cluster import KMeans

from datetime import datetime

from math import sqrt

import folium
from folium import Marker, Icon, CircleMarker

!pip install pdpbox
from pdpbox import pdp, info_plots

warnings.filterwarnings('ignore')

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: catboost in /usr/local/lib/python3.8/dist-packages (1.1.1)
Requirement already satisfied: plotly in /usr/local/lib/python3.8/dist-packages (from catboost) (5.5.0)
Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from catboost) (1.7.3)
Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.8/dist-packages (from catboost) (1.3.5)
Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from catboost) (1.15.0)
Requirement already satisfied: graphviz in /usr/local/lib/python3.8/dist-packages (from catboost) (0.10.1)
Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.8/dist-packages (from catboost) (1.21.6)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.8/dist-packages (from catboost) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.24.0->catboost) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.24.0->catboost) (2022.6)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (1.4.4)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib->catboost) (0.11.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.8/dist-packages (from plotly->catboost) (8.1.0)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: pdpbox in /usr/local/lib/python3.8/dist-packages (0.2.1)
Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.2.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.21.6)
Requirement already satisfied: matplotlib==3.1.1 in /usr/local/lib/python3.8/dist-packages (from pdpbox) (3.1.1)
Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from pdpbox) (5.4.8)
Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.3.5)
Requirement already satisfied: sklearn in /usr/local/lib/python3.8/dist-packages (from pdpbox) (0.0.post1)
Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from pdpbox) (1.7.3)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (1.4.4)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (2.8.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib==3.1.1->pdpbox) (0.11.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.1->matplotlib==3.1.1->pdpbox) (1.15.0)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->pdpbox) (2022.6)


df_bus = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/bus_bts.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/submission_sample.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/test.csv')
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/train.csv')


# Train/Test-set을 각각 체크해봅니다.
display(train.head(3))
display(test.head(3))


print('Train-set : {}'.format(train.shape))
print('Test-set : {}'.format(test.shape))

Train-set : (415423, 21)
Test-set : (228170, 20)


# Train-set에만 있는 칼럼은?

train.columns.difference(test.columns)

Index(['18~20_ride'], dtype='object')


# Train-set의 id는?
print(train['id'].agg(['min', 'max']) )
print('Size : {}'.format(len(train)))

min         0
max    415422
Name: id, dtype: int64
Size : 415423


# Test-set의 id는?
print(test['id'].agg(['min', 'max']) )
print('Size : {}'.format(len(test)))

min    415423
max    643592
Name: id, dtype: int64
Size : 228170


# Train-set의 date는?
print(train['date'].agg(['min', 'max']) )
print('Size : {}'.format(len(train)))

min    2019-09-01
max    2019-09-30
Name: date, dtype: object
Size : 415423


# Test-set의 date는?
print(test['date'].agg(['min', 'max']) )
print('Size : {}'.format(len(test)))

min    2019-10-01
max    2019-10-16
Name: date, dtype: object
Size : 228170


# Train/Test의 date 분포는?
plt.figure(figsize = (12,8))

pd.to_datetime(train['date']).value_counts().sort_index().plot(color='b', lw=2, label='train')
pd.to_datetime(test['date']).value_counts().sort_index().plot(color='r', lw=2, label='test')

plt.legend()
plt.xlabel("date")
plt.ylabel("# of rows")
plt.title("Distribution of date in Train/Test-set");

# 평균적으로 탑승객이 어떤 식으로 분포되어있는지 살펴보자.
# 굉장히 떨어지는 지점이 특별한 공휴일은 아닌지 살펴보자.


display(train.head(3))
display(test.head(3))


# Train/Test-set의 고유한 bus_route_id를 구함.
train_bus_route_id_set = set(train['bus_route_id'])
test_bus_route_id_set  = set(test['bus_route_id'])


# Train/Test-set 고유한 bus_route의 개수를 구함.
print('Train-set에 있는 고유한 bus_route의 개수 : {}'.format( len( train_bus_route_id_set) ))
print('Test-set에 있는 고유한 bus_route의 개수 : {}'.format( len( test_bus_route_id_set) ))

Train-set에 있는 고유한 bus_route의 개수 : 613
Test-set에 있는 고유한 bus_route의 개수 : 601


# Train/Test-set 모두에 포함되어있는 bus_route를 구함.
common_bus_route_id = train_bus_route_id_set.intersection( test_bus_route_id_set )
print('Train/Test-set에 공통으로 포함되어 있는 bus_route 개수 : {}'.format( len(common_bus_route_id) ))

Train/Test-set에 공통으로 포함되어 있는 bus_route 개수 : 0


# Train-set에만 있는 bus_route를 구함. 
only_train_bus_route = train_bus_route_id_set.difference( test_bus_route_id_set ) 
print( 'Train-set에만 있는 bus_route 개수 : {}'.format (len( only_train_bus_route) ))

only_test_bus_route = test_bus_route_id_set.difference( train_bus_route_id_set ) 
print ('Test-set에만 있는 bus_route 개수 : {}'.format(len( only_test_bus_route)))

Train-set에만 있는 bus_route 개수 : 613
Test-set에만 있는 bus_route 개수 : 601


# 탑승 관련 columns & 하차 관련 columns
ride_columns = [col for col in test.columns if '_ride' in col] + ['bus_route_id','date']
take_off_columns = [col for col in test.columns if '_takeoff' in col] + ['bus_route_id','date']


# 두 경우의 탑승 관련 columns 비교
plt.figure(figsize=(12,5))
test[test['bus_route_id'].isin( only_test_bus_route)][ride_columns].groupby(['date', 'bus_route_id'])['8~9_ride'].sum().groupby('date').mean().plot(color='b', label='only in Test-set')
test[test['bus_route_id'].isin( common_bus_route_id)][ride_columns].groupby(['date', 'bus_route_id'])['8~9_ride'].sum().groupby('date').mean().plot(color='r', label='Both in Train/Test-set')
plt.legend()
plt.title('Average number of passengers\n bus_route_id only in Test-set VS bus_route_id both in Train/Test-set');


# Missing Values
msno.matrix(train)

<matplotlib.axes._subplots.AxesSubplot at 0x7fd86b120580>


print("Train-set")
display(train.isnull().sum())

print('=' * 80)

print("Test-set")
display(test.isnull().sum())

Train-set

id               0
date             0
bus_route_id     0
in_out           0
station_code     0
station_name     0
latitude         0
longitude        0
6~7_ride         0
7~8_ride         0
8~9_ride         0
9~10_ride        0
10~11_ride       0
11~12_ride       0
6~7_takeoff      0
7~8_takeoff      0
8~9_takeoff      0
9~10_takeoff     0
10~11_takeoff    0
11~12_takeoff    0
18~20_ride       0
dtype: int64

================================================================================
Test-set

id               0
date             0
bus_route_id     0
in_out           0
station_code     0
station_name     0
latitude         0
longitude        0
6~7_ride         0
7~8_ride         0
8~9_ride         0
9~10_ride        0
10~11_ride       0
11~12_ride       0
6~7_takeoff      0
7~8_takeoff      0
8~9_takeoff      0
9~10_takeoff     0
10~11_takeoff    0
11~12_takeoff    0
dtype: int64


# Target Variable의 분포를 살펴보자
target_col = '18~20_ride'
train[ target_col].value_counts().sort_index()

0.0      296528
1.0       44268
2.0       23752
3.0       13560
4.0        8630
5.0        5911
6.0        4291
7.0        3152
8.0        2288
9.0        1865
10.0       1409
11.0       1183
12.0        996
13.0        807
14.0        660
15.0        597
16.0        508
17.0        450
18.0        408
19.0        394
20.0        331
21.0        261
22.0        234
23.0        218
24.0        198
25.0        168
26.0        194
27.0        150
28.0        144
29.0        132
30.0        113
31.0         94
32.0         80
33.0         79
34.0         77
35.0         78
36.0         58
37.0         61
38.0         64
39.0         53
40.0         37
41.0         51
42.0         43
43.0         41
44.0         52
45.0         37
46.0         32
47.0         35
48.0         21
49.0         28
50.0         26
51.0         24
52.0         21
53.0         24
54.0         22
55.0         22
56.0         20
57.0         17
58.0         21
59.0         13
60.0         16
61.0         20
62.0         13
63.0         11
64.0         14
65.0         17
66.0         15
67.0          6
68.0          6
69.0          5
70.0          9
71.0          7
72.0          7
73.0         10
74.0         10
75.0          6
76.0          7
77.0          6
78.0          7
79.0          5
80.0          9
81.0          5
82.0          7
83.0         10
84.0          5
85.0          6
86.0          4
87.0          6
88.0          6
89.0          4
90.0          1
91.0          6
92.0          5
93.0          3
95.0          4
96.0          4
97.0          4
98.0          2
99.0          1
100.0         2
101.0         5
102.0         1
103.0         2
104.0         2
105.0         3
106.0         1
107.0         2
108.0         2
109.0         2
111.0         1
112.0         2
113.0         1
114.0         1
115.0         3
116.0         3
118.0         1
119.0         2
120.0         1
122.0         3
123.0         1
124.0         1
125.0         1
126.0         2
127.0         3
128.0         2
129.0         1
131.0         1
132.0         1
134.0         2
135.0         1
136.0         1
138.0         1
143.0         1
146.0         1
149.0         1
150.0         2
152.0         2
155.0         1
157.0         1
158.0         1
161.0         1
162.0         2
164.0         2
166.0         1
167.0         1
168.0         1
169.0         2
170.0         1
173.0         1
175.0         1
180.0         1
181.0         2
184.0         1
185.0         2
198.0         1
200.0         1
201.0         1
202.0         1
209.0         1
213.0         1
215.0         1
218.0         1
226.0         1
227.0         1
229.0         1
240.0         1
241.0         1
245.0         1
265.0         1
272.0         1
Name: 18~20_ride, dtype: int64


# Dist-plot을 그려보도록 한다.
# --> (1) 0이 굉장히 많다. 
# --> (2) right-skewed된 형태이며, 값이 매우 큰 outlier들이 존재한다.
sns.distplot( train[target_col] );


# log1p transformation을 적용해봐도 정규분포에 근사한 모양을 보이지 않는다.
sns.distplot( np.log1p( train[target_col] ) );


# 탑승 관련 columns & 하차 관련 columns
ride_columns = [col for col in test.columns if '_ride' in col]
take_off_columns = [col for col in test.columns if '_takeoff' in col]


# Train-set의 승차관련 칼럼들의 rowsum 
display( train[train[target_col]==0][ride_columns].sum(axis=1).agg(['min', 'max']))
print('=' * 80)

# Train-set의 하차관련 칼럼들의 rowsum 
display( train[train[target_col]==0][ take_off_columns].sum(axis=1).agg(['min', 'max']))
print('=' * 80)

# Train-set의 승하차관련 칼럼들의 rowsum 
display( train[train[target_col]==0][ride_columns+ take_off_columns].sum(axis=1).agg(['min', 'max']))

min      0.0
max    138.0
dtype: float64

================================================================================

min      0.0
max    165.0
dtype: float64

================================================================================

min      1.0
max    283.0
dtype: float64


# (1)의 경우에는 어떤 것들이 있나 예시를 통해 살펴보도록 하자
# 하나의 station_name에 여러 개의 station_code가 기록되어 있는 경우는 어떤 상황인가?
multiple_station_name = train.groupby('station_name')['station_code'].nunique()
multiple_station_name = multiple_station_name[multiple_station_name>=7]
print(multiple_station_name)

df_sample = train[train['station_name'].isin(multiple_station_name.index)][['station_code','station_name','latitude','longitude']]
df_sample = df_sample.drop_duplicates().reset_index(drop=True) # drop=True, 기존 인덱스가 컬럼으로 더해지는 것을 방지 
df_sample

station_name
203     7
1001    8
Name: station_code, dtype: int64


def generateMap(default_location=[33.35098, 126.79807], default_zoom_start=10):
    base_map = folium.Map(location=default_location, 
                          control_scale=True, 
                          zoom_start=default_zoom_start)
    
    # 여러 개의 정거장에 대해서 Icon 생성하기
    for row in df_sample.itertuples():
        station_code, station_name, latitude, longitude = row[1:]
        
        # Create Icon
        if station_name == '금악리':
            icon = Icon(color='red',icon='station')
        else:
            icon = Icon(color='blue',icon='station')
                
        # Add Marker
        Marker(location=[ latitude , longitude], 
               popup=f'station_code : {station_code} station_name : {station_name}',
               icon = icon).add_to(base_map)
        
    
    base_map.save('하나의 station_name에 여러개의 station_code.html')
    return base_map

generateMap()


display( train.groupby('station_name')['station_code'].nunique().value_counts() )

2    1308
1     540
3      60
4      45
5       4
6       2
7       1
8       1
Name: station_code, dtype: int64


# station_name를 기준으로 삼는다면?
# --> 하나의 station_name가 여러 개의 latitude, longitude를 갖는 것으로 보임
display(train.groupby('station_name')['latitude'].nunique().value_counts())
print('='*80)
display(train.groupby('station_name')['longitude'].nunique().value_counts())

2    1283
1     566
3      59
4      45
5       4
6       2
7       1
8       1
Name: latitude, dtype: int64

================================================================================

2    1291
1     560
3      58
4      45
5       3
7       2
6       2
Name: longitude, dtype: int64


# station_code를 기준으로 삼는다면?
# --> station_code에는 1개의  station_name이 매핑되어있음.
display(train.groupby('station_code')['station_name'].nunique().value_counts())

1    3563
Name: station_name, dtype: int64


# station_code를 기준으로 삼는다면?
# --> station_code는 latitude, longitude와 1대1 관계를 만족함
display( train.groupby('station_code')['latitude'].nunique().value_counts() )
display( train.groupby('station_code')['longitude'].nunique().value_counts() )

1    3563
Name: latitude, dtype: int64

1    3563
Name: longitude, dtype: int64


# station_code를 기준으로 삼는다면?
# --> station_code는 in_out와 1대1 관계를 만족함
display( train.groupby('station_code')['in_out'].nunique().value_counts() )

1    3563
Name: in_out, dtype: int64


# date, bus_route_id, station_code이 특정 날짜에 몇번 등장했는지 재확인하기 
display( train.groupby(['date','bus_route_id','station_code']).size().value_counts() )
print('='* 80)
print(f'Train-set size : {len(train)}')

1    415423
dtype: int64

================================================================================
Train-set size : 415423


# local_train/local_test를 만든다.
local_train = train[train['date']<='2019-09-24'].reset_index(drop=True)
local_test  = train[train['date']>'2019-09-24'].reset_index(drop=True)


display(local_train.head(1))
display(local_test.head(1))


# categorical variable인 'bus_route_id','in_out','station_code','station_name' 에 대해선 label_encoding을 적용해주고,
# numeric variable들에 대해선 있는 그대로 학습을 시켜보도록 한다.
lbl = LabelEncoder()

# Implement Label Encoding 
cat_cols = ['bus_route_id','in_out','station_code','station_name']
for col in tqdm_notebook( cat_cols ):
    # local_train과 local_test를 concat하여 temp_df에 저장
    temp_df = pd.concat([ local_train[[col]], local_test[[col]] ] , axis=0)
    
    # Label-Encoding을 fitting함
    lbl.fit( temp_df[col] )
    
    # local_train/local_test에 label_encoding한 값을 대입함
    local_train[col] = lbl.transform(local_train[col])
    local_test[col] = lbl.transform(local_test[col])


display(local_train.head(1))
display(local_test.head(1))


# 모델에 쓰일 parameter 정의하기
n_splits= 5
NUM_BOOST_ROUND = 100000
SEED = 1993
lgbm_param = {'objective':'rmse',
              'boosting_type': 'gbdt', # 가장 많이 쓰는 게 그라디언트 부스팅 트리 
              'random_state':1993,
              'learning_rate':0.3, # 모델 학습을 얼마나 빠르게 진행할거냐 
              'subsample':0.7, # 전체 학습 데이터 중 몇 퍼센트 데이터를 사용할 것인가? 서브 샘플링을 낮추면 오버피팅을 방지 할 수 있다
              'tree_learner': 'serial', # 트리 학습기 
              'colsample_bytree':0.78, # 여러 개의 칼럼 중 몇 퍼센트를 사용할 것인가? 컬럼의 개수가 1000개, 2000개 되면 과감하게 0.3, 0.2
              'early_stopping_rounds':50, # train set의 loss는 지속적으로 감소하지만, validation의 loss가 하락하지 않을 경우 50 iteration이 넘었을 때 학습 중단 
              'subsample_freq': 1,
              'reg_lambda':7, # 정규화, 보통 1,2,3, 오버피팅을 방지하고자 할때 값을 높게 설정 
              'reg_alpha': 5, # 정규화, 보통 1,2,3, 오버피팅을 방지하고자 할때 값을 높게 설정 
              'num_leaves': 96,
              'seed' : SEED
            }


# 제거해야 하는 columns 정의 
drop_cols = ['id', 'date', target_col]

# local_train/local_test에 대한 label 정의
local_train_label = local_train[target_col]
local_test_label  = local_test[target_col]


# local_train/local_test의 예측값을 저장하기 위한 OOF 만들기 & CV를 저장할 list 정의 
oof_train = np.zeros((local_train.shape[0],))
oof_test = np.zeros((local_test.shape[0],))

cv_list = []

# Kfold 정의
kfolds = KFold(n_splits=n_splits
               , random_state=1993, shuffle=True) # 여러 개의 모델을 만들어 앙상블할때 random_state를 똑같이 지정해야 함 

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = local_train, y = local_train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = local_train.iloc[trn_ind].drop(drop_cols, 1), local_train_label[trn_ind]
    X_valid , y_valid = local_train.iloc[val_ind].drop(drop_cols, 1), local_train_label[val_ind]
    
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100) # 몇번당 학습의 결과를 화면에 내보낼 것인가 
    
    # local_valid/local_test에 대한 예측
    valid_pred = model.predict(X_valid)
    test_pred  = model.predict( local_test.drop(drop_cols,1) )
    
    # CV를 저장
    cv_list.append( sqrt( mean_squared_error( y_valid, valid_pred )  ) )
    
    # OOF에 예측값을 저장
    oof_train[val_ind] = valid_pred
    oof_test += test_pred/n_splits # 학습이 진행 될 때마다 1/5만큼만 더해라 
    print('='*80)
    
print(f"<LOCAL_TRAIN> OVERALL RMSE : {sqrt( mean_squared_error( local_train_label, oof_train ) )}")
print(f"<LOCAL_TEST>  OVERALL RMSE : {sqrt( mean_squared_error( local_test_label, oof_test ) )}")

Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34013	valid's rmse: 2.76295
[200]	train's rmse: 2.10389	valid's rmse: 2.74295
[300]	train's rmse: 1.95722	valid's rmse: 2.72343
[400]	train's rmse: 1.84886	valid's rmse: 2.71699
Early stopping, best iteration is:
[445]	train's rmse: 1.80723	valid's rmse: 2.71068
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34328	valid's rmse: 2.73387
[200]	train's rmse: 2.10399	valid's rmse: 2.71437
[300]	train's rmse: 1.96205	valid's rmse: 2.70933
[400]	train's rmse: 1.85441	valid's rmse: 2.70514
Early stopping, best iteration is:
[357]	train's rmse: 1.89796	valid's rmse: 2.70329
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32837	valid's rmse: 2.91084
[200]	train's rmse: 2.09175	valid's rmse: 2.8965
Early stopping, best iteration is:
[192]	train's rmse: 2.10446	valid's rmse: 2.89016
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.35792	valid's rmse: 2.6745
[200]	train's rmse: 2.11241	valid's rmse: 2.64986
Early stopping, best iteration is:
[223]	train's rmse: 2.07327	valid's rmse: 2.64049
================================================================================
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32428	valid's rmse: 2.88111
[200]	train's rmse: 2.0944	valid's rmse: 2.85047
[300]	train's rmse: 1.94905	valid's rmse: 2.8433
Early stopping, best iteration is:
[318]	train's rmse: 1.92777	valid's rmse: 2.8406
================================================================================
<LOCAL_TRAIN> OVERALL RMSE : 2.7586152832265656
<LOCAL_TEST>  OVERALL RMSE : 2.6149846987553675


# 실제값과 예측값의 분포 비교 
fig, axes = plt.subplots(1, 2, figsize=(20,8), sharex=True, sharey= True)

# y=x를 그리기 위하여 
x_range = np.linspace(0, 300, 1000) # 0에서 300까지 1000개의 요소로 만들어라 

# <SUBPLOT 1> : local_train에 대한 예측/실제값 비교
axes[0].scatter( local_train_label, oof_train )
axes[0].set_xlabel("Prediction")
axes[0].set_ylabel("Real")
axes[0].plot(x_range, x_range, color='r')

# <SUBPLOT 2> : local_test에 대한 예측/실제값 비교
axes[1].scatter( local_test_label, oof_test )
axes[1].set_xlabel("Prediction")
axes[1].set_ylabel("Real")
axes[1].plot(x_range, x_range, color='r')

plt.suptitle('Comparison between Prediction VS Real');


# 실제값 vs 예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( oof_train, color='r' , label='Prediction for Local-Train')
sns.distplot( local_train_label, color='b', label='Real' )
plt.legend()
plt.title("Comparing the real vs prediction in Local-Train");


min(oof_train) # 0보다 작은 수에 대해서는 0으로 보정

-16.972225679224387


min(local_train_label) # 최소 승객수 0

0.0


# 실제값 vs 예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( oof_test, color='r' , label='Prediction for Local-Test')
sns.distplot( local_test_label, color='b', label='Real' )
plt.legend()
plt.title("Comparing the real vs prediction in Local-Test");


del local_train, local_test, local_train_label, local_test_label; 
gc.collect();


# train_label 정의
train_label = train[target_col]


# categorical variable에 대해서는 Label-Encoding을 수행 
# --> One-Hot Encoding가 바람직하다고 생각되나 메모리 문제로 실행할 수 없음.
lbl = LabelEncoder()

# Implement Label Encoding 
cat_cols = ['bus_route_id','in_out','station_code','station_name']
for col in tqdm_notebook( cat_cols ):
    
    # Label-Encoding을 fitting함
    lbl.fit( train[col] )
    
    # local_train/local_test에 label_encoding한 값을 대입함
    train[col] = lbl.transform(train[col])


# 각 모델에 대한 oof 정의 
ridge_oof_train = np.zeros((train.shape[0]))
lasso_oof_train = np.zeros((train.shape[0]))
dt_oof_train = np.zeros((train.shape[0]))
rf_oof_train = np.zeros((train.shape[0]))
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의 
Kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True) 

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # (1) Ridge
    print("---TRAINING RIDGE---")
    ridge = Ridge(random_state = 1993)
    
    ridge.fit(X_train, y_train)
    
    ridge_valid_pred = ridge.predict(X_valid)
    ridge_oof_train[val_ind] = ridge_valid_pred
    
    # (2) Lasso
    print("---TRAINING LASSO---")
    lasso = Lasso(random_state = 1993)
    
    lasso.fit(X_train, y_train)
    
    lasso_valid_pred = lasso.predict(X_valid)
    lasso_oof_train[val_ind] = lasso_valid_pred
    
    # (3) Decision Tree
    print("---TRAINING DECISION TREE---")
    dt = DecisionTreeRegressor(random_state=231)
    
    dt.fit(X_train, y_train)
    
    dt_valid_pred = dt.predict(X_valid)
    dt_oof_train[val_ind] = dt_valid_pred
    
    
    # (4) Random Forest
    print("---TRAINING RANDOM FOREST---")
    rf = RandomForestRegressor(random_state=231, n_estimators=20 )
    
    rf.fit(X_train, y_train)
    
    rf_valid_pred = rf.predict(X_valid)
    rf_oof_train[val_ind] = rf_valid_pred
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 0)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Ridge> OVERALL RMSE         : {sqrt( mean_squared_error( train_label, ridge_oof_train ) )}")
print(f"<Lasso> OVERALL RMSE         : {sqrt( mean_squared_error( train_label, lasso_oof_train ) )}")
print(f"<Decision-Tree> OVERALL RMSE : {sqrt( mean_squared_error( train_label, dt_oof_train ) )}")
print(f"<Random-Forest> OVERALL RMSE : {sqrt( mean_squared_error( train_label, rf_oof_train ) )}")
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
---TRAINING RIDGE---
---TRAINING LASSO---
---TRAINING DECISION TREE---
---TRAINING RANDOM FOREST---
---TRAINING LIGHT GBM---
================================================================================
<Ridge> OVERALL RMSE         : 3.5256174413470043
<Lasso> OVERALL RMSE         : 3.6478537865113023
<Decision-Tree> OVERALL RMSE : 4.174571185391608
<Random-Forest> OVERALL RMSE : 2.922049207507851
<Light-GBM> OVERALL RMSE     : 2.7837265002333376


plt.figure(figsize=(24,5))

# Ridge의 Coef를 barplot으로 그린다. 
plt.bar( train.drop(drop_cols,1).columns, ridge.coef_   )

# y=0인 horizental한 선을 그린다. 
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title('Coef of Ridge Model');


plt.figure(figsize=(24,5))

# lasso의 Coef를 barplot으로 그린다.
plt.bar( train.drop(drop_cols,1).columns,  lasso.coef_ )

# y=0인 horizental한 선을 그린다.
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title("Coef of lasso Model");


# 상관관계를 살펴보도록 하자.
train.corr()[target_col].sort_values()

bus_route_id    -0.137364
station_code    -0.034048
longitude       -0.021368
id              -0.000544
in_out           0.022116
station_name     0.054216
latitude         0.079261
6~7_takeoff      0.178353
7~8_takeoff      0.219430
6~7_ride         0.262173
8~9_takeoff      0.274360
10~11_takeoff    0.290691
9~10_takeoff     0.295875
11~12_takeoff    0.313540
7~8_ride         0.371751
8~9_ride         0.445316
9~10_ride        0.494085
10~11_ride       0.512666
11~12_ride       0.569747
18~20_ride       1.000000
Name: 18~20_ride, dtype: float64


plt.figure(figsize=(24,5))

# Random Forest의 Feature Importances를 barplot으로 그린다. 
plt.bar( train.drop(drop_cols, 1).columns, rf.feature_importances_ )

# y=0인 horizental한 선을 그린다.
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title("Coef of Random Forest Model");


plt.figure(figsize=(24,5))

# lgbm의 Feature Importances를 barplot으로 그린다.
plt.bar( train.drop(drop_cols,1).columns,  model.feature_importance() )

# y=0인 horizental한 선을 그린다.
plt.axhline(y=0, color='r', linestyle='-')

plt.xticks(rotation=45)
plt.title("Coef of light GBM Model");


# 전체 데이터를 사용할 시 너무 많은 시간이 소요되어 일부 샘플만 사용하도록 하겠습니다.
sample = train.drop(drop_cols,1).sample(1000)


# PDP Plot For 11~12_ride
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), #컬럼의 이름 
    feature='11~12_ride')

fig, axes = pdp.pdp_plot(pdp_, '11~12_ride')

# 해당 피처가 변함에 따라 타겟이 어떻게 변하는가 
# 11~12시 탑승 승객수가 많은 곳일수록 퇴근 시간에 타는 승객수가 많을 것이다. 
# 폭이 작을수록 예측값에 대한 편차가 적다. = 신뢰도가 높다


# PDP Plot For 8~9_takeoff
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), feature='8~9_takeoff'
)
fig, axes = pdp.pdp_plot(pdp_, '8~9_takeoff')


# PDP Plot For latitude
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), feature='latitude'
)
fig, axes = pdp.pdp_plot(pdp_, 'latitude')


# PDP Plot For longitude
pdp_ = pdp.pdp_isolate(
    model= model, dataset=sample, model_features=list(sample), feature='longitude'
)
fig, axes = pdp.pdp_plot(pdp_, 'longitude')


# Interactive PDP Plot For latitude,longitude
pdp_ = pdp.pdp_interact(
    model= model, dataset=sample, model_features=list(sample), features=['latitude','longitude']
)

fig, axes = pdp.pdp_interact_plot(pdp_interact_out=pdp_,
                                  feature_names=['longitude', 'latitude'],
                                  plot_type='grid',
                                  x_quantile=True,
                                  plot_pdp=False)


# 모델에 쓰일 parameter 정의하기
n_splits= 5
NUM_BOOST_ROUND = 100000 # boosting을 얼마나 돌릴지 지정한다 
SEED = 1993
lgbm_param = {'objective':'rmse',
              'boosting_type': 'gbdt',
              'random_state':1993,
              'learning_rate':0.1,
              'subsample':0.7,
              'tree_learner': 'serial',
              'colsample_bytree':0.78,
              'early_stopping_rounds':50,
              'subsample_freq': 1,
              'reg_lambda':7,
              'reg_alpha': 5,
              'num_leaves': 96,
              'seed' : SEED
            }


df_bus = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/bus_bts.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/submission_sample.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/test.csv')
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/train.csv')


# categorical variable에 대해서는 Label-Encoding을 수행 
# --> One-Hot Encoding가 바람직하다고 생각되나 메모리 문제로 실행할 수 없음.
lbl = LabelEncoder()

# Implement Label Encoding 
cat_cols = ['bus_route_id','in_out','station_code','station_name']
for col in tqdm_notebook( cat_cols ):
    
    # Label-Encoding을 fitting함
    lbl.fit( train[[col]].append(test[[col]]) )
    
    # train/test label_encoding한 값을 대입함
    train[col] = lbl.transform(train[col])
    test[col] = lbl.transform(test[col])


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.30574	valid's rmse: 2.74915
[200]	train's rmse: 2.05878	valid's rmse: 2.70246
[300]	train's rmse: 1.90955	valid's rmse: 2.66699
[400]	train's rmse: 1.80317	valid's rmse: 2.65386
[500]	train's rmse: 1.72525	valid's rmse: 2.63958
[600]	train's rmse: 1.66145	valid's rmse: 2.63267
[700]	train's rmse: 1.60627	valid's rmse: 2.62778
Early stopping, best iteration is:
[660]	train's rmse: 1.62635	valid's rmse: 2.62755
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32874	valid's rmse: 2.66488
[200]	train's rmse: 2.08707	valid's rmse: 2.63891
[300]	train's rmse: 1.94474	valid's rmse: 2.62898
[400]	train's rmse: 1.83968	valid's rmse: 2.61132
[500]	train's rmse: 1.75473	valid's rmse: 2.60217
[600]	train's rmse: 1.68747	valid's rmse: 2.59209
[700]	train's rmse: 1.63126	valid's rmse: 2.58362
[800]	train's rmse: 1.58192	valid's rmse: 2.57711
[900]	train's rmse: 1.53579	valid's rmse: 2.57242
[1000]	train's rmse: 1.49653	valid's rmse: 2.56804
Early stopping, best iteration is:
[1046]	train's rmse: 1.47885	valid's rmse: 2.5658
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.30812	valid's rmse: 2.79685
[200]	train's rmse: 2.06095	valid's rmse: 2.78083
Early stopping, best iteration is:
[187]	train's rmse: 2.08647	valid's rmse: 2.77592
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.30582	valid's rmse: 2.7847
[200]	train's rmse: 2.05827	valid's rmse: 2.74054
[300]	train's rmse: 1.91722	valid's rmse: 2.72011
[400]	train's rmse: 1.81145	valid's rmse: 2.70567
[500]	train's rmse: 1.72925	valid's rmse: 2.69213
[600]	train's rmse: 1.66309	valid's rmse: 2.68586
[700]	train's rmse: 1.60667	valid's rmse: 2.6818
[800]	train's rmse: 1.55937	valid's rmse: 2.68089
[900]	train's rmse: 1.51643	valid's rmse: 2.67663
Early stopping, best iteration is:
[895]	train's rmse: 1.51852	valid's rmse: 2.67584
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.29918	valid's rmse: 2.86883
[200]	train's rmse: 2.05735	valid's rmse: 2.82333
[300]	train's rmse: 1.91679	valid's rmse: 2.81182
[400]	train's rmse: 1.81536	valid's rmse: 2.79259
[500]	train's rmse: 1.73252	valid's rmse: 2.77651
[600]	train's rmse: 1.66839	valid's rmse: 2.77229
[700]	train's rmse: 1.61136	valid's rmse: 2.76939
[800]	train's rmse: 1.56388	valid's rmse: 2.76638
[900]	train's rmse: 1.52321	valid's rmse: 2.76538
Early stopping, best iteration is:
[886]	train's rmse: 1.52821	valid's rmse: 2.7637
================================================================================
<Light-GBM> OVERALL RMSE     : 2.6829542351234412


# 승하차 간격을 2시간 간격으로 설정할 수는 없는가? (3시간으로 설정해도 ok -> 결국 실험의 영역)
dawn_ride_cols, dawn_takeoff_cols = ['6~7_ride','7~8_ride'], ['6~7_takeoff','7~8_takeoff']
morning_ride_cols, morning_takeoff_cols = ['8~9_ride','9~10_ride'], ['8~9_takeoff','9~10_takeoff']
noon_ride_cols, noon_takeoff_cols = ['10~11_ride','11~12_ride'], ['10~11_takeoff','11~12_takeoff']

def modify_terms(df): 
  # ride columns
  df['dawn_ride'] = df[dawn_ride_cols].sum(axis=1)
  df['morning_ride'] = df[morning_ride_cols].sum(axis=1)
  df['noon_ride'] = df[noon_ride_cols].sum(axis=1)

  # takeoff columns
  df['dawn_takeoff'] = df[dawn_takeoff_cols].sum(axis=1)
  df['morning_takeoff'] = df[morning_takeoff_cols].sum(axis=1)
  df['noon_takeoff'] = df[noon_takeoff_cols].sum(axis=1)

  # drop columns 
  drop_cols = dawn_ride_cols + morning_ride_cols + noon_ride_cols + dawn_takeoff_cols + morning_takeoff_cols + noon_takeoff_cols
  df = df.drop(drop_cols, 1)

  return df 

train = modify_terms(train)
test = modify_terms(test)


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32718	valid's rmse: 2.75328
Early stopping, best iteration is:
[140]	train's rmse: 2.21752	valid's rmse: 2.74061
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.37316	valid's rmse: 2.63744
[200]	train's rmse: 2.14616	valid's rmse: 2.60199
[300]	train's rmse: 2.01191	valid's rmse: 2.59023
[400]	train's rmse: 1.92035	valid's rmse: 2.58024
[500]	train's rmse: 1.84405	valid's rmse: 2.57767
[600]	train's rmse: 1.77875	valid's rmse: 2.57152
Early stopping, best iteration is:
[565]	train's rmse: 1.79968	valid's rmse: 2.56977
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34121	valid's rmse: 2.75993
[200]	train's rmse: 2.11551	valid's rmse: 2.71367
[300]	train's rmse: 1.98132	valid's rmse: 2.70279
Early stopping, best iteration is:
[347]	train's rmse: 1.93213	valid's rmse: 2.69721
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.34294	valid's rmse: 2.72779
[200]	train's rmse: 2.12502	valid's rmse: 2.69412
[300]	train's rmse: 1.99061	valid's rmse: 2.67572
[400]	train's rmse: 1.89143	valid's rmse: 2.66737
[500]	train's rmse: 1.81195	valid's rmse: 2.66091
[600]	train's rmse: 1.74383	valid's rmse: 2.65133
[700]	train's rmse: 1.68879	valid's rmse: 2.64864
Early stopping, best iteration is:
[744]	train's rmse: 1.66847	valid's rmse: 2.64618
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.32519	valid's rmse: 2.80855
[200]	train's rmse: 2.10711	valid's rmse: 2.77664
[300]	train's rmse: 1.97337	valid's rmse: 2.76544
Early stopping, best iteration is:
[293]	train's rmse: 1.98019	valid's rmse: 2.76211
================================================================================
<Light-GBM> OVERALL RMSE     : 2.6840671696237


# 피쳐 중요도 확인 
data = { 'col' : model.feature_name(), 
         'imp' : model.feature_importance()}

df_imp = pd.DataFrame(data).sort_values(by='imp', ascending=False).reset_index(drop=True)
df_imp


train['date'] = pd.to_datetime(train['date'].astype(str))
test['date'] = pd.to_datetime(test['date'].astype(str))


# 요일 정보 추가 
train['weekday'] = train['date'].dt.weekday
test['weekday'] = test['date'].dt.weekday

# 공휴일 정보 추가 
# -> EDA 필요 
holidays = [datetime(2019, 9 ,12), datetime(2019, 9, 13), datetime(2019, 9 ,14), datetime(2019, 10,3), datetime(2019, 10,9) ]
train['is_holiday'] = train['date'].apply(lambda x : x in holidays).astype(np.int8)
test['is_holiday'] = test['date'].apply(lambda x : x in holidays).astype(np.int8)


# Mean Encoding
# (1) 일자별로 dawn, morning, noon에 각각 몇몇의 승객이 탑승하였는가
# (2) 일자별로 dawn, morning, noon에 각각 몇몇의 승객이 하차하였는가
# - 기준 :
# - (1) bus_route_id
# - (2) bus_route_id , station_code
# - (3) station_code

# (1) bus_route_id 기준

# 탑승
train['avg_dawn_ride_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['dawn_ride'].transform('mean')
train['avg_morning_ride_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['morning_ride'].transform('mean')
train['avg_noon_ride_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['noon_ride'].transform('mean')

test['avg_dawn_ride_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['dawn_ride'].transform('mean')
test['avg_morning_ride_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['morning_ride'].transform('mean')
test['avg_noon_ride_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['noon_ride'].transform('mean')

# 하차 
train['avg_dawn_takeoff_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['dawn_takeoff'].transform('mean')
train['avg_morning_takeoff_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['morning_takeoff'].transform('mean')
train['avg_noon_takeoff_bus_route_id'] = train.groupby(['date', 'bus_route_id'])['noon_takeoff'].transform('mean')

test['avg_dawn_takeoff_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['dawn_takeoff'].transform('mean')
test['avg_morning_takeoff_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['morning_takeoff'].transform('mean')
test['avg_noon_takeoff_bus_route_id'] = test.groupby(['date', 'bus_route_id'])['noon_takeoff'].transform('mean')

# (2) bus_route_id, station_code 기준
# train['avg_dawn_ride_bus_route_id_station_code'] = train.groupby(['date','bus_route_id','station_code'])['dawn_ride'].transform('mean') 
# train['avg_morning_ride_bus_route_id_station_code'] = train.groupby(['date','bus_route_id','station_code'])['morning_ride'].transform('mean') 
# train['avg_noon_ride_bus_route_id_station_code'] = train.groupby(['date','bus_route_id','station_code'])['noon_ride'].transform('mean') 

# test['avg_dawn_ride_bus_route_id_station_code'] = test.groupby(['date','bus_route_id','station_code'])['dawn_ride'].transform('mean') 
# test['avg_morning_ride_bus_route_id_station_code'] = test.groupby(['date','bus_route_id','station_code'])['morning_ride'].transform('mean') 
# test['avg_noon_ride_bus_route_id_station_code'] = test.groupby(['date','bus_route_id','station_code'])['noon_ride'].transform('mean') 

# (3) station_code 기준
# train['avg_dawn_ride_station_code'] = train.groupby(['date','station_code'])['dawn_ride'].transform('mean') 
# train['avg_morning_ride_bus_station_code'] = train.groupby(['date','station_code'])['morning_ride'].transform('mean') 
# train['avg_noon_ride_station_code'] = train.groupby(['date','station_code'])['noon_ride'].transform('mean') 

# test['avg_dawn_ride_station_code'] = test.groupby(['date','station_code'])['dawn_ride'].transform('mean') 
# test['avg_morning_ride_bus_station_code'] = test.groupby(['date','station_code'])['morning_ride'].transform('mean') 
# test['avg_noon_ride_station_code'] = test.groupby(['date','station_code'])['noon_ride'].transform('mean')


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train # 승객에 대한 정보를 y_train 값으로 임시로 넣어줌 
    d = X_train.groupby(['station_code'])['label'].mean().to_dict() # 정류장별로 승객이 평균 몇 명 탔는지 
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.00336	valid's rmse: 2.52675
[200]	train's rmse: 1.78187	valid's rmse: 2.49772
[300]	train's rmse: 1.64639	valid's rmse: 2.47723
Early stopping, best iteration is:
[321]	train's rmse: 1.62216	valid's rmse: 2.4738
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.04421	valid's rmse: 2.32358
[200]	train's rmse: 1.8193	valid's rmse: 2.28644
[300]	train's rmse: 1.67692	valid's rmse: 2.26913
[400]	train's rmse: 1.57436	valid's rmse: 2.2538
[500]	train's rmse: 1.49306	valid's rmse: 2.24171
[600]	train's rmse: 1.42323	valid's rmse: 2.23465
[700]	train's rmse: 1.35851	valid's rmse: 2.23254
[800]	train's rmse: 1.30305	valid's rmse: 2.22767
Early stopping, best iteration is:
[814]	train's rmse: 1.29499	valid's rmse: 2.22617
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.01583	valid's rmse: 2.50339
[200]	train's rmse: 1.78231	valid's rmse: 2.45207
[300]	train's rmse: 1.64337	valid's rmse: 2.43175
[400]	train's rmse: 1.54414	valid's rmse: 2.42236
[500]	train's rmse: 1.46167	valid's rmse: 2.41846
[600]	train's rmse: 1.39314	valid's rmse: 2.41276
[700]	train's rmse: 1.33451	valid's rmse: 2.40836
[800]	train's rmse: 1.28564	valid's rmse: 2.40459
[900]	train's rmse: 1.23967	valid's rmse: 2.40028
[1000]	train's rmse: 1.19984	valid's rmse: 2.39861
[1100]	train's rmse: 1.16337	valid's rmse: 2.39758
[1200]	train's rmse: 1.1295	valid's rmse: 2.39471
[1300]	train's rmse: 1.09942	valid's rmse: 2.39336
Early stopping, best iteration is:
[1272]	train's rmse: 1.10728	valid's rmse: 2.39298
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.0207	valid's rmse: 2.53581
[200]	train's rmse: 1.79294	valid's rmse: 2.48558
[300]	train's rmse: 1.64926	valid's rmse: 2.4619
[400]	train's rmse: 1.54676	valid's rmse: 2.44907
[500]	train's rmse: 1.46669	valid's rmse: 2.44035
[600]	train's rmse: 1.39789	valid's rmse: 2.43475
[700]	train's rmse: 1.33807	valid's rmse: 2.43057
[800]	train's rmse: 1.28824	valid's rmse: 2.42792
[900]	train's rmse: 1.24086	valid's rmse: 2.42301
[1000]	train's rmse: 1.20028	valid's rmse: 2.41925
[1100]	train's rmse: 1.16345	valid's rmse: 2.41493
Early stopping, best iteration is:
[1113]	train's rmse: 1.15848	valid's rmse: 2.41409
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.01455	valid's rmse: 2.55595
[200]	train's rmse: 1.77955	valid's rmse: 2.50323
[300]	train's rmse: 1.644	valid's rmse: 2.47732
[400]	train's rmse: 1.54051	valid's rmse: 2.46135
[500]	train's rmse: 1.45935	valid's rmse: 2.45535
[600]	train's rmse: 1.39068	valid's rmse: 2.44851
[700]	train's rmse: 1.32873	valid's rmse: 2.44447
[800]	train's rmse: 1.27854	valid's rmse: 2.43948
Early stopping, best iteration is:
[837]	train's rmse: 1.26044	valid's rmse: 2.43763
================================================================================
<Light-GBM> OVERALL RMSE     : 2.3904698945031


# 날씨 정보 
df_weather = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/229255_bus_riders_at_rush_hour_data/jeju_weather_dataset', encoding='cp949')
df_weather = df_weather[['일시', '강수량(mm)']]
df_weather.columns = ['date', 'precipitation']

# date의 type을 string에서 datetime으로 변환
df_weather['date'] = pd.to_datetime( df_weather['date'] )

# 대회 기간에 해당하는 데이터만 사용하도록 함
df_weather = df_weather[(df_weather['date']>='2019-08-31 00:00:00')&(df_weather['date']<='2019-10-16 23:00:00')].reset_index(drop=True)

# 대회 규정상 해당 날짜의 15시까지 정보만 사용할 수 있음
df_weather['hour'] = df_weather['date'].dt.hour
df_weather['date'] = df_weather['date'].dt.date

# 전날의 강수량을 정보를 대입할 때 사용
df_prevday_weather = df_weather.groupby('date')['precipitation'].sum().reset_index()
df_prevday_weather.columns = ['prev_date', 'prevday_precipitation']

# 해당 날짜의 강수량을 구함 
df_weather = df_weather[df_weather['hour']<=15].reset_index(drop=True)

# 00~15시까지의 강수량을 피쳐로 사용 
df_weather = df_weather.groupby('date')['precipitation'].sum().reset_index()

# Train/Test-set과 join하기 위하여 column의 타입을 datetime으로 변환한다. 
df_prevday_weather['prev_date'] = pd.to_datetime( df_prevday_weather['prev_date'] )
df_weather['date'] = pd.to_datetime( df_weather['date'] )


# 전날짜에 대하여 Train/Test-set과 강수량 정보를 join

# Train/Test-set에 대하여 전날을 구함
train['prev_date'] = train['date'] - pd.Timedelta('1 day')
test['prev_date'] = test['date'] - pd.Timedelta('1 day')

train = pd.merge(train, df_prevday_weather , on ='prev_date',  how ='left')
test = pd.merge(test, df_prevday_weather , on ='prev_date',how ='left')

# prev_date 칼럼은 삭제해줌
train = train.drop('prev_date',1)
test = test.drop('prev_date',1)

# 해당 날짜에 대하여 Train/Test-set과 강수량 정보를 join
train = pd.merge( train, df_weather , on ='date', how='left')
test = pd.merge( test, df_weather , on ='date', how='left')


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.98892	valid's rmse: 2.50777
[200]	train's rmse: 1.75765	valid's rmse: 2.47466
[300]	train's rmse: 1.61907	valid's rmse: 2.46216
[400]	train's rmse: 1.52395	valid's rmse: 2.45348
[500]	train's rmse: 1.44409	valid's rmse: 2.45138
[600]	train's rmse: 1.37676	valid's rmse: 2.44578
[700]	train's rmse: 1.32027	valid's rmse: 2.44222
[800]	train's rmse: 1.26823	valid's rmse: 2.43823
[900]	train's rmse: 1.22221	valid's rmse: 2.43711
[1000]	train's rmse: 1.18108	valid's rmse: 2.43461
Early stopping, best iteration is:
[1022]	train's rmse: 1.17186	valid's rmse: 2.43394
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.02505	valid's rmse: 2.31029
[200]	train's rmse: 1.7867	valid's rmse: 2.26391
[300]	train's rmse: 1.64805	valid's rmse: 2.24587
[400]	train's rmse: 1.54749	valid's rmse: 2.23398
[500]	train's rmse: 1.4668	valid's rmse: 2.22306
[600]	train's rmse: 1.4016	valid's rmse: 2.21615
[700]	train's rmse: 1.34121	valid's rmse: 2.21269
Early stopping, best iteration is:
[696]	train's rmse: 1.34286	valid's rmse: 2.21179
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.99473	valid's rmse: 2.46048
[200]	train's rmse: 1.75938	valid's rmse: 2.40683
[300]	train's rmse: 1.62116	valid's rmse: 2.39111
[400]	train's rmse: 1.52165	valid's rmse: 2.37788
[500]	train's rmse: 1.43592	valid's rmse: 2.3626
[600]	train's rmse: 1.36932	valid's rmse: 2.35485
[700]	train's rmse: 1.31183	valid's rmse: 2.35108
[800]	train's rmse: 1.26333	valid's rmse: 2.34833
[900]	train's rmse: 1.21817	valid's rmse: 2.34526
Early stopping, best iteration is:
[938]	train's rmse: 1.20349	valid's rmse: 2.3439
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.00644	valid's rmse: 2.51324
[200]	train's rmse: 1.77555	valid's rmse: 2.46179
[300]	train's rmse: 1.6328	valid's rmse: 2.43791
[400]	train's rmse: 1.52822	valid's rmse: 2.41681
[500]	train's rmse: 1.44884	valid's rmse: 2.40623
[600]	train's rmse: 1.38006	valid's rmse: 2.39611
[700]	train's rmse: 1.32129	valid's rmse: 2.3922
[800]	train's rmse: 1.27157	valid's rmse: 2.3895
[900]	train's rmse: 1.22409	valid's rmse: 2.38567
[1000]	train's rmse: 1.18314	valid's rmse: 2.38441
Early stopping, best iteration is:
[1011]	train's rmse: 1.17847	valid's rmse: 2.38343
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.97821	valid's rmse: 2.50554
[200]	train's rmse: 1.75319	valid's rmse: 2.45653
[300]	train's rmse: 1.61256	valid's rmse: 2.43048
[400]	train's rmse: 1.51372	valid's rmse: 2.41741
[500]	train's rmse: 1.43713	valid's rmse: 2.41027
Early stopping, best iteration is:
[547]	train's rmse: 1.40401	valid's rmse: 2.40453
================================================================================
<Light-GBM> OVERALL RMSE     : 2.356796981864371


train.head()


# 해당 딕셔너리에 bus_route_id 별 정차 순서를 구하도록 함
bus_route_sequence = {}

# 모든 bus_route_id 수집 
combined = train.append(test, ignore_index=True)
all_bus_route_ids = set(combined['bus_route_id'])

for bus_route_id in tqdm_notebook( all_bus_route_ids ) :
    # bus_route_id별 station_code를 오름차순으로 순서매김함
    df_bus_route = combined[combined['bus_route_id']==bus_route_id]
    sorted_station_codes = np.unique(df_bus_route['station_code'])
    
    # dictionary에 해당 정류장이 몇번째 정차 정류장인지 기입
    bus_route_sequence[bus_route_id] = {station_code: ind for ind, station_code in enumerate( list(sorted_station_codes) )}


# 몇 번째 정류장인지를 피쳐로 생성 
train['nth_station'] = train[['bus_route_id', 'station_code']].apply(lambda x: bus_route_sequence.get(x[0]).get(x[1]), axis=1)
test['nth_station'] = test[['bus_route_id', 'station_code']].apply(lambda x: bus_route_sequence.get(x[0]).get(x[1]), axis=1)


# 해당 bus_route_id에는 몇 개의 정류장이 있는지 
bus_route_id_total_station_count_dict = combined.groupby('bus_route_id')['station_code'].nunique().to_dict()

train['bus_route_id_total_station_count'] = train['bus_route_id'].apply(lambda x : bus_route_id_total_station_count_dict.get(x))
test['bus_route_id_total_station_count'] = test['bus_route_id'].apply(lambda x : bus_route_id_total_station_count_dict.get(x))


# 뒤에서부터 몇 번째 정류장인지 
train['nth_station_backward'] = train['nth_station'] - train['bus_route_id_total_station_count']
test['nth_station_backward'] = test['nth_station'] - test['bus_route_id_total_station_count']


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75289	valid's rmse: 2.35176
Early stopping, best iteration is:
[137]	train's rmse: 1.64643	valid's rmse: 2.33721
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.79312	valid's rmse: 2.13689
[200]	train's rmse: 1.55592	valid's rmse: 2.11103
Early stopping, best iteration is:
[242]	train's rmse: 1.48991	valid's rmse: 2.10712
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75385	valid's rmse: 2.30077
[200]	train's rmse: 1.52184	valid's rmse: 2.27475
[300]	train's rmse: 1.39039	valid's rmse: 2.26805
Early stopping, best iteration is:
[298]	train's rmse: 1.39264	valid's rmse: 2.26774
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75804	valid's rmse: 2.33279
[200]	train's rmse: 1.52563	valid's rmse: 2.29234
Early stopping, best iteration is:
[246]	train's rmse: 1.45673	valid's rmse: 2.2902
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.76114	valid's rmse: 2.33683
[200]	train's rmse: 1.52142	valid's rmse: 2.30265
[300]	train's rmse: 1.38623	valid's rmse: 2.29356
Early stopping, best iteration is:
[308]	train's rmse: 1.37807	valid's rmse: 2.29245
================================================================================
<Light-GBM> OVERALL RMSE     : 2.260331683533115


# 중복되지 않는 위경도 값들을 수집함
combined = train[['latitude', 'longitude']].append(test[['latitude', 'longitude']])
combined = combined.drop_duplicates()

# kmeans를 통하여 군집화 
kmeans = KMeans(n_clusters = int(sqrt(len(combined))), random_state=1993)
kmeans.fit(combined)

train['station_code_kmeans'] = kmeans.predict(train[['latitude', 'longitude']])
test['station_code_kmeans'] = kmeans.predict(test[['latitude', 'longitude']])


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code', 'station_code_kmeans'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.74036	valid's rmse: 2.34768
Early stopping, best iteration is:
[139]	train's rmse: 1.63143	valid's rmse: 2.33906
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.77957	valid's rmse: 2.15445
[200]	train's rmse: 1.54419	valid's rmse: 2.14345
[300]	train's rmse: 1.40646	valid's rmse: 2.13406
Early stopping, best iteration is:
[311]	train's rmse: 1.39474	valid's rmse: 2.13285
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75307	valid's rmse: 2.30793
[200]	train's rmse: 1.52079	valid's rmse: 2.27812
Early stopping, best iteration is:
[192]	train's rmse: 1.53371	valid's rmse: 2.27501
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75665	valid's rmse: 2.30983
[200]	train's rmse: 1.52562	valid's rmse: 2.26954
[300]	train's rmse: 1.38872	valid's rmse: 2.26113
[400]	train's rmse: 1.29594	valid's rmse: 2.25728
[500]	train's rmse: 1.22455	valid's rmse: 2.25507
Early stopping, best iteration is:
[469]	train's rmse: 1.24543	valid's rmse: 2.25452
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.74818	valid's rmse: 2.35159
[200]	train's rmse: 1.5146	valid's rmse: 2.32137
[300]	train's rmse: 1.37806	valid's rmse: 2.30962
[400]	train's rmse: 1.28658	valid's rmse: 2.30433
Early stopping, best iteration is:
[401]	train's rmse: 1.2859	valid's rmse: 2.30412
================================================================================
<Light-GBM> OVERALL RMSE     : 2.2622008933054794


lgbm_param = {'objective': 'rmse',
             'boosting_type': 'gbdt',
             'random_state': 1993,
             'learning_rate': 0.1,
             'subsample': 0.7,
             'tree_learner': 'serial',
             'colsample_bytree': 0.78,
#              'early_stopping_rounds': 50,
             'subsample_freq': 1,
             'reg_lambda': 7,
             'reg_alpha': 5,
             'num_leaves': 96,
             'seed': 1993}


reg_model = lgbm.LGBMRegressor(**lgbm_param)

# RFE(Recursive Feature Elimination)
rfe = RFECV(estimator=reg_model, step=1, # 한 번 학습이 진행될 때마다 몇 개의 피처를 제거하고 싶으냐? 큰 숫자를 선택하면 학습시간 단축 
            cv=KFold(n_splits=5, shuffle=True, random_state=231), scoring='neg_mean_squared_error', verbose=2)
rfe.fit(train.drop(drop_cols,1), train_label)

Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 26 features.

RFECV(cv=KFold(n_splits=5, random_state=231, shuffle=True),
      estimator=LGBMRegressor(colsample_bytree=0.78, num_leaves=96,
                              objective='rmse', random_state=1993, reg_alpha=5,
                              reg_lambda=7, seed=1993, subsample=0.7,
                              subsample_freq=1, tree_learner='serial'),
      scoring='neg_mean_squared_error', verbose=2)


df_rank = pd.DataFrame(data = {'col' : list(train.drop(drop_cols,1)), 'imp' : rfe.ranking_}) # 랭킹이 1에 가까울수록 의미 있는 피처다 
use_cols = list(df_rank[df_rank['imp'] == 1]['col'])


lgbm_param = {'objective': 'rmse',
             'boosting_type': 'gbdt',
             'random_state': 1993,
             'learning_rate': 0.1,
             'subsample': 0.7,
             'tree_learner': 'serial',
             'colsample_bytree': 0.78,
             'early_stopping_rounds': 50,
             'subsample_freq': 1,
             'reg_lambda': 7,
             'reg_alpha': 5,
             'num_leaves': 96,
             'seed': 1993}


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)

# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train[use_cols], y_train)
    dvalid = lgbm.Dataset(X_valid[use_cols], y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code', 'station_code_kmeans'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid[use_cols])
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.75777	valid's rmse: 2.35688
[200]	train's rmse: 1.52646	valid's rmse: 2.34582
[300]	train's rmse: 1.39089	valid's rmse: 2.33528
Early stopping, best iteration is:
[302]	train's rmse: 1.38892	valid's rmse: 2.33496
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.7989	valid's rmse: 2.16992
[200]	train's rmse: 1.56383	valid's rmse: 2.15231
Early stopping, best iteration is:
[240]	train's rmse: 1.50043	valid's rmse: 2.14869
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.76513	valid's rmse: 2.32654
[200]	train's rmse: 1.53325	valid's rmse: 2.29655
[300]	train's rmse: 1.40288	valid's rmse: 2.28792
[400]	train's rmse: 1.31201	valid's rmse: 2.28522
[500]	train's rmse: 1.24289	valid's rmse: 2.28305
[600]	train's rmse: 1.18573	valid's rmse: 2.27828
Early stopping, best iteration is:
[639]	train's rmse: 1.16609	valid's rmse: 2.27801
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.77643	valid's rmse: 2.32143
[200]	train's rmse: 1.53992	valid's rmse: 2.27169
[300]	train's rmse: 1.40421	valid's rmse: 2.25443
[400]	train's rmse: 1.30902	valid's rmse: 2.24979
[500]	train's rmse: 1.23751	valid's rmse: 2.2473
[600]	train's rmse: 1.1799	valid's rmse: 2.24384
[700]	train's rmse: 1.13299	valid's rmse: 2.24388
Early stopping, best iteration is:
[651]	train's rmse: 1.15509	valid's rmse: 2.24292
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.76481	valid's rmse: 2.34269
[200]	train's rmse: 1.52873	valid's rmse: 2.30365
[300]	train's rmse: 1.39461	valid's rmse: 2.29107
Early stopping, best iteration is:
[308]	train's rmse: 1.38632	valid's rmse: 2.29066
================================================================================
<Light-GBM> OVERALL RMSE     : 2.259915030786093


# 모델에 쓰일 parameter 정의하기
n_splits= 5
NUM_BOOST_ROUND = 100000
SEED = 1993
lgbm_param = {'objective':'rmse',
              'boosting_type': 'gbdt',
              'random_state':1993,
              'learning_rate':0.01,
              'subsample':0.7,
              'tree_learner': 'serial',
              'colsample_bytree':0.68,
              'early_stopping_rounds':50,
              'subsample_freq': 1,
              'reg_lambda':7,
              'reg_alpha': 5,
              'num_leaves': 96,
              'seed' : SEED
            }

n_rounds = 100000
cat_params = {
        'n_estimators': n_rounds,
        'learning_rate': 0.08, # 0.005로 하면 폴드당 2시간 걸림 
        'eval_metric': 'RMSE', 
        'loss_function': 'RMSE',
        'random_seed': 42, 
        'metric_period': 500, # 메트릭스 평가 주기 
        'od_wait': 500, # 오버피팅을 방지 할 때 
        'task_type': 'GPU', # 무조건 GPU를 사용하는게 좋음. settings > accelerator를 GPU로 설정 
       'l2_leaf_reg' : 3, # 얼마나 많이 regularization을 줄것인가. 2~3
        'depth': 8, # 보통 5~10 정도 
    }


target_col = '18~20_ride'
drop_cols = ['date','id',target_col]
train_label = train[target_col]


# 형식을 맞춰주기 위해서 Test-set에 '18~20_ride' columns을 만들어줌
test[target_col] = np.NaN


# 각 모델에 대한 oof 정의
lgbm_oof_train = np.zeros((train.shape[0]))
lgbm_oof_test = np.zeros((test.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)


# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    test['station_code_te'] = test['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    
    # (5) Light GBM
    print("---TRAINING LIGHT GBM---")
    # dtrain/dvalid 정의
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)
    
    # model 정의&학습
    model = lgbm.train(lgbm_param , dtrain, NUM_BOOST_ROUND, 
                       valid_sets=(dtrain, dvalid), 
                       valid_names=('train','valid'), 
                       categorical_feature= ['bus_route_id','station_code', 'station_code_kmeans'],
                       verbose_eval= 100)
    
    # local_valid/local_test에 대한 예측
    lgbm_valid_pred = model.predict(X_valid)
    lgbm_test_pred = model.predict(test.drop(drop_cols, 1))
        
    lgbm_oof_train[val_ind] = lgbm_valid_pred
    lgbm_oof_test += lgbm_test_pred/ n_splits
    print('='*80)
    
print(f"<Light-GBM> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, lgbm_oof_train ) )}")

---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.02832	valid's rmse: 3.07656
[200]	train's rmse: 2.44893	valid's rmse: 2.61618
[300]	train's rmse: 2.20433	valid's rmse: 2.459
[400]	train's rmse: 2.07505	valid's rmse: 2.40168
[500]	train's rmse: 1.98787	valid's rmse: 2.37225
[600]	train's rmse: 1.92107	valid's rmse: 2.35463
[700]	train's rmse: 1.86691	valid's rmse: 2.34302
[800]	train's rmse: 1.82279	valid's rmse: 2.33718
[900]	train's rmse: 1.78382	valid's rmse: 2.33224
[1000]	train's rmse: 1.74746	valid's rmse: 2.32641
[1100]	train's rmse: 1.71501	valid's rmse: 2.32323
[1200]	train's rmse: 1.68576	valid's rmse: 2.31939
[1300]	train's rmse: 1.65859	valid's rmse: 2.31672
[1400]	train's rmse: 1.63323	valid's rmse: 2.31355
[1500]	train's rmse: 1.61027	valid's rmse: 2.31166
[1600]	train's rmse: 1.58846	valid's rmse: 2.31103
[1700]	train's rmse: 1.56825	valid's rmse: 2.30873
[1800]	train's rmse: 1.54876	valid's rmse: 2.30801
[1900]	train's rmse: 1.53052	valid's rmse: 2.30615
Early stopping, best iteration is:
[1901]	train's rmse: 1.53036	valid's rmse: 2.30614
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.0602	valid's rmse: 2.92058
[200]	train's rmse: 2.49002	valid's rmse: 2.44412
[300]	train's rmse: 2.2498	valid's rmse: 2.28688
[400]	train's rmse: 2.11524	valid's rmse: 2.21899
[500]	train's rmse: 2.02692	valid's rmse: 2.18198
[600]	train's rmse: 1.96029	valid's rmse: 2.16117
[700]	train's rmse: 1.90338	valid's rmse: 2.14572
[800]	train's rmse: 1.85608	valid's rmse: 2.13595
[900]	train's rmse: 1.814	valid's rmse: 2.12795
[1000]	train's rmse: 1.77657	valid's rmse: 2.12116
[1100]	train's rmse: 1.74299	valid's rmse: 2.11643
[1200]	train's rmse: 1.7123	valid's rmse: 2.11336
[1300]	train's rmse: 1.68407	valid's rmse: 2.10895
[1400]	train's rmse: 1.65797	valid's rmse: 2.106
[1500]	train's rmse: 1.6331	valid's rmse: 2.10387
[1600]	train's rmse: 1.61059	valid's rmse: 2.10236
[1700]	train's rmse: 1.5889	valid's rmse: 2.10005
[1800]	train's rmse: 1.5693	valid's rmse: 2.09801
[1900]	train's rmse: 1.55018	valid's rmse: 2.09565
[2000]	train's rmse: 1.53166	valid's rmse: 2.09422
[2100]	train's rmse: 1.51406	valid's rmse: 2.09279
[2200]	train's rmse: 1.49808	valid's rmse: 2.09074
[2300]	train's rmse: 1.48272	valid's rmse: 2.08948
[2400]	train's rmse: 1.46826	valid's rmse: 2.08842
[2500]	train's rmse: 1.45394	valid's rmse: 2.08739
[2600]	train's rmse: 1.44087	valid's rmse: 2.08656
Early stopping, best iteration is:
[2591]	train's rmse: 1.44206	valid's rmse: 2.08648
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.02987	valid's rmse: 3.11315
[200]	train's rmse: 2.45429	valid's rmse: 2.63787
[300]	train's rmse: 2.21383	valid's rmse: 2.46554
[400]	train's rmse: 2.08319	valid's rmse: 2.38673
[500]	train's rmse: 1.99474	valid's rmse: 2.34121
[600]	train's rmse: 1.92878	valid's rmse: 2.31435
[700]	train's rmse: 1.87421	valid's rmse: 2.2968
[800]	train's rmse: 1.82899	valid's rmse: 2.28744
[900]	train's rmse: 1.78902	valid's rmse: 2.27901
[1000]	train's rmse: 1.75319	valid's rmse: 2.2725
[1100]	train's rmse: 1.72081	valid's rmse: 2.26646
[1200]	train's rmse: 1.69108	valid's rmse: 2.26111
[1300]	train's rmse: 1.66418	valid's rmse: 2.257
[1400]	train's rmse: 1.63893	valid's rmse: 2.25445
[1500]	train's rmse: 1.61546	valid's rmse: 2.25114
[1600]	train's rmse: 1.59346	valid's rmse: 2.24926
[1700]	train's rmse: 1.57329	valid's rmse: 2.24786
[1800]	train's rmse: 1.554	valid's rmse: 2.24547
[1900]	train's rmse: 1.53555	valid's rmse: 2.24353
[2000]	train's rmse: 1.51814	valid's rmse: 2.24104
[2100]	train's rmse: 1.50178	valid's rmse: 2.23935
[2200]	train's rmse: 1.48642	valid's rmse: 2.23717
Early stopping, best iteration is:
[2226]	train's rmse: 1.48243	valid's rmse: 2.23657
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 3.01285	valid's rmse: 3.20948
[200]	train's rmse: 2.44406	valid's rmse: 2.7019
[300]	train's rmse: 2.20952	valid's rmse: 2.5168
[400]	train's rmse: 2.0783	valid's rmse: 2.43012
[500]	train's rmse: 1.99352	valid's rmse: 2.38218
[600]	train's rmse: 1.92875	valid's rmse: 2.34991
[700]	train's rmse: 1.8763	valid's rmse: 2.33044
[800]	train's rmse: 1.83172	valid's rmse: 2.31512
[900]	train's rmse: 1.79225	valid's rmse: 2.30426
[1000]	train's rmse: 1.75681	valid's rmse: 2.29425
[1100]	train's rmse: 1.72582	valid's rmse: 2.28926
[1200]	train's rmse: 1.69662	valid's rmse: 2.28237
[1300]	train's rmse: 1.66927	valid's rmse: 2.27698
[1400]	train's rmse: 1.64447	valid's rmse: 2.27193
[1500]	train's rmse: 1.62098	valid's rmse: 2.26841
[1600]	train's rmse: 1.59909	valid's rmse: 2.26439
[1700]	train's rmse: 1.57884	valid's rmse: 2.26032
[1800]	train's rmse: 1.55961	valid's rmse: 2.25677
[1900]	train's rmse: 1.54096	valid's rmse: 2.25487
[2000]	train's rmse: 1.5236	valid's rmse: 2.25187
[2100]	train's rmse: 1.50699	valid's rmse: 2.24896
[2200]	train's rmse: 1.49103	valid's rmse: 2.24703
[2300]	train's rmse: 1.47617	valid's rmse: 2.24501
[2400]	train's rmse: 1.46181	valid's rmse: 2.24327
[2500]	train's rmse: 1.44781	valid's rmse: 2.24184
[2600]	train's rmse: 1.43446	valid's rmse: 2.24091
[2700]	train's rmse: 1.42182	valid's rmse: 2.23904
[2800]	train's rmse: 1.4097	valid's rmse: 2.2378
[2900]	train's rmse: 1.39779	valid's rmse: 2.23601
[3000]	train's rmse: 1.38649	valid's rmse: 2.23537
[3100]	train's rmse: 1.37548	valid's rmse: 2.23405
[3200]	train's rmse: 1.36494	valid's rmse: 2.23236
[3300]	train's rmse: 1.35466	valid's rmse: 2.23181
Early stopping, best iteration is:
[3304]	train's rmse: 1.35426	valid's rmse: 2.23171
================================================================================
---TRAINING LIGHT GBM---
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 2.99377	valid's rmse: 3.23045
[200]	train's rmse: 2.43337	valid's rmse: 2.71394
[300]	train's rmse: 2.19939	valid's rmse: 2.53204
[400]	train's rmse: 2.07129	valid's rmse: 2.44952
[500]	train's rmse: 1.986	valid's rmse: 2.39908
[600]	train's rmse: 1.92161	valid's rmse: 2.37141
[700]	train's rmse: 1.86878	valid's rmse: 2.35128
[800]	train's rmse: 1.82458	valid's rmse: 2.33858
[900]	train's rmse: 1.78543	valid's rmse: 2.32872
[1000]	train's rmse: 1.74897	valid's rmse: 2.31906
[1100]	train's rmse: 1.71658	valid's rmse: 2.31134
[1200]	train's rmse: 1.68727	valid's rmse: 2.30474
[1300]	train's rmse: 1.65982	valid's rmse: 2.29863
[1400]	train's rmse: 1.63429	valid's rmse: 2.29387
[1500]	train's rmse: 1.61017	valid's rmse: 2.28881
[1600]	train's rmse: 1.58838	valid's rmse: 2.28518
[1700]	train's rmse: 1.56794	valid's rmse: 2.28271
[1800]	train's rmse: 1.54872	valid's rmse: 2.27911
[1900]	train's rmse: 1.53063	valid's rmse: 2.27727
[2000]	train's rmse: 1.51323	valid's rmse: 2.27499
[2100]	train's rmse: 1.49665	valid's rmse: 2.27284
[2200]	train's rmse: 1.48094	valid's rmse: 2.27168
[2300]	train's rmse: 1.46596	valid's rmse: 2.26997
[2400]	train's rmse: 1.45187	valid's rmse: 2.26905
[2500]	train's rmse: 1.43816	valid's rmse: 2.26857
[2600]	train's rmse: 1.42495	valid's rmse: 2.26721
[2700]	train's rmse: 1.41254	valid's rmse: 2.26589
[2800]	train's rmse: 1.40113	valid's rmse: 2.26527
[2900]	train's rmse: 1.38953	valid's rmse: 2.26442
[3000]	train's rmse: 1.37847	valid's rmse: 2.26289
[3100]	train's rmse: 1.36795	valid's rmse: 2.26221
[3200]	train's rmse: 1.35745	valid's rmse: 2.26139
[3300]	train's rmse: 1.34749	valid's rmse: 2.26099
[3400]	train's rmse: 1.3378	valid's rmse: 2.26073
[3500]	train's rmse: 1.32838	valid's rmse: 2.26008
Early stopping, best iteration is:
[3483]	train's rmse: 1.32989	valid's rmse: 2.25993
================================================================================
<Light-GBM> OVERALL RMSE     : 2.2253883326990787


# 각 모델에 대한 oof 정의
cat_oof_train = np.zeros((train.shape[0]))
cat_oof_test = np.zeros((test.shape[0]))

# Kfold 정의
kfolds = KFold(n_splits=n_splits, random_state=1993, shuffle=True)


# Fold별로 학습진행
for ind, (trn_ind, val_ind) in tqdm_notebook( enumerate( kfolds.split( X = train, y = train_label ) ) ):
    
    # Train/Valid-set을 정의하기
    X_train , y_train = train.iloc[trn_ind].drop(drop_cols, 1), train_label[trn_ind]
    X_valid , y_valid = train.iloc[val_ind].drop(drop_cols, 1), train_label[val_ind]
    
    # Target- Mean Encoding
    X_train['label'] = y_train
    d = X_train.groupby(['station_code'])['label'].mean().to_dict()
    X_train['station_code_te'] = X_train['station_code'].apply(lambda x: d.get(x))
    X_valid['station_code_te'] = X_valid['station_code'].apply(lambda x: d.get(x))
    test['station_code_te'] = test['station_code'].apply(lambda x: d.get(x))
    
    X_train= X_train.drop('label',1)
    
    # (5) CATBOOST
    print("---TRAINING CATBOOST---")
    
    # model 정의&학습
    model = CatBoostRegressor(**cat_params)
    
    model.fit( X_train, y_train, eval_set = (X_valid, y_valid), 
              cat_features  = ['bus_route_id','station_code', 'station_code_kmeans'],
              use_best_model=True,
              verbose=True)
    
    # local_valid/local_test에 대한 예측
    cat_valid_pred = model.predict(X_valid)
    cat_test_pred = model.predict(test.drop(drop_cols, 1))
        
    cat_oof_train[val_ind] = cat_valid_pred
    cat_oof_test += cat_test_pred/ n_splits
    print('='*80)
    
print(f"<CATBOOST> OVERALL RMSE     : {sqrt( mean_squared_error( train_label, cat_oof_train ) )}")

---TRAINING CATBOOST---
0:	learn: 4.5380824	test: 4.4306551	best: 4.4306551 (0)	total: 51.4ms	remaining: 1h 25m 37s
500:	learn: 1.9458630	test: 2.4234457	best: 2.4234457 (500)	total: 18.4s	remaining: 1h 51s
1000:	learn: 1.7374907	test: 2.3683804	best: 2.3683399 (999)	total: 35.7s	remaining: 58m 52s
1500:	learn: 1.6057637	test: 2.3434642	best: 2.3433077 (1490)	total: 55.4s	remaining: 1h 33s
2000:	learn: 1.5104954	test: 2.3334114	best: 2.3331079 (1992)	total: 1m 14s	remaining: 1h 1m 3s
2500:	learn: 1.4356077	test: 2.3268342	best: 2.3268342 (2500)	total: 1m 33s	remaining: 1h 38s
3000:	learn: 1.3728601	test: 2.3220920	best: 2.3220725 (2999)	total: 1m 51s	remaining: 59m 49s
3500:	learn: 1.3176372	test: 2.3196005	best: 2.3196005 (3500)	total: 2m 14s	remaining: 1h 1m 47s
4000:	learn: 1.2679179	test: 2.3186669	best: 2.3179452 (3758)	total: 2m 33s	remaining: 1h 1m 21s
4500:	learn: 1.2233283	test: 2.3157834	best: 2.3156541 (4486)	total: 2m 51s	remaining: 1h 40s
5000:	learn: 1.1817428	test: 2.3139396	best: 2.3137003 (4955)	total: 3m 9s	remaining: 1h
5500:	learn: 1.1435710	test: 2.3125356	best: 2.3124823 (5331)	total: 3m 27s	remaining: 59m 31s
6000:	learn: 1.1105757	test: 2.3122229	best: 2.3115961 (5675)	total: 3m 46s	remaining: 59m 2s
bestTest = 2.311596107
bestIteration = 5675
Shrink model to first 5676 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.5606310	test: 4.3299385	best: 4.3299385 (0)	total: 72.8ms	remaining: 2h 1m 16s
500:	learn: 1.9502860	test: 2.2283864	best: 2.2278009 (499)	total: 17.1s	remaining: 56m 42s
1000:	learn: 1.7326403	test: 2.1649495	best: 2.1649495 (1000)	total: 35.6s	remaining: 58m 38s
1500:	learn: 1.6062223	test: 2.1456226	best: 2.1456226 (1500)	total: 53s	remaining: 58m
2000:	learn: 1.5094705	test: 2.1333419	best: 2.1333354 (1997)	total: 1m 13s	remaining: 59m 56s
2500:	learn: 1.4362050	test: 2.1259394	best: 2.1258195 (2485)	total: 1m 31s	remaining: 59m 12s
3000:	learn: 1.3701407	test: 2.1206459	best: 2.1206111 (2986)	total: 1m 48s	remaining: 58m 39s
3500:	learn: 1.3168496	test: 2.1159085	best: 2.1158653 (3456)	total: 2m 6s	remaining: 58m 16s
4000:	learn: 1.2666220	test: 2.1130611	best: 2.1130342 (3988)	total: 2m 24s	remaining: 57m 58s
4500:	learn: 1.2218785	test: 2.1104105	best: 2.1102724 (4473)	total: 2m 43s	remaining: 57m 40s
5000:	learn: 1.1821355	test: 2.1087003	best: 2.1085766 (4982)	total: 3m 1s	remaining: 57m 24s
5500:	learn: 1.1451785	test: 2.1069246	best: 2.1069234 (5499)	total: 3m 19s	remaining: 57m 9s
6000:	learn: 1.1098986	test: 2.1061286	best: 2.1060720 (5992)	total: 3m 38s	remaining: 56m 58s
6500:	learn: 1.0784733	test: 2.1053291	best: 2.1052393 (6461)	total: 3m 57s	remaining: 56m 49s
7000:	learn: 1.0492986	test: 2.1046939	best: 2.1045145 (6882)	total: 4m 15s	remaining: 56m 39s
7500:	learn: 1.0222655	test: 2.1049285	best: 2.1044806 (7229)	total: 4m 37s	remaining: 57m
bestTest = 2.104480622
bestIteration = 7229
Shrink model to first 7230 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.5317548	test: 4.4867464	best: 4.4867464 (0)	total: 62.3ms	remaining: 1h 43m 51s
500:	learn: 1.9579903	test: 2.3418965	best: 2.3418965 (500)	total: 17.4s	remaining: 57m 43s
1000:	learn: 1.7426254	test: 2.2779696	best: 2.2779324 (996)	total: 34.7s	remaining: 57m 13s
1500:	learn: 1.6108901	test: 2.2524855	best: 2.2523982 (1499)	total: 52.1s	remaining: 56m 56s
2000:	learn: 1.5137670	test: 2.2393605	best: 2.2393192 (1996)	total: 1m 9s	remaining: 56m 48s
2500:	learn: 1.4350160	test: 2.2303720	best: 2.2303720 (2500)	total: 1m 27s	remaining: 56m 40s
3000:	learn: 1.3697180	test: 2.2240323	best: 2.2238934 (2992)	total: 1m 44s	remaining: 56m 28s
3500:	learn: 1.3135203	test: 2.2190919	best: 2.2190630 (3499)	total: 2m 2s	remaining: 56m 25s
4000:	learn: 1.2640887	test: 2.2157533	best: 2.2155969 (3836)	total: 2m 20s	remaining: 56m 19s
4500:	learn: 1.2188958	test: 2.2143293	best: 2.2142076 (4473)	total: 2m 41s	remaining: 57m 1s
5000:	learn: 1.1769693	test: 2.2127681	best: 2.2126931 (4987)	total: 2m 59s	remaining: 56m 46s
5500:	learn: 1.1393745	test: 2.2111695	best: 2.2111040 (5492)	total: 3m 17s	remaining: 56m 35s
6000:	learn: 1.1054532	test: 2.2103737	best: 2.2101124 (5927)	total: 3m 36s	remaining: 56m 25s
6500:	learn: 1.0748606	test: 2.2093511	best: 2.2092954 (6496)	total: 3m 54s	remaining: 56m 12s
7000:	learn: 1.0458828	test: 2.2086489	best: 2.2085531 (6989)	total: 4m 13s	remaining: 56m 2s
bestTest = 2.208553133
bestIteration = 6989
Shrink model to first 6990 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.4939595	test: 4.6255407	best: 4.6255407 (0)	total: 50.5ms	remaining: 1h 24m 5s
500:	learn: 1.9440616	test: 2.3928196	best: 2.3926411 (499)	total: 17.5s	remaining: 57m 47s
1000:	learn: 1.7339401	test: 2.3241139	best: 2.3241139 (1000)	total: 37.5s	remaining: 1h 1m 52s
1500:	learn: 1.5988560	test: 2.2907101	best: 2.2907101 (1500)	total: 55s	remaining: 1h 11s
2000:	learn: 1.5021932	test: 2.2749802	best: 2.2749802 (2000)	total: 1m 12s	remaining: 59m 13s
2500:	learn: 1.4257824	test: 2.2665642	best: 2.2665642 (2500)	total: 1m 30s	remaining: 58m 40s
3000:	learn: 1.3612599	test: 2.2616051	best: 2.2615572 (2999)	total: 1m 48s	remaining: 58m 17s
3500:	learn: 1.3022347	test: 2.2583020	best: 2.2582232 (3498)	total: 2m 6s	remaining: 57m 58s
4000:	learn: 1.2524294	test: 2.2567844	best: 2.2567844 (4000)	total: 2m 24s	remaining: 57m 43s
4500:	learn: 1.2057899	test: 2.2533363	best: 2.2530857 (4429)	total: 2m 42s	remaining: 57m 27s
5000:	learn: 1.1652112	test: 2.2519719	best: 2.2517978 (4940)	total: 3m	remaining: 57m 12s
5500:	learn: 1.1289520	test: 2.2511565	best: 2.2509896 (5419)	total: 3m 19s	remaining: 57m 1s
6000:	learn: 1.0966734	test: 2.2506204	best: 2.2502090 (5943)	total: 3m 37s	remaining: 56m 51s
6500:	learn: 1.0648663	test: 2.2491292	best: 2.2490359 (6351)	total: 3m 58s	remaining: 57m 13s
7000:	learn: 1.0356337	test: 2.2488041	best: 2.2486141 (6918)	total: 4m 17s	remaining: 56m 58s
7500:	learn: 1.0097624	test: 2.2483700	best: 2.2478296 (7255)	total: 4m 36s	remaining: 56m 45s
bestTest = 2.247829629
bestIteration = 7255
Shrink model to first 7256 iterations.
================================================================================
---TRAINING CATBOOST---
0:	learn: 4.4609746	test: 4.7316398	best: 4.7316398 (0)	total: 119ms	remaining: 3h 17m 53s
500:	learn: 1.9311151	test: 2.4198401	best: 2.4198401 (500)	total: 21.1s	remaining: 1h 9m 49s
1000:	learn: 1.7206497	test: 2.3558822	best: 2.3558822 (1000)	total: 38.5s	remaining: 1h 3m 23s
1500:	learn: 1.5928112	test: 2.3273654	best: 2.3273654 (1500)	total: 55.9s	remaining: 1h 1m 7s
2000:	learn: 1.4977208	test: 2.3166917	best: 2.3166917 (2000)	total: 1m 13s	remaining: 59m 51s
2500:	learn: 1.4205029	test: 2.3103078	best: 2.3103078 (2500)	total: 1m 30s	remaining: 59m 4s
3000:	learn: 1.3558684	test: 2.3049002	best: 2.3047658 (2995)	total: 1m 51s	remaining: 59m 57s
3500:	learn: 1.3017111	test: 2.3025570	best: 2.3024489 (3487)	total: 2m 9s	remaining: 59m 24s
4000:	learn: 1.2525947	test: 2.3015429	best: 2.3015032 (3999)	total: 2m 27s	remaining: 58m 55s
4500:	learn: 1.2069589	test: 2.3015803	best: 2.3011106 (4316)	total: 2m 45s	remaining: 58m 29s
bestTest = 2.301110649
bestIteration = 4316
Shrink model to first 4317 iterations.
================================================================================
<CATBOOST> OVERALL RMSE     : 2.235972462865775


# 제출 파일 만들기
ensemble_pred = 0.5 * ( lgbm_oof_test+ cat_oof_test )
sample_submission[target_col] = np.clip( ensemble_pred, 0 , max(ensemble_pred) )


# Train-set의 실제값과 예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( train_label, color='r' , label='real')
sns.distplot( 0.5*(lgbm_oof_train + cat_oof_train), color='b', label='prediction' )
plt.legend()
plt.title("Real Vs Prediction");


# Train-set/Test-set의  예측값 비교
plt.figure(figsize=(12,6))

sns.distplot( 0.5*(lgbm_oof_train + cat_oof_train), color='r' , label='Train')
sns.distplot( ensemble_pred, color='b', label='Test' )
plt.legend()
plt.title("Prediction for Train/Test-set");


from IPython.display import FileLink

sample_submission.to_csv('lgbm_catboost_ensemble.csv', index=False)


FileLink('lgbm_catboost_ensemble.csv')

[Kaggle][필사] Wallmart_Sales_Top_3___EDA_Feature_Engineering (0)	2022.12.28
[Kaggle][필사] Future Sales Prediction: playground (0)	2022.12.26
The Boston Housing Dataset (0)	2022.11.26
customer churn prediction 2020 (0)	2022.11.18
house prices advanced regression techniques (0)	2022.11.18

hayley

bus passenger prediction

Load Dataset¶

데이터 이해하기¶

1. 데이터의 사이즈는?¶

2. Train/Test는 어떻게 분리되어 있는가?¶

1. id에 차이가 있는가?¶

2. date에 차이가 있는가?¶

3. 정류장에 차이가 있는가?¶

3. Missing Value는 존재하는가?¶

4. Target Variable의 분포는?¶

1. Target Variable이 0인 데이터는 어떤 특징을 가지고 있는가?¶

5. 데이터의 특이한/주목해야할 부분은?¶

1. 같은 정류장 이름이 여러 번 나오는 경우?¶

1.1 고유한 정거장의 기준은?¶

station_name를 기준으로 삼는다면?¶

station_code는 어떨까?¶

Validation 전략¶

LightGBM을 통하여 모델링 하기¶

BASELINE MODEL 만들기¶

모델 학습에 있어 중요한 피쳐는 무엇인가?!¶

linear model¶

Tree-based Model¶

PDP PLOT¶

엔티티 개념을 활용한 피쳐 생성¶

날짜¶

버스 노선¶

정류장¶

Feature 선택/제거¶

Test-set 값 예측 (Ensemble)¶

제출 파일 만들기¶

'kaggle' 카테고리의 다른 글

+ Recent posts

티스토리툴바

	id	date	in_out	station_code	station_name	latitude	longitude	6~7_ride	7~8_ride	8~9_ride	9~10_ride	10~11_ride	11~12_ride	18~20_ride
0	0	2019-09-01	1	321	1481	33.48990	126.49373	0.0	1.0	2.0	5.0	2.0	6.0	0.0
1	1	2019-09-01	1	334	1822	33.48944	126.48508	1.0	4.0	4.0	2.0	5.0	6.0	5.0
2	2	2019-09-01	1	407	1406	33.48181	126.47352	1.0	1.0	0.0	2.0	0.0	0.0	2.0

	id	date	bus_route_id	in_out	station_code	station_name	latitude	longitude	6~7_ride	7~8_ride	8~9_ride	9~10_ride	10~11_ride	11~12_ride	11~12_takeoff
0	415423	2019-10-01	4270000	시외	344	제주썬호텔	33.48990	126.49373	4.0	4.0	7.0	2.0	9.0	1.0	1.0
1	415424	2019-10-01	4270000	시외	357	한라병원	33.48944	126.48508	1.0	6.0	6.0	1.0	8.0	11.0	0.0
2	415425	2019-10-01	4270000	시외	432	정존마을	33.48181	126.47352	2.0	4.0	2.0	2.0	2.0	1.0	0.0

	station_code	station_name	latitude	longitude
0	341	1001	33.51912	126.63493
1	787	1001	33.51037	126.63498
2	789	1001	33.51734	126.63563
3	340	1001	33.51957	126.63523
4	873	203	33.36658	126.28713
5	867	203	33.35763	126.29492
6	2703	1001	33.51493	126.63479
7	2704	1001	33.51500	126.63504
8	661	203	33.35743	126.29466
9	786	1001	33.51007	126.63523
10	874	203	33.36667	126.28690
11	788	1001	33.51695	126.63535
12	872	203	33.36255	126.29223
13	870	203	33.36098	126.29807
14	869	203	33.36053	126.29786

	col	imp
0	bus_route_id	3934
1	noon_ride	3515
2	morning_ride	3322
3	longitude	2773
4	latitude	2764
5	station_name	2747
6	dawn_ride	2726
7	station_code	2702
8	morning_takeoff	2291
9	noon_takeoff	2101
10	dawn_takeoff	1295
11	in_out	40

	id	date	in_out	station_code	station_name	latitude	longitude	18~20_ride	dawn_ride	morning_ride	noon_ride	morning_takeoff	weekday	avg_dawn_ride_bus_route_id	avg_morning_ride_bus_route_id	avg_noon_ride_bus_route_id	avg_dawn_takeoff_bus_route_id	avg_morning_takeoff_bus_route_id	avg_noon_takeoff_bus_route_id	precipitation
0	0	2019-09-01	1	322	1489	33.48990	126.49373	0.0	1.0	7.0	8.0	0.0	6	1.28	2.24	2.72	0.16	0.32	1.88	0.6
1	1	2019-09-01	1	335	1831	33.48944	126.48508	5.0	5.0	6.0	11.0	0.0	6	1.28	2.24	2.72	0.16	0.32	1.88	0.6
2	2	2019-09-01	1	408	1413	33.48181	126.47352	2.0	2.0	2.0	0.0	0.0	6	1.28	2.24	2.72	0.16	0.32	1.88	0.6
3	3	2019-09-01	0	1448	1438	33.50577	126.49252	53.0	17.0	32.0	30.0	0.0	6	1.28	2.24	2.72	0.16	0.32	1.88	0.6
4	4	2019-09-01	0	1510	1583	33.25579	126.41260	0.0	0.0	0.0	0.0	1.0	6	1.28	2.24	2.72	0.16	0.32	1.88	0.6