In [ ]:
import multiprocessing
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
In [ ]:
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = pd.read_csv('https://raw.githubusercontent.com/minaahayley/kaggle/main/The%20Boston%20Housing%20Dataset/housing.csv', header=None, delimiter=r"\s+", names=column_names)
In [ ]:
data.head(5)
Out[ ]:
 | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
Data Cleaning
In [ ]:
data.shape
Out[ ]:
(506, 14)
In [ ]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB
The Boston Housing Dataset
- CRIM - per capita crime rate by town
- ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS - proportion of non-retail business acres per town.
- CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
- NOX - nitric oxides concentration (parts per 10 million)
- RM - average number of rooms per dwelling
- AGE - proportion of owner-occupied units built prior to 1940
- DIS - weighted distances to five Boston employment centres
- RAD - index of accessibility to radial highways
- TAX - full-value property-tax rate per $10,000
- PTRATIO - pupil-teacher ratio by town
- B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT - % lower status of the population
- MEDV - Median value of owner-occupied homes in $1000's
In [ ]:
data.describe()
Out[ ]:
 | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 |
mean | 3.613524 | 11.363636 | 11.136779 | 0.069170 | 0.554695 | 6.284634 | 68.574901 | 3.795043 | 9.549407 | 408.237154 | 18.455534 | 356.674032 | 12.653063 | 22.532806 |
std | 8.601545 | 23.322453 | 6.860353 | 0.253994 | 0.115878 | 0.702617 | 28.148861 | 2.105710 | 8.707259 | 168.537116 | 2.164946 | 91.294864 | 7.141062 | 9.197104 |
min | 0.006320 | 0.000000 | 0.460000 | 0.000000 | 0.385000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 12.600000 | 0.320000 | 1.730000 | 5.000000 |
25% | 0.082045 | 0.000000 | 5.190000 | 0.000000 | 0.449000 | 5.885500 | 45.025000 | 2.100175 | 4.000000 | 279.000000 | 17.400000 | 375.377500 | 6.950000 | 17.025000 |
50% | 0.256510 | 0.000000 | 9.690000 | 0.000000 | 0.538000 | 6.208500 | 77.500000 | 3.207450 | 5.000000 | 330.000000 | 19.050000 | 391.440000 | 11.360000 | 21.200000 |
75% | 3.677083 | 12.500000 | 18.100000 | 0.000000 | 0.624000 | 6.623500 | 94.075000 | 5.188425 | 24.000000 | 666.000000 | 20.200000 | 396.225000 | 16.955000 | 25.000000 |
max | 88.976200 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 37.970000 | 50.000000 |
Outlier Removal
In [ ]:
fig, axs = plt.subplots(figsize=(24, 10), ncols=7, nrows=2)
features = data.columns
for i, feature in enumerate(features):
    row = i // 7
    col = i % 7
    sns.boxplot(y=feature, data=data, ax=axs[row][col])
In [ ]:
# Drop the extreme outliers spotted in the boxplots (thresholds read off the plots)
data = data[~(data['CRIM'] >= 60)]
data = data[~(data['ZN'] >= 60)]
data = data[~(data['B'] <= 200)]
data = data[~(data['MEDV'] >= 50)]  # MEDV is capped at 50, so values of 50 are censored
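The thresholds above were read off the boxplots by eye. A more systematic alternative is the 1.5 * IQR whisker rule; the sketch below is hypothetical (not part of the original run) and stricter than the manual cut-offs, so it is left commented out rather than applied:
In [ ]:
# Hypothetical alternative: flag rows outside the 1.5 * IQR whiskers of any column
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
iqr = q3 - q1
inside = ~((data < (q1 - 1.5 * iqr)) | (data > (q3 + 1.5 * iqr))).any(axis=1)
print('rows kept: {} / {}'.format(inside.sum(), len(data)))
# data = data[inside]  # would drop far more rows than the manual filters above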
Data Distributions
- Heavily skewed features can also degrade regression performance.
- Check the skew of every numeric feature with skew(), as sketched below.
- A value of 1 or more indicates strong skew.
- Highly skewed features should be log-transformed; categorical features are then one-hot encoded.
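The notebook describes this check but never runs it; a minimal sketch with pandas:
In [ ]:
# Skew of every numeric feature (hypothetical cell, not run in the original notebook)
skewness = data.skew().sort_values(ascending=False)
print(skewness)
# A log transform softens a right-skewed feature, e.g.:
# data['CRIM'] = np.log1p(data['CRIM'])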
In [ ]:
data.hist(figsize=(20, 20), bins=50)
plt.show()
Correlation
In [ ]:
plt.figure(figsize=(16, 16))
sns.heatmap(data.corr(),
            linewidths=0.1,
            square=True,
            annot=True,
            fmt='.2f',
            cmap='Blues')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f478c821150>
In [ ]:
fig, axs = plt.subplots(figsize=(20, 20), ncols=4, nrows=4)
features = data.drop('MEDV', axis=1).describe().columns
for i, feature in enumerate(features):
    row = i // 4
    col = i % 4
    sns.regplot(y='MEDV', x=feature, data=data, ax=axs[row][col])
Modeling
In [ ]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
Linear Regression
- intercept_ : the estimated intercept term (w0)
- coef_ : the estimated weight vector (w1, w2, w3, ...); inspected right after fitting below
In [ ]:
X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # no random_state: scores vary run to run
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
In [ ]:
lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)
print('Train score : {}'.format(lr_reg.score(X_train, y_train)))
print('Test score : {}'.format(lr_reg.score(X_test, y_test)))
Train score : 0.7692273559306411
Test score : 0.7601485300868639
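The intercept_ and coef_ attributes called out in the bullets above can be read straight off the fitted model (a quick sketch, not part of the original run; values vary with the random split):
In [ ]:
# Inspect the fitted parameters (hypothetical cell; values depend on the split)
print('intercept_ (w0):', lr_reg.intercept_)
print('coef_ (w1..w13):')
print(pd.Series(lr_reg.coef_, index=X_train.columns).round(3))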
- Validate the model with cross-validation.
- Use NMSE (Negative Mean Squared Error) as the scoring metric for model error.
In [ ]:
estimator = make_pipeline(
    StandardScaler(),
    LinearRegression()
)
# cross_val_score does its own cross-validation, so train_test_split() is not needed!
nmse = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nmse
rmse = np.sqrt(mse)
print('NMSE scores: {}'.format(nmse))
print('NMSE scores mean: {}'.format(nmse.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))
NMSE scores: [ -7.90343922 -13.43802524 -29.71478895 -26.44629372 -20.4599371 ]
NMSE scores mean: -19.59249684665766
RMSE scores : [2.81130561 3.66579122 5.45112731 5.14259601 4.5232662 ]
RMSE scores mean: 4.318817268412426
Ridge Regression
- Ridge regression is a linear model that improves on plain linear regression.
- It is like linear regression, except it also keeps the magnitudes of the weights as small as possible.
- This imposes regularization: each feature's influence on the output is deliberately constrained.
- Regularization guards against multicollinearity and thereby helps prevent overfitting.
- Multicollinearity arises when two features are correlated almost to the point of being identical.
- Because ridge constrains the weights, its training score can be lower than plain linear regression's.
- Its generalization is better, so its test score tends to be higher.
- Tune alpha, the parameter controlling regularization strength, and observe how performance changes (a quick sweep is sketched below).
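The sweep below makes the shrinkage visible: as alpha grows, the weight norm falls. It is a sketch reusing the scaled split from the previous section, not part of the original run:
In [ ]:
# Watch the weight norm shrink as alpha grows (hypothetical cell)
for alpha in [0.01, 0.1, 1.0, 10.0, 100.0]:
    r = Ridge(alpha=alpha).fit(X_train_scale, y_train)
    print('alpha={:>6}: ||w|| = {:.3f}, test R^2 = {:.3f}'.format(
        alpha, np.linalg.norm(r.coef_), r.score(X_test_scale, y_test)))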
In [ ]:
X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
In [ ]:
ridge_reg = Ridge(alpha=0.2)
ridge_reg.fit(X_train, y_train)
print('Train score : {}'.format(ridge_reg.score(X_train, y_train)))
print('Test score : {}'.format(ridge_reg.score(X_test, y_test)))
Train score : 0.7974029332754818
Test score : 0.5967890252716412
In [ ]:
estimator = make_pipeline(
    StandardScaler(),
    Ridge(alpha=0.2)
)
# cross_val_score does its own cross-validation, so train_test_split() is not needed!
nmse = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nmse
rmse = np.sqrt(mse)
print('NMSE scores: {}'.format(nmse))
print('NMSE scores mean: {}'.format(nmse.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))
NMSE scores: [ -7.89999042 -13.43927207 -29.75215989 -26.43017164 -20.39878673]
NMSE scores mean: -19.58407615161176
RMSE scores : [2.81069216 3.66596128 5.45455405 5.14102827 4.5165016 ]
RMSE scores mean: 4.317747471315019
In [ ]:
# Let's try polynomial regression with L2 regularization, searching over the degree for the best fit
pipe = Pipeline([('scaler', StandardScaler()),
                 ('poly', PolynomialFeatures()),
                 ('model', Ridge())
                 ])
param_grid = [{
    'model__alpha': [1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1.0, 5.0, 10.0],
    'poly__degree': [2, 3, 4, 5]}]
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)
gs.fit(X, y)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Out[ ]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('poly', PolynomialFeatures()),
                                       ('model', Ridge())]),
             n_jobs=2,
             param_grid=[{'model__alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1.0, 5.0, 10.0],
                          'poly__degree': [2, 3, 4, 5]}],
             scoring='neg_mean_squared_error', verbose=True)
In [ ]:
print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))
{'model__alpha': 10.0, 'poly__degree': 2}
GridSearchCV best score: -16.240601902637287
In [ ]:
model = gs.best_estimator_
model.fit(X_train, y_train)  # refit on the training split so the test score below is held out
print('Train score : {}'.format(model.score(X_train, y_train)))
print('Test score : {}'.format(model.score(X_test, y_test)))
Train score : 0.935980322533816
Test score : 0.8149177039789857
Regression Trees
In [ ]:
X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
In [ ]:
rf_reg = RandomForestRegressor(n_estimators=500)
rf_reg.fit(X_train, y_train)
print(rf_reg.__class__.__name__)
print('Train score : {}'.format(rf_reg.score(X_train, y_train)))
print('Test score : {}'.format(rf_reg.score(X_test, y_test)))

gbm_reg = GradientBoostingRegressor(n_estimators=500)
gbm_reg.fit(X_train.values, y_train.values)
print(gbm_reg.__class__.__name__)
print('Train score : {}'.format(gbm_reg.score(X_train.values, y_train.values)))
print('Test score : {}'.format(gbm_reg.score(X_test.values, y_test.values)))

xgb_reg = XGBRegressor(n_estimators=500)
xgb_reg.fit(X_train, y_train)
print(xgb_reg.__class__.__name__)
print('Train score : {}'.format(xgb_reg.score(X_train, y_train)))
print('Test score : {}'.format(xgb_reg.score(X_test, y_test)))

lgbm_reg = LGBMRegressor(n_estimators=500)
lgbm_reg.fit(X_train, y_train)
print(lgbm_reg.__class__.__name__)
print('Train score : {}'.format(lgbm_reg.score(X_train, y_train)))
print('Test score : {}'.format(lgbm_reg.score(X_test, y_test)))
RandomForestRegressor
Train score : 0.9774266949165727
Test score : 0.9090184770195461
GradientBoostingRegressor
Train score : 0.9994073221592501
Test score : 0.903574495447218
[00:25:56] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor
Train score : 0.9986843588761848
Test score : 0.8960593693909321
LGBMRegressor
Train score : 0.9983850695994405
Test score : 0.9021560029616011
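Tree ensembles also expose feature_importances_, which helps explain where these scores come from. A sketch using the rf_reg fitted above (not part of the original notebook):
In [ ]:
# Which features drive the random forest (hypothetical cell)
importances = pd.Series(rf_reg.feature_importances_, index=X_train.columns)
importances.sort_values().plot(kind='barh', figsize=(8, 6))
plt.title('RandomForestRegressor feature importances')
plt.show()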
In [ ]:
estimator = make_pipeline(
    StandardScaler(),
    GradientBoostingRegressor(n_estimators=500)
)
# cross_val_score does its own cross-validation, so train_test_split() is not needed!
nmse = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nmse
rmse = np.sqrt(mse)
print('NMSE scores: {}'.format(nmse))
print('NMSE scores mean: {}'.format(nmse.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))
NMSE scores: [ -9.6284338 -9.21428619 -19.89510365 -12.6626706 -12.47762058]
NMSE scores mean: -12.775622963814307
RMSE scores : [3.10297177 3.03550427 4.46039277 3.55846464 3.53236756]
RMSE scores mean: 3.537940202297387
In [ ]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingRegressor())
])
param_grid = [{'model__n_estimators': [100, 200],
               'model__learning_rate': [0.1, 0.05, 0.02],
               'model__max_depth': [2, 4, 6],
               'model__min_samples_leaf': [3, 5, 9]}]
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)
gs.fit(X, y)
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Out[ ]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', GradientBoostingRegressor())]),
             n_jobs=2,
             param_grid=[{'model__learning_rate': [0.1, 0.05, 0.02],
                          'model__max_depth': [2, 4, 6],
                          'model__min_samples_leaf': [3, 5, 9],
                          'model__n_estimators': [100, 200]}],
             scoring='neg_mean_squared_error', verbose=True)
In [ ]:
print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))
{'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__min_samples_leaf': 3, 'model__n_estimators': 200}
GridSearchCV best score: -11.952847265529417
In [ ]:
model = gs.best_estimator_
model.fit(X_train, y_train)
print('Train score : {}'.format(model.score(X_train, y_train)))
print('Test score : {}'.format(model.score(X_test, y_test)))
Train score : 0.989011848031319
Test score : 0.901290256127608
Nearest Neighbors
In [ ]:
X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
In [ ]:
model = KNeighborsRegressor()
model.fit(X_train, y_train)
print('Train score : {}'.format(model.score(X_train, y_train)))
print('Test score : {}'.format(model.score(X_test, y_test)))
Train score : 0.6849552608478233
Test score : 0.5742663848300056
In [ ]:
estimator = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
)
# cross_val_score does its own cross-validation, so train_test_split() is not needed!
nmse = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nmse
rmse = np.sqrt(mse)
print('NMSE scores: {}'.format(nmse))
print('NMSE scores mean: {}'.format(nmse.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))
NMSE scores: [-13.30320482 -17.21877108 -33.15777831 -10.2794878 -17.13891707]
NMSE scores mean: -18.21963181898325
RMSE scores : [3.64735587 4.14955071 5.75827911 3.20616403 4.13991752]
RMSE scores mean: 4.180253447759364
In [ ]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsRegressor())
])
param_grid = [{'model__n_neighbors': [3, 5, 7],
               'model__weights': ['uniform', 'distance'],
               'model__algorithm': ['ball_tree', 'kd_tree', 'brute']}]
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)
gs.fit(X, y)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Out[ ]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', KNeighborsRegressor())]),
             n_jobs=2,
             param_grid=[{'model__algorithm': ['ball_tree', 'kd_tree', 'brute'],
                          'model__n_neighbors': [3, 5, 7],
                          'model__weights': ['uniform', 'distance']}],
             scoring='neg_mean_squared_error', verbose=True)
In [ ]:
print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))
{'model__algorithm': 'ball_tree', 'model__n_neighbors': 5, 'model__weights': 'distance'}
GridSearchCV best score: -17.805454212220518
In [ ]:
model = gs.best_estimator_
model.fit(X_train, y_train)
print('Train score : {}'.format(model.score(X_train, y_train)))
print('Test score : {}'.format(model.score(X_test, y_test)))
Train score : 1.0
Test score : 0.7639422096383729
(With weights='distance', each training point predicts its own target exactly, hence the perfect training score.)
Support Vector Machine
In [ ]:
X = data.drop('MEDV', axis=1)
y = data['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
In [ ]:
svr = SVR()  # default RBF kernel on unscaled features; SVR is highly scale-sensitive
svr.fit(X_train, y_train)
print('SVR train score : {}'.format(svr.score(X_train, y_train)))
print('SVR test score : {}'.format(svr.score(X_test, y_test)))
SVR train score : 0.24123466543839422
SVR test score : 0.25537782133048303
In [ ]:
# Note: the valid kernel name is 'poly', not 'polynomial'; the invalid entry
# is what produces the nan score in the warning below.
param_grid = [{'kernel': ['rbf', 'polynomial', 'sigmoid', 'linear']}]
gs = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)
gs.fit(X, y)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py:972: UserWarning: One or more of the test scores are non-finite: [-45.01455364 nan -62.30917282 -20.83770115]
  category=UserWarning,
Out[ ]:
GridSearchCV(cv=5, estimator=SVR(), n_jobs=2, param_grid=[{'kernel': ['rbf', 'polynomial', 'sigmoid', 'linear']}], scoring='neg_mean_squared_error', verbose=True)
In [ ]:
print(gs.best_estimator_)
print(gs.best_params_)
SVR(kernel='linear')
{'kernel': 'linear'}
In [ ]:
linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train)
print('Linear SVR train score : {}'.format(linear_svr.score(X_train, y_train)))
print('Linear SVR test score : {}'.format(linear_svr.score(X_test, y_test)))
Linear SVR train score : 0.7742112255059793
Linear SVR test score : 0.6662563800418307
In [ ]:
linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train_scale, y_train)
print('Linear SVR train score : {}'.format(linear_svr.score(X_train_scale, y_train)))
print('Linear SVR test score : {}'.format(linear_svr.score(X_test_scale, y_test)))
Linear SVR train score : 0.780745552242202
Linear SVR test score : 0.6506520061702693
In [ ]:
estimator = make_pipeline(
    StandardScaler(),
    SVR(kernel='linear')
)
# cross_val_score does its own cross-validation, so train_test_split() is not needed!
nmse = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_error')
mse = -1 * nmse
rmse = np.sqrt(mse)
print('NMSE scores: {}'.format(nmse))
print('NMSE scores mean: {}'.format(nmse.mean()))
print('RMSE scores : {}'.format(rmse))
print('RMSE scores mean: {}'.format(rmse.mean()))
NMSE scores: [ -6.54959765 -12.3597567 -42.4239171 -24.02037228 -19.43607983]
NMSE scores mean: -20.957944712932324
RMSE scores : [2.55921817 3.51564456 6.5133645 4.90105828 4.40863696]
RMSE scores mean: 4.379584495005061
In [ ]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVR(kernel='linear'))
])
param_grid = [{
    'model__gamma': ['scale', 'auto'],  # note: gamma has no effect for kernel='linear'
    'model__C': [1.0, 0.1, 0.01],
    'model__epsilon': [1.0, 0.1, 0.01]
}]
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=True
)
gs.fit(X, y)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Out[ ]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', SVR(kernel='linear'))]),
             n_jobs=2,
             param_grid=[{'model__C': [1.0, 0.1, 0.01],
                          'model__epsilon': [1.0, 0.1, 0.01],
                          'model__gamma': ['scale', 'auto']}],
             scoring='neg_mean_squared_error', verbose=True)
In [ ]:
print(gs.best_params_)
print('GridSearchCV best score: {}'.format(gs.best_score_))
{'model__C': 0.1, 'model__epsilon': 1.0, 'model__gamma': 'scale'}
GridSearchCV best score: -20.282059924605562
In [ ]:
model = gs.best_estimator_
model.fit(X_train, y_train)
print('Train score : {}'.format(model.score(X_train, y_train)))
print('Test score : {}'.format(model.score(X_test, y_test)))
Train score : 0.766644936233691
Test score : 0.670167100161414