import pandas as pd 
import numpy as np

from sklearn import preprocessing


nba_player_of_the_week_df = pd.read_csv('/content/drive/MyDrive/data/NBA_player_of_the_week.csv')


nba_player_of_the_week_df.sample()


nba_player_of_the_week_df.describe()


height_weight_age_df = nba_player_of_the_week_df[['Height CM', 'Weight KG', 'Age']]


height_weight_age_df.sample()


# min-max-normalization 
scaler = preprocessing.MinMaxScaler()
normalized_data = scaler.fit_transform(height_weight_age_df)
normalized_data

array([[0.51851852, 0.32911392, 0.0952381 ],
       [0.7037037 , 0.56962025, 0.28571429],
       [0.48148148, 0.39240506, 0.19047619],
       ...,
       [0.48148148, 0.37974684, 0.23809524],
       [0.38888889, 0.21518987, 0.23809524],
       [0.42592593, 0.27848101, 0.52380952]])


normalized_df = pd.DataFrame(normalized_data, columns=['Height', 'Weight', 'Age'])


normalized_df.describe()


df = pd.DataFrame(np.array([25, 49, 32, 35, 40]), columns=['Age'])
df


# min-max-normalization 
from sklearn import preprocessing 
scaler = preprocessing.MinMaxScaler()


normalized_data = scaler.fit_transform(df)
normalized_data

array([[0.        ],
       [1.        ],
       [0.29166667],
       [0.41666667],
       [0.625     ]])


normalized_df = pd.DataFrame(normalized_data, columns=['Age'])
normalized_df


df = pd.DataFrame(np.array([25000000, 35000000, 30000000, 50000000, 35000000]), columns=['Salary'])

df


scaler = preprocessing.MinMaxScaler()


normalized_data = scaler.fit_transform(df)
normalized_data

array([[0. ],
       [0.4],
       [0.2],
       [1. ],
       [0.4]])


normalized_df = pd.DataFrame(normalized_data, columns=['Salary'])
normalized_df


import pandas as pd


titanic_df = pd.read_csv('/content/drive/MyDrive/data/titanic.csv')
titanic_df.sample()


titanic_sex_embarked = titanic_df[['Sex', 'Embarked']]
titanic_sex_embarked.head()


one_hot_encoded_df = pd.get_dummies(titanic_sex_embarked)
one_hot_encoded_df.head()


one_hot_encoded_df = pd.get_dummies(data=titanic_df, columns=['Sex', 'Embarked'])
one_hot_encoded_df.head()


import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

from scipy import stats 
from sklearn.datasets import load_boston 
import warnings 
warnings.filterwarnings('ignore')


boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns = boston.feature_names)
boston_df['Price'] = boston.target


boston_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  Price    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


# Scatter plot plus fitted regression line of each selected feature against
# Price, laid out on a 2 x 4 grid of subplots.
features = ['RM', 'ZN', 'INDUS', 'NOX', 'AGE', 'PTRATIO', 'LSTAT', 'RAD']
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))

# axs.flat walks the grid row by row, so the n-th feature lands on the same
# panel as axs[n // 4][n % 4].
for feature, panel in zip(features, axs.flat):
    sns.regplot(x=feature, y='Price', data=boston_df, ax=panel)


from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score


X = boston_df.drop('Price', axis=1)
y = boston_df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=156)


# Fit ordinary least squares on the training split, then report the RMSE on
# the training split and on the held-out test split.
lr = LinearRegression().fit(X_train, y_train)

print('training set에서의 성능')
y_train_predict = lr.predict(X_train)
mse = mean_squared_error(y_train, y_train_predict)
print(f'rmse: {np.sqrt(mse):.3f}')

print('test set에서의 성능')
y_test_predict = lr.predict(X_test)
mse = mean_squared_error(y_test, y_test_predict)
print(f'rmse: {np.sqrt(mse):.3f}')

training set에서의 성능
rmse: 4.943
test set에서의 성능
rmse: 4.159


from sklearn.preprocessing import PolynomialFeatures


# 다항변형기로 데이터를 다항회귀를 위해 가공한다  
polynomial_transformer = PolynomialFeatures(2)
polynomial_data = polynomial_transformer.fit_transform(boston.data)

print(polynomial_data.shape)
polynomial_data

(506, 105)

array([[1.00000000e+00, 6.32000000e-03, 1.80000000e+01, ...,
        1.57529610e+05, 1.97656200e+03, 2.48004000e+01],
       [1.00000000e+00, 2.73100000e-02, 0.00000000e+00, ...,
        1.57529610e+05, 3.62766600e+03, 8.35396000e+01],
       [1.00000000e+00, 2.72900000e-02, 0.00000000e+00, ...,
        1.54315409e+05, 1.58310490e+03, 1.62409000e+01],
       ...,
       [1.00000000e+00, 6.07600000e-02, 0.00000000e+00, ...,
        1.57529610e+05, 2.23851600e+03, 3.18096000e+01],
       [1.00000000e+00, 1.09590000e-01, 0.00000000e+00, ...,
        1.54802902e+05, 2.54955600e+03, 4.19904000e+01],
       [1.00000000e+00, 4.74100000e-02, 0.00000000e+00, ...,
        1.57529610e+05, 3.12757200e+03, 6.20944000e+01]])


# get_feature_names() is deprecated in newer scikit-learn releases and later
# removed; prefer get_feature_names_out() and fall back only when the
# installed version does not provide it.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    polynomial_feature_names = polynomial_transformer.get_feature_names_out(boston.feature_names)
else:
    polynomial_feature_names = polynomial_transformer.get_feature_names(boston.feature_names)


X = pd.DataFrame(polynomial_data, columns=polynomial_feature_names)
y = pd.DataFrame(boston.target, columns=['Price']) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)


lr = LinearRegression()
lr.fit(X_train, y_train)

print('training set에서의 성능')
y_train_predict = lr.predict(X_train)
mse = mean_squared_error(y_train, y_train_predict)
print('rmse: {0:.3f}'.format(np.sqrt(mse)))

print('test set에서의 성능')
y_test_predict = lr.predict(X_test)
mse = mean_squared_error(y_test, y_test_predict)
print('rmse: {0:.3f}'.format(np.sqrt(mse)))

training set에서의 성능
rmse: 2.425
test set에서의 성능
rmse: 3.197


from sklearn.model_selection import cross_val_score 
from sklearn.ensemble import RandomForestRegressor


rf = RandomForestRegressor(random_state=0, n_estimators=1000)

neg_mse_scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)
avg_rmse

4.386593953202736


def get_rmse(model, X, y):
    """Print and return the mean RMSE of ``model`` under 5-fold cross-validation.

    Args:
        model: Estimator handed straight to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.

    Returns:
        The mean of the per-fold RMSE values (previously only printed).
    """
    # The scorer reports *negated* MSE, so flip the sign before the sqrt.
    neg_mse_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    print(model.__class__.__name__)
    print('5 교차 검증의 평균 rmse: {0:.3f}'.format(avg_rmse))
    return avg_rmse


X = boston_df.drop('Price', axis=1)
y = boston_df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=156)


from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor 

# Candidate regressors to compare with 5-fold cross-validated RMSE.
dt_reg = DecisionTreeRegressor(random_state=0, max_depth=4)
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)
gb_reg = GradientBoostingRegressor(random_state=0, n_estimators=1000)
# Name the objective explicitly: relying on the implicit default made XGBoost
# log "reg:linear is now deprecated in favor of reg:squarederror" on every fit.
xgb_reg = XGBRegressor(n_estimators=1000, objective='reg:squarederror')
lgb_reg = LGBMRegressor(n_estimators=1000)

models = [dt_reg, rf_reg, gb_reg, xgb_reg, lgb_reg]

for model in models:
    get_rmse(model, X, y)

DecisionTreeRegressor
5 교차 검증의 평균 rmse: 5.978
RandomForestRegressor
5 교차 검증의 평균 rmse: 4.423
GradientBoostingRegressor
5 교차 검증의 평균 rmse: 4.269
[06:17:32] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:17:32] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:17:33] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:17:33] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:17:33] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor
5 교차 검증의 평균 rmse: 4.089
LGBMRegressor
5 교차 검증의 평균 rmse: 4.646


import seaborn as sns 

# Fix the seed so the importance ranking is reproducible between runs; this
# matches the random_state=0 used for the other forests in this notebook.
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)

rf_reg.fit(X, y)

# Rank the features by the fitted forest's feature_importances_, largest first,
# and draw them as horizontal bars.
s = pd.Series(data=rf_reg.feature_importances_, index=X.columns).sort_values(ascending=False)
sns.barplot(x=s, y=s.index)

<matplotlib.axes._subplots.AxesSubplot at 0x7f093ef60250>


!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2022.9.24)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (6.1.2)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"<REDACTED>"}'}


ls -1ha kaggle.json

kaggle.json


!mkdir -p ~/.kaggle #create the ~/.kaggle config folder (-p: no error if it already exists)
!cp kaggle.json ~/.kaggle #copy kaggle.json into ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json #restrict the key file to owner read/write; presumably this is what silences the Kaggle CLI permission warning


#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle

kaggle.json


#Copy API command 후 데이터셋 다운로드하기  
!kaggle datasets download -d kaggle/kaggle-survey-2017

#파일 압축 풀기
!unzip kaggle-survey-2017.zip

Downloading kaggle-survey-2017.zip to /content
  0% 0.00/3.52M [00:00<?, ?B/s]
100% 3.52M/3.52M [00:00<00:00, 210MB/s]
Archive:  kaggle-survey-2017.zip
  inflating: RespondentTypeREADME.txt  
  inflating: conversionRates.csv     
  inflating: freeformResponses.csv   
  inflating: multipleChoiceResponses.csv  
  inflating: schema.csv


!mkdir -p ~/.kaggle/competitions/kaggle-survey-2017 #create the dataset folder under ~/.kaggle (-p: no error if it already exists)
!cp RespondentTypeREADME.txt ~/.kaggle/competitions/kaggle-survey-2017 #copy each extracted file into that folder
!cp conversionRates.csv ~/.kaggle/competitions/kaggle-survey-2017 
!cp freeformResponses.csv ~/.kaggle/competitions/kaggle-survey-2017 
!cp multipleChoiceResponses.csv ~/.kaggle/competitions/kaggle-survey-2017 
!cp schema.csv ~/.kaggle/competitions/kaggle-survey-2017


#다운로드가 제대로 되었는지 확인한다. 
#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle/competitions/kaggle-survey-2017

conversionRates.csv    multipleChoiceResponses.csv  schema.csv
freeformResponses.csv  RespondentTypeREADME.txt



# Step 1: install the Nanum Korean fonts
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
# NOTE(review): `font` is not referenced again in the visible notebook.
font = fm.FontProperties(fname=fontpath, size=9)
# NOTE(review): _rebuild is a private font_manager helper (leading underscore);
# presumably it refreshes Matplotlib's font cache so the new fonts are picked
# up after the restart — confirm it exists in the installed Matplotlib.
fm._rebuild()



# 단계 2: 런타임 재시작
import os
os.kill(os.getpid(), 9)



# Step 3: configure the Korean font
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# Draw minus signs with the ASCII hyphen so they do not render as a missing
# glyph under the Korean font.
mpl.rcParams['axes.unicode_minus'] = False

# Register the font file through the public FontManager API (instead of the
# private fm._rebuild(), which newer Matplotlib releases no longer provide),
# then make it the default font family.
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
fm.fontManager.addfont(path)
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)

# Render figures at retina resolution so the glyphs stay sharp.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')



!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (6.1.2)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2022.9.24)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"<REDACTED>"}'}



ls -1ha kaggle.json

kaggle.json



!mkdir -p ~/.kaggle #create the ~/.kaggle config folder (-p: no error if it already exists)
!cp kaggle.json ~/.kaggle #copy kaggle.json into ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json #restrict the key file to owner read/write; presumably this is what silences the Kaggle CLI permission warning



#대회 약관 동의하고, 데이터셋 불러오기 
!kaggle competitions download -c bike-sharing-demand
!unzip bike-sharing-demand.zip

Downloading bike-sharing-demand.zip to /content
  0% 0.00/189k [00:00<?, ?B/s]
100% 189k/189k [00:00<00:00, 75.8MB/s]
Archive:  bike-sharing-demand.zip
  inflating: sampleSubmission.csv    
  inflating: test.csv                
  inflating: train.csv



!mkdir -p ~/.kaggle/competitions/bike-sharing-demand #create the competition folder under ~/.kaggle (-p: no error if it already exists)
!cp sampleSubmission.csv ~/.kaggle/competitions/bike-sharing-demand #copy each extracted file into that folder
!cp test.csv ~/.kaggle/competitions/bike-sharing-demand 
!cp train.csv ~/.kaggle/competitions/bike-sharing-demand



#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle/competitions/bike-sharing-demand

sampleSubmission.csv  test.csv  train.csv



import pandas as pd
import numpy as np
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats  

#노트북 안에 그래프를 그리기 위해
%matplotlib inline 

#그래프에서 격자로 숫자 범위가 눈에 잘 띄도록 ggplot 스타일을 사용 
plt.style.use('ggplot')

#그래프에서 마이너스 폰트 깨지는 문제에 대한 대처 
mpl.rcParams['axes.unicode_minus'] = False



test = pd.read_csv('~/.kaggle/competitions/bike-sharing-demand/test.csv')
train = pd.read_csv('~/.kaggle/competitions/bike-sharing-demand/train.csv')
sampleSubmission = pd.read_csv('~/.kaggle/competitions/bike-sharing-demand/sampleSubmission.csv')



train.shape

(10886, 12)



train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB



train.head()



train.temp.describe()

count    10886.00000
mean        20.23086
std          7.79159
min          0.82000
25%         13.94000
50%         20.50000
75%         26.24000
max         41.00000
Name: temp, dtype: float64



train.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64



# Parse the timestamp column once, then fan its calendar/clock components out
# into separate feature columns (same column order as before).
train['datetime'] = pd.to_datetime(train['datetime'])
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    train[part] = getattr(train['datetime'].dt, part)



# Bar chart of `count` against each datetime component, one panel per component.
figure, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)
figure.set_size_inches(18, 8)

sns.barplot(data=train, x='year', y='count', ax=ax1)
sns.barplot(data=train, x='month', y='count', ax=ax2)
sns.barplot(data=train, x='day', y='count', ax=ax3)
sns.barplot(data=train, x='hour', y='count', ax=ax4)
sns.barplot(data=train, x='minute', y='count', ax=ax5)
sns.barplot(data=train, x='second', y='count', ax=ax6)

# Every panel plots `count` on the y axis, so label it 'Count' throughout
# (the month/day/hour panels previously carried the x-column name there).
# The minute and second panels are titled too, for consistency.
ax1.set(ylabel='Count', title='연도별 대여량')
ax2.set(ylabel='Count', title='월별 대여량')
ax3.set(ylabel='Count', title='일별 대여량')
ax4.set(ylabel='Count', title='시간별 대여량')
ax5.set(ylabel='Count', title='분별 대여량')
ax6.set(ylabel='Count', title='초별 대여량')

[Text(0, 0.5, 'hour'), Text(0.5, 1.0, '시간별 대여량')]



fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(12, 10)

sns.boxplot(data=train, y='count', orient='v', ax=axes[0][0])
sns.boxplot(data=train, y='count', x='season', orient='v', ax=axes[0][1])
sns.boxplot(data=train, y='count', x='hour', orient='v', ax=axes[1][0])
sns.boxplot(data=train, y='count', x='workingday', orient='v', ax=axes[1][1])

axes[0][0].set(ylabel='Count', title='대여량')
axes[0][1].set(xlabel='Season', ylabel='Count', title='계절별 대여량')
axes[1][0].set(xlabel='Hour Of The Day', ylabel='Count', title='시간별 대여량')
axes[1][1].set(xlabel='Working Day', ylabel='Count', title='근무일 여부에 따른 대여량')

[Text(0, 0.5, 'Count'),
 Text(0.5, 0, 'Working Day'),
 Text(0.5, 1.0, '근무일 여부에 따른 대여량')]



train['dayofweek'] = train['datetime'].dt.dayofweek
train['dayofweek'].value_counts()

5    1584
6    1579
3    1553
0    1551
2    1551
1    1539
4    1529
Name: dayofweek, dtype: int64



fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows=5)
fig.set_size_inches(18,25)

sns.pointplot(data=train, x='hour', y='count', ax=ax1)

sns.pointplot(data=train, x='hour', y='count', hue='workingday', ax=ax2)

sns.pointplot(data=train, x='hour', y='count', hue='dayofweek', ax=ax3)

sns.pointplot(data=train, x='hour', y='count', hue='weather', ax=ax4)

sns.pointplot(data=train, x='hour', y='count', hue='season', ax=ax5)

<matplotlib.axes._subplots.AxesSubplot at 0x7f0a3d0fc890>



corrMatt = train[['temp', 'atemp', 'casual', 'registered', 'humidity', 'windspeed', 'count']]
corrMatt = corrMatt.corr()  # pairwise correlation matrix
print(corrMatt)

# Boolean mask for sns.heatmap, where True hides a cell. Hide the strict upper
# triangle so only the lower triangle and the diagonal are drawn. Building it
# as real booleans avoids relying on correlation values being non-zero, which
# the previous float-copy mask did.
mask = np.triu(np.ones_like(corrMatt, dtype=bool), k=1)

                temp     atemp    casual  registered  humidity  windspeed  \
temp        1.000000  0.984948  0.467097    0.318571 -0.064949  -0.017852   
atemp       0.984948  1.000000  0.462067    0.314635 -0.043536  -0.057473   
casual      0.467097  0.462067  1.000000    0.497250 -0.348187   0.092276   
registered  0.318571  0.314635  0.497250    1.000000 -0.265458   0.091052   
humidity   -0.064949 -0.043536 -0.348187   -0.265458  1.000000  -0.318607   
windspeed  -0.017852 -0.057473  0.092276    0.091052 -0.318607   1.000000   
count       0.394454  0.389784  0.690414    0.970948 -0.317371   0.101369   

               count  
temp        0.394454  
atemp       0.389784  
casual      0.690414  
registered  0.970948  
humidity   -0.317371  
windspeed   0.101369  
count       1.000000



#상관관계 분석 시각화 

fig, ax = plt.subplots()
fig.set_size_inches(20,10)
sns.heatmap(corrMatt, 
            mask=mask, #표시하지 않을 마스크 부분을 지정한다.   
            vmax=.8, #컬러바 범위  
            square=True, #정사각형 
            annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7f0a3cdb60d0>



fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
fig.set_size_inches(12, 5)
sns.regplot(x='temp', y='count', data=train, ax=ax1) #온도
sns.regplot(x='windspeed', y='count', data=train, ax=ax2) #풍속
sns.regplot(x='humidity', y='count', data=train, ax=ax3) #습도

<matplotlib.axes._subplots.AxesSubplot at 0x7f0a39fa6150>



def concatenate_year_month(datetime):
    """Return ``datetime`` rendered as '<year>-<month>' without zero padding.

    Args:
        datetime: Any object exposing ``year`` and ``month`` attributes.
    """
    return f'{datetime.year}-{datetime.month}'

train['year_month'] = train['datetime'].apply(concatenate_year_month)

train[['datetime', 'year_month']].head()



fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(18, 4)

sns.barplot(data=train, x='year', y='count', ax=ax1)
sns.barplot(data=train, x='month', y='count', ax=ax2)

fig, ax3 = plt.subplots(nrows=1, ncols=1)
fig.set_size_inches(18, 4)

sns.barplot(data=train, x='year_month', y='count', ax=ax3)

<matplotlib.axes._subplots.AxesSubplot at 0x7f0a39e50190>



# count값의 데이터 분포도를 파악 

figure, axes = plt.subplots(ncols=2, nrows=2)
figure.set_size_inches(12, 10)

sns.distplot(train['count'], ax=axes[0][0])
stats.probplot(train['count'], dist='norm', fit=True, plot=axes[0][1])

/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

((array([-3.83154229, -3.60754977, -3.48462983, ...,  3.48462983,
          3.60754977,  3.83154229]),
  array([  1,   1,   1, ..., 968, 970, 977])),
 (169.82942673231386, 191.5741319125482, 0.9372682766213176))



!pip install kaggle 
from google.colab import files 
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (6.1.2)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2022.9.24)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)

Saving kaggle.json to kaggle.json

{'kaggle.json': b'{"username":"minaahayley","key":"<REDACTED>"}'}



ls -1ha kaggle.json

kaggle.json



!mkdir -p ~/.kaggle #create the ~/.kaggle config folder (-p: no error if it already exists)
!cp kaggle.json ~/.kaggle #copy kaggle.json into ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json #restrict the key file to owner read/write; presumably this is what silences the Kaggle CLI permission warning



#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle

competitions/  kaggle.json



!kaggle datasets download -d kaggle/kaggle-survey-2017
!unzip kaggle-survey-2017.zip

Downloading kaggle-survey-2017.zip to /content
  0% 0.00/3.52M [00:00<?, ?B/s]
100% 3.52M/3.52M [00:00<00:00, 236MB/s]
Archive:  kaggle-survey-2017.zip
  inflating: RespondentTypeREADME.txt  
  inflating: conversionRates.csv     
  inflating: freeformResponses.csv   
replace multipleChoiceResponses.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: multipleChoiceResponses.csv  
  inflating: schema.csv



!mkdir -p ~/.kaggle/competitions/kaggle-survey-2017 #create the dataset folder under ~/.kaggle (-p: no error if it already exists)
!cp RespondentTypeREADME.txt ~/.kaggle/competitions/kaggle-survey-2017 #copy each extracted file into that folder
!cp conversionRates.csv ~/.kaggle/competitions/kaggle-survey-2017 
!cp freeformResponses.csv ~/.kaggle/competitions/kaggle-survey-2017 
!cp multipleChoiceResponses.csv ~/.kaggle/competitions/kaggle-survey-2017 
!cp schema.csv ~/.kaggle/competitions/kaggle-survey-2017



#ls 명령어는 특정 경로에 어떤 파일이 있는지 확인해 보는 명령어다. 
%ls ~/.kaggle/competitions/kaggle-survey-2017

conversionRates.csv    multipleChoiceResponses.csv  SurveySchema.csv
freeformResponses.csv  RespondentTypeREADME.txt
freeFormResponses.csv  schema.csv



#노트북안에서 그래프를 그리기 위해
%matplotlib inline

import pandas as pd 
import numpy as np 
from scipy import stats 
import matplotlib.pyplot as plt
import seaborn as sns 

#노트북에서 warning이 보이지 않게 하기 위해 
import warnings 
warnings.filterwarnings('ignore')



survey = pd.read_csv('~/.kaggle/competitions/kaggle-survey-2017/schema.csv')
ffr = pd.read_csv('~/.kaggle/competitions/kaggle-survey-2017/freeformResponses.csv', encoding='ISO-8859-1', low_memory=False)
mcr = pd.read_csv('~/.kaggle/competitions/kaggle-survey-2017/multipleChoiceResponses.csv', encoding='ISO-8859-1', low_memory=False)



mcr.sample()



#성별 
#countplot 명령을 사용하면 각 카테고리 값별로 데이터가 얼마나 있는지 표시할 수 있다.
sns.countplot(y='GenderSelect', data=mcr)

<matplotlib.axes._subplots.AxesSubplot at 0x7f90472e0ad0>



#학력 
sns.countplot(y='FormalEducation', data=mcr)

<matplotlib.axes._subplots.AxesSubplot at 0x7f9043c79310>



sns.countplot(y='FormalEducation',
              hue='GenderSelect',
              data=mcr).legend(loc='center left',
                       bbox_to_anchor=(1, 0.5))

<matplotlib.legend.Legend at 0x7f9040d7b650>



#Seaborn의 distplot 명령은 러그와 커널 밀도 표시 기능이 있어서 Matplotlib의 hist 명령보다 많이 사용된다.

sns.distplot(mcr[mcr['Age']>0]['Age'])

<matplotlib.axes._subplots.AxesSubplot at 0x7f90472c96d0>



# Age distribution drawn separately for each gender
figure, (ax1, ax2) = plt.subplots(ncols=2)
figure.set_size_inches(12, 5)

sns.distplot(mcr[mcr['GenderSelect'] == 'Female']['Age'].dropna(),
             norm_hist=False, ax=ax1)
# plt.title() writes to the *current* axes, which after plt.subplots() is the
# last panel, so the left panel never received its title. Title each panel
# through its own Axes object instead.
ax1.set_title('Female')

sns.distplot(mcr[mcr['GenderSelect'] == 'Male']['Age'].dropna(),
             norm_hist=False, ax=ax2)
ax2.set_title('Male')

Text(0.5, 1.0, 'Male')



import pandas as pd



netflix_titles = pd.read_csv('/content/drive/MyDrive/data/netflix_titles.csv')



netflix_titles.sample()



import seaborn as sns
import matplotlib.pyplot as plt



df = netflix_titles.groupby(['type','release_year'])['title'].nunique().reset_index()
df = df[df['release_year']>=2011]
df.sample()



sns.lineplot(data=df, x='release_year', y='title', ci=None)

<matplotlib.axes._subplots.AxesSubplot at 0x7f6431ffab50>



df.sample()



sns.lineplot(data=df, x='release_year', y='title', ci=None)
plt.axhline(df['title'].median(), linestyle='dashed')

<matplotlib.lines.Line2D at 0x7f6423cedf90>



sns.lineplot(data=df, x='release_year', y='title', ci=None, hue='type')

<matplotlib.axes._subplots.AxesSubplot at 0x7f6421010210>



sns.lineplot(data=df, x='release_year', y='title', ci=None, hue='type', style='type')

<matplotlib.axes._subplots.AxesSubplot at 0x7f6420fa2610>



sns.lineplot(data=df, x='release_year', y='title', ci=None, hue='type', style='type', markers=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7f6420f3d2d0>



sns.displot(x=df['title'])

<seaborn.axisgrid.FacetGrid at 0x7f6420e9e2d0>



sns.displot(x=df['title'], kde=True)

<seaborn.axisgrid.FacetGrid at 0x7f6420df1f50>



sns.displot(x=df['title'], kind='kde')

<seaborn.axisgrid.FacetGrid at 0x7f6420d16cd0>



sns.displot(data=df, x='title', hue='type')

<seaborn.axisgrid.FacetGrid at 0x7f6420df1450>



sns.displot(data=df, x='title', hue='type', kind='kde')

<seaborn.axisgrid.FacetGrid at 0x7f6420db0850>



sns.displot(data=df, x='title', hue='type', col='release_year', col_wrap=4)

<seaborn.axisgrid.FacetGrid at 0x7f6420c70d50>



df.sample()



sns.boxplot(data=df, y='title')

<matplotlib.axes._subplots.AxesSubplot at 0x7f64205a2190>



Q1 = df['title'].quantile(q=0.25)
Q3 = df['title'].quantile(q=0.75)
IQR = Q3 - Q1
IQR

261.75



# Keep only the rows whose title count lies inside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; both fences are inclusive.
lower_fence = Q1 - IQR * 1.5
upper_fence = Q3 + IQR * 1.5
IQR_df = df[df['title'].between(lower_fence, upper_fence)]
len(IQR_df)

22



sns.boxplot(data=IQR_df, y='title')

<matplotlib.axes._subplots.AxesSubplot at 0x7f64205a4490>



netflix_titles['year'] = pd.to_datetime(netflix_titles['date_added']).dt.year



heatmap = netflix_titles.pivot_table(index='year', columns='rating', values='title', aggfunc='count')
heatmap



plt.figure(figsize=(12, 6))
sns.heatmap(heatmap, annot=True, fmt='.0f', cmap='Blues')

<matplotlib.axes._subplots.AxesSubplot at 0x7f6420c11b10>



df['type'].unique()

array(['Movie', 'TV Show'], dtype=object)



import numpy as np
import matplotlib.patches as mpatches



# Back bars: total titles per release year (Movie + TV Show). Aggregate only
# the numeric 'title' column so the string 'type' column is not summed as well.
total = df.groupby('release_year')['title'].sum().reset_index()

# bar chart 1 -> full-height bars; the part left visible above the Movie bars
# represents TV Shows
bar1 = sns.barplot(x="release_year",  y="title", data=total, color='darkblue')

# front bars -> take only 'type == Movie' values from the data
movie = df[df.type=='Movie']

# bar chart 2 -> Movie bars drawn on top of (and covering the bottom of) bar chart 1
bar2 = sns.barplot(x="release_year", y="title", data=movie, estimator=sum, ci=None,  color='lightblue')

# add legend
top_bar = mpatches.Patch(color='darkblue', label='TV Show')
bottom_bar = mpatches.Patch(color='lightblue', label='Movie')
plt.legend(handles=[top_bar, bottom_bar])

# show the graph
plt.show()



# from raw value to percentage
total = df.groupby('release_year')['title'].sum().reset_index()
movie = df[df.type=='Movie'].groupby('release_year')['title'].sum().reset_index()

# Look up each year's total by release_year rather than pairing rows by
# position: a positional zip silently misaligns if some year has no Movie row.
total_by_year = total.set_index('release_year')['title']
movie['title'] = movie['title'] / movie['release_year'].map(total_by_year) * 100
# The total is 100% of itself in every year.
total['title'] = 100.0

# bar chart 1 -> full-height (100%) bars; the part left visible above the
# Movie bars represents TV Shows
bar1 = sns.barplot(x="release_year",  y="title", data=total, color='darkblue')

# bar chart 2 -> Movie share drawn on top of (and covering the bottom of) bar chart 1
bar2 = sns.barplot(x="release_year", y="title", data=movie, color='lightblue')

# add legend
top_bar = mpatches.Patch(color='darkblue', label='TV Show')
bottom_bar = mpatches.Patch(color='lightblue', label='Movie')
plt.legend(handles=[top_bar, bottom_bar])

# show the graph
plt.show()



total = df.groupby('release_year')['title'].sum().reset_index()
movie = df[df.type=='Movie'].groupby('release_year')['title'].sum().reset_index()
total



movie



netflix_titles['ym'] = pd.to_datetime(netflix_titles['date_added']).dt.strftime('%Y-%m')
netflix_titles['ym']

0       2021-09
1       2021-09
2       2021-09
3       2021-09
4       2021-09
         ...   
8802    2019-11
8803    2019-07
8804    2019-11
8805    2020-01
8806    2019-03
Name: ym, Length: 8807, dtype: object



group = netflix_titles.groupby(['type','ym'])['title'].count().reset_index().copy()

#2019년 10월 이후 데이터만 출력 
group = group[group['ym']>='2019-10']

group.head()



#set_index()로 멀티 인덱스 지정하기 

group = group.set_index(['ym','type'])
group.head()



#unstack할 컬럼의 레벨 지정하기 

group = group.unstack(1)
group.head()



group.plot(kind='bar',stacked=True, color=['skyblue', 'orange', 'yellow'], figsize=(12,6))
plt.xticks(rotation=70)
plt.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.) #legend 위치 변경

<matplotlib.legend.Legend at 0x7f642023bd50>



netflix_type = netflix_titles['type'].value_counts()
netflix_type

Movie      6131
TV Show    2676
Name: type, dtype: int64



import matplotlib.pyplot as plt
# Share of Movies vs TV Shows among all Netflix titles.
plt.pie(netflix_type, labels = netflix_type.index,
        autopct='%1.1f%%', shadow=True, startangle=140)
# Call plt.axis to force an equal aspect ratio. The previous
# `plt.axis=('equal')` was an assignment that rebound plt.axis to a string,
# so the aspect was never set and later plt.axis(...) calls would fail.
plt.axis('equal')
plt.title('Type of Netflix Titles')
plt.show()


import pandas as pd


netflix_titles = pd.read_csv('https://raw.githubusercontent.com/minaahayley/Python/main/data/netflix_titles.csv')


netflix_titles.sample()


netflix_titles['date_added'].dtype

dtype('O')


netflix_titles['date'] = pd.to_datetime(netflix_titles['date_added'])
netflix_titles['date']

0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-24
4      2021-09-24
          ...    
8802   2019-11-20
8803   2019-07-01
8804   2019-11-01
8805   2020-01-11
8806   2019-03-02
Name: date, Length: 8807, dtype: datetime64[ns]


netflix_titles['ym'] = pd.to_datetime(netflix_titles['date_added']).dt.strftime('%Y-%m')
netflix_titles['ym']

0       2021-09
1       2021-09
2       2021-09
3       2021-09
4       2021-09
         ...   
8802    2019-11
8803    2019-07
8804    2019-11
8805    2020-01
8806    2019-03
Name: ym, Length: 8807, dtype: object


netflix_titles['ym_dt'] = pd.to_datetime(netflix_titles['ym'], format='%Y-%m')
netflix_titles['ym_dt']

0      2021-09-01
1      2021-09-01
2      2021-09-01
3      2021-09-01
4      2021-09-01
          ...    
8802   2019-11-01
8803   2019-07-01
8804   2019-11-01
8805   2020-01-01
8806   2019-03-01
Name: ym_dt, Length: 8807, dtype: datetime64[ns]


netflix_titles['date'].dt.to_period('M')

0       2021-09
1       2021-09
2       2021-09
3       2021-09
4       2021-09
         ...   
8802    2019-11
8803    2019-07
8804    2019-11
8805    2020-01
8806    2019-03
Name: date, Length: 8807, dtype: period[M]


netflix_titles['date'].dt.to_period('Q')

0       2021Q3
1       2021Q3
2       2021Q3
3       2021Q3
4       2021Q3
         ...  
8802    2019Q4
8803    2019Q3
8804    2019Q4
8805    2020Q1
8806    2019Q1
Name: date, Length: 8807, dtype: period[Q-DEC]


netflix_titles['date'].dt.to_period('M').astype(int)

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: FutureWarning: casting period[M] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.
  """Entry point for launching an IPython kernel.

0       620
1       620
2       620
3       620
4       620
       ... 
8802    598
8803    594
8804    598
8805    600
8806    590
Name: date, Length: 8807, dtype: int64


netflix_titles['date'].dt.to_period('Q').astype(int)

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: FutureWarning: casting period[Q-DEC] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.
  """Entry point for launching an IPython kernel.

0       206
1       206
2       206
3       206
4       206
       ... 
8802    199
8803    198
8804    199
8805    200
8806    196
Name: date, Length: 8807, dtype: int64


pd.to_datetime(netflix_titles['ym'], format='%Y-%m').astype(int)

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: FutureWarning: casting datetime64[ns] values to int64 with .astype(...) is deprecated and will raise in a future version. Use .view(...) instead.
  """Entry point for launching an IPython kernel.

0       1630454400000000000
1       1630454400000000000
2       1630454400000000000
3       1630454400000000000
4       1630454400000000000
               ...         
8802    1572566400000000000
8803    1561939200000000000
8804    1572566400000000000
8805    1577836800000000000
8806    1551398400000000000
Name: ym, Length: 8807, dtype: int64


netflix_titles['date'].dt.to_period('Q').dt.strftime('%Y-%q')

0       2021-3
1       2021-3
2       2021-3
3       2021-3
4       2021-3
         ...  
8802    2019-4
8803    2019-3
8804    2019-4
8805    2020-1
8806    2019-1
Name: date, Length: 8807, dtype: object


netflix_titles['date']

0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-24
4      2021-09-24
          ...    
8802   2019-11-20
8803   2019-07-01
8804   2019-11-01
8805   2020-01-11
8806   2019-03-02
Name: date, Length: 8807, dtype: datetime64[ns]


netflix_titles['dayname'] = netflix_titles['date'].dt.day_name()
netflix_titles['dayname']

0        Saturday
1          Friday
2          Friday
3          Friday
4          Friday
          ...    
8802    Wednesday
8803       Monday
8804       Friday
8805     Saturday
8806     Saturday
Name: dayname, Length: 8807, dtype: object


dayname = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

pd.Categorical(netflix_titles['dayname'], categories=dayname, ordered=True)

['Saturday', 'Friday', 'Friday', 'Friday', 'Friday', ..., 'Wednesday', 'Monday', 'Friday', 'Saturday', 'Saturday']
Length: 8807
Categories (7, object): ['Monday' < 'Tuesday' < 'Wednesday' < 'Thursday' < 'Friday' < 'Saturday' <
                         'Sunday']


from dateutil.relativedelta import relativedelta


netflix_titles['date']

0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-24
4      2021-09-24
          ...    
8802   2019-11-20
8803   2019-07-01
8804   2019-11-01
8805   2020-01-11
8806   2019-03-02
Name: date, Length: 8807, dtype: datetime64[ns]


# Keep only the rows whose 'date' is present. The explicit .copy() makes the
# filtered result an independent frame, so the column assignment below writes
# to it directly instead of triggering pandas' SettingWithCopyWarning.
netflix_titles = netflix_titles[netflix_titles['date'].notna()].copy()


# Same calendar date one year earlier, for each row.
netflix_titles['previous_date'] = netflix_titles['date'].apply(lambda x : x - relativedelta(years=1))


netflix_titles[['date','previous_date']]


# Number of distinct titles per 'date', as a flat DataFrame.
growth = netflix_titles.groupby(['date'])['title'].nunique().reset_index()
growth.sample()


# For every date, the same calendar date one year earlier.
growth['previous_date'] = growth['date'].apply(lambda x : x - relativedelta(years=1))
growth.sample()


# Left self-join: attach to each row the row whose 'date' equals this row's
# 'previous_date'. Overlapping column names get the _x (left) / _y (right)
# suffixes; rows with no match one year earlier get missing values on the right.
growth = growth.merge(growth, how='left', left_on='previous_date', right_on='date')
growth.sample()


# Keep the current-date columns plus the matched prior-year count, and give
# them readable names.
growth = growth[['date_x', 'title_x', 'previous_date_x', 'title_y']].rename({'date_x': 'date', 'title_x':'contents', 'previous_date_x':'previous_date', 'title_y':'previous_contents'}, axis = 1)
growth.sample()


# Year-over-year change as a fraction of the prior-year count.
growth['yoy'] = (growth['contents']-growth['previous_contents'])/growth['previous_contents']
growth.tail()


au = growth[['date', 'contents']].rename({'contents':'dau'}, axis = 1)
au.sample()


# Rolling sum of 'dau' over a window of 30 consecutive rows (the first 29 rows
# have no full window and are NaN). Roll over the numeric 'dau' column only:
# rolling over the whole frame would also try to aggregate the datetime 'date'
# column, which current pandas versions reject instead of silently dropping.
# NOTE(review): an integer window counts rows, not calendar days — confirm that
# a 30-row window is what "mau" is meant to be.
au['mau'] = au['dau'].rolling(30).sum()


au.head(30)


import pandas as pd


netflix_titles = pd.read_csv('https://raw.githubusercontent.com/minaahayley/Python/main/data/netflix_titles.csv')


netflix_titles.sample()


# value_counts() returns the number of rows for each distinct value of the column.
type_count = pd.DataFrame(netflix_titles['type'].value_counts())
type_count


# With normalize=True, value_counts() returns each value's share of all rows
# instead of its raw count.
type_percent = pd.DataFrame(netflix_titles['type'].value_counts(normalize=True))
type_percent


df = type_count.merge(type_percent, how='left', left_index=True, right_index=True)

df.columns = ['콘텐츠 수', '비율']

df.head()


netflix_titles.groupby('release_year')['title'].count()

release_year
1925       1
1942       2
1943       3
1944       3
1945       4
        ... 
2017    1032
2018    1147
2019    1030
2020     953
2021     592
Name: title, Length: 74, dtype: int64


netflix_titles.groupby('release_year')['title'].count().reset_index()


group = netflix_titles.groupby('release_year').agg(title_cnt = ('title', 'count'), country_cnt = ('country', 'count'))
group


# Index labels whose release_year is before 2000 (strictly less than 2000).

group[group.index < 2000].index

Int64Index([1925, 1942, 1943, 1944, 1945, 1946, 1947, 1954, 1955, 1956, 1958,
            1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969,
            1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
            1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
            1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999],
           dtype='int64', name='release_year')


group.drop(group[group.index < 2000].index, inplace=True)
group


group.drop(['country_cnt'], axis=1, inplace=True)
group


# Build a MultiIndex frame: title counts indexed by (release_year, type).
multi_index = netflix_titles.groupby(['release_year', 'type'])[['title']].count()
multi_index


# Cross-section: select the rows whose 'release_year' level equals 2000.

multi_index.xs(key=2000, level='release_year')


# Cross-section: select the rows whose 'type' level equals 'Movie'.

multi_index.xs(key='Movie', level='type')


# Preprocess 'duration': split each string on 'Season' or 'min', keep the
# leading piece, and convert it to a number.
# NOTE(review): .replace({'NaN': None}) only matches cells equal to the literal
# string 'NaN'; it presumably has no effect on genuinely missing values — confirm.
netflix_titles['duration'] = pd.to_numeric(netflix_titles['duration'].str.split('Season|min', expand=True)[0].replace({'NaN':None}))
netflix_titles['duration']

0        90.0
1         2.0
2         1.0
3         1.0
4         2.0
        ...  
8802    158.0
8803      2.0
8804     88.0
8805     88.0
8806    111.0
Name: duration, Length: 8807, dtype: float64


group = netflix_titles.groupby(['type','rating'])[['release_year', 'duration']].agg(['min', 'mean'])
group.head()


group.stack(level=0).head()


group.stack(level=1).head()


stacked = group.stack()
stacked.head()


group['release_year'].stack().head()

type   rating      
Movie  66 min  min     2015.0
               mean    2015.0
       74 min  min     2017.0
               mean    2017.0
       84 min  min     2010.0
dtype: float64


stacked.unstack(0).head()


stacked.unstack(1).head()


stacked.unstack().head()


stacked['duration'].unstack().head()


melted = netflix_titles.groupby(['type','rating'])[['release_year','duration']].mean().reset_index()
melted.head()


pd.melt(melted, id_vars=['type','rating']).head()


import pandas as pd


netflix_titles = pd.read_csv('https://raw.githubusercontent.com/minaahayley/Python/main/data/netflix_titles.csv')


netflix_titles.sample()


netflix_titles[netflix_titles['rating'].str.contains('TV', na=False)].sample()


netflix_titles[~netflix_titles['rating'].str.contains('TV', na=False)].sample()


netflix_titles['rating'].str.split('-')

0       [PG, 13]
1       [TV, MA]
2       [TV, MA]
3       [TV, MA]
4       [TV, MA]
          ...   
8802         [R]
8803    [TV, Y7]
8804         [R]
8805        [PG]
8806    [TV, 14]
Name: rating, Length: 8807, dtype: object


netflix_titles['rating'].str.split('-', expand=True)


netflix_titles['rating'].str.split('-', expand=True)[0]

0       PG
1       TV
2       TV
3       TV
4       TV
        ..
8802     R
8803    TV
8804     R
8805    PG
8806    TV
Name: 0, Length: 8807, dtype: object


# Find every title that contains 'Zombie' (str.find returns -1 when absent).
zombie = [name for name in netflix_titles['title'] if name.find('Zombie') >= 0]
zombie

['IZombie',
 'Rise of the Zombie',
 'Scooby-Doo on Zombie Island',
 'Zombie Dumb',
 'Zombieland']


netflix_titles.sample()


netflix_titles['added_year'] = pd.to_datetime(netflix_titles['date_added']).dt.year
netflix_titles['added_year']

0       2021.0
1       2021.0
2       2021.0
3       2021.0
4       2021.0
         ...  
8802    2019.0
8803    2019.0
8804    2019.0
8805    2020.0
8806    2019.0
Name: added_year, Length: 8807, dtype: float64


netflix_titles['release_added'] = netflix_titles['release_year'] - netflix_titles['added_year'] 
netflix_titles['release_added']

0       -1.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
8802   -12.0
8803    -1.0
8804   -10.0
8805   -14.0
8806    -4.0
Name: release_added, Length: 8807, dtype: float64


netflix_titles['release_added'] = netflix_titles['release_added'].astype(str)


pd.to_numeric(netflix_titles['release_added'])

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "nan"

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-15-f88196129310> in <module>
----> 1 pd.to_numeric(netflix_titles['release_added'])

/usr/local/lib/python3.7/dist-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    182         try:
    183             values, _ = lib.maybe_convert_numeric(
--> 184                 values, set(), coerce_numeric=coerce_numeric
    185             )
    186         except (ValueError, TypeError):

/usr/local/lib/python3.7/dist-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "nan" at position 6066


netflix_titles['release_added'] = pd.to_numeric(netflix_titles['release_added'].replace({'nan' : None}))
netflix_titles['release_added']

0       -1.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
8802   -12.0
8803    -1.0
8804   -10.0
8805   -14.0
8806    -4.0
Name: release_added, Length: 8807, dtype: float64


netflix_titles['release_added'].unique()

array([ -1.,   0., -28.,  -3., -25., -23., -24., -11.,  -8.,  -4., -46.,
       -43., -38., -34.,  -9., -20.,  -7., -19., -18., -17., -10., -13.,
       -12., -14., -16., -15., -27.,  -6.,  -2.,  -5., -39., -32., -31.,
       -30., -22., -35., -29., -37., -41., -60., -21., -26., -36., -45.,
       -62., -33., -40., -49., -57., -76.,   1., -66., -64., -50., -47.,
       -44., -93., -51., -55., -48., -42.,   2.,  nan, -54., -59., -61.,
       -52., -63.,   3., -72., -71., -75., -65., -73., -70., -74.])


netflix_titles['country'].value_counts()

United States                             2818
India                                      972
United Kingdom                             419
Japan                                      245
South Korea                                199
                                          ... 
Romania, Bulgaria, Hungary                   1
Uruguay, Guatemala                           1
France, Senegal, Belgium                     1
Mexico, United States, Spain, Colombia       1
United Arab Emirates, Jordan                 1
Name: country, Length: 748, dtype: int64


# Label each row 'US' when its country is exactly 'United States' and 'Non_US'
# otherwise (missing and multi-country values fall into 'Non_US').
netflix_titles['type'] = netflix_titles['country'].apply(
    lambda country: 'Non_US' if country != 'United States' else 'US'
)
netflix_titles[['type','country']]


# Label each row by country: 'US' for exactly 'United States', 'India' for
# exactly 'India', and 'Non-US/India' for everything else (including missing
# and multi-country values).
netflix_titles['type'] = netflix_titles['country'].apply(
    lambda country: {'United States': 'US', 'India': 'India'}.get(country, 'Non-US/India')
)
netflix_titles[['type','country']]


netflix_titles.apply(lambda x : pd.Series(x['country']), axis=1)


netflix_titles.apply(lambda x : pd.Series(x['country']), axis=1).stack()

0     0                                        United States
1     0                                         South Africa
4     0                                                India
7     0    United States, Ghana, Burkina Faso, United Kin...
8     0                                       United Kingdom
                                 ...                        
8801  0                         United Arab Emirates, Jordan
8802  0                                        United States
8804  0                                        United States
8805  0                                        United States
8806  0                                                India
Length: 7976, dtype: object


netflix_titles.apply(lambda x : pd.Series(x['country']), axis=1).stack().reset_index(level=1, drop=True)

0                                           United States
1                                            South Africa
4                                                   India
7       United States, Ghana, Burkina Faso, United Kin...
8                                          United Kingdom
                              ...                        
8801                         United Arab Emirates, Jordan
8802                                        United States
8804                                        United States
8805                                        United States
8806                                                India
Length: 7976, dtype: object


# Wrap each row's 'country' value in a one-element Series, stack the result
# into a single long Series, and drop the inner index level so only the
# original row labels remain.
# NOTE(review): the 'country' strings are not split on ', ' here, so a
# multi-country value is presumably counted as one distinct entry — confirm
# whether per-country counts were intended.
s = netflix_titles.apply(lambda x : pd.Series(x['country']), axis=1).stack().reset_index(level=1, drop=True)
# Frequency of each distinct value (excluding the literal string 'nan'), top 20.
s[s != 'nan'].value_counts().head(20)

United States                    2818
India                             972
United Kingdom                    419
Japan                             245
South Korea                       199
Canada                            181
Spain                             145
France                            124
Mexico                            110
Egypt                             106
Turkey                            105
Nigeria                            95
Australia                          87
Taiwan                             81
Indonesia                          79
Brazil                             77
Philippines                        75
United Kingdom, United States      75
United States, Canada              73
Germany                            67
dtype: int64


netflix_titles = pd.read_csv('https://raw.githubusercontent.com/minaahayley/Python/main/data/netflix_titles.csv')


import numpy as np

# Vectorized labelling with np.where: 'US' where country is exactly
# 'United States', 'Non_US' everywhere else. The dataset spells the country
# name out in full, so comparing against the abbreviation 'US' would never
# match and every row would end up labelled 'Non_US'.
netflix_titles['type'] = np.where(netflix_titles['country'] == 'United States', 'US', 'Non_US')
netflix_titles[['type','country']]

	Weight	Age	Draft Year	Seasons in league	Season short	Real_value	Height CM	Weight KG	Last Season
count	1340.000000	1340.000000	1340.000000	1340.000000	1340.000000	1340.000000	1340.000000	1340.000000	1340.000000
mean	224.567164	26.738060	1996.287313	5.740299	2003.156716	0.686940	201.071642	101.384328	0.023881
std	30.798885	3.400683	11.253558	3.293421	11.470164	0.242007	9.367970	14.011226	0.152734
min	150.000000	19.000000	1965.000000	0.000000	1980.000000	0.500000	175.000000	68.000000	0.000000
25%	205.000000	24.000000	1987.000000	3.000000	1994.000000	0.500000	193.000000	93.000000	0.000000
50%	220.000000	26.000000	1998.000000	5.000000	2005.000000	0.500000	201.000000	99.000000	0.000000
75%	250.000000	29.000000	2005.000000	8.000000	2013.000000	1.000000	208.000000	113.000000	0.000000
max	325.000000	40.000000	2018.000000	17.000000	2020.000000	1.000000	229.000000	147.000000	1.000000

	Sex_female	Sex_male	Embarked_C	Embarked_S
0	0	1	0	1
1	1	0	1	0
2	1	0	0	1
3	1	0	0	1
4	0	1	0	1

	Unnamed: 0	Survived	Pclass	Name	Age	SibSp	Ticket	Fare	Cabin	Sex_female	Sex_male	Embarked_C	Embarked_S
0	0	0	3	Braund, Mr. Owen Harris	22.0	1	A/5 21171	7.2500	NaN	0	1	0	1
1	1	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	38.0	1	PC 17599	71.2833	C85	1	0	1	0
2	2	1	3	Heikkinen, Miss. Laina	26.0	0	STON/O2. 3101282	7.9250	NaN	1	0	0	1
3	3	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	35.0	1	113803	53.1000	C123	1	0	0	1
4	4	0	3	Allen, Mr. William Henry	35.0	0	373450	8.0500	NaN	0	1	0	1

KFold 교차 검증과 GridSearchCV (0)	2022.11.18
기본 지도 학습 알고리즘 (2) 분류 (0)	2022.11.18
정규화와 모델 평가 (0)	2022.11.18
기본 지도 학습 알고리즘 (1) 회귀 (0)	2022.11.18
Colab과 Kaggle 연결 (0)	2022.11.18

KFold 교차 검증과 GridSearchCV (0)	2022.11.18
기본 지도 학습 알고리즘 (2) 분류 (0)	2022.11.18
정규화와 모델 평가 (0)	2022.11.18
데이터 전처리 (0)	2022.11.18
Colab과 Kaggle 연결 (0)	2022.11.18

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1

	datetime	year_month
0	2011-01-01 00:00:00	2011-1
1	2011-01-01 01:00:00	2011-1
2	2011-01-01 02:00:00	2011-1
3	2011-01-01 03:00:00	2011-1
4	2011-01-01 04:00:00	2011-1

matplotlib으로 누적 barplot 그리기 (0)	2023.02.02
barplot에 annotation 추가하기 (0)	2023.02.02
데이터의 빈도수 시각화 (0)	2022.11.18
Matplotlib, Seaborn 시각화 (0)	2022.11.18
시계열 데이터 (0)	2022.11.18

barplot에 annotation 추가하기 (0)	2023.02.02
Matplotlib Subplot (0)	2022.11.18
Matplotlib, Seaborn 시각화 (0)	2022.11.18
시계열 데이터 (0)	2022.11.18
데이터 집계와 재구조화 (0)	2022.11.18

rating	66 min	74 min	84 min	G	NC-17	NR	PG	PG-13	R	TV-14	TV-G	TV-MA	TV-PG	TV-Y	TV-Y7	TV-Y7-FV	UR
year
2008.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.0	NaN	NaN	NaN	NaN	NaN
2009.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.0	NaN	NaN	NaN	NaN	NaN
2010.0	NaN	NaN	NaN	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2011.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5.0	NaN	3.0	5.0	NaN	NaN	NaN	NaN
2012.0	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN	1.0	NaN	NaN	NaN	1.0	NaN	NaN	NaN	NaN
2013.0	NaN	NaN	NaN	NaN	NaN	4.0	1.0	NaN	NaN	2.0	1.0	3.0	NaN	NaN	NaN	NaN	NaN
2014.0	NaN	NaN	NaN	1.0	NaN	NaN	3.0	NaN	NaN	2.0	1.0	12.0	4.0	1.0	NaN	NaN	NaN
2015.0	NaN	NaN	NaN	1.0	NaN	5.0	2.0	2.0	3.0	14.0	5.0	29.0	8.0	7.0	4.0	2.0	NaN
2016.0	1.0	NaN	1.0	2.0	1.0	27.0	3.0	6.0	14.0	98.0	9.0	162.0	50.0	10.0	43.0	1.0	NaN
2017.0	NaN	1.0	NaN	4.0	1.0	24.0	19.0	26.0	66.0	326.0	23.0	446.0	168.0	35.0	45.0	1.0	1.0
2018.0	NaN	NaN	NaN	12.0	NaN	14.0	33.0	53.0	129.0	451.0	36.0	650.0	184.0	40.0	45.0	1.0	NaN
2019.0	NaN	NaN	NaN	8.0	1.0	4.0	81.0	135.0	208.0	494.0	40.0	736.0	198.0	54.0	54.0	1.0	2.0
2020.0	NaN	NaN	NaN	9.0	NaN	NaN	86.0	122.0	188.0	439.0	61.0	671.0	146.0	102.0	55.0	NaN	NaN
2021.0	NaN	NaN	NaN	4.0	NaN	NaN	58.0	146.0	190.0	326.0	44.0	489.0	97.0	57.0	87.0	NaN	NaN

	release_year	title
0	2011	185
1	2012	237
2	2013	288
3	2014	352
4	2015	560
5	2016	902
6	2017	1032
7	2018	1147
8	2019	1030
9	2020	953
10	2021	592

	type	ym	title
81	Movie	2019-10	128
82	Movie	2019-11	187
83	Movie	2019-12	168
84	Movie	2020-01	152
85	Movie	2020-02	72

		title
ym	type
2019-10	Movie	128
2019-11	Movie	187
2019-12	Movie	168
2020-01	Movie	152
2020-02	Movie	72

	title
type	Movie	TV Show
ym
2019-10	128	65
2019-11	187	68
2019-12	168	47
2020-01	152	53
2020-02	72	42

	date	previous_date
0	2021-09-25	2020-09-25
1	2021-09-24	2020-09-24
2	2021-09-24	2020-09-24
3	2021-09-24	2020-09-24
4	2021-09-24	2020-09-24
...	...	...
8802	2019-11-20	2018-11-20
8803	2019-07-01	2018-07-01
8804	2019-11-01	2018-11-01
8805	2020-01-11	2019-01-11
8806	2019-03-02	2018-03-02

	Age
0	0.000000
1	1.000000
2	0.291667
3	0.416667
4	0.625000

	Salary
0	25000000
1	35000000
2	30000000
3	50000000
4	35000000

	Salary
0	0.0
1	0.4
2	0.2
3	1.0
4	0.4

	Sex	Embarked
0	male	S
1	female	C
2	female	S
3	female	S
4	male	S

	date	contents	previous_date	previous_contents	yoy
1709	2021-09-21	5	2020-09-21	1.0	4.000000
1710	2021-09-22	9	2020-09-22	2.0	3.500000
1711	2021-09-23	2	2020-09-23	2.0	0.000000
1712	2021-09-24	10	2020-09-24	2.0	4.000000
1713	2021-09-25	1	2020-09-25	6.0	-0.833333

	date	dau	mau
0	2008-01-01	1	NaN
1	2008-02-04	1	NaN
2	2009-05-05	1	NaN
3	2009-11-18	1	NaN
4	2010-11-01	1	NaN
5	2011-05-17	1	NaN
6	2011-09-27	1	NaN
7	2011-10-01	11	NaN
8	2012-02-21	1	NaN
9	2012-11-14	1	NaN
10	2012-12-01	1	NaN
11	2013-03-31	1	NaN
12	2013-08-02	1	NaN
13	2013-09-01	2	NaN
14	2013-10-08	1	NaN
15	2013-10-14	2	NaN
16	2013-11-01	2	NaN
17	2013-12-12	1	NaN
18	2013-12-29	1	NaN
19	2014-01-17	1	NaN
20	2014-01-24	1	NaN
21	2014-02-01	1	NaN
22	2014-02-16	1	NaN
23	2014-04-01	1	NaN
24	2014-04-15	1	NaN
25	2014-06-15	1	NaN
26	2014-07-11	1	NaN
27	2014-08-15	1	NaN
28	2014-09-26	1	NaN
29	2014-10-10	1	43.0

	release_year	title
0	1925	1
1	1942	2
2	1943	3
3	1944	3
4	1945	4
...	...	...
69	2017	1032
70	2018	1147
71	2019	1030
72	2020	953
73	2021	592

	title_cnt	country_cnt
release_year
2000	37	36
2001	45	44
2002	51	50
2003	61	59
2004	64	62
2005	80	79
2006	96	95
2007	88	87
2008	136	132
2009	152	142
2010	194	178
2011	185	179
2012	237	222
2013	288	270
2014	352	333
2015	560	516
2016	902	838
2017	1032	966
2018	1147	1038
2019	1030	913
2020	953	852
2021	592	383

	type	rating	release_year	duration
0	Movie	66 min	2015.000000	NaN
1	Movie	74 min	2017.000000	NaN
2	Movie	84 min	2010.000000	NaN
3	Movie	G	1997.804878	90.268293
4	Movie	NC-17	2015.000000	125.000000

	0	1	2
0	PG	13	None
1	TV	MA	None
2	TV	MA	None
3	TV	MA	None
4	TV	MA	None
...	...	...	...
8802	R	None	None
8803	TV	Y7	None
8804	R	None	None
8805	PG	None	None
8806	TV	14	None

	type	country
0	US	United States
1	Non_US	South Africa
2	Non_US	NaN
3	Non_US	NaN
4	Non_US	India
...	...	...
8802	US	United States
8803	Non_US	NaN
8804	US	United States
8805	US	United States
8806	Non_US	India

	type	country
0	US	United States
1	Non-US/India	South Africa
2	Non-US/India	NaN
3	Non-US/India	NaN
4	India	India
...	...	...
8802	US	United States
8803	Non-US/India	NaN
8804	US	United States
8805	US	United States
8806	India	India

전체 글

Min-max normalization¶

One-hot Encoding¶

'machine_learning' 카테고리의 다른 글

선형 회귀¶

Gradient Descent¶

다항 회귀¶

회귀 트리¶

'machine_learning' 카테고리의 다른 글

Kaggle과 Colab 연결하기¶

Kaggle 데이터셋 불러오기¶

'machine_learning' 카테고리의 다른 글

'visualization' 카테고리의 다른 글

Kaggle과 Colab 연결¶

sns.countplot()¶

sns.distplot()¶

'visualization' 카테고리의 다른 글

lineplot¶

displot¶

boxplot¶

heatmap¶

stacked barplot¶

matplotlib pie¶

'visualization' 카테고리의 다른 글

pd.to_datetime()¶

dt.strftime()¶

dt.to_period()¶

pd.Categorical()¶

relativedelta()¶

rolling()¶

'visualization' 카테고리의 다른 글

value_counts()¶

groupby().agg()¶

drop()¶

xs()¶

stack()¶

unstack()¶

pd.melt()¶

'visualization' 카테고리의 다른 글

str.contains()¶

str.split()¶

str.find()¶

pd.to_numeric()¶

apply()¶

np.where()¶

'visualization' 카테고리의 다른 글

티스토리툴바