machine_learning
Classification Evaluation Metrics
hayleyhell
2022. 11. 18. 19:04
Linking Kaggle and Colab
In [1]:
!pip install kaggle
from google.colab import files
files.upload()
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2022.9.24)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.2)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.64.1)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (6.1.2)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)
Saving kaggle.json to kaggle.json
Out[1]:
{'kaggle.json': b'{"username":"minaahayley","key":"3af569f6a8c028e6233a6d29ce2439d3"}'}
In [2]:
ls -1ha kaggle.json
kaggle.json
In [3]:
!mkdir -p ~/.kaggle                # create the ~/.kaggle folder
!cp kaggle.json ~/.kaggle          # copy kaggle.json into ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json   # restrict permissions so the API key warning goes away
In [4]:
# Check that the file ended up in the right place.
# The ls command lists the files at a given path.
%ls ~/.kaggle
kaggle.json
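As an optional sanity check that is not part of the original notebook, the Kaggle CLI can be asked to list competitions; if the credentials in kaggle.json were not picked up, this command fails with an authentication error:

# optional, illustrative check that the API token works
!kaggle competitions list | head -5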
In [5]:
# Download the datasets with the copied API commands
!kaggle competitions download -c titanic
!kaggle datasets download -d uciml/pima-indians-diabetes-database
# Unzip the downloaded files
!unzip titanic.zip
!unzip pima-indians-diabetes-database.zip
Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 19.7MB/s]
Downloading pima-indians-diabetes-database.zip to /content
  0% 0.00/8.91k [00:00<?, ?B/s]
100% 8.91k/8.91k [00:00<00:00, 6.11MB/s]
Archive:  titanic.zip
  inflating: gender_submission.csv
  inflating: test.csv
  inflating: train.csv
Archive:  pima-indians-diabetes-database.zip
  inflating: diabetes.csv
In [6]:
!mkdir -p ~/.kaggle/competitions/titanic                        # create a folder for the Titanic competition files
!cp gender_submission.csv ~/.kaggle/competitions/titanic        # copy the files into it
!cp test.csv ~/.kaggle/competitions/titanic
!cp train.csv ~/.kaggle/competitions/titanic
!mkdir -p ~/.kaggle/data/pima-indians-diabetes-database         # create a folder for the diabetes dataset
!cp diabetes.csv ~/.kaggle/data/pima-indians-diabetes-database  # copy the file into it
In [7]:
# Check that the files were copied correctly.
# The ls command lists the files at a given path.
%ls ~/.kaggle/competitions/titanic
%ls ~/.kaggle/data/pima-indians-diabetes-database
gender_submission.csv  test.csv  train.csv
diabetes.csv
Titanic Survival Prediction
In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
submission = pd.read_csv('~/.kaggle/competitions/titanic/gender_submission.csv')
test = pd.read_csv('~/.kaggle/competitions/titanic/test.csv')
train = pd.read_csv('~/.kaggle/competitions/titanic/train.csv')
In [35]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
In [36]:
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
In [37]:
# Fill the null values in Age, Cabin, Embarked, and Fare.
train['Age'].fillna(train['Age'].mean(), inplace=True)
train['Cabin'].fillna('N', inplace=True)
train['Embarked'].fillna('N', inplace=True)
train['Fare'].fillna(0, inplace=True)
test['Age'].fillna(test['Age'].mean(), inplace=True)
test['Cabin'].fillna('N', inplace=True)
test['Embarked'].fillna('N', inplace=True)
test['Fare'].fillna(0, inplace=True)
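A quick way to confirm the imputation worked is to count the remaining nulls; a minimal check (not in the original notebook):

# sanity check: every column should now report 0 missing values
print(train.isnull().sum())
print(test.isnull().sum())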
In [38]:
# Drop features that are not used for training
X_train = train.drop(['Name','PassengerId', 'Ticket','Survived'], axis=1)
y_train = train['Survived']
test = test.drop(['Name','PassengerId', 'Ticket'], axis=1)
In [39]:
# Convert string categorical features into numeric categorical features.
from sklearn.preprocessing import LabelEncoder
def encoding(df):
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()   # fit a fresh encoder for each feature
        df[feature] = le.fit_transform(df[feature])
    return df
X_train = encoding(X_train)
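Note that only X_train is encoded here; to score the Kaggle test set for a submission, the same transformation would also have to be applied to it. A minimal sketch under that assumption (illustrative: calling encoding() on test fits fresh encoders, so for a real submission it is safer to fit the encoders on the training categories and reuse them):

# illustrative only: encode the held-out Kaggle test set the same way
# (a fresh fit per frame can map categories differently from the training fit)
test = encoding(test)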
In [40]:
# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11)
y_train = y_train.values.ravel()  # flatten the target into a 1-D array
In [41]:
# Classification evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score
def get_clf_eval(y_test, predict=None, predict_proba=None):
    confusion = confusion_matrix(y_test, predict)     # confusion matrix
    accuracy = accuracy_score(y_test, predict)        # accuracy
    precision = precision_score(y_test, predict)      # precision
    recall = recall_score(y_test, predict)            # recall
    f1 = f1_score(y_test, predict)                    # F1 score
    roc_auc = roc_auc_score(y_test, predict_proba)    # ROC AUC score
    print('Confusion matrix')
    print(confusion)
    print('Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, F1: {3:.4f}, AUC: {4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))
In [42]:
# Instantiate the model
lr_clf = LogisticRegression(solver='liblinear')
# Train, predict, and evaluate
lr_clf.fit(X_train, y_train)
predict_proba = lr_clf.predict_proba(X_val)[:, 1]  # predicted probabilities for Survived = 1
predict = lr_clf.predict(X_val)                    # predicted class labels
get_clf_eval(y_val, predict, predict_proba)
Confusion matrix
[[108  10]
 [ 16  45]]
Accuracy: 0.8547, Precision: 0.8182, Recall: 0.7377, F1: 0.7759, AUC: 0.8980
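For reference, everything except AUC can be reproduced by hand from the confusion matrix above, whose layout is [[TN, FP], [FN, TP]]; a small arithmetic check (not in the original notebook):

# recompute the metrics from the printed confusion matrix
tn, fp, fn, tp = 108, 10, 16, 45
accuracy = (tp + tn) / (tp + tn + fp + fn)          # 0.8547
precision = tp / (tp + fp)                          # 0.8182
recall = tp / (tp + fn)                             # 0.7377
f1 = 2 * precision * recall / (precision + recall)  # 0.7759
# AUC cannot be recovered from the matrix alone; it needs the predicted probabilities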
Pima Indians Diabetes Prediction
In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
In [43]:
diabetes = pd.read_csv('~/.kaggle/data/pima-indians-diabetes-database/diabetes.csv')
diabetes.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [44]:
# Split features (X) and target (y)
X = diabetes.iloc[:, :-1]
y = diabetes.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=156,
                                                    stratify=y)  # keep the original class distribution in both splits (recommended for classification)
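To see what stratify=y buys, the class proportions of the two splits can be compared with the full dataset; a minimal check (not in the original notebook):

# illustrative check: the Outcome ratio should be nearly identical in all three
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))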
In [45]:
# Train, predict, and evaluate with logistic regression
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)
predict = lr_clf.predict(X_test)
predict_proba = lr_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, predict, predict_proba)
Confusion matrix
[[87 13]
 [22 32]]
Accuracy: 0.7727, Precision: 0.7111, Recall: 0.5926, F1: 0.6465, AUC: 0.8083
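precision_recall_curve and roc_curve are imported above but not used in this excerpt; a minimal sketch of how they could be applied to the predicted probabilities (illustrative, not part of the original notebook):

# illustrative: threshold trade-off curves from the predicted probabilities
precisions, recalls, _ = precision_recall_curve(y_test, predict_proba)
fpr, tpr, _ = roc_curve(y_test, predict_proba)
plt.plot(recalls, precisions, label='Precision-Recall')
plt.plot(fpr, tpr, label='ROC')
plt.legend()
plt.show()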