# Sample passage used by the tokenization examples. Adjacent string literals
# inside parentheses are concatenated, so the source indentation of the
# continuation lines is not embedded in the text (a backslash continuation
# placed inside the quotes keeps that indentation as long runs of spaces).
text_sample = (
    'The Matrix is everywhere its all around us, here even in this room. '
    'You can see it out your window or on your television. '
    'You feel it when you go to work, or go to church or pay your taxes.'
)


from nltk import word_tokenize, sent_tokenize 
import nltk
# Download the NLTK 'punkt' data package (presumably the tokenizer data that
# sent_tokenize / word_tokenize rely on -- confirm against the NLTK docs).
nltk.download('punkt')

def tokenize_text(text):
    """Split *text* into sentences and each sentence into word tokens.

    The text is first divided with ``sent_tokenize``; every resulting
    sentence is then passed to ``word_tokenize``.

    Returns a list holding one ``word_tokenize`` result per sentence, in
    sentence order.
    """
    return [word_tokenize(single_sentence) for single_sentence in sent_tokenize(text)]

# Run per-sentence word tokenization over the multi-sentence sample
word_tokens = tokenize_text(text_sample)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


import nltk 
# Download the NLTK 'stopwords' corpus package.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

True


# English stop-word list from NLTK, kept as a list under its original name.
stopwords = nltk.corpus.stopwords.words('english')
# Set copy so each membership test is a hash lookup instead of a list scan.
stopword_set = set(stopwords)

# Lower-case every token of every sentence and keep those that are not stop
# words. Punctuation tokens are not in the stop-word list, so they are kept.
all_tokens = []
for sentence in word_tokens:
    lowered_words = (word.lower() for word in sentence)
    all_tokens.extend(word for word in lowered_words if word not in stopword_set)

all_tokens

['matrix',
 'everywhere',
 'around',
 'us',
 ',',
 'even',
 'room',
 '.',
 'see',
 'window',
 'television',
 '.',
 'feel',
 'go',
 'work',
 ',',
 'go',
 'church',
 'pay',
 'taxes',
 '.']


from nltk.stem import LancasterStemmer 
# Stem three inflected forms with the Lancaster stemmer and print each result.
stemmer = LancasterStemmer()

for inflected in ('working', 'works', 'worked'):
    print(stemmer.stem(inflected))

work
work
work


from nltk.stem import WordNetLemmatizer 

import nltk 
# Download the 'wordnet' and 'omw-1.4' NLTK data packages before lemmatizing.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

lemma = WordNetLemmatizer()

# For a more accurate base form, lemmatization needs the word's part of speech
# as a second argument; 'a' is passed for these adjectives.
for adjective in ('happier', 'happiest'):
    print(lemma.lemmatize(adjective, 'a'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...

happy
happy


import spacy
from collections import Counter
import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# The required resources must be loaded first; load the English resources.
nlp = spacy.load("en_core_web_sm")


sample_review = "Phone will become defective in about three months. you have been warned."
print(sample_review)

# Lower-case the review before running it through the spaCy pipeline.
doc = nlp(sample_review.lower())

# Keep the lemma of every token that is neither punctuation nor whitespace.
lemmas = [token.lemma_ for token in doc if not (token.is_punct or token.is_space)]

# Join with a single space so the lemmas remain separate words. Joining on the
# empty string fused them into one run-together string ('phonewillbecome...'),
# which is what the previously recorded output of this cell shows.
tokens_normalized = ' '.join(lemmas)
tokens_normalized

Phone will become defective in about three months. you have been warned.

'phonewillbecomedefectiveinaboutthreemonthyouhavebewarn'


import numpy as np

from scipy import sparse

# Dense reference matrix; the sparse matrices built below encode these values.
dense = np.array([[0, 0, 1, 0, 0, 5],
                  [1, 4, 0, 3, 2, 5],
                  [0, 6, 0, 3, 0, 0],
                  [2, 0, 0, 0, 0, 0],
                  [0, 0, 0, 7, 0, 8],
                  [1, 0, 0, 0, 0, 0]])

# Non-zero values, listed row by row
data = np.array([1, 5, 1, 4, 3, 2, 5, 6, 3, 2, 7, 8, 1])

# Row and column position of each non-zero value
row_pos = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 5])
col_pos = np.array([2, 5, 0, 1, 3, 4, 5, 1, 3, 0, 3, 5, 0])

# Convert to COO format. The shape is passed explicitly so the result always
# matches `dense`; without it the dimensions are inferred from the largest
# row/column index, which would drop trailing all-zero rows or columns.
sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)), shape=dense.shape)

# Index into `data` where each row starts, plus a final entry equal to the
# total number of non-zero values (the CSR index pointer array)
row_pos_ind = np.array([0, 2, 7, 9, 10, 12, 13])

# Convert to CSR format, again with the explicit shape
sparse_csr = sparse.csr_matrix((data, col_pos, row_pos_ind), shape=dense.shape)

print(sparse_coo.toarray())
print(sparse_csr.toarray())

[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


from sklearn.datasets import fetch_20newsgroups 

# Load the full 20 newsgroups dataset (subset='all') with a fixed random_state.
news_data = fetch_20newsgroups(subset='all', random_state=156)

# The return value is a Bunch object, which is similar to a Python dictionary
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


import pandas as pd 

# Class values and how many documents fall into each, ordered by class index
pd.Series(news_data.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64


# Class names
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


# Display the first document in the dataset
news_data.data[0]

'From: egreen@east.sun.com (Ed Green - Pixel Cruncher)\nSubject: Re: Observation re: helmets\nOrganization: Sun Microsystems, RTP, NC\nLines: 21\nDistribution: world\nReply-To: egreen@east.sun.com\nNNTP-Posting-Host: laser.east.sun.com\n\nIn article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:\n> \n> The question for the day is re: passenger helmets, if you don\'t know for \n>certain who\'s gonna ride with you (like say you meet them at a .... church \n>meeting, yeah, that\'s the ticket)... What are some guidelines? Should I just \n>pick up another shoei in my size to have a backup helmet (XL), or should I \n>maybe get an inexpensive one of a smaller size to accomodate my likely \n>passenger? \n\nIf your primary concern is protecting the passenger in the event of a\ncrash, have him or her fitted for a helmet that is their size.  If your\nprimary concern is complying with stupid helmet laws, carry a real big\nspare (you can put a big or small head in a big helmet, but not in a\nsmall one).\n\n---\nEd Green, former Ninjaite |I was drinking last night with a biker,\n  Ed.Green@East.Sun.COM   |and I showed him a picture of you.  I said,\nDoD #0111  (919)460-8302  |"Go on, get to know her, you\'ll like her!"\n (The Grateful Dead) -->  |It seemed like the least I could do...\n\n'


from sklearn.datasets import fetch_20newsgroups 

# Parts removed from every post so that only the body text is kept.
removed_parts = ('headers', 'footers', 'quotes')

# Training data only (subset='train')
train_news = fetch_20newsgroups(subset='train', remove=removed_parts, random_state=156)
X_train, y_train = train_news.data, train_news.target

# Test data only (subset='test')
test_news = fetch_20newsgroups(subset='test', remove=removed_parts, random_state=156)
X_test, y_test = test_news.data, test_news.target

print(len(train_news.data))
print(len(test_news.data))

11314
7532


from sklearn.feature_extraction.text import CountVectorizer 

# Count-based feature vectorization: the vectorizer is fitted on the training
# texts only, and the same fitted vectorizer then transforms the test texts.
cnt_vect = CountVectorizer()
X_train_cnt_vect = cnt_vect.fit_transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

for vectorized in (X_train_cnt_vect, X_test_cnt_vect):
    print(vectorized.shape)

(11314, 101631)
(7532, 101631)


from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
import warnings 
warnings.filterwarnings('ignore')

# Train LogisticRegression on the count vectors, then predict and evaluate
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_cnt_vect, y_train)

pred = lr_clf.predict(X_test_cnt_vect)
accuracy = accuracy_score(y_test, pred)
print(f'CounterVectorized Logistic Regression의 예측 정확도는 {accuracy:.3f}')

CounterVectorized Logistic Regression의 예측 정확도는 0.617


# TF-IDF 기반으로 피처 벡터화 
from sklearn.feature_extraction.text import TfidfVectorizer 

# TF-IDF vectorization with default settings: fit on the training texts, then
# transform the test texts with the same fitted vectorizer.
tfidf_vect = TfidfVectorizer()
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# Train LogisticRegression on the TF-IDF vectors, then predict and evaluate
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_tfidf_vect, y_train)

pred = lr_clf.predict(X_test_tfidf_vect)
accuracy = accuracy_score(y_test, pred)
print(f'TF-IDF Vectorized Logistic Regression의 예측 정확도는 {accuracy:.3f}')

TF-IDF Vectorized Logistic Regression의 예측 정확도는 0.678


# Add stop-word filtering and change the n-gram range from the default (1, 1)
# to (1, 2). max_df=300 keeps only terms that appear in at most 300 documents.
#
# NOTE: this vectorizer was previously overwritten by a second, default
# `TfidfVectorizer()` assignment before fitting, so none of these settings
# took effect and the recorded accuracy matched the default run (0.678).
# Re-run the cell to obtain the accuracy for this configuration.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# Train LogisticRegression on the TF-IDF vectors, then predict and evaluate
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_tfidf_vect, y_train)

pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Vectorized Logistic Regression의 예측 정확도는 0.678


# GridSearchCV를 이용해 로지스틱 회귀의 하이퍼 파라미터 최적화 
from sklearn.model_selection import GridSearchCV 

# Tune C: try each candidate value with 3-fold cross-validation on accuracy
params = {'C': [0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(
    lr_clf,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print(grid_cv_lr.best_params_, grid_cv_lr.best_score_)

# Predict on the test set with the best estimator found by the search and
# report its accuracy
best_lr = grid_cv_lr.best_estimator_
pred = best_lr.predict(X_test_tfidf_vect)
accuracy = accuracy_score(y_test, pred)
print(f'TF-IDF Vectorized Logistic Regression의 예측 정확도는 {accuracy:.3f}')

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'C': 10} 0.74164778571448
TF-IDF Vectorized Logistic Regression의 예측 정확도는 0.689


from sklearn.pipeline import Pipeline

# Chain a TfidfVectorizer and a LogisticRegression into a single Pipeline
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)),
    ('lr_clf', LogisticRegression(solver='liblinear', C=10, random_state=156)),
])

# No separate fit()/transform() on the vectorizer or fit()/predict() on the
# classifier is needed: the pipeline's own fit() and predict() perform the
# feature vectorization and the model training/prediction in one go.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, pred)
print(f'Pipeline을 통한 Logistic Regression의 예측 정확도는 {accuracy:.3f}')

Pipeline을 통한 Logistic Regression의 예측 정확도는 0.704


pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression(solver='liblinear')),
])

# Each key is a pipeline step name, two underscores, then the name of the
# parameter/hyperparameter of that step that GridSearchCV should vary.
params = {
    'tfidf_vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf_vect__max_df': [100, 300, 700],
    'lr_clf__C': [1, 5, 10],
}

# The Pipeline object, rather than a bare estimator, is handed to GridSearchCV
grid_cv_pipe = GridSearchCV(
    pipeline,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_cv_pipe.fit(X_train, y_train)
print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)

best_pipeline = grid_cv_pipe.best_estimator_
pred = best_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(f'Pipeline을 통한 Logistic Regression의 예측 정확도는 {accuracy:.3f}')

Fitting 3 folds for each of 27 candidates, totalling 81 fits
{'lr_clf__C': 10, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.7550828826229531
Pipeline을 통한 Logistic Regression의 예측 정확도는 0.702

추천 시스템 (0)	2022.11.20
텍스트 분석 (2) 감성 분석 및 토픽 모델링 (0)	2022.11.18
RFM 기반 고객 세그먼테이션 (0)	2022.11.18
차원 축소와 군집화 (0)	2022.11.18
HyperOpt를 이용한 하이퍼 파라미터 튜닝.ipynb (0)	2022.11.18

hayley

텍스트 분석 (1) 텍스트 정규화 및 피처 벡터화

텍스트 정규화

토큰화

Stop word 제거

Stemming과 Lemmatization

BOW 피처 벡터화

BOW 벡터화를 위한 희소 행렬

ML 모델 학습/예측/평가

20 뉴스그룹 분류

Sklearn Pipeline 사용 및 GridSearchCV와의 결합

'machine_learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바