감성분석 - Machine Learning
IMDB 영화 리뷰 데이터 ML 기반 감성분석( CountVectorizer 활용 )
1 . IMDB 영화 리뷰 데이터를 활용 ( 다운로드 : https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews )
import pandas as pd
import nltk
from afinn import Afinn
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import numpy as np
import matplotlib.pyplot as plt
# Load the IMDB review CSV (relative path, so it is resolved against the
# current working directory) into a DataFrame.
file_name = "IMDB Dataset.csv"
review = pd.read_csv(filepath_or_buffer=file_name, engine="python")
# Preview the first 10 rows.
review.head(n=10)
2 . 데이터 전처리 하기
# Import the modules used to strip HTML tags and clean the text
from bs4 import BeautifulSoup
import re
# Analyze only the first 100 reviews.
n = 100

# Strip HTML markup and keep only the visible text of each review.
reviews = [
    BeautifulSoup(raw_review, "html5lib").get_text()
    for raw_review in review['review'][0:n]
]

# Replace every non-letter character with a space, then lowercase.
# The class must be [^a-zA-Z]: the range 'A-z' also spans the ASCII
# characters between 'Z' and 'a' ('[', '\\', ']', '^', '_' and '`'), so
# those symbols would survive the cleaning step.
non_letters = re.compile(r'[^a-zA-Z]')
review_list = [non_letters.sub(' ', text).lower() for text in reviews]

# Tokenize each cleaned review on whitespace.
token_list = [text.split() for text in review_list]

# Remove English stop words token by token.
# token_list is a list of token lists, so the membership test has to be
# applied to each word; testing a whole token list against the stop-word
# list is never true and removes nothing. The stop words are loaded once
# into a set so each lookup is O(1).
english_stop_words = set(stopwords.words('english'))
sentence_words = [
    [word for word in tokens if word not in english_stop_words]
    for tokens in token_list
]

# Re-join the remaining tokens into one space-separated string per review,
# which is the input format the vectorizer expects.
clean_review = [' '.join(tokens) for tokens in sentence_words]
3 . DTM, TDM 행렬
# 리뷰의 토큰을 피쳐(독립변수)로 변환
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
# Bag-of-words settings (parameter values changed from the tutorial).
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    min_df=2,            # keep only tokens that occur in at least 2 documents
    ngram_range=(1, 1),  # (min_n, max_n): unigrams only
    max_features=20000,
)

# Wrap the vectorizer in a single-step pipeline. With only one step this
# produces the same matrix as calling the vectorizer directly.
pipeline = Pipeline([('vect', vectorizer)])

# Learn the vocabulary and build the sparse document-term matrix.
data_features = pipeline.fit_transform(clean_review)
data_features.shape
data_features.toarray()

# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.
vocab = vectorizer.get_feature_names_out()

import pandas as pd
# Label the columns with the vocabulary so the printed matrix is readable.
df = pd.DataFrame(data_features.toarray(), columns=vocab)
print(df)
4 . Machine Learning 분석
# Random forest classification with 10-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Use as many labels as there are rows in the document-term matrix.
k = data_features.shape[0]
labels = review['sentiment'][0:k]

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2018)
forest = forest.fit(data_features, labels)

# ROC AUC of each of the 10 folds. Cross-validation refits the model once
# per fold, so run it a single time and reuse the result for the mean.
fold_scores = cross_val_score(forest, data_features, labels, cv=10, scoring='roc_auc')
fold_scores

# Mean ROC AUC over all folds.
score = np.mean(fold_scores)
print(score)
IMDB 영화 리뷰 데이터 ML 기반 감성분석( TfidfVectorizer 활용 )
1 . TF-IDF 벡터 이용
from sklearn.model_selection import train_test_split
# Restrict the dataset to its first 1000 reviews.
n = 1000
review = review.iloc[:n]
x_input, y_output = review['review'], review['sentiment']

# Stratified 80/20 split so both sets keep the label proportions of y_output.
x_train, x_test, y_train, y_test = train_test_split(
    x_input,
    y_output,
    test_size=0.2,
    random_state=15,
    stratify=y_output,
)
print(x_train.shape, x_test.shape)  # sizes of the training and test sets
np.unique(y_train, return_counts=True)  # label counts in the training set
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop-word list from NLTK, passed to the vectorizer below.
stop_words = stopwords.words('english')

# Fit TF-IDF on the training text only and turn it into a weighted
# document-term matrix in one step; `vect` stays fitted for later transforms.
vect = TfidfVectorizer(stop_words=stop_words)
x_train_vectorized = vect.fit_transform(x_train)
2 . 로지스틱 회귀 수행
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Fit a logistic regression classifier on the TF-IDF training matrix.
model = LogisticRegression()
model.fit(x_train_vectorized, y_train)

# Score on the training set, then on the held-out test set. The test text
# is transformed with the vectorizer that was fitted on the training text.
print(model.score(x_train_vectorized, y_train))
print(model.score(vect.transform(x_test), y_test))
3 . Decision Tree 수행
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree classifier on the TF-IDF training matrix.
clf = DecisionTreeClassifier()
clf.fit(x_train_vectorized, y_train)

# Score on the training set, then on the held-out test set. The test text
# is transformed with the vectorizer that was fitted on the training text.
print(clf.score(x_train_vectorized, y_train))
print(clf.score(vect.transform(x_test), y_test))
댓글남기기