!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x.
import urllib.request
raw = urllib.request.urlopen("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt").readlines()
print(raw[:5])
raw = [x.decode() for x in raw[1:]]
reviews = []
for i in raw :
reviews.append(i.split('\t')[1])
print(reviews[:5])
from konlpy.tag import Mecab
tagger = Mecab()
nouns = []
for review in reviews:
for noun in tagger.nouns(review):
nouns.append(noun)
nouns[:10]
stop_words = "영화 전 난 일 걸 뭐 줄 만 건 분 개 끝 잼 이거 번 중 듯 때 게 내 말 나 수 거 점 것"
stop_words = stop_words.split(" ")
print(stop_words)
nouns = []
for review in reviews:
for noun in tagger.nouns(review):
if noun not in stop_words:
nouns.append(noun)
nouns[:10]
from collections import Counter
nouns_counter = Counter(nouns)
top_nouns = dict(nouns_counter.most_common(50))
top_nouns