인공지능 공부/딥러닝 논문읽기

(자연어처리) 기초 키워드 분석

!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x.


import urllib.request

# Download the ratings file. The context manager closes the HTTP response as
# soon as every line has been read instead of leaving the connection open.
with urllib.request.urlopen("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt") as response:
  raw = response.readlines()
print(raw[:5])

# Skip the first line (treated as a header row) and decode the remaining
# byte lines into str.
raw = [line.decode() for line in raw[1:]]

# Keep the second tab-separated field of every line.
# NOTE(review): presumably this column is the review text -- confirm against
# the header printed above. A line without a tab would raise IndexError here.
reviews = [line.split('\t')[1] for line in raw]

print(reviews[:5])



from konlpy.tag import Mecab

# Build the MeCab-backed tagger provided by KoNLPy.
tagger = Mecab()

# Collect every noun the tagger extracts, review by review, into one flat list.
nouns = []
for review in reviews:
  nouns.extend(tagger.nouns(review))

# Preview the first ten nouns (shown as cell output when run in a notebook).
nouns[:10]


# Words to leave out of the noun counts: written as one space-separated string
# and split into a list in a single step.
stop_words = "영화 전 난 일 걸 뭐 줄 만 건 분 개 끝 잼 이거 번 중 듯 때 게 내 말 나 수 거 점 것".split(" ")
print(stop_words)


# Re-extract the nouns from every review, this time dropping stop words.
# The stop words are copied into a set once, outside the loops, so that each
# membership test is a hash lookup rather than a scan of the stop_words list.
stop_word_set = set(stop_words)

nouns = []
for review in reviews:
  nouns.extend(noun for noun in tagger.nouns(review) if noun not in stop_word_set)

# Preview the first ten remaining nouns (shown as cell output in a notebook).
nouns[:10]

from collections import Counter
# Tally how many times each noun occurs.
nouns_counter = Counter()
nouns_counter.update(nouns)

# Keep the 50 most frequent nouns as a {noun: count} dict, most frequent first.
top_nouns = {word: count for word, count in nouns_counter.most_common(50)}
top_nouns

import numpy as np
# `plt` is used below but is not imported anywhere in the visible file; import
# it here so this cell does not fail with NameError.
import matplotlib.pyplot as plt

# Horizontal bar chart with one bar per entry of top_nouns.
y_pos = np.arange(len(top_nouns))
plt.figure(figsize=(12,12))
# Pass real lists rather than dict views so matplotlib receives plain sequences.
plt.barh(y_pos, list(top_nouns.values()))
plt.title('Word count')
# NOTE(review): the tick labels are Korean nouns; the active matplotlib font
# presumably needs Hangul glyphs to render them -- confirm in the target setup.
plt.yticks(y_pos, list(top_nouns.keys()))
plt.show()

from wordcloud import WordCloud
# `plt` is used below but is not imported anywhere in the visible file; import
# it here so this cell does not fail with NameError.
import matplotlib.pyplot as plt

# Build a word cloud from the {noun: count} frequencies in top_nouns.
# NOTE(review): font_path is relative to the working directory, so the
# NanumBarunGothic font file presumably has to be placed under ./font first.
wc = WordCloud(background_color = "white", font_path = './font/NanumBarunGothic.ttf')
wc.generate_from_frequencies(top_nouns)

# Show the cloud on a single axes with the axis decorations turned off.
figure = plt.figure(figsize = (12,12))
ax = figure.add_subplot(1,1,1)
ax.axis('off')
ax.imshow(wc)
plt.show()

import squarify
# `mpl` is used below but is not imported anywhere in the visible file; import
# it here so this cell does not fail with NameError. Importing pyplot also
# ensures the matplotlib.colors and matplotlib.cm submodules are loaded.
import matplotlib as mpl
import matplotlib.pyplot as plt

# Scale the counts to [0, 1] between the smallest and largest count in
# top_nouns, so each count can be mapped to a shade of the Blues colormap.
norm = mpl.colors.Normalize(vmin = min(top_nouns.values()),
                            vmax=max(top_nouns.values()))

colors = [mpl.cm.Blues(norm(value)) for value in top_nouns.values()]

# Treemap: one rectangle per noun, sized by its count and colored by the
# normalized count.
squarify.plot(label = list(top_nouns.keys()),
              sizes = list(top_nouns.values()),
              color = colors,
              alpha = .7)
# Display the figure explicitly, as the bar chart and word cloud cells do.
plt.show()