Text Mining: Word Representation
How to Collect Text (Data)
We learn how to crawl data with Python. Before getting into the code itself, we first cover the encodings and environment setup you need to know.
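As a minimal sketch of that crawling step (the URL and the CSS selector here are hypothetical placeholders, and requests/BeautifulSoup are just one common choice rather than the tool these notes prescribe), note the explicit encoding handling:

import requests
from bs4 import BeautifulSoup

url = "https://example.com/articles"  # hypothetical page; replace with the site you actually want to crawl
resp = requests.get(url, timeout=10)
resp.encoding = resp.apparent_encoding  # guard against mis-detected encodings (e.g. EUC-KR vs UTF-8)
soup = BeautifulSoup(resp.text, "html.parser")
texts = [p.get_text(strip=True) for p in soup.select("p")]  # the 'p' selector is only an assumption
print(texts[:5])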
Different types of Word Embeddings
- Frequency-based embeddings: Count Vector, TF-IDF Vector, Co-Occurrence Vector (a short sketch follows below)
- Prediction-based embeddings: CBOW (Continuous Bag of Words), Skip-Gram model
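A minimal sketch of the frequency-based side, assuming scikit-learn's CountVectorizer and TfidfVectorizer and a made-up toy corpus:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs play together",
]

# Count Vector: raw term frequencies per document
count_vec = CountVectorizer()
counts = count_vec.fit_transform(corpus)
print(count_vec.get_feature_names_out())
print(counts.toarray())

# TF-IDF Vector: term frequencies reweighted by inverse document frequency
tfidf = TfidfVectorizer().fit_transform(corpus)
print(tfidf.toarray().round(2))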
Distributional Embeddings
Training Methods
A characteristic of the TDM (term-document matrix) is that it is sparse: most entries carry no information, i.e. it is mostly zeros.
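To see that sparsity directly, a quick check on the same kind of count matrix (toy documents, scikit-learn assumed as above):

from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "homer likes donuts",
    "marge talks to homer",
    "bart rides a skateboard",
]
tdm = CountVectorizer().fit_transform(docs)  # stored as a scipy sparse matrix
total = tdm.shape[0] * tdm.shape[1]
print(tdm.toarray())
print("non-zero entries: {} / {}".format(tdm.nnz, total))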
GloVe
GloVe stands for Global Vectors for Word Representation.
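A minimal sketch of using pre-trained GloVe vectors through gensim's downloader API (assumes internet access; 'glove-wiki-gigaword-100' is one of the packaged models in the gensim-data catalog):

import gensim.downloader as api

glove = api.load("glove-wiki-gigaword-100")  # downloads on first use (~130 MB)
print(glove.most_similar("homer", topn=5))
print(glove.similarity("cat", "dog"))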
FastText
A variant of word2vec developed by Facebook. It works at the character level rather than the word level and extends the skip-gram model, learning over character n-grams.
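A minimal sketch of FastText training with gensim (the toy sentences and hyperparameters are placeholders; parameter names follow gensim >= 4.0, whereas the Word2Vec code further down uses the older gensim 3.x names such as size):

from gensim.models import FastText

toy_sentences = [
    ["homer", "eats", "donuts"],
    ["marge", "talks", "to", "homer"],
    ["bart", "annoys", "lisa"],
]
# min_n/max_n set the character n-gram range that gives FastText its subword vectors
ft_model = FastText(toy_sentences, vector_size=50, window=3, min_count=1,
                    min_n=3, max_n=5, epochs=10)
# thanks to subword n-grams, even a word unseen in training still gets a vector
print(ft_model.wv["homers"][:5])
print(ft_model.wv.most_similar("homer", topn=3))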
import re # For preprocessing
import pandas as pd # For data handling
from time import time # To time our operations
from collections import defaultdict # For word frequency
import spacy # For preprocessing
# python -m spacy download en_core_web_sm
import logging # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
df = pd.read_csv('data/simpsons_dataset.csv')
print(df.shape)
print(df.head())
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word;
    # if a sentence is only one or two words long, the benefit for training is very small
    if len(txt) > 2:
        return ' '.join(txt)
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])
t = time()
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])  # disable NER and the dependency parser for speed
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
print(df_clean.shape)
from gensim.models.phrases import Phrases, Phraser
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]
# Most Frequent Words:
word_freq = defaultdict(int)
for sentence in sentences:
    for i in sentence:
        word_freq[i] += 1
print(len(word_freq))
print(sorted(word_freq, key=word_freq.get, reverse=True)[:10])
# Train the Model
import multiprocessing
from gensim.models import Word2Vec
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
w2v_model = Word2Vec(min_count=20, window=2, size=300, sample=6e-5, alpha=0.03,
                     min_alpha=0.0007, negative=20, workers=cores-1)
# Note: these are gensim 3.x parameter names; in gensim >= 4.0, size is called vector_size.
# min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)
# window = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the right of our target - (2, 10)
# size = int - Dimensionality of the feature vectors. - (50, 300)
# sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)
# alpha = float - The initial learning rate - (0.01, 0.05)
# min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
# negative = int - If > 0, negative sampling will be used; the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)
# workers = int - Use these many worker threads to train the model (=faster training with multicore machines)
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
w2v_model.init_sims(replace=True)  # precompute L2-normalised vectors; deprecated (and no longer needed) in gensim >= 4.0
print(w2v_model.wv.most_similar(positive=["homer"]))
print(w2v_model.wv.most_similar(positive=["homer_simpson"]))
print(w2v_model.wv.most_similar(positive=["marge"]))
print(w2v_model.wv.similarity('maggie', 'baby'))
print(w2v_model.wv.similarity('bart', 'nelson'))
### Visualization
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
def tsnescatterplot(model, word, list_names):
    """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
    its list of most similar words, and a list of words.
    """
    arrays = np.empty((0, 300), dtype='f')
    word_labels = [word]
    color_list = ['red']
    # adds the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
    # gets list of most similar words
    close_words = model.wv.most_similar([word])
    # adds the vector for each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # adds the vector for each of the words from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # Reduces the dimensionality from 300 to 10 dimensions with PCA
    reduc = PCA(n_components=10).fit_transform(arrays)
    # Finds t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)
    # Sets everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    # Basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']})
    # Adds annotations one by one with a loop
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal').set_size(15)
    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)
    plt.title('t-SNE visualization for {}'.format(word.title()))
    plt.show()
tsnescatterplot(w2v_model, 'maggie', [i[0] for i in w2v_model.wv.most_similar(negative=["maggie"])])