#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
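# Use the non-interactive Agg backend so the figure can be rendered and
# saved without a display (e.g. on a remote server)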
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
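# tokens() returns a dict mapping each vocabulary word to its integer index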
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
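# Stack the two halves of the model into one (2*nWords, dimVectors) array:
# rows [0, nWords) are the center ("input") vectors, initialized uniformly
# in [-0.5/dimVectors, 0.5/dimVectors); rows [nWords, 2*nWords) are the
# outside ("output") vectors, initialized to zero.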
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) /
       dimVectors, np.zeros((nWords, dimVectors))),
    axis=0)
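# Minimize the negative-sampling skip-gram loss with SGD. Assuming the
# assignment's sgd(f, x0, step, iterations, postprocessing, useSaved,
# PRINT_EVERY) signature, this uses step size 0.3 for 40000 iterations,
# no postprocessing, and useSaved=True to resume from saved parameters.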
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingLossAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug:
# normalizing during training would lose the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]),
    axis=0)
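# (With axis=0 this reassembles the same (2*nWords, dimVectors) array; the
# visualization below indexes rows < nWords, i.e. the center vectors only.)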
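
# Words to plot: sentiment-laden words plus a few semantic clusters
# (gender, weather, drinks) for a qualitative look at the learned space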
visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "dumb",
    "annoying", "female", "male", "queen", "king", "man", "woman",
    "rain", "snow", "hail", "coffee", "tea"]
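
# PCA via SVD: center the selected vectors, form their covariance matrix,
# and project onto the top two principal directions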
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])
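
# Draw each word as a text label at its projected 2-D coordinate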
for i in range(len(visualizeWords)):
    plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))
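
# Fit the axis limits to the extent of the projected points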
plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

plt.savefig('word_vectors.png')