chongjiu.jin
2019-11-11 14:55:15 +08:00
parent 94d7ba8bd5
commit 90ef82b456
14 changed files with 1470 additions and 0 deletions

Binary file not shown.


@@ -0,0 +1,2 @@
rm -f assignment2.zip
zip -r assignment2.zip *.py *.png saved_params_40000.npy


@@ -0,0 +1,15 @@
#!/bin/bash
DATASETS_DIR="utils/datasets"
mkdir -p $DATASETS_DIR
cd $DATASETS_DIR
# Get Stanford Sentiment Treebank
if hash wget 2>/dev/null; then
wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
else
curl -L http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip -o stanfordSentimentTreebank.zip
fi
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip


@@ -0,0 +1,461 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Why learn PyTorch\n",
"\n",
"TensorFlow has a steep learning curve\n",
"\n",
"PyTorch comes from Facebook\n",
"\n",
"PyTorch can be used much like NumPy\n",
"\n",
"Static graphs vs. dynamic graphs\n"
]
},
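{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sketch (added for illustration) of the dynamic-graph point above: ordinary Python control flow can run inside the computation and autograd still tracks it\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"w = torch.ones(3, requires_grad=True)\n",
"s = w.sum()\n",
"if s > 2:          # plain Python branching on a tensor value\n",
"    out = s * 2\n",
"else:\n",
"    out = s\n",
"out.backward()\n",
"print(w.grad)      # tensor([2., 2., 2.])"
]
},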
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In recent versions you can simply use tensors for everything\n",
"\n",
"Almost everything NumPy can do can also be done in PyTorch\n",
"\n",
"torch.randn() ≈ numpy.random.randn()\n",
"\n",
"torch.max() ≈ np.max()\n",
"\n",
"torch.zeros() ≈ np.zeros()\n",
"\n",
"\n",
"If an operation is missing, Tensors and NumPy arrays can be converted back and forth\n",
"\n",
"tensor.numpy() / torch.from_numpy()\n",
"\n"
]
},
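{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (added for illustration) of the Tensor/NumPy round trip mentioned above:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"\n",
"a = np.arange(5)\n",
"t = torch.from_numpy(a)   # NumPy array -> tensor (shares memory)\n",
"b = t.cpu().numpy()       # tensor -> NumPy array (move to CPU first if it lives on the GPU)\n",
"print(t, b)"
]
},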
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.1.0\n"
]
}
],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"print(torch.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cpu')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"device"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([0., 1., 2., 3., 4., 5., 6.])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# python struct to torch tensor\n",
"temp=[0, 1, 2, 3, 4, 5, 6]\n",
"x = torch.tensor(temp, dtype=torch.float, device=device)\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# move a tensor onto the GPU and back to the CPU (.cuda() requires a CUDA device)\n",
"x = torch.tensor(temp).cuda()\n",
"x = x.cpu()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x=x.long()\n",
"x=x.float()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## numpy reshape squeeze expand_dims"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 1 2 3 4 5 6 7 8 9]\n",
"(10,)\n"
]
}
],
"source": [
"a = np.arange(10)\n",
"print(a)\n",
"print(a.shape)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[[0]\n",
" [1]]\n",
"\n",
" [[2]\n",
" [3]]\n",
"\n",
" [[4]\n",
" [5]]\n",
"\n",
" [[6]\n",
" [7]]\n",
"\n",
" [[8]\n",
" [9]]]\n",
"(5, 2, 1)\n"
]
}
],
"source": [
"#a=a.reshape(1,-1)\n",
"#a.reshape(1,10)\n",
"a=a.reshape(5,2,1)\n",
"print(a)\n",
"print(a.shape)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 1]\n",
" [2 3]\n",
" [4 5]\n",
" [6 7]\n",
" [8 9]]\n",
"(5, 2)\n"
]
}
],
"source": [
"b = np.squeeze(a)\n",
"print(b)\n",
"print(b.shape)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(5, 1, 2)\n"
]
}
],
"source": [
"a = np.arange(10)\n",
"a=a.reshape(5,1,2)\n",
"print(a.shape)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 1]\n",
" [2 3]\n",
" [4 5]\n",
" [6 7]\n",
" [8 9]]\n",
"(5, 2)\n"
]
}
],
"source": [
"b = np.squeeze(a)  # equivalent here to a.reshape(5,2)\n",
"print(b)\n",
"print(b.shape)\n",
"# np.squeeze(a, axis=k) removes axis k only when that axis has size 1"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a (10,)\n",
"b axis=0 (1, 10)\n",
"c axis=1 (10, 1)\n"
]
}
],
"source": [
"#expand_dims\n",
"a = np.arange(10)\n",
"print(\"a\",a.shape)\n",
"b = np.expand_dims(a, axis=0)\n",
"print(\"b axis=0\",b.shape)\n",
"c = np.expand_dims(a, axis=1)\n",
"print(\"c axis=1\",c.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## pytorch reshape squeeze unsqueeze"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n",
"x torch.Size([10])\n",
"b axis=0 torch.Size([1, 10])\n",
"c axis=1 torch.Size([10, 1])\n"
]
}
],
"source": [
"x = torch.arange(0,10)\n",
"print(\"x\",x)\n",
"print(\"x\",x.shape)\n",
"b = x.unsqueeze(0)\n",
"print(\"b axis=0\",b.shape)\n",
"c = torch.unsqueeze(x,1)\n",
"print(\"c axis=1\",c.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"d torch.Size([10])\n",
"d torch.Size([10])\n"
]
}
],
"source": [
"d=c.squeeze(1)\n",
"print(\"d\",d.shape)\n",
"d=c.squeeze()\n",
"print(\"d\",d.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"d torch.Size([10])\n"
]
}
],
"source": [
"d=b.squeeze()\n",
"print(\"d\",d.shape)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([8])\n",
"torch.Size([8])\n",
"tensor(0.4091, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)\n"
]
}
],
"source": [
"import torch.nn.functional as F\n",
"m = nn.Sigmoid()\n",
"\n",
"loss = nn.BCEWithLogitsLoss()\n",
"temp=[0, 1, 2,0, 1, 2,0, 1]\n",
"input =torch.tensor(temp, dtype=torch.float, requires_grad=True)\n",
"#input = torch.randn(3, requires_grad=True)\n",
"target = torch.tensor([0,1,1,0,1,1,0,1], dtype=torch.float)\n",
"lossinput = input\n",
"output = loss(lossinput, target)\n",
"\n",
"\n",
"print(lossinput.shape)\n",
"\n",
"print(target.shape)\n",
"\n",
"print(output)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Logistic regression\n",
"\n",
"$$y(x) = \\frac{1}{1 + \\exp(-\\theta^\\top x)}$$\n",
"\n",
"nn.BCELoss\n",
"$$\n",
" \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad\n",
" l_n = - w_n \\left[ y_n \\cdot \\log x_n + (1 - y_n) \\cdot \\log (1 - x_n) \\right],\n",
" $$ \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch.nn.functional as F  # activation functions\n",
"\n",
"feature_num = 100\n",
"\n",
"class LogistRegression(torch.nn.Module):\n",
"    def __init__(self):\n",
"        super(LogistRegression, self).__init__()\n",
"        self.linear = torch.nn.Linear(feature_num, 1)\n",
"    def forward(self, x):\n",
"        z = self.linear(x)\n",
"        y_pred = torch.sigmoid(z)  # F.sigmoid is deprecated\n",
"        return y_pred\n",
"    def loss(self, y_pred, label):  # binary cross entropy\n",
"        criterion = torch.nn.BCELoss(reduction='mean')  # size_average is deprecated\n",
"        return criterion(y_pred, label)"
]
},
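{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch for the class above (added for illustration, using random features and labels):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative only: random features and random binary labels\n",
"x = torch.randn(8, feature_num)\n",
"y = torch.randint(0, 2, (8, 1)).float()\n",
"model = LogistRegression()\n",
"y_pred = model(x)             # probabilities from the sigmoid output\n",
"print(model.loss(y_pred, y))  # BCE loss on probabilities"
]
},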
{
"cell_type": "markdown",
"metadata": {},
"source": [
"nn.CrossEntropyLoss() already applies log-softmax internally (it combines LogSoftmax and NLLLoss), so you do not need to add a softmax or sigmoid layer yourself"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,105 @@
import random
import re
import torch
import torch.optim as optim
from tqdm import tqdm
from pytorch_word2vec_model import SkipGram
epochs = 50
negative_sampling = 4
window = 2
vocab_size = 1
embd_size = 300
def batch_data(x, batch_size=128):
in_w = []
out_w = []
target = []
for text in x:
for i in range(window, len(text) - window):
word_set = set(text[i - window:i + window + 1])  # true context words, excluded from negative sampling
in_w.append(text[i])
in_w.append(text[i])
in_w.append(text[i])
in_w.append(text[i])
out_w.append(text[i - 2])
out_w.append(text[i - 1])
out_w.append(text[i + 1])
out_w.append(text[i + 2])
target.append(1)
target.append(1)
target.append(1)
target.append(1)
# negative sampling
count = 0
while count < negative_sampling:
rand_id = random.randint(0, vocab_size-1)
if not rand_id in word_set:
in_w.append(text[i])
out_w.append(rand_id)
target.append(0)
count += 1
if len(out_w) >= batch_size:
yield [in_w, out_w, target]
in_w = []
out_w = []
target = []
if out_w:
yield [in_w, out_w, target]
def train(train_text_id, model,opt):
model.train() # switch to training mode (enables dropout and batch normalization)
ave_loss = 0
pbar = tqdm()
cnt=0
for x_batch in batch_data(train_text_id):
in_w, out_w, target = x_batch
in_w_var = torch.tensor(in_w)
out_w_var = torch.tensor(out_w)
target_var = torch.tensor(target,dtype=torch.float)
model.zero_grad()
log_probs = model(in_w_var, out_w_var)
loss = model.loss(log_probs, target_var)
loss.backward()
opt.step()
ave_loss += loss.item()
pbar.update(1)
cnt += 1
pbar.set_description('< loss: %.5f >' % (ave_loss / cnt))
pbar.close()
text_id = []
vocab_dict = {}
with open(
'D:\\project\\ml\\github\\cs224n-natural-language-processing-winter2019\\a1_intro_word_vectors\\a1\\corpus\\corpus.txt',
encoding='utf-8') as fp:
for line in fp:
lines = re.sub("[^A-Za-z0-9']+", ' ', line).lower().split()
line_id = []
for s in lines:
if not s:
continue
if s not in vocab_dict:
vocab_dict[s] = len(vocab_dict)
id = vocab_dict[s]
line_id.append(id)
if id==11500:
print(id,s)
text_id.append(line_id)
vocab_size = len(vocab_dict)
print('vocab_size', vocab_size)
model = SkipGram(vocab_size, embd_size)
# build the optimizer once so Adam's moment estimates persist across epochs
opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                       lr=0.001, weight_decay=0)
for epoch in range(epochs):
    print('epoch', epoch)
    train(text_id, model, opt)


@@ -0,0 +1,40 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class SkipGram(nn.Module):
def __init__(self, vocab_size, embd_size):
super(SkipGram, self).__init__()
self.embeddings = nn.Embedding(vocab_size, embd_size)
def forward(self, focus, context):
embed_focus = self.embeddings(focus)
embed_ctx = self.embeddings(context)
# score = torch.mm(embed_focus, torch.t(embed_ctx))
score = torch.mul(embed_focus, embed_ctx).sum(dim=1)
log_probs = score  # raw logits; BCEWithLogitsLoss in loss() applies the sigmoid
return log_probs
def loss(self, log_probs, target):
loss_fn = nn.BCEWithLogitsLoss()
# loss_fn = nn.NLLLoss()
loss = loss_fn(log_probs, target)
return loss
class CBOW(nn.Module):
def __init__(self, vocab_size, embd_size, context_size, hidden_size):
super(CBOW, self).__init__()
self.embeddings = nn.Embedding(vocab_size, embd_size)
self.linear1 = nn.Linear(2 * context_size * embd_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, vocab_size)
def forward(self, inputs):
embedded = self.embeddings(inputs).view((1, -1))
hid = F.relu(self.linear1(embedded))
out = self.linear2(hid)
log_probs = F.log_softmax(out, dim=1)
return log_probs
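# Minimal usage sketch (illustrative only, not part of the training pipeline).
# SkipGram.forward returns raw dot-product scores, so loss() feeds them to
# BCEWithLogitsLoss, which applies the sigmoid internally.
if __name__ == "__main__":
    model = SkipGram(vocab_size=10, embd_size=8)
    focus = torch.tensor([1, 2, 3])         # center word ids
    context = torch.tensor([4, 5, 6])       # context / negative-sample ids
    target = torch.tensor([1.0, 0.0, 1.0])  # 1 = real context pair, 0 = negative sample
    scores = model(focus, context)
    print(model.loss(scores, target))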


@@ -0,0 +1,75 @@
#!/usr/bin/env python
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
from word2vec import *
from sgd import *
# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5
# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10
# Context size
C = 5
# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime=time.time()
wordVectors = np.concatenate(
((np.random.rand(nWords, dimVectors) - 0.5) /
dimVectors, np.zeros((nWords, dimVectors))),
axis=0)
wordVectors = sgd(
lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
negSamplingLossAndGradient),
wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.
print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))
# concatenate the input and output word vectors
wordVectors = np.concatenate(
(wordVectors[:nWords,:], wordVectors[nWords:,:]),
axis=0)
visualizeWords = [
"great", "cool", "brilliant", "wonderful", "well", "amazing",
"worth", "sweet", "enjoyable", "boring", "bad", "dumb",
"annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
"hail", "coffee", "tea"]
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])
for i in range(len(visualizeWords)):
plt.text(coord[i,0], coord[i,1], visualizeWords[i],
bbox=dict(facecolor='green', alpha=0.1))
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
plt.savefig('word_vectors.png')


@@ -0,0 +1,133 @@
#!/usr/bin/env python
# Save parameters every few SGD iterations as a fail-safe
SAVE_PARAMS_EVERY = 5000
import pickle
import glob
import random
import numpy as np
import os.path as op
def load_saved_params():
"""
A helper function that loads previously saved parameters and resets
iteration start.
"""
st = 0
for f in glob.glob("saved_params_*.npy"):
iter = int(op.splitext(op.basename(f))[0].split("_")[2])
if (iter > st):
st = iter
if st > 0:
params_file = "saved_params_%d.npy" % st
state_file = "saved_state_%d.pickle" % st
params = np.load(params_file)
with open(state_file, "rb") as f:
state = pickle.load(f)
return st, params, state
else:
return st, None, None
def save_params(iter, params):
params_file = "saved_params_%d.npy" % iter
np.save(params_file, params)
with open("saved_state_%d.pickle" % iter, "wb") as f:
pickle.dump(random.getstate(), f)
def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
PRINT_EVERY=10):
""" Stochastic Gradient Descent
Implement the stochastic gradient descent method in this function.
Arguments:
f -- the function to optimize, it should take a single
argument and yield two outputs, a loss and the gradient
with respect to the arguments
x0 -- the initial point to start SGD from
step -- the step size for SGD
iterations -- total iterations to run SGD for
postprocessing -- postprocessing function for the parameters
if necessary. In the case of word2vec we will need to
normalize the word vectors to have unit length.
PRINT_EVERY -- specifies how many iterations to output loss
Return:
x -- the parameter value after SGD finishes
"""
# Anneal learning rate every several iterations
ANNEAL_EVERY = 20000
if useSaved:
start_iter, oldx, state = load_saved_params()
if start_iter > 0:
x0 = oldx
step *= 0.5 ** (start_iter / ANNEAL_EVERY)
if state:
random.setstate(state)
else:
start_iter = 0
x = x0
if not postprocessing:
postprocessing = lambda x: x
exploss = None
for iter in range(start_iter + 1, iterations + 1):
# You might want to print the progress every few iterations.
loss = None
### YOUR CODE HERE
loss,gd = f(x)
x = x - step*gd
x = postprocessing(x)
### END YOUR CODE
x = postprocessing(x)
if iter % PRINT_EVERY == 0:
if not exploss:
exploss = loss
else:
exploss = .95 * exploss + .05 * loss
print("iter %d: %f" % (iter, exploss))
if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
save_params(iter, x)
if iter % ANNEAL_EVERY == 0:
step *= 0.5
return x
def sanity_check():
quad = lambda x: (np.sum(x ** 2), x * 2)
print("Running sanity checks...")
t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
print("test 1 result:", t1)
assert abs(t1) <= 1e-6
t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
print("test 2 result:", t2)
assert abs(t2) <= 1e-6
t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
print("test 3 result:", t3)
assert abs(t3) <= 1e-6
print("-" * 40)
print("ALL TESTS PASSED")
print("-" * 40)
if __name__ == "__main__":
sanity_check()

Binary file not shown.


@@ -0,0 +1,47 @@
#!/usr/bin/env python
import numpy as np
import random
# First implement a gradient checker by filling in the following functions
def gradcheck_naive(f, x, gradientText):
""" Gradient check for a function f.
Arguments:
f -- a function that takes a single argument and outputs the
loss and its gradients
x -- the point (numpy array) to check the gradient at
gradientText -- a string detailing some context about the gradient computation
"""
rndstate = random.getstate()
random.setstate(rndstate)
fx, grad = f(x) # Evaluate function value at original point
h = 1e-4 # Do not change this!
# Iterate over all indexes ix in x to check the gradient.
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
while not it.finished:
ix = it.multi_index
x[ix] += h # increment by h
random.setstate(rndstate)
fxh, _ = f(x) # evaluate f(x + h)
x[ix] -= 2 * h # restore to previous value (very important!)
random.setstate(rndstate)
fxnh, _ = f(x)
x[ix] += h
numgrad = (fxh - fxnh) / 2 / h
# Compare gradients
reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
if reldiff > 1e-5:
print("Gradient check failed for %s." % gradientText)
print("First gradient error found at index %s in the vector of gradients" % str(ix))
print("Your gradient: %f \t Numerical gradient: %f" % (
grad[ix], numgrad))
return
it.iternext() # Step to next dimension
print("Gradient check passed!")
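# Minimal self-check sketch (illustrative): f(x) = sum(x**2) has gradient 2*x,
# so the analytic and numerical gradients should agree.
if __name__ == "__main__":
    quad = lambda x: (np.sum(x ** 2), x * 2)
    gradcheck_naive(quad, np.random.randn(4, 5), "quadratic sanity check")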


@@ -0,0 +1,248 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pickle
import numpy as np
import os
import random
class StanfordSentiment:
def __init__(self, path=None, tablesize = 1000000):
if not path:
path = "utils/datasets/stanfordSentimentTreebank"
self.path = path
self.tablesize = tablesize
def tokens(self):
if hasattr(self, "_tokens") and self._tokens:
return self._tokens
tokens = dict()
tokenfreq = dict()
wordcount = 0
revtokens = []
idx = 0
for sentence in self.sentences():
for w in sentence:
wordcount += 1
if not w in tokens:
tokens[w] = idx
revtokens += [w]
tokenfreq[w] = 1
idx += 1
else:
tokenfreq[w] += 1
tokens["UNK"] = idx
revtokens += ["UNK"]
tokenfreq["UNK"] = 1
wordcount += 1
self._tokens = tokens
self._tokenfreq = tokenfreq
self._wordcount = wordcount
self._revtokens = revtokens
return self._tokens
def sentences(self):
if hasattr(self, "_sentences") and self._sentences:
return self._sentences
sentences = []
with open(self.path + "/datasetSentences.txt", "r") as f:
first = True
for line in f:
if first:
first = False
continue
splitted = line.strip().split()[1:]
# Deal with some peculiar encoding issues with this file
sentences += [[w.lower() for w in splitted]]
self._sentences = sentences
self._sentlengths = np.array([len(s) for s in sentences])
self._cumsentlen = np.cumsum(self._sentlengths)
return self._sentences
def numSentences(self):
if hasattr(self, "_numSentences") and self._numSentences:
return self._numSentences
else:
self._numSentences = len(self.sentences())
return self._numSentences
def allSentences(self):
if hasattr(self, "_allsentences") and self._allsentences:
return self._allsentences
sentences = self.sentences()
rejectProb = self.rejectProb()
tokens = self.tokens()
allsentences = [[w for w in s
if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
for s in sentences * 30]
allsentences = [s for s in allsentences if len(s) > 1]
self._allsentences = allsentences
return self._allsentences
def getRandomContext(self, C=5):
allsent = self.allSentences()
sentID = random.randint(0, len(allsent) - 1)
sent = allsent[sentID]
wordID = random.randint(0, len(sent) - 1)
context = sent[max(0, wordID - C):wordID]
if wordID+1 < len(sent):
context += sent[wordID+1:min(len(sent), wordID + C + 1)]
centerword = sent[wordID]
context = [w for w in context if w != centerword]
if len(context) > 0:
return centerword, context
else:
return self.getRandomContext(C)
def sent_labels(self):
if hasattr(self, "_sent_labels") and self._sent_labels:
return self._sent_labels
dictionary = dict()
phrases = 0
with open(self.path + "/dictionary.txt", "r") as f:
for line in f:
line = line.strip()
if not line: continue
splitted = line.split("|")
dictionary[splitted[0].lower()] = int(splitted[1])
phrases += 1
labels = [0.0] * phrases
with open(self.path + "/sentiment_labels.txt", "r") as f:
first = True
for line in f:
if first:
first = False
continue
line = line.strip()
if not line: continue
splitted = line.split("|")
labels[int(splitted[0])] = float(splitted[1])
sent_labels = [0.0] * self.numSentences()
sentences = self.sentences()
for i in range(self.numSentences()):
sentence = sentences[i]
full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
sent_labels[i] = labels[dictionary[full_sent]]
self._sent_labels = sent_labels
return self._sent_labels
def dataset_split(self):
if hasattr(self, "_split") and self._split:
return self._split
split = [[] for i in range(3)]
with open(self.path + "/datasetSplit.txt", "r") as f:
first = True
for line in f:
if first:
first = False
continue
splitted = line.strip().split(",")
split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]
self._split = split
return self._split
def getRandomTrainSentence(self):
split = self.dataset_split()
sentId = split[0][random.randint(0, len(split[0]) - 1)]
return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])
def categorify(self, label):
if label <= 0.2:
return 0
elif label <= 0.4:
return 1
elif label <= 0.6:
return 2
elif label <= 0.8:
return 3
else:
return 4
def getDevSentences(self):
return self.getSplitSentences(2)
def getTestSentences(self):
return self.getSplitSentences(1)
def getTrainSentences(self):
return self.getSplitSentences(0)
def getSplitSentences(self, split=0):
ds_split = self.dataset_split()
return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]
def sampleTable(self):
if hasattr(self, '_sampleTable') and self._sampleTable is not None:
return self._sampleTable
nTokens = len(self.tokens())
samplingFreq = np.zeros((nTokens,))
self.allSentences()
i = 0
for w in range(nTokens):
w = self._revtokens[i]
if w in self._tokenfreq:
freq = 1.0 * self._tokenfreq[w]
# Reweigh
freq = freq ** 0.75
else:
freq = 0.0
samplingFreq[i] = freq
i += 1
samplingFreq /= np.sum(samplingFreq)
samplingFreq = np.cumsum(samplingFreq) * self.tablesize
self._sampleTable = [0] * self.tablesize
j = 0
for i in range(self.tablesize):
while i > samplingFreq[j]:
j += 1
self._sampleTable[i] = j
return self._sampleTable
def rejectProb(self):
if hasattr(self, '_rejectProb') and self._rejectProb is not None:
return self._rejectProb
threshold = 1e-5 * self._wordcount
nTokens = len(self.tokens())
rejectProb = np.zeros((nTokens,))
for i in range(nTokens):
w = self._revtokens[i]
freq = 1.0 * self._tokenfreq[w]
# Reweigh
rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))
self._rejectProb = rejectProb
return self._rejectProb
def sampleTokenIdx(self):
return self.sampleTable()[random.randint(0, self.tablesize - 1)]


@@ -0,0 +1,43 @@
#!/usr/bin/env python
import numpy as np
def normalizeRows(x):
""" Row normalization function
Implement a function that normalizes each row of a matrix to have
unit length.
"""
N = x.shape[0]
x /= np.sqrt(np.sum(x**2, axis=1)).reshape((N,1)) + 1e-30
return x
def softmax(x):
"""Compute the softmax function for each row of the input x.
It is crucial that this function is optimized for speed because
it will be used frequently in later code.
Arguments:
x -- A D dimensional vector or N x D dimensional numpy matrix.
Return:
x -- You are allowed to modify x in-place
"""
orig_shape = x.shape
if len(x.shape) > 1:
# Matrix
tmp = np.max(x, axis=1)
x -= tmp.reshape((x.shape[0], 1))
x = np.exp(x)
tmp = np.sum(x, axis=1)
x /= tmp.reshape((x.shape[0], 1))
else:
# Vector
tmp = np.max(x)
x -= tmp
x = np.exp(x)
tmp = np.sum(x)
x /= tmp
assert x.shape == orig_shape
return x
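# Minimal sanity-check sketch (illustrative).
if __name__ == "__main__":
    # softmax is invariant to shifting a row by a constant
    print(softmax(np.array([1, 2])))                   # ~ [0.269, 0.731]
    print(softmax(np.array([[1001, 1002], [3, 4]])))   # every row ~ [0.269, 0.731]
    # after normalizeRows, each row has unit length
    print(normalizeRows(np.array([[3.0, 4.0], [1.0, 2.0]])))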


@@ -0,0 +1,301 @@
#!/usr/bin/env python
import numpy as np
import random
from utils.gradcheck import gradcheck_naive
from utils.utils import normalizeRows, softmax
def sigmoid(x):
"""
Compute the sigmoid function for the input here.
Arguments:
x -- A scalar or numpy array.
Return:
s -- sigmoid(x)
"""
### YOUR CODE HERE
s = 1/(1+np.exp(-x))
### END YOUR CODE
return s
def naiveSoftmaxLossAndGradient(
centerWordVec,
outsideWordIdx,
outsideVectors,
dataset
):
""" Naive Softmax loss & gradient function for word2vec models
Implement the naive softmax loss and gradients between a center word's
embedding and an outside word's embedding. This will be the building block
for our word2vec models.
Arguments:
centerWordVec -- numpy ndarray, center word's embedding
(v_c in the pdf handout)
outsideWordIdx -- integer, the index of the outside word
(o of u_o in the pdf handout)
outsideVectors -- outside vectors (rows of matrix) for all words in vocab
(U in the pdf handout)
dataset -- needed for negative sampling, unused here.
Return:
loss -- naive softmax loss
gradCenterVec -- the gradient with respect to the center word vector
(dJ / dv_c in the pdf handout)
gradOutsideVecs -- the gradient with respect to all the outside word vectors
(dJ / dU)
"""
### YOUR CODE HERE
score = np.dot(outsideVectors,centerWordVec)
y_h = softmax(score)
loss = -np.log(y_h[outsideWordIdx])
y = np.eye(y_h.shape[0])[outsideWordIdx]
diff = (y_h-y).reshape((y.shape[0],1))
gradCenterVec = np.dot(diff.T,outsideVectors)
centerWordVec = centerWordVec.reshape((1,centerWordVec.shape[0]))
gradOutsideVecs = np.dot(diff,centerWordVec)
### END YOUR CODE
return loss, gradCenterVec, gradOutsideVecs
def getNegativeSamples(outsideWordIdx, dataset, K):
""" Samples K indexes which are not the outsideWordIdx """
negSampleWordIndices = [None] * K
for k in range(K):
newidx = dataset.sampleTokenIdx()
while newidx == outsideWordIdx:
newidx = dataset.sampleTokenIdx()
negSampleWordIndices[k] = newidx
return negSampleWordIndices
def negSamplingLossAndGradient(
centerWordVec,
outsideWordIdx,
outsideVectors,
dataset,
K=10
):
""" Negative sampling loss function for word2vec models
Implement the negative sampling loss and gradients for a centerWordVec
and a outsideWordIdx word vector as a building block for word2vec
models. K is the number of negative samples to take.
Note: The same word may be negatively sampled multiple times. For
example if an outside word is sampled twice, you shall have to
double count the gradient with respect to this word. Thrice if
it was sampled three times, and so forth.
Arguments/Return Specifications: same as naiveSoftmaxLossAndGradient
"""
# Negative sampling of words is done for you. Do not modify this if you
# wish to match the autograder and receive points!
negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)
indices = [outsideWordIdx] + negSampleWordIndices
### YOUR CODE HERE
score = np.dot(outsideVectors[outsideWordIdx],centerWordVec)
sig_1 = sigmoid(score)
sum_neg = 0.0
#Find unique negative samples and the number of times they are present in our sample window
unique_k, counts_k = np.unique(indices[1:], return_counts=True)
k_stack = outsideVectors[unique_k]
score_neg = -np.dot(k_stack,centerWordVec)
sig_neg = sigmoid(score_neg)
sum_neg = np.sum(counts_k*np.log(sig_neg),axis=0)
#J_neg_sam Loss
loss = -np.log(sig_1) - sum_neg
#Calculate gradients
k_term = 0.0
#delta term from previous layer for efficient implementation
delta_1msig = 1-sig_1
delta_1msig_neg = 1-sig_neg
gradOutsideVecs = np.zeros((outsideVectors.shape))
gradOutsideVecs[outsideWordIdx,:] = -delta_1msig*centerWordVec
common_term = np.dot(delta_1msig_neg.reshape(unique_k.shape[0],1),centerWordVec.reshape(1,centerWordVec.shape[0]))
gradOutsideVecs[unique_k,:] += counts_k.reshape(counts_k.shape[0],1)*common_term
#Reshape prep for center gradient calculation
counts_k = counts_k.reshape(counts_k.shape[0],1)
delta_1msig_neg = delta_1msig_neg.reshape(delta_1msig_neg.shape[0],1)
k_term = np.sum(np.dot((delta_1msig_neg.reshape(1,counts_k.shape[0])),counts_k*k_stack),axis=0)
gradCenterVec = -delta_1msig*outsideVectors[outsideWordIdx] + k_term
### END YOUR CODE
return loss, gradCenterVec, gradOutsideVecs
def skipgram(currentCenterWord, windowSize, outsideWords, word2Ind,
centerWordVectors, outsideVectors, dataset,
word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
""" Skip-gram model in word2vec
Implement the skip-gram model in this function.
Arguments:
currentCenterWord -- a string of the current center word
windowSize -- integer, context window size
outsideWords -- list of no more than 2*windowSize strings, the outside words
word2Ind -- a dictionary that maps words to their indices in
the word vector list
centerWordVectors -- center word vectors (as rows) for all words in vocab
(V in pdf handout)
outsideVectors -- outside word vectors (as rows) for all words in vocab
(U in pdf handout)
word2vecLossAndGradient -- the loss and gradient function for
a prediction vector given the outsideWordIdx
word vectors, could be one of the two
loss functions you implemented above.
Return:
loss -- the loss function value for the skip-gram model
(J in the pdf handout)
gradCenterVecs -- the gradient with respect to the center word vectors
(dJ / dV in the pdf handout)
gradOutsideVectors -- the gradient with respect to the outside word vectors
(dJ / dU in the pdf handout)
"""
loss = 0.0
gradCenterVecs = np.zeros(centerWordVectors.shape)
gradOutsideVectors = np.zeros(outsideVectors.shape)
### YOUR CODE HERE
for m in range(0,len(outsideWords)):
l,gradCenter,gradOutside= word2vecLossAndGradient(centerWordVectors[word2Ind[currentCenterWord]],word2Ind[outsideWords[m]],outsideVectors,dataset)
loss+=l
gradCenterVecs[word2Ind[currentCenterWord]] += gradCenter.reshape((centerWordVectors.shape[1],))
gradOutsideVectors += gradOutside
### END YOUR CODE
return loss, gradCenterVecs, gradOutsideVectors
#############################################
# Testing functions below. DO NOT MODIFY! #
#############################################
def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset,
windowSize,
word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
batchsize = 50
loss = 0.0
grad = np.zeros(wordVectors.shape)
N = wordVectors.shape[0]
centerWordVectors = wordVectors[:int(N/2),:]
outsideVectors = wordVectors[int(N/2):,:]
for i in range(batchsize):
windowSize1 = random.randint(1, windowSize)
centerWord, context = dataset.getRandomContext(windowSize1)
c, gin, gout = word2vecModel(
centerWord, windowSize1, context, word2Ind, centerWordVectors,
outsideVectors, dataset, word2vecLossAndGradient
)
loss += c / batchsize
grad[:int(N/2), :] += gin / batchsize
grad[int(N/2):, :] += gout / batchsize
return loss, grad
def test_word2vec():
""" Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
dataset = type('dummy', (), {})()
def dummySampleTokenIdx():
return random.randint(0, 4)
def getRandomContext(C):
tokens = ["a", "b", "c", "d", "e"]
return tokens[random.randint(0,4)], \
[tokens[random.randint(0,4)] for i in range(2*C)]
dataset.sampleTokenIdx = dummySampleTokenIdx
dataset.getRandomContext = getRandomContext
random.seed(31415)
np.random.seed(9265)
dummy_vectors = normalizeRows(np.random.randn(10,3))
dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)])
print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")
print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
skipgram, dummy_tokens, vec, dataset, 5, negSamplingLossAndGradient),
dummy_vectors, "negSamplingLossAndGradient Gradient")
print("\n=== Results ===")
print ("Skip-Gram with naiveSoftmaxLossAndGradient")
print ("Your Result:")
print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
*skipgram("c", 3, ["a", "b", "e", "d", "b", "c"],
dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)
)
)
print ("Expected Result: Value should approximate these:")
print("""Loss: 11.16610900153398
Gradient wrt Center Vectors (dJ/dV):
[[ 0. 0. 0. ]
[ 0. 0. 0. ]
[-1.26947339 -1.36873189 2.45158957]
[ 0. 0. 0. ]
[ 0. 0. 0. ]]
Gradient wrt Outside Vectors (dJ/dU):
[[-0.41045956 0.18834851 1.43272264]
[ 0.38202831 -0.17530219 -1.33348241]
[ 0.07009355 -0.03216399 -0.24466386]
[ 0.09472154 -0.04346509 -0.33062865]
[-0.13638384 0.06258276 0.47605228]]
""")
print ("Skip-Gram with negSamplingLossAndGradient")
print ("Your Result:")
print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\n Gradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
*skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:],
dummy_vectors[5:,:], dataset, negSamplingLossAndGradient)
)
)
print ("Expected Result: Value should approximate these:")
print("""Loss: 16.15119285363322
Gradient wrt Center Vectors (dJ/dV):
[[ 0. 0. 0. ]
[ 0. 0. 0. ]
[-4.54650789 -1.85942252 0.76397441]
[ 0. 0. 0. ]
[ 0. 0. 0. ]]
Gradient wrt Outside Vectors (dJ/dU):
[[-0.69148188 0.31730185 2.41364029]
[-0.22716495 0.10423969 0.79292674]
[-0.45528438 0.20891737 1.58918512]
[-0.31602611 0.14501561 1.10309954]
[-0.80620296 0.36994417 2.81407799]]
""")
if __name__ == "__main__":
test_word2vec()