a2
BIN
[finished]Assignment_2_word2vec/a2 written.pdf
Normal file
Binary file not shown.
2
[finished]Assignment_2_word2vec/collect_submission.sh
Normal file
@@ -0,0 +1,2 @@
rm -f assignment2.zip
zip -r assignment2.zip *.py *.png saved_params_40000.npy
15
[finished]Assignment_2_word2vec/get_datasets.sh
Normal file
@@ -0,0 +1,15 @@
#!/bin/bash

DATASETS_DIR="utils/datasets"
mkdir -p $DATASETS_DIR

cd $DATASETS_DIR

# Get Stanford Sentiment Treebank
if hash wget 2>/dev/null; then
  wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
else
  curl -L http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip -o stanfordSentimentTreebank.zip
fi
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip
461
[finished]Assignment_2_word2vec/others/pytorch review1.ipynb
Normal file
@@ -0,0 +1,461 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 为什么要学习pytorch\n",
|
||||
"\n",
|
||||
"tensorflow的学习曲线陡峭\n",
|
||||
"\n",
|
||||
"pytorch出自facebook\n",
|
||||
"\n",
|
||||
"PyTorch 可以当做 NumPy 用\n",
|
||||
"\n",
|
||||
"静态图 vs 动态图\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"最新版本全部使用tensor就可以\n",
|
||||
"\n",
|
||||
"numpy能实现的东西,在pytorch基本可以实现\n",
|
||||
"\n",
|
||||
"torch.randn()=numpy.random.randn()\n",
|
||||
"\n",
|
||||
"torch.max()=np.max()\n",
|
||||
"\n",
|
||||
"torch.zeros()=np.zeros()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"如果不能实现,Tensor与numpy之间的可以相互转化\n",
|
||||
"\n",
|
||||
"tensor.numpy()\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1.1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"print(torch.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"device(type='cpu')"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
||||
"device"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([0., 1., 2., 3., 4., 5., 6.])"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# python struct to torch tensor\n",
|
||||
"temp=[0, 1, 2, 3, 4, 5, 6]\n",
|
||||
"x = torch.tensor(temp, dtype=torch.float, device=device)\n",
|
||||
"x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#gpu/cpu\n",
|
||||
"x = torch.tensor(temp).cuda()\n",
|
||||
"x = x.cpu()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x=x.long()\n",
|
||||
"x=x.float()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## numpy reshape squeeze expand_dims"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[0 1 2 3 4 5 6 7 8 9]\n",
|
||||
"(10,)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = np.arange(10)\n",
|
||||
"print(a)\n",
|
||||
"print(a.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[[0]\n",
|
||||
" [1]]\n",
|
||||
"\n",
|
||||
" [[2]\n",
|
||||
" [3]]\n",
|
||||
"\n",
|
||||
" [[4]\n",
|
||||
" [5]]\n",
|
||||
"\n",
|
||||
" [[6]\n",
|
||||
" [7]]\n",
|
||||
"\n",
|
||||
" [[8]\n",
|
||||
" [9]]]\n",
|
||||
"(5, 2, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#a=a.reshape(1,-1)\n",
|
||||
"#a.reshape(1,10)\n",
|
||||
"a=a.reshape(5,2,1)\n",
|
||||
"print(a)\n",
|
||||
"print(a.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[0 1]\n",
|
||||
" [2 3]\n",
|
||||
" [4 5]\n",
|
||||
" [6 7]\n",
|
||||
" [8 9]]\n",
|
||||
"(5, 2)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"b = np.squeeze(a)\n",
|
||||
"print(b)\n",
|
||||
"print(b.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(5, 1, 2)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = np.arange(10)\n",
|
||||
"a=a.reshape(5,1,2)\n",
|
||||
"print(a.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[0 1]\n",
|
||||
" [2 3]\n",
|
||||
" [4 5]\n",
|
||||
" [6 7]\n",
|
||||
" [8 9]]\n",
|
||||
"(5, 2)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"b = np.squeeze(a)#b=a.reshape(5,2)\n",
|
||||
"print(b)\n",
|
||||
"print(b.shape)\n",
|
||||
"#np.squeeze(e,axis = 0,1,2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"a (10,)\n",
|
||||
"b axis=0 (1, 10)\n",
|
||||
"c axis=1 (10, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#expand_dims\n",
|
||||
"a = np.arange(10)\n",
|
||||
"print(\"a\",a.shape)\n",
|
||||
"b = np.expand_dims(a, axis=0)\n",
|
||||
"print(\"b axis=0\",b.shape)\n",
|
||||
"c = np.expand_dims(a, axis=1)\n",
|
||||
"print(\"c axis=1\",c.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## pytorch reshape squeeze unsqueeze"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"x tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n",
|
||||
"x torch.Size([10])\n",
|
||||
"b axis=0 torch.Size([1, 10])\n",
|
||||
"c axis=1 torch.Size([10, 1])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x = torch.arange(0,10)\n",
|
||||
"print(\"x\",x)\n",
|
||||
"print(\"x\",x.shape)\n",
|
||||
"b = x.unsqueeze(0)\n",
|
||||
"print(\"b axis=0\",b.shape)\n",
|
||||
"c = torch.unsqueeze(x,1)\n",
|
||||
"print(\"c axis=1\",c.shape)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"d torch.Size([10])\n",
|
||||
"d torch.Size([10])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"d=c.squeeze(1)\n",
|
||||
"print(\"d\",d.shape)\n",
|
||||
"d=c.squeeze()\n",
|
||||
"print(\"d\",d.shape)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"d torch.Size([10])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"d=b.squeeze()\n",
|
||||
"print(\"d\",d.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([8])\n",
|
||||
"torch.Size([8])\n",
|
||||
"tensor(0.4091, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch.nn.functional as F\n",
|
||||
"m = nn.Sigmoid()\n",
|
||||
"\n",
|
||||
"loss = nn.BCEWithLogitsLoss()\n",
|
||||
"temp=[0, 1, 2,0, 1, 2,0, 1]\n",
|
||||
"input =torch.tensor(temp, dtype=torch.float, requires_grad=True)\n",
|
||||
"#input = torch.randn(3, requires_grad=True)\n",
|
||||
"target = torch.tensor([0,1,1,0,1,1,0,1], dtype=torch.float)\n",
|
||||
"lossinput = input\n",
|
||||
"output = loss(lossinput, target)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(lossinput.shape)\n",
|
||||
"\n",
|
||||
"print(target.shape)\n",
|
||||
"\n",
|
||||
"print(output)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## logist regression\n",
|
||||
"\n",
|
||||
"$$y(z) = \\frac{1}{1 + exp(-\\theta^T z)}$$\n",
|
||||
"\n",
|
||||
"nn.BCELoss\n",
|
||||
"$$\n",
|
||||
" \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad\n",
|
||||
" l_n = - w_n \\left[ y_n \\cdot \\log x_n + (1 - y_n) \\cdot \\log (1 - x_n) \\right],\n",
|
||||
" $$ \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch.nn.functional as F # 激励函数库\n",
|
||||
"\n",
|
||||
"feature_num=100\n",
|
||||
"\n",
|
||||
"class LogistRegression(torch.nn.Module):\n",
|
||||
" def __init__(self):\n",
|
||||
" super(LogistRegression, self).__init__()\n",
|
||||
" self.linear = torch.nn.Linear(feature_num, 1)\n",
|
||||
" def forward(self, x):\n",
|
||||
" z=self.linear(x)\n",
|
||||
" y_pred = F.sigmoid(z)\n",
|
||||
" return y_pred\n",
|
||||
" def loss(y_pred,label):#Binary Cross Entropy\n",
|
||||
" criterion = torch.nn.BCELoss(size_average=True)\n",
|
||||
" return criterion(y_pred,label)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"nn.CrossEntropyLoss() 自带sigmod,不需要加"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
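The notebook above asserts that squeeze/unsqueeze mirror their NumPy counterparts and that BCEWithLogitsLoss folds the sigmoid into the loss. A minimal standalone sketch checking both claims; it only assumes NumPy and PyTorch are installed and is not part of the committed notebook:

import numpy as np
import torch
import torch.nn as nn

# squeeze / unsqueeze mirror np.squeeze / np.expand_dims
a = np.arange(10)
x = torch.from_numpy(a)                       # shares memory with the NumPy array
assert np.expand_dims(a, axis=0).shape == tuple(x.unsqueeze(0).shape) == (1, 10)
assert np.squeeze(np.expand_dims(a, axis=1)).shape == tuple(x.unsqueeze(1).squeeze().shape)

# BCEWithLogitsLoss(logits) equals BCELoss applied to sigmoid(logits)
logits = torch.tensor([0., 1., 2., 0., 1., 2., 0., 1.], requires_grad=True)
target = torch.tensor([0., 1., 1., 0., 1., 1., 0., 1.])
with_logits = nn.BCEWithLogitsLoss()(logits, target)
plain_bce = nn.BCELoss()(torch.sigmoid(logits), target)
assert torch.allclose(with_logits, plain_bce)
print(with_logits.item())                     # ~0.4091, matching the notebook output above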
105
[finished]Assignment_2_word2vec/others/pytorch_train.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import random
|
||||
import re
|
||||
|
||||
import torch
|
||||
import torch.optim as optim
|
||||
from tqdm import tqdm
|
||||
from pytorch_word2vec_model import SkipGram
|
||||
|
||||
epochs = 50
|
||||
negative_sampling = 4
|
||||
window = 2
|
||||
vocab_size = 1
|
||||
embd_size = 300
|
||||
|
||||
|
||||
def batch_data(x, batch_size=128):
|
||||
in_w = []
|
||||
out_w = []
|
||||
target = []
|
||||
for text in x:
|
||||
for i in range(window, len(text) - window):
|
||||
# ids seen in this window, so negative sampling can avoid them
word_set = {text[i], text[i - 2], text[i - 1], text[i + 1], text[i + 2]}
|
||||
in_w.append(text[i])
|
||||
in_w.append(text[i])
|
||||
in_w.append(text[i])
|
||||
in_w.append(text[i])
|
||||
|
||||
out_w.append(text[i - 2])
|
||||
out_w.append(text[i - 1])
|
||||
out_w.append(text[i + 1])
|
||||
out_w.append(text[i + 2])
|
||||
|
||||
target.append(1)
|
||||
target.append(1)
|
||||
target.append(1)
|
||||
target.append(1)
|
||||
# negative sampling
|
||||
count = 0
|
||||
while count < negative_sampling:
|
||||
rand_id = random.randint(0, vocab_size-1)
|
||||
if not rand_id in word_set:
|
||||
in_w.append(text[i])
|
||||
out_w.append(rand_id)
|
||||
target.append(0)
|
||||
count += 1
|
||||
|
||||
if len(out_w) >= batch_size:
|
||||
yield [in_w, out_w, target]
|
||||
in_w = []
|
||||
out_w = []
|
||||
target = []
|
||||
if out_w:
|
||||
yield [in_w, out_w, target]
|
||||
|
||||
|
||||
def train(train_text_id, model,opt):
|
||||
model.train()  # enable dropout and batch normalization (training mode)
|
||||
ave_loss = 0
|
||||
pbar = tqdm()
|
||||
cnt=0
|
||||
for x_batch in batch_data(train_text_id):
|
||||
in_w, out_w, target = x_batch
|
||||
in_w_var = torch.tensor(in_w)
|
||||
out_w_var = torch.tensor(out_w)
|
||||
target_var = torch.tensor(target,dtype=torch.float)
|
||||
|
||||
model.zero_grad()
|
||||
log_probs = model(in_w_var, out_w_var)
|
||||
loss = model.loss(log_probs, target_var)
|
||||
loss.backward()
|
||||
opt.step()
|
||||
ave_loss += loss.item()
|
||||
pbar.update(1)
|
||||
cnt += 1
|
||||
pbar.set_description('< loss: %.5f >' % (ave_loss / cnt))
|
||||
pbar.close()
|
||||
text_id = []
|
||||
vocab_dict = {}
|
||||
|
||||
with open(
|
||||
'D:\\project\\ml\\github\\cs224n-natural-language-processing-winter2019\\a1_intro_word_vectors\\a1\\corpus\\corpus.txt',
|
||||
encoding='utf-8') as fp:
|
||||
for line in fp:
|
||||
lines = re.sub("[^A-Za-z0-9']+", ' ', line).lower().split()
|
||||
line_id = []
|
||||
for s in lines:
|
||||
if not s:
|
||||
continue
|
||||
if s not in vocab_dict:
|
||||
vocab_dict[s] = len(vocab_dict)
|
||||
id = vocab_dict[s]
|
||||
line_id.append(id)
|
||||
if id==11500:
|
||||
print(id,s)
|
||||
text_id.append(line_id)
|
||||
vocab_size = len(vocab_dict)
|
||||
print('vocab_size', vocab_size)
|
||||
model = SkipGram(vocab_size, embd_size)
|
||||
|
||||
for epoch in range(epochs):
|
||||
print('epoch', epoch)
|
||||
opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
|
||||
lr=0.001, weight_decay=0)
|
||||
train(text_id, model,opt)
|
||||
|
||||
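The batch_data generator above emits four positive (center, context) pairs per window plus negative_sampling random negatives. A self-contained toy sketch of that sampling logic, using made-up token ids instead of the corpus-derived ids in the script:

import random

window = 2
negative_sampling = 4
vocab_size = 10
text = list(range(10))                        # made-up token ids for one sentence

pairs = []
for i in range(window, len(text) - window):
    context = [text[i - 2], text[i - 1], text[i + 1], text[i + 2]]
    window_ids = set(context) | {text[i]}
    for c in context:                         # four positive pairs per window
        pairs.append((text[i], c, 1))
    drawn = 0
    while drawn < negative_sampling:          # negatives must fall outside the window
        rand_id = random.randint(0, vocab_size - 1)
        if rand_id not in window_ids:
            pairs.append((text[i], rand_id, 0))
            drawn += 1

print(pairs[:8])                              # (2, 0, 1), (2, 1, 1), (2, 3, 1), (2, 4, 1), then 4 negatives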
40
[finished]Assignment_2_word2vec/others/pytorch_word2vec_model.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class SkipGram(nn.Module):
|
||||
def __init__(self, vocab_size, embd_size):
|
||||
super(SkipGram, self).__init__()
|
||||
self.embeddings = nn.Embedding(vocab_size, embd_size)
|
||||
|
||||
def forward(self, focus, context):
|
||||
embed_focus = self.embeddings(focus)
|
||||
embed_ctx = self.embeddings(context)
|
||||
# score = torch.mm(embed_focus, torch.t(embed_ctx))
|
||||
score = torch.mul(embed_focus, embed_ctx).sum(dim=1)
|
||||
log_probs = score  # raw logits; BCEWithLogitsLoss applies the sigmoid itself
|
||||
|
||||
return log_probs
|
||||
|
||||
def loss(self, log_probs, target):
|
||||
loss_fn = nn.BCEWithLogitsLoss()
|
||||
# loss_fn = nn.NLLLoss()
|
||||
loss = loss_fn(log_probs, target)
|
||||
return loss
|
||||
|
||||
|
||||
class CBOW(nn.Module):
|
||||
def __init__(self, vocab_size, embd_size, context_size, hidden_size):
|
||||
super(CBOW, self).__init__()
|
||||
self.embeddings = nn.Embedding(vocab_size, embd_size)
|
||||
self.linear1 = nn.Linear(2 * context_size * embd_size, hidden_size)
|
||||
self.linear2 = nn.Linear(hidden_size, vocab_size)
|
||||
|
||||
def forward(self, inputs):
|
||||
embedded = self.embeddings(inputs).view((1, -1))
|
||||
hid = F.relu(self.linear1(embedded))
|
||||
out = self.linear2(hid)
|
||||
log_probs = F.log_softmax(out, dim=1)
|
||||
return log_probs
|
||||
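A short usage sketch for the SkipGram module defined above, on random ids; it assumes the file is saved as pytorch_word2vec_model.py, which is what the import in pytorch_train.py suggests:

import torch
from pytorch_word2vec_model import SkipGram   # filename assumed from the import in pytorch_train.py

vocab_size, embd_size = 100, 16
model = SkipGram(vocab_size, embd_size)

focus   = torch.randint(0, vocab_size, (8,))  # center word ids
context = torch.randint(0, vocab_size, (8,))  # context or negative-sample ids
target  = torch.randint(0, 2, (8,)).float()   # 1 = observed pair, 0 = negative

logits = model(focus, context)                # dot product of the two embeddings, shape (8,)
loss = model.loss(logits, target)             # BCEWithLogitsLoss applies the sigmoid itself
loss.backward()
print(loss.item())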
75
[finished]Assignment_2_word2vec/run.py
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
from utils.treebank import StanfordSentiment
|
||||
import matplotlib
|
||||
matplotlib.use('agg')
|
||||
import matplotlib.pyplot as plt
|
||||
import time
|
||||
|
||||
from word2vec import *
|
||||
from sgd import *
|
||||
|
||||
# Check Python Version
|
||||
import sys
|
||||
assert sys.version_info[0] == 3
|
||||
assert sys.version_info[1] >= 5
|
||||
|
||||
# Reset the random seed to make sure that everyone gets the same results
|
||||
random.seed(314)
|
||||
dataset = StanfordSentiment()
|
||||
tokens = dataset.tokens()
|
||||
nWords = len(tokens)
|
||||
|
||||
# We are going to train 10-dimensional vectors for this assignment
|
||||
dimVectors = 10
|
||||
|
||||
# Context size
|
||||
C = 5
|
||||
|
||||
# Reset the random seed to make sure that everyone gets the same results
|
||||
random.seed(31415)
|
||||
np.random.seed(9265)
|
||||
|
||||
startTime=time.time()
|
||||
wordVectors = np.concatenate(
|
||||
((np.random.rand(nWords, dimVectors) - 0.5) /
|
||||
dimVectors, np.zeros((nWords, dimVectors))),
|
||||
axis=0)
|
||||
wordVectors = sgd(
|
||||
lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
|
||||
negSamplingLossAndGradient),
|
||||
wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
|
||||
# Note that normalization is not called here. This is not a bug,
|
||||
# normalizing during training loses the notion of length.
|
||||
|
||||
print("sanity check: cost at convergence should be around or below 10")
|
||||
print("training took %d seconds" % (time.time() - startTime))
|
||||
|
||||
# concatenate the input and output word vectors
|
||||
wordVectors = np.concatenate(
|
||||
(wordVectors[:nWords,:], wordVectors[nWords:,:]),
|
||||
axis=0)
|
||||
|
||||
visualizeWords = [
|
||||
"great", "cool", "brilliant", "wonderful", "well", "amazing",
|
||||
"worth", "sweet", "enjoyable", "boring", "bad", "dumb",
|
||||
"annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
|
||||
"hail", "coffee", "tea"]
|
||||
|
||||
visualizeIdx = [tokens[word] for word in visualizeWords]
|
||||
visualizeVecs = wordVectors[visualizeIdx, :]
|
||||
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
|
||||
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
|
||||
U,S,V = np.linalg.svd(covariance)
|
||||
coord = temp.dot(U[:,0:2])
|
||||
|
||||
for i in range(len(visualizeWords)):
|
||||
plt.text(coord[i,0], coord[i,1], visualizeWords[i],
|
||||
bbox=dict(facecolor='green', alpha=0.1))
|
||||
|
||||
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
|
||||
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
|
||||
|
||||
plt.savefig('word_vectors.png')
|
||||
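run.py ends by centering the selected word vectors, taking the SVD of their covariance, and projecting onto the top two directions before plotting. The same projection step in isolation, sketched on random vectors rather than trained ones:

import numpy as np

vecs = np.random.randn(24, 20)                # 24 "words", 20-dimensional vectors
temp = vecs - np.mean(vecs, axis=0)           # center the vectors
covariance = temp.T.dot(temp) / len(vecs)     # 20 x 20 covariance matrix
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])                   # project onto the top two directions
print(coord.shape)                            # (24, 2), ready for plt.text / plt.xlim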
133
[finished]Assignment_2_word2vec/sgd.py
Normal file
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Save parameters every a few SGD iterations as fail-safe
|
||||
SAVE_PARAMS_EVERY = 5000
|
||||
|
||||
import pickle
|
||||
import glob
|
||||
import random
|
||||
import numpy as np
|
||||
import os.path as op
|
||||
|
||||
def load_saved_params():
|
||||
"""
|
||||
A helper function that loads previously saved parameters and resets
|
||||
iteration start.
|
||||
"""
|
||||
st = 0
|
||||
for f in glob.glob("saved_params_*.npy"):
|
||||
iter = int(op.splitext(op.basename(f))[0].split("_")[2])
|
||||
if (iter > st):
|
||||
st = iter
|
||||
|
||||
if st > 0:
|
||||
params_file = "saved_params_%d.npy" % st
|
||||
state_file = "saved_state_%d.pickle" % st
|
||||
params = np.load(params_file)
|
||||
with open(state_file, "rb") as f:
|
||||
state = pickle.load(f)
|
||||
return st, params, state
|
||||
else:
|
||||
return st, None, None
|
||||
|
||||
|
||||
def save_params(iter, params):
|
||||
params_file = "saved_params_%d.npy" % iter
|
||||
np.save(params_file, params)
|
||||
with open("saved_state_%d.pickle" % iter, "wb") as f:
|
||||
pickle.dump(random.getstate(), f)
|
||||
|
||||
|
||||
def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
|
||||
PRINT_EVERY=10):
|
||||
""" Stochastic Gradient Descent
|
||||
|
||||
Implement the stochastic gradient descent method in this function.
|
||||
|
||||
Arguments:
|
||||
f -- the function to optimize, it should take a single
|
||||
argument and yield two outputs, a loss and the gradient
|
||||
with respect to the arguments
|
||||
x0 -- the initial point to start SGD from
|
||||
step -- the step size for SGD
|
||||
iterations -- total iterations to run SGD for
|
||||
postprocessing -- postprocessing function for the parameters
|
||||
if necessary. In the case of word2vec we will need to
|
||||
normalize the word vectors to have unit length.
|
||||
PRINT_EVERY -- specifies how often (in iterations) to print the loss
|
||||
|
||||
Return:
|
||||
x -- the parameter value after SGD finishes
|
||||
"""
|
||||
|
||||
# Anneal learning rate every several iterations
|
||||
ANNEAL_EVERY = 20000
|
||||
|
||||
if useSaved:
|
||||
start_iter, oldx, state = load_saved_params()
|
||||
if start_iter > 0:
|
||||
x0 = oldx
|
||||
step *= 0.5 ** (start_iter / ANNEAL_EVERY)
|
||||
|
||||
if state:
|
||||
random.setstate(state)
|
||||
else:
|
||||
start_iter = 0
|
||||
|
||||
x = x0
|
||||
|
||||
if not postprocessing:
|
||||
postprocessing = lambda x: x
|
||||
|
||||
exploss = None
|
||||
|
||||
for iter in range(start_iter + 1, iterations + 1):
|
||||
# You might want to print the progress every few iterations.
|
||||
|
||||
loss = None
|
||||
### YOUR CODE HERE
|
||||
loss,gd = f(x)
|
||||
x = x - step*gd
|
||||
x = postprocessing(x)
|
||||
### END YOUR CODE
|
||||
|
||||
x = postprocessing(x)
|
||||
if iter % PRINT_EVERY == 0:
|
||||
if not exploss:
|
||||
exploss = loss
|
||||
else:
|
||||
exploss = .95 * exploss + .05 * loss
|
||||
print("iter %d: %f" % (iter, exploss))
|
||||
|
||||
if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
|
||||
save_params(iter, x)
|
||||
|
||||
if iter % ANNEAL_EVERY == 0:
|
||||
step *= 0.5
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def sanity_check():
|
||||
quad = lambda x: (np.sum(x ** 2), x * 2)
|
||||
|
||||
print("Running sanity checks...")
|
||||
t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
|
||||
print("test 1 result:", t1)
|
||||
assert abs(t1) <= 1e-6
|
||||
|
||||
t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
|
||||
print("test 2 result:", t2)
|
||||
assert abs(t2) <= 1e-6
|
||||
|
||||
t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
|
||||
print("test 3 result:", t3)
|
||||
assert abs(t3) <= 1e-6
|
||||
|
||||
print("-" * 40)
|
||||
print("ALL TESTS PASSED")
|
||||
print("-" * 40)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sanity_check()
|
||||
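When sgd() resumes from saved parameters it rescales the step size as step *= 0.5 ** (start_iter / ANNEAL_EVERY). A tiny numeric illustration of that resume-time annealing, using the values run.py passes in:

ANNEAL_EVERY = 20000
step = 0.3                                    # the learning rate run.py passes to sgd()
start_iter = 40000                            # resuming from saved_params_40000.npy
step *= 0.5 ** (start_iter / ANNEAL_EVERY)
print(step)                                   # 0.075, i.e. 0.3 halved twice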
BIN
[finished]Assignment_2_word2vec/utils/.DS_Store
vendored
Normal file
Binary file not shown.
0
[finished]Assignment_2_word2vec/utils/__init__.py
Normal file
47
[finished]Assignment_2_word2vec/utils/gradcheck.py
Normal file
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
|
||||
# First implement a gradient checker by filling in the following functions
|
||||
def gradcheck_naive(f, x, gradientText):
|
||||
""" Gradient check for a function f.
|
||||
Arguments:
|
||||
f -- a function that takes a single argument and outputs the
|
||||
loss and its gradients
|
||||
x -- the point (numpy array) to check the gradient at
|
||||
gradientText -- a string detailing some context about the gradient computation
|
||||
"""
|
||||
|
||||
rndstate = random.getstate()
|
||||
random.setstate(rndstate)
|
||||
fx, grad = f(x) # Evaluate function value at original point
|
||||
h = 1e-4 # Do not change this!
|
||||
|
||||
# Iterate over all indexes ix in x to check the gradient.
|
||||
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
|
||||
while not it.finished:
|
||||
ix = it.multi_index
|
||||
|
||||
x[ix] += h # increment by h
|
||||
random.setstate(rndstate)
|
||||
fxh, _ = f(x) # evaluate f(x + h)
|
||||
x[ix] -= 2 * h # restore to previous value (very important!)
|
||||
random.setstate(rndstate)
|
||||
fxnh, _ = f(x)
|
||||
x[ix] += h
|
||||
numgrad = (fxh - fxnh) / 2 / h
|
||||
|
||||
# Compare gradients
|
||||
reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
|
||||
if reldiff > 1e-5:
|
||||
print("Gradient check failed for %s." % gradientText)
|
||||
print("First gradient error found at index %s in the vector of gradients" % str(ix))
|
||||
print("Your gradient: %f \t Numerical gradient: %f" % (
|
||||
grad[ix], numgrad))
|
||||
return
|
||||
|
||||
it.iternext() # Step to next dimension
|
||||
|
||||
print("Gradient check passed!")
|
||||
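gradcheck_naive above compares an analytic gradient against the central difference (f(x+h) - f(x-h)) / 2h. A quick sketch of running it on a function with a known gradient; it assumes it is run from the assignment root so utils is importable:

import numpy as np
from utils.gradcheck import gradcheck_naive

# f(x) = sum(x^3) has the closed-form gradient 3 * x^2
cubic = lambda x: (np.sum(x ** 3), 3 * x ** 2)
x = np.random.randn(4, 3)
gradcheck_naive(cubic, x, "cubic")            # should print "Gradient check passed!"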
248
[finished]Assignment_2_word2vec/utils/treebank.py
Normal file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import pickle
|
||||
import numpy as np
|
||||
import os
|
||||
import random
|
||||
|
||||
class StanfordSentiment:
|
||||
def __init__(self, path=None, tablesize = 1000000):
|
||||
if not path:
|
||||
path = "utils/datasets/stanfordSentimentTreebank"
|
||||
|
||||
self.path = path
|
||||
self.tablesize = tablesize
|
||||
|
||||
def tokens(self):
|
||||
if hasattr(self, "_tokens") and self._tokens:
|
||||
return self._tokens
|
||||
|
||||
tokens = dict()
|
||||
tokenfreq = dict()
|
||||
wordcount = 0
|
||||
revtokens = []
|
||||
idx = 0
|
||||
|
||||
for sentence in self.sentences():
|
||||
for w in sentence:
|
||||
wordcount += 1
|
||||
if not w in tokens:
|
||||
tokens[w] = idx
|
||||
revtokens += [w]
|
||||
tokenfreq[w] = 1
|
||||
idx += 1
|
||||
else:
|
||||
tokenfreq[w] += 1
|
||||
|
||||
tokens["UNK"] = idx
|
||||
revtokens += ["UNK"]
|
||||
tokenfreq["UNK"] = 1
|
||||
wordcount += 1
|
||||
|
||||
self._tokens = tokens
|
||||
self._tokenfreq = tokenfreq
|
||||
self._wordcount = wordcount
|
||||
self._revtokens = revtokens
|
||||
return self._tokens
|
||||
|
||||
def sentences(self):
|
||||
if hasattr(self, "_sentences") and self._sentences:
|
||||
return self._sentences
|
||||
|
||||
sentences = []
|
||||
with open(self.path + "/datasetSentences.txt", "r") as f:
|
||||
first = True
|
||||
for line in f:
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
|
||||
splitted = line.strip().split()[1:]
|
||||
# Deal with some peculiar encoding issues with this file
|
||||
sentences += [[w.lower() for w in splitted]]
|
||||
|
||||
self._sentences = sentences
|
||||
self._sentlengths = np.array([len(s) for s in sentences])
|
||||
self._cumsentlen = np.cumsum(self._sentlengths)
|
||||
|
||||
return self._sentences
|
||||
|
||||
def numSentences(self):
|
||||
if hasattr(self, "_numSentences") and self._numSentences:
|
||||
return self._numSentences
|
||||
else:
|
||||
self._numSentences = len(self.sentences())
|
||||
return self._numSentences
|
||||
|
||||
def allSentences(self):
|
||||
if hasattr(self, "_allsentences") and self._allsentences:
|
||||
return self._allsentences
|
||||
|
||||
sentences = self.sentences()
|
||||
rejectProb = self.rejectProb()
|
||||
tokens = self.tokens()
|
||||
allsentences = [[w for w in s
|
||||
if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
|
||||
for s in sentences * 30]
|
||||
|
||||
allsentences = [s for s in allsentences if len(s) > 1]
|
||||
|
||||
self._allsentences = allsentences
|
||||
|
||||
return self._allsentences
|
||||
|
||||
def getRandomContext(self, C=5):
|
||||
allsent = self.allSentences()
|
||||
sentID = random.randint(0, len(allsent) - 1)
|
||||
sent = allsent[sentID]
|
||||
wordID = random.randint(0, len(sent) - 1)
|
||||
|
||||
context = sent[max(0, wordID - C):wordID]
|
||||
if wordID+1 < len(sent):
|
||||
context += sent[wordID+1:min(len(sent), wordID + C + 1)]
|
||||
|
||||
centerword = sent[wordID]
|
||||
context = [w for w in context if w != centerword]
|
||||
|
||||
if len(context) > 0:
|
||||
return centerword, context
|
||||
else:
|
||||
return self.getRandomContext(C)
|
||||
|
||||
def sent_labels(self):
|
||||
if hasattr(self, "_sent_labels") and self._sent_labels:
|
||||
return self._sent_labels
|
||||
|
||||
dictionary = dict()
|
||||
phrases = 0
|
||||
with open(self.path + "/dictionary.txt", "r") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line: continue
|
||||
splitted = line.split("|")
|
||||
dictionary[splitted[0].lower()] = int(splitted[1])
|
||||
phrases += 1
|
||||
|
||||
labels = [0.0] * phrases
|
||||
with open(self.path + "/sentiment_labels.txt", "r") as f:
|
||||
first = True
|
||||
for line in f:
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
|
||||
line = line.strip()
|
||||
if not line: continue
|
||||
splitted = line.split("|")
|
||||
labels[int(splitted[0])] = float(splitted[1])
|
||||
|
||||
sent_labels = [0.0] * self.numSentences()
|
||||
sentences = self.sentences()
|
||||
for i in range(self.numSentences()):
|
||||
sentence = sentences[i]
|
||||
full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
|
||||
sent_labels[i] = labels[dictionary[full_sent]]
|
||||
|
||||
self._sent_labels = sent_labels
|
||||
return self._sent_labels
|
||||
|
||||
def dataset_split(self):
|
||||
if hasattr(self, "_split") and self._split:
|
||||
return self._split
|
||||
|
||||
split = [[] for i in range(3)]
|
||||
with open(self.path + "/datasetSplit.txt", "r") as f:
|
||||
first = True
|
||||
for line in f:
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
|
||||
splitted = line.strip().split(",")
|
||||
split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]
|
||||
|
||||
self._split = split
|
||||
return self._split
|
||||
|
||||
def getRandomTrainSentence(self):
|
||||
split = self.dataset_split()
|
||||
sentId = split[0][random.randint(0, len(split[0]) - 1)]
|
||||
return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])
|
||||
|
||||
def categorify(self, label):
|
||||
if label <= 0.2:
|
||||
return 0
|
||||
elif label <= 0.4:
|
||||
return 1
|
||||
elif label <= 0.6:
|
||||
return 2
|
||||
elif label <= 0.8:
|
||||
return 3
|
||||
else:
|
||||
return 4
|
||||
|
||||
def getDevSentences(self):
|
||||
return self.getSplitSentences(2)
|
||||
|
||||
def getTestSentences(self):
|
||||
return self.getSplitSentences(1)
|
||||
|
||||
def getTrainSentences(self):
|
||||
return self.getSplitSentences(0)
|
||||
|
||||
def getSplitSentences(self, split=0):
|
||||
ds_split = self.dataset_split()
|
||||
return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]
|
||||
|
||||
def sampleTable(self):
|
||||
if hasattr(self, '_sampleTable') and self._sampleTable is not None:
|
||||
return self._sampleTable
|
||||
|
||||
nTokens = len(self.tokens())
|
||||
samplingFreq = np.zeros((nTokens,))
|
||||
self.allSentences()
|
||||
i = 0
|
||||
for w in range(nTokens):
|
||||
w = self._revtokens[i]
|
||||
if w in self._tokenfreq:
|
||||
freq = 1.0 * self._tokenfreq[w]
|
||||
# Reweigh
|
||||
freq = freq ** 0.75
|
||||
else:
|
||||
freq = 0.0
|
||||
samplingFreq[i] = freq
|
||||
i += 1
|
||||
|
||||
samplingFreq /= np.sum(samplingFreq)
|
||||
samplingFreq = np.cumsum(samplingFreq) * self.tablesize
|
||||
|
||||
self._sampleTable = [0] * self.tablesize
|
||||
|
||||
j = 0
|
||||
for i in range(self.tablesize):
|
||||
while i > samplingFreq[j]:
|
||||
j += 1
|
||||
self._sampleTable[i] = j
|
||||
|
||||
return self._sampleTable
|
||||
|
||||
def rejectProb(self):
|
||||
if hasattr(self, '_rejectProb') and self._rejectProb is not None:
|
||||
return self._rejectProb
|
||||
|
||||
threshold = 1e-5 * self._wordcount
|
||||
|
||||
nTokens = len(self.tokens())
|
||||
rejectProb = np.zeros((nTokens,))
|
||||
for i in range(nTokens):
|
||||
w = self._revtokens[i]
|
||||
freq = 1.0 * self._tokenfreq[w]
|
||||
# Reweigh
|
||||
rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))
|
||||
|
||||
self._rejectProb = rejectProb
|
||||
return self._rejectProb
|
||||
|
||||
def sampleTokenIdx(self):
|
||||
return self.sampleTable()[random.randint(0, self.tablesize - 1)]
|
||||
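StanfordSentiment above uses two frequency heuristics: words are dropped with probability max(0, 1 - sqrt(threshold / freq)), and the negative-sampling table is built from freq ** 0.75. A toy illustration with made-up counts and a pretend corpus size:

import numpy as np

tokenfreq = {"the": 5000, "movie": 300, "luminous": 2}   # made-up counts
wordcount = 1_000_000                                    # pretend corpus size
threshold = 1e-5 * wordcount

for w, freq in tokenfreq.items():
    reject = max(0, 1 - np.sqrt(threshold / freq))
    print(w, round(reject, 3))                # the: 0.955, movie: 0.817, luminous: 0

sampling = np.array([f ** 0.75 for f in tokenfreq.values()], dtype=float)
sampling /= sampling.sum()
print(sampling)                               # flattened unigram distribution for negatives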
43
[finished]Assignment_2_word2vec/utils/utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
|
||||
def normalizeRows(x):
|
||||
""" Row normalization function
|
||||
|
||||
Implement a function that normalizes each row of a matrix to have
|
||||
unit length.
|
||||
"""
|
||||
N = x.shape[0]
|
||||
x /= np.sqrt(np.sum(x**2, axis=1)).reshape((N,1)) + 1e-30
|
||||
return x
|
||||
|
||||
def softmax(x):
|
||||
"""Compute the softmax function for each row of the input x.
|
||||
It is crucial that this function is optimized for speed because
|
||||
it will be used frequently in later code.
|
||||
|
||||
Arguments:
|
||||
x -- A D dimensional vector or N x D dimensional numpy matrix.
|
||||
Return:
|
||||
x -- You are allowed to modify x in-place
|
||||
"""
|
||||
orig_shape = x.shape
|
||||
|
||||
if len(x.shape) > 1:
|
||||
# Matrix
|
||||
tmp = np.max(x, axis=1)
|
||||
x -= tmp.reshape((x.shape[0], 1))
|
||||
x = np.exp(x)
|
||||
tmp = np.sum(x, axis=1)
|
||||
x /= tmp.reshape((x.shape[0], 1))
|
||||
else:
|
||||
# Vector
|
||||
tmp = np.max(x)
|
||||
x -= tmp
|
||||
x = np.exp(x)
|
||||
tmp = np.sum(x)
|
||||
x /= tmp
|
||||
|
||||
assert x.shape == orig_shape
|
||||
return x
|
||||
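softmax above subtracts the per-row maximum before exponentiating so large scores do not overflow, and normalizeRows rescales each row to unit length. A short check of both, assuming it is run from the assignment root:

import numpy as np
from utils.utils import softmax, normalizeRows

print(softmax(np.array([1000.0, 1001.0, 1002.0])))   # finite: ~[0.090, 0.245, 0.665]
print(softmax(np.array([[1.0, 2.0], [3.0, 4.0]])))   # row-wise softmax of a matrix

rows = normalizeRows(np.array([[3.0, 4.0], [1.0, 2.0]]))
print(np.linalg.norm(rows, axis=1))                  # each row now has unit length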
301
[finished]Assignment_2_word2vec/word2vec.py
Normal file
@@ -0,0 +1,301 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
from utils.gradcheck import gradcheck_naive
|
||||
from utils.utils import normalizeRows, softmax
|
||||
|
||||
|
||||
def sigmoid(x):
|
||||
"""
|
||||
Compute the sigmoid function for the input here.
|
||||
Arguments:
|
||||
x -- A scalar or numpy array.
|
||||
Return:
|
||||
s -- sigmoid(x)
|
||||
"""
|
||||
|
||||
### YOUR CODE HERE
|
||||
s = 1/(1+np.exp(-x))
|
||||
### END YOUR CODE
|
||||
|
||||
return s
|
||||
|
||||
|
||||
def naiveSoftmaxLossAndGradient(
|
||||
centerWordVec,
|
||||
outsideWordIdx,
|
||||
outsideVectors,
|
||||
dataset
|
||||
):
|
||||
""" Naive Softmax loss & gradient function for word2vec models
|
||||
|
||||
Implement the naive softmax loss and gradients between a center word's
|
||||
embedding and an outside word's embedding. This will be the building block
|
||||
for our word2vec models.
|
||||
|
||||
Arguments:
|
||||
centerWordVec -- numpy ndarray, center word's embedding
|
||||
(v_c in the pdf handout)
|
||||
outsideWordIdx -- integer, the index of the outside word
|
||||
(o of u_o in the pdf handout)
|
||||
outsideVectors -- outside vectors (rows of matrix) for all words in vocab
|
||||
(U in the pdf handout)
|
||||
dataset -- needed for negative sampling, unused here.
|
||||
|
||||
Return:
|
||||
loss -- naive softmax loss
|
||||
gradCenterVec -- the gradient with respect to the center word vector
|
||||
(dJ / dv_c in the pdf handout)
|
||||
gradOutsideVecs -- the gradient with respect to all the outside word vectors
|
||||
(dJ / dU)
|
||||
"""
|
||||
|
||||
### YOUR CODE HERE
|
||||
score = np.dot(outsideVectors,centerWordVec)
|
||||
y_h = softmax(score)
|
||||
loss = -np.log(y_h[outsideWordIdx])
|
||||
y = np.eye(y_h.shape[0])[outsideWordIdx]
|
||||
diff = (y_h-y).reshape((y.shape[0],1))
|
||||
|
||||
gradCenterVec = np.dot(diff.T,outsideVectors)
|
||||
centerWordVec = centerWordVec.reshape((1,centerWordVec.shape[0]))
|
||||
gradOutsideVecs = np.dot(diff,centerWordVec)
|
||||
|
||||
### END YOUR CODE
|
||||
|
||||
return loss, gradCenterVec, gradOutsideVecs
|
||||
|
||||
|
||||
def getNegativeSamples(outsideWordIdx, dataset, K):
|
||||
""" Samples K indexes which are not the outsideWordIdx """
|
||||
|
||||
negSampleWordIndices = [None] * K
|
||||
for k in range(K):
|
||||
newidx = dataset.sampleTokenIdx()
|
||||
while newidx == outsideWordIdx:
|
||||
newidx = dataset.sampleTokenIdx()
|
||||
negSampleWordIndices[k] = newidx
|
||||
return negSampleWordIndices
|
||||
|
||||
|
||||
def negSamplingLossAndGradient(
|
||||
centerWordVec,
|
||||
outsideWordIdx,
|
||||
outsideVectors,
|
||||
dataset,
|
||||
K=10
|
||||
):
|
||||
""" Negative sampling loss function for word2vec models
|
||||
|
||||
Implement the negative sampling loss and gradients for a centerWordVec
|
||||
and an outsideWordIdx word vector as a building block for word2vec
|
||||
models. K is the number of negative samples to take.
|
||||
|
||||
Note: The same word may be negatively sampled multiple times. For
|
||||
example if an outside word is sampled twice, you shall have to
|
||||
double count the gradient with respect to this word. Thrice if
|
||||
it was sampled three times, and so forth.
|
||||
|
||||
Arguments/Return Specifications: same as naiveSoftmaxLossAndGradient
|
||||
"""
|
||||
|
||||
# Negative sampling of words is done for you. Do not modify this if you
|
||||
# wish to match the autograder and receive points!
|
||||
negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)
|
||||
indices = [outsideWordIdx] + negSampleWordIndices
|
||||
|
||||
|
||||
### YOUR CODE HERE
|
||||
|
||||
score = np.dot(outsideVectors[outsideWordIdx],centerWordVec)
|
||||
sig_1 = sigmoid(score)
|
||||
|
||||
sum_neg = 0.0
|
||||
|
||||
#Find unique negative samples and the number of times they are present in our sample window
|
||||
unique_k, counts_k = np.unique(indices[1:], return_counts=True)
|
||||
k_stack = outsideVectors[unique_k]
|
||||
|
||||
score_neg = -np.dot(k_stack,centerWordVec)
|
||||
sig_neg = sigmoid(score_neg)
|
||||
sum_neg = np.sum(counts_k*np.log(sig_neg),axis=0)
|
||||
|
||||
#J_neg_sam Loss
|
||||
loss = -np.log(sig_1) - sum_neg
|
||||
|
||||
#Calculate gradients
|
||||
k_term = 0.0
|
||||
#delta term from previous layer for efficient implementation
|
||||
delta_1msig = 1-sig_1
|
||||
delta_1msig_neg = 1-sig_neg
|
||||
|
||||
gradOutsideVecs = np.zeros((outsideVectors.shape))
|
||||
gradOutsideVecs[outsideWordIdx,:] = -delta_1msig*centerWordVec
|
||||
common_term = np.dot(delta_1msig_neg.reshape(unique_k.shape[0],1),centerWordVec.reshape(1,centerWordVec.shape[0]))
|
||||
gradOutsideVecs[unique_k,:] += counts_k.reshape(counts_k.shape[0],1)*common_term
|
||||
|
||||
#Reshape prep for center gradient calculation
|
||||
counts_k = counts_k.reshape(counts_k.shape[0],1)
|
||||
delta_1msig_neg = delta_1msig_neg.reshape(delta_1msig_neg.shape[0],1)
|
||||
k_term = np.sum(np.dot((delta_1msig_neg.reshape(1,counts_k.shape[0])),counts_k*k_stack),axis=0)
|
||||
gradCenterVec = -delta_1msig*outsideVectors[outsideWordIdx] + k_term
|
||||
|
||||
### END YOUR CODE
|
||||
|
||||
return loss, gradCenterVec, gradOutsideVecs
|
||||
|
||||
|
||||
def skipgram(currentCenterWord, windowSize, outsideWords, word2Ind,
|
||||
centerWordVectors, outsideVectors, dataset,
|
||||
word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
|
||||
""" Skip-gram model in word2vec
|
||||
|
||||
Implement the skip-gram model in this function.
|
||||
|
||||
Arguments:
|
||||
currentCenterWord -- a string of the current center word
|
||||
windowSize -- integer, context window size
|
||||
outsideWords -- list of no more than 2*windowSize strings, the outside words
|
||||
word2Ind -- a dictionary that maps words to their indices in
|
||||
the word vector list
|
||||
centerWordVectors -- center word vectors (as rows) for all words in vocab
|
||||
(V in pdf handout)
|
||||
outsideVectors -- outside word vectors (as rows) for all words in vocab
|
||||
(U in pdf handout)
|
||||
word2vecLossAndGradient -- the loss and gradient function for
|
||||
a prediction vector given the outsideWordIdx
|
||||
word vectors, could be one of the two
|
||||
loss functions you implemented above.
|
||||
|
||||
Return:
|
||||
loss -- the loss function value for the skip-gram model
|
||||
(J in the pdf handout)
|
||||
gradCenterVecs -- the gradient with respect to the center word vectors
|
||||
(dJ / dV in the pdf handout)
|
||||
gradOutsideVectors -- the gradient with respect to the outside word vectors
|
||||
(dJ / dU in the pdf handout)
|
||||
"""
|
||||
|
||||
loss = 0.0
|
||||
gradCenterVecs = np.zeros(centerWordVectors.shape)
|
||||
gradOutsideVectors = np.zeros(outsideVectors.shape)
|
||||
|
||||
### YOUR CODE HERE
|
||||
for m in range(0,len(outsideWords)):
|
||||
l,gradCenter,gradOutside= word2vecLossAndGradient(centerWordVectors[word2Ind[currentCenterWord]],word2Ind[outsideWords[m]],outsideVectors,dataset)
|
||||
loss+=l
|
||||
gradCenterVecs[word2Ind[currentCenterWord]] += gradCenter.reshape((centerWordVectors.shape[1],))
|
||||
gradOutsideVectors += gradOutside
|
||||
### END YOUR CODE
|
||||
|
||||
return loss, gradCenterVecs, gradOutsideVectors
|
||||
|
||||
#############################################
|
||||
# Testing functions below. DO NOT MODIFY! #
|
||||
#############################################
|
||||
|
||||
def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset,
|
||||
windowSize,
|
||||
word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
|
||||
batchsize = 50
|
||||
loss = 0.0
|
||||
grad = np.zeros(wordVectors.shape)
|
||||
N = wordVectors.shape[0]
|
||||
centerWordVectors = wordVectors[:int(N/2),:]
|
||||
outsideVectors = wordVectors[int(N/2):,:]
|
||||
for i in range(batchsize):
|
||||
windowSize1 = random.randint(1, windowSize)
|
||||
centerWord, context = dataset.getRandomContext(windowSize1)
|
||||
|
||||
c, gin, gout = word2vecModel(
|
||||
centerWord, windowSize1, context, word2Ind, centerWordVectors,
|
||||
outsideVectors, dataset, word2vecLossAndGradient
|
||||
)
|
||||
loss += c / batchsize
|
||||
grad[:int(N/2), :] += gin / batchsize
|
||||
grad[int(N/2):, :] += gout / batchsize
|
||||
|
||||
return loss, grad
|
||||
|
||||
|
||||
def test_word2vec():
|
||||
""" Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
|
||||
dataset = type('dummy', (), {})()
|
||||
def dummySampleTokenIdx():
|
||||
return random.randint(0, 4)
|
||||
|
||||
def getRandomContext(C):
|
||||
tokens = ["a", "b", "c", "d", "e"]
|
||||
return tokens[random.randint(0,4)], \
|
||||
[tokens[random.randint(0,4)] for i in range(2*C)]
|
||||
dataset.sampleTokenIdx = dummySampleTokenIdx
|
||||
dataset.getRandomContext = getRandomContext
|
||||
|
||||
random.seed(31415)
|
||||
np.random.seed(9265)
|
||||
dummy_vectors = normalizeRows(np.random.randn(10,3))
|
||||
dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)])
|
||||
|
||||
print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
|
||||
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
|
||||
skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
|
||||
dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")
|
||||
|
||||
print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
|
||||
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
|
||||
skipgram, dummy_tokens, vec, dataset, 5, negSamplingLossAndGradient),
|
||||
dummy_vectors, "negSamplingLossAndGradient Gradient")
|
||||
|
||||
print("\n=== Results ===")
|
||||
print ("Skip-Gram with naiveSoftmaxLossAndGradient")
|
||||
|
||||
print ("Your Result:")
|
||||
print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
|
||||
*skipgram("c", 3, ["a", "b", "e", "d", "b", "c"],
|
||||
dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)
|
||||
)
|
||||
)
|
||||
|
||||
print ("Expected Result: Value should approximate these:")
|
||||
print("""Loss: 11.16610900153398
|
||||
Gradient wrt Center Vectors (dJ/dV):
|
||||
[[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]
|
||||
[-1.26947339 -1.36873189 2.45158957]
|
||||
[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]]
|
||||
Gradient wrt Outside Vectors (dJ/dU):
|
||||
[[-0.41045956 0.18834851 1.43272264]
|
||||
[ 0.38202831 -0.17530219 -1.33348241]
|
||||
[ 0.07009355 -0.03216399 -0.24466386]
|
||||
[ 0.09472154 -0.04346509 -0.33062865]
|
||||
[-0.13638384 0.06258276 0.47605228]]
|
||||
""")
|
||||
|
||||
print ("Skip-Gram with negSamplingLossAndGradient")
|
||||
print ("Your Result:")
|
||||
print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\n Gradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
|
||||
*skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:],
|
||||
dummy_vectors[5:,:], dataset, negSamplingLossAndGradient)
|
||||
)
|
||||
)
|
||||
print ("Expected Result: Value should approximate these:")
|
||||
print("""Loss: 16.15119285363322
|
||||
Gradient wrt Center Vectors (dJ/dV):
|
||||
[[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]
|
||||
[-4.54650789 -1.85942252 0.76397441]
|
||||
[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]]
|
||||
Gradient wrt Outside Vectors (dJ/dU):
|
||||
[[-0.69148188 0.31730185 2.41364029]
|
||||
[-0.22716495 0.10423969 0.79292674]
|
||||
[-0.45528438 0.20891737 1.58918512]
|
||||
[-0.31602611 0.14501561 1.10309954]
|
||||
[-0.80620296 0.36994417 2.81407799]]
|
||||
""")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_word2vec()
|
||||
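For a small vocabulary the naive softmax loss above should equal -log softmax(U v_c)[o], with dJ/dv_c = U^T(y_hat - y) and dJ/dU the outer product of (y_hat - y) and v_c. A small sanity check of the implementation against those closed forms, assuming it is run from the assignment root:

import numpy as np
from word2vec import naiveSoftmaxLossAndGradient
from utils.utils import softmax

np.random.seed(0)
U = np.random.randn(3, 4)            # outside vectors, one row per vocabulary word
v_c = np.random.randn(4)             # center word vector
o = 1                                # index of the observed outside word

loss, gradV, gradU = naiveSoftmaxLossAndGradient(v_c, o, U, None)

y_hat = softmax(U.dot(v_c))
y = np.eye(3)[o]
assert np.isclose(loss, -np.log(y_hat[o]))
assert np.allclose(gradV.flatten(), U.T.dot(y_hat - y))
assert np.allclose(gradU, np.outer(y_hat - y, v_c))
print(loss)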