a2
BIN
[finished]Assignment_2_word2vec/a2 written.pdf
Normal file
Binary file not shown.
2
[finished]Assignment_2_word2vec/collect_submission.sh
Normal file
@@ -0,0 +1,2 @@
rm -f assignment2.zip
zip -r assignment2.zip *.py *.png saved_params_40000.npy
15
[finished]Assignment_2_word2vec/get_datasets.sh
Normal file
@@ -0,0 +1,15 @@
#!/bin/bash

DATASETS_DIR="utils/datasets"
mkdir -p $DATASETS_DIR

cd $DATASETS_DIR

# Get Stanford Sentiment Treebank
if hash wget 2>/dev/null; then
  wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
else
  curl -L http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip -o stanfordSentimentTreebank.zip
fi
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip
461
[finished]Assignment_2_word2vec/others/pytorch review1.ipynb
Normal file
@@ -0,0 +1,461 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Why learn PyTorch\n",
    "\n",
    "TensorFlow has a steep learning curve\n",
    "\n",
    "PyTorch comes from Facebook\n",
    "\n",
    "PyTorch can be used like NumPy\n",
    "\n",
    "static graphs vs. dynamic graphs\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In recent versions you can just use tensors for everything\n",
    "\n",
    "Almost anything NumPy can do, PyTorch can do as well\n",
    "\n",
    "torch.randn()=numpy.random.randn()\n",
    "\n",
    "torch.max()=np.max()\n",
    "\n",
    "torch.zeros()=np.zeros()\n",
    "\n",
    "\n",
    "If something is missing, tensors and NumPy arrays can be converted back and forth\n",
    "\n",
    "tensor.numpy()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.1.0\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "print(torch.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "device(type='cpu')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([0., 1., 2., 3., 4., 5., 6.])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# python struct to torch tensor\n",
    "temp=[0, 1, 2, 3, 4, 5, 6]\n",
    "x = torch.tensor(temp, dtype=torch.float, device=device)\n",
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#gpu/cpu\n",
    "x = torch.tensor(temp).cuda()\n",
    "x = x.cpu()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x=x.long()\n",
    "x=x.float()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## numpy reshape squeeze expand_dims"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1 2 3 4 5 6 7 8 9]\n",
      "(10,)\n"
     ]
    }
   ],
   "source": [
    "a = np.arange(10)\n",
    "print(a)\n",
    "print(a.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[[0]\n",
      " [1]]\n",
      "\n",
      " [[2]\n",
      " [3]]\n",
      "\n",
      " [[4]\n",
      " [5]]\n",
      "\n",
      " [[6]\n",
      " [7]]\n",
      "\n",
      " [[8]\n",
      " [9]]]\n",
      "(5, 2, 1)\n"
     ]
    }
   ],
   "source": [
    "#a=a.reshape(1,-1)\n",
    "#a.reshape(1,10)\n",
    "a=a.reshape(5,2,1)\n",
    "print(a)\n",
    "print(a.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 1]\n",
      " [2 3]\n",
      " [4 5]\n",
      " [6 7]\n",
      " [8 9]]\n",
      "(5, 2)\n"
     ]
    }
   ],
   "source": [
    "b = np.squeeze(a)\n",
    "print(b)\n",
    "print(b.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5, 1, 2)\n"
     ]
    }
   ],
   "source": [
    "a = np.arange(10)\n",
    "a=a.reshape(5,1,2)\n",
    "print(a.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 1]\n",
      " [2 3]\n",
      " [4 5]\n",
      " [6 7]\n",
      " [8 9]]\n",
      "(5, 2)\n"
     ]
    }
   ],
   "source": [
    "b = np.squeeze(a)#b=a.reshape(5,2)\n",
    "print(b)\n",
    "print(b.shape)\n",
    "#np.squeeze(e,axis = 0,1,2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a (10,)\n",
      "b axis=0 (1, 10)\n",
      "c axis=1 (10, 1)\n"
     ]
    }
   ],
   "source": [
    "#expand_dims\n",
    "a = np.arange(10)\n",
    "print(\"a\",a.shape)\n",
    "b = np.expand_dims(a, axis=0)\n",
    "print(\"b axis=0\",b.shape)\n",
    "c = np.expand_dims(a, axis=1)\n",
    "print(\"c axis=1\",c.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pytorch reshape squeeze unsqueeze"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n",
      "x torch.Size([10])\n",
      "b axis=0 torch.Size([1, 10])\n",
      "c axis=1 torch.Size([10, 1])\n"
     ]
    }
   ],
   "source": [
    "x = torch.arange(0,10)\n",
    "print(\"x\",x)\n",
    "print(\"x\",x.shape)\n",
    "b = x.unsqueeze(0)\n",
    "print(\"b axis=0\",b.shape)\n",
    "c = torch.unsqueeze(x,1)\n",
    "print(\"c axis=1\",c.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "d torch.Size([10])\n",
      "d torch.Size([10])\n"
     ]
    }
   ],
   "source": [
    "d=c.squeeze(1)\n",
    "print(\"d\",d.shape)\n",
    "d=c.squeeze()\n",
    "print(\"d\",d.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "d torch.Size([10])\n"
     ]
    }
   ],
   "source": [
    "d=b.squeeze()\n",
    "print(\"d\",d.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([8])\n",
      "torch.Size([8])\n",
      "tensor(0.4091, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)\n"
     ]
    }
   ],
   "source": [
    "import torch.nn.functional as F\n",
    "m = nn.Sigmoid()\n",
    "\n",
    "loss = nn.BCEWithLogitsLoss()\n",
    "temp=[0, 1, 2,0, 1, 2,0, 1]\n",
    "input =torch.tensor(temp, dtype=torch.float, requires_grad=True)\n",
    "#input = torch.randn(3, requires_grad=True)\n",
    "target = torch.tensor([0,1,1,0,1,1,0,1], dtype=torch.float)\n",
    "lossinput = input\n",
    "output = loss(lossinput, target)\n",
    "\n",
    "\n",
    "print(lossinput.shape)\n",
    "\n",
    "print(target.shape)\n",
    "\n",
    "print(output)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic regression\n",
    "\n",
    "$$y(z) = \\frac{1}{1 + exp(-\\theta^T z)}$$\n",
    "\n",
    "nn.BCELoss\n",
    "$$\n",
    " \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad\n",
    " l_n = - w_n \\left[ y_n \\cdot \\log x_n + (1 - y_n) \\cdot \\log (1 - x_n) \\right],\n",
    " $$ \n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn.functional as F  # activation functions\n",
    "\n",
    "feature_num=100\n",
    "\n",
    "class LogistRegression(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(LogistRegression, self).__init__()\n",
    "        self.linear = torch.nn.Linear(feature_num, 1)\n",
    "    def forward(self, x):\n",
    "        z=self.linear(x)\n",
    "        y_pred = F.sigmoid(z)\n",
    "        return y_pred\n",
    "    def loss(y_pred,label):  # Binary Cross Entropy\n",
    "        criterion = torch.nn.BCELoss(size_average=True)\n",
    "        return criterion(y_pred,label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "nn.CrossEntropyLoss() applies log-softmax internally (and nn.BCEWithLogitsLoss applies the sigmoid), so you do not need to add it yourself"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
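A minimal standalone sketch (not a cell from the committed notebook) illustrating two of the claims above: tensors and NumPy arrays convert back and forth, and the losses that take logits (CrossEntropyLoss, BCEWithLogitsLoss) expect raw scores, so no extra sigmoid/softmax is needed:

import numpy as np
import torch
import torch.nn as nn

a = np.arange(3, dtype=np.float32)
t = torch.from_numpy(a)   # ndarray -> tensor (shares memory on CPU)
b = t.numpy()             # tensor -> ndarray

logits = torch.randn(4, 5)                     # raw scores for 4 samples, 5 classes
labels = torch.tensor([0, 2, 1, 4])
loss = nn.CrossEntropyLoss()(logits, labels)   # log-softmax is applied internally
print(loss.item())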
105
[finished]Assignment_2_word2vec/others/pytorch_train.py
Normal file
@@ -0,0 +1,105 @@
import random
import re

import torch
import torch.optim as optim
from tqdm import tqdm
from pytorch_word2vec_model import SkipGram

epochs = 50
negative_sampling = 4
window = 2
vocab_size = 1
embd_size = 300


def batch_data(x, batch_size=128):
    in_w = []
    out_w = []
    target = []
    for text in x:
        for i in range(window, len(text) - window):
            # ids that must not be drawn as negative samples
            word_set = {text[i], text[i - 2], text[i - 1], text[i + 1], text[i + 2]}
            in_w.append(text[i])
            in_w.append(text[i])
            in_w.append(text[i])
            in_w.append(text[i])

            out_w.append(text[i - 2])
            out_w.append(text[i - 1])
            out_w.append(text[i + 1])
            out_w.append(text[i + 2])

            target.append(1)
            target.append(1)
            target.append(1)
            target.append(1)
            # negative sampling
            count = 0
            while count < negative_sampling:
                rand_id = random.randint(0, vocab_size - 1)
                if rand_id not in word_set:
                    in_w.append(text[i])
                    out_w.append(rand_id)
                    target.append(0)
                    count += 1

            if len(out_w) >= batch_size:
                yield [in_w, out_w, target]
                in_w = []
                out_w = []
                target = []
    if out_w:
        yield [in_w, out_w, target]


def train(train_text_id, model, opt):
    model.train()  # enable dropout and batch normalization
    ave_loss = 0
    pbar = tqdm()
    cnt = 0
    for x_batch in batch_data(train_text_id):
        in_w, out_w, target = x_batch
        in_w_var = torch.tensor(in_w)
        out_w_var = torch.tensor(out_w)
        target_var = torch.tensor(target, dtype=torch.float)

        model.zero_grad()
        log_probs = model(in_w_var, out_w_var)
        loss = model.loss(log_probs, target_var)
        loss.backward()
        opt.step()
        ave_loss += loss.item()
        pbar.update(1)
        cnt += 1
        pbar.set_description('< loss: %.5f >' % (ave_loss / cnt))
    pbar.close()

text_id = []
vocab_dict = {}

with open(
        'D:\\project\\ml\\github\\cs224n-natural-language-processing-winter2019\\a1_intro_word_vectors\\a1\\corpus\\corpus.txt',
        encoding='utf-8') as fp:
    for line in fp:
        lines = re.sub("[^A-Za-z0-9']+", ' ', line).lower().split()
        line_id = []
        for s in lines:
            if not s:
                continue
            if s not in vocab_dict:
                vocab_dict[s] = len(vocab_dict)
            id = vocab_dict[s]
            line_id.append(id)
            if id == 11500:
                print(id, s)
        text_id.append(line_id)
vocab_size = len(vocab_dict)
print('vocab_size', vocab_size)
model = SkipGram(vocab_size, embd_size)

for epoch in range(epochs):
    print('epoch', epoch)
    opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=0.001, weight_decay=0)
    train(text_id, model, opt)
@@ -0,0 +1,40 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class SkipGram(nn.Module):
    def __init__(self, vocab_size, embd_size):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)

    def forward(self, focus, context):
        embed_focus = self.embeddings(focus)
        embed_ctx = self.embeddings(context)
        # score = torch.mm(embed_focus, torch.t(embed_ctx))
        score = torch.mul(embed_focus, embed_ctx).sum(dim=1)
        log_probs = score  # raw logits; BCEWithLogitsLoss applies the sigmoid

        return log_probs

    def loss(self, log_probs, target):
        loss_fn = nn.BCEWithLogitsLoss()
        # loss_fn = nn.NLLLoss()
        loss = loss_fn(log_probs, target)
        return loss


class CBOW(nn.Module):
    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(2 * context_size * embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        embedded = self.embeddings(inputs).view((1, -1))
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
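For reference, pairing the dot-product score above with nn.BCEWithLogitsLoss means each (focus, context) pair with label t in {0, 1} is trained with the standard skip-gram negative-sampling logistic loss (stated here as a reminder, not text from the commit):

$$ L = -\bigl[\, t \log \sigma(u_{ctx}^\top v_{focus}) + (1 - t) \log\bigl(1 - \sigma(u_{ctx}^\top v_{focus})\bigr) \bigr] $$

where $u_{ctx}$ and $v_{focus}$ are the two embedding lookups in forward() (here drawn from the same table).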
75
[finished]Assignment_2_word2vec/run.py
Normal file
@@ -0,0 +1,75 @@
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) /
     dimVectors, np.zeros((nWords, dimVectors))),
    axis=0)
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingLossAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords,:], wordVectors[nWords:,:]),
    axis=0)

visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "dumb",
    "annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
    "hail", "coffee", "tea"]

visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))

plt.savefig('word_vectors.png')
133
[finished]Assignment_2_word2vec/sgd.py
Normal file
@@ -0,0 +1,133 @@
#!/usr/bin/env python

# Save parameters every a few SGD iterations as fail-safe
SAVE_PARAMS_EVERY = 5000

import pickle
import glob
import random
import numpy as np
import os.path as op

def load_saved_params():
    """
    A helper function that loads previously saved parameters and resets
    iteration start.
    """
    st = 0
    for f in glob.glob("saved_params_*.npy"):
        iter = int(op.splitext(op.basename(f))[0].split("_")[2])
        if (iter > st):
            st = iter

    if st > 0:
        params_file = "saved_params_%d.npy" % st
        state_file = "saved_state_%d.pickle" % st
        params = np.load(params_file)
        with open(state_file, "rb") as f:
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None


def save_params(iter, params):
    params_file = "saved_params_%d.npy" % iter
    np.save(params_file, params)
    with open("saved_state_%d.pickle" % iter, "wb") as f:
        pickle.dump(random.getstate(), f)


def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10):
    """ Stochastic Gradient Descent

    Implement the stochastic gradient descent method in this function.

    Arguments:
    f -- the function to optimize, it should take a single
         argument and yield two outputs, a loss and the gradient
         with respect to the arguments
    x0 -- the initial point to start SGD from
    step -- the step size for SGD
    iterations -- total iterations to run SGD for
    postprocessing -- postprocessing function for the parameters
                      if necessary. In the case of word2vec we will need to
                      normalize the word vectors to have unit length.
    PRINT_EVERY -- specifies how many iterations to output loss

    Return:
    x -- the parameter value after SGD finishes
    """

    # Anneal learning rate every several iterations
    ANNEAL_EVERY = 20000

    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            step *= 0.5 ** (start_iter / ANNEAL_EVERY)

        if state:
            random.setstate(state)
    else:
        start_iter = 0

    x = x0

    if not postprocessing:
        postprocessing = lambda x: x

    exploss = None

    for iter in range(start_iter + 1, iterations + 1):
        # You might want to print the progress every few iterations.

        loss = None
        ### YOUR CODE HERE
        loss, gd = f(x)
        x = x - step * gd
        x = postprocessing(x)
        ### END YOUR CODE

        x = postprocessing(x)
        if iter % PRINT_EVERY == 0:
            if not exploss:
                exploss = loss
            else:
                exploss = .95 * exploss + .05 * loss
            print("iter %d: %f" % (iter, exploss))

        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
            save_params(iter, x)

        if iter % ANNEAL_EVERY == 0:
            step *= 0.5

    return x


def sanity_check():
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 1 result:", t1)
    assert abs(t1) <= 1e-6

    t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
    print("test 2 result:", t2)
    assert abs(t2) <= 1e-6

    t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 3 result:", t3)
    assert abs(t3) <= 1e-6

    print("-" * 40)
    print("ALL TESTS PASSED")
    print("-" * 40)


if __name__ == "__main__":
    sanity_check()
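Restating what the training loop above computes (a summary, not new behavior): each iteration applies the plain SGD update with a step size halved every ANNEAL_EVERY iterations, and the value printed every PRINT_EVERY iterations is an exponential moving average of the losses at those print points:

$$ x_{t+1} = x_t - \alpha_t \, \nabla J(x_t), \qquad \alpha_t = \alpha_0 \cdot 0.5^{\lfloor t / 20000 \rfloor}, \qquad \bar{J} \leftarrow 0.95\,\bar{J} + 0.05\,J_t $$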
BIN
[finished]Assignment_2_word2vec/utils/.DS_Store
vendored
Normal file
Binary file not shown.
0
[finished]Assignment_2_word2vec/utils/__init__.py
Normal file
47
[finished]Assignment_2_word2vec/utils/gradcheck.py
Normal file
@@ -0,0 +1,47 @@
#!/usr/bin/env python

import numpy as np
import random


# First implement a gradient checker by filling in the following functions
def gradcheck_naive(f, x, gradientText):
    """ Gradient check for a function f.
    Arguments:
    f -- a function that takes a single argument and outputs the
         loss and its gradients
    x -- the point (numpy array) to check the gradient at
    gradientText -- a string detailing some context about the gradient computation
    """

    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # Evaluate function value at original point
    h = 1e-4         # Do not change this!

    # Iterate over all indexes ix in x to check the gradient.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        x[ix] += h  # increment by h
        random.setstate(rndstate)
        fxh, _ = f(x)  # evaluate f(x + h)
        x[ix] -= 2 * h  # restore to previous value (very important!)
        random.setstate(rndstate)
        fxnh, _ = f(x)
        x[ix] += h
        numgrad = (fxh - fxnh) / 2 / h

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed for %s." % gradientText)
            print("First gradient error found at index %s in the vector of gradients" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (
                grad[ix], numgrad))
            return

        it.iternext()  # Step to next dimension

    print("Gradient check passed!")
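A minimal usage sketch of the checker above on a function with a known gradient (illustrative only; it assumes the script is run from the assignment root so the utils package is importable):

import numpy as np
from utils.gradcheck import gradcheck_naive

# f(x) = sum(x^2) has gradient 2x, so the centered difference
# (f(x+h) - f(x-h)) / (2h) computed by gradcheck_naive should match it.
quad = lambda x: (np.sum(x ** 2), 2 * x)
gradcheck_naive(quad, np.random.randn(4, 5), "sum-of-squares sanity test")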
248
[finished]Assignment_2_word2vec/utils/treebank.py
Normal file
@@ -0,0 +1,248 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pickle
import numpy as np
import os
import random

class StanfordSentiment:
    def __init__(self, path=None, tablesize = 1000000):
        if not path:
            path = "utils/datasets/stanfordSentimentTreebank"

        self.path = path
        self.tablesize = tablesize

    def tokens(self):
        if hasattr(self, "_tokens") and self._tokens:
            return self._tokens

        tokens = dict()
        tokenfreq = dict()
        wordcount = 0
        revtokens = []
        idx = 0

        for sentence in self.sentences():
            for w in sentence:
                wordcount += 1
                if not w in tokens:
                    tokens[w] = idx
                    revtokens += [w]
                    tokenfreq[w] = 1
                    idx += 1
                else:
                    tokenfreq[w] += 1

        tokens["UNK"] = idx
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1
        wordcount += 1

        self._tokens = tokens
        self._tokenfreq = tokenfreq
        self._wordcount = wordcount
        self._revtokens = revtokens
        return self._tokens

    def sentences(self):
        if hasattr(self, "_sentences") and self._sentences:
            return self._sentences

        sentences = []
        with open(self.path + "/datasetSentences.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split()[1:]
                # Deal with some peculiar encoding issues with this file
                sentences += [[w.lower() for w in splitted]]

        self._sentences = sentences
        self._sentlengths = np.array([len(s) for s in sentences])
        self._cumsentlen = np.cumsum(self._sentlengths)

        return self._sentences

    def numSentences(self):
        if hasattr(self, "_numSentences") and self._numSentences:
            return self._numSentences
        else:
            self._numSentences = len(self.sentences())
            return self._numSentences

    def allSentences(self):
        if hasattr(self, "_allsentences") and self._allsentences:
            return self._allsentences

        sentences = self.sentences()
        rejectProb = self.rejectProb()
        tokens = self.tokens()
        allsentences = [[w for w in s
                         if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
                        for s in sentences * 30]

        allsentences = [s for s in allsentences if len(s) > 1]

        self._allsentences = allsentences

        return self._allsentences

    def getRandomContext(self, C=5):
        allsent = self.allSentences()
        sentID = random.randint(0, len(allsent) - 1)
        sent = allsent[sentID]
        wordID = random.randint(0, len(sent) - 1)

        context = sent[max(0, wordID - C):wordID]
        if wordID+1 < len(sent):
            context += sent[wordID+1:min(len(sent), wordID + C + 1)]

        centerword = sent[wordID]
        context = [w for w in context if w != centerword]

        if len(context) > 0:
            return centerword, context
        else:
            return self.getRandomContext(C)

    def sent_labels(self):
        if hasattr(self, "_sent_labels") and self._sent_labels:
            return self._sent_labels

        dictionary = dict()
        phrases = 0
        with open(self.path + "/dictionary.txt", "r") as f:
            for line in f:
                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                dictionary[splitted[0].lower()] = int(splitted[1])
                phrases += 1

        labels = [0.0] * phrases
        with open(self.path + "/sentiment_labels.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                labels[int(splitted[0])] = float(splitted[1])

        sent_labels = [0.0] * self.numSentences()
        sentences = self.sentences()
        for i in range(self.numSentences()):
            sentence = sentences[i]
            full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
            sent_labels[i] = labels[dictionary[full_sent]]

        self._sent_labels = sent_labels
        return self._sent_labels

    def dataset_split(self):
        if hasattr(self, "_split") and self._split:
            return self._split

        split = [[] for i in range(3)]
        with open(self.path + "/datasetSplit.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split(",")
                split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]

        self._split = split
        return self._split

    def getRandomTrainSentence(self):
        split = self.dataset_split()
        sentId = split[0][random.randint(0, len(split[0]) - 1)]
        return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])

    def categorify(self, label):
        if label <= 0.2:
            return 0
        elif label <= 0.4:
            return 1
        elif label <= 0.6:
            return 2
        elif label <= 0.8:
            return 3
        else:
            return 4

    def getDevSentences(self):
        return self.getSplitSentences(2)

    def getTestSentences(self):
        return self.getSplitSentences(1)

    def getTrainSentences(self):
        return self.getSplitSentences(0)

    def getSplitSentences(self, split=0):
        ds_split = self.dataset_split()
        return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]

    def sampleTable(self):
        if hasattr(self, '_sampleTable') and self._sampleTable is not None:
            return self._sampleTable

        nTokens = len(self.tokens())
        samplingFreq = np.zeros((nTokens,))
        self.allSentences()
        i = 0
        for w in range(nTokens):
            w = self._revtokens[i]
            if w in self._tokenfreq:
                freq = 1.0 * self._tokenfreq[w]
                # Reweigh
                freq = freq ** 0.75
            else:
                freq = 0.0
            samplingFreq[i] = freq
            i += 1

        samplingFreq /= np.sum(samplingFreq)
        samplingFreq = np.cumsum(samplingFreq) * self.tablesize

        self._sampleTable = [0] * self.tablesize

        j = 0
        for i in range(self.tablesize):
            while i > samplingFreq[j]:
                j += 1
            self._sampleTable[i] = j

        return self._sampleTable

    def rejectProb(self):
        if hasattr(self, '_rejectProb') and self._rejectProb is not None:
            return self._rejectProb

        threshold = 1e-5 * self._wordcount

        nTokens = len(self.tokens())
        rejectProb = np.zeros((nTokens,))
        for i in range(nTokens):
            w = self._revtokens[i]
            freq = 1.0 * self._tokenfreq[w]
            # Reweigh
            rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))

        self._rejectProb = rejectProb
        return self._rejectProb

    def sampleTokenIdx(self):
        return self.sampleTable()[random.randint(0, self.tablesize - 1)]
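As a reminder of what rejectProb() and sampleTable() above implement (the standard word2vec subsampling and negative-sampling distributions, stated here for reference), with t = 1e-5 * wordcount and f(w) the corpus count of word w:

$$ P_{reject}(w) = \max\Bigl(0,\; 1 - \sqrt{t / f(w)}\Bigr), \qquad P_{neg}(w) \propto f(w)^{0.75} $$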
43
[finished]Assignment_2_word2vec/utils/utils.py
Normal file
@@ -0,0 +1,43 @@
#!/usr/bin/env python

import numpy as np

def normalizeRows(x):
    """ Row normalization function

    Implement a function that normalizes each row of a matrix to have
    unit length.
    """
    N = x.shape[0]
    x /= np.sqrt(np.sum(x**2, axis=1)).reshape((N,1)) + 1e-30
    return x

def softmax(x):
    """Compute the softmax function for each row of the input x.
    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.

    Arguments:
    x -- A D dimensional vector or N x D dimensional numpy matrix.
    Return:
    x -- You are allowed to modify x in-place
    """
    orig_shape = x.shape

    if len(x.shape) > 1:
        # Matrix
        tmp = np.max(x, axis=1)
        x -= tmp.reshape((x.shape[0], 1))
        x = np.exp(x)
        tmp = np.sum(x, axis=1)
        x /= tmp.reshape((x.shape[0], 1))
    else:
        # Vector
        tmp = np.max(x)
        x -= tmp
        x = np.exp(x)
        tmp = np.sum(x)
        x /= tmp

    assert x.shape == orig_shape
    return x
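A small check of the max-subtraction trick used in softmax above (an illustrative sketch; softmax is invariant to adding a constant to every input, so subtracting the row max changes nothing but avoids overflow):

import numpy as np
from utils.utils import softmax

print(softmax(np.array([1.0, 2.0])))        # ~[0.2689, 0.7311]
print(softmax(np.array([1001.0, 1002.0])))  # same values, no overflow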
301
[finished]Assignment_2_word2vec/word2vec.py
Normal file
@@ -0,0 +1,301 @@
#!/usr/bin/env python

import numpy as np
import random

from utils.gradcheck import gradcheck_naive
from utils.utils import normalizeRows, softmax


def sigmoid(x):
    """
    Compute the sigmoid function for the input here.
    Arguments:
    x -- A scalar or numpy array.
    Return:
    s -- sigmoid(x)
    """

    ### YOUR CODE HERE
    s = 1 / (1 + np.exp(-x))
    ### END YOUR CODE

    return s


def naiveSoftmaxLossAndGradient(
    centerWordVec,
    outsideWordIdx,
    outsideVectors,
    dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Implement the naive softmax loss and gradients between a center word's
    embedding and an outside word's embedding. This will be the building block
    for our word2vec models.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding
                     (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word
                      (o of u_o in the pdf handout)
    outsideVectors -- outside vectors (rows of matrix) for all words in vocab
                      (U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector
                     (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors
                       (dJ / dU)
    """

    ### YOUR CODE HERE
    score = np.dot(outsideVectors, centerWordVec)
    y_h = softmax(score)
    loss = -np.log(y_h[outsideWordIdx])
    y = np.eye(y_h.shape[0])[outsideWordIdx]
    diff = (y_h - y).reshape((y.shape[0], 1))

    gradCenterVec = np.dot(diff.T, outsideVectors)
    centerWordVec = centerWordVec.reshape((1, centerWordVec.shape[0]))
    gradOutsideVecs = np.dot(diff, centerWordVec)

    ### END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs


def getNegativeSamples(outsideWordIdx, dataset, K):
    """ Samples K indexes which are not the outsideWordIdx """

    negSampleWordIndices = [None] * K
    for k in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == outsideWordIdx:
            newidx = dataset.sampleTokenIdx()
        negSampleWordIndices[k] = newidx
    return negSampleWordIndices


def negSamplingLossAndGradient(
    centerWordVec,
    outsideWordIdx,
    outsideVectors,
    dataset,
    K=10
):
    """ Negative sampling loss function for word2vec models

    Implement the negative sampling loss and gradients for a centerWordVec
    and a outsideWordIdx word vector as a building block for word2vec
    models. K is the number of negative samples to take.

    Note: The same word may be negatively sampled multiple times. For
    example if an outside word is sampled twice, you shall have to
    double count the gradient with respect to this word. Thrice if
    it was sampled three times, and so forth.

    Arguments/Return Specifications: same as naiveSoftmaxLossAndGradient
    """

    # Negative sampling of words is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)
    indices = [outsideWordIdx] + negSampleWordIndices


    ### YOUR CODE HERE

    score = np.dot(outsideVectors[outsideWordIdx], centerWordVec)
    sig_1 = sigmoid(score)

    sum_neg = 0.0

    # Find unique negative samples and the number of times they are present in our sample window
    unique_k, counts_k = np.unique(indices[1:], return_counts=True)
    k_stack = outsideVectors[unique_k]

    score_neg = -np.dot(k_stack, centerWordVec)
    sig_neg = sigmoid(score_neg)
    sum_neg = np.sum(counts_k * np.log(sig_neg), axis=0)

    # J_neg_sam loss
    loss = -np.log(sig_1) - sum_neg

    # Calculate gradients
    k_term = 0.0
    # delta term from previous layer for efficient implementation
    delta_1msig = 1 - sig_1
    delta_1msig_neg = 1 - sig_neg

    gradOutsideVecs = np.zeros((outsideVectors.shape))
    gradOutsideVecs[outsideWordIdx, :] = -delta_1msig * centerWordVec
    common_term = np.dot(delta_1msig_neg.reshape(unique_k.shape[0], 1), centerWordVec.reshape(1, centerWordVec.shape[0]))
    gradOutsideVecs[unique_k, :] += counts_k.reshape(counts_k.shape[0], 1) * common_term

    # Reshape prep for center gradient calculation
    counts_k = counts_k.reshape(counts_k.shape[0], 1)
    delta_1msig_neg = delta_1msig_neg.reshape(delta_1msig_neg.shape[0], 1)
    k_term = np.sum(np.dot((delta_1msig_neg.reshape(1, counts_k.shape[0])), counts_k * k_stack), axis=0)
    gradCenterVec = -delta_1msig * outsideVectors[outsideWordIdx] + k_term

    ### END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs


def skipgram(currentCenterWord, windowSize, outsideWords, word2Ind,
             centerWordVectors, outsideVectors, dataset,
             word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    """ Skip-gram model in word2vec

    Implement the skip-gram model in this function.

    Arguments:
    currentCenterWord -- a string of the current center word
    windowSize -- integer, context window size
    outsideWords -- list of no more than 2*windowSize strings, the outside words
    word2Ind -- a dictionary that maps words to their indices in
                the word vector list
    centerWordVectors -- center word vectors (as rows) for all words in vocab
                         (V in pdf handout)
    outsideVectors -- outside word vectors (as rows) for all words in vocab
                      (U in pdf handout)
    word2vecLossAndGradient -- the loss and gradient function for
                               a prediction vector given the outsideWordIdx
                               word vectors, could be one of the two
                               loss functions you implemented above.

    Return:
    loss -- the loss function value for the skip-gram model
            (J in the pdf handout)
    gradCenterVecs -- the gradient with respect to the center word vectors
                      (dJ / dV in the pdf handout)
    gradOutsideVectors -- the gradient with respect to the outside word vectors
                          (dJ / dU in the pdf handout)
    """

    loss = 0.0
    gradCenterVecs = np.zeros(centerWordVectors.shape)
    gradOutsideVectors = np.zeros(outsideVectors.shape)

    ### YOUR CODE HERE
    for m in range(0, len(outsideWords)):
        l, gradCenter, gradOutside = word2vecLossAndGradient(
            centerWordVectors[word2Ind[currentCenterWord]],
            word2Ind[outsideWords[m]], outsideVectors, dataset)
        loss += l
        gradCenterVecs[word2Ind[currentCenterWord]] += gradCenter.reshape((centerWordVectors.shape[1],))
        gradOutsideVectors += gradOutside
    ### END YOUR CODE

    return loss, gradCenterVecs, gradOutsideVectors


#############################################
# Testing functions below. DO NOT MODIFY!   #
#############################################

def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset,
                         windowSize,
                         word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    batchsize = 50
    loss = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    centerWordVectors = wordVectors[:int(N/2),:]
    outsideVectors = wordVectors[int(N/2):,:]
    for i in range(batchsize):
        windowSize1 = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(windowSize1)

        c, gin, gout = word2vecModel(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
            outsideVectors, dataset, word2vecLossAndGradient
        )
        loss += c / batchsize
        grad[:int(N/2), :] += gin / batchsize
        grad[int(N/2):, :] += gout / batchsize

    return loss, grad


def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    dataset = type('dummy', (), {})()
    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0,4)], \
               [tokens[random.randint(0,4)] for i in range(2*C)]
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10,3))
    dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)])

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, negSamplingLossAndGradient),
        dummy_vectors, "negSamplingLossAndGradient Gradient")

    print("\n=== Results ===")
    print ("Skip-Gram with naiveSoftmaxLossAndGradient")

    print ("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
            *skipgram("c", 3, ["a", "b", "e", "d", "b", "c"],
                      dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)
        )
    )

    print ("Expected Result: Value should approximate these:")
    print("""Loss: 11.16610900153398
Gradient wrt Center Vectors (dJ/dV):
 [[ 0. 0. 0. ]
 [ 0. 0. 0. ]
 [-1.26947339 -1.36873189 2.45158957]
 [ 0. 0. 0. ]
 [ 0. 0. 0. ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.41045956 0.18834851 1.43272264]
 [ 0.38202831 -0.17530219 -1.33348241]
 [ 0.07009355 -0.03216399 -0.24466386]
 [ 0.09472154 -0.04346509 -0.33062865]
 [-0.13638384 0.06258276 0.47605228]]
    """)

    print ("Skip-Gram with negSamplingLossAndGradient")
    print ("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\n Gradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        *skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:],
            dummy_vectors[5:,:], dataset, negSamplingLossAndGradient)
        )
    )
    print ("Expected Result: Value should approximate these:")
    print("""Loss: 16.15119285363322
Gradient wrt Center Vectors (dJ/dV):
 [[ 0. 0. 0. ]
 [ 0. 0. 0. ]
 [-4.54650789 -1.85942252 0.76397441]
 [ 0. 0. 0. ]
 [ 0. 0. 0. ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.69148188 0.31730185 2.41364029]
 [-0.22716495 0.10423969 0.79292674]
 [-0.45528438 0.20891737 1.58918512]
 [-0.31602611 0.14501561 1.10309954]
 [-0.80620296 0.36994417 2.81407799]]
    """)


if __name__ == "__main__":
    test_word2vec()
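For reference, the two loss functions implemented above compute the standard word2vec objectives (restated here; v_c is the center vector, U the outside-vector matrix with rows u_w, o the outside word, y-hat the softmax output, and K the multiset of sampled negatives):

$$ J_{naive}(v_c, o, U) = -\log \frac{\exp(u_o^\top v_c)}{\sum_{w} \exp(u_w^\top v_c)}, \qquad \frac{\partial J}{\partial v_c} = U^\top(\hat{y} - y), \qquad \frac{\partial J}{\partial U} = (\hat{y} - y)\, v_c^\top $$

$$ J_{neg}(v_c, o, U) = -\log \sigma(u_o^\top v_c) - \sum_{k \in K} \log \sigma(-u_k^\top v_c) $$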