a2
BIN
[finished]Assignment_2_word2vec/a2 written.pdf
Normal file
Binary file not shown.
2
[finished]Assignment_2_word2vec/collect_submission.sh
Normal file
@@ -0,0 +1,2 @@
rm -f assignment2.zip
zip -r assignment2.zip *.py *.png saved_params_40000.npy
15
[finished]Assignment_2_word2vec/get_datasets.sh
Normal file
@@ -0,0 +1,15 @@
#!/bin/bash

DATASETS_DIR="utils/datasets"
mkdir -p $DATASETS_DIR

cd $DATASETS_DIR

# Get Stanford Sentiment Treebank
if hash wget 2>/dev/null; then
  wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
else
  curl -L http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip -o stanfordSentimentTreebank.zip
fi
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip
461
[finished]Assignment_2_word2vec/others/pytorch review1.ipynb
Normal file
@@ -0,0 +1,461 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 为什么要学习pytorch\n",
|
||||
"\n",
|
||||
"tensorflow的学习曲线陡峭\n",
|
||||
"\n",
|
||||
"pytorch出自facebook\n",
|
||||
"\n",
|
||||
"PyTorch 可以当做 NumPy 用\n",
|
||||
"\n",
|
||||
"静态图 vs 动态图\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"最新版本全部使用tensor就可以\n",
|
||||
"\n",
|
||||
"numpy能实现的东西,在pytorch基本可以实现\n",
|
||||
"\n",
|
||||
"torch.randn()=numpy.random.randn()\n",
|
||||
"\n",
|
||||
"torch.max()=np.max()\n",
|
||||
"\n",
|
||||
"torch.zeros()=np.zeros()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"如果不能实现,Tensor与numpy之间的可以相互转化\n",
|
||||
"\n",
|
||||
"tensor.numpy()\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1.1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"print(torch.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"device(type='cpu')"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
||||
"device"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([0., 1., 2., 3., 4., 5., 6.])"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# python struct to torch tensor\n",
|
||||
"temp=[0, 1, 2, 3, 4, 5, 6]\n",
|
||||
"x = torch.tensor(temp, dtype=torch.float, device=device)\n",
|
||||
"x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#gpu/cpu\n",
|
||||
"x = torch.tensor(temp).cuda()\n",
|
||||
"x = x.cpu()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x=x.long()\n",
|
||||
"x=x.float()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## numpy reshape squeeze expand_dims"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[0 1 2 3 4 5 6 7 8 9]\n",
|
||||
"(10,)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = np.arange(10)\n",
|
||||
"print(a)\n",
|
||||
"print(a.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[[0]\n",
|
||||
" [1]]\n",
|
||||
"\n",
|
||||
" [[2]\n",
|
||||
" [3]]\n",
|
||||
"\n",
|
||||
" [[4]\n",
|
||||
" [5]]\n",
|
||||
"\n",
|
||||
" [[6]\n",
|
||||
" [7]]\n",
|
||||
"\n",
|
||||
" [[8]\n",
|
||||
" [9]]]\n",
|
||||
"(5, 2, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#a=a.reshape(1,-1)\n",
|
||||
"#a.reshape(1,10)\n",
|
||||
"a=a.reshape(5,2,1)\n",
|
||||
"print(a)\n",
|
||||
"print(a.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[0 1]\n",
|
||||
" [2 3]\n",
|
||||
" [4 5]\n",
|
||||
" [6 7]\n",
|
||||
" [8 9]]\n",
|
||||
"(5, 2)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"b = np.squeeze(a)\n",
|
||||
"print(b)\n",
|
||||
"print(b.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(5, 1, 2)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = np.arange(10)\n",
|
||||
"a=a.reshape(5,1,2)\n",
|
||||
"print(a.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[0 1]\n",
|
||||
" [2 3]\n",
|
||||
" [4 5]\n",
|
||||
" [6 7]\n",
|
||||
" [8 9]]\n",
|
||||
"(5, 2)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"b = np.squeeze(a)#b=a.reshape(5,2)\n",
|
||||
"print(b)\n",
|
||||
"print(b.shape)\n",
|
||||
"#np.squeeze(e,axis = 0,1,2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"a (10,)\n",
|
||||
"b axis=0 (1, 10)\n",
|
||||
"c axis=1 (10, 1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#expand_dims\n",
|
||||
"a = np.arange(10)\n",
|
||||
"print(\"a\",a.shape)\n",
|
||||
"b = np.expand_dims(a, axis=0)\n",
|
||||
"print(\"b axis=0\",b.shape)\n",
|
||||
"c = np.expand_dims(a, axis=1)\n",
|
||||
"print(\"c axis=1\",c.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## pytorch reshape squeeze unsqueeze"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"x tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n",
|
||||
"x torch.Size([10])\n",
|
||||
"b axis=0 torch.Size([1, 10])\n",
|
||||
"c axis=1 torch.Size([10, 1])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x = torch.arange(0,10)\n",
|
||||
"print(\"x\",x)\n",
|
||||
"print(\"x\",x.shape)\n",
|
||||
"b = x.unsqueeze(0)\n",
|
||||
"print(\"b axis=0\",b.shape)\n",
|
||||
"c = torch.unsqueeze(x,1)\n",
|
||||
"print(\"c axis=1\",c.shape)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"d torch.Size([10])\n",
|
||||
"d torch.Size([10])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"d=c.squeeze(1)\n",
|
||||
"print(\"d\",d.shape)\n",
|
||||
"d=c.squeeze()\n",
|
||||
"print(\"d\",d.shape)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"d torch.Size([10])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"d=b.squeeze()\n",
|
||||
"print(\"d\",d.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([8])\n",
|
||||
"torch.Size([8])\n",
|
||||
"tensor(0.4091, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch.nn.functional as F\n",
|
||||
"m = nn.Sigmoid()\n",
|
||||
"\n",
|
||||
"loss = nn.BCEWithLogitsLoss()\n",
|
||||
"temp=[0, 1, 2,0, 1, 2,0, 1]\n",
|
||||
"input =torch.tensor(temp, dtype=torch.float, requires_grad=True)\n",
|
||||
"#input = torch.randn(3, requires_grad=True)\n",
|
||||
"target = torch.tensor([0,1,1,0,1,1,0,1], dtype=torch.float)\n",
|
||||
"lossinput = input\n",
|
||||
"output = loss(lossinput, target)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(lossinput.shape)\n",
|
||||
"\n",
|
||||
"print(target.shape)\n",
|
||||
"\n",
|
||||
"print(output)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## logist regression\n",
|
||||
"\n",
|
||||
"$$y(z) = \\frac{1}{1 + exp(-\\theta^T z)}$$\n",
|
||||
"\n",
|
||||
"nn.BCELoss\n",
|
||||
"$$\n",
|
||||
" \\ell(x, y) = L = \\{l_1,\\dots,l_N\\}^\\top, \\quad\n",
|
||||
" l_n = - w_n \\left[ y_n \\cdot \\log x_n + (1 - y_n) \\cdot \\log (1 - x_n) \\right],\n",
|
||||
" $$ \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch.nn.functional as F # 激励函数库\n",
|
||||
"\n",
|
||||
"feature_num=100\n",
|
||||
"\n",
|
||||
"class LogistRegression(torch.nn.Module):\n",
|
||||
" def __init__(self):\n",
|
||||
" super(LogistRegression, self).__init__()\n",
|
||||
" self.linear = torch.nn.Linear(feature_num, 1)\n",
|
||||
" def forward(self, x):\n",
|
||||
" z=self.linear(x)\n",
|
||||
" y_pred = F.sigmoid(z)\n",
|
||||
" return y_pred\n",
|
||||
" def loss(y_pred,label):#Binary Cross Entropy\n",
|
||||
" criterion = torch.nn.BCELoss(size_average=True)\n",
|
||||
" return criterion(y_pred,label)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"nn.CrossEntropyLoss() 自带sigmod,不需要加"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
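The notebook above asserts that squeeze/unsqueeze mirror their NumPy counterparts and that BCEWithLogitsLoss folds the sigmoid into the loss. A minimal standalone sketch checking both claims; it only assumes NumPy and PyTorch are installed and is not part of the committed notebook:

import numpy as np
import torch
import torch.nn as nn

# squeeze / unsqueeze mirror np.squeeze / np.expand_dims
a = np.arange(10)
x = torch.from_numpy(a)                       # shares memory with the NumPy array
assert np.expand_dims(a, axis=0).shape == tuple(x.unsqueeze(0).shape) == (1, 10)
assert np.squeeze(np.expand_dims(a, axis=1)).shape == tuple(x.unsqueeze(1).squeeze().shape)

# BCEWithLogitsLoss(logits) equals BCELoss applied to sigmoid(logits)
logits = torch.tensor([0., 1., 2., 0., 1., 2., 0., 1.], requires_grad=True)
target = torch.tensor([0., 1., 1., 0., 1., 1., 0., 1.])
with_logits = nn.BCEWithLogitsLoss()(logits, target)
plain_bce = nn.BCELoss()(torch.sigmoid(logits), target)
assert torch.allclose(with_logits, plain_bce)
print(with_logits.item())                     # ~0.4091, matching the notebook output above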
105
[finished]Assignment_2_word2vec/others/pytorch_train.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import random
|
||||
import re
|
||||
|
||||
import torch
|
||||
import torch.optim as optim
|
||||
from tqdm import tqdm
|
||||
from pytorch_word2vec_model import SkipGram
|
||||
|
||||
epochs = 50
|
||||
negative_sampling = 4
|
||||
window = 2
|
||||
vocab_size = 1
|
||||
embd_size = 300
|
||||
|
||||
|
||||
def batch_data(x, batch_size=128):
|
||||
in_w = []
|
||||
out_w = []
|
||||
target = []
|
||||
for text in x:
|
||||
for i in range(window, len(text) - window):
|
||||
# ids seen in this window, so negative sampling can avoid them
word_set = {text[i], text[i - 2], text[i - 1], text[i + 1], text[i + 2]}
|
||||
in_w.append(text[i])
|
||||
in_w.append(text[i])
|
||||
in_w.append(text[i])
|
||||
in_w.append(text[i])
|
||||
|
||||
out_w.append(text[i - 2])
|
||||
out_w.append(text[i - 1])
|
||||
out_w.append(text[i + 1])
|
||||
out_w.append(text[i + 2])
|
||||
|
||||
target.append(1)
|
||||
target.append(1)
|
||||
target.append(1)
|
||||
target.append(1)
|
||||
# negative sampling
|
||||
count = 0
|
||||
while count < negative_sampling:
|
||||
rand_id = random.randint(0, vocab_size-1)
|
||||
if not rand_id in word_set:
|
||||
in_w.append(text[i])
|
||||
out_w.append(rand_id)
|
||||
target.append(0)
|
||||
count += 1
|
||||
|
||||
if len(out_w) >= batch_size:
|
||||
yield [in_w, out_w, target]
|
||||
in_w = []
|
||||
out_w = []
|
||||
target = []
|
||||
if out_w:
|
||||
yield [in_w, out_w, target]
|
||||
|
||||
|
||||
def train(train_text_id, model,opt):
|
||||
model.train()  # enable dropout and batch normalization (training mode)
|
||||
ave_loss = 0
|
||||
pbar = tqdm()
|
||||
cnt=0
|
||||
for x_batch in batch_data(train_text_id):
|
||||
in_w, out_w, target = x_batch
|
||||
in_w_var = torch.tensor(in_w)
|
||||
out_w_var = torch.tensor(out_w)
|
||||
target_var = torch.tensor(target,dtype=torch.float)
|
||||
|
||||
model.zero_grad()
|
||||
log_probs = model(in_w_var, out_w_var)
|
||||
loss = model.loss(log_probs, target_var)
|
||||
loss.backward()
|
||||
opt.step()
|
||||
ave_loss += loss.item()
|
||||
pbar.update(1)
|
||||
cnt += 1
|
||||
pbar.set_description('< loss: %.5f >' % (ave_loss / cnt))
|
||||
pbar.close()
|
||||
text_id = []
|
||||
vocab_dict = {}
|
||||
|
||||
with open(
|
||||
'D:\\project\\ml\\github\\cs224n-natural-language-processing-winter2019\\a1_intro_word_vectors\\a1\\corpus\\corpus.txt',
|
||||
encoding='utf-8') as fp:
|
||||
for line in fp:
|
||||
lines = re.sub("[^A-Za-z0-9']+", ' ', line).lower().split()
|
||||
line_id = []
|
||||
for s in lines:
|
||||
if not s:
|
||||
continue
|
||||
if s not in vocab_dict:
|
||||
vocab_dict[s] = len(vocab_dict)
|
||||
id = vocab_dict[s]
|
||||
line_id.append(id)
|
||||
if id==11500:
|
||||
print(id,s)
|
||||
text_id.append(line_id)
|
||||
vocab_size = len(vocab_dict)
|
||||
print('vocab_size', vocab_size)
|
||||
model = SkipGram(vocab_size, embd_size)
|
||||
|
||||
for epoch in range(epochs):
|
||||
print('epoch', epoch)
|
||||
opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
|
||||
lr=0.001, weight_decay=0)
|
||||
train(text_id, model,opt)
|
||||
|
||||
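The batch_data generator above emits four positive (center, context) pairs per window plus negative_sampling random negatives. A self-contained toy sketch of that sampling logic, using made-up token ids instead of the corpus-derived ids in the script:

import random

window = 2
negative_sampling = 4
vocab_size = 10
text = list(range(10))                        # made-up token ids for one sentence

pairs = []
for i in range(window, len(text) - window):
    context = [text[i - 2], text[i - 1], text[i + 1], text[i + 2]]
    window_ids = set(context) | {text[i]}
    for c in context:                         # four positive pairs per window
        pairs.append((text[i], c, 1))
    drawn = 0
    while drawn < negative_sampling:          # negatives must fall outside the window
        rand_id = random.randint(0, vocab_size - 1)
        if rand_id not in window_ids:
            pairs.append((text[i], rand_id, 0))
            drawn += 1

print(pairs[:8])                              # (2, 0, 1), (2, 1, 1), (2, 3, 1), (2, 4, 1), then 4 negatives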
40
[finished]Assignment_2_word2vec/others/pytorch_word2vec_model.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class SkipGram(nn.Module):
|
||||
def __init__(self, vocab_size, embd_size):
|
||||
super(SkipGram, self).__init__()
|
||||
self.embeddings = nn.Embedding(vocab_size, embd_size)
|
||||
|
||||
def forward(self, focus, context):
|
||||
embed_focus = self.embeddings(focus)
|
||||
embed_ctx = self.embeddings(context)
|
||||
# score = torch.mm(embed_focus, torch.t(embed_ctx))
|
||||
score = torch.mul(embed_focus, embed_ctx).sum(dim=1)
|
||||
log_probs = score  # raw logits; BCEWithLogitsLoss applies the sigmoid itself
|
||||
|
||||
return log_probs
|
||||
|
||||
def loss(self, log_probs, target):
|
||||
loss_fn = nn.BCEWithLogitsLoss()
|
||||
# loss_fn = nn.NLLLoss()
|
||||
loss = loss_fn(log_probs, target)
|
||||
return loss
|
||||
|
||||
|
||||
class CBOW(nn.Module):
|
||||
def __init__(self, vocab_size, embd_size, context_size, hidden_size):
|
||||
super(CBOW, self).__init__()
|
||||
self.embeddings = nn.Embedding(vocab_size, embd_size)
|
||||
self.linear1 = nn.Linear(2 * context_size * embd_size, hidden_size)
|
||||
self.linear2 = nn.Linear(hidden_size, vocab_size)
|
||||
|
||||
def forward(self, inputs):
|
||||
embedded = self.embeddings(inputs).view((1, -1))
|
||||
hid = F.relu(self.linear1(embedded))
|
||||
out = self.linear2(hid)
|
||||
log_probs = F.log_softmax(out, dim=1)
|
||||
return log_probs
|
||||
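A short usage sketch for the SkipGram module defined above, on random ids; it assumes the file is saved as pytorch_word2vec_model.py, which is what the import in pytorch_train.py suggests:

import torch
from pytorch_word2vec_model import SkipGram   # filename assumed from the import in pytorch_train.py

vocab_size, embd_size = 100, 16
model = SkipGram(vocab_size, embd_size)

focus   = torch.randint(0, vocab_size, (8,))  # center word ids
context = torch.randint(0, vocab_size, (8,))  # context or negative-sample ids
target  = torch.randint(0, 2, (8,)).float()   # 1 = observed pair, 0 = negative

logits = model(focus, context)                # dot product of the two embeddings, shape (8,)
loss = model.loss(logits, target)             # BCEWithLogitsLoss applies the sigmoid itself
loss.backward()
print(loss.item())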
75
[finished]Assignment_2_word2vec/run.py
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
from utils.treebank import StanfordSentiment
|
||||
import matplotlib
|
||||
matplotlib.use('agg')
|
||||
import matplotlib.pyplot as plt
|
||||
import time
|
||||
|
||||
from word2vec import *
|
||||
from sgd import *
|
||||
|
||||
# Check Python Version
|
||||
import sys
|
||||
assert sys.version_info[0] == 3
|
||||
assert sys.version_info[1] >= 5
|
||||
|
||||
# Reset the random seed to make sure that everyone gets the same results
|
||||
random.seed(314)
|
||||
dataset = StanfordSentiment()
|
||||
tokens = dataset.tokens()
|
||||
nWords = len(tokens)
|
||||
|
||||
# We are going to train 10-dimensional vectors for this assignment
|
||||
dimVectors = 10
|
||||
|
||||
# Context size
|
||||
C = 5
|
||||
|
||||
# Reset the random seed to make sure that everyone gets the same results
|
||||
random.seed(31415)
|
||||
np.random.seed(9265)
|
||||
|
||||
startTime=time.time()
|
||||
wordVectors = np.concatenate(
|
||||
((np.random.rand(nWords, dimVectors) - 0.5) /
|
||||
dimVectors, np.zeros((nWords, dimVectors))),
|
||||
axis=0)
|
||||
wordVectors = sgd(
|
||||
lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
|
||||
negSamplingLossAndGradient),
|
||||
wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
|
||||
# Note that normalization is not called here. This is not a bug,
|
||||
# normalizing during training loses the notion of length.
|
||||
|
||||
print("sanity check: cost at convergence should be around or below 10")
|
||||
print("training took %d seconds" % (time.time() - startTime))
|
||||
|
||||
# concatenate the input and output word vectors
|
||||
wordVectors = np.concatenate(
|
||||
(wordVectors[:nWords,:], wordVectors[nWords:,:]),
|
||||
axis=0)
|
||||
|
||||
visualizeWords = [
|
||||
"great", "cool", "brilliant", "wonderful", "well", "amazing",
|
||||
"worth", "sweet", "enjoyable", "boring", "bad", "dumb",
|
||||
"annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
|
||||
"hail", "coffee", "tea"]
|
||||
|
||||
visualizeIdx = [tokens[word] for word in visualizeWords]
|
||||
visualizeVecs = wordVectors[visualizeIdx, :]
|
||||
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
|
||||
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
|
||||
U,S,V = np.linalg.svd(covariance)
|
||||
coord = temp.dot(U[:,0:2])
|
||||
|
||||
for i in range(len(visualizeWords)):
|
||||
plt.text(coord[i,0], coord[i,1], visualizeWords[i],
|
||||
bbox=dict(facecolor='green', alpha=0.1))
|
||||
|
||||
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
|
||||
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
|
||||
|
||||
plt.savefig('word_vectors.png')
|
||||
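run.py ends by centering the selected word vectors, taking the SVD of their covariance, and projecting onto the top two directions before plotting. The same projection step in isolation, sketched on random vectors rather than trained ones:

import numpy as np

vecs = np.random.randn(24, 20)                # 24 "words", 20-dimensional vectors
temp = vecs - np.mean(vecs, axis=0)           # center the vectors
covariance = temp.T.dot(temp) / len(vecs)     # 20 x 20 covariance matrix
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])                   # project onto the top two directions
print(coord.shape)                            # (24, 2), ready for plt.text / plt.xlim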
133
[finished]Assignment_2_word2vec/sgd.py
Normal file
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Save parameters every a few SGD iterations as fail-safe
|
||||
SAVE_PARAMS_EVERY = 5000
|
||||
|
||||
import pickle
|
||||
import glob
|
||||
import random
|
||||
import numpy as np
|
||||
import os.path as op
|
||||
|
||||
def load_saved_params():
|
||||
"""
|
||||
A helper function that loads previously saved parameters and resets
|
||||
iteration start.
|
||||
"""
|
||||
st = 0
|
||||
for f in glob.glob("saved_params_*.npy"):
|
||||
iter = int(op.splitext(op.basename(f))[0].split("_")[2])
|
||||
if (iter > st):
|
||||
st = iter
|
||||
|
||||
if st > 0:
|
||||
params_file = "saved_params_%d.npy" % st
|
||||
state_file = "saved_state_%d.pickle" % st
|
||||
params = np.load(params_file)
|
||||
with open(state_file, "rb") as f:
|
||||
state = pickle.load(f)
|
||||
return st, params, state
|
||||
else:
|
||||
return st, None, None
|
||||
|
||||
|
||||
def save_params(iter, params):
|
||||
params_file = "saved_params_%d.npy" % iter
|
||||
np.save(params_file, params)
|
||||
with open("saved_state_%d.pickle" % iter, "wb") as f:
|
||||
pickle.dump(random.getstate(), f)
|
||||
|
||||
|
||||
def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
|
||||
PRINT_EVERY=10):
|
||||
""" Stochastic Gradient Descent
|
||||
|
||||
Implement the stochastic gradient descent method in this function.
|
||||
|
||||
Arguments:
|
||||
f -- the function to optimize, it should take a single
|
||||
argument and yield two outputs, a loss and the gradient
|
||||
with respect to the arguments
|
||||
x0 -- the initial point to start SGD from
|
||||
step -- the step size for SGD
|
||||
iterations -- total iterations to run SGD for
|
||||
postprocessing -- postprocessing function for the parameters
|
||||
if necessary. In the case of word2vec we will need to
|
||||
normalize the word vectors to have unit length.
|
||||
PRINT_EVERY -- specifies how often (in iterations) to print the loss
|
||||
|
||||
Return:
|
||||
x -- the parameter value after SGD finishes
|
||||
"""
|
||||
|
||||
# Anneal learning rate every several iterations
|
||||
ANNEAL_EVERY = 20000
|
||||
|
||||
if useSaved:
|
||||
start_iter, oldx, state = load_saved_params()
|
||||
if start_iter > 0:
|
||||
x0 = oldx
|
||||
step *= 0.5 ** (start_iter / ANNEAL_EVERY)
|
||||
|
||||
if state:
|
||||
random.setstate(state)
|
||||
else:
|
||||
start_iter = 0
|
||||
|
||||
x = x0
|
||||
|
||||
if not postprocessing:
|
||||
postprocessing = lambda x: x
|
||||
|
||||
exploss = None
|
||||
|
||||
for iter in range(start_iter + 1, iterations + 1):
|
||||
# You might want to print the progress every few iterations.
|
||||
|
||||
loss = None
|
||||
### YOUR CODE HERE
|
||||
loss,gd = f(x)
|
||||
x = x - step*gd
|
||||
x = postprocessing(x)
|
||||
### END YOUR CODE
|
||||
|
||||
x = postprocessing(x)
|
||||
if iter % PRINT_EVERY == 0:
|
||||
if not exploss:
|
||||
exploss = loss
|
||||
else:
|
||||
exploss = .95 * exploss + .05 * loss
|
||||
print("iter %d: %f" % (iter, exploss))
|
||||
|
||||
if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
|
||||
save_params(iter, x)
|
||||
|
||||
if iter % ANNEAL_EVERY == 0:
|
||||
step *= 0.5
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def sanity_check():
|
||||
quad = lambda x: (np.sum(x ** 2), x * 2)
|
||||
|
||||
print("Running sanity checks...")
|
||||
t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
|
||||
print("test 1 result:", t1)
|
||||
assert abs(t1) <= 1e-6
|
||||
|
||||
t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
|
||||
print("test 2 result:", t2)
|
||||
assert abs(t2) <= 1e-6
|
||||
|
||||
t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
|
||||
print("test 3 result:", t3)
|
||||
assert abs(t3) <= 1e-6
|
||||
|
||||
print("-" * 40)
|
||||
print("ALL TESTS PASSED")
|
||||
print("-" * 40)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sanity_check()
|
||||
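When sgd() resumes from saved parameters it rescales the step size as step *= 0.5 ** (start_iter / ANNEAL_EVERY). A tiny numeric illustration of that resume-time annealing, using the values run.py passes in:

ANNEAL_EVERY = 20000
step = 0.3                                    # the learning rate run.py passes to sgd()
start_iter = 40000                            # resuming from saved_params_40000.npy
step *= 0.5 ** (start_iter / ANNEAL_EVERY)
print(step)                                   # 0.075, i.e. 0.3 halved twice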
BIN
[finished]Assignment_2_word2vec/utils/.DS_Store
vendored
Normal file
Binary file not shown.
0
[finished]Assignment_2_word2vec/utils/__init__.py
Normal file
47
[finished]Assignment_2_word2vec/utils/gradcheck.py
Normal file
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
|
||||
# First implement a gradient checker by filling in the following functions
|
||||
def gradcheck_naive(f, x, gradientText):
|
||||
""" Gradient check for a function f.
|
||||
Arguments:
|
||||
f -- a function that takes a single argument and outputs the
|
||||
loss and its gradients
|
||||
x -- the point (numpy array) to check the gradient at
|
||||
gradientText -- a string detailing some context about the gradient computation
|
||||
"""
|
||||
|
||||
rndstate = random.getstate()
|
||||
random.setstate(rndstate)
|
||||
fx, grad = f(x) # Evaluate function value at original point
|
||||
h = 1e-4 # Do not change this!
|
||||
|
||||
# Iterate over all indexes ix in x to check the gradient.
|
||||
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
|
||||
while not it.finished:
|
||||
ix = it.multi_index
|
||||
|
||||
x[ix] += h # increment by h
|
||||
random.setstate(rndstate)
|
||||
fxh, _ = f(x) # evaluate f(x + h)
|
||||
x[ix] -= 2 * h # restore to previous value (very important!)
|
||||
random.setstate(rndstate)
|
||||
fxnh, _ = f(x)
|
||||
x[ix] += h
|
||||
numgrad = (fxh - fxnh) / 2 / h
|
||||
|
||||
# Compare gradients
|
||||
reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
|
||||
if reldiff > 1e-5:
|
||||
print("Gradient check failed for %s." % gradientText)
|
||||
print("First gradient error found at index %s in the vector of gradients" % str(ix))
|
||||
print("Your gradient: %f \t Numerical gradient: %f" % (
|
||||
grad[ix], numgrad))
|
||||
return
|
||||
|
||||
it.iternext() # Step to next dimension
|
||||
|
||||
print("Gradient check passed!")
|
||||
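gradcheck_naive above compares an analytic gradient against the central difference (f(x+h) - f(x-h)) / 2h. A quick sketch of running it on a function with a known gradient; it assumes it is run from the assignment root so utils is importable:

import numpy as np
from utils.gradcheck import gradcheck_naive

# f(x) = sum(x^3) has the closed-form gradient 3 * x^2
cubic = lambda x: (np.sum(x ** 3), 3 * x ** 2)
x = np.random.randn(4, 3)
gradcheck_naive(cubic, x, "cubic")            # should print "Gradient check passed!"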
248
[finished]Assignment_2_word2vec/utils/treebank.py
Normal file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import pickle
|
||||
import numpy as np
|
||||
import os
|
||||
import random
|
||||
|
||||
class StanfordSentiment:
|
||||
def __init__(self, path=None, tablesize = 1000000):
|
||||
if not path:
|
||||
path = "utils/datasets/stanfordSentimentTreebank"
|
||||
|
||||
self.path = path
|
||||
self.tablesize = tablesize
|
||||
|
||||
def tokens(self):
|
||||
if hasattr(self, "_tokens") and self._tokens:
|
||||
return self._tokens
|
||||
|
||||
tokens = dict()
|
||||
tokenfreq = dict()
|
||||
wordcount = 0
|
||||
revtokens = []
|
||||
idx = 0
|
||||
|
||||
for sentence in self.sentences():
|
||||
for w in sentence:
|
||||
wordcount += 1
|
||||
if not w in tokens:
|
||||
tokens[w] = idx
|
||||
revtokens += [w]
|
||||
tokenfreq[w] = 1
|
||||
idx += 1
|
||||
else:
|
||||
tokenfreq[w] += 1
|
||||
|
||||
tokens["UNK"] = idx
|
||||
revtokens += ["UNK"]
|
||||
tokenfreq["UNK"] = 1
|
||||
wordcount += 1
|
||||
|
||||
self._tokens = tokens
|
||||
self._tokenfreq = tokenfreq
|
||||
self._wordcount = wordcount
|
||||
self._revtokens = revtokens
|
||||
return self._tokens
|
||||
|
||||
def sentences(self):
|
||||
if hasattr(self, "_sentences") and self._sentences:
|
||||
return self._sentences
|
||||
|
||||
sentences = []
|
||||
with open(self.path + "/datasetSentences.txt", "r") as f:
|
||||
first = True
|
||||
for line in f:
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
|
||||
splitted = line.strip().split()[1:]
|
||||
# Deal with some peculiar encoding issues with this file
|
||||
sentences += [[w.lower() for w in splitted]]
|
||||
|
||||
self._sentences = sentences
|
||||
self._sentlengths = np.array([len(s) for s in sentences])
|
||||
self._cumsentlen = np.cumsum(self._sentlengths)
|
||||
|
||||
return self._sentences
|
||||
|
||||
def numSentences(self):
|
||||
if hasattr(self, "_numSentences") and self._numSentences:
|
||||
return self._numSentences
|
||||
else:
|
||||
self._numSentences = len(self.sentences())
|
||||
return self._numSentences
|
||||
|
||||
def allSentences(self):
|
||||
if hasattr(self, "_allsentences") and self._allsentences:
|
||||
return self._allsentences
|
||||
|
||||
sentences = self.sentences()
|
||||
rejectProb = self.rejectProb()
|
||||
tokens = self.tokens()
|
||||
allsentences = [[w for w in s
|
||||
if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
|
||||
for s in sentences * 30]
|
||||
|
||||
allsentences = [s for s in allsentences if len(s) > 1]
|
||||
|
||||
self._allsentences = allsentences
|
||||
|
||||
return self._allsentences
|
||||
|
||||
def getRandomContext(self, C=5):
|
||||
allsent = self.allSentences()
|
||||
sentID = random.randint(0, len(allsent) - 1)
|
||||
sent = allsent[sentID]
|
||||
wordID = random.randint(0, len(sent) - 1)
|
||||
|
||||
context = sent[max(0, wordID - C):wordID]
|
||||
if wordID+1 < len(sent):
|
||||
context += sent[wordID+1:min(len(sent), wordID + C + 1)]
|
||||
|
||||
centerword = sent[wordID]
|
||||
context = [w for w in context if w != centerword]
|
||||
|
||||
if len(context) > 0:
|
||||
return centerword, context
|
||||
else:
|
||||
return self.getRandomContext(C)
|
||||
|
||||
def sent_labels(self):
|
||||
if hasattr(self, "_sent_labels") and self._sent_labels:
|
||||
return self._sent_labels
|
||||
|
||||
dictionary = dict()
|
||||
phrases = 0
|
||||
with open(self.path + "/dictionary.txt", "r") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line: continue
|
||||
splitted = line.split("|")
|
||||
dictionary[splitted[0].lower()] = int(splitted[1])
|
||||
phrases += 1
|
||||
|
||||
labels = [0.0] * phrases
|
||||
with open(self.path + "/sentiment_labels.txt", "r") as f:
|
||||
first = True
|
||||
for line in f:
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
|
||||
line = line.strip()
|
||||
if not line: continue
|
||||
splitted = line.split("|")
|
||||
labels[int(splitted[0])] = float(splitted[1])
|
||||
|
||||
sent_labels = [0.0] * self.numSentences()
|
||||
sentences = self.sentences()
|
||||
for i in range(self.numSentences()):
|
||||
sentence = sentences[i]
|
||||
full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
|
||||
sent_labels[i] = labels[dictionary[full_sent]]
|
||||
|
||||
self._sent_labels = sent_labels
|
||||
return self._sent_labels
|
||||
|
||||
def dataset_split(self):
|
||||
if hasattr(self, "_split") and self._split:
|
||||
return self._split
|
||||
|
||||
split = [[] for i in range(3)]
|
||||
with open(self.path + "/datasetSplit.txt", "r") as f:
|
||||
first = True
|
||||
for line in f:
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
|
||||
splitted = line.strip().split(",")
|
||||
split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]
|
||||
|
||||
self._split = split
|
||||
return self._split
|
||||
|
||||
def getRandomTrainSentence(self):
|
||||
split = self.dataset_split()
|
||||
sentId = split[0][random.randint(0, len(split[0]) - 1)]
|
||||
return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])
|
||||
|
||||
def categorify(self, label):
|
||||
if label <= 0.2:
|
||||
return 0
|
||||
elif label <= 0.4:
|
||||
return 1
|
||||
elif label <= 0.6:
|
||||
return 2
|
||||
elif label <= 0.8:
|
||||
return 3
|
||||
else:
|
||||
return 4
|
||||
|
||||
def getDevSentences(self):
|
||||
return self.getSplitSentences(2)
|
||||
|
||||
def getTestSentences(self):
|
||||
return self.getSplitSentences(1)
|
||||
|
||||
def getTrainSentences(self):
|
||||
return self.getSplitSentences(0)
|
||||
|
||||
def getSplitSentences(self, split=0):
|
||||
ds_split = self.dataset_split()
|
||||
return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]
|
||||
|
||||
def sampleTable(self):
|
||||
if hasattr(self, '_sampleTable') and self._sampleTable is not None:
|
||||
return self._sampleTable
|
||||
|
||||
nTokens = len(self.tokens())
|
||||
samplingFreq = np.zeros((nTokens,))
|
||||
self.allSentences()
|
||||
i = 0
|
||||
for w in range(nTokens):
|
||||
w = self._revtokens[i]
|
||||
if w in self._tokenfreq:
|
||||
freq = 1.0 * self._tokenfreq[w]
|
||||
# Reweigh
|
||||
freq = freq ** 0.75
|
||||
else:
|
||||
freq = 0.0
|
||||
samplingFreq[i] = freq
|
||||
i += 1
|
||||
|
||||
samplingFreq /= np.sum(samplingFreq)
|
||||
samplingFreq = np.cumsum(samplingFreq) * self.tablesize
|
||||
|
||||
self._sampleTable = [0] * self.tablesize
|
||||
|
||||
j = 0
|
||||
for i in range(self.tablesize):
|
||||
while i > samplingFreq[j]:
|
||||
j += 1
|
||||
self._sampleTable[i] = j
|
||||
|
||||
return self._sampleTable
|
||||
|
||||
def rejectProb(self):
|
||||
if hasattr(self, '_rejectProb') and self._rejectProb is not None:
|
||||
return self._rejectProb
|
||||
|
||||
threshold = 1e-5 * self._wordcount
|
||||
|
||||
nTokens = len(self.tokens())
|
||||
rejectProb = np.zeros((nTokens,))
|
||||
for i in range(nTokens):
|
||||
w = self._revtokens[i]
|
||||
freq = 1.0 * self._tokenfreq[w]
|
||||
# Reweigh
|
||||
rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))
|
||||
|
||||
self._rejectProb = rejectProb
|
||||
return self._rejectProb
|
||||
|
||||
def sampleTokenIdx(self):
|
||||
return self.sampleTable()[random.randint(0, self.tablesize - 1)]
|
||||
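StanfordSentiment above uses two frequency heuristics: words are dropped with probability max(0, 1 - sqrt(threshold / freq)), and the negative-sampling table is built from freq ** 0.75. A toy illustration with made-up counts and a pretend corpus size:

import numpy as np

tokenfreq = {"the": 5000, "movie": 300, "luminous": 2}   # made-up counts
wordcount = 1_000_000                                    # pretend corpus size
threshold = 1e-5 * wordcount

for w, freq in tokenfreq.items():
    reject = max(0, 1 - np.sqrt(threshold / freq))
    print(w, round(reject, 3))                # the: 0.955, movie: 0.817, luminous: 0

sampling = np.array([f ** 0.75 for f in tokenfreq.values()], dtype=float)
sampling /= sampling.sum()
print(sampling)                               # flattened unigram distribution for negatives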
43
[finished]Assignment_2_word2vec/utils/utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
|
||||
def normalizeRows(x):
|
||||
""" Row normalization function
|
||||
|
||||
Implement a function that normalizes each row of a matrix to have
|
||||
unit length.
|
||||
"""
|
||||
N = x.shape[0]
|
||||
x /= np.sqrt(np.sum(x**2, axis=1)).reshape((N,1)) + 1e-30
|
||||
return x
|
||||
|
||||
def softmax(x):
|
||||
"""Compute the softmax function for each row of the input x.
|
||||
It is crucial that this function is optimized for speed because
|
||||
it will be used frequently in later code.
|
||||
|
||||
Arguments:
|
||||
x -- A D dimensional vector or N x D dimensional numpy matrix.
|
||||
Return:
|
||||
x -- You are allowed to modify x in-place
|
||||
"""
|
||||
orig_shape = x.shape
|
||||
|
||||
if len(x.shape) > 1:
|
||||
# Matrix
|
||||
tmp = np.max(x, axis=1)
|
||||
x -= tmp.reshape((x.shape[0], 1))
|
||||
x = np.exp(x)
|
||||
tmp = np.sum(x, axis=1)
|
||||
x /= tmp.reshape((x.shape[0], 1))
|
||||
else:
|
||||
# Vector
|
||||
tmp = np.max(x)
|
||||
x -= tmp
|
||||
x = np.exp(x)
|
||||
tmp = np.sum(x)
|
||||
x /= tmp
|
||||
|
||||
assert x.shape == orig_shape
|
||||
return x
|
||||
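softmax above subtracts the per-row maximum before exponentiating so large scores do not overflow, and normalizeRows rescales each row to unit length. A short check of both, assuming it is run from the assignment root:

import numpy as np
from utils.utils import softmax, normalizeRows

print(softmax(np.array([1000.0, 1001.0, 1002.0])))   # finite: ~[0.090, 0.245, 0.665]
print(softmax(np.array([[1.0, 2.0], [3.0, 4.0]])))   # row-wise softmax of a matrix

rows = normalizeRows(np.array([[3.0, 4.0], [1.0, 2.0]]))
print(np.linalg.norm(rows, axis=1))                  # each row now has unit length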
301
[finished]Assignment_2_word2vec/word2vec.py
Normal file
@@ -0,0 +1,301 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
from utils.gradcheck import gradcheck_naive
|
||||
from utils.utils import normalizeRows, softmax
|
||||
|
||||
|
||||
def sigmoid(x):
|
||||
"""
|
||||
Compute the sigmoid function for the input here.
|
||||
Arguments:
|
||||
x -- A scalar or numpy array.
|
||||
Return:
|
||||
s -- sigmoid(x)
|
||||
"""
|
||||
|
||||
### YOUR CODE HERE
|
||||
s = 1/(1+np.exp(-x))
|
||||
### END YOUR CODE
|
||||
|
||||
return s
|
||||
|
||||
|
||||
def naiveSoftmaxLossAndGradient(
|
||||
centerWordVec,
|
||||
outsideWordIdx,
|
||||
outsideVectors,
|
||||
dataset
|
||||
):
|
||||
""" Naive Softmax loss & gradient function for word2vec models
|
||||
|
||||
Implement the naive softmax loss and gradients between a center word's
|
||||
embedding and an outside word's embedding. This will be the building block
|
||||
for our word2vec models.
|
||||
|
||||
Arguments:
|
||||
centerWordVec -- numpy ndarray, center word's embedding
|
||||
(v_c in the pdf handout)
|
||||
outsideWordIdx -- integer, the index of the outside word
|
||||
(o of u_o in the pdf handout)
|
||||
outsideVectors -- outside vectors (rows of matrix) for all words in vocab
|
||||
(U in the pdf handout)
|
||||
dataset -- needed for negative sampling, unused here.
|
||||
|
||||
Return:
|
||||
loss -- naive softmax loss
|
||||
gradCenterVec -- the gradient with respect to the center word vector
|
||||
(dJ / dv_c in the pdf handout)
|
||||
gradOutsideVecs -- the gradient with respect to all the outside word vectors
|
||||
(dJ / dU)
|
||||
"""
|
||||
|
||||
### YOUR CODE HERE
|
||||
score = np.dot(outsideVectors,centerWordVec)
|
||||
y_h = softmax(score)
|
||||
loss = -np.log(y_h[outsideWordIdx])
|
||||
y = np.eye(y_h.shape[0])[outsideWordIdx]
|
||||
diff = (y_h-y).reshape((y.shape[0],1))
|
||||
|
||||
gradCenterVec = np.dot(diff.T,outsideVectors)
|
||||
centerWordVec = centerWordVec.reshape((1,centerWordVec.shape[0]))
|
||||
gradOutsideVecs = np.dot(diff,centerWordVec)
|
||||
|
||||
### END YOUR CODE
|
||||
|
||||
return loss, gradCenterVec, gradOutsideVecs
|
||||
|
||||
|
||||
def getNegativeSamples(outsideWordIdx, dataset, K):
|
||||
""" Samples K indexes which are not the outsideWordIdx """
|
||||
|
||||
negSampleWordIndices = [None] * K
|
||||
for k in range(K):
|
||||
newidx = dataset.sampleTokenIdx()
|
||||
while newidx == outsideWordIdx:
|
||||
newidx = dataset.sampleTokenIdx()
|
||||
negSampleWordIndices[k] = newidx
|
||||
return negSampleWordIndices
|
||||
|
||||
|
||||
def negSamplingLossAndGradient(
|
||||
centerWordVec,
|
||||
outsideWordIdx,
|
||||
outsideVectors,
|
||||
dataset,
|
||||
K=10
|
||||
):
|
||||
""" Negative sampling loss function for word2vec models
|
||||
|
||||
Implement the negative sampling loss and gradients for a centerWordVec
|
||||
and an outsideWordIdx word vector as a building block for word2vec
|
||||
models. K is the number of negative samples to take.
|
||||
|
||||
Note: The same word may be negatively sampled multiple times. For
|
||||
example if an outside word is sampled twice, you shall have to
|
||||
double count the gradient with respect to this word. Thrice if
|
||||
it was sampled three times, and so forth.
|
||||
|
||||
Arguments/Return Specifications: same as naiveSoftmaxLossAndGradient
|
||||
"""
|
||||
|
||||
# Negative sampling of words is done for you. Do not modify this if you
|
||||
# wish to match the autograder and receive points!
|
||||
negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)
|
||||
indices = [outsideWordIdx] + negSampleWordIndices
|
||||
|
||||
|
||||
### YOUR CODE HERE
|
||||
|
||||
score = np.dot(outsideVectors[outsideWordIdx],centerWordVec)
|
||||
sig_1 = sigmoid(score)
|
||||
|
||||
sum_neg = 0.0
|
||||
|
||||
#Find unique negative samples and the number of times they are present in our sample window
|
||||
unique_k, counts_k = np.unique(indices[1:], return_counts=True)
|
||||
k_stack = outsideVectors[unique_k]
|
||||
|
||||
score_neg = -np.dot(k_stack,centerWordVec)
|
||||
sig_neg = sigmoid(score_neg)
|
||||
sum_neg = np.sum(counts_k*np.log(sig_neg),axis=0)
|
||||
|
||||
#J_neg_sam Loss
|
||||
loss = -np.log(sig_1) - sum_neg
|
||||
|
||||
#Calculate gradients
|
||||
k_term = 0.0
|
||||
#delta term from previous layer for efficient implementation
|
||||
delta_1msig = 1-sig_1
|
||||
delta_1msig_neg = 1-sig_neg
|
||||
|
||||
gradOutsideVecs = np.zeros((outsideVectors.shape))
|
||||
gradOutsideVecs[outsideWordIdx,:] = -delta_1msig*centerWordVec
|
||||
common_term = np.dot(delta_1msig_neg.reshape(unique_k.shape[0],1),centerWordVec.reshape(1,centerWordVec.shape[0]))
|
||||
gradOutsideVecs[unique_k,:] += counts_k.reshape(counts_k.shape[0],1)*common_term
|
||||
|
||||
#Reshape prep for center gradient calculation
|
||||
counts_k = counts_k.reshape(counts_k.shape[0],1)
|
||||
delta_1msig_neg = delta_1msig_neg.reshape(delta_1msig_neg.shape[0],1)
|
||||
k_term = np.sum(np.dot((delta_1msig_neg.reshape(1,counts_k.shape[0])),counts_k*k_stack),axis=0)
|
||||
gradCenterVec = -delta_1msig*outsideVectors[outsideWordIdx] + k_term
|
||||
|
||||
### END YOUR CODE
|
||||
|
||||
return loss, gradCenterVec, gradOutsideVecs
|
||||
|
||||
|
||||
def skipgram(currentCenterWord, windowSize, outsideWords, word2Ind,
|
||||
centerWordVectors, outsideVectors, dataset,
|
||||
word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
|
||||
""" Skip-gram model in word2vec
|
||||
|
||||
Implement the skip-gram model in this function.
|
||||
|
||||
Arguments:
|
||||
currentCenterWord -- a string of the current center word
|
||||
windowSize -- integer, context window size
|
||||
outsideWords -- list of no more than 2*windowSize strings, the outside words
|
||||
word2Ind -- a dictionary that maps words to their indices in
|
||||
the word vector list
|
||||
centerWordVectors -- center word vectors (as rows) for all words in vocab
|
||||
(V in pdf handout)
|
||||
outsideVectors -- outside word vectors (as rows) for all words in vocab
|
||||
(U in pdf handout)
|
||||
word2vecLossAndGradient -- the loss and gradient function for
|
||||
a prediction vector given the outsideWordIdx
|
||||
word vectors, could be one of the two
|
||||
loss functions you implemented above.
|
||||
|
||||
Return:
|
||||
loss -- the loss function value for the skip-gram model
|
||||
(J in the pdf handout)
|
||||
gradCenterVecs -- the gradient with respect to the center word vectors
|
||||
(dJ / dV in the pdf handout)
|
||||
gradOutsideVectors -- the gradient with respect to the outside word vectors
|
||||
(dJ / dU in the pdf handout)
|
||||
"""
|
||||
|
||||
loss = 0.0
|
||||
gradCenterVecs = np.zeros(centerWordVectors.shape)
|
||||
gradOutsideVectors = np.zeros(outsideVectors.shape)
|
||||
|
||||
### YOUR CODE HERE
|
||||
for m in range(0,len(outsideWords)):
|
||||
l,gradCenter,gradOutside= word2vecLossAndGradient(centerWordVectors[word2Ind[currentCenterWord]],word2Ind[outsideWords[m]],outsideVectors,dataset)
|
||||
loss+=l
|
||||
gradCenterVecs[word2Ind[currentCenterWord]] += gradCenter.reshape((centerWordVectors.shape[1],))
|
||||
gradOutsideVectors += gradOutside
|
||||
### END YOUR CODE
|
||||
|
||||
return loss, gradCenterVecs, gradOutsideVectors
|
||||
|
||||
#############################################
|
||||
# Testing functions below. DO NOT MODIFY! #
|
||||
#############################################
|
||||
|
||||
def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset,
|
||||
windowSize,
|
||||
word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
|
||||
batchsize = 50
|
||||
loss = 0.0
|
||||
grad = np.zeros(wordVectors.shape)
|
||||
N = wordVectors.shape[0]
|
||||
centerWordVectors = wordVectors[:int(N/2),:]
|
||||
outsideVectors = wordVectors[int(N/2):,:]
|
||||
for i in range(batchsize):
|
||||
windowSize1 = random.randint(1, windowSize)
|
||||
centerWord, context = dataset.getRandomContext(windowSize1)
|
||||
|
||||
c, gin, gout = word2vecModel(
|
||||
centerWord, windowSize1, context, word2Ind, centerWordVectors,
|
||||
outsideVectors, dataset, word2vecLossAndGradient
|
||||
)
|
||||
loss += c / batchsize
|
||||
grad[:int(N/2), :] += gin / batchsize
|
||||
grad[int(N/2):, :] += gout / batchsize
|
||||
|
||||
return loss, grad
|
||||
|
||||
|
||||
def test_word2vec():
|
||||
""" Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
|
||||
dataset = type('dummy', (), {})()
|
||||
def dummySampleTokenIdx():
|
||||
return random.randint(0, 4)
|
||||
|
||||
def getRandomContext(C):
|
||||
tokens = ["a", "b", "c", "d", "e"]
|
||||
return tokens[random.randint(0,4)], \
|
||||
[tokens[random.randint(0,4)] for i in range(2*C)]
|
||||
dataset.sampleTokenIdx = dummySampleTokenIdx
|
||||
dataset.getRandomContext = getRandomContext
|
||||
|
||||
random.seed(31415)
|
||||
np.random.seed(9265)
|
||||
dummy_vectors = normalizeRows(np.random.randn(10,3))
|
||||
dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)])
|
||||
|
||||
print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
|
||||
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
|
||||
skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
|
||||
dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")
|
||||
|
||||
print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
|
||||
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
|
||||
skipgram, dummy_tokens, vec, dataset, 5, negSamplingLossAndGradient),
|
||||
dummy_vectors, "negSamplingLossAndGradient Gradient")
|
||||
|
||||
print("\n=== Results ===")
|
||||
print ("Skip-Gram with naiveSoftmaxLossAndGradient")
|
||||
|
||||
print ("Your Result:")
|
||||
print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
|
||||
*skipgram("c", 3, ["a", "b", "e", "d", "b", "c"],
|
||||
dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)
|
||||
)
|
||||
)
|
||||
|
||||
print ("Expected Result: Value should approximate these:")
|
||||
print("""Loss: 11.16610900153398
|
||||
Gradient wrt Center Vectors (dJ/dV):
|
||||
[[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]
|
||||
[-1.26947339 -1.36873189 2.45158957]
|
||||
[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]]
|
||||
Gradient wrt Outside Vectors (dJ/dU):
|
||||
[[-0.41045956 0.18834851 1.43272264]
|
||||
[ 0.38202831 -0.17530219 -1.33348241]
|
||||
[ 0.07009355 -0.03216399 -0.24466386]
|
||||
[ 0.09472154 -0.04346509 -0.33062865]
|
||||
[-0.13638384 0.06258276 0.47605228]]
|
||||
""")
|
||||
|
||||
print ("Skip-Gram with negSamplingLossAndGradient")
|
||||
print ("Your Result:")
|
||||
print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\n Gradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
|
||||
*skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:],
|
||||
dummy_vectors[5:,:], dataset, negSamplingLossAndGradient)
|
||||
)
|
||||
)
|
||||
print ("Expected Result: Value should approximate these:")
|
||||
print("""Loss: 16.15119285363322
|
||||
Gradient wrt Center Vectors (dJ/dV):
|
||||
[[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]
|
||||
[-4.54650789 -1.85942252 0.76397441]
|
||||
[ 0. 0. 0. ]
|
||||
[ 0. 0. 0. ]]
|
||||
Gradient wrt Outside Vectors (dJ/dU):
|
||||
[[-0.69148188 0.31730185 2.41364029]
|
||||
[-0.22716495 0.10423969 0.79292674]
|
||||
[-0.45528438 0.20891737 1.58918512]
|
||||
[-0.31602611 0.14501561 1.10309954]
|
||||
[-0.80620296 0.36994417 2.81407799]]
|
||||
""")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_word2vec()
|
||||
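For a small vocabulary the naive softmax loss above should equal -log softmax(U v_c)[o], with dJ/dv_c = U^T(y_hat - y) and dJ/dU the outer product of (y_hat - y) and v_c. A small sanity check of the implementation against those closed forms, assuming it is run from the assignment root:

import numpy as np
from word2vec import naiveSoftmaxLossAndGradient
from utils.utils import softmax

np.random.seed(0)
U = np.random.randn(3, 4)            # outside vectors, one row per vocabulary word
v_c = np.random.randn(4)             # center word vector
o = 1                                # index of the observed outside word

loss, gradV, gradU = naiveSoftmaxLossAndGradient(v_c, o, U, None)

y_hat = softmax(U.dot(v_c))
y = np.eye(3)[o]
assert np.isclose(loss, -np.log(y_hat[o]))
assert np.allclose(gradV.flatten(), U.T.dot(y_hat - y))
assert np.allclose(gradU, np.outer(y_hat - y, v_c))
print(loss)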