add a3
@@ -1,6 +1,6 @@
### Stanford / Winter 2019

To be continued...

For questions about NLP job interviews, follow the WeChat official account:
Binary file not shown.
41817   [finished]Assignment_3_neural_dependency_parsing/data/dev.conll         new file (diff suppressed: too large)
41817   [finished]Assignment_3_neural_dependency_parsing/data/dev.gold.conll    new file (diff suppressed: too large)
130000  [finished]Assignment_3_neural_dependency_parsing/data/en-cw.txt         new file (diff suppressed: too large)
59100   [finished]Assignment_3_neural_dependency_parsing/data/test.conll        new file (diff suppressed: too large)
59100   [finished]Assignment_3_neural_dependency_parsing/data/test.gold.conll   new file (diff suppressed: too large)
989860  [finished]Assignment_3_neural_dependency_parsing/data/train.conll       new file (diff suppressed: too large)
989860  [finished]Assignment_3_neural_dependency_parsing/data/train.gold.conll  new file (diff suppressed: too large)
159     [finished]Assignment_3_neural_dependency_parsing/parser_model.py        new file
@@ -0,0 +1,159 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CS224N 2018-19: Homework 3
parser_model.py: Feed-Forward Neural Network for Dependency Parsing
Sahil Chopra <schopra8@stanford.edu>
"""
import pickle
import os
import time

import torch
import torch.nn as nn
import torch.nn.functional as F


class ParserModel(nn.Module):
    """ Feedforward neural network with an embedding layer and single hidden layer.
    The ParserModel will predict which transition should be applied to a
    given partial parse configuration.

    PyTorch Notes:
        - "ParserModel" is a subclass of the "nn.Module" class. In PyTorch all neural networks
          are a subclass of "nn.Module".
        - The "__init__" method is where you define all the layers and their respective parameters
          (embedding layers, linear layers, dropout layers, etc.).
        - "__init__" gets automatically called when you create a new instance of your class, e.g.
          when you write "m = ParserModel()".
        - Other methods of ParserModel can access variables that have the "self." prefix. Thus,
          you should add the "self." prefix to layers, values, etc. that you want to utilize
          in other ParserModel methods.
        - For further documentation on "nn.Module" please see https://pytorch.org/docs/stable/nn.html.
    """
    def __init__(self, embeddings, n_features=36,
                 hidden_size=200, n_classes=3, dropout_prob=0.5):
        """ Initialize the parser model.

        @param embeddings (Tensor): word embeddings (num_words, embedding_size)
        @param n_features (int): number of input features
        @param hidden_size (int): number of hidden units
        @param n_classes (int): number of output classes
        @param dropout_prob (float): dropout probability
        """
        super(ParserModel, self).__init__()
        self.n_features = n_features
        self.n_classes = n_classes
        self.dropout_prob = dropout_prob
        self.embed_size = embeddings.shape[1]
        self.hidden_size = hidden_size
        self.pretrained_embeddings = nn.Embedding(embeddings.shape[0], self.embed_size)
        self.pretrained_embeddings.weight = nn.Parameter(torch.tensor(embeddings))

        ### YOUR CODE HERE (~5 Lines)
        ### TODO:
        ###     1) Construct `self.embed_to_hidden` linear layer, initializing the weight matrix
        ###        with the `nn.init.xavier_uniform_` function with `gain = 1` (default)
        ###     2) Construct `self.dropout` layer.
        ###     3) Construct `self.hidden_to_logits` linear layer, initializing the weight matrix
        ###        with the `nn.init.xavier_uniform_` function with `gain = 1` (default)
        ###
        ### Note: Here, we use Xavier Uniform Initialization for our weight initialization.
        ###       It has been shown empirically that this provides better initial weights
        ###       for training networks than random uniform initialization.
        ###       For more details check out this great blogpost:
        ###           http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
        ### Hints:
        ###     - After you create a linear layer you can access the weight
        ###       matrix via:
        ###           linear_layer.weight
        ###
        ### Please see the following docs for support:
        ###     Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Xavier Init: https://pytorch.org/docs/stable/nn.html#torch.nn.init.xavier_uniform_
        ###     Dropout: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        # Flattened embeddings -> hidden layer, Xavier-uniform initialized.
        self.embed_to_hidden = nn.Linear(self.embed_size * self.n_features, self.hidden_size)
        torch.nn.init.xavier_uniform_(self.embed_to_hidden.weight, gain=1)
        # Dropout on the hidden activations.
        self.dropout = nn.Dropout(dropout_prob)
        # Hidden layer -> transition logits, Xavier-uniform initialized.
        self.hidden_to_logits = nn.Linear(self.hidden_size, self.n_classes)
        torch.nn.init.xavier_uniform_(self.hidden_to_logits.weight, gain=1)
        ### END YOUR CODE

    def embedding_lookup(self, t):
        """ Utilize `self.pretrained_embeddings` to map input `t` from input tokens (integers)
            to embedding vectors.

            PyTorch Notes:
                - `self.pretrained_embeddings` is a torch.nn.Embedding object that we defined in __init__
                - Here `t` is a tensor where each row represents a list of features. Each feature is
                  represented by an integer (input token).
                - In PyTorch the Embedding object, e.g. `self.pretrained_embeddings`, allows you to
                  go from an index to an embedding. Please see the documentation
                  (https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding)
                  to learn how to use `self.pretrained_embeddings` to extract the embeddings for your tensor `t`.

            @param t (Tensor): input tensor of tokens (batch_size, n_features)

            @return x (Tensor): tensor of embeddings for words represented in t
                                (batch_size, n_features * embed_size)
        """
        ### YOUR CODE HERE (~1-3 Lines)
        ### TODO:
        ###     1) Use `self.pretrained_embeddings` to look up the embeddings for the input tokens in `t`.
        ###     2) After you apply the embedding lookup, you will have a tensor of shape
        ###        (batch_size, n_features, embedding_size). Use the tensor `view` method to reshape
        ###        the embeddings tensor to (batch_size, n_features * embedding_size)
        ###
        ### Note: In order to get batch_size, you may need to use the tensor .size() function:
        ###       https://pytorch.org/docs/stable/tensors.html#torch.Tensor.size
        ###
        ### Please see the following docs for support:
        ###     Embedding Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
        ###     View: https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view

        t_shape = t.size()
        acc = self.pretrained_embeddings(t)                     # (batch_size, n_features, embed_size)
        x = acc.view(t_shape[0], t_shape[1] * self.embed_size)  # (batch_size, n_features * embed_size)
        ### END YOUR CODE
        return x

    def forward(self, t):
        """ Run the model forward.

            Note that we will not apply the softmax function here because it is included in
            the loss function nn.CrossEntropyLoss

            PyTorch Notes:
                - Every nn.Module object (PyTorch model) has a `forward` function.
                - When you apply your nn.Module to an input tensor `t` this function is applied to the tensor.
                  For example, if you created an instance of your ParserModel and applied it to some `t` as follows,
                  the `forward` function would be called on `t` and the result would be stored in the `output` variable:

                      model = ParserModel()
                      output = model(t)  # this calls the forward function

                - For more details check out: https://pytorch.org/docs/stable/nn.html#torch.nn.Module.forward

        @param t (Tensor): input tensor of tokens (batch_size, n_features)

        @return logits (Tensor): tensor of predictions (output after applying the layers of the network)
                                 without applying softmax (batch_size, n_classes)
        """
        ### YOUR CODE HERE (~3-5 lines)
        ### TODO:
        ###     1) Apply `self.embedding_lookup` to `t` to get the embeddings
        ###     2) Apply `embed_to_hidden` linear layer to the embeddings
        ###     3) Apply relu non-linearity to the output of step 2 to get the hidden units.
        ###     4) Apply dropout layer to the output of step 3.
        ###     5) Apply `hidden_to_logits` layer to the output of step 4 to get the logits.
        ###
        ### Note: We do not apply the softmax to the logits here, because
        ###       the loss function (torch.nn.CrossEntropyLoss) applies it more efficiently.
        ###
        ### Please see the following docs for support:
        ###     ReLU: https://pytorch.org/docs/stable/nn.html?highlight=relu#torch.nn.functional.relu

        embeddings = self.embedding_lookup(t)        # (batch_size, n_features * embed_size)
        hidden = self.embed_to_hidden(embeddings)    # (batch_size, hidden_size)
        hidden_relu = F.relu(hidden)                 # ReLU non-linearity
        dropped_out = self.dropout(hidden_relu)      # dropout is only active in train mode
        logits = self.hidden_to_logits(dropped_out)  # (batch_size, n_classes)
        ### END YOUR CODE
        return logits
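A quick shape sanity check for the model above (illustrative snippet, not part of this commit; it assumes PyTorch and NumPy are available and substitutes random vectors for the pretrained en-cw embeddings):

# Hypothetical sanity check for ParserModel (not part of this commit).
import numpy as np
import torch
from parser_model import ParserModel

embeddings = np.random.normal(0, 0.9, (100, 50)).astype('float32')  # fake vocabulary of 100 words
model = ParserModel(embeddings)                                      # defaults: n_features=36, n_classes=3
t = torch.randint(0, 100, (4, 36))                                   # batch of 4 feature vectors of token ids
logits = model(t)
print(logits.shape)                                                  # expected: torch.Size([4, 3])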
234     [finished]Assignment_3_neural_dependency_parsing/parser_transitions.py  new file
@@ -0,0 +1,234 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CS224N 2018-19: Homework 3
parser_transitions.py: Algorithms for completing partial parses.
Sahil Chopra <schopra8@stanford.edu>
"""

import sys


class PartialParse(object):
    def __init__(self, sentence):
        """Initializes this partial parse.

        @param sentence (list of str): The sentence to be parsed as a list of words.
                                       Your code should not modify the sentence.
        """
        # The sentence being parsed is kept for bookkeeping purposes. Do not alter it in your code.
        self.sentence = sentence

        ### YOUR CODE HERE (3 Lines)
        ### Your code should initialize the following fields:
        ###     self.stack: The current stack represented as a list with the top of the stack as the
        ###                 last element of the list.
        ###     self.buffer: The current buffer represented as a list with the first item on the
        ###                  buffer as the first item of the list
        ###     self.dependencies: The list of dependencies produced so far. Represented as a list of
        ###                 tuples where each tuple is of the form (head, dependent).
        ###                 Order for this list doesn't matter.
        ###
        ### Note: The root token should be represented with the string "ROOT"
        ###
        self.stack = ['ROOT']
        self.buffer = list(sentence)  # copy, so the input sentence is never modified
        self.dependencies = []
        ### END YOUR CODE

    def parse_step(self, transition):
        """Performs a single parse step by applying the given transition to this partial parse

        @param transition (str): A string that equals "S", "LA", or "RA" representing the shift,
                                 left-arc, and right-arc transitions. You can assume the provided
                                 transition is a legal transition.
        """
        ### YOUR CODE HERE (~7-10 Lines)
        ### TODO:
        ###     Implement a single parsing step, i.e. the logic for the following as
        ###     described in the pdf handout:
        ###         1. Shift
        ###         2. Left Arc
        ###         3. Right Arc

        # A finished parse (empty buffer, only ROOT on the stack) has nothing left to do.
        if len(self.buffer) == 0 and len(self.stack) == 1:
            return

        if transition == 'S':
            # Shift: move the first word of the buffer onto the stack.
            self.stack.append(self.buffer[0])
            self.buffer = self.buffer[1:]
        elif transition == 'LA':
            # Left-Arc: second-to-top of the stack becomes a dependent of the top.
            self.dependencies.append((self.stack[-1], self.stack[-2]))
            del self.stack[-2]
        else:  # 'RA'
            # Right-Arc: top of the stack becomes a dependent of the second-to-top.
            self.dependencies.append((self.stack[-2], self.stack[-1]))
            del self.stack[-1]
        ### END YOUR CODE

    def parse(self, transitions):
        """Applies the provided transitions to this PartialParse

        @param transitions (list of str): The list of transitions in the order they should be applied

        @return dependencies (list of string tuples): The list of dependencies produced when
                                                      parsing the sentence. Represented as a list of
                                                      tuples where each tuple is of the form (head, dependent).
        """
        for transition in transitions:
            self.parse_step(transition)
        return self.dependencies


def minibatch_parse(sentences, model, batch_size):
    """Parses a list of sentences in minibatches using a model.

    @param sentences (list of list of str): A list of sentences to be parsed
                                            (each sentence is a list of words and each word is of type string)
    @param model (ParserModel): The model that makes parsing decisions. It is assumed to have a function
                                model.predict(partial_parses) that takes in a list of PartialParses as input and
                                returns a list of transitions predicted for each parse. That is, after calling
                                    transitions = model.predict(partial_parses)
                                transitions[i] will be the next transition to apply to partial_parses[i].
    @param batch_size (int): The number of PartialParses to include in each minibatch

    @return dependencies (list of dependency lists): A list where each element is the dependencies
                                                     list for a parsed sentence. Ordering should be the
                                                     same as in sentences (i.e., dependencies[i] should
                                                     contain the parse for sentences[i]).
    """
    dependencies = []

    ### YOUR CODE HERE (~8-10 Lines)
    ### TODO:
    ###     Implement the minibatch parse algorithm as described in the pdf handout
    ###
    ###     Note: A shallow copy (as denoted in the PDF) can be made with the "=" sign in python, e.g.
    ###               unfinished_parses = partial_parses[:].
    ###           Here `unfinished_parses` is a shallow copy of `partial_parses`.
    ###           In Python, a shallow copied list like `unfinished_parses` does not contain new instances
    ###           of the objects stored in `partial_parses`. Rather both lists refer to the same objects.
    ###           In our case, `partial_parses` contains a list of partial parses. `unfinished_parses`
    ###           contains references to the same objects. Thus, you should NOT use the `del` operator
    ###           to remove objects from the `unfinished_parses` list. This will free the underlying memory that
    ###           is being accessed by `partial_parses` and may cause your code to crash.

    partial_parses = [PartialParse(sent) for sent in sentences]
    unfinished_parses = partial_parses[:]  # shallow copy

    while len(unfinished_parses) != 0:
        # Take the next minibatch and predict one transition per parse.
        batch = unfinished_parses[:batch_size]
        transitions = model.predict(batch)

        # Apply the predicted transition to each parse in the minibatch.
        for parse, transition in zip(batch, transitions):
            parse.parse_step(transition)

        # Drop parses that are finished (empty buffer, only ROOT left on the stack)
        # by rebuilding the list instead of using `del`.
        unfinished_parses = [p for p in unfinished_parses
                             if not (len(p.buffer) == 0 and len(p.stack) == 1)]

    dependencies = [p.dependencies for p in partial_parses]
    ### END YOUR CODE

    return dependencies


def test_step(name, transition, stack, buf, deps,
              ex_stack, ex_buf, ex_deps):
    """Tests that a single parse step returns the expected output"""
    pp = PartialParse([])
    pp.stack, pp.buffer, pp.dependencies = stack, buf, deps

    pp.parse_step(transition)
    stack, buf, deps = (tuple(pp.stack), tuple(pp.buffer), tuple(sorted(pp.dependencies)))
    assert stack == ex_stack, \
        "{:} test resulted in stack {:}, expected {:}".format(name, stack, ex_stack)
    assert buf == ex_buf, \
        "{:} test resulted in buffer {:}, expected {:}".format(name, buf, ex_buf)
    assert deps == ex_deps, \
        "{:} test resulted in dependency list {:}, expected {:}".format(name, deps, ex_deps)
    print("{:} test passed!".format(name))


def test_parse_step():
    """Simple tests for the PartialParse.parse_step function
    Warning: these are not exhaustive
    """
    test_step("SHIFT", "S", ["ROOT", "the"], ["cat", "sat"], [],
              ("ROOT", "the", "cat"), ("sat",), ())
    test_step("LEFT-ARC", "LA", ["ROOT", "the", "cat"], ["sat"], [],
              ("ROOT", "cat",), ("sat",), (("cat", "the"),))
    test_step("RIGHT-ARC", "RA", ["ROOT", "run", "fast"], [], [],
              ("ROOT", "run",), (), (("run", "fast"),))


def test_parse():
    """Simple tests for the PartialParse.parse function
    Warning: these are not exhaustive
    """
    sentence = ["parse", "this", "sentence"]
    dependencies = PartialParse(sentence).parse(["S", "S", "S", "LA", "RA", "RA"])
    dependencies = tuple(sorted(dependencies))
    expected = (('ROOT', 'parse'), ('parse', 'sentence'), ('sentence', 'this'))
    assert dependencies == expected, \
        "parse test resulted in dependencies {:}, expected {:}".format(dependencies, expected)
    assert tuple(sentence) == ("parse", "this", "sentence"), \
        "parse test failed: the input sentence should not be modified"
    print("parse test passed!")


class DummyModel(object):
    """Dummy model for testing the minibatch_parse function
    First shifts everything onto the stack and then does exclusively right arcs if the first word of
    the sentence is "right", and left arcs otherwise.
    """
    def predict(self, partial_parses):
        return [("RA" if pp.stack[1] == "right" else "LA") if len(pp.buffer) == 0 else "S"
                for pp in partial_parses]


def test_dependencies(name, deps, ex_deps):
    """Tests the provided dependencies match the expected dependencies"""
    deps = tuple(sorted(deps))
    assert deps == ex_deps, \
        "{:} test resulted in dependency list {:}, expected {:}".format(name, deps, ex_deps)


def test_minibatch_parse():
    """Simple tests for the minibatch_parse function
    Warning: these are not exhaustive
    """
    sentences = [["right", "arcs", "only"],
                 ["right", "arcs", "only", "again"],
                 ["left", "arcs", "only"],
                 ["left", "arcs", "only", "again"]]
    deps = minibatch_parse(sentences, DummyModel(), 2)
    test_dependencies("minibatch_parse", deps[0],
                      (('ROOT', 'right'), ('arcs', 'only'), ('right', 'arcs')))
    test_dependencies("minibatch_parse", deps[1],
                      (('ROOT', 'right'), ('arcs', 'only'), ('only', 'again'), ('right', 'arcs')))
    test_dependencies("minibatch_parse", deps[2],
                      (('only', 'ROOT'), ('only', 'arcs'), ('only', 'left')))
    test_dependencies("minibatch_parse", deps[3],
                      (('again', 'ROOT'), ('again', 'arcs'), ('again', 'left'), ('again', 'only')))
    print("minibatch_parse test passed!")


if __name__ == '__main__':
    args = sys.argv
    if len(args) != 2:
        raise Exception("You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script")
    elif args[1] == "part_c":
        test_parse_step()
        test_parse()
    elif args[1] == "part_d":
        test_minibatch_parse()
    else:
        raise Exception("You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script")
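To see the transition system in action, here is a small trace of the sequence used in test_parse(), printing the stack, buffer, and dependencies after each step (illustrative only, not part of this commit; it assumes parser_transitions.py is importable from the working directory):

# Illustrative trace of the shift-reduce transitions used in test_parse().
from parser_transitions import PartialParse

pp = PartialParse(["parse", "this", "sentence"])
for t in ["S", "S", "S", "LA", "RA", "RA"]:
    pp.parse_step(t)
    print(t, "stack:", pp.stack, "buffer:", pp.buffer, "deps:", pp.dependencies)
# Final dependencies: [('sentence', 'this'), ('parse', 'sentence'), ('ROOT', 'parse')]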
158     [finished]Assignment_3_neural_dependency_parsing/run.py                 new file
@@ -0,0 +1,158 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CS224N 2018-19: Homework 3
run.py: Run the dependency parser.
Sahil Chopra <schopra8@stanford.edu>
"""
from datetime import datetime
import os
import pickle
import math
import time

from torch import nn, optim
import torch
from tqdm import tqdm

from parser_model import ParserModel
from utils.parser_utils import minibatches, load_and_preprocess_data, AverageMeter


# -----------------
# Primary Functions
# -----------------
def train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005):
    """ Train the neural dependency parser.

    @param parser (Parser): Neural Dependency Parser
    @param train_data ():
    @param dev_data ():
    @param output_path (str): Path to which model weights and results are written.
    @param batch_size (int): Number of examples in a single batch
    @param n_epochs (int): Number of training epochs
    @param lr (float): Learning rate
    """
    best_dev_UAS = 0

    ### YOUR CODE HERE (~2-7 lines)
    ### TODO:
    ###     1) Construct Adam Optimizer in variable `optimizer`
    ###     2) Construct the Cross Entropy Loss Function in variable `loss_func`
    ###
    ### Hint: Use `parser.model.parameters()` to pass the optimizer
    ###       the necessary parameters to tune.
    ### Please see the following docs for support:
    ###     Adam Optimizer: https://pytorch.org/docs/stable/optim.html
    ###     Cross Entropy Loss: https://pytorch.org/docs/stable/nn.html#crossentropyloss
    optimizer = optim.Adam(parser.model.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()
    ### END YOUR CODE

    for epoch in range(n_epochs):
        print("Epoch {:} out of {:}".format(epoch + 1, n_epochs))
        dev_UAS = train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size)
        if dev_UAS > best_dev_UAS:
            best_dev_UAS = dev_UAS
            print("New best dev UAS! Saving model.")
            torch.save(parser.model.state_dict(), output_path)
        print("")


def train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size):
    """ Train the neural dependency parser for a single epoch.

    Note: In PyTorch we can signify train versus test and automatically have
    the Dropout Layer applied and removed, accordingly, by specifying
    whether we are training, `model.train()`, or evaluating, `model.eval()`

    @param parser (Parser): Neural Dependency Parser
    @param train_data ():
    @param dev_data ():
    @param optimizer (nn.Optimizer): Adam Optimizer
    @param loss_func (nn.CrossEntropyLoss): Cross Entropy Loss Function
    @param batch_size (int): batch size
    @param lr (float): learning rate

    @return dev_UAS (float): Unlabeled Attachment Score (UAS) for dev data
    """
    parser.model.train()  # Places model in "train" mode, i.e. apply dropout layer
    n_minibatches = math.ceil(len(train_data) / batch_size)
    loss_meter = AverageMeter()

    with tqdm(total=(n_minibatches)) as prog:
        for i, (train_x, train_y) in enumerate(minibatches(train_data, batch_size)):
            optimizer.zero_grad()  # remove any baggage in the optimizer
            loss = 0.              # store loss for this batch here
            train_x = torch.from_numpy(train_x).long()
            train_y = torch.from_numpy(train_y.nonzero()[1]).long()

            ### YOUR CODE HERE (~5-10 lines)
            ### TODO:
            ###     1) Run train_x forward through model to produce `logits`
            ###     2) Use the `loss_func` parameter to apply the PyTorch CrossEntropyLoss function.
            ###        This will take `logits` and `train_y` as inputs. It will output the CrossEntropyLoss
            ###        between softmax(`logits`) and `train_y`. Remember that softmax(`logits`)
            ###        are the predictions (y^ from the PDF).
            ###     3) Backprop losses
            ###     4) Take step with the optimizer
            ### Please see the following docs for support:
            ###     Optimizer Step: https://pytorch.org/docs/stable/optim.html#optimizer-step
            logits = parser.model(train_x)     # forward pass: (batch_size, n_classes)
            loss = loss_func(logits, train_y)  # cross-entropy against gold transition indices
            loss.backward()                    # backpropagate
            optimizer.step()                   # parameter update
            ### END YOUR CODE
            prog.update(1)
            loss_meter.update(loss.item())

    print("Average Train Loss: {}".format(loss_meter.avg))

    print("Evaluating on dev set")
    parser.model.eval()  # Places model in "eval" mode, i.e. don't apply dropout layer
    dev_UAS, _ = parser.parse(dev_data)
    print("- dev UAS: {:.2f}".format(dev_UAS * 100.0))
    return dev_UAS


if __name__ == "__main__":
    # Note: Set debug to False when training on the entire corpus
    # debug = True
    debug = False

    assert torch.__version__ == "1.0.0", "Please install torch version 1.0.0"

    print(80 * "=")
    print("INITIALIZING")
    print(80 * "=")
    parser, embeddings, train_data, dev_data, test_data = load_and_preprocess_data(debug)

    start = time.time()
    model = ParserModel(embeddings)
    parser.model = model
    print("took {:.2f} seconds\n".format(time.time() - start))

    print(80 * "=")
    print("TRAINING")
    print(80 * "=")
    output_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
    output_path = output_dir + "model.weights"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005)

    if not debug:
        print(80 * "=")
        print("TESTING")
        print(80 * "=")
        print("Restoring the best model weights found on the dev set")
        parser.model.load_state_dict(torch.load(output_path))
        print("Final evaluation on test set")
        parser.model.eval()
        UAS, dependencies = parser.parse(test_data)
        print("- test UAS: {:.2f}".format(UAS * 100.0))
        print("Done!")
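One line in train_for_epoch deserves a note: minibatches() yields one-hot label rows, while nn.CrossEntropyLoss expects integer class indices, which is exactly what train_y.nonzero()[1] recovers. A standalone toy illustration (not from the commit; it assumes only NumPy and PyTorch):

# Toy illustration of the one-hot -> class-index conversion used in train_for_epoch.
import numpy as np
import torch
from torch import nn

one_hot = np.array([[0, 0, 1],
                    [1, 0, 0],
                    [0, 1, 0]], dtype='float32')
train_y = torch.from_numpy(one_hot.nonzero()[1]).long()  # tensor([2, 0, 1]): column index of each 1
logits = torch.randn(3, 3)                               # fake (batch_size, n_classes) model output
loss = nn.CrossEntropyLoss()(logits, train_y)            # softmax + negative log-likelihood in one call
print(train_y, loss.item())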
63      [finished]Assignment_3_neural_dependency_parsing/utils/general_utils.py  new file
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CS224N 2018-19: Homework 3
general_utils.py: General purpose utilities.
Sahil Chopra <schopra8@stanford.edu>
"""

import sys
import time
import numpy as np


def get_minibatches(data, minibatch_size, shuffle=True):
    """
    Iterates through the provided data one minibatch at a time. You can use this function to
    iterate through data in minibatches as follows:

        for inputs_minibatch in get_minibatches(inputs, minibatch_size):
            ...

    Or with multiple data sources:

        for inputs_minibatch, labels_minibatch in get_minibatches([inputs, labels], minibatch_size):
            ...

    Args:
        data: there are two possible values:
            - a list or numpy array
            - a list where each element is either a list or numpy array
        minibatch_size: the maximum number of items in a minibatch
        shuffle: whether to randomize the order of returned data
    Returns:
        minibatches: the return value depends on data:
            - If data is a list/array it yields the next minibatch of data.
            - If data is a list of lists/arrays it returns the next minibatch of each element in the
              list. This can be used to iterate through multiple data sources
              (e.g., features and labels) at the same time.
    """
    list_data = type(data) is list and (type(data[0]) is list or type(data[0]) is np.ndarray)
    data_size = len(data[0]) if list_data else len(data)
    indices = np.arange(data_size)
    if shuffle:
        np.random.shuffle(indices)
    for minibatch_start in np.arange(0, data_size, minibatch_size):
        minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size]
        yield [_minibatch(d, minibatch_indices) for d in data] if list_data \
            else _minibatch(data, minibatch_indices)


def _minibatch(data, minibatch_idx):
    return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx]


def test_all_close(name, actual, expected):
    if actual.shape != expected.shape:
        raise ValueError("{:} failed, expected output to have shape {:} but has shape {:}"
                         .format(name, expected.shape, actual.shape))
    if np.amax(np.fabs(actual - expected)) > 1e-6:
        raise ValueError("{:} failed, expected {:} but value is {:}".format(name, expected, actual))
    else:
        print(name, "passed!")
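A minimal usage sketch for get_minibatches with two aligned data sources, mirroring the docstring above (hypothetical snippet, not part of the commit; the utils.general_utils import path assumes the script is run from the assignment root):

# Hypothetical usage of get_minibatches with two aligned data sources.
import numpy as np
from utils.general_utils import get_minibatches

inputs = np.arange(10).reshape(10, 1)  # 10 toy feature rows
labels = np.arange(10)                 # aligned toy labels
for x_mb, y_mb in get_minibatches([inputs, labels], minibatch_size=4, shuffle=False):
    print(x_mb.ravel(), y_mb)          # batches of 4, 4, and 2 items, features and labels kept aligned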
422     [finished]Assignment_3_neural_dependency_parsing/utils/parser_utils.py  new file
@@ -0,0 +1,422 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CS224N 2018-19: Homework 3
parser_utils.py: Utilities for training the dependency parser.
Sahil Chopra <schopra8@stanford.edu>
"""

import time
import os
import logging
from collections import Counter

from .general_utils import get_minibatches
from parser_transitions import minibatch_parse

from tqdm import tqdm
import torch
import numpy as np

P_PREFIX = '<p>:'
L_PREFIX = '<l>:'
UNK = '<UNK>'
NULL = '<NULL>'
ROOT = '<ROOT>'


class Config(object):
    language = 'english'
    with_punct = True
    unlabeled = True
    lowercase = True
    use_pos = True
    use_dep = True
    use_dep = use_dep and (not unlabeled)
    data_path = './data'
    train_file = 'train.conll'
    dev_file = 'dev.conll'
    test_file = 'test.conll'
    embedding_file = './data/en-cw.txt'


class Parser(object):
    """Contains everything needed for transition-based dependency parsing except for the model"""

    def __init__(self, dataset):
        root_labels = list([l for ex in dataset
                            for (h, l) in zip(ex['head'], ex['label']) if h == 0])
        counter = Counter(root_labels)
        if len(counter) > 1:
            logging.info('Warning: more than one root label')
            logging.info(counter)
        self.root_label = counter.most_common()[0][0]
        deprel = [self.root_label] + list(set([w for ex in dataset
                                               for w in ex['label']
                                               if w != self.root_label]))
        tok2id = {L_PREFIX + l: i for (i, l) in enumerate(deprel)}
        tok2id[L_PREFIX + NULL] = self.L_NULL = len(tok2id)

        config = Config()
        self.unlabeled = config.unlabeled
        self.with_punct = config.with_punct
        self.use_pos = config.use_pos
        self.use_dep = config.use_dep
        self.language = config.language

        if self.unlabeled:
            trans = ['L', 'R', 'S']
            self.n_deprel = 1
        else:
            trans = ['L-' + l for l in deprel] + ['R-' + l for l in deprel] + ['S']
            self.n_deprel = len(deprel)

        self.n_trans = len(trans)
        self.tran2id = {t: i for (i, t) in enumerate(trans)}
        self.id2tran = {i: t for (i, t) in enumerate(trans)}

        # logging.info('Build dictionary for part-of-speech tags.')
        tok2id.update(build_dict([P_PREFIX + w for ex in dataset for w in ex['pos']],
                                 offset=len(tok2id)))
        tok2id[P_PREFIX + UNK] = self.P_UNK = len(tok2id)
        tok2id[P_PREFIX + NULL] = self.P_NULL = len(tok2id)
        tok2id[P_PREFIX + ROOT] = self.P_ROOT = len(tok2id)

        # logging.info('Build dictionary for words.')
        tok2id.update(build_dict([w for ex in dataset for w in ex['word']],
                                 offset=len(tok2id)))
        tok2id[UNK] = self.UNK = len(tok2id)
        tok2id[NULL] = self.NULL = len(tok2id)
        tok2id[ROOT] = self.ROOT = len(tok2id)

        self.tok2id = tok2id
        self.id2tok = {v: k for (k, v) in tok2id.items()}

        self.n_features = 18 + (18 if config.use_pos else 0) + (12 if config.use_dep else 0)
        self.n_tokens = len(tok2id)

    def vectorize(self, examples):
        vec_examples = []
        for ex in examples:
            word = [self.ROOT] + [self.tok2id[w] if w in self.tok2id
                                  else self.UNK for w in ex['word']]
            pos = [self.P_ROOT] + [self.tok2id[P_PREFIX + w] if P_PREFIX + w in self.tok2id
                                   else self.P_UNK for w in ex['pos']]
            head = [-1] + ex['head']
            label = [-1] + [self.tok2id[L_PREFIX + w] if L_PREFIX + w in self.tok2id
                            else -1 for w in ex['label']]
            vec_examples.append({'word': word, 'pos': pos,
                                 'head': head, 'label': label})
        return vec_examples

    def extract_features(self, stack, buf, arcs, ex):
        if stack[0] == "ROOT":
            stack[0] = 0

        def get_lc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] < k])

        def get_rc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] > k],
                          reverse=True)

        p_features = []
        l_features = []
        features = [self.NULL] * (3 - len(stack)) + [ex['word'][x] for x in stack[-3:]]
        features += [ex['word'][x] for x in buf[:3]] + [self.NULL] * (3 - len(buf))
        if self.use_pos:
            p_features = [self.P_NULL] * (3 - len(stack)) + [ex['pos'][x] for x in stack[-3:]]
            p_features += [ex['pos'][x] for x in buf[:3]] + [self.P_NULL] * (3 - len(buf))

        for i in range(2):
            if i < len(stack):
                k = stack[-i-1]
                lc = get_lc(k)
                rc = get_rc(k)
                llc = get_lc(lc[0]) if len(lc) > 0 else []
                rrc = get_rc(rc[0]) if len(rc) > 0 else []

                features.append(ex['word'][lc[0]] if len(lc) > 0 else self.NULL)
                features.append(ex['word'][rc[0]] if len(rc) > 0 else self.NULL)
                features.append(ex['word'][lc[1]] if len(lc) > 1 else self.NULL)
                features.append(ex['word'][rc[1]] if len(rc) > 1 else self.NULL)
                features.append(ex['word'][llc[0]] if len(llc) > 0 else self.NULL)
                features.append(ex['word'][rrc[0]] if len(rrc) > 0 else self.NULL)

                if self.use_pos:
                    p_features.append(ex['pos'][lc[0]] if len(lc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][rc[0]] if len(rc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][lc[1]] if len(lc) > 1 else self.P_NULL)
                    p_features.append(ex['pos'][rc[1]] if len(rc) > 1 else self.P_NULL)
                    p_features.append(ex['pos'][llc[0]] if len(llc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][rrc[0]] if len(rrc) > 0 else self.P_NULL)

                if self.use_dep:
                    l_features.append(ex['label'][lc[0]] if len(lc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][rc[0]] if len(rc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][lc[1]] if len(lc) > 1 else self.L_NULL)
                    l_features.append(ex['label'][rc[1]] if len(rc) > 1 else self.L_NULL)
                    l_features.append(ex['label'][llc[0]] if len(llc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][rrc[0]] if len(rrc) > 0 else self.L_NULL)
            else:
                features += [self.NULL] * 6
                if self.use_pos:
                    p_features += [self.P_NULL] * 6
                if self.use_dep:
                    l_features += [self.L_NULL] * 6

        features += p_features + l_features
        assert len(features) == self.n_features
        return features

    def get_oracle(self, stack, buf, ex):
        if len(stack) < 2:
            return self.n_trans - 1

        i0 = stack[-1]
        i1 = stack[-2]
        h0 = ex['head'][i0]
        h1 = ex['head'][i1]
        l0 = ex['label'][i0]
        l1 = ex['label'][i1]

        if self.unlabeled:
            if (i1 > 0) and (h1 == i0):
                return 0
            elif (i1 >= 0) and (h0 == i1) and \
                 (not any([x for x in buf if ex['head'][x] == i0])):
                return 1
            else:
                return None if len(buf) == 0 else 2
        else:
            if (i1 > 0) and (h1 == i0):
                return l1 if (l1 >= 0) and (l1 < self.n_deprel) else None
            elif (i1 >= 0) and (h0 == i1) and \
                 (not any([x for x in buf if ex['head'][x] == i0])):
                return l0 + self.n_deprel if (l0 >= 0) and (l0 < self.n_deprel) else None
            else:
                return None if len(buf) == 0 else self.n_trans - 1

    def create_instances(self, examples):
        all_instances = []
        succ = 0
        for id, ex in enumerate(examples):
            n_words = len(ex['word']) - 1

            # arcs = {(h, t, label)}
            stack = [0]
            buf = [i + 1 for i in range(n_words)]
            arcs = []
            instances = []
            for i in range(n_words * 2):
                gold_t = self.get_oracle(stack, buf, ex)
                if gold_t is None:
                    break
                legal_labels = self.legal_labels(stack, buf)
                assert legal_labels[gold_t] == 1
                instances.append((self.extract_features(stack, buf, arcs, ex),
                                  legal_labels, gold_t))
                if gold_t == self.n_trans - 1:
                    stack.append(buf[0])
                    buf = buf[1:]
                elif gold_t < self.n_deprel:
                    arcs.append((stack[-1], stack[-2], gold_t))
                    stack = stack[:-2] + [stack[-1]]
                else:
                    arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                    stack = stack[:-1]
            else:
                succ += 1
                all_instances += instances

        return all_instances

    def legal_labels(self, stack, buf):
        labels = ([1] if len(stack) > 2 else [0]) * self.n_deprel
        labels += ([1] if len(stack) >= 2 else [0]) * self.n_deprel
        labels += [1] if len(buf) > 0 else [0]
        return labels

    def parse(self, dataset, eval_batch_size=5000):
        sentences = []
        sentence_id_to_idx = {}
        for i, example in enumerate(dataset):
            n_words = len(example['word']) - 1
            sentence = [j + 1 for j in range(n_words)]
            sentences.append(sentence)
            sentence_id_to_idx[id(sentence)] = i

        model = ModelWrapper(self, dataset, sentence_id_to_idx)
        dependencies = minibatch_parse(sentences, model, eval_batch_size)

        UAS = all_tokens = 0.0
        with tqdm(total=len(dataset)) as prog:
            for i, ex in enumerate(dataset):
                head = [-1] * len(ex['word'])
                for h, t in dependencies[i]:
                    head[t] = h
                for pred_h, gold_h, gold_l, pos in \
                        zip(head[1:], ex['head'][1:], ex['label'][1:], ex['pos'][1:]):
                    assert self.id2tok[pos].startswith(P_PREFIX)
                    pos_str = self.id2tok[pos][len(P_PREFIX):]
                    if (self.with_punct) or (not punct(self.language, pos_str)):
                        UAS += 1 if pred_h == gold_h else 0
                        all_tokens += 1
                prog.update(i + 1)
        UAS /= all_tokens
        return UAS, dependencies


class ModelWrapper(object):
    def __init__(self, parser, dataset, sentence_id_to_idx):
        self.parser = parser
        self.dataset = dataset
        self.sentence_id_to_idx = sentence_id_to_idx

    def predict(self, partial_parses):
        mb_x = [self.parser.extract_features(p.stack, p.buffer, p.dependencies,
                                             self.dataset[self.sentence_id_to_idx[id(p.sentence)]])
                for p in partial_parses]
        mb_x = np.array(mb_x).astype('int32')
        mb_x = torch.from_numpy(mb_x).long()
        mb_l = [self.parser.legal_labels(p.stack, p.buffer) for p in partial_parses]

        pred = self.parser.model(mb_x)
        pred = pred.detach().numpy()
        pred = np.argmax(pred + 10000 * np.array(mb_l).astype('float32'), 1)
        pred = ["S" if p == 2 else ("LA" if p == 0 else "RA") for p in pred]
        return pred


def read_conll(in_file, lowercase=False, max_example=None):
    examples = []
    with open(in_file) as f:
        word, pos, head, label = [], [], [], []
        for line in f.readlines():
            sp = line.strip().split('\t')
            if len(sp) == 10:
                if '-' not in sp[0]:
                    word.append(sp[1].lower() if lowercase else sp[1])
                    pos.append(sp[4])
                    head.append(int(sp[6]))
                    label.append(sp[7])
            elif len(word) > 0:
                examples.append({'word': word, 'pos': pos, 'head': head, 'label': label})
                word, pos, head, label = [], [], [], []
                if (max_example is not None) and (len(examples) == max_example):
                    break
        if len(word) > 0:
            examples.append({'word': word, 'pos': pos, 'head': head, 'label': label})
    return examples


def build_dict(keys, n_max=None, offset=0):
    count = Counter()
    for key in keys:
        count[key] += 1
    ls = count.most_common() if n_max is None \
        else count.most_common(n_max)

    return {w[0]: index + offset for (index, w) in enumerate(ls)}


def punct(language, pos):
    if language == 'english':
        return pos in ["''", ",", ".", ":", "``", "-LRB-", "-RRB-"]
    elif language == 'chinese':
        return pos == 'PU'
    elif language == 'french':
        return pos == 'PUNC'
    elif language == 'german':
        return pos in ["$.", "$,", "$["]
    elif language == 'spanish':
        # http://nlp.stanford.edu/software/spanish-faq.shtml
        return pos in ["f0", "faa", "fat", "fc", "fd", "fe", "fg", "fh",
                       "fia", "fit", "fp", "fpa", "fpt", "fs", "ft",
                       "fx", "fz"]
    elif language == 'universal':
        return pos == 'PUNCT'
    else:
        raise ValueError('language: %s is not supported.' % language)


def minibatches(data, batch_size):
    x = np.array([d[0] for d in data])
    y = np.array([d[2] for d in data])
    one_hot = np.zeros((y.size, 3))
    one_hot[np.arange(y.size), y] = 1
    return get_minibatches([x, one_hot], batch_size)


def load_and_preprocess_data(reduced=True):
    config = Config()

    print("Loading data...")
    start = time.time()
    train_set = read_conll(os.path.join(config.data_path, config.train_file),
                           lowercase=config.lowercase)
    dev_set = read_conll(os.path.join(config.data_path, config.dev_file),
                         lowercase=config.lowercase)
    test_set = read_conll(os.path.join(config.data_path, config.test_file),
                          lowercase=config.lowercase)
    if reduced:
        train_set = train_set[:1000]
        dev_set = dev_set[:500]
        test_set = test_set[:500]
    print("took {:.2f} seconds".format(time.time() - start))

    print("Building parser...")
    start = time.time()
    parser = Parser(train_set)
    print("took {:.2f} seconds".format(time.time() - start))

    print("Loading pretrained embeddings...")
    start = time.time()
    word_vectors = {}
    for line in open(config.embedding_file).readlines():
        sp = line.strip().split()
        word_vectors[sp[0]] = [float(x) for x in sp[1:]]
    embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (parser.n_tokens, 50)), dtype='float32')

    for token in parser.tok2id:
        i = parser.tok2id[token]
        if token in word_vectors:
            embeddings_matrix[i] = word_vectors[token]
        elif token.lower() in word_vectors:
            embeddings_matrix[i] = word_vectors[token.lower()]
    print("took {:.2f} seconds".format(time.time() - start))

    print("Vectorizing data...")
    start = time.time()
    train_set = parser.vectorize(train_set)
    dev_set = parser.vectorize(dev_set)
    test_set = parser.vectorize(test_set)
    print("took {:.2f} seconds".format(time.time() - start))

    print("Preprocessing training data...")
    start = time.time()
    train_examples = parser.create_instances(train_set)
    print("took {:.2f} seconds".format(time.time() - start))

    return parser, embeddings_matrix, train_examples, dev_set, test_set


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


if __name__ == '__main__':
    pass
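The masking trick in ModelWrapper.predict is worth isolating: adding a large constant to the scores of legal transitions means argmax can never select an illegal one, whatever the raw logits are. A toy NumPy illustration (not part of this commit; the transition order LA, RA, S follows the trans list above):

# Toy illustration of the legality masking used in ModelWrapper.predict.
import numpy as np

scores = np.array([[2.5, 0.1, -1.0],   # model logits for two configurations, columns = (LA, RA, S)
                   [0.3, 0.2,  0.9]])
legal = np.array([[0, 0, 1],           # e.g. only Shift is legal (stack too small for arcs)
                  [1, 1, 1]], dtype='float32')
pred = np.argmax(scores + 10000 * legal, 1)
print(pred)                            # [2 2] -> "S" for the first row despite its low raw score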