python bert code
22
pytorch-bert-code/README.md
Normal file
@@ -0,0 +1,22 @@
### Stanford / Winter 2019

Download the BERT files.

Google TensorFlow BERT (needs to be converted to PyTorch format):

https://github.com/google-research/bert

https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip

Chinese BERT (whole-word-masking models):

https://github.com/ymcui/Chinese-BERT-wwm/blob/master/README_EN.md

For questions about NLP job interviews, follow the WeChat official account.
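The scripts in this commit load the pretrained weights from a local ./bert directory. A rough sketch of the expected layout, with file names inferred from bert-example.py and run.sh (recent transformers releases expect the configuration to be named config.json, so keeping a copy under that name is a reasonable precaution):

pytorch-bert-code/bert/
    vocab.txt           # WordPiece vocabulary
    bert_config.json    # configuration from the TF release (copy/rename to config.json for from_pretrained)
    pytorch_model.bin   # PyTorch weights produced by convert_tf_checkpoint_to_pytorch.py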
89
pytorch-bert-code/bert-example.py
Normal file
@@ -0,0 +1,89 @@
#https://github.com/huggingface/transformers

#https://huggingface.co/transformers/quickstart.html
#BERT example

#pip install transformers
#(the older package was called pytorch_transformers)

import torch
from transformers import BertConfig, BertModel, BertTokenizer
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#get_bert_model
# pretrained BERT files expected under bert_path:
#   pytorch_model.bin
#   config.json
#   vocab.txt
bert_path = './bert'
do_lower_case = True

bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')

# load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)

# load the vocabulary
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

# load the model
model_bert = BertModel.from_pretrained(bert_path)
model_bert.to(device)


# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
# Convert tokens to vocabulary indices
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length] holding the token indices in the vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see the BERT paper)
# segment_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with token type indices in [0, 1];
#              type 0 corresponds to sentence A and type 1 to sentence B
segment_ids = [0] * len(input_ids)
# input_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with values in [0, 1];
#             1 marks real tokens and 0 marks padding
input_mask = [1] * len(input_ids)

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask, may be omitted
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)
# forward pass
sequence_output, pooled_output = model_bert(input_ids, input_mask, token_type_ids=segments_tensors)
# sequence_output: a torch.FloatTensor of shape [batch_size, sequence_length, hidden_size],
#                  the hidden states of the last encoder layer for every token
# pooled_output: a torch.FloatTensor of shape [batch_size, hidden_size],
#                the output of a classifier pretrained on top of the hidden state of the first token ([CLS]),
#                used for the next-sentence prediction task (see the BERT paper)

# to get token-level embeddings, use sequence_output
print('sequence_output', sequence_output.shape)
print('pooled_output', pooled_output.size())
# for sentence classification, use pooled_output

#padding
max_seq_length = 300

text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)

padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())

model_bert.eval()
with torch.no_grad():
    sequence_output, pooled_output = model_bert(input_ids, attention_mask=input_mask)
print('padding sequence_output', sequence_output.shape)
print('padding pooled_output', pooled_output.size())
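Note: the tuple unpacking above matches the transformers 2.x/3.x behaviour, where BertModel returns (sequence_output, pooled_output). On a 4.x install the model returns a ModelOutput object by default; a minimal sketch of the same forward pass under that assumption:

# transformers >= 4.x: access the fields by name instead of unpacking a tuple
outputs = model_bert(input_ids, attention_mask=input_mask)
sequence_output = outputs.last_hidden_state  # [batch_size, sequence_length, hidden_size]
pooled_output = outputs.pooler_output        # [batch_size, hidden_size]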
53
pytorch-bert-code/bert.py
Normal file
@@ -0,0 +1,53 @@
# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
# from pytorch_pretrained_bert import BertModel, BertTokenizer
from transformers import BertModel, BertTokenizer


class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'
        self.dev_path = dataset + '/data/dev.txt'
        self.test_path = dataset + '/data/test.txt'
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='UTF-8').readlines()]
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.require_improvement = 1000  # stop early if the dev loss has not improved for this many batches
        self.num_classes = len(self.class_list)
        self.num_epochs = 3
        self.batch_size = 128
        self.pad_size = 32               # every sequence is padded/truncated to this length
        self.learning_rate = 5e-5
        self.bert_path = './bert'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768


class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True  # fine-tune all BERT parameters
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        # x is the batch built by utils.build_iterator: (token_ids, seq_len, mask)
        context = x[0]  # input sentences as token ids, [batch_size, pad_size]
        mask = x[2]     # mask over the padding, same size as the sentence; padded positions are 0, e.g. [1, 1, 1, 1, 0, 0]
        # token_type_ids is omitted: single-sentence classification only uses segment 0
        _, pooled = self.bert(context, attention_mask=mask)  # pooled: [batch_size, hidden_size]
        out = self.fc(pooled)
        return out

    def loss(self, outputs, labels):
        criterion = F.cross_entropy
        loss = criterion(outputs, labels)
        return loss
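A minimal usage sketch for the Config/Model pair above (it assumes ./data/class.txt and the ./bert checkpoint directory exist, and a transformers release where BertModel returns a tuple, as noted after bert-example.py; the batch shapes mirror what utils.build_iterator produces):

import torch
from bert import Config, Model

config = Config('.')                        # dataset root: ./data, ./saved_dict, ./bert
model = Model(config).to(config.device)

# a fake batch shaped like build_iterator output: (token_ids, seq_len, mask)
token_ids = torch.zeros(2, config.pad_size, dtype=torch.long, device=config.device)
seq_len = torch.tensor([4, 6], device=config.device)
mask = torch.zeros(2, config.pad_size, dtype=torch.long, device=config.device)
mask[0, :4] = 1
mask[1, :6] = 1

logits = model((token_ids, seq_len, mask))  # [batch_size, num_classes]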
105
pytorch-bert-code/bert/convert_tf_checkpoint_to_pytorch.py
Normal file
@@ -0,0 +1,105 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert BERT checkpoint."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import argparse
import tensorflow as tf
import torch
import numpy as np

# requires a local modeling.py that defines BertConfig and BertModel
# (the original BERT-to-PyTorch port); this is not the transformers package
from modeling import BertConfig, BertModel

parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--tf_checkpoint_path",
                    default = None,
                    type = str,
                    required = True,
                    help = "Path to the TensorFlow checkpoint.")
parser.add_argument("--bert_config_file",
                    default = None,
                    type = str,
                    required = True,
                    help = "The config json file corresponding to the pre-trained BERT model. \n"
                           "This specifies the model architecture.")
parser.add_argument("--pytorch_dump_path",
                    default = None,
                    type = str,
                    required = True,
                    help = "Path to the output PyTorch model.")

args = parser.parse_args()

def convert():
    # Initialise PyTorch model
    config = BertConfig.from_json_file(args.bert_config_file)
    model = BertModel(config)

    # Load weights from TF model
    path = args.tf_checkpoint_path
    print("Converting TensorFlow checkpoint from {}".format(path))

    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading {} with shape {}".format(name, shape))
        array = tf.train.load_variable(path, name)
        print("Numpy array shape {}".format(array.shape))
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name[5:]  # skip "bert/"
        print("Loading {}".format(name))
        name = name.split('/')
        # after dropping the first five characters, "cls/predictions" becomes "redictions" and
        # "cls/seq_relationship" becomes "eq_relationship"; these pre-training heads are not
        # part of BertModel, so they are skipped
        if name[0] in ['redictions', 'eq_relationship']:
            print("Skipping")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        pointer.data = torch.from_numpy(array)

    # Save the PyTorch model
    torch.save(model.state_dict(), args.pytorch_dump_path)

if __name__ == "__main__":
    convert()
1
pytorch-bert-code/bert/run.sh
Normal file
@@ -0,0 +1 @@
python3 convert_tf_checkpoint_to_pytorch.py --tf_checkpoint_path bert_model.ckpt --bert_config_file bert_config.json --pytorch_dump_path bert_model.bin
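Note: BertModel.from_pretrained('./bert') in the other scripts looks for pytorch_model.bin plus a config file and vocab.txt in that directory, so after running this conversion the output bert_model.bin will most likely need to be renamed or copied to pytorch_model.bin, and bert_config.json copied to config.json; verify the exact file names expected by your installed transformers version.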
36
pytorch-bert-code/run.py
Normal file
@@ -0,0 +1,36 @@
# coding: UTF-8
import time
import torch
import numpy as np
from train_eval import train
import argparse
from utils import build_dataset, build_iterator, get_time_dif
import bert

parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model', type=str, required=False, help='choose a model: Bert, ERNIE')
args = parser.parse_args()


if __name__ == '__main__':
    dataset = '.'  # dataset root directory

    model_name = 'bert'  # or args.model
    x = bert  # the module defining Config and Model
    config = x.Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    # torch.cuda.manual_seed_all(1)
    # torch.backends.cudnn.deterministic = True  # make runs reproducible

    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)
104
pytorch-bert-code/train_eval.py
Normal file
@@ -0,0 +1,104 @@
# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from utils import get_time_dif
from transformers.optimization import AdamW


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=config.learning_rate,
                      )
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has gone a long time without improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # every 100 batches, report metrics on the training and dev sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training if the dev loss has not improved for more than config.require_improvement batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)


def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
97
pytorch-bert-code/utils.py
Normal file
@@ -0,0 +1,97 @@
# coding: UTF-8
import torch
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]'  # padding symbol; [CLS] is BERT's sentence-level summary token


def build_dataset(config):

    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)

                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test


class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # True if the sample count is not an exact multiple of batch_size
        if len(batches) % self.batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        # sequence length before padding (capped at pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter


def get_time_dif(start_time):
    """Elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
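build_dataset above implies a simple data layout: train.txt, dev.txt and test.txt each hold one example per line as "<text>\t<label>", where <label> is an integer index into class.txt, and class.txt lists one class name per line (its length becomes num_classes). A made-up illustration of the format, reusing the sample sentence from bert-example.py (the label index and class names are hypothetical):

./data/train.txt (tab-separated):
乌兹别克斯坦议会立法院主席获连任	3

./data/class.txt (one class per line; the line position is the label index):
finance
sports
...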