python bert code

chongjiu.jin
2019-12-13 14:52:07 +08:00
parent b68d3f1152
commit 6229486d35
8 changed files with 507 additions and 0 deletions


@@ -0,0 +1,22 @@
### Stanford / Winter 2019

Download the pretrained BERT files:

Google's TensorFlow BERT (needs conversion to PyTorch format):
https://github.com/google-research/bert
https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip

Chinese BERT:
https://github.com/ymcui/Chinese-BERT-wwm/blob/master/README_EN.md
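
A minimal download-and-unpack sketch (an illustration using only the Python standard library; the zip expands into a multilingual_L-12_H-768_A-12/ directory whose contents still need the TF-to-PyTorch conversion shown further below):

import urllib.request, zipfile

url = "https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip"
zip_path = "multilingual_L-12_H-768_A-12.zip"

# Download the TensorFlow checkpoint archive and unpack it; it holds
# bert_model.ckpt.*, bert_config.json and vocab.txt.
urllib.request.urlretrieve(url, zip_path)
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(".")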
For questions about NLP job interviews, follow the WeChat official account:
![flypython WeChat official account](https://flypython.com/images/wechat.png)


@@ -0,0 +1,89 @@
#https://github.com/huggingface/transformers
#https://huggingface.co/transformers/quickstart.html
#BERT example
#pip install transformers
#(formerly the pytorch_transformers package)
import torch
import torch.nn as nn
from transformers import BertConfig, BertModel
from transformers import BertTokenizer as tokenization
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT directory; expected files:
#   pytorch_model.bin
#   config.json
#   vocab.txt
bert_path = './bert'
do_lower_case = True
bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')

# Load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)
# Load the vocabulary
tokenizer = tokenization(vocab_file=vocab_file, do_lower_case=do_lower_case)
# Load the model
model_bert = BertModel.from_pretrained(bert_path)
model_bert.to(device)
# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']

# Convert tokens to vocabulary indices
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length]
# holding the token indices in the vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see the BERT paper)
# segment_ids: optional torch.LongTensor of shape [batch_size, sequence_length]
# with token type indices in [0, 1]; type 0 marks sentence A, type 1 sentence B
segment_ids = [0] * len(input_ids)
# input_mask: optional torch.LongTensor of shape [batch_size, sequence_length]
# with values in [0, 1]; 1 marks real tokens, 0 marks padding
input_mask = [1] * len(input_ids)

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; may be omitted
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)

# Forward pass
all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask, token_type_ids=segments_tensors)
# all_encoder_layer: a torch.FloatTensor of shape [batch_size, sequence_length, hidden_size]
# holding the hidden states of the last encoder layer. (The older
# pytorch_pretrained_bert API instead returned the list of all layers:
# 12 for BERT-base, 24 for BERT-large.)
# pooled_output: a torch.FloatTensor of shape [batch_size, hidden_size], the output of a
# classifier pretrained on top of the first token's ([CLS]) hidden state for the
# next-sentence prediction task (see the BERT paper).
# To extract embeddings, use only all_encoder_layer
print('all_encoder_layer', all_encoder_layer.shape)
print('pooled_output', pooled_output.size())
# For classification, use pooled_output
# Padding to a fixed length
max_seq_length = 300
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())

model_bert.eval()
with torch.no_grad():
    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
print('padding all_encoder_layer', all_encoder_layer.shape)
print('padding pooled_output', pooled_output.size())
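
If sentence embeddings are wanted rather than classification, one option is mask-aware mean pooling over all_encoder_layer. A minimal sketch reusing the tensors above (the pooling choice is an illustration, not something this repo prescribes):

# Mask-aware mean pooling: zero out padded positions, then average the rest.
mask = input_mask.unsqueeze(-1).float()         # [1, max_seq_length, 1]
summed = (all_encoder_layer * mask).sum(dim=1)  # [1, hidden_size]
counts = mask.sum(dim=1).clamp(min=1e-9)        # avoid division by zero
sentence_embedding = summed / counts            # [1, hidden_size]
print('sentence_embedding', sentence_embedding.size())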

pytorch-bert-code/bert.py

@@ -0,0 +1,53 @@
# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
# from pytorch_pretrained_bert import BertModel, BertTokenizer
from transformers import BertModel, BertTokenizer


class Config(object):
    """Configuration parameters."""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'
        self.dev_path = dataset + '/data/dev.txt'
        self.test_path = dataset + '/data/test.txt'
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.num_classes = len(self.class_list)
        self.num_epochs = 3
        self.batch_size = 128
        self.pad_size = 32
        self.learning_rate = 5e-5
        self.require_improvement = 1000  # stop early if dev loss has not improved for this many batches
        self.bert_path = './bert'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True  # fine-tune all BERT weights
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        # x is a batch from utils.DatasetIterater: (token_ids, seq_len, mask),
        # where mask is 1 for real tokens and 0 for padding, e.g. [1, 1, 1, 1, 0, 0]
        input_ids, seq_len, input_mask = x
        # pooled: [batch_size, hidden_size]
        _, pooled = self.bert(input_ids, attention_mask=input_mask)
        out = self.fc(pooled)
        return out

    def loss(self, outputs, labels):
        return F.cross_entropy(outputs, labels)
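
A minimal smoke-test sketch for this Model (assumptions: ./bert holds pytorch_model.bin, config.json and vocab.txt; ./data/class.txt exists; the random token ids are placeholders, not real vocabulary entries):

import torch
from bert import Config, Model

config = Config('.')  # dataset directory containing data/class.txt
model = Model(config).to(config.device)

batch_size, pad_size = 4, config.pad_size
token_ids = torch.randint(0, 100, (batch_size, pad_size)).to(config.device)
seq_len = torch.full((batch_size,), pad_size, dtype=torch.long).to(config.device)
mask = torch.ones(batch_size, pad_size, dtype=torch.long).to(config.device)

logits = model((token_ids, seq_len, mask))
print(logits.shape)  # [batch_size, num_classes]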


@@ -0,0 +1,105 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert BERT checkpoint."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import argparse

import tensorflow as tf
import torch
import numpy as np

from modeling import BertConfig, BertModel

parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--tf_checkpoint_path",
                    default=None,
                    type=str,
                    required=True,
                    help="Path to the TensorFlow checkpoint.")
parser.add_argument("--bert_config_file",
                    default=None,
                    type=str,
                    required=True,
                    help="The config json file corresponding to the pre-trained BERT model. \n"
                         "This specifies the model architecture.")
parser.add_argument("--pytorch_dump_path",
                    default=None,
                    type=str,
                    required=True,
                    help="Path to the output PyTorch model.")
args = parser.parse_args()


def convert():
    # Initialise PyTorch model
    config = BertConfig.from_json_file(args.bert_config_file)
    model = BertModel(config)

    # Load weights from the TF checkpoint
    path = args.tf_checkpoint_path
    print("Converting TensorFlow checkpoint from {}".format(path))
    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading {} with shape {}".format(name, shape))
        array = tf.train.load_variable(path, name)
        print("Numpy array shape {}".format(array.shape))
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name[5:]  # skip "bert/"
        print("Loading {}".format(name))
        name = name.split('/')
        # After stripping the first five characters, "cls/predictions" becomes
        # "redictions" and "cls/seq_relationship" becomes "eq_relationship";
        # these pretraining heads are not part of BertModel, so skip them.
        if name[0] in ['redictions', 'eq_relationship']:
            print("Skipping")
            continue
        pointer = model
        for m_name in name:
            # Variables such as "layer_11" index into a module list.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            # TF dense kernels are [in, out]; PyTorch Linear weights are [out, in].
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        pointer.data = torch.from_numpy(array)

    # Save the PyTorch model
    torch.save(model.state_dict(), args.pytorch_dump_path)


if __name__ == "__main__":
    convert()


@@ -0,0 +1 @@
python3 convert_tf_checkpoint_to_pytorch.py --tf_checkpoint_path bert_model.ckpt --bert_config_file bert_config.json --pytorch_dump_path bert_model.bin
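
A sanity-check sketch for the dump (assumptions: the converted weights were written to bert_model.bin as in the command above, and modeling is the same local module the converter imports):

import torch
from modeling import BertConfig, BertModel

config = BertConfig.from_json_file('bert_config.json')
model = BertModel(config)

# load_state_dict raises if any converted tensor name or shape disagrees
# with the freshly initialised model.
state_dict = torch.load('bert_model.bin', map_location='cpu')
model.load_state_dict(state_dict)
print('checkpoint loaded: {} tensors'.format(len(state_dict)))

To use the result with transformers' BertModel.from_pretrained, the weights file is conventionally renamed pytorch_model.bin and placed next to config.json and vocab.txt.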

pytorch-bert-code/run.py

@@ -0,0 +1,36 @@
# coding: UTF-8
import time
import torch
import numpy as np
from train_eval import train
import argparse
from utils import build_dataset, build_iterator, get_time_dif
import bert

parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model', type=str, required=False, help='choose a model: Bert, ERNIE')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = '.'  # dataset directory
    model_name = 'bert'  # args.model
    x = bert
    config = x.Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    # torch.cuda.manual_seed_all(1)
    # torch.backends.cudnn.deterministic = True  # make runs reproducible
    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)


@@ -0,0 +1,104 @@
# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from utils import get_time_dif
from transformers.optimization import AdamW


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=config.learning_rate,
                      )
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last dev-loss improvement
    flag = False  # whether training has stalled for too long
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # Every 100 batches, report accuracy on the training batch and the dev set.
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # Stop if the dev loss has not improved for require_improvement (1000) batches.
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)


def test(config, model, test_iter):
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss.item()
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
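
To see what the no_decay grouping in train() does, a toy sketch (the Toy module is hypothetical; its LayerNorm attribute name mirrors how LayerNorm appears in BERT parameter names):

import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(4, 4)
        self.LayerNorm = nn.LayerNorm(4)

toy = Toy()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decay = [n for n, p in toy.named_parameters() if not any(nd in n for nd in no_decay)]
nodecay = [n for n, p in toy.named_parameters() if any(nd in n for nd in no_decay)]
print(decay)    # ['dense.weight'] -> gets weight_decay=0.01
print(nodecay)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias'] -> weight_decay=0.0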


@@ -0,0 +1,97 @@
# coding: UTF-8
import torch
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]'  # padding token; [CLS] aggregates sentence-level information in BERT


def build_dataset(config):
    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test


class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # True if the dataset size is not an exact multiple of batch_size
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # length before padding (capped at pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            # final partial batch
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    return DatasetIterater(dataset, config.batch_size, config.device)


def get_time_dif(start_time):
    """Elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
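
A toy sketch of the DatasetIterater contract, with hand-built (token_ids, label, seq_len, mask) tuples standing in for tokenizer output:

import torch
from utils import DatasetIterater

# Five fake examples padded to length 4.
data = [([1, 2, 3, 0], 0, 3, [1, 1, 1, 0]) for _ in range(5)]
it = DatasetIterater(data, batch_size=2, device=torch.device('cpu'))

print(len(it))  # 3: two full batches plus one residue batch
for (x, seq_len, mask), y in it:
    print(x.shape, y.shape)  # e.g. torch.Size([2, 4]) torch.Size([2])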