From 6229486d3598c8949c6ec1a04d9db4e5f9337df4 Mon Sep 17 00:00:00 2001
From: "chongjiu.jin"
Date: Fri, 13 Dec 2019 14:52:07 +0800
Subject: [PATCH] Add PyTorch BERT example and text classification code

---
 pytorch-bert-code/README.md                   |  22 ++++
 pytorch-bert-code/bert-example.py             |  89 +++++++++++++++
 pytorch-bert-code/bert.py                     |  53 +++++++++
 .../bert/convert_tf_checkpoint_to_pytorch.py  | 105 ++++++++++++++++++
 pytorch-bert-code/bert/run.sh                 |   1 +
 pytorch-bert-code/run.py                      |  36 ++++++
 pytorch-bert-code/train_eval.py               | 104 +++++++++++++++++
 pytorch-bert-code/utils.py                    |  97 ++++++++++++++++
 8 files changed, 507 insertions(+)
 create mode 100644 pytorch-bert-code/README.md
 create mode 100644 pytorch-bert-code/bert-example.py
 create mode 100644 pytorch-bert-code/bert.py
 create mode 100644 pytorch-bert-code/bert/convert_tf_checkpoint_to_pytorch.py
 create mode 100644 pytorch-bert-code/bert/run.sh
 create mode 100644 pytorch-bert-code/run.py
 create mode 100644 pytorch-bert-code/train_eval.py
 create mode 100644 pytorch-bert-code/utils.py

diff --git a/pytorch-bert-code/README.md b/pytorch-bert-code/README.md
new file mode 100644
index 0000000..deb8b7a
--- /dev/null
+++ b/pytorch-bert-code/README.md
@@ -0,0 +1,22 @@
+### Stanford / Winter 2019
+
+Download the pretrained BERT files.
+
+Google's TensorFlow BERT (must be converted to PyTorch before use):
+
+  https://github.com/google-research/bert
+
+  https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip
+
+
+Chinese BERT (whole word masking):
+
+  https://github.com/ymcui/Chinese-BERT-wwm/blob/master/README_EN.md
+
+
+
+
+
+For questions about NLP job interviews, follow the WeChat official account:
+
+![flypython WeChat official account](https://flypython.com/images/wechat.png)
\ No newline at end of file
diff --git a/pytorch-bert-code/bert-example.py b/pytorch-bert-code/bert-example.py
new file mode 100644
index 0000000..ff1999c
--- /dev/null
+++ b/pytorch-bert-code/bert-example.py
@@ -0,0 +1,89 @@
+# https://github.com/huggingface/transformers
+
+# https://huggingface.co/transformers/quickstart.html
+# BERT example
+
+# pip install transformers
+# (the package was previously published as pytorch_transformers)
+
+import torch
+import torch.nn as nn
+from transformers import BertConfig, BertModel
+from transformers.tokenization_bert import BertTokenizer as tokenization
+import os

+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# The pretrained BERT directory should contain:
+#   pytorch_model.bin
+#   config.json (bert_config.json in the Google release)
+#   vocab.txt
+bert_path = './bert'
+do_lower_case = True
+
+bert_config_file = os.path.join(bert_path, 'bert_config.json')
+vocab_file = os.path.join(bert_path, 'vocab.txt')
+init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')
+
+# Load the configuration
+bert_config = BertConfig.from_json_file(bert_config_file)
+
+# Load the vocabulary
+tokenizer = tokenization(vocab_file=vocab_file, do_lower_case=do_lower_case)
+
+# Load the model
+model_bert = BertModel.from_pretrained(bert_path)
+model_bert.to(device)
+
+
+# Tokenize input
+text = "乌兹别克斯坦议会立法院主席获连任"
+tokenized_text = tokenizer.tokenize(text)
+tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
+# Convert tokens to vocabulary indices
+# input_ids: torch.LongTensor of shape [batch_size, sequence_length] holding the token indices in the vocabulary
+input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
+# Define sentence A and B indices associated to the 1st and 2nd sentences (see the BERT paper)
+# segment_ids (token_type_ids): optional torch.LongTensor of shape [batch_size, sequence_length] with values in {0, 1}; 0 marks sentence A tokens, 1 marks sentence B tokens
+segment_ids = [0] * len(input_ids)
+# input_mask (attention_mask): optional torch.LongTensor of shape [batch_size, sequence_length] with values in {0, 1}; 1 marks real tokens, 0 marks padding
+input_mask = [1] * len(input_ids)
+
+# Convert inputs to PyTorch tensors
+input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
+print("input_ids", input_ids.size())
+input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; may be omitted
+segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)
+# Forward pass
+all_encoder_layer, pooled_output = model_bert(input_ids, input_mask, token_type_ids=segments_tensors)
+# all_encoder_layer: torch.FloatTensor of shape [batch_size, sequence_length, hidden_size],
+# the hidden states of the last encoder layer (the older pytorch-pretrained-bert API instead returned the
+# full list of layers: 12 for BERT-base, 24 for BERT-large)
+# pooled_output: torch.FloatTensor of shape [batch_size, hidden_size],
+# the hidden state of the first token ([CLS]) passed through a pretrained linear layer + tanh,
+# trained on the next-sentence prediction task (see the BERT paper)
+
+# For token-level embedding representations, use all_encoder_layer
+print('all_encoder_layer', all_encoder_layer.shape)
+print('pooled_output', pooled_output.size())
+# For sentence classification, use pooled_output
+
+# Padding example
+max_seq_length = 300
+
+text = "乌兹别克斯坦议会立法院主席获连任"
+tokenized_text = tokenizer.tokenize(text)
+tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
+input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
+input_mask = [1] * len(input_ids)
+
+padding = [0] * (max_seq_length - len(input_ids))
+input_ids += padding
+input_mask += padding
+input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
+input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
+print("padding input_ids", input_ids.size())
+
+model_bert.eval()
+with torch.no_grad():
+    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
+    print('padding all_encoder_layer', all_encoder_layer.shape)
+    print('padding pooled_output', pooled_output.size())
\ No newline at end of file
diff --git a/pytorch-bert-code/bert.py b/pytorch-bert-code/bert.py
new file mode 100644
index 0000000..f2990b2
--- /dev/null
+++ b/pytorch-bert-code/bert.py
@@ -0,0 +1,53 @@
+# coding: UTF-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# from pytorch_pretrained_bert import BertModel, BertTokenizer
+from transformers import BertModel, BertTokenizer
+
+
+class Config(object):
+
+    """Configuration parameters"""
+    def __init__(self, dataset):
+        self.model_name = 'bert'
+        self.train_path = dataset + '/data/train.txt'
+        self.dev_path = dataset + '/data/dev.txt'
+        self.test_path = dataset + '/data/test.txt'
+        self.class_list = [x.strip() for x in open(
+            dataset + '/data/class.txt').readlines()]
+        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+        self.require_improvement = 1000  # stop early if the dev loss has not improved for this many batches
+        self.num_classes = len(self.class_list)
+        self.num_epochs = 3
+        self.batch_size = 128
+        self.pad_size = 32  # every sequence is padded/truncated to this length
+        self.learning_rate = 5e-5
+        self.bert_path = './bert'
+        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
+        self.hidden_size = 768
+
+
+class Model(nn.Module):
+
+    def __init__(self, config):
+        super(Model, self).__init__()
+        self.bert = BertModel.from_pretrained(config.bert_path)
+        for param in self.bert.parameters():
+            param.requires_grad = True  # fine-tune all BERT weights
+        self.fc = nn.Linear(config.hidden_size, config.num_classes)
+
+    def forward(self, x):
+        # x is the (token_ids, seq_len, mask) tuple produced by DatasetIterater in utils.py
+        input_ids = x[0]   # input sentences, already padded to pad_size
+        input_mask = x[2]  # mask over the padding, same size as the sentence; padding positions are 0, e.g. [1, 1, 1, 1, 0, 0]
+        _, pooled = self.bert(input_ids, attention_mask=input_mask)  # pooled: [batch_size, hidden_size]
+        out = self.fc(pooled)
+        return out
+
+    def loss(self, outputs, labels):
+        criterion = F.cross_entropy
+        loss = criterion(outputs, labels)
+        return loss
diff --git a/pytorch-bert-code/bert/convert_tf_checkpoint_to_pytorch.py b/pytorch-bert-code/bert/convert_tf_checkpoint_to_pytorch.py
new file mode 100644
index 0000000..dfcdbee
--- /dev/null
+++ b/pytorch-bert-code/bert/convert_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert a TensorFlow BERT checkpoint to PyTorch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import argparse
+import tensorflow as tf
+import torch
+import numpy as np
+
+from modeling import BertConfig, BertModel
+
+parser = argparse.ArgumentParser()
+
+## Required parameters
+parser.add_argument("--tf_checkpoint_path",
+                    default = None,
+                    type = str,
+                    required = True,
+                    help = "Path to the TensorFlow checkpoint.")
+parser.add_argument("--bert_config_file",
+                    default = None,
+                    type = str,
+                    required = True,
+                    help = "The config json file corresponding to the pre-trained BERT model. \n"
+                           "This specifies the model architecture.")
+parser.add_argument("--pytorch_dump_path",
+                    default = None,
+                    type = str,
+                    required = True,
+                    help = "Path to the output PyTorch model.")
+
+args = parser.parse_args()
+
+def convert():
+    # Initialise PyTorch model
+    config = BertConfig.from_json_file(args.bert_config_file)
+    model = BertModel(config)
+
+    # Load weights from TF model
+    path = args.tf_checkpoint_path
+    print("Converting TensorFlow checkpoint from {}".format(path))
+
+    init_vars = tf.train.list_variables(path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        print("Loading {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(path, name)
+        print("Numpy array shape {}".format(array.shape))
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name[5:]  # skip the "bert/" prefix
+        print("Loading {}".format(name))
+        name = name.split('/')
+        if name[0] in ['redictions', 'eq_relationship']:  # cls/predictions and cls/seq_relationship after the 5-char strip
+            print("Skipping")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'kernel':
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        pointer.data = torch.from_numpy(array)
+
+    # Save the PyTorch model
+    torch.save(model.state_dict(), args.pytorch_dump_path)
+
+if __name__ == "__main__":
+    convert()
diff --git a/pytorch-bert-code/bert/run.sh b/pytorch-bert-code/bert/run.sh
new file mode 100644
index 0000000..24a7289
--- /dev/null
+++ b/pytorch-bert-code/bert/run.sh
@@ -0,0 +1 @@
+python3 convert_tf_checkpoint_to_pytorch.py --tf_checkpoint_path bert_model.ckpt --bert_config_file bert_config.json --pytorch_dump_path bert_model.bin
diff --git a/pytorch-bert-code/run.py b/pytorch-bert-code/run.py
new file mode 100644
index 0000000..5e9775e
--- /dev/null
+++ b/pytorch-bert-code/run.py
@@ -0,0 +1,36 @@
+# coding: UTF-8
+import time
+import torch
+import numpy as np
+from train_eval import train
+import argparse
+from utils import build_dataset, build_iterator, get_time_dif
+import bert
+
+parser = argparse.ArgumentParser(description='Chinese Text Classification')
+parser.add_argument('--model', type=str, required=False, help='choose a model: Bert, ERNIE')
+args = parser.parse_args()
+
+
+if __name__ == '__main__':
+    dataset = '.'  # dataset root directory
+
+    model_name = 'bert'  # args.model
+    x = bert
+    config = x.Config(dataset)
+    np.random.seed(1)
+    torch.manual_seed(1)
+    # torch.cuda.manual_seed_all(1)
+    # torch.backends.cudnn.deterministic = True  # make every run reproducible
+
+    start_time = time.time()
+    print("Loading data...")
+    train_data, dev_data, test_data = build_dataset(config)
+    train_iter = build_iterator(train_data, config)
+    dev_iter = build_iterator(dev_data, config)
+    test_iter = build_iterator(test_data, config)
+    time_dif = get_time_dif(start_time)
+    print("Time usage:", time_dif)
+
+    # train
+    model = x.Model(config).to(config.device)
+    train(config, model, train_iter, dev_iter, test_iter)
diff --git a/pytorch-bert-code/train_eval.py b/pytorch-bert-code/train_eval.py
new file mode 100644
index 0000000..5303307
--- /dev/null
+++ b/pytorch-bert-code/train_eval.py
@@ -0,0 +1,104 @@
+# coding: UTF-8
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from sklearn import metrics
+import time
+from utils import get_time_dif
+from transformers.optimization import AdamW
+
+
+def train(config, model, train_iter, dev_iter, test_iter):
+    start_time = time.time()
+    model.train()
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
+    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
+    optimizer = AdamW(optimizer_grouped_parameters,
+                      lr=config.learning_rate,
+                      )
+    total_batch = 0  # number of batches processed so far
+    dev_best_loss = float('inf')
+    last_improve = 0  # batch index of the last dev-loss improvement
+    flag = False  # whether training has gone too long without improvement
+    model.train()
+    for epoch in range(config.num_epochs):
+        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
+        for i, (trains, labels) in enumerate(train_iter):
+            outputs = model(trains)
+            model.zero_grad()
+            loss = F.cross_entropy(outputs, labels)
+            loss.backward()
+            optimizer.step()
+            if total_batch % 100 == 0:
+                # every 100 batches, report accuracy on the training and dev sets
+                true = labels.data.cpu()
+                predic = torch.max(outputs.data, 1)[1].cpu()
+                train_acc = metrics.accuracy_score(true, predic)
+                dev_acc, dev_loss = evaluate(config, model, dev_iter)
+                if dev_loss < dev_best_loss:
+                    dev_best_loss = dev_loss
+                    torch.save(model.state_dict(), config.save_path)
+                    improve = '*'
+                    last_improve = total_batch
+                else:
+                    improve = ''
+                time_dif = get_time_dif(start_time)
+                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
+                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
+                model.train()
+            total_batch += 1
+            if total_batch - last_improve > config.require_improvement:
+                # the dev loss has not dropped for more than require_improvement batches: stop training
+                print("No optimization for a long time, auto-stopping...")
+                flag = True
+                break
+        if flag:
+            break
+    test(config, model, test_iter)
+
+
+def test(config, model, test_iter):
+    # test
+    model.load_state_dict(torch.load(config.save_path))
+    model.eval()
+    start_time = time.time()
+    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
+    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
+    print(msg.format(test_loss, test_acc))
+    print("Precision, Recall and F1-Score...")
+    print(test_report)
+    print("Confusion Matrix...")
+    print(test_confusion)
+    time_dif = get_time_dif(start_time)
+    print("Time usage:", time_dif)
+
+
+def evaluate(config, model, data_iter, test=False):
+    model.eval()
+    loss_total = 0
+    predict_all = np.array([], dtype=int)
+    labels_all = np.array([], dtype=int)
+    with torch.no_grad():
+        for texts, labels in data_iter:
+            outputs = model(texts)
+            loss = F.cross_entropy(outputs, labels)
+            loss_total += loss
+            labels = labels.data.cpu().numpy()
+            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
+            labels_all = np.append(labels_all, labels)
+            predict_all = np.append(predict_all, predic)
+
+    acc = metrics.accuracy_score(labels_all, predict_all)
+    if test:
+        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
+        confusion = metrics.confusion_matrix(labels_all, predict_all)
+        return acc, loss_total / len(data_iter), report, confusion
+    return acc, loss_total / len(data_iter)
\ No newline at end of file
diff --git a/pytorch-bert-code/utils.py b/pytorch-bert-code/utils.py
new file mode 100644
index 0000000..697ba3b
--- /dev/null
+++ b/pytorch-bert-code/utils.py
@@ -0,0 +1,97 @@
+# coding: UTF-8
+import torch
+from tqdm import tqdm
+import time
+from datetime import timedelta
+
+PAD, CLS = '[PAD]', '[CLS]'  # padding token and BERT's sentence-summary token
+
+
+def build_dataset(config):
+
+    def load_dataset(path, pad_size=32):
+        contents = []
+        with open(path, 'r', encoding='UTF-8') as f:
+            for line in tqdm(f):
+                lin = line.strip()
+                if not lin:
+                    continue
+                content, label = lin.split('\t')
+                token = config.tokenizer.tokenize(content)
+                token = [CLS] + token
+                seq_len = len(token)
+                mask = []
+                token_ids = config.tokenizer.convert_tokens_to_ids(token)
+
+                if pad_size:
+                    if len(token) < pad_size:
+                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
+                        token_ids += ([0] * (pad_size - len(token)))
+                    else:
+                        mask = [1] * pad_size
+                        token_ids = token_ids[:pad_size]
+                        seq_len = pad_size
+                contents.append((token_ids, int(label), seq_len, mask))
+        return contents
+    train = load_dataset(config.train_path, config.pad_size)
+    dev = load_dataset(config.dev_path, config.pad_size)
+    test = load_dataset(config.test_path, config.pad_size)
+    return train, dev, test
+
+
+class DatasetIterater(object):
+    def __init__(self, batches, batch_size, device):
+        self.batch_size = batch_size
+        self.batches = batches
+        self.n_batches = len(batches) // batch_size
+        self.residue = False  # True if the last batch is smaller than batch_size
+        if len(batches) % batch_size != 0:
+            self.residue = True
+        self.index = 0
+        self.device = device
+
+    def _to_tensor(self, datas):
+        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
+        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
+
+        # sequence length before padding (capped at pad_size)
+        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
+        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
+        return (x, seq_len, mask), y
+
+    def __next__(self):
+        if self.residue and self.index == self.n_batches:
+            batches = self.batches[self.index * self.batch_size: len(self.batches)]
+            self.index += 1
+            batches = self._to_tensor(batches)
+            return batches
+
+        elif self.index >= self.n_batches:
+            self.index = 0
+            raise StopIteration
+        else:
+            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
+            self.index += 1
+            batches = self._to_tensor(batches)
+            return batches
+
+    def __iter__(self):
+        return self
+
+    def __len__(self):
+        if self.residue:
+            return self.n_batches + 1
+        else:
+            return self.n_batches
+
+
+def build_iterator(dataset, config):
+    iter = DatasetIterater(dataset, config.batch_size, config.device)
+    return iter
+
+
+def get_time_dif(start_time):
+    """Return the elapsed wall-clock time since start_time."""
+    end_time = time.time()
+    time_dif = end_time - start_time
+    return timedelta(seconds=int(round(time_dif)))
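
For reference, a minimal inference sketch that exercises the classes added above. It is only an illustration, not part of the patch: it assumes training has already produced `./saved_dict/bert.ckpt`, that `./data/class.txt` lists one label per line, and that the pretrained BERT files live in `./bert/`; the sample sentence is a placeholder.

```python
# Inference sketch (assumptions: ./saved_dict/bert.ckpt exists, ./data/class.txt
# lists the labels, ./bert/ holds the pretrained files; the input text is a placeholder).
import torch
import bert

config = bert.Config('.')
model = bert.Model(config).to(config.device)
model.load_state_dict(torch.load(config.save_path, map_location=config.device))
model.eval()

text = "乌兹别克斯坦议会立法院主席获连任"
tokens = (['[CLS]'] + config.tokenizer.tokenize(text))[:config.pad_size]
token_ids = config.tokenizer.convert_tokens_to_ids(tokens)
mask = [1] * len(token_ids) + [0] * (config.pad_size - len(token_ids))
token_ids = token_ids + [0] * (config.pad_size - len(token_ids))

# Build the same (token_ids, seq_len, mask) tuple that DatasetIterater feeds to Model.forward
x = (torch.LongTensor([token_ids]).to(config.device),
     torch.LongTensor([len(tokens)]).to(config.device),
     torch.LongTensor([mask]).to(config.device))
with torch.no_grad():
    logits = model(x)
print(config.class_list[int(logits.argmax(dim=1))])
```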