python bert code

chongjiu.jin
2019-12-13 14:52:07 +08:00
parent b68d3f1152
commit 6229486d35
8 changed files with 507 additions and 0 deletions


@@ -0,0 +1,22 @@
### Stanford / Winter 2019

Download the pretrained BERT files:

Google's TensorFlow BERT (needs conversion to PyTorch format):
https://github.com/google-research/bert
https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip

Chinese BERT:
https://github.com/ymcui/Chinese-BERT-wwm/blob/master/README_EN.md
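
A minimal download-and-unpack sketch (an illustration using only the Python standard library; the zip expands into a multilingual_L-12_H-768_A-12/ directory whose contents still need the TF-to-PyTorch conversion shown further below):

import urllib.request, zipfile

url = "https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip"
zip_path = "multilingual_L-12_H-768_A-12.zip"

# Download the TensorFlow checkpoint archive and unpack it; it holds
# bert_model.ckpt.*, bert_config.json and vocab.txt.
urllib.request.urlretrieve(url, zip_path)
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(".")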
For questions about NLP job interviews, follow the WeChat official account:
![flypython WeChat official account](https://flypython.com/images/wechat.png)


@@ -0,0 +1,89 @@
#https://github.com/huggingface/transformers
#https://huggingface.co/transformers/quickstart.html
#BERT example
#pip install transformers
#(formerly the pytorch_transformers package)
import torch
import torch.nn as nn
from transformers import BertConfig, BertModel
from transformers import BertTokenizer as tokenization
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT directory; expected files:
#   pytorch_model.bin
#   config.json
#   vocab.txt
bert_path = './bert'
do_lower_case = True
bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')

# Load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)
# Load the vocabulary
tokenizer = tokenization(vocab_file=vocab_file, do_lower_case=do_lower_case)
# Load the model
model_bert = BertModel.from_pretrained(bert_path)
model_bert.to(device)
# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']

# Convert tokens to vocabulary indices
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length]
# holding the token indices in the vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see the BERT paper)
# segment_ids: optional torch.LongTensor of shape [batch_size, sequence_length]
# with token type indices in [0, 1]; type 0 marks sentence A, type 1 sentence B
segment_ids = [0] * len(input_ids)
# input_mask: optional torch.LongTensor of shape [batch_size, sequence_length]
# with values in [0, 1]; 1 marks real tokens, 0 marks padding
input_mask = [1] * len(input_ids)

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; may be omitted
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)

# Forward pass
all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask, token_type_ids=segments_tensors)
# all_encoder_layer: a torch.FloatTensor of shape [batch_size, sequence_length, hidden_size]
# holding the hidden states of the last encoder layer. (The older
# pytorch_pretrained_bert API instead returned the list of all layers:
# 12 for BERT-base, 24 for BERT-large.)
# pooled_output: a torch.FloatTensor of shape [batch_size, hidden_size], the output of a
# classifier pretrained on top of the first token's ([CLS]) hidden state for the
# next-sentence prediction task (see the BERT paper).
# To extract embeddings, use only all_encoder_layer
print('all_encoder_layer', all_encoder_layer.shape)
print('pooled_output', pooled_output.size())
# For classification, use pooled_output
# Padding to a fixed length
max_seq_length = 300
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())

model_bert.eval()
with torch.no_grad():
    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
print('padding all_encoder_layer', all_encoder_layer.shape)
print('padding pooled_output', pooled_output.size())
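
If sentence embeddings are wanted rather than classification, one option is mask-aware mean pooling over all_encoder_layer. A minimal sketch reusing the tensors above (the pooling choice is an illustration, not something this repo prescribes):

# Mask-aware mean pooling: zero out padded positions, then average the rest.
mask = input_mask.unsqueeze(-1).float()         # [1, max_seq_length, 1]
summed = (all_encoder_layer * mask).sum(dim=1)  # [1, hidden_size]
counts = mask.sum(dim=1).clamp(min=1e-9)        # avoid division by zero
sentence_embedding = summed / counts            # [1, hidden_size]
print('sentence_embedding', sentence_embedding.size())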

pytorch-bert-code/bert.py

@@ -0,0 +1,53 @@
# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
# from pytorch_pretrained_bert import BertModel, BertTokenizer
from transformers import BertModel, BertTokenizer


class Config(object):
    """Configuration parameters."""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'
        self.dev_path = dataset + '/data/dev.txt'
        self.test_path = dataset + '/data/test.txt'
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.num_classes = len(self.class_list)
        self.num_epochs = 3
        self.batch_size = 128
        self.pad_size = 32
        self.learning_rate = 5e-5
        self.require_improvement = 1000  # stop early if dev loss has not improved for this many batches
        self.bert_path = './bert'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True  # fine-tune all BERT weights
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        # x is a batch from utils.DatasetIterater: (token_ids, seq_len, mask),
        # where mask is 1 for real tokens and 0 for padding, e.g. [1, 1, 1, 1, 0, 0]
        input_ids, seq_len, input_mask = x
        # pooled: [batch_size, hidden_size]
        _, pooled = self.bert(input_ids, attention_mask=input_mask)
        out = self.fc(pooled)
        return out

    def loss(self, outputs, labels):
        return F.cross_entropy(outputs, labels)
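
A minimal smoke-test sketch for this Model (assumptions: ./bert holds pytorch_model.bin, config.json and vocab.txt; ./data/class.txt exists; the random token ids are placeholders, not real vocabulary entries):

import torch
from bert import Config, Model

config = Config('.')  # dataset directory containing data/class.txt
model = Model(config).to(config.device)

batch_size, pad_size = 4, config.pad_size
token_ids = torch.randint(0, 100, (batch_size, pad_size)).to(config.device)
seq_len = torch.full((batch_size,), pad_size, dtype=torch.long).to(config.device)
mask = torch.ones(batch_size, pad_size, dtype=torch.long).to(config.device)

logits = model((token_ids, seq_len, mask))
print(logits.shape)  # [batch_size, num_classes]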


@@ -0,0 +1,105 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert BERT checkpoint."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import argparse

import tensorflow as tf
import torch
import numpy as np

from modeling import BertConfig, BertModel

parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--tf_checkpoint_path",
                    default=None,
                    type=str,
                    required=True,
                    help="Path to the TensorFlow checkpoint.")
parser.add_argument("--bert_config_file",
                    default=None,
                    type=str,
                    required=True,
                    help="The config json file corresponding to the pre-trained BERT model. \n"
                         "This specifies the model architecture.")
parser.add_argument("--pytorch_dump_path",
                    default=None,
                    type=str,
                    required=True,
                    help="Path to the output PyTorch model.")
args = parser.parse_args()


def convert():
    # Initialise PyTorch model
    config = BertConfig.from_json_file(args.bert_config_file)
    model = BertModel(config)

    # Load weights from the TF checkpoint
    path = args.tf_checkpoint_path
    print("Converting TensorFlow checkpoint from {}".format(path))
    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading {} with shape {}".format(name, shape))
        array = tf.train.load_variable(path, name)
        print("Numpy array shape {}".format(array.shape))
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name[5:]  # skip "bert/"
        print("Loading {}".format(name))
        name = name.split('/')
        # After stripping the first five characters, "cls/predictions" becomes
        # "redictions" and "cls/seq_relationship" becomes "eq_relationship";
        # these pretraining heads are not part of BertModel, so skip them.
        if name[0] in ['redictions', 'eq_relationship']:
            print("Skipping")
            continue
        pointer = model
        for m_name in name:
            # Variables such as "layer_11" index into a module list.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            # TF dense kernels are [in, out]; PyTorch Linear weights are [out, in].
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        pointer.data = torch.from_numpy(array)

    # Save the PyTorch model
    torch.save(model.state_dict(), args.pytorch_dump_path)


if __name__ == "__main__":
    convert()


@@ -0,0 +1 @@
python3 convert_tf_checkpoint_to_pytorch.py --tf_checkpoint_path bert_model.ckpt --bert_config_file bert_config.json --pytorch_dump_path bert_model.bin
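
A sanity-check sketch for the dump (assumptions: the converted weights were written to bert_model.bin as in the command above, and modeling is the same local module the converter imports):

import torch
from modeling import BertConfig, BertModel

config = BertConfig.from_json_file('bert_config.json')
model = BertModel(config)

# load_state_dict raises if any converted tensor name or shape disagrees
# with the freshly initialised model.
state_dict = torch.load('bert_model.bin', map_location='cpu')
model.load_state_dict(state_dict)
print('checkpoint loaded: {} tensors'.format(len(state_dict)))

To use the result with transformers' BertModel.from_pretrained, the weights file is conventionally renamed pytorch_model.bin and placed next to config.json and vocab.txt.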

pytorch-bert-code/run.py

@@ -0,0 +1,36 @@
# coding: UTF-8
import time
import torch
import numpy as np
from train_eval import train
import argparse
from utils import build_dataset, build_iterator, get_time_dif
import bert

parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model', type=str, required=False, help='choose a model: Bert, ERNIE')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = '.'  # dataset directory
    model_name = 'bert'  # args.model
    x = bert
    config = x.Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    # torch.cuda.manual_seed_all(1)
    # torch.backends.cudnn.deterministic = True  # make runs reproducible
    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)


@@ -0,0 +1,104 @@
# coding: UTF-8
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from utils import get_time_dif
from transformers.optimization import AdamW


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=config.learning_rate,
                      )
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last dev-loss improvement
    flag = False  # whether training has stalled for too long
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # Every 100 batches, report accuracy on the training batch and the dev set.
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # Stop if the dev loss has not improved for require_improvement (1000) batches.
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)


def test(config, model, test_iter):
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss.item()
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
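
To see what the no_decay grouping in train() does, a toy sketch (the Toy module is hypothetical; its LayerNorm attribute name mirrors how LayerNorm appears in BERT parameter names):

import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(4, 4)
        self.LayerNorm = nn.LayerNorm(4)

toy = Toy()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decay = [n for n, p in toy.named_parameters() if not any(nd in n for nd in no_decay)]
nodecay = [n for n, p in toy.named_parameters() if any(nd in n for nd in no_decay)]
print(decay)    # ['dense.weight'] -> gets weight_decay=0.01
print(nodecay)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias'] -> weight_decay=0.0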


@@ -0,0 +1,97 @@
# coding: UTF-8
import torch
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]'  # padding token; [CLS] aggregates sentence-level information in BERT


def build_dataset(config):
    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test


class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # True if the dataset size is not an exact multiple of batch_size
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # length before padding (capped at pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            # final partial batch
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    return DatasetIterater(dataset, config.batch_size, config.device)


def get_time_dif(start_time):
    """Elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
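
A toy sketch of the DatasetIterater contract, with hand-built (token_ids, label, seq_len, mask) tuples standing in for tokenizer output:

import torch
from utils import DatasetIterater

# Five fake examples padded to length 4.
data = [([1, 2, 3, 0], 0, 3, [1, 1, 1, 0]) for _ in range(5)]
it = DatasetIterater(data, batch_size=2, device=torch.device('cpu'))

print(len(it))  # 3: two full batches plus one residue batch
for (x, seq_len, mask), y in it:
    print(x.shape, y.shape)  # e.g. torch.Size([2, 4]) torch.Size([2])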