From 44d8bc66a2be340cef75ed072fe1334e9b3dd05a Mon Sep 17 00:00:00 2001
From: Duzeyao <330501241@qq.com>
Date: Wed, 20 Nov 2019 10:52:50 +0800
Subject: [PATCH] Fix gradient accumulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                     |  3 +--
 config/model_config_test.json | 10 ++++++++++
 train.py                      |  9 ++++-----
 3 files changed, 15 insertions(+), 7 deletions(-)
 create mode 100644 config/model_config_test.json

diff --git a/README.md b/README.md
index faf9cf2..f97290a 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,6 @@
 ## Project Status
 
 - The main architecture of the project is now stable. If you find any bugs or have suggestions for features or improvements, feel free to open an Issue or a PR, or to contact the author.
-- If gradient accumulation is used, the loss calculation may be buggy.
 
 ## Usage
 
@@ -61,7 +60,7 @@ python ./generate.py --length=50 --nsamples=4 --prefix=xxx --fast_pattern --save
 
 ## FP16 and Gradient Accumulation Support
 
-- I added fp16 and gradient accumulation support in train.py. If you have apex installed and know what fp16 is, you can set the variable fp16=True to enable it. However, fp16 currently does not converge, for reasons unknown.
+- I added fp16 and gradient accumulation support in train.py. If you have apex installed and know what fp16 is, you can set the variable fp16=True to enable it. However, fp16 currently may not converge, for reasons unknown.
 
 ## Contact the Author
 
diff --git a/config/model_config_test.json b/config/model_config_test.json
new file mode 100644
index 0000000..717da16
--- /dev/null
+++ b/config/model_config_test.json
@@ -0,0 +1,10 @@
+{
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "n_ctx": 64,
+  "n_embd": 128,
+  "n_head": 2,
+  "n_layer": 1,
+  "n_positions": 64,
+  "vocab_size": 13317
+}
\ No newline at end of file
diff --git a/train.py b/train.py
index bbab268..f08bebd 100644
--- a/train.py
+++ b/train.py
@@ -207,23 +207,22 @@ def main():
                     torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
 
                 # optimizer step
-                if (step + 1) % gradient_accumulation == 0:
+                if (overall_step + 1) % gradient_accumulation == 0:
                     running_loss += loss.item()
                     optimizer.step()
                     optimizer.zero_grad()
                     scheduler.step()
-                    overall_step += 1
-                    if (overall_step + 1) % log_step == 0:
-                        tb_writer.add_scalar('loss', loss.item(), overall_step)
                 if (overall_step + 1) % log_step == 0:
+                    tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                     print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                         datetime.now().hour,
                         datetime.now().minute,
                         step + 1,
                         piece_num,
                         epoch + 1,
-                        running_loss * gradient_accumulation / log_step))
+                        running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                     running_loss = 0
+                overall_step += 1
             piece_num += 1
 
             print('saving model for epoch {}'.format(epoch + 1))
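
For reference, below is a minimal runnable sketch of the accumulation and logging pattern this patch applies in train.py. The toy model, data, and hyperparameter values (gradient_accumulation=4, log_step=16) are illustrative assumptions and not the repository's code; only the control flow and the loss rescaling mirror the change above.

# Minimal runnable sketch of the accumulation/logging pattern in the patch.
# The toy model, data, and hyperparameter values are assumptions for
# illustration only, not the repository's code.
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100)

gradient_accumulation = 4
log_step = 16
running_loss = 0.0
overall_step = 0

for step in range(64):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    # the loss is divided by gradient_accumulation before backward(), as in train.py
    loss = torch.nn.functional.mse_loss(model(x), y) / gradient_accumulation
    loss.backward()
    # optimizer step: gate on overall_step, which counts every batch
    if (overall_step + 1) % gradient_accumulation == 0:
        running_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
    if (overall_step + 1) % log_step == 0:
        # multiply by gradient_accumulation to undo the earlier division, then
        # average over the log_step / gradient_accumulation optimizer steps
        # that contributed to running_loss in this window
        print('step {}, loss {:.4f}'.format(
            overall_step + 1,
            running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
        running_loss = 0.0
    overall_step += 1

The key point is that overall_step now advances on every batch, so the accumulation condition and the logging condition count batches on the same scale, and the logged value is averaged over the optimizer steps in each logging window.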