From 44d8bc66a2be340cef75ed072fe1334e9b3dd05a Mon Sep 17 00:00:00 2001
From: Duzeyao <330501241@qq.com>
Date: Wed, 20 Nov 2019 10:52:50 +0800
Subject: [PATCH] Fix gradient accumulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                     |  3 +--
 config/model_config_test.json | 10 ++++++++++
 train.py                      |  9 ++++-----
 3 files changed, 15 insertions(+), 7 deletions(-)
 create mode 100644 config/model_config_test.json

diff --git a/README.md b/README.md
index faf9cf2..f97290a 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,6 @@
 ## Project Status
 
 - The main architecture of the project is now stable. If you find any bugs or have suggestions for features or improvements, feel free to open an Issue or a PR, or to contact the author.
-- If gradient accumulation is used, the loss calculation may be buggy.
 
 ## Usage
 
@@ -61,7 +60,7 @@ python ./generate.py --length=50 --nsamples=4 --prefix=xxx --fast_pattern --save
 
 ## FP16 and Gradient Accumulation Support
 
-- I added fp16 and gradient accumulation support in train.py. If you have apex installed and know what fp16 is, you can set the variable fp16=True to enable it. However, fp16 currently does not converge, for reasons unknown.
+- I added fp16 and gradient accumulation support in train.py. If you have apex installed and know what fp16 is, you can set the variable fp16=True to enable it. However, fp16 currently may not converge, for reasons unknown.
 
 ## Contact the Author
 
diff --git a/config/model_config_test.json b/config/model_config_test.json
new file mode 100644
index 0000000..717da16
--- /dev/null
+++ b/config/model_config_test.json
@@ -0,0 +1,10 @@
+{
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "n_ctx": 64,
+  "n_embd": 128,
+  "n_head": 2,
+  "n_layer": 1,
+  "n_positions": 64,
+  "vocab_size": 13317
+}
\ No newline at end of file
diff --git a/train.py b/train.py
index bbab268..f08bebd 100644
--- a/train.py
+++ b/train.py
@@ -207,23 +207,22 @@ def main():
                     torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
 
                 # optimizer step
-                if (step + 1) % gradient_accumulation == 0:
+                if (overall_step + 1) % gradient_accumulation == 0:
                     running_loss += loss.item()
                     optimizer.step()
                     optimizer.zero_grad()
                     scheduler.step()
-                    overall_step += 1
-                    if (overall_step + 1) % log_step == 0:
-                        tb_writer.add_scalar('loss', loss.item(), overall_step)
                 if (overall_step + 1) % log_step == 0:
+                    tb_writer.add_scalar('loss', loss.item() * gradient_accumulation, overall_step)
                     print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                         datetime.now().hour,
                         datetime.now().minute,
                         step + 1,
                         piece_num,
                         epoch + 1,
-                        running_loss * gradient_accumulation / log_step))
+                        running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                     running_loss = 0
+                overall_step += 1
             piece_num += 1
 
             print('saving model for epoch {}'.format(epoch + 1))
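
For reference, below is a minimal runnable sketch of the accumulation and logging pattern this patch applies in train.py. The toy model, data, and hyperparameter values (gradient_accumulation=4, log_step=16) are illustrative assumptions and not the repository's code; only the control flow and the loss rescaling mirror the change above.

# Minimal runnable sketch of the accumulation/logging pattern in the patch.
# The toy model, data, and hyperparameter values are assumptions for
# illustration only, not the repository's code.
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100)

gradient_accumulation = 4
log_step = 16
running_loss = 0.0
overall_step = 0

for step in range(64):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    # the loss is divided by gradient_accumulation before backward(), as in train.py
    loss = torch.nn.functional.mse_loss(model(x), y) / gradient_accumulation
    loss.backward()
    # optimizer step: gate on overall_step, which counts every batch
    if (overall_step + 1) % gradient_accumulation == 0:
        running_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
    if (overall_step + 1) % log_step == 0:
        # multiply by gradient_accumulation to undo the earlier division, then
        # average over the log_step / gradient_accumulation optimizer steps
        # that contributed to running_loss in this window
        print('step {}, loss {:.4f}'.format(
            overall_step + 1,
            running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
        running_loss = 0.0
    overall_step += 1

The key point is that overall_step now advances on every batch, so the accumulation condition and the logging condition count batches on the same scale, and the logged value is averaged over the optimizer steps in each logging window.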