From 9a4331868a0441ee8684e5a44c064f82d6c28ffe Mon Sep 17 00:00:00 2001
From: Brandon Yang
Date: Wed, 14 Aug 2024 02:23:25 -0700
Subject: [PATCH] Sum instead of average for LayerNorm gradient all reduce

---
 megatron/model/utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/megatron/model/utils.py b/megatron/model/utils.py
index 65e4202ec..162b1e218 100644
--- a/megatron/model/utils.py
+++ b/megatron/model/utils.py
@@ -370,8 +370,6 @@ def reduce_weight_grads_from_model_parallel_region(input_):
     # All-reduce.
     torch.distributed.all_reduce(input_, group=mpu.get_model_parallel_group())
 
-    # average grads
-    input_ = input_ / mpu.get_model_parallel_world_size()
 
     # Bf16 convert
     if dt == torch.bfloat16 and mpu.get_fp32_allreduce():