Sum instead of average for LayerNorm gradient all reduce
bclyang committed Aug 14, 2024
1 parent 651e24e commit 9a43318
Showing 1 changed file with 0 additions and 2 deletions.
2 changes: 0 additions & 2 deletions megatron/model/utils.py
@@ -370,8 +370,6 @@ def reduce_weight_grads_from_model_parallel_region(input_):

     # All-reduce.
     torch.distributed.all_reduce(input_, group=mpu.get_model_parallel_group())
-    # average grads
-    input_ = input_ / mpu.get_model_parallel_world_size()

     # Bf16 convert
     if dt == torch.bfloat16 and mpu.get_fp32_allreduce():
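The effect of this change can be illustrated without a distributed setup. For a LayerNorm parameter duplicated across model-parallel ranks, `torch.distributed.all_reduce` defaults to a SUM reduction; the deleted division by `mpu.get_model_parallel_world_size()` had been turning that sum into an average. A minimal sketch with hypothetical per-rank gradient values:

```python
# Simulate the gradient all-reduce across model-parallel ranks.
# Hypothetical gradients for a duplicated LayerNorm weight on two ranks.
rank_grads = [1.0, 3.0]
world_size = len(rank_grads)

# After this commit: all-reduce leaves the SUM of per-rank gradients.
summed = sum(rank_grads)

# Before this commit: the sum was additionally divided by the
# model-parallel world size, i.e. averaged.
averaged = summed / world_size

print(summed)    # 4.0
print(averaged)  # 2.0
```

Keeping the raw sum means each rank's gradient contribution is accumulated rather than averaged away, matching the reduction semantics used for the other duplicated-gradient paths.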
