From 9a4331868a0441ee8684e5a44c064f82d6c28ffe Mon Sep 17 00:00:00 2001
From: Brandon Yang
Date: Wed, 14 Aug 2024 02:23:25 -0700
Subject: [PATCH] Sum instead of average for LayerNorm gradient all reduce

---
 megatron/model/utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/megatron/model/utils.py b/megatron/model/utils.py
index 65e4202ec..162b1e218 100644
--- a/megatron/model/utils.py
+++ b/megatron/model/utils.py
@@ -370,8 +370,6 @@ def reduce_weight_grads_from_model_parallel_region(input_):
     # All-reduce.
     torch.distributed.all_reduce(input_, group=mpu.get_model_parallel_group())
 
-    # average grads
-    input_ = input_ / mpu.get_model_parallel_world_size()
 
     # Bf16 convert
     if dt == torch.bfloat16 and mpu.get_fp32_allreduce():