Commit

Merge branch '812-megatron-seq-parallel' of https://github.com/EleutherAI/gpt-neox into 812-megatron-seq-parallel
haileyschoelkopf committed Aug 19, 2024
2 parents 8455de7 + aafbbce commit 9ce982e
Showing 3 changed files with 5 additions and 9 deletions.
7 changes: 4 additions & 3 deletions megatron/model/word_embeddings.py
@@ -51,7 +51,9 @@ def __init__(
         self.init_method = init_method
         self.num_tokentypes = num_tokentypes
 
-        self.sequence_parallel = neox_args.sequence_parallel  # if we are using sequence parallelism, then we'll want to scatter our inputs across the seqlen dim across TP ranks
+        self.sequence_parallel = (
+            neox_args.sequence_parallel
+        )  # if we are using sequence parallelism, then we'll want to scatter our inputs across the seqlen dim across TP ranks
 
         self.use_mup = neox_args.use_mup
         self.mup_embedding_mult = neox_args.mup_embedding_mult
@@ -163,10 +165,9 @@ def forward(self, input_ids, position_ids, tokentype_ids=None):
             embeddings.mul_(self.mup_embedding_mult)
 
         if self.sequence_parallel:
-            # TODO: megatron-lm does dropout using the scattered embs. This'd save a tiny bit of time, perhaps?
+            # TODO: megatron-lm does dropout using the scattered embs. This would save a tiny bit of time, perhaps?
            # Not a priority since we don't often use dropout
             embeddings = mpu.scatter_to_sequence_parallel_region(embeddings)
-            # pass
 
         return embeddings
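For context on the comment above: with sequence parallelism enabled, the embedding output is split along the sequence dimension so each tensor-parallel rank keeps only its own slice. Below is a minimal sketch of that idea using plain torch.distributed, assuming a [seq_len, batch, hidden] layout; the real mpu.scatter_to_sequence_parallel_region is an autograd function that also all-gathers gradients in backward, which this sketch omits.

```python
import torch
import torch.distributed as dist


def scatter_along_sequence_dim(embeddings: torch.Tensor, group=None) -> torch.Tensor:
    """Keep only this rank's slice of the sequence dimension.

    Illustrative sketch only: assumes a [seq_len, batch, hidden] layout with
    seq_len divisible by the tensor-parallel world size, and omits the
    autograd handling (all-gather in backward) that the real
    mpu.scatter_to_sequence_parallel_region provides.
    """
    world_size = dist.get_world_size(group=group)
    if world_size == 1:
        return embeddings
    rank = dist.get_rank(group=group)
    chunk = embeddings.shape[0] // world_size
    # Each rank keeps its own contiguous span of the sequence dimension.
    return embeddings[rank * chunk : (rank + 1) * chunk].contiguous()
```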

2 changes: 1 addition & 1 deletion megatron/neox_arguments/neox_args.py
@@ -85,7 +85,7 @@ class NeoXArgsParallelism(NeoXArgsTemplate):
     according to pipeline parallel size.
     """
 
-    sequence_parallel: bool = False  # TODO: default to True?
+    sequence_parallel: bool = False
     """
     flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198)
     (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1.
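Per the docstring, the flag only matters when the model parallel group has more than one rank. A hypothetical guard expressing that constraint, using the NeoXArgsParallelism field names shown above (`warn_if_sequence_parallel_is_noop` is illustrative, not an existing NeoX helper):

```python
# Hypothetical sanity check reflecting the docstring above: sequence
# parallelism shards layernorm inputs/activations across the model parallel
# group, so the flag is a no-op when model_parallel_size is 1.
def warn_if_sequence_parallel_is_noop(neox_args) -> None:
    if neox_args.sequence_parallel and neox_args.model_parallel_size == 1:
        print(
            "sequence_parallel=True has no effect because model_parallel_size is 1"
        )
```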
5 changes: 0 additions & 5 deletions megatron/training.py
@@ -29,7 +29,6 @@
 import torch
 import deepspeed
 from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler
-from deepspeed.utils import safe_get_full_grad
 import numpy as np
 
 from megatron.utils import (
@@ -895,10 +894,6 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler)
     ):
         torch.cuda.nvtx.range_push(f"Optimizer step")
 
-        # for name, param in model.named_parameters():
-        #     print(name, safe_get_full_grad(param))
-        # raise ValueError
-
     timers("optimizer").start()
     if neox_args.deepspeed:
         model.step()
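The deleted lines were a leftover debugging aid built on deepspeed.utils.safe_get_full_grad, which returns the full (unpartitioned) gradient of a parameter even when ZeRO shards it. If that kind of inspection is needed again, the same pattern could be kept behind a flag instead of raising; a sketch, where the `debug_grads` flag and helper name are hypothetical:

```python
from deepspeed.utils import safe_get_full_grad


def dump_full_grad_norms(model, debug_grads: bool = False) -> None:
    """Hypothetical helper re-creating the removed snippet: print gradient
    norms just before the optimizer step. `debug_grads` is an illustrative
    flag, not an existing NeoX argument."""
    if not debug_grads:
        return
    for name, param in model.named_parameters():
        # safe_get_full_grad gathers the full gradient even when ZeRO
        # partitions it; it may return None for params without a gradient.
        grad = safe_get_full_grad(param)
        if grad is not None:
            print(f"{name}: grad norm = {grad.norm().item():.4e}")
```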
