Revert rope fusion defaults (NVIDIA#9237)
* revert rope fusion defaults

Signed-off-by: Chen Cui <[email protected]>

* Apply isort and black reformatting

Signed-off-by: cuichenx <[email protected]>

---------

Signed-off-by: Chen Cui <[email protected]>
Signed-off-by: cuichenx <[email protected]>
Co-authored-by: cuichenx <[email protected]>
cuichenx and cuichenx authored May 17, 2024
1 parent 0744016 commit cd6d67b
Showing 9 changed files with 73 additions and 47 deletions.
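In short, every example YAML config and the transformer-config builder below flip the default of apply_rope_fusion from True to False, so the fused rotary-embedding kernel is now opt-in. As a hedged sketch that is not part of the commit (the config keys shown are a made-up minimal subset, and OmegaConf is assumed as in the rest of NeMo), a user who has verified the fused kernel for their setup can still opt back in through a normal override:

    from omegaconf import OmegaConf

    # Shipped default after this commit: fusion stays off unless requested.
    defaults = OmegaConf.create(
        {"model": {"position_embedding_type": "rope", "apply_rope_fusion": False}}
    )

    # Explicit user override, e.g. from a dotlist/CLI-style argument.
    user_override = OmegaConf.from_dotlist(["model.apply_rope_fusion=true"])

    cfg = OmegaConf.merge(defaults, user_override)
    print(cfg.model.apply_rope_fusion)  # True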
@@ -81,7 +81,7 @@ model:
position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope']
rotary_percentage: 0.5 # If using position_embedding_type=rope, then the per head dim is multiplied by this. For chatglm2, it is 0.5 (https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L754)
rotary_interleaved: True # chatglm2 use interleaved rotary embedding
- apply_rope_fusion: True
+ apply_rope_fusion: False
attention_type: 'multihead' # Attention type. Options ['multihead']
share_embeddings_and_output_weights: False # Share embedding and output layer weights.
overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
@@ -113,7 +113,7 @@ model:
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope


# Miscellaneous
@@ -131,7 +131,7 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope


# Miscellaneous
@@ -112,7 +112,7 @@ model:
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope


# Miscellaneous
@@ -117,7 +117,7 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

# Miscellaneous
seed: 1234
9 changes: 7 additions & 2 deletions examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -148,7 +148,9 @@ def __init__(self, sentences):
super().__init__()
self.sentences = sentences

- def __len__(self,):
+ def __len__(
+     self,
+ ):
return len(self.sentences)

def __getitem__(self, idx):
@@ -173,7 +175,9 @@ def main(cfg) -> None:
callbacks.append(CustomProgressBar())
# trainer required for restoring model parallel models
trainer = Trainer(
- strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), **cfg.trainer, callbacks=callbacks,
+ strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)),
+ **cfg.trainer,
+ callbacks=callbacks,
)

if cfg.gpt_model_file is not None:
@@ -224,6 +228,7 @@ def main(cfg) -> None:
pretrained_cfg.activations_checkpoint_method = None
pretrained_cfg.precision = trainer.precision
pretrained_cfg["use_flash_attention"] = cfg.inference.get("use_flash_attention", False)
+ pretrained_cfg["apply_rope_fusion"] = False
if pretrained_cfg.get('mcore_gpt', False):
# with dist checkpointing we can use the model parallel config specified by the user
pretrained_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size
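The pretrained_cfg["apply_rope_fusion"] = False line added above makes inference independent of the fused kernel when a checkpointed GPT model is restored. A minimal sketch of that restore-time patching pattern, assuming OmegaConf; only apply_rope_fusion and use_flash_attention come from the diff, the rest of the config is invented:

    from omegaconf import OmegaConf, open_dict

    def patch_restored_cfg(pretrained_cfg, inference_cfg):
        # Open the restored config so keys can be added or overridden,
        # then force the fused RoPE kernel off for evaluation.
        with open_dict(pretrained_cfg):
            pretrained_cfg["use_flash_attention"] = inference_cfg.get("use_flash_attention", False)
            pretrained_cfg["apply_rope_fusion"] = False
        return pretrained_cfg

    pretrained_cfg = OmegaConf.create({"position_embedding_type": "rope", "apply_rope_fusion": True})
    inference_cfg = OmegaConf.create({"use_flash_attention": False})
    print(patch_restored_cfg(pretrained_cfg, inference_cfg).apply_rope_fusion)  # False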
34 changes: 26 additions & 8 deletions nemo/collections/multimodal/parts/utils.py
@@ -138,7 +138,8 @@ def load_nemo_model_weights(nemo_path, sharded_state_dict=None):
tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0]
assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.'
checkpoint = dist_checkpointing.load(
- sharded_state_dict=checkpoint, checkpoint_dir=tmp_model_weights_dir,
+ sharded_state_dict=checkpoint,
+ checkpoint_dir=tmp_model_weights_dir,
)
state_dict = checkpoint["state_dict"]

@@ -149,7 +150,9 @@ def load_nemo_model_weights(nemo_path, sharded_state_dict=None):


def setup_trainer_and_models_for_inference(
- model_provider: Any, cfg: DictConfig, model_cfg_modifier: Callable,
+ model_provider: Any,
+ cfg: DictConfig,
+ model_cfg_modifier: Callable,
):
"""
Set up a trainer and NeMo model for inference.
@@ -172,7 +175,10 @@ def setup_trainer_and_models_for_inference(

# Use the NLPDDPStrategy for the distributed data parallel strategy.
# We don't use DDP for async grad allreduce and don't find unused parameters.
- strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,)
+ strategy = NLPDDPStrategy(
+     no_ddp_communication_hook=True,
+     find_unused_parameters=False,
+ )

# Set up the trainer with the specified plugins and strategy.
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)
@@ -215,7 +221,9 @@ def setup_trainer_and_models_for_inference(
)

model = model_provider.load_from_checkpoint(
- single_model_cfg.restore_from_path, hparams_file=cfg.model.get("hparams_file"), trainer=trainer,
+ single_model_cfg.restore_from_path,
+ hparams_file=cfg.model.get("hparams_file"),
+ trainer=trainer,
)
models.append(model)

@@ -239,7 +247,9 @@ def dummy():


def setup_trainer_and_model_for_inference(
- model_provider: Any, cfg: DictConfig, model_cfg_modifier: Callable,
+ model_provider: Any,
+ cfg: DictConfig,
+ model_cfg_modifier: Callable,
) -> Tuple[Trainer, Any]:
"""
Set up a trainer and NeMo model for inference.
@@ -261,7 +271,10 @@ def setup_trainer_and_model_for_inference(

# Use the NLPDDPStrategy for the distributed data parallel strategy.
# We don't use DDP for async grad allreduce and don't find unused parameters.
- strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,)
+ strategy = NLPDDPStrategy(
+     no_ddp_communication_hook=True,
+     find_unused_parameters=False,
+ )

# Set up the trainer with the specified plugins and strategy.
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)
@@ -299,7 +312,9 @@ def setup_trainer_and_model_for_inference(
)

model = model_provider.load_from_checkpoint(
- cfg.model.restore_from_path, hparams_file=cfg.model.get("hparams_file"), trainer=trainer,
+ cfg.model.restore_from_path,
+ hparams_file=cfg.model.get("hparams_file"),
+ trainer=trainer,
)

else:
@@ -335,7 +350,9 @@ def create_neva_model_and_processor(cfg):
or cfg.get('pipeline_model_parallel_split_rank', -1) < 0
):
model_config = MegatronNevaModel.restore_from(
- restore_path=cfg.neva_model_file, trainer=trainer, return_config=True,
+ restore_path=cfg.neva_model_file,
+ trainer=trainer,
+ return_config=True,
)

with open_dict(cfg):
@@ -366,6 +383,7 @@ def create_neva_model_and_processor(cfg):
neva_cfg.activations_checkpoint_method = None
neva_cfg.precision = trainer.precision
neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None)
+ neva_cfg.apply_rope_fusion = False
neva_cfg.fp8 = False
# neva_cfg.mm_cfg.vision_encoder.from_pretrained = None

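The NeVA restore path above gets the same treatment: neva_cfg.apply_rope_fusion = False joins the existing overrides (activation checkpointing disabled, FP8 off) applied to the restored config before load_from_checkpoint. A hedged sketch of that override block with invented starting values, assuming OmegaConf:

    from omegaconf import OmegaConf, open_dict

    neva_cfg = OmegaConf.create({"precision": "bf16", "fp8": True, "apply_rope_fusion": True})
    with open_dict(neva_cfg):
        neva_cfg.activations_checkpoint_method = None  # mirrors the surrounding diff context
        neva_cfg.apply_rope_fusion = False             # the line added by this commit
        neva_cfg.fp8 = False
    print(neva_cfg.apply_rope_fusion, neva_cfg.fp8)  # False False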
@@ -68,9 +68,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
assert (
self.cfg.get("post_process", False) is False
), "post_process must be False to get hidden states in the loss_func"
- assert (
-     self.cfg.get('apply_rope_fusion', True) is False
- ), "RoPE fusion should be set to False for MegatronGPTEmbeddingModel"

def model_provider_func(self, pre_process, post_process):
# (@adithyare) We need post_process to be False to get hidden states in the loss_func
@@ -246,12 +246,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
self.use_fsdp = cfg.get('fsdp', False)

def setup_transformer_engine_tp_groups(self):
""" This should be called after model parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""This should be called after model parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""
for module in self.get_model_module_list():
"""Set TP group
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
Expand All @@ -262,14 +262,14 @@ def setup_transformer_engine_tp_groups(self):
child.set_tensor_parallel_group(tp_group)

def setup_transformer_engine_cp_groups(self):
""" This should be called after context parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""This should be called after context parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""
cp_stream = torch.cuda.Stream()

for module in self.get_model_module_list():
"""Set context parallel running
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
Expand All @@ -283,11 +283,11 @@ def setup_transformer_engine_cp_groups(self):
)

def _wrap_model_for_O2(self):
""" Wraps self.model in a float16 wrapper if the model is using megatron amp O2.
Args:
model: The model to wrap. Can be a list of modules or a single module.
Returns:
The wrapped model. Returns a list of wrapped modules or a single wrapped module.
"""Wraps self.model in a float16 wrapper if the model is using megatron amp O2.
Args:
model: The model to wrap. Can be a list of modules or a single module.
Returns:
The wrapped model. Returns a list of wrapped modules or a single wrapped module.
"""
is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False)

Expand Down Expand Up @@ -450,10 +450,10 @@ def on_validation_end(self) -> None:
gc.collect()

def build_transformer_config(self) -> TransformerConfig:
""" Builds the megatron core transformer config for the model.
For attributes in the nemo model config that are the same
as the megatron core TransformerConfig, we will use the value from the nemo model config.
For attributes in TransformerConfig that are not in the nemo model config, we add custom logic.
"""Builds the megatron core transformer config for the model.
For attributes in the nemo model config that are the same
as the megatron core TransformerConfig, we will use the value from the nemo model config.
For attributes in TransformerConfig that are not in the nemo model config, we add custom logic.
"""

# create a dictionary copy of the model config
Expand Down Expand Up @@ -509,7 +509,7 @@ def build_transformer_config(self) -> TransformerConfig:

bias_dropout_fusion = self.cfg.get('bias_dropout_add_fusion', True)

- apply_rope_fusion = self.cfg.get('apply_rope_fusion', True)
+ apply_rope_fusion = self.cfg.get('apply_rope_fusion', False)

# TODO: need to check if recompute APIs are matching up properly
recompute_granularity = self.cfg.get('activations_checkpoint_granularity', None)
@@ -601,7 +601,7 @@ def get_parameters_with_grad(self):

def configure_gradient_clipping(self, *args, **kwargs):
"""PTL hook to configure gradients.
- We use gradient clipping implementation from megatron-lm.
+ We use gradient clipping implementation from megatron-lm.
"""
clip_val = self.trainer.gradient_clip_val
if clip_val is None:
@@ -627,13 +627,17 @@ def configure_gradient_clipping(self, *args, **kwargs):
parameters = self._optimizer.get_parameters_with_grad()
else:
parameters = self.get_parameters_with_grad()
- grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val, use_fsdp=self.use_fsdp,)
+ grad_norm = clip_grad_norm_fp32(
+     parameters=parameters,
+     max_norm=clip_val,
+     use_fsdp=self.use_fsdp,
+ )

self.log('grad_norm', grad_norm, rank_zero_only=True, batch_size=1)

def allreduce_gradients(self):
"""Reduce gradients across data parallel ranks.
- Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188
+ Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188
"""
# Bucketize and all-reduce
buckets = {}
@@ -732,7 +736,9 @@ def on_validation_batch_end(self, outputs, batch: Any, batch_idx: int, dataloade
self.validation_global_step += 1

def setup_optimization(
- self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None,
+ self,
+ optim_config: Optional[Union[DictConfig, Dict]] = None,
+ optim_kwargs: Optional[Dict[str, Any]] = None,
):
# Ensure `max_steps` is set correctly
optim_config = self._optim_config_copy(optim_config)
@@ -913,8 +919,8 @@ def _extract_consumed_samples_from_ckpt(self, ckpt_path):
return init_consumed_samples

def _validate_and_override_config(self):
""" Certain configurations might be incompatible or discouraged.
We can check for them here and override if necessary.
"""Certain configurations might be incompatible or discouraged.
We can check for them here and override if necessary.
"""
app_state = AppState()

@@ -1093,9 +1099,9 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model):
return num_parameters_on_device, total_num_parameters

def build_model_parallel_config(self) -> ModelParallelConfig:
""" For attributes in the nemo model config that are the same as the
megatron core ModelParallelConfig we will use the value from the nemo config.
For attributes in ModelParallelConfig that are not in the nemo model config, we add custom logic.
"""For attributes in the nemo model config that are the same as the
megatron core ModelParallelConfig we will use the value from the nemo config.
For attributes in ModelParallelConfig that are not in the nemo model config, we add custom logic.
"""
cfg = OmegaConf.to_container(self.cfg, resolve=True)

@@ -1116,9 +1122,9 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
"async_tensor_model_parallel_allreduce": self.cfg.get('tensor_model_parallel_world_size', 1) > 1
and not self.cfg.get('sequence_parallel', False),
"pipeline_dtype": pipeline_dtype,
"grad_scale_func": self.trainer.precision_plugin.scaler.scale
if self.trainer.precision in ["16", "16-mixed"]
else None,
"grad_scale_func": (
self.trainer.precision_plugin.scaler.scale if self.trainer.precision in ["16", "16-mixed"] else None
),
"enable_autocast": not megatron_amp_O2 and self.torch_dtype in [torch.bfloat16, torch.float16],
"autocast_dtype": self.autocast_dtype,
"variable_seq_lengths": False, # set dynamically during training
@@ -1230,7 +1236,7 @@ def find_frozen_submodules(model):
return frozen_submodule_names, frozen_submodules

if self.use_fsdp:
""" Top-evel FSDP model sharding """
"""Top-evel FSDP model sharding"""
# Shard the top-level model hierarchically. We shard the strategy-unwrapped model not
# to lose the structure of non-FSDP wrapped parameters (e.g, embedding)
# TODO: Currently the main parameter data type is kept in fp32 (when O2=False). This needs to be
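The functional core of the revert is the build_transformer_config hunk above: self.cfg.get('apply_rope_fusion', False) now falls back to False before the value reaches the megatron-core TransformerConfig. A small stand-in sketch of that precedence; the dataclass below is illustrative only, not the real TransformerConfig:

    from dataclasses import dataclass

    @dataclass
    class StandInTransformerConfig:
        apply_rope_fusion: bool = False  # illustrative field only

    def build_config(model_cfg: dict) -> StandInTransformerConfig:
        # An explicit config value wins; an absent key now resolves to False.
        return StandInTransformerConfig(apply_rope_fusion=model_cfg.get("apply_rope_fusion", False))

    print(build_config({}).apply_rope_fusion)                           # False
    print(build_config({"apply_rope_fusion": True}).apply_rope_fusion)  # True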
