Revert rope fusion defaults (NVIDIA#9237)
* revert rope fusion defaults

Signed-off-by: Chen Cui <[email protected]>

* Apply isort and black reformatting

Signed-off-by: cuichenx <[email protected]>

---------

Signed-off-by: Chen Cui <[email protected]>
Signed-off-by: cuichenx <[email protected]>
Co-authored-by: cuichenx <[email protected]>
cuichenx and cuichenx authored May 17, 2024
1 parent 0744016 commit cd6d67b
Showing 9 changed files with 73 additions and 47 deletions.
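In short, every example YAML config and the transformer-config builder below flip the default of apply_rope_fusion from True to False, so the fused rotary-embedding kernel is now opt-in. As a hedged sketch that is not part of the commit (the config keys shown are a made-up minimal subset, and OmegaConf is assumed as in the rest of NeMo), a user who has verified the fused kernel for their setup can still opt back in through a normal override:

    from omegaconf import OmegaConf

    # Shipped default after this commit: fusion stays off unless requested.
    defaults = OmegaConf.create(
        {"model": {"position_embedding_type": "rope", "apply_rope_fusion": False}}
    )

    # Explicit user override, e.g. from a dotlist/CLI-style argument.
    user_override = OmegaConf.from_dotlist(["model.apply_rope_fusion=true"])

    cfg = OmegaConf.merge(defaults, user_override)
    print(cfg.model.apply_rope_fusion)  # True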
@@ -81,7 +81,7 @@ model:
position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope']
rotary_percentage: 0.5 # If using position_embedding_type=rope, then the per head dim is multiplied by this. For chatglm2, it is 0.5 (https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L754)
rotary_interleaved: True # chatglm2 use interleaved rotary embedding
- apply_rope_fusion: True
+ apply_rope_fusion: False
attention_type: 'multihead' # Attention type. Options ['multihead']
share_embeddings_and_output_weights: False # Share embedding and output layer weights.
overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
@@ -113,7 +113,7 @@ model:
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope


# Miscellaneous
@@ -131,7 +131,7 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope


# Miscellaneous
@@ -112,7 +112,7 @@ model:
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope


# Miscellaneous
@@ -117,7 +117,7 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
- apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+ apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

# Miscellaneous
seed: 1234
9 changes: 7 additions & 2 deletions examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -148,7 +148,9 @@ def __init__(self, sentences):
super().__init__()
self.sentences = sentences

- def __len__(self,):
+ def __len__(
+     self,
+ ):
return len(self.sentences)

def __getitem__(self, idx):
@@ -173,7 +175,9 @@ def main(cfg) -> None:
callbacks.append(CustomProgressBar())
# trainer required for restoring model parallel models
trainer = Trainer(
- strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), **cfg.trainer, callbacks=callbacks,
+ strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)),
+ **cfg.trainer,
+ callbacks=callbacks,
)

if cfg.gpt_model_file is not None:
@@ -224,6 +228,7 @@ def main(cfg) -> None:
pretrained_cfg.activations_checkpoint_method = None
pretrained_cfg.precision = trainer.precision
pretrained_cfg["use_flash_attention"] = cfg.inference.get("use_flash_attention", False)
+ pretrained_cfg["apply_rope_fusion"] = False
if pretrained_cfg.get('mcore_gpt', False):
# with dist checkpointing we can use the model parallel config specified by the user
pretrained_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size
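The pretrained_cfg["apply_rope_fusion"] = False line added above makes inference independent of the fused kernel when a checkpointed GPT model is restored. A minimal sketch of that restore-time patching pattern, assuming OmegaConf; only apply_rope_fusion and use_flash_attention come from the diff, the rest of the config is invented:

    from omegaconf import OmegaConf, open_dict

    def patch_restored_cfg(pretrained_cfg, inference_cfg):
        # Open the restored config so keys can be added or overridden,
        # then force the fused RoPE kernel off for evaluation.
        with open_dict(pretrained_cfg):
            pretrained_cfg["use_flash_attention"] = inference_cfg.get("use_flash_attention", False)
            pretrained_cfg["apply_rope_fusion"] = False
        return pretrained_cfg

    pretrained_cfg = OmegaConf.create({"position_embedding_type": "rope", "apply_rope_fusion": True})
    inference_cfg = OmegaConf.create({"use_flash_attention": False})
    print(patch_restored_cfg(pretrained_cfg, inference_cfg).apply_rope_fusion)  # False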
34 changes: 26 additions & 8 deletions nemo/collections/multimodal/parts/utils.py
@@ -138,7 +138,8 @@ def load_nemo_model_weights(nemo_path, sharded_state_dict=None):
tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0]
assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.'
checkpoint = dist_checkpointing.load(
- sharded_state_dict=checkpoint, checkpoint_dir=tmp_model_weights_dir,
+ sharded_state_dict=checkpoint,
+ checkpoint_dir=tmp_model_weights_dir,
)
state_dict = checkpoint["state_dict"]

@@ -149,7 +150,9 @@ def load_nemo_model_weights(nemo_path, sharded_state_dict=None):


def setup_trainer_and_models_for_inference(
- model_provider: Any, cfg: DictConfig, model_cfg_modifier: Callable,
+ model_provider: Any,
+ cfg: DictConfig,
+ model_cfg_modifier: Callable,
):
"""
Set up a trainer and NeMo model for inference.
@@ -172,7 +175,10 @@ def setup_trainer_and_models_for_inference(

# Use the NLPDDPStrategy for the distributed data parallel strategy.
# We don't use DDP for async grad allreduce and don't find unused parameters.
- strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,)
+ strategy = NLPDDPStrategy(
+     no_ddp_communication_hook=True,
+     find_unused_parameters=False,
+ )

# Set up the trainer with the specified plugins and strategy.
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)
@@ -215,7 +221,9 @@ def setup_trainer_and_models_for_inference(
)

model = model_provider.load_from_checkpoint(
- single_model_cfg.restore_from_path, hparams_file=cfg.model.get("hparams_file"), trainer=trainer,
+ single_model_cfg.restore_from_path,
+ hparams_file=cfg.model.get("hparams_file"),
+ trainer=trainer,
)
models.append(model)

@@ -239,7 +247,9 @@ def dummy():


def setup_trainer_and_model_for_inference(
- model_provider: Any, cfg: DictConfig, model_cfg_modifier: Callable,
+ model_provider: Any,
+ cfg: DictConfig,
+ model_cfg_modifier: Callable,
) -> Tuple[Trainer, Any]:
"""
Set up a trainer and NeMo model for inference.
@@ -261,7 +271,10 @@ def setup_trainer_and_model_for_inference(

# Use the NLPDDPStrategy for the distributed data parallel strategy.
# We don't use DDP for async grad allreduce and don't find unused parameters.
- strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,)
+ strategy = NLPDDPStrategy(
+     no_ddp_communication_hook=True,
+     find_unused_parameters=False,
+ )

# Set up the trainer with the specified plugins and strategy.
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)
@@ -299,7 +312,9 @@ def setup_trainer_and_model_for_inference(
)

model = model_provider.load_from_checkpoint(
- cfg.model.restore_from_path, hparams_file=cfg.model.get("hparams_file"), trainer=trainer,
+ cfg.model.restore_from_path,
+ hparams_file=cfg.model.get("hparams_file"),
+ trainer=trainer,
)

else:
@@ -335,7 +350,9 @@ def create_neva_model_and_processor(cfg):
or cfg.get('pipeline_model_parallel_split_rank', -1) < 0
):
model_config = MegatronNevaModel.restore_from(
- restore_path=cfg.neva_model_file, trainer=trainer, return_config=True,
+ restore_path=cfg.neva_model_file,
+ trainer=trainer,
+ return_config=True,
)

with open_dict(cfg):
@@ -366,6 +383,7 @@ def create_neva_model_and_processor(cfg):
neva_cfg.activations_checkpoint_method = None
neva_cfg.precision = trainer.precision
neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None)
+ neva_cfg.apply_rope_fusion = False
neva_cfg.fp8 = False
# neva_cfg.mm_cfg.vision_encoder.from_pretrained = None

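The NeVA restore path above gets the same treatment: neva_cfg.apply_rope_fusion = False joins the existing overrides (activation checkpointing disabled, FP8 off) applied to the restored config before load_from_checkpoint. A hedged sketch of that override block with invented starting values, assuming OmegaConf:

    from omegaconf import OmegaConf, open_dict

    neva_cfg = OmegaConf.create({"precision": "bf16", "fp8": True, "apply_rope_fusion": True})
    with open_dict(neva_cfg):
        neva_cfg.activations_checkpoint_method = None  # mirrors the surrounding diff context
        neva_cfg.apply_rope_fusion = False             # the line added by this commit
        neva_cfg.fp8 = False
    print(neva_cfg.apply_rope_fusion, neva_cfg.fp8)  # False False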
@@ -68,9 +68,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
assert (
self.cfg.get("post_process", False) is False
), "post_process must be False to get hidden states in the loss_func"
- assert (
-     self.cfg.get('apply_rope_fusion', True) is False
- ), "RoPE fusion should be set to False for MegatronGPTEmbeddingModel"

def model_provider_func(self, pre_process, post_process):
# (@adithyare) We need post_process to be False to get hidden states in the loss_func
@@ -246,12 +246,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
self.use_fsdp = cfg.get('fsdp', False)

def setup_transformer_engine_tp_groups(self):
""" This should be called after model parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""This should be called after model parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""
for module in self.get_model_module_list():
"""Set TP group
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
Expand All @@ -262,14 +262,14 @@ def setup_transformer_engine_tp_groups(self):
child.set_tensor_parallel_group(tp_group)

def setup_transformer_engine_cp_groups(self):
""" This should be called after context parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""This should be called after context parallel groups have been initialized
and only needs to be called when using Transformer Engine.
"""
cp_stream = torch.cuda.Stream()

for module in self.get_model_module_list():
"""Set context parallel running
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
Expand All @@ -283,11 +283,11 @@ def setup_transformer_engine_cp_groups(self):
)

def _wrap_model_for_O2(self):
""" Wraps self.model in a float16 wrapper if the model is using megatron amp O2.
Args:
model: The model to wrap. Can be a list of modules or a single module.
Returns:
The wrapped model. Returns a list of wrapped modules or a single wrapped module.
"""Wraps self.model in a float16 wrapper if the model is using megatron amp O2.
Args:
model: The model to wrap. Can be a list of modules or a single module.
Returns:
The wrapped model. Returns a list of wrapped modules or a single wrapped module.
"""
is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False)

Expand Down Expand Up @@ -450,10 +450,10 @@ def on_validation_end(self) -> None:
gc.collect()

def build_transformer_config(self) -> TransformerConfig:
""" Builds the megatron core transformer config for the model.
For attributes in the nemo model config that are the same
as the megatron core TransformerConfig, we will use the value from the nemo model config.
For attributes in TransformerConfig that are not in the nemo model config, we add custom logic.
"""Builds the megatron core transformer config for the model.
For attributes in the nemo model config that are the same
as the megatron core TransformerConfig, we will use the value from the nemo model config.
For attributes in TransformerConfig that are not in the nemo model config, we add custom logic.
"""

# create a dictionary copy of the model config
Expand Down Expand Up @@ -509,7 +509,7 @@ def build_transformer_config(self) -> TransformerConfig:

bias_dropout_fusion = self.cfg.get('bias_dropout_add_fusion', True)

- apply_rope_fusion = self.cfg.get('apply_rope_fusion', True)
+ apply_rope_fusion = self.cfg.get('apply_rope_fusion', False)

# TODO: need to check if recompute APIs are matching up properly
recompute_granularity = self.cfg.get('activations_checkpoint_granularity', None)
@@ -601,7 +601,7 @@ def get_parameters_with_grad(self):

def configure_gradient_clipping(self, *args, **kwargs):
"""PTL hook to configure gradients.
- We use gradient clipping implementation from megatron-lm.
+ We use gradient clipping implementation from megatron-lm.
"""
clip_val = self.trainer.gradient_clip_val
if clip_val is None:
@@ -627,13 +627,17 @@ def configure_gradient_clipping(self, *args, **kwargs):
parameters = self._optimizer.get_parameters_with_grad()
else:
parameters = self.get_parameters_with_grad()
- grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val, use_fsdp=self.use_fsdp,)
+ grad_norm = clip_grad_norm_fp32(
+     parameters=parameters,
+     max_norm=clip_val,
+     use_fsdp=self.use_fsdp,
+ )

self.log('grad_norm', grad_norm, rank_zero_only=True, batch_size=1)

def allreduce_gradients(self):
"""Reduce gradients across data parallel ranks.
- Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188
+ Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188
"""
# Bucketize and all-reduce
buckets = {}
@@ -732,7 +736,9 @@ def on_validation_batch_end(self, outputs, batch: Any, batch_idx: int, dataloade
self.validation_global_step += 1

def setup_optimization(
- self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None,
+ self,
+ optim_config: Optional[Union[DictConfig, Dict]] = None,
+ optim_kwargs: Optional[Dict[str, Any]] = None,
):
# Ensure `max_steps` is set correctly
optim_config = self._optim_config_copy(optim_config)
@@ -913,8 +919,8 @@ def _extract_consumed_samples_from_ckpt(self, ckpt_path):
return init_consumed_samples

def _validate_and_override_config(self):
""" Certain configurations might be incompatible or discouraged.
We can check for them here and override if necessary.
"""Certain configurations might be incompatible or discouraged.
We can check for them here and override if necessary.
"""
app_state = AppState()

@@ -1093,9 +1099,9 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model):
return num_parameters_on_device, total_num_parameters

def build_model_parallel_config(self) -> ModelParallelConfig:
""" For attributes in the nemo model config that are the same as the
megatron core ModelParallelConfig we will use the value from the nemo config.
For attributes in ModelParallelConfig that are not in the nemo model config, we add custom logic.
"""For attributes in the nemo model config that are the same as the
megatron core ModelParallelConfig we will use the value from the nemo config.
For attributes in ModelParallelConfig that are not in the nemo model config, we add custom logic.
"""
cfg = OmegaConf.to_container(self.cfg, resolve=True)

@@ -1116,9 +1122,9 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
"async_tensor_model_parallel_allreduce": self.cfg.get('tensor_model_parallel_world_size', 1) > 1
and not self.cfg.get('sequence_parallel', False),
"pipeline_dtype": pipeline_dtype,
"grad_scale_func": self.trainer.precision_plugin.scaler.scale
if self.trainer.precision in ["16", "16-mixed"]
else None,
"grad_scale_func": (
self.trainer.precision_plugin.scaler.scale if self.trainer.precision in ["16", "16-mixed"] else None
),
"enable_autocast": not megatron_amp_O2 and self.torch_dtype in [torch.bfloat16, torch.float16],
"autocast_dtype": self.autocast_dtype,
"variable_seq_lengths": False, # set dynamically during training
@@ -1230,7 +1236,7 @@ def find_frozen_submodules(model):
return frozen_submodule_names, frozen_submodules

if self.use_fsdp:
""" Top-evel FSDP model sharding """
"""Top-evel FSDP model sharding"""
# Shard the top-level model hierarchically. We shard the strategy-unwrapped model not
# to lose the structure of non-FSDP wrapped parameters (e.g, embedding)
# TODO: Currently the main parameter data type is kept in fp32 (when O2=False). This needs to be
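The functional core of the revert is the build_transformer_config hunk above: self.cfg.get('apply_rope_fusion', False) now falls back to False before the value reaches the megatron-core TransformerConfig. A small stand-in sketch of that precedence; the dataclass below is illustrative only, not the real TransformerConfig:

    from dataclasses import dataclass

    @dataclass
    class StandInTransformerConfig:
        apply_rope_fusion: bool = False  # illustrative field only

    def build_config(model_cfg: dict) -> StandInTransformerConfig:
        # An explicit config value wins; an absent key now resolves to False.
        return StandInTransformerConfig(apply_rope_fusion=model_cfg.get("apply_rope_fusion", False))

    print(build_config({}).apply_rope_fusion)                           # False
    print(build_config({"apply_rope_fusion": True}).apply_rope_fusion)  # True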
