From ec82c05780d40404c618d4905ad14b670a91bd3c Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Sun, 8 Sep 2024 14:14:33 -0700 Subject: [PATCH] apply pre-commit and add missing close-paren to mamba config (#1270) --- configs/mamba/mamba-130M.yml | 1 + megatron/data/helpers.cpp | 12 ++++++------ megatron/model/mamba/mamba.py | 14 ++++++++++---- megatron/model/rwkv/v6/rwkv.py | 8 ++++++-- megatron/model/transformer.py | 10 ++++++++-- megatron/neox_arguments/arguments.py | 4 +++- megatron/tokenizer/tokenizer.py | 6 ++++-- 7 files changed, 38 insertions(+), 17 deletions(-) diff --git a/configs/mamba/mamba-130M.yml b/configs/mamba/mamba-130M.yml index 7187048e6..bd05723b2 100644 --- a/configs/mamba/mamba-130M.yml +++ b/configs/mamba/mamba-130M.yml @@ -86,3 +86,4 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, +} diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 9b062b050..aca290854 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index b3d9e1549..950e36fed 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -13,8 +13,10 @@ from causal_conv1d import causal_conv1d_fn import einops except ModuleNotFoundError: - print( "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ - or directly from https://github.com/state-spaces/mamba") + print( + "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ + or directly from https://github.com/state-spaces/mamba" + ) pass from megatron.model.norms import get_norm @@ -44,7 +46,9 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size @@ -53,7 +57,9 @@ def __init__( if neox_args.intermediate_size: self.d_inner = neox_args.intermediate_size else: - self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 2 + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 2 + ) self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index ec8cc1aa6..b3741a3fc 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -275,13 +275,17 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size if neox_args.intermediate_size: neox_args.ffn_dim = neox_args.intermediate_size else: - self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + ) neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) # Make hidden size 3.5x by default. Round to nearest multiple of 32 until we add hdim rounding logic neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 119676c54..d2b93eb06 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -98,7 +98,9 @@ def __init__( MoE_mp_size=1, ): super().__init__() - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation @@ -1230,7 +1232,11 @@ def forward(self, x, attention_mask, layer_past=None): raise KeyError(self.moe_type) with torch.enable_grad(): - if self.activation == "swiglu" or self.num_experts > 1 and self.moe_type == "deepspeed": + if ( + self.activation == "swiglu" + or self.num_experts > 1 + and self.moe_type == "deepspeed" + ): # No dropout either assert mlp_bias is None output = mlp_output + attention_output diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 327639454..8fbe045bb 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1187,7 +1187,9 @@ def validate_values(self): return False # Checks. - if self.hidden_size % self.num_attention_heads != 0 and not ("mamba" in self.attention_config): + if self.hidden_size % self.num_attention_heads != 0 and not ( + "mamba" in self.attention_config + ): error_message = ( self.__class__.__name__ + ".validate_values() hidden_size must be divisible by num_attention_heads" diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index e450504c8..d39e18243 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -31,8 +31,10 @@ def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) - - assert args.tokenizer_type is not None, "tokenizer_type must be specified in the .yml config" + + assert ( + args.tokenizer_type is not None + ), "tokenizer_type must be specified in the .yml config" # Select and instantiate the tokenizer. if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower():