From 1350b2c27390eb2f78ec61c5e334f322bf061199 Mon Sep 17 00:00:00 2001
From: tiandeyu-cs <54715756+tiandeyu-cs@users.noreply.github.com>
Date: Thu, 14 Nov 2024 06:45:18 +0800
Subject: [PATCH] fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed (#1309)

* fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed

* config adjustments for llama and gated activations

* pre-commit

---------

Co-authored-by: jahatef
Co-authored-by: Quentin Anthony
---
 configs/llama/13B.yml          | 2 ++
 configs/llama/30B.yml          | 2 ++
 configs/llama/65B.yml          | 2 ++
 configs/llama/7B.yml           | 2 ++
 configs/llama/train_config.yml | 2 +-
 configs/llama2/13B.yml         | 1 +
 configs/llama2/70B.yml         | 2 +-
 configs/llama2/7B.yml          | 1 +
 megatron/model/transformer.py  | 5 ++---
 9 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml
index 162e51719..a7470cae8 100644
--- a/configs/llama/13B.yml
+++ b/configs/llama/13B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 40,
   "hidden_size": 5120,
+  "intermediate_size": 40960,
   "num_attention_heads": 40,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml
index 2c948e40c..234445c77 100644
--- a/configs/llama/30B.yml
+++ b/configs/llama/30B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 60,
   "hidden_size": 6656,
+  "intermediate_size": 53248,
   "num_attention_heads": 52,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml
index 4ebd249b9..8ffffe241 100644
--- a/configs/llama/65B.yml
+++ b/configs/llama/65B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 80,
   "hidden_size": 8192,
+  "intermediate_size": 65536,
   "num_attention_heads": 64,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml
index cc21446be..0d7c40b24 100644
--- a/configs/llama/7B.yml
+++ b/configs/llama/7B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 32,
   "hidden_size": 4096,
+  "intermediate_size": 32768,
   "num_attention_heads": 32,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/train_config.yml b/configs/llama/train_config.yml
index 7cc5a5968..459332609 100644
--- a/configs/llama/train_config.yml
+++ b/configs/llama/train_config.yml
@@ -70,5 +70,5 @@
   "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true,
-  "mlp_multiple_of": 256,
+
 }
diff --git a/configs/llama2/13B.yml b/configs/llama2/13B.yml
index 5bf7a4f72..7df5ad3ea 100644
--- a/configs/llama2/13B.yml
+++ b/configs/llama2/13B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 40,
   "hidden_size": 5120,
+  "intermediate_size": 41472,
   "num_attention_heads": 40,
   "seq_length": 4096,
   "max_position_embeddings": 4096,
diff --git a/configs/llama2/70B.yml b/configs/llama2/70B.yml
index b628deffe..d175e146e 100644
--- a/configs/llama2/70B.yml
+++ b/configs/llama2/70B.yml
@@ -6,7 +6,7 @@
   # model settings
   "num_layers": 80,
   "hidden_size": 8192,
-  "intermediate_size": 28672,
+  "intermediate_size": 86016,
   "num_attention_heads": 64,
   "num_kv_heads": 8,
   "seq_length": 4096,
diff --git a/configs/llama2/7B.yml b/configs/llama2/7B.yml
index eeba99c52..cdb63f02e 100644
--- a/configs/llama2/7B.yml
+++ b/configs/llama2/7B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 32,
   "hidden_size": 4096,
+  "intermediate_size": 32768,
   "num_attention_heads": 32,
   "seq_length": 4096,
   "max_position_embeddings": 4096,
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 42dbdfeeb..7627e13b6 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -1245,9 +1245,8 @@ def forward(self, x, attention_mask, layer_past=None):

             with torch.enable_grad() if not self.eval else nullcontext():
                 if (
-                    self.activation == "swiglu"
-                    or self.num_experts > 1
-                    and self.moe_type == "deepspeed"
+                    mlp_bias == None
+                    or (self.num_experts > 1 and self.moe_type == "deepspeed")
                 ):
                     # No dropout either
                     assert mlp_bias is None