From 1350b2c27390eb2f78ec61c5e334f322bf061199 Mon Sep 17 00:00:00 2001
From: tiandeyu-cs <54715756+tiandeyu-cs@users.noreply.github.com>
Date: Thu, 14 Nov 2024 06:45:18 +0800
Subject: [PATCH] fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed (#1309)

* fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed

* config adjustments for llama and gated activations

* pre-commit

---------

Co-authored-by: jahatef
Co-authored-by: Quentin Anthony
---
 configs/llama/13B.yml          | 2 ++
 configs/llama/30B.yml          | 2 ++
 configs/llama/65B.yml          | 2 ++
 configs/llama/7B.yml           | 2 ++
 configs/llama/train_config.yml | 2 +-
 configs/llama2/13B.yml         | 1 +
 configs/llama2/70B.yml         | 2 +-
 configs/llama2/7B.yml          | 1 +
 megatron/model/transformer.py  | 5 ++---
 9 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml
index 162e51719..a7470cae8 100644
--- a/configs/llama/13B.yml
+++ b/configs/llama/13B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 40,
   "hidden_size": 5120,
+  "intermediate_size": 40960,
   "num_attention_heads": 40,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml
index 2c948e40c..234445c77 100644
--- a/configs/llama/30B.yml
+++ b/configs/llama/30B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 60,
   "hidden_size": 6656,
+  "intermediate_size": 53248,
   "num_attention_heads": 52,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml
index 4ebd249b9..8ffffe241 100644
--- a/configs/llama/65B.yml
+++ b/configs/llama/65B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 80,
   "hidden_size": 8192,
+  "intermediate_size": 65536,
   "num_attention_heads": 64,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml
index cc21446be..0d7c40b24 100644
--- a/configs/llama/7B.yml
+++ b/configs/llama/7B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 32,
   "hidden_size": 4096,
+  "intermediate_size": 32768,
   "num_attention_heads": 32,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": False,

   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/train_config.yml b/configs/llama/train_config.yml
index 7cc5a5968..459332609 100644
--- a/configs/llama/train_config.yml
+++ b/configs/llama/train_config.yml
@@ -70,5 +70,5 @@
   "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true,
-  "mlp_multiple_of": 256,
+
 }
diff --git a/configs/llama2/13B.yml b/configs/llama2/13B.yml
index 5bf7a4f72..7df5ad3ea 100644
--- a/configs/llama2/13B.yml
+++ b/configs/llama2/13B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 40,
   "hidden_size": 5120,
+  "intermediate_size": 41472,
   "num_attention_heads": 40,
   "seq_length": 4096,
   "max_position_embeddings": 4096,
diff --git a/configs/llama2/70B.yml b/configs/llama2/70B.yml
index b628deffe..d175e146e 100644
--- a/configs/llama2/70B.yml
+++ b/configs/llama2/70B.yml
@@ -6,7 +6,7 @@
   # model settings
   "num_layers": 80,
   "hidden_size": 8192,
-  "intermediate_size": 28672,
+  "intermediate_size": 86016,
   "num_attention_heads": 64,
   "num_kv_heads": 8,
   "seq_length": 4096,
diff --git a/configs/llama2/7B.yml b/configs/llama2/7B.yml
index eeba99c52..cdb63f02e 100644
--- a/configs/llama2/7B.yml
+++ b/configs/llama2/7B.yml
@@ -6,6 +6,7 @@
   # model settings
   "num_layers": 32,
   "hidden_size": 4096,
+  "intermediate_size": 32768,
   "num_attention_heads": 32,
   "seq_length": 4096,
   "max_position_embeddings": 4096,
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 42dbdfeeb..7627e13b6 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -1245,9 +1245,8 @@ def forward(self, x, attention_mask, layer_past=None):

             with torch.enable_grad() if not self.eval else nullcontext():
                 if (
-                    self.activation == "swiglu"
-                    or self.num_experts > 1
-                    and self.moe_type == "deepspeed"
+                    mlp_bias == None
+                    or (self.num_experts > 1 and self.moe_type == "deepspeed")
                 ):
                     # No dropout either
                     assert mlp_bias is None