diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml index 305567be1..7a823a43c 100644 --- a/configs/llama/13B.yml +++ b/configs/llama/13B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml index 450f8da38..2c356cea2 100644 --- a/configs/llama/30B.yml +++ b/configs/llama/30B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml index 85f199ce2..cc22d3734 100644 --- a/configs/llama/65B.yml +++ b/configs/llama/65B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml index ecbf187a8..0b134ae27 100644 --- a/configs/llama/7B.yml +++ b/configs/llama/7B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index aca290854..9b062b050 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... 
- } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 7a29b0716..c0b825261 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -25,9 +25,23 @@ def get_activation(neox_args): - """retrieves the activation function specified in neox_args""" + """retrieves the activation function specified in neox_args and whether or not the activation is gated""" + is_gated = False if neox_args.activation == "geglu": - activation_func = GEGLU(neox_args=neox_args) + is_gated = True + activation_func = F.gelu + elif neox_args.activation == "reglu": + is_gated = True + activation_func = F.relu + elif neox_args.activation == "bilinear": + is_gated = True + activation_func = lambda x: x + elif neox_args.activation == "swiglu": + is_gated = True + activation_func = swish + elif neox_args.activation == "glu": + is_gated = True + activation_func = F.sigmoid elif neox_args.activation == "gelu": if neox_args.onnx_safe and neox_args.bias_gelu_fusion: raise ValueError("onnx_safe + bias_gelu_fusion not compatible") @@ -49,7 +63,7 @@ def get_activation(neox_args): activation_func = F.silu else: raise ValueError(f"Activation function {neox_args.activation} not recognized") - return activation_func + return activation_func, is_gated ###### BIAS GELU FUSION/ NO AUTOGRAD ################ @@ -119,21 +133,3 @@ def swish(x, beta: float = 1.0): @torch.jit.script def mish(x): return x * torch.tanh(F.softplus(x)) - - -class GEGLU(torch.nn.Module): - def __init__(self, neox_args): - super(GEGLU, self).__init__() - if neox_args.onnx_safe: - self.activation_func = erf_gelu - else: - self.activation_func = F.gelu - - def forward(self, x, bias=None): - x, gate = x.chunk(2, dim=-1) - if bias is not None: - bias_1, bias_2 = bias.chunk(2, dim=-1) - x = x + bias_1 - gate = gate + bias_2 - intermediate_parallel = self.activation_func(gate) - return intermediate_parallel * x diff --git a/megatron/model/gmlp.py b/megatron/model/gmlp.py index c3462c651..6400640bd 100644 --- a/megatron/model/gmlp.py +++ b/megatron/model/gmlp.py @@ -112,7 +112,7 @@ def __init__( init_method=init_method, skip_bias_add=True, ) - self.activation_func = get_activation(neox_args) + self.activation_func, _ = get_activation(neox_args) ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size()) if neox_args.attention_config[layer_number] == "amlp": d_attn = neox_args.gmlp_attn_dim diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index 3177267cb..b3d9e1549 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -44,12 +44,17 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" 
+ assert neox_args.intermediate_size is None or neox_args.expansion_factor is None, "Pass at most one of intermediate_size and expansion_factor for the Mamba projections" + # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size self.d_state = 16 # state dimensions per channel self.d_conv = 4 # convolution width - self.expand = 2 # linear projection expansion factors - self.d_inner = int(self.expand * self.d_model) + if neox_args.intermediate_size: + self.d_inner = neox_args.intermediate_size + else: + self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 2 + self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index 5d4e0d144..ec8cc1aa6 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -247,11 +247,11 @@ def __init__(self, neox_args, layer_number): self.time_maa_k = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) self.time_maa_r = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) - self.key = nn.Linear(neox_args.hidden_size, neox_args.dim_ffn, bias=False) + self.key = nn.Linear(neox_args.hidden_size, neox_args.ffn_dim, bias=False) self.receptance = nn.Linear( neox_args.hidden_size, neox_args.hidden_size, bias=False ) - self.value = nn.Linear(neox_args.dim_ffn, neox_args.hidden_size, bias=False) + self.value = nn.Linear(neox_args.ffn_dim, neox_args.hidden_size, bias=False) def forward(self, x): xx = self.time_shift(x) - x @@ -275,14 +275,19 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" + assert neox_args.intermediate_size is None or neox_args.expansion_factor is None, "Pass at most one of intermediate_size and expansion_factor for the RWKV feed-forward size" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size - if not hasattr(neox_args, "dim_ffn"): - # Make hidden size 3.5x. Round to nearest multiple of 32 until we add hdim rounding logic - neox_args.dim_ffn = int((neox_args.hidden_size * 3.5) // 32 * 32) + if neox_args.intermediate_size: + neox_args.ffn_dim = neox_args.intermediate_size + else: + self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) + # ffn_dim defaults to 3.5x the hidden size. 
Round down to the nearest multiple of 32 until we add hdim rounding logic + neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) assert neox_args.hidden_size % 32 == 0 assert neox_args.dim_att % 32 == 0 - assert neox_args.dim_ffn % 32 == 0 + assert neox_args.ffn_dim % 32 == 0 self.neox_args.head_size = neox_args.dim_att // neox_args.num_attention_heads self.head_size = self.neox_args.head_size self.num_attention_heads = neox_args.num_attention_heads diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 62e7d3a9c..119676c54 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -93,37 +93,55 @@ def __init__( init_method, output_layer_init_method, parallel_output=False, + multiple_of=256, MOE=False, MoE_mp_size=1, ): super().__init__() + assert neox_args.intermediate_size is None or neox_args.expansion_factor is None, "Pass at most one of intermediate_size and expansion_factor for the MLP projections" - self.activation_func = get_activation(neox_args) + self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation self.bias_gelu_fusion = neox_args.bias_gelu_fusion + self.multiple_of = multiple_of - # auto scale so geglu has equal parameters - ff_mult = int(4 * 2 / 3) if self.activation_type == "geglu" else 4 - ff_dim = ( - int(ff_mult * neox_args.hidden_size) * 2 - if self.activation_type == "geglu" - else ff_mult * neox_args.hidden_size + if neox_args.intermediate_size: + ffn_dim = neox_args.intermediate_size + elif neox_args.expansion_factor: + ffn_dim = int(neox_args.expansion_factor * neox_args.hidden_size) + else: + # 4h is default for ffn_dim + ffn_dim = 4 * neox_args.hidden_size + ffn_dim_in = ffn_dim + if self.is_gated: + # set activation function to be gated implementation + self.activation_func = Gated_Activation(self.activation_func) + # auto scale so gated activations have equal parameters + ffn_dim = int(ffn_dim * 2 / 3) + ffn_dim_in = ffn_dim // 2 + # round ffn_dim up to a multiple of 2*multiple_of and ffn_dim_in up to a multiple of multiple_of + ffn_dim = int( + (2 * self.multiple_of) + * ((ffn_dim + (2 * multiple_of) - 1) // (2 * multiple_of)) + ) + ffn_dim_in = int( + self.multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of) ) - self.dense_h_to_4h = mpu.ColumnParallelLinear( + + self.linear1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, - output_size=ff_dim, + output_size=ffn_dim, gather_output=False, init_method=init_method, skip_bias_add=True, MOE=MOE, MoE_mp_size=MoE_mp_size, ) - ff_dim_in = ff_dim // 2 if self.activation_type == "geglu" else ff_dim # Project back to h. 
- self.dense_4h_to_h = mpu.RowParallelLinear( + self.linear2 = mpu.RowParallelLinear( neox_args=neox_args, - input_size=ff_dim_in, + input_size=ffn_dim_in, output_size=neox_args.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, @@ -134,13 +152,10 @@ def __init__( ) def forward(self, hidden_states): + # [s, b, intermediate_size] + intermediate_parallel, bias_parallel = self.linear1(hidden_states) - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - - if ( - self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu": + if self.is_gated or (self.activation_type == "gelu" and self.bias_gelu_fusion): intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) @@ -150,84 +165,23 @@ def forward(self, hidden_states): ) # [s, b, h] - output, output_bias = self.dense_4h_to_h(intermediate_parallel) + output, output_bias = self.linear2(intermediate_parallel) return output, output_bias -class LLaMAParallelMLP(nn.Module): - """LLaMA's MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. At the end, dropout is also - applied. - - Note: multiple_of is used to compute the hidden dimension of the MLP - """ - - def __init__( - self, - neox_args, - init_method, - output_layer_init_method, - parallel_output=False, - multiple_of=256, - MOE=False, - MoE_mp_size=1, - ): +class Gated_Activation(torch.nn.Module): + def __init__(self, activation_func): super().__init__() + self.activation_func = activation_func - self.activation_func = get_activation(neox_args) - self.activation_type = neox_args.activation - - self.multiple_of = multiple_of - - # Allow custom intermediate size, e.g. 
for Mistral - if neox_args.intermediate_size is not None: - ff_dim = neox_args.intermediate_size - else: - ff_dim = int(2 * neox_args.hidden_size * 4 / 3) - ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - - self.w1 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - self.w3 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - self.w2 = mpu.RowParallelLinear( - neox_args=neox_args, - input_size=ff_dim, - output_size=neox_args.hidden_size, - input_is_parallel=True, - init_method=output_layer_init_method, - skip_bias_add=True, - parallel_output=parallel_output, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - - def forward(self, hidden_states): - w1_out, _ = self.w1(hidden_states) - w3_out, _ = self.w3(hidden_states) - return self.w2(self.activation_func(w1_out) * w3_out) + def forward(self, x, bias=None): + x, gate = x.chunk(2, dim=-1) + if bias is not None: + bias_1, bias_2 = bias.chunk(2, dim=-1) + x = x + bias_1 + gate = gate + bias_2 + intermediate_parallel = self.activation_func(gate) + return intermediate_parallel * x class ParallelLinear(nn.Module): @@ -1054,24 +1008,13 @@ def __init__( # MLP def get_mlp(mlp_type, **kw): - if mlp_type == "regular": - return ParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - elif mlp_type == "llama": - return LLaMAParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - else: - raise KeyError(mlp_type) + return ParallelMLP( + neox_args=neox_args, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + parallel_output=self.gpt_j_residual, + **kw, + ) self.num_experts = ( neox_args.moe_num_experts @@ -1287,11 +1230,7 @@ def forward(self, x, attention_mask, layer_past=None): raise KeyError(self.moe_type) with torch.enable_grad(): - if ( - self.mlp_type == "llama" - or self.num_experts > 1 - and self.moe_type == "deepspeed" - ): + if self.activation == "swiglu" or (self.num_experts > 1 and self.moe_type == "deepspeed"): # No dropout either assert mlp_bias is None output = mlp_output + attention_output diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index dd51c7778..818c86d31 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -121,9 +121,12 @@ class NeoXArgsModel(NeoXArgsTemplate): intermediate_size: int = None """ - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h + """ - If not passed, will be set to a reasonable default. + expansion_factor: float = None + """ + Transformer intermediate size as a multiple of the hidden size. 
Default = 4 """ num_attention_heads: int = None @@ -278,10 +281,20 @@ class NeoXArgsModel(NeoXArgsTemplate): """ activation: Literal[ - "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + "gelu", + "geglu", + "relu", + "softsign", + "swish", + "mish", + "silu", + "reglu", + "swiglu", + "bilinear", + "glu", ] = "gelu" """ - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] """ scaled_upper_triang_masked_softmax_fusion: bool = False @@ -421,9 +434,9 @@ class NeoXArgsModel(NeoXArgsTemplate): mlp_type: str = "regular" """ + Deprecated. "regular" is now the only mlp_type; gated MLPs are selected via the activation setting instead. Types: regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) """ soft_prompt_tuning: dict = None
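
A minimal standalone sketch (not part of the patch) of how the unified ParallelMLP above sizes its two projections once get_activation flags the activation as gated (e.g. "swiglu"), assuming the default multiple_of=256 added in transformer.py. The helper name gated_ffn_dims and its standalone form are illustrative only; the arithmetic mirrors the transformer.py hunk.

    def gated_ffn_dims(hidden_size, intermediate_size=None,
                       expansion_factor=None, is_gated=True, multiple_of=256):
        # Base width: explicit intermediate_size > expansion_factor * h > 4h default.
        if intermediate_size:
            ffn_dim = intermediate_size
        elif expansion_factor:
            ffn_dim = int(expansion_factor * hidden_size)
        else:
            ffn_dim = 4 * hidden_size
        ffn_dim_in = ffn_dim
        if is_gated:
            # Scale by 2/3 as in the patch, then split the fused column output
            # into the value and gate halves consumed by Gated_Activation.
            ffn_dim = int(ffn_dim * 2 / 3)
            ffn_dim_in = ffn_dim // 2
        # Round linear1's fused output up to a multiple of 2*multiple_of and
        # linear2's input up to a multiple of multiple_of.
        ffn_dim = (2 * multiple_of) * ((ffn_dim + 2 * multiple_of - 1) // (2 * multiple_of))
        ffn_dim_in = multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of)
        return ffn_dim, ffn_dim_in

    # hidden_size=4096 with activation="swiglu" and nothing else set:
    #   linear1 output (gate and value fused) = 11264, linear2 input = 5632
    print(gated_ffn_dims(4096))

With these defaults a swiglu model at hidden_size=4096 therefore gets a per-projection width of 5632; to reproduce a released model's exact MLP width (for example LLaMA-7B's 11008), set intermediate_size explicitly in the config rather than relying on the gated default.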