diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index d2b93eb06..f14076a17 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -979,6 +979,7 @@ def __init__(
         self.gpt_j_tied = neox_args.gpt_j_tied
         self.mlp_type = neox_args.mlp_type
         self.moe_type = neox_args.moe_type
+        self.activation = neox_args.activation
         if self.gpt_j_residual:
             # GPT-J style layers allow us to defer the reduction of results across TP ranks until the end of the two sublayers.
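
The hunk above caches the configured activation name on the layer object during `__init__`. As a minimal sketch of that pattern (not the GPT-NeoX implementation; `SimpleConfig`, `get_activation_fn`, and `TinyLayer` are hypothetical stand-ins for `neox_args` and the real layer), later forward-pass logic can branch on `self.activation` without re-reading the global config:

```python
import torch
import torch.nn.functional as F


class SimpleConfig:
    """Hypothetical stand-in for neox_args: holds model hyperparameters."""

    def __init__(self, activation="gelu"):
        self.activation = activation


def get_activation_fn(name):
    """Map an activation name from the config to a callable."""
    return {"gelu": F.gelu, "relu": F.relu, "silu": F.silu}[name]


class TinyLayer(torch.nn.Module):
    def __init__(self, args, hidden=16):
        super().__init__()
        # Cache the activation name on the layer, mirroring the added line above.
        self.activation = args.activation
        self.fc = torch.nn.Linear(hidden, hidden)

    def forward(self, x):
        # Downstream code consults self.activation instead of the config object.
        return get_activation_fn(self.activation)(self.fc(x))


if __name__ == "__main__":
    layer = TinyLayer(SimpleConfig(activation="silu"))
    print(layer(torch.randn(2, 16)).shape)  # torch.Size([2, 16])
```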