* Add simple util for CUDA timings
* Add fused layernorm kernel from Megatron (closes #952)
* Change default fused layernorm to false
* Update test_setup.yml
* Update test_train_base.yml

Co-authored-by: Yang Zhang <[email protected]>
Co-authored-by: jahatef <[email protected]>
Co-authored-by: Jacob Hatef <[email protected]>
1 parent 7a8fa2f · commit 3d8fec0 · 23 changed files with 285 additions and 1 deletion.
@@ -0,0 +1,51 @@
import torch.cuda


class Metric:
    """
    Simple utility to collect and report wall-time metrics.
    """

    def __init__(self, label):
        self.label = label
        self.measurements = []

    def collect(self, measurement):
        self.measurements.append(measurement)

    def get_measurements(self):
        return self.measurements[:]

    def report(self):
        # Print the label followed by the deciles of the collected measurements.
        print(
            self.label,
            torch.quantile(torch.tensor(self.measurements), torch.arange(10) / 10.0),
        )


def monitor_method_cuda_wall_times(metric, obj, methodname):
    """
    Measure CUDA wall-clock timings for a method on an object or class.
    For instance:
    >>> metric = Metric('!LNORM')
    >>> monitor_method_cuda_wall_times(metric, LayerNorm, 'forward')
    """
    oldmeth = getattr(obj, methodname)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    def newmeth(*args, **kw):
        start_event.record()
        try:
            return oldmeth(*args, **kw)
        finally:
            end_event.record()
            torch.cuda.synchronize()
            elapsed = start_event.elapsed_time(end_event)  # milliseconds
            metric.collect(elapsed)
            metric.report()

    setattr(obj, methodname, newmeth)
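
For reference, here is a minimal usage sketch (not part of the commit) showing how the timing util above might be attached to a layer. It patches torch.nn.LayerNorm.forward, mirroring the docstring example, and assumes a CUDA-capable device; Metric and monitor_method_cuda_wall_times come from the file above, and the exact import path depends on where the file lives in the repo.

import torch
import torch.nn as nn

# Metric and monitor_method_cuda_wall_times are assumed importable from the
# util file above (module path depends on where it is placed in the repo).

metric = Metric("!LNORM")
monitor_method_cuda_wall_times(metric, nn.LayerNorm, "forward")

layer = nn.LayerNorm(1024).cuda()
x = torch.randn(8, 1024, device="cuda")
for _ in range(5):
    layer(x)  # each call records CUDA events and prints decile timings (ms)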
@@ -0,0 +1,150 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""This code is copied from NVIDIA apex:
https://github.com/NVIDIA/apex
with some changes."""

import numbers
import torch
from torch.nn.parameter import Parameter
from torch.nn import init
import importlib
from torch.nn import functional as F
import inspect

from megatron.utils import make_viewless_tensor

try:
    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN

    HAVE_PERSIST_LAYER_NORM = True
except:
    HAVE_PERSIST_LAYER_NORM = False

from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction


global fused_layer_norm_cuda
fused_layer_norm_cuda = None


class MixedFusedLayerNorm(torch.nn.Module):
    def __init__(
        self,
        normalized_shape,
        eps=1e-5,
        no_persist_layer_norm=True,
        sequence_parallel=False,
        apply_layernorm_1p=False,
        mem_efficient_ln=True,
    ):
        super(MixedFusedLayerNorm, self).__init__()

        self.apply_layernorm_1p = apply_layernorm_1p
        self.mem_efficient_ln = mem_efficient_ln

        global fused_layer_norm_cuda
        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")

        # List of hidden sizes supported by the persistent layer norm kernel.
        # If the hidden size is not supported, fall back to the non-persistent
        # kernel.
        persist_ln_hidden_sizes = [
            1024,
            1536,
            2048,
            2304,
            3072,
            3840,
            4096,
            5120,
            6144,
            8192,
            10240,
            12288,
            12800,
            15360,
            16384,
            18432,
            20480,
            24576,
            25600,
            30720,
            32768,
            40960,
            49152,
            65536,
        ]
        if (
            normalized_shape not in persist_ln_hidden_sizes
            or not HAVE_PERSIST_LAYER_NORM
        ):
            no_persist_layer_norm = True

        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        self.weight = Parameter(torch.Tensor(*normalized_shape))
        self.bias = Parameter(torch.Tensor(*normalized_shape))
        self.reset_parameters()
        self.no_persist_layer_norm = no_persist_layer_norm
        self.sequence_parallel = sequence_parallel

        # Set the sequence-parallelism flag on the weight and bias parameters.
        setattr(self.weight, "sequence_parallel", self.sequence_parallel)
        setattr(self.bias, "sequence_parallel", self.sequence_parallel)

    def reset_parameters(self):
        if self.apply_layernorm_1p:
            init.zeros_(self.weight)
            init.zeros_(self.bias)
        else:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def forward(self, input):
        weight = self.weight + 1 if self.apply_layernorm_1p else self.weight
        # The CPU path is here for the sake of unit tests.
        if not input.is_cuda:
            print(
                "WARNING! The input of FusedLayerNorm should be on the GPU. "
                "This warning should only be triggered in the FusedLayerNorm unit tests."
            )
            return F.layer_norm(
                input, self.normalized_shape, weight, self.bias, self.eps
            )

        if self.no_persist_layer_norm:
            # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648),
            # so we inspect the function signature to check whether the extra arg
            # introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet.
            if (
                "memory_efficient"
                in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args
            ):
                return FusedLayerNormAffineFunction.apply(
                    input,
                    weight,
                    self.bias,
                    self.normalized_shape,
                    self.eps,
                    self.mem_efficient_ln,
                )
            else:
                return FusedLayerNormAffineFunction.apply(
                    input, weight, self.bias, self.normalized_shape, self.eps
                )
        else:
            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)

            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
            # a populated '_base' field). This will result in schedule.py's
            # deallocate_output_tensor() throwing an error, so a viewless tensor is
            # created to prevent this.
            output = make_viewless_tensor(
                inp=output, requires_grad=input.requires_grad, keep_graph=True
            )

            return output
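
As a sanity check (not part of the commit), the fused module can be compared against PyTorch's reference layer norm. The following is a minimal sketch assuming apex is installed with its fused_layer_norm_cuda extension built, a GPU is available, and the file above is importable from megatron/model/fused_layer_norm.py (path assumed).

import torch
from megatron.model.fused_layer_norm import MixedFusedLayerNorm  # path assumed

hidden = 4096  # one of the sizes supported by the persistent kernel
ln = MixedFusedLayerNorm(hidden, eps=1e-5, no_persist_layer_norm=False).cuda()
x = torch.randn(4, 128, hidden, device="cuda")

fused_out = ln(x)
ref_out = torch.nn.functional.layer_norm(
    x, ln.normalized_shape, ln.weight, ln.bias, ln.eps
)
# The two paths should agree closely in fp32 (differences on the order of 1e-6).
print((fused_out - ref_out).abs().max())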