Fix numeric instability in LayerNormalization and BatchNormalization.
james77777778 committed Sep 13, 2024
1 parent e7b5a5d commit c2f5651
Showing 2 changed files with 12 additions and 16 deletions.
15 changes: 6 additions & 9 deletions keras/src/layers/normalization/batch_normalization.py
@@ -4,7 +4,6 @@
from keras.src import ops
from keras.src import regularizers
from keras.src.api_export import keras_export
from keras.src.backend import standardize_dtype
from keras.src.layers.input_spec import InputSpec
from keras.src.layers.layer import Layer

@@ -244,11 +243,11 @@ def call(self, inputs, training=None, mask=None):
f"mask.shape={mask.shape}, inputs.shape={inputs.shape}"
)

input_dtype = standardize_dtype(inputs.dtype)
if input_dtype in ("float16", "bfloat16"):
# BN is prone to overflowing for float16/bfloat16 inputs, so we opt
# out BN for mixed precision.
inputs = ops.cast(inputs, "float32")
input_dtype = backend.standardize_dtype(inputs.dtype)
compute_dtype = backend.result_type(input_dtype, "float32")
# BN is prone to overflow with float16/bfloat16 inputs, so we upcast to
# float32 for the subsequent computations.
inputs = ops.cast(inputs, compute_dtype)

moving_mean = ops.cast(self.moving_mean, inputs.dtype)
moving_variance = ops.cast(self.moving_variance, inputs.dtype)
@@ -286,9 +285,7 @@ def call(self, inputs, training=None, mask=None):
scale=gamma,
epsilon=self.epsilon,
)
if input_dtype in ("float16", "bfloat16"):
outputs = ops.cast(outputs, input_dtype)
return outputs
return ops.cast(outputs, input_dtype)

def get_config(self):
base_config = super().get_config()
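Context for the change above (an illustration of mine, not part of the commit): half-precision second-moment statistics overflow easily because float16's largest finite value is about 65504, so squaring activations of magnitude ~256 already produces inf. A minimal sketch, using only NumPy, of why the code now always upcasts before computing the normalization statistics:

# Minimal sketch (not from the commit): why half-precision moments overflow.
import numpy as np

x = np.full((1024,), 256.0, dtype="float16")  # plausible activation magnitude

# 256**2 = 65536 exceeds float16's max finite value (~65504), so the square
# overflows to inf and the mean of squares is inf as well.
print(np.mean(np.square(x)))  # inf

# Upcasting first, as the new code path always does, keeps the result finite.
print(np.mean(np.square(x.astype("float32"))))  # 65536.0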
13 changes: 6 additions & 7 deletions keras/src/layers/normalization/layer_normalization.py
@@ -1,3 +1,4 @@
from keras.src import backend
from keras.src import constraints
from keras.src import initializers
from keras.src import ops
@@ -179,7 +180,6 @@ def build(self, input_shape):
self.built = True

def call(self, inputs):
inputs = ops.cast(inputs, self.compute_dtype)
# Compute the axes along which to reduce the mean / variance
input_shape = inputs.shape
ndims = len(input_shape)
@@ -199,11 +199,11 @@ def _broadcast(v):
return ops.reshape(v, broadcast_shape)
return v

input_dtype = inputs.dtype
if input_dtype in ("float16", "bfloat16") and self.dtype == "float32":
# If mixed precision is used, cast inputs to float32 so that
# this is at least as numerically stable as the fused version.
inputs = ops.cast(inputs, "float32")
input_dtype = backend.standardize_dtype(inputs.dtype)
compute_dtype = backend.result_type(input_dtype, "float32")
# LN is prone to overflow with float16/bfloat16 inputs, so we upcast to
# float32 for the subsequent computations.
inputs = ops.cast(inputs, compute_dtype)

if self.rms_scaling:
# Calculate outputs with only variance and gamma if rms scaling
@@ -231,7 +231,6 @@ def _broadcast(v):
res = res + beta

outputs = inputs * inv + res

return ops.cast(outputs, input_dtype)

def compute_output_shape(self, input_shape):
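A note on why both layers now call backend.result_type instead of hard-coding float32 (my reading, not stated in the commit): result_type promotes to the widest participating dtype, so float16/bfloat16 inputs are upcast for stability while float64 inputs keep their precision rather than being silently downcast. A minimal sketch, assuming the public keras.backend.result_type mirrors the internal keras.src.backend.result_type used in the diff:

# Minimal sketch (assumes the public keras.backend API matches keras.src.backend).
from keras import backend

print(backend.result_type("float16", "float32"))   # float32 -> upcast for stability
print(backend.result_type("bfloat16", "float32"))  # float32 -> upcast for stability
print(backend.result_type("float64", "float32"))   # float64 -> precision preserved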
