add adamax for torch (keras-team#549)

Co-authored-by: Haifeng Jin <[email protected]>
freedomtan · Jul 19, 2023 · 8514729 · 8514729
1 parent dc9d822
commit 8514729
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 4 deletions.
diff --git a/keras_core/backend/torch/optimizers/torch_adamax.py b/keras_core/backend/torch/optimizers/torch_adamax.py
@@ -0,0 +1,52 @@
+import torch
+
+from keras_core import ops
+from keras_core import optimizers
+from keras_core.backend.torch.optimizers import torch_parallel_optimizer
+
+
+class Adamax(
+    torch_parallel_optimizer.TorchParallelOptimizer, optimizers.Adamax
+):
+    """Adamax for the torch backend, batched with `torch._foreach_*` ops."""
+
+    def _parallel_update_step(self, grads, variables, learning_rate):
+        """Apply one Adamax step to every entry of `variables` in place.
+
+        Args:
+            grads: Gradients handed to the `torch._foreach_*` ops as a
+                list; presumably one tensor per variable, in the same
+                order as `variables` (confirm against the caller).
+            variables: Variables to update. Each one must expose `.value`
+                and be resolvable through `self._get_variable_index`.
+            learning_rate: Step size; cast to the dtype of the first
+                variable before use.
+        """
+        params = [var.value for var in variables]
+
+        # All scalars are expressed in the dtype of the first variable.
+        dtype = params[0].dtype
+        lr = ops.cast(learning_rate, dtype)
+        step = ops.cast(self.iterations + 1, dtype)
+        beta_1_power = ops.power(ops.cast(self.beta_1, dtype), step)
+
+        # Look up each variable's slot index once and reuse it for both
+        # per-variable state lists.
+        slots = [self._get_variable_index(var) for var in variables]
+        moments = [self._m[slot].value for slot in slots]
+        norms = [self._u[slot].value for slot in slots]
+
+        # m <- beta_1 * m + (1 - beta_1) * grad   (in place)
+        torch._foreach_mul_(moments, self.beta_1)
+        torch._foreach_add_(moments, grads, alpha=1 - self.beta_1)
+
+        # u <- max(beta_2 * u, |grad|)   (in place)
+        torch._foreach_mul_(norms, self.beta_2)
+        torch._foreach_maximum_(norms, torch._foreach_abs(grads))
+
+        # var <- var - (lr * m) / ((u + epsilon) * (1 - beta_1 ** step))
+        # The state lists must already hold this step's values here, so the
+        # two in-place updates above have to stay ahead of this one.
+        numerators = torch._foreach_mul(moments, lr)
+        denominators = torch._foreach_mul(
+            torch._foreach_add(norms, self.epsilon),
+            1 - beta_1_power,
+        )
+        torch._foreach_add_(
+            params,
+            torch._foreach_div(numerators, denominators),
+            alpha=-1,
+        )
diff --git a/keras_core/backend/torch/optimizers/torch_optimizer.py b/keras_core/backend/torch/optimizers/torch_optimizer.py
@@ -10,6 +10,7 @@ def __new__(cls, *args, **kwargs):
         from keras_core.backend.torch.optimizers import torch_adadelta
         from keras_core.backend.torch.optimizers import torch_adagrad
         from keras_core.backend.torch.optimizers import torch_adam
+        from keras_core.backend.torch.optimizers import torch_adamax
         from keras_core.backend.torch.optimizers import torch_adamw
         from keras_core.backend.torch.optimizers import torch_rmsprop
         from keras_core.backend.torch.optimizers import torch_sgd
@@ -18,6 +19,7 @@ def __new__(cls, *args, **kwargs):
             optimizers.Adadelta: torch_adadelta.Adadelta,
             optimizers.Adagrad: torch_adagrad.Adagrad,
             optimizers.Adam: torch_adam.Adam,
+            optimizers.Adamax: torch_adamax.Adamax,
             optimizers.AdamW: torch_adamw.AdamW,
             optimizers.RMSprop: torch_rmsprop.RMSprop,
             optimizers.SGD: torch_sgd.SGD,

diff --git a/keras_core/optimizers/adamax_test.py b/keras_core/optimizers/adamax_test.py
@@ -4,6 +4,7 @@
 import numpy as np
 
 from keras_core import backend
+from keras_core import ops
 from keras_core import testing
 from keras_core.optimizers.adamax import Adamax
 
@@ -20,14 +21,14 @@ def test_config(self):
 
     def test_single_step(self):
         optimizer = Adamax(learning_rate=0.5)
-        grads = np.array([1.0, 6.0, 7.0, 2.0])
+        grads = ops.array([1.0, 6.0, 7.0, 2.0])
         vars = backend.Variable([1.0, 2.0, 3.0, 4.0])
         optimizer.apply_gradients(zip([grads], [vars]))
         self.assertAllClose(vars, [0.5, 1.5, 2.5, 3.5], rtol=1e-4, atol=1e-4)
 
     def test_weight_decay(self):
         grads, var1, var2, var3 = (
-            np.zeros(()),
+            ops.zeros(()),
             backend.Variable(2.0),
             backend.Variable(2.0, name="exclude"),
             backend.Variable(2.0),
@@ -53,8 +54,8 @@ def test_correctness_with_golden(self):
         )
 
         x = backend.Variable(np.ones([10]))
-        grads = np.arange(0.1, 1.1, 0.1)
-        first_grads = np.full((10,), 0.01)
+        grads = ops.arange(0.1, 1.1, 0.1)
+        first_grads = ops.full((10,), 0.01)
 
         # fmt: off
         golden = np.array(