add adadelta for torch (#534)

Co-authored-by: Haifeng Jin <haifeng-jin@users.noreply.github.com>
Authored by Haifeng Jin on 2023-07-18 18:44:30 -07:00; committed by Francois Chollet
parent bacd49c4d2
commit 2e9ed2ddba
5 changed files with 66 additions and 8 deletions

@@ -0,0 +1,56 @@
import torch

from keras_core import ops
from keras_core import optimizers
from keras_core.backend.torch.optimizers import torch_parallel_optimizer


class Adadelta(
    torch_parallel_optimizer.TorchParallelOptimizer, optimizers.Adadelta
):
    def _parallel_update_step(
        self,
        grads,
        variables,
        learning_rate,
    ):
        keras_variables = variables
        variables = [v.value for v in variables]

        dtype = variables[0].dtype
        lr = ops.cast(learning_rate, dtype)
        rho = self.rho

        # Collect the underlying torch tensors of the optimizer slots so the
        # fused torch._foreach_* ops can update them in place.
        accumulated_grads = [
            self._accumulated_grads[self._get_variable_index(variable)].value
            for variable in keras_variables
        ]
        accumulated_delta_vars = [
            self._accumulated_delta_vars[
                self._get_variable_index(variable)
            ].value
            for variable in keras_variables
        ]

        # accumulated_grads <- rho * accumulated_grads + (1 - rho) * grads**2
        torch._foreach_mul_(accumulated_grads, rho)
        torch._foreach_add_(
            accumulated_grads, torch._foreach_mul(grads, grads), alpha=1 - rho
        )

        def rms(x):
            return torch._foreach_sqrt(torch._foreach_add(x, self.epsilon))

        # delta_vars <- -rms(accumulated_delta_vars) / rms(accumulated_grads) * grads
        delta_vars = torch._foreach_mul(
            torch._foreach_div(
                torch._foreach_mul(rms(accumulated_delta_vars), grads),
                rms(accumulated_grads),
            ),
            -1,
        )
        # accumulated_delta_vars <- rho * accumulated_delta_vars
        #                           + (1 - rho) * delta_vars**2
        torch._foreach_mul_(accumulated_delta_vars, rho)
        torch._foreach_add_(
            accumulated_delta_vars,
            torch._foreach_mul(delta_vars, delta_vars),
            alpha=1 - rho,
        )
        # variables <- variables + lr * delta_vars
        torch._foreach_add_(variables, delta_vars, alpha=lr)
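
For reference, the chain of fused torch._foreach_* calls above is elementwise equivalent to the following single-tensor update. This is an illustrative sketch only, not part of the commit; the helper name is made up, and the defaults shown follow the usual Keras Adadelta defaults.

import torch

def adadelta_step(var, grad, acc_grad, acc_delta, lr=0.001, rho=0.95, eps=1e-7):
    # Accumulate the squared gradient.
    acc_grad.mul_(rho).add_(grad * grad, alpha=1 - rho)
    # Scale the gradient by the ratio of the two RMS terms.
    delta = -torch.sqrt(acc_delta + eps) / torch.sqrt(acc_grad + eps) * grad
    # Accumulate the squared update.
    acc_delta.mul_(rho).add_(delta * delta, alpha=1 - rho)
    # Apply the scaled update to the parameter.
    var.add_(delta, alpha=lr)
    return var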

@@ -7,12 +7,14 @@ from keras_core.optimizers.base_optimizer import BaseOptimizer
 class TorchOptimizer(BaseOptimizer):
     def __new__(cls, *args, **kwargs):
         # Import locally to avoid circular imports.
+        from keras_core.backend.torch.optimizers import torch_adadelta
         from keras_core.backend.torch.optimizers import torch_adam
         from keras_core.backend.torch.optimizers import torch_adamw
         from keras_core.backend.torch.optimizers import torch_rmsprop
         from keras_core.backend.torch.optimizers import torch_sgd

         OPTIMIZERS = {
+            optimizers.Adadelta: torch_adadelta.Adadelta,
             optimizers.Adam: torch_adam.Adam,
             optimizers.AdamW: torch_adamw.AdamW,
             optimizers.RMSprop: torch_rmsprop.RMSprop,
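
With the mapping registered, constructing the generic Adadelta under the torch backend should transparently hand back the fused implementation. A minimal usage sketch, assuming the KERAS_BACKEND environment variable selects the backend before keras_core is imported and that TorchOptimizer.__new__ resolves classes through the OPTIMIZERS mapping, as the hunk suggests:

import os

os.environ["KERAS_BACKEND"] = "torch"

from keras_core import optimizers

# The generic class is expected to resolve to the torch Adadelta subclass
# via the OPTIMIZERS mapping added above.
opt = optimizers.Adadelta(learning_rate=0.5)
print(type(opt))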

@@ -57,9 +57,8 @@ class RMSprop(
                 self._momentums[self._get_variable_index(variable)].value
                 for variable in keras_variables
             ]
-            momentum_list = torch._foreach_add(
-                increments, momentum_list, alpha=self.momentum
-            )
+            torch._foreach_mul_(momentum_list, self.momentum)
+            torch._foreach_add_(momentum_list, increments)
             torch._foreach_add_(variables, momentum_list, alpha=-1)
         else:
             torch._foreach_add_(variables, increments, alpha=-1)
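
The RMSprop change swaps an out-of-place torch._foreach_add for an in-place mul_/add_ pair, so the momentum slot tensors themselves are updated rather than a freshly allocated list. A small stand-alone sketch of the difference (names are illustrative):

import torch

momentum_list = [torch.ones(2)]        # stands in for the momentum slots
increments = [torch.full((2,), 0.5)]
momentum = 0.9

# Out-of-place: returns new tensors; the slot tensors are left untouched.
new_list = torch._foreach_add(increments, momentum_list, alpha=momentum)
print(momentum_list[0])                # still tensor([1., 1.])

# In-place: the slot tensors now hold momentum * m + increment directly.
torch._foreach_mul_(momentum_list, momentum)
torch._foreach_add_(momentum_list, increments)
print(momentum_list[0])                # tensor([1.4000, 1.4000])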

@@ -15,7 +15,7 @@ class SGD(torch_parallel_optimizer.TorchParallelOptimizer, optimizers.SGD):
         variables = [v.value for v in variables]
         if self.momentum != 0:
             bufs = [
-                self.momentums[self._get_variable_index(variable.value)].value
+                self.momentums[self._get_variable_index(variable)].value
                 for variable in keras_variables
             ]

@@ -1,6 +1,7 @@
 import numpy as np

 from keras_core import backend
+from keras_core import ops
 from keras_core import testing
 from keras_core.optimizers.adadelta import Adadelta
@@ -16,7 +17,7 @@ class AdadeltaTest(testing.TestCase):
     def test_single_step(self):
         optimizer = Adadelta(learning_rate=0.5)
-        grads = np.array([1.0, 6.0, 7.0, 2.0])
+        grads = ops.array([1.0, 6.0, 7.0, 2.0])
         vars = backend.Variable([1.0, 2.0, 3.0, 4.0])
         optimizer.apply_gradients(zip([grads], [vars]))
         self.assertAllClose(
@@ -25,7 +26,7 @@ class AdadeltaTest(testing.TestCase):
     def test_weight_decay(self):
         grads, var1, var2, var3 = (
-            np.zeros(()),
+            ops.zeros(()),
             backend.Variable(2.0),
             backend.Variable(2.0, name="exclude"),
             backend.Variable(2.0),
@@ -49,8 +50,8 @@ class AdadeltaTest(testing.TestCase):
         optimizer = Adadelta(learning_rate=1.0, rho=0.8, epsilon=1e-6)
         x = backend.Variable(np.ones([10]))
-        grads = np.arange(0.1, 1.1, 0.1)
-        first_grads = np.full((10,), 0.01)
+        grads = ops.arange(0.1, 1.1, 0.1)
+        first_grads = ops.full((10,), 0.01)
         golden = np.tile(
             [[0.9978], [0.9947], [0.9915], [0.9882], [0.9849]], (1, 10)