add adam for torch (#531)

Co-authored-by: Haifeng Jin <haifeng-jin@users.noreply.github.com>
Haifeng Jin 2023-07-18 12:36:10 -07:00 committed by Francois Chollet
parent fcda274b00
commit 860b1ca4da
6 changed files with 88 additions and 20 deletions

@@ -0,0 +1,58 @@
import torch
from keras_core import ops
from keras_core import optimizers
from keras_core.backend.torch.optimizers import torch_parallel_optimizer
class Adam(torch_parallel_optimizer.TorchParallelOptimizer, optimizers.Adam):
def _parallel_update_step(
self,
grads,
variables,
learning_rate,
):
keras_variables = variables
variables = [v.value for v in variables]
dtype = variables[0].dtype
lr = ops.cast(learning_rate, dtype)
local_step = ops.cast(self.iterations + 1, dtype)
beta_1_power = ops.power(ops.cast(self.beta_1, dtype), local_step)
beta_2_power = ops.power(ops.cast(self.beta_2, dtype), local_step)
alpha = lr * ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)
m_list = [
self._momentums[self._get_variable_index(variable)].value
for variable in keras_variables
]
v_list = [
self._velocities[self._get_variable_index(variable)].value
for variable in keras_variables
]
torch._foreach_mul_(m_list, self.beta_1)
torch._foreach_add_(m_list, grads, alpha=1 - self.beta_1)
torch._foreach_mul_(v_list, self.beta_2)
torch._foreach_add_(
v_list, torch._foreach_mul(grads, grads), alpha=1 - self.beta_2
)
if self.amsgrad:
v_hat_list = [
self._velocity_hats[self._get_variable_index(variable)].value
for variable in keras_variables
]
torch._foreach_maximum_(v_hat_list, v_list)
v_list = v_hat_list
torch._foreach_add_(
variables,
torch._foreach_div(
torch._foreach_mul(m_list, alpha),
torch._foreach_add(torch._foreach_sqrt(v_list), self.epsilon),
),
alpha=-1,
)
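
For readers new to torch's fused `_foreach_*` ops: the block above is the standard Adam update applied to every variable in one call per op. A minimal per-variable sketch of the same math with plain tensor ops (the function and argument names are illustrative, not part of the commit):

import torch

def adam_step_single(variable, grad, m, v, step_size, beta_1, beta_2, epsilon):
    # m and v are this variable's first/second moment buffers, updated in place
    # just like the _foreach_mul_ / _foreach_add_ calls above.
    m.mul_(beta_1).add_(grad, alpha=1 - beta_1)
    v.mul_(beta_2).add_(grad * grad, alpha=1 - beta_2)
    # step_size corresponds to `alpha` above:
    # lr * sqrt(1 - beta_2**t) / (1 - beta_1**t)
    variable.sub_(step_size * m / (v.sqrt() + epsilon))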

@@ -1,15 +1,19 @@
import torch
from keras_core import optimizers
from keras_core.optimizers.base_optimizer import BaseOptimizer
class TorchOptimizer(BaseOptimizer):
def __new__(cls, *args, **kwargs):
# Import locally to avoid circular imports.
from keras_core import optimizers
from keras_core.backend.torch.optimizers import torch_adam
from keras_core.backend.torch.optimizers import torch_sgd
- OPTIMIZERS = {optimizers.SGD: torch_sgd.SGD}
+ OPTIMIZERS = {
+ optimizers.SGD: torch_sgd.SGD,
+ optimizers.Adam: torch_adam.Adam,
+ }
if cls in OPTIMIZERS:
return OPTIMIZERS[cls](*args, **kwargs)
return super().__new__(cls)
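
With the registry above, constructing the generic optimizer class under the torch backend should hand back the fused subclass. A hedged usage sketch (assumes the torch backend is selected via KERAS_BACKEND before keras_core is imported):

import os
os.environ["KERAS_BACKEND"] = "torch"  # must be set before importing keras_core

from keras_core import optimizers
from keras_core.backend.torch.optimizers import torch_adam

opt = optimizers.Adam(learning_rate=0.5)
# __new__ above redirects the generic class to its torch-specific counterpart.
print(isinstance(opt, torch_adam.Adam))  # expected: True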

@@ -0,0 +1,13 @@
from keras_core.optimizers.base_optimizer import BaseOptimizer
class TorchParallelOptimizer(BaseOptimizer):
def _internal_apply_gradients(self, grads_and_vars):
grads, trainable_variables = zip(*grads_and_vars)
self._parallel_update_step(
grads,
trainable_variables,
self._get_current_learning_rate(),
)
self.iterations.assign(self.iterations + 1)
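
The base class above centralizes the bookkeeping (unzipping grads and variables, fetching the current learning rate, bumping `iterations`), so a fused optimizer only has to provide `_parallel_update_step`. A hedged illustration of the pattern (the class name and the plain no-momentum SGD rule are hypothetical, not part of this commit):

import torch

from keras_core import optimizers
from keras_core.backend.torch.optimizers import torch_parallel_optimizer


class FusedPlainSGD(
    torch_parallel_optimizer.TorchParallelOptimizer, optimizers.SGD
):
    def _parallel_update_step(self, grads, variables, learning_rate):
        # Unwrap Keras variables to their underlying torch tensors.
        variables = [v.value for v in variables]
        lr = float(learning_rate)  # foreach alpha expects a Python number
        # variable -= lr * grad, applied to all variables in one fused call.
        torch._foreach_add_(variables, grads, alpha=-lr)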

@@ -1,29 +1,22 @@
import torch
from keras_core import optimizers
from keras_core.backend.torch.optimizers import torch_parallel_optimizer
- class SGD(optimizers.SGD):
- def _internal_apply_gradients(self, grads_and_vars):
- grads, trainable_variables = zip(*grads_and_vars)
- self._parallel_update_step(
- grads,
- [v.value for v in trainable_variables],
- self._get_current_learning_rate(),
- )
- self.iterations.assign(self.iterations + 1)
+ class SGD(torch_parallel_optimizer.TorchParallelOptimizer, optimizers.SGD):
def _parallel_update_step(
self,
grads,
variables,
learning_rate,
):
keras_variables = variables
variables = [v.value for v in variables]
if self.momentum != 0:
bufs = [
- self.momentums[self._get_variable_index(variable.value)]
- for variable in variables
+ self.momentums[self._get_variable_index(variable.value)].value
+ for variable in keras_variables
]
for i in range(len(bufs)):

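The `bufs` above are the per-variable momentum buffers; the rest of this hunk, truncated here, goes on to apply the momentum rule to them with fused `_foreach` ops. For reference, Keras's documented SGD momentum rule is sketched below per variable (illustrative names, not the verbatim file contents):

# velocity = momentum * velocity - learning_rate * grad
# variable = variable + velocity
def sgd_momentum_step(variable, grad, velocity, learning_rate, momentum):
    velocity.mul_(momentum).add_(grad, alpha=-learning_rate)
    variable.add_(velocity)
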
@@ -5,7 +5,6 @@ from keras_core.backend import KerasTensor
class SymbolicArguments:
def __init__(self, *args, **kwargs):
self.args = tree.map_structure(lambda x: x, args)
self.kwargs = tree.map_structure(lambda x: x, kwargs)
self._flat_arguments = tree.flatten((self.args, self.kwargs))

@@ -2,6 +2,7 @@ import numpy as np
import keras_core
from keras_core import backend
from keras_core import ops
from keras_core import testing
from keras_core.optimizers.adam import Adam
@@ -19,14 +20,14 @@ class AdamTest(testing.TestCase):
def test_single_step(self):
optimizer = Adam(learning_rate=0.5)
- grads = np.array([1.0, 6.0, 7.0, 2.0])
+ grads = ops.array([1.0, 6.0, 7.0, 2.0])
vars = backend.Variable([1.0, 2.0, 3.0, 4.0])
optimizer.apply_gradients(zip([grads], [vars]))
self.assertAllClose(vars, [0.5, 1.5, 2.5, 3.5], rtol=1e-4, atol=1e-4)
def test_weight_decay(self):
grads, var1, var2, var3 = (
- np.zeros(()),
+ ops.zeros(()),
backend.Variable(2.0),
backend.Variable(2.0, name="exclude"),
backend.Variable(2.0),
@@ -50,8 +51,8 @@ class AdamTest(testing.TestCase):
optimizer = Adam(amsgrad=True)
x = backend.Variable(np.ones([10]))
- grads = np.arange(0.1, 1.1, 0.1)
- first_grads = np.full((10,), 0.01)
+ grads = ops.arange(0.1, 1.1, 0.1)
+ first_grads = ops.full((10,), 0.01)
golden = np.tile(
[[0.999], [0.9982], [0.9974], [0.9965], [0.9955]], (1, 10)