# keras/optimizers.py
import theano
import theano.tensor as T
import numpy as np
from utils.theano_utils import shared_zeros, shared_scalar


def clip_norm(g, c, n):
    # Rescale gradient g by c/n when the global gradient norm n exceeds the
    # threshold c; pass g through unchanged otherwise (or when c <= 0).
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g
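

# Base class for optimizers. get_gradients() computes dcost/dparams and, when
# the corresponding attributes are set on an instance, applies global-norm
# gradient clipping (clipnorm) and adds L1/L2 penalty terms (l1, l2) to the
# gradients. Subclasses implement get_updates() to turn a cost and a list of
# parameters into a list of Theano (shared_variable, new_value) update pairs.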
class Optimizer(object):

    def get_updates(self, params, cost):
        raise NotImplementedError

    def get_gradients(self, cost, params):
        grads = T.grad(cost, params)

        if hasattr(self, 'clipnorm') and self.clipnorm > 0:
            # clip each gradient against the global L2 norm of all gradients
            norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]

        new_grads = []
        for p, g in zip(params, grads):
            if hasattr(self, 'l1') and self.l1 > 0:
                g += T.sgn(p) * self.l1

            if hasattr(self, 'l2') and self.l2 > 0:
                g += p * self.l2

            if hasattr(self, 'maxnorm') and self.maxnorm > 0:
                # column-wise max-norm rescaling of p (local to this loop)
                norms = T.sqrt(T.sum(T.sqr(p), axis=0))
                desired = T.clip(norms, 0, self.maxnorm)
                p = p * (desired / (1e-7 + norms))

            new_grads.append(g)
        return new_grads
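

# SGD with optional momentum, Nesterov momentum, and linear learning-rate
# decay: lr_t = lr - decay * t, v = momentum * v - lr_t * g, then p += v
# (or p += momentum * v - lr_t * g in the Nesterov case).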
class SGD(Optimizer):

    def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, *args, **kwargs):
        self.__dict__.update(locals())
        self.iterations = shared_scalar(0)

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        lr = self.lr - self.decay * self.iterations
        updates = [(self.iterations, self.iterations + 1.)]

        for p, g in zip(params, grads):
            m = shared_zeros(p.get_value().shape)  # momentum
            v = self.momentum * m - lr * g  # velocity
            updates.append((m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v
            updates.append((p, new_p))
        return updates
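

# RMSprop: keep a running average of squared gradients and divide each
# gradient by its root: a = rho * a + (1 - rho) * g**2,
# p -= lr * g / sqrt(a + epsilon).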
class RMSprop(Optimizer):

    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

        for p, g, a in zip(params, grads, accumulators):
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
            updates.append((p, new_p))
        return updates
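

# Adagrad: accumulate the sum of squared gradients over all steps and scale
# each update by its inverse square root:
# a += g**2, p -= lr * g / sqrt(a + epsilon).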
class Adagrad(Optimizer):

    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

        for p, g, a in zip(params, grads, accumulators):
            new_a = a + g ** 2  # update accumulator
            updates.append((a, new_a))

            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
            updates.append((p, new_p))
        return updates
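

# Adadelta: maintain running averages of both squared gradients and squared
# parameter updates, and scale each step by the ratio of their RMS values:
# a = rho * a + (1 - rho) * g**2,
# update = g * sqrt(d_a + epsilon) / sqrt(a + epsilon),
# d_a = rho * d_a + (1 - rho) * update**2.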
class Adadelta(Optimizer):

    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
            updates.append((p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
            updates.append((d_a, new_d_a))
        return updates


# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta

from utils.generic_utils import get_from_module


def get(identifier):
    return get_from_module(identifier, globals(), 'optimizer', instantiate=True)
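

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only, not part of the library API):
    # minimize a toy quadratic with SGD. Assumes Theano is installed and the
    # utils.* imports above resolve; optimizers can also be looked up by
    # name via the aliases, e.g. get('sgd').
    w = theano.shared(np.asarray([3.0, -2.0], dtype=theano.config.floatX), name='w')
    cost = T.sum((w - 1.0) ** 2)

    opt = SGD(lr=0.1, momentum=0.9, nesterov=True)
    train = theano.function([], cost, updates=opt.get_updates([w], cost))

    for i in range(200):
        train()
    print(w.get_value())  # should approach [1., 1.]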