import theano
import theano.tensor as T
import numpy as np

from utils.theano_utils import shared_zeros, shared_scalar


def clip_norm(g, c, n):
    # rescale gradient g when the global gradient norm n exceeds the threshold c
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g


class Optimizer(object):

    def get_updates(self, params, cost):
        raise NotImplementedError

    def get_gradients(self, cost, params):
        grads = T.grad(cost, params)

        if hasattr(self, 'clipnorm') and self.clipnorm > 0:
            norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]

        new_grads = []
        for p, g in zip(params, grads):
            # L1 weight regularization: add the subgradient of l1 * |p|
            if hasattr(self, 'l1') and self.l1 > 0:
                g += T.sgn(p) * self.l1

            # L2 weight regularization: add the gradient of (l2 / 2) * p**2
            if hasattr(self, 'l2') and self.l2 > 0:
                g += p * self.l2

            if hasattr(self, 'maxnorm') and self.maxnorm > 0:
                # max-norm constraint on the columns of p; note that this only
                # rebinds the local variable p and is not fed back into the
                # returned gradients or the parameter updates
                norms = T.sqrt(T.sum(T.sqr(p), axis=0))
                desired = T.clip(norms, 0, self.maxnorm)
                p = p * (desired / (1e-7 + norms))

            new_grads.append(g)
        return new_grads


class SGD(Optimizer):

    def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, *args, **kwargs):
        self.__dict__.update(locals())  # store hyperparameters as attributes
        self.iterations = shared_scalar(0)

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        lr = self.lr - self.decay * self.iterations  # linear learning rate decay
        updates = [(self.iterations, self.iterations + 1.)]

        for p, g in zip(params, grads):
            m = shared_zeros(p.get_value().shape)  # momentum buffer
            v = self.momentum * m - lr * g  # velocity
            updates.append((m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            updates.append((p, new_p))
        return updates


class RMSprop(Optimizer):

    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

        for p, g, a in zip(params, grads, accumulators):
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
            updates.append((p, new_p))
        return updates


class Adagrad(Optimizer):

    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

        for p, g, a in zip(params, grads, accumulators):
            new_a = a + g ** 2  # accumulate squared gradients
            updates.append((a, new_a))
            new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
            updates.append((p, new_p))
        return updates


class Adadelta(Optimizer):

    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        self.__dict__.update(locals())

    def get_updates(self, params, cost):
        grads = self.get_gradients(cost, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
        updates = []

        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
            new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
            updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
            updates.append((p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
            updates.append((d_a, new_d_a))
        return updates


# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta

from utils.generic_utils import get_from_module


def get(identifier):
    return get_from_module(identifier, globals(), 'optimizer', instantiate=True)
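

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It shows how an
# optimizer from this file plugs into a Theano training function: build a cost
# expression, ask the optimizer for its update pairs, and compile. The
# softmax-regression graph and the names X, y, W, b, and the 784/10 sizes are
# assumptions made purely for illustration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    X = T.matrix('X')  # batch of inputs
    y = T.matrix('y')  # one-hot targets
    W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX), name='W')
    b = theano.shared(np.zeros((10,), dtype=theano.config.floatX), name='b')

    p_y = T.nnet.softmax(T.dot(X, W) + b)
    cost = T.nnet.categorical_crossentropy(p_y, y).mean()

    # any Optimizer subclass works here; SGD with Nesterov momentum as an example
    opt = SGD(lr=0.01, momentum=0.9, nesterov=True)
    train = theano.function([X, y], cost, updates=opt.get_updates([W, b], cost))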