Refactor regularizers and add add_weight method. (#4703)

* Refactor regularizers, introduce layer.add_weight
* Fix BN add_update syntax
* Fix eigenvalue regularizer
* Style fixes.
2016-12-14 13:41:24 -08:00 · 2016-12-14 13:41:24 -08:00 · ff62eb251b
commit ff62eb251b
parent 2b336756b6
15 changed files with 521 additions and 536 deletions
--- a/keras/backend/theano_backend.py
+++ b/keras/backend/theano_backend.py
@ -57,7 +57,7 @@ def to_dense(tensor):


 def variable(value, dtype=_FLOATX, name=None):
-    '''Instantiate a tensor variable.
+    '''Instantiates a variable.
    '''
    if hasattr(value, 'tocoo'):
        _assert_sparse_module()
--- a/keras/engine/topology.py
+++ b/keras/engine/topology.py
@ -13,6 +13,7 @@ import inspect
 from six.moves import zip

 from .. import backend as K
+from .. import initializations
 from ..utils.io_utils import ask_to_proceed_with_overwrite
 from ..utils.generic_utils import func_dump, func_load

@ -28,6 +29,11 @@ def to_list(x):
    return [x]


+def object_list_uid(object_list):
+    object_list = to_list(object_list)
+    return ', '.join([str(abs(id(x))) for x in object_list])
+
+
 class InputSpec(object):
    '''This specifies the ndim, dtype and shape of every input to a layer.
    Every layer should expose (if appropriate) an `input_spec` attribute:
@ -239,7 +245,6 @@ class Layer(object):
        non_trainable_weights: List of variables.
        weights: The concatenation of the lists trainable_weights and
            non_trainable_weights (in this order).
-        regularizers: List of regularizers.
        constraints: Dict mapping weights to constraints.

    # Methods
@ -294,8 +299,8 @@ class Layer(object):
            self.trainable_weights = []
        if not hasattr(self, 'non_trainable_weights'):
            self.non_trainable_weights = []
-        if not hasattr(self, 'regularizers'):
-            self.regularizers = []
+        if not hasattr(self, 'losses'):
+            self.losses = []
        if not hasattr(self, 'constraints'):
            self.constraints = {}  # dict {tensor: constraint instance}
        self.built = False
@ -354,6 +359,19 @@ class Layer(object):
    def non_trainable_weights(self, weights):
        self._non_trainable_weights = weights

+    @property
+    def regularizers(self):
+        warnings.warn('The `regularizers` property of layers/models is deprecated. '
+                      'Regularization losses are now managed via the `losses` '
+                      'layer/model property.')
+        return []
+
+    @regularizers.setter
+    def regularizers(self, _):
+        warnings.warn('The `regularizers` property of layers/models is deprecated. '
+                      'Regularization losses are now managed via the `losses` '
+                      'layer/model property.')
+
    def create_input_layer(self, batch_input_shape,
                           input_dtype=None, name=None):
        if not name:
@ -373,6 +391,32 @@ class Layer(object):
        # to the input layer we just created.
        self(x)

+    def add_weight(self, shape, initializer, name=None,
+                   trainable=True,
+                   regularizer=None,
+                   constraint=None):
+        '''Adds a weight variable to the layer.
+
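+        # Arguments
+            shape: The shape tuple of the weight.
+            initializer: An initializer, resolved through
+                `initializations.get` (a callable, or a name
+                such as 'zero').
+            name: Optional name, forwarded to the initializer
+                when the variable is created.
+            trainable: A boolean, whether the weight should
+                be trained via backprop or not (assuming
+                that the layer itself is also trainable).
+            regularizer: An optional Regularizer instance. If given,
+                the result of calling it on the weight is
+                registered with `add_loss`.
+            constraint: An optional constraint. If given, it is
+                stored in `self.constraints`, keyed by the weight.
+
+        # Returns
+            The created weight variable.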
+        '''
+        initializer = initializations.get(initializer)
+        weight = initializer(shape, name=name)
+        if regularizer is not None:
+            self.add_loss(regularizer(weight))
+        if constraint is not None:
+            self.constraints[weight] = constraint
+        if trainable:
+            self.trainable_weights.append(weight)
+        else:
+            self.non_trainable_weights.append(weight)
+        return weight
+
    def assert_input_compatibility(self, input):
        '''This checks that the tensor(s) `input`
        verify the input assumptions of the layer
@ -519,15 +563,21 @@ class Layer(object):
            self.add_inbound_node(inbound_layers, node_indices, tensor_indices)
            # Outputs were already computed when calling self.add_inbound_node.
            outputs = self.inbound_nodes[-1].output_tensors
-            # If single output tensor: return it,
-            # else return a list (at least 2 elements).
-            if len(outputs) == 1:
-                return outputs[0]
-            else:
-                return outputs
        else:
            # This case appears if the input was not a Keras tensor.
-            return self.call(x, mask)
+            outputs = to_list(self.call(x, mask))
+
+        # Apply activity regularizer if any:
+        if hasattr(self, 'activity_regularizer') and self.activity_regularizer is not None:
+            regularization_losses = [self.activity_regularizer(x) for x in outputs]
+            self.add_loss(regularization_losses, input_tensors)
+
+        # If single output tensor: return it,
+        # else return a list (at least 2 elements).
+        if len(outputs) == 1:
+            return outputs[0]
+        else:
+            return outputs

    def add_inbound_node(self, inbound_layers,
                         node_indices=None, tensor_indices=None):
@ -806,20 +856,58 @@ class Layer(object):
                            'ill-defined for the layer. ' +
                            'Use `get_output_shape_at(node_index)` instead.')

-    def add_updates(self, updates, inputs):
+    def add_loss(self, losses, inputs=None):
+        if losses is None:
+            return
+        # Update self.losses
+        losses = to_list(losses)
+        if not hasattr(self, 'losses'):
+            self.losses = []
+        try:
+            self.losses += losses
+        except AttributeError:
+            # In case self.losses isn't settable
+            # (i.e. it's a getter method).
+            # In that case the `losses` property is
+            # auto-computed and shouldn't be set.
+            pass
+        # Update self._per_input_losses
+        if not hasattr(self, '_per_input_losses'):
+            self._per_input_losses = {}
+        if inputs is not None:
+            inputs_hash = object_list_uid(inputs)
+        else:
+            # Losses indexed by None are unconditional
+            # rather than input-dependent
+            inputs_hash = None
+        if inputs_hash not in self._per_input_losses:
+            self._per_input_losses[inputs_hash] = []
+        self._per_input_losses[inputs_hash] += losses
+
+    def add_update(self, updates, inputs=None):
+        if updates is None:
+            return
        # Update self.updates
+        updates = to_list(updates)
        if not hasattr(self, 'updates'):
            self.updates = []
        try:
            self.updates += updates
        except AttributeError:
+            # In case self.updates isn't settable
+            # (i.e. it's a getter method).
+            # In that case the `updates` property is
+            # auto-computed and shouldn't be set.
            pass
        # Update self._per_input_updates
        if not hasattr(self, '_per_input_updates'):
            self._per_input_updates = {}
-        inputs = to_list(inputs)
-        updates = to_list(updates)
-        inputs_hash = ', '.join([str(abs(id(x))) for x in inputs])
+        if inputs is not None:
+            inputs_hash = object_list_uid(inputs)
+        else:
+            # Updates indexed by None are unconditional
+            # rather than input-dependent
+            inputs_hash = None
        if inputs_hash not in self._per_input_updates:
            self._per_input_updates[inputs_hash] = []
        self._per_input_updates[inputs_hash] += updates
@ -827,12 +915,19 @@ class Layer(object):
    def get_updates_for(self, inputs):
        if not hasattr(self, '_per_input_updates'):
            return []
-        inputs = to_list(inputs)
-        inputs_hash = ', '.join([str(abs(id(x))) for x in inputs])
+        inputs_hash = object_list_uid(inputs)
        if inputs_hash in self._per_input_updates:
            return self._per_input_updates[inputs_hash]
        return []

+    def get_losses_for(self, inputs):
+        if not hasattr(self, '_per_input_losses'):
+            return []
+        inputs_hash = object_list_uid(inputs)
+        if inputs_hash in self._per_input_losses:
+            return self._per_input_losses[inputs_hash]
+        return []
+
    @property
    def weights(self):
        return self.trainable_weights + self.non_trainable_weights
@ -950,7 +1045,6 @@ class InputLayer(Layer):

        self.trainable_weights = []
        self.non_trainable_weights = []
-        self.regularizers = []
        self.constraints = {}

        self.sparse = sparse
@ -1151,7 +1245,6 @@ class Merge(Layer):
        self.inbound_nodes = []
        self.outbound_nodes = []
        self.constraints = {}
-        self.regularizers = []
        self.trainable_weights = []
        self.non_trainable_weights = []
        self.supports_masking = True
@ -1587,7 +1680,6 @@ class Container(Layer):
        supports_masking (boolean)
        trainable_weights (list of variables)
        non_trainable_weights (list of variables)
-        regularizers (list of regularizers)
        constraints (list of tuples (weight, constraint))

    # Methods
@ -1901,7 +1993,6 @@ class Container(Layer):
        self.supports_masking = False
        # The following are implemented as property functions:
        # self.constraints
-        # self.regularizers
        # self.trainable_weights
        # self.non_trainable_weights
        # self.input_spec
@ -1946,14 +2037,38 @@ class Container(Layer):
                if len(layer.inbound_nodes) == 1:
                    updates += layer.updates
                else:
+                    # Collect updates that are dependent on inputs
+                    # that are part of the model.
                    for node_index, node in enumerate(layer.inbound_nodes):
                        node_key = layer.name + '_ib-' + str(node_index)
                        if node_key in self.container_nodes:
                            # The model owns this layer node.
                            inputs = node.input_tensors
                            updates += layer.get_updates_for(inputs)
+                    # Collect unconditional updates.
+                    updates += layer.get_updates_for(None)
        return updates

+    @property
+    def losses(self):
+        losses = []
+        for layer in self.layers:
+            if hasattr(layer, 'losses'):
+                if len(layer.inbound_nodes) == 1:
+                    losses += layer.losses
+                else:
+                    # Collect losses that are dependent on inputs
+                    # that are part of the model.
+                    for node_index, node in enumerate(layer.inbound_nodes):
+                        node_key = layer.name + '_ib-' + str(node_index)
+                        if node_key in self.container_nodes:
+                            # The model owns this layer node.
+                            inputs = node.input_tensors
+                            losses += layer.get_losses_for(inputs)
+                    # Collect unconditional losses.
+                    losses += layer.get_losses_for(None)
+        return losses
+
    @property
    def stateful(self):
        return any([(hasattr(layer, 'stateful') and layer.stateful) for layer in self.layers])
@ -1990,10 +2105,13 @@ class Container(Layer):

    @property
    def regularizers(self):
-        regs = []
-        for layer in self.layers:
-            regs += layer.regularizers
-        return regs
+        warnings.warn('The `regularizers` attribute of layers/models '
+                      'is deprecated. '
+                      'Regularization losses are now managed via the `losses` '
+                      'layer/model property.\n'
+                      'The `regularizers` attribute will be removed '
+                      'after 06/2017.')
+        return []

    @property
    def trainable_weights(self):
@ -2061,8 +2179,7 @@ class Container(Layer):
        '''True if any layer in the graph uses it.
        '''
        layers_learning_phase = any([layer.uses_learning_phase for layer in self.layers])
-        regs_learning_phase = any([reg.uses_learning_phase for reg in self.regularizers])
-        return layers_learning_phase or regs_learning_phase
+        return layers_learning_phase

    def call(self, input, mask=None):
        '''`call` just reapplies all ops in the graph to the new inputs
@ -2239,9 +2356,16 @@ class Container(Layer):
                        output_tensors = to_list(layer.call(computed_tensors, computed_masks))
                        output_masks = to_list(layer.compute_mask(computed_tensors, computed_masks))

-                    # update model updates
+                    # Update model updates and losses:
                    layer_inputs = [x[0] for x in computed_data]
-                    self.add_updates(layer.get_updates_for(layer_inputs), inputs)
+                    # Keep track of updates that depend on the inputs (e.g. BN updates).
+                    self.add_update(layer.get_updates_for(layer_inputs), inputs)
+                    # Keep track of unconditional updates (e.g. a counter).
+                    self.add_update(layer.get_updates_for(None), None)
+                    # Keep track of losses that depend on the inputs (e.g. activity regularizers).
+                    self.add_loss(layer.get_losses_for(layer_inputs), inputs)
+                    # Keep track of unconditional losses (e.g. weight regularizers).
+                    self.add_loss(layer.get_losses_for(None), None)

                    # Update _keras_shape.
                    if all([hasattr(x, '_keras_shape') for x in computed_tensors]):
--- a/keras/engine/training.py
+++ b/keras/engine/training.py
@ -611,9 +611,10 @@ class Model(Container):
            else:
                total_loss += loss_weight * output_loss

-        # add regularization penalties to the loss
-        for r in self.regularizers:
-            total_loss = r(total_loss)
+        # add regularization penalties
+        # and other layer-specific losses
+        for loss_tensor in self.losses:
+            total_loss += loss_tensor

        # list of same size as output_names.
        # contains tuples (metrics for output, names of metrics)
--- a/keras/layers/convolutional.py
+++ b/keras/layers/convolutional.py
@ -113,31 +113,20 @@ class Convolution1D(Layer):
    def build(self, input_shape):
        input_dim = input_shape[2]
        self.W_shape = (self.filter_length, 1, input_dim, self.nb_filter)
-        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
+
+        self.W = self.add_weight(self.W_shape,
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
+            self.b = self.add_weight((self.nb_filter,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
        else:
-            self.trainable_weights = [self.W]
-        self.regularizers = []
-
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -406,32 +395,20 @@ class Convolution2D(Layer):
            stack_size = input_shape[3]
            self.W_shape = (self.nb_row, self.nb_col, stack_size, self.nb_filter)
        else:
-            raise ValueError('Invalid dim_ordering:', self.dim_ordering)
-        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
+        self.W = self.add_weight(self.W_shape,
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
+            self.b = self.add_weight((self.nb_filter,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
        else:
-            self.trainable_weights = [self.W]
-        self.regularizers = []
-
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -957,40 +934,26 @@ class SeparableConvolution2D(Layer):
            depthwise_shape = (self.nb_row, self.nb_col, stack_size, self.depth_multiplier)
            pointwise_shape = (1, 1, self.depth_multiplier * stack_size, self.nb_filter)
        else:
-            raise ValueError('Invalid dim_ordering:', self.dim_ordering)
-        self.depthwise_kernel = self.init(depthwise_shape,
-                                          name='{}_depthwise_kernel'.format(self.name))
-        self.pointwise_kernel = self.init(pointwise_shape,
-                                          name='{}_pointwise_kernel'.format(self.name))
-        if self.bias:
-            self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
-            self.trainable_weights = [self.depthwise_kernel,
-                                      self.pointwise_kernel,
-                                      self.b]
-        else:
-            self.trainable_weights = [self.depthwise_kernel,
-                                      self.pointwise_kernel]
-        self.regularizers = []
-        if self.depthwise_regularizer:
-            self.depthwise_regularizer.set_param(self.depthwise_kernel)
-            self.regularizers.append(self.depthwise_regularizer)
-        if self.pointwise_regularizer:
-            self.pointwise_regularizer.set_param(self.pointwise_kernel)
-            self.regularizers.append(self.pointwise_regularizer)
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
+            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

-        self.constraints = {}
-        if self.depthwise_constraint:
-            self.constraints[self.depthwise_kernel] = self.depthwise_constraint
-        if self.pointwise_constraint:
-            self.constraints[self.pointwise_kernel] = self.pointwise_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+        self.depthwise_kernel = self.add_weight(depthwise_shape,
+                                                initializer=self.init,
+                                                regularizer=self.depthwise_regularizer,
+                                                constraint=self.depthwise_constraint,
+                                                name='{}_depthwise_kernel'.format(self.name))
+        self.pointwise_kernel = self.add_weight(pointwise_shape,
+                                                initializer=self.init,
+                                                regularizer=self.pointwise_regularizer,
+                                                constraint=self.pointwise_constraint,
+                                                name='{}_pointwise_kernel'.format(self.name))
+        if self.bias:
+            self.b = self.add_weight((self.nb_filter,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
+        else:
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -1165,31 +1128,19 @@ class Convolution3D(Layer):
        else:
            raise ValueError('Invalid dim_ordering:', self.dim_ordering)

-        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
+        self.W = self.add_weight(self.W_shape,
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((self.nb_filter,), name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
+            self.b = self.add_weight((self.nb_filter,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
        else:
-            self.trainable_weights = [self.W]
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
--- a/keras/layers/core.py
+++ b/keras/layers/core.py
@ -125,8 +125,8 @@ class SpatialDropout1D(Dropout):
        input_shape = K.shape(x)
        noise_shape = (input_shape[0], 1, input_shape[2])
        return noise_shape
-    
-    
+
+
 class SpatialDropout2D(Dropout):
    '''This version performs the same function as Dropout, however it drops
    entire 2D feature maps instead of individual elements. If adjacent pixels
@ -728,33 +728,19 @@ class Dense(Layer):
        self.input_spec = [InputSpec(dtype=K.floatx(),
                                     shape=(None, input_dim))]

-        self.W = self.init((input_dim, self.output_dim),
-                           name='{}_W'.format(self.name))
+        self.W = self.add_weight((input_dim, self.output_dim),
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((self.output_dim,),
-                             name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
+            self.b = self.add_weight((self.output_dim,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
        else:
-            self.trainable_weights = [self.W]
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -808,9 +794,8 @@ class ActivityRegularization(Layer):
        self.l2 = l2

        super(ActivityRegularization, self).__init__(**kwargs)
-        activity_regularizer = ActivityRegularizer(l1=l1, l2=l2)
-        activity_regularizer.set_layer(self)
-        self.regularizers = [activity_regularizer]
+        self.activity_regularizer = regularizers.L1L2Regularizer(l1=l1, l2=l2)
+        self.regularizers = [self.activity_regularizer]

    def get_config(self):
        config = {'l1': self.l1,
@ -897,33 +882,19 @@ class MaxoutDense(Layer):
        self.input_spec = [InputSpec(dtype=K.floatx(),
                                     shape=(None, input_dim))]

-        self.W = self.init((self.nb_feature, input_dim, self.output_dim),
-                           name='{}_W'.format(self.name))
+        self.W = self.add_weight((self.nb_feature, input_dim, self.output_dim),
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((self.nb_feature, self.output_dim),
-                             name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
+            self.b = self.add_weight((self.nb_feature, self.output_dim,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
        else:
-            self.trainable_weights = [self.W]
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -1030,38 +1001,25 @@ class Highway(Layer):
        self.input_spec = [InputSpec(dtype=K.floatx(),
                                     shape=(None, input_dim))]

-        self.W = self.init((input_dim, input_dim),
-                           name='{}_W'.format(self.name))
-        self.W_carry = self.init((input_dim, input_dim),
-                                 name='{}_W_carry'.format(self.name))
-
+        self.W = self.add_weight((input_dim, input_dim),
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
+        self.W_carry = self.add_weight((input_dim, input_dim),
+                                       initializer=self.init,
+                                       name='{}_W_carry'.format(self.name))
        if self.bias:
-            self.b = K.zeros((input_dim,), name='{}_b'.format(self.name))
-            # initialize with a vector of values `transform_bias`
-            self.b_carry = K.variable(np.ones((input_dim,)) * self.transform_bias,
-                                      name='{}_b_carry'.format(self.name))
-            self.trainable_weights = [self.W, self.b, self.W_carry, self.b_carry]
+            self.b = self.add_weight((input_dim,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
+            self.b_carry = self.add_weight((input_dim,),
+                                           initializer='one',
+                                           name='{}_b_carry'.format(self.name))
        else:
-            self.trainable_weights = [self.W, self.W_carry]
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b_carry = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -1178,31 +1136,19 @@ class TimeDistributedDense(Layer):
                                     shape=(None,) + input_shape[1:])]
        input_dim = input_shape[2]

-        self.W = self.init((input_dim, self.output_dim),
-                           name='{}_W'.format(self.name))
+        self.W = self.add_weight((input_dim, self.output_dim),
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((self.output_dim,),
-                             name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
-        self.regularizers = []
-
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = self.add_weight((self.output_dim,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
+        else:
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
--- a/keras/layers/embeddings.py
+++ b/keras/layers/embeddings.py
@ -91,22 +91,11 @@ class Embedding(Layer):
        super(Embedding, self).__init__(**kwargs)

    def build(self, input_shape):
-        self.W = self.init((self.input_dim, self.output_dim),
-                           name='{}_W'.format(self.name))
-        self.trainable_weights = [self.W]
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
+        self.W = self.add_weight((self.input_dim, self.output_dim),
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
--- a/keras/layers/local.py
+++ b/keras/layers/local.py
@ -110,31 +110,21 @@ class LocallyConnected1D(Layer):
    def build(self, input_shape):
        input_dim = input_shape[2]
        _, output_length, nb_filter = self.get_output_shape_for(input_shape)
-
        self.W_shape = (output_length, self.filter_length * input_dim, nb_filter)
-        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))
+
+        self.W = self.add_weight(self.W_shape,
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((output_length, self.nb_filter), name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
+            self.b = self.add_weight((output_length, self.nb_filter),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
        else:
-            self.trainable_weights = [self.W]
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -306,30 +296,20 @@ class LocallyConnected2D(Layer):
        self.output_row = output_row
        self.output_col = output_col
        self.W_shape = (output_row * output_col, self.nb_row * self.nb_col * input_filter, nb_filter)
-        self.W = self.init(self.W_shape, name='{}_W'.format(self.name))

+        self.W = self.add_weight(self.W_shape,
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer,
+                                 constraint=self.W_constraint)
        if self.bias:
-            self.b = K.zeros((output_row, output_col, nb_filter), name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.b]
+            self.b = self.add_weight((output_row, output_col, nb_filter),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer,
+                                     constraint=self.b_constraint)
        else:
-            self.trainable_weights = [self.W]
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-        if self.bias and self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-        if self.activity_regularizer:
-            self.activity_regularizer.set_layer(self)
-            self.regularizers.append(self.activity_regularizer)
-
-        self.constraints = {}
-        if self.W_constraint:
-            self.constraints[self.W] = self.W_constraint
-        if self.bias and self.b_constraint:
-            self.constraints[self.b] = self.b_constraint
+            self.b = None

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
--- a/keras/layers/normalization.py
+++ b/keras/layers/normalization.py
@ -82,24 +82,20 @@ class BatchNormalization(Layer):
        self.input_spec = [InputSpec(shape=input_shape)]
        shape = (input_shape[self.axis],)

-        self.gamma = self.gamma_init(shape, name='{}_gamma'.format(self.name))
-        self.beta = self.beta_init(shape, name='{}_beta'.format(self.name))
-        self.trainable_weights = [self.gamma, self.beta]
-
-        self.regularizers = []
-        if self.gamma_regularizer:
-            self.gamma_regularizer.set_param(self.gamma)
-            self.regularizers.append(self.gamma_regularizer)
-
-        if self.beta_regularizer:
-            self.beta_regularizer.set_param(self.beta)
-            self.regularizers.append(self.beta_regularizer)
-
-        self.running_mean = K.zeros(shape,
-                                    name='{}_running_mean'.format(self.name))
-        self.running_std = K.ones(shape,
-                                  name='{}_running_std'.format(self.name))
-        self.non_trainable_weights = [self.running_mean, self.running_std]
+        self.gamma = self.add_weight(shape,
+                                     initializer=self.gamma_init,
+                                     regularizer=self.gamma_regularizer,
+                                     name='{}_gamma'.format(self.name))
+        self.beta = self.add_weight(shape,
+                                    initializer=self.beta_init,
+                                    regularizer=self.beta_regularizer,
+                                    name='{}_beta'.format(self.name))
+        self.running_mean = self.add_weight(shape, initializer='zero',
+                                            name='{}_running_mean'.format(self.name),
+                                            trainable=False)
+        self.running_std = self.add_weight(shape, initializer='one',
+                                           name='{}_running_std'.format(self.name),
+                                           trainable=False)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -121,8 +117,8 @@ class BatchNormalization(Layer):
                epsilon=self.epsilon)

            if self.mode == 0:
-                self.add_updates([K.moving_average_update(self.running_mean, mean, self.momentum),
-                                  K.moving_average_update(self.running_std, std, self.momentum)], x)
+                self.add_update([K.moving_average_update(self.running_mean, mean, self.momentum),
+                                 K.moving_average_update(self.running_std, std, self.momentum)], x)

                if sorted(reduction_axes) == range(K.ndim(x))[:-1]:
                    x_normed_running = K.batch_normalization(
--- a/keras/layers/recurrent.py
+++ b/keras/layers/recurrent.py
@ -229,7 +229,7 @@ class Recurrent(Layer):
            updates = []
            for i in range(len(states)):
                updates.append((self.states[i], states[i]))
-            self.add_updates(updates, x)
+            self.add_update(updates, x)

        if self.return_sequences:
            return outputs
@ -288,7 +288,8 @@ class SimpleRNN(Recurrent):
        self.W_regularizer = regularizers.get(W_regularizer)
        self.U_regularizer = regularizers.get(U_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
-        self.dropout_W, self.dropout_U = dropout_W, dropout_U
+        self.dropout_W = dropout_W
+        self.dropout_U = dropout_U

        if self.dropout_W or self.dropout_U:
            self.uses_learning_phase = True
@ -304,24 +305,18 @@ class SimpleRNN(Recurrent):
        input_dim = input_shape[2]
        self.input_dim = input_dim

-        self.W = self.init((input_dim, self.output_dim),
-                           name='{}_W'.format(self.name))
-        self.U = self.inner_init((self.output_dim, self.output_dim),
-                                 name='{}_U'.format(self.name))
-        self.b = K.zeros((self.output_dim,), name='{}_b'.format(self.name))
-
-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-        if self.U_regularizer:
-            self.U_regularizer.set_param(self.U)
-            self.regularizers.append(self.U_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
-        self.trainable_weights = [self.W, self.U, self.b]
+        self.W = self.add_weight((input_dim, self.output_dim),
+                                 initializer=self.init,
+                                 name='{}_W'.format(self.name),
+                                 regularizer=self.W_regularizer)
+        self.U = self.add_weight((self.output_dim, self.output_dim),
+                                 initializer=self.inner_init,
+                                 name='{}_U'.format(self.name),
+                                 regularizer=self.U_regularizer)
+        self.b = self.add_weight((self.output_dim,),
+                                 initializer='zero',
+                                 name='{}_b'.format(self.name),
+                                 regularizer=self.b_regularizer)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
@ -446,7 +441,8 @@ class GRU(Recurrent):
        self.W_regularizer = regularizers.get(W_regularizer)
        self.U_regularizer = regularizers.get(U_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
-        self.dropout_W, self.dropout_U = dropout_W, dropout_U
+        self.dropout_W = dropout_W
+        self.dropout_U = dropout_U

        if self.dropout_W or self.dropout_U:
            self.uses_learning_phase = True
@ -463,57 +459,59 @@ class GRU(Recurrent):
            self.states = [None]

        if self.consume_less == 'gpu':
-
-            self.W = self.init((self.input_dim, 3 * self.output_dim),
-                               name='{}_W'.format(self.name))
-            self.U = self.inner_init((self.output_dim, 3 * self.output_dim),
-                                     name='{}_U'.format(self.name))
-
-            self.b = K.variable(np.hstack((np.zeros(self.output_dim),
-                                           np.zeros(self.output_dim),
-                                           np.zeros(self.output_dim))),
-                                name='{}_b'.format(self.name))
-
-            self.trainable_weights = [self.W, self.U, self.b]
+            self.W = self.add_weight((self.input_dim, 3 * self.output_dim),
+                                     initializer=self.init,
+                                     name='{}_W'.format(self.name),
+                                     regularizer=self.W_regularizer)
+            self.U = self.add_weight((self.output_dim, 3 * self.output_dim),
+                                     initializer=self.inner_init,
+                                     name='{}_U'.format(self.name),
+                                     regularizer=self.U_regularizer)
+            self.b = self.add_weight((self.output_dim * 3,),
+                                     initializer='zero',
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer)
        else:
-
-            self.W_z = self.init((self.input_dim, self.output_dim),
-                                 name='{}_W_z'.format(self.name))
-            self.U_z = self.inner_init((self.output_dim, self.output_dim),
-                                       name='{}_U_z'.format(self.name))
-            self.b_z = K.zeros((self.output_dim,), name='{}_b_z'.format(self.name))
-
-            self.W_r = self.init((self.input_dim, self.output_dim),
-                                 name='{}_W_r'.format(self.name))
-            self.U_r = self.inner_init((self.output_dim, self.output_dim),
-                                       name='{}_U_r'.format(self.name))
-            self.b_r = K.zeros((self.output_dim,), name='{}_b_r'.format(self.name))
-
-            self.W_h = self.init((self.input_dim, self.output_dim),
-                                 name='{}_W_h'.format(self.name))
-            self.U_h = self.inner_init((self.output_dim, self.output_dim),
-                                       name='{}_U_h'.format(self.name))
-            self.b_h = K.zeros((self.output_dim,), name='{}_b_h'.format(self.name))
-
-            self.trainable_weights = [self.W_z, self.U_z, self.b_z,
-                                      self.W_r, self.U_r, self.b_r,
-                                      self.W_h, self.U_h, self.b_h]
-
+            self.W_z = self.add_weight((self.input_dim, self.output_dim),
+                                       initializer=self.init,
+                                       name='{}_W_z'.format(self.name),
+                                       regularizer=self.W_regularizer)
+            self.U_z = self.add_weight((self.output_dim, self.output_dim),
+                                       initializer=self.inner_init,
+                                       name='{}_U_z'.format(self.name),
+                                       regularizer=self.U_regularizer)
+            self.b_z = self.add_weight((self.output_dim,),
+                                       initializer='zero',
+                                       name='{}_b_z'.format(self.name),
+                                       regularizer=self.b_regularizer)
+            self.W_r = self.add_weight((self.input_dim, self.output_dim),
+                                       initializer=self.init,
+                                       name='{}_W_r'.format(self.name),
+                                       regularizer=self.W_regularizer)
+            self.U_r = self.add_weight((self.output_dim, self.output_dim),
+                                       initializer=self.inner_init,
+                                       name='{}_U_r'.format(self.name),
+                                       regularizer=self.U_regularizer)
+            self.b_r = self.add_weight((self.output_dim,),
+                                       initializer='zero',
+                                       name='{}_b_r'.format(self.name),
+                                       regularizer=self.b_regularizer)
+            self.W_h = self.add_weight((self.input_dim, self.output_dim),
+                                       initializer=self.init,
+                                       name='{}_W_h'.format(self.name),
+                                       regularizer=self.W_regularizer)
+            self.U_h = self.add_weight((self.output_dim, self.output_dim),
+                                       initializer=self.inner_init,
+                                       name='{}_U_h'.format(self.name),
+                                       regularizer=self.U_regularizer)
+            self.b_h = self.add_weight((self.output_dim,),
+                                       initializer='zero',
+                                       name='{}_b_h'.format(self.name),
+                                       regularizer=self.b_regularizer)
            self.W = K.concatenate([self.W_z, self.W_r, self.W_h])
            self.U = K.concatenate([self.U_z, self.U_r, self.U_h])
            self.b = K.concatenate([self.b_z, self.b_r, self.b_h])

-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-        if self.U_regularizer:
-            self.U_regularizer.set_param(self.U)
-            self.regularizers.append(self.U_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights
@ -671,7 +669,8 @@ class LSTM(Recurrent):
        self.W_regularizer = regularizers.get(W_regularizer)
        self.U_regularizer = regularizers.get(U_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
-        self.dropout_W, self.dropout_U = dropout_W, dropout_U
+        self.dropout_W = dropout_W
+        self.dropout_U = dropout_U

        if self.dropout_W or self.dropout_U:
            self.uses_learning_phase = True
@ -688,63 +687,83 @@ class LSTM(Recurrent):
            self.states = [None, None]

        if self.consume_less == 'gpu':
-            self.W = self.init((self.input_dim, 4 * self.output_dim),
-                               name='{}_W'.format(self.name))
-            self.U = self.inner_init((self.output_dim, 4 * self.output_dim),
-                                     name='{}_U'.format(self.name))
+            self.W = self.add_weight((self.input_dim, 4 * self.output_dim),
+                                     initializer=self.init,
+                                     name='{}_W'.format(self.name),
+                                     regularizer=self.W_regularizer)
+            self.U = self.add_weight((self.output_dim, 4 * self.output_dim),
+                                     initializer=self.inner_init,
+                                     name='{}_U'.format(self.name),
+                                     regularizer=self.U_regularizer)

-            self.b = K.variable(np.hstack((np.zeros(self.output_dim),
-                                           K.get_value(self.forget_bias_init((self.output_dim,))),
-                                           np.zeros(self.output_dim),
-                                           np.zeros(self.output_dim))),
-                                name='{}_b'.format(self.name))
-            self.trainable_weights = [self.W, self.U, self.b]
+            def b_reg(shape, name=None):
+                return K.variable(np.hstack((np.zeros(self.output_dim),
+                                             K.get_value(self.forget_bias_init((self.output_dim,))),
+                                             np.zeros(self.output_dim),
+                                             np.zeros(self.output_dim))),
+                                  name='{}_b'.format(self.name))
+            self.b = self.add_weight((self.output_dim * 4,),
+                                     initializer=b_reg,
+                                     name='{}_b'.format(self.name),
+                                     regularizer=self.b_regularizer)
        else:
-            self.W_i = self.init((self.input_dim, self.output_dim),
-                                 name='{}_W_i'.format(self.name))
-            self.U_i = self.inner_init((self.output_dim, self.output_dim),
-                                       name='{}_U_i'.format(self.name))
-            self.b_i = K.zeros((self.output_dim,), name='{}_b_i'.format(self.name))
-
-            self.W_f = self.init((self.input_dim, self.output_dim),
-                                 name='{}_W_f'.format(self.name))
-            self.U_f = self.inner_init((self.output_dim, self.output_dim),
-                                       name='{}_U_f'.format(self.name))
-            self.b_f = self.forget_bias_init((self.output_dim,),
-                                             name='{}_b_f'.format(self.name))
-
-            self.W_c = self.init((self.input_dim, self.output_dim),
-                                 name='{}_W_c'.format(self.name))
-            self.U_c = self.inner_init((self.output_dim, self.output_dim),
-                                       name='{}_U_c'.format(self.name))
-            self.b_c = K.zeros((self.output_dim,), name='{}_b_c'.format(self.name))
-
-            self.W_o = self.init((self.input_dim, self.output_dim),
-                                 name='{}_W_o'.format(self.name))
-            self.U_o = self.inner_init((self.output_dim, self.output_dim),
-                                       name='{}_U_o'.format(self.name))
-            self.b_o = K.zeros((self.output_dim,), name='{}_b_o'.format(self.name))
+            self.W_i = self.add_weight((self.input_dim, self.output_dim),
+                                       initializer=self.init,
+                                       name='{}_W_i'.format(self.name),
+                                       regularizer=self.W_regularizer)
+            self.U_i = self.add_weight((self.output_dim, self.output_dim),
+                                       initializer=self.inner_init,
+                                       name='{}_U_i'.format(self.name),
+                                       regularizer=self.U_regularizer)
+            self.b_i = self.add_weight((self.output_dim,),
+                                       initializer='zero',
+                                       name='{}_b_i'.format(self.name),
+                                       regularizer=self.b_regularizer)
+            self.W_f = self.add_weight((self.input_dim, self.output_dim),
+                                       initializer=self.init,
+                                       name='{}_W_f'.format(self.name),
+                                       regularizer=self.W_regularizer)
+            self.U_f = self.add_weight((self.output_dim, self.output_dim),
+                                       initializer=self.inner_init,
+                                       name='{}_U_f'.format(self.name),
+                                       regularizer=self.U_regularizer)
+            self.b_f = self.add_weight((self.output_dim,),
+                                       initializer=self.forget_bias_init,
+                                       name='{}_b_f'.format(self.name),
+                                       regularizer=self.b_regularizer)
+            self.W_c = self.add_weight((self.input_dim, self.output_dim),
+                                       initializer=self.init,
+                                       name='{}_W_c'.format(self.name),
+                                       regularizer=self.W_regularizer)
+            self.U_c = self.add_weight((self.output_dim, self.output_dim),
+                                       initializer=self.inner_init,
+                                       name='{}_U_c'.format(self.name),
+                                       regularizer=self.U_regularizer)
+            self.b_c = self.add_weight((self.output_dim,),
+                                       initializer='zero',
+                                       name='{}_b_c'.format(self.name),
+                                       regularizer=self.b_regularizer)
+            self.W_o = self.add_weight((self.input_dim, self.output_dim),
+                                       initializer=self.init,
+                                       name='{}_W_o'.format(self.name),
+                                       regularizer=self.W_regularizer)
+            self.U_o = self.add_weight((self.output_dim, self.output_dim),
+                                       initializer=self.inner_init,
+                                       name='{}_U_o'.format(self.name),
+                                       regularizer=self.U_regularizer)
+            self.b_o = self.add_weight((self.output_dim,),
+                                       initializer='zero',
+                                       name='{}_b_o'.format(self.name),
+                                       regularizer=self.b_regularizer)

            self.trainable_weights = [self.W_i, self.U_i, self.b_i,
                                      self.W_c, self.U_c, self.b_c,
                                      self.W_f, self.U_f, self.b_f,
                                      self.W_o, self.U_o, self.b_o]
-
            self.W = K.concatenate([self.W_i, self.W_f, self.W_c, self.W_o])
            self.U = K.concatenate([self.U_i, self.U_f, self.U_c, self.U_o])
            self.b = K.concatenate([self.b_i, self.b_f, self.b_c, self.b_o])

-        self.regularizers = []
-        if self.W_regularizer:
-            self.W_regularizer.set_param(self.W)
-            self.regularizers.append(self.W_regularizer)
-        if self.U_regularizer:
-            self.U_regularizer.set_param(self.U)
-            self.regularizers.append(self.U_regularizer)
-        if self.b_regularizer:
-            self.b_regularizer.set_param(self.b)
-            self.regularizers.append(self.b_regularizer)
-
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights
--- a/keras/layers/wrappers.py
+++ b/keras/layers/wrappers.py
@ -17,7 +17,7 @@ class Wrapper(Layer):
        self.trainable_weights = getattr(self.layer, 'trainable_weights', [])
        self.non_trainable_weights = getattr(self.layer, 'non_trainable_weights', [])
        self.updates = getattr(self.layer, 'updates', [])
-        self.regularizers = getattr(self.layer, 'regularizers', [])
+        self.losses = getattr(self.layer, 'losses', [])
        self.constraints = getattr(self.layer, 'constraints', {})

        # properly attribute the current layer to
@ -130,6 +130,11 @@ class TimeDistributed(Wrapper):
            # (nb_samples, timesteps, ...)
            output_shape = self.get_output_shape_for(input_shape)
            y = K.reshape(y, (-1, input_length) + output_shape[2:])
+
+        # Apply activity regularizer if any:
+        if hasattr(self.layer, 'activity_regularizer') and self.layer.activity_regularizer is not None:
+            regularization_loss = self.layer.activity_regularizer(y)
+            self.add_loss(regularization_loss, X)
        return y


@ -246,9 +251,9 @@ class Bidirectional(Wrapper):
        return []

    @property
-    def regularizers(self):
-        if hasattr(self.forward_layer, 'regularizers'):
-            return self.forward_layer.regularizers + self.backward_layer.regularizers
+    def losses(self):
+        if hasattr(self.forward_layer, 'losses'):
+            return self.forward_layer.losses + self.backward_layer.losses
        return []

    @property
--- a/keras/models.py
+++ b/keras/models.py
@ -497,6 +497,13 @@ class Sequential(Model):
    def get_updates_for(self, inputs):
        return self.model.get_updates_for(inputs)

+    @property
+    def losses(self):
+        return self.model.losses
+
+    def get_losses_for(self, inputs):
+        return self.model.get_losses_for(inputs)
+
    @property
    def regularizers(self):
        # support for legacy behavior
--- a/keras/regularizers.py
+++ b/keras/regularizers.py
@ -1,22 +1,27 @@
 from __future__ import absolute_import
 from . import backend as K
 from .utils.generic_utils import get_from_module
+import warnings


 class Regularizer(object):

-    def set_param(self, p):
-        self.p = p
-
-    def set_layer(self, layer):
-        self.layer = layer
-
-    def __call__(self, loss):
-        return loss
+    def __call__(self, x):
+        return 0

    def get_config(self):
        return {'name': self.__class__.__name__}

+    def set_param(self, _):
+        warnings.warn('The `set_param` method on regularizers is deprecated. '
+                      'It no longer does anything, '
+                      'and it will be removed after 06/2017.')
+
+    def set_layer(self, _):
+        warnings.warn('The `set_layer` method on regularizers is deprecated. '
+                      'It no longer does anything, '
+                      'and it will be removed after 06/2017.')
+

 class EigenvalueRegularizer(Regularizer):
    '''This takes a constant that controls
@ -28,71 +33,43 @@ class EigenvalueRegularizer(Regularizer):
    '''
    def __init__(self, k):
        self.k = k
-        self.uses_learning_phase = True

-    def set_param(self, p):
-        if hasattr(self, 'p'):
-            raise Exception('Regularizers cannot be reused. '
-                            'Instantiate one regularizer per layer.')
-        self.p = p
+    def __call__(self, x):
+        if K.ndim(x) != 2:  # Also rejects rank < 2, matching the message.
+            raise Exception('EigenvalueRegularizer '
+                            'is only available for tensors of rank 2.')
+        covariance = K.dot(K.transpose(x), x)
+        dim1, dim2 = K.eval(K.shape(covariance))  # dim1 sizes `o` below.

-    def __call__(self, loss):
-        power = 9  # number of iterations of the power method
-        W = self.p
-        if K.ndim(W) > 2:
-            raise Exception('Eigenvalue Decay regularizer '
-                            'is only available for dense '
-                            'and embedding layers.')
-        WW = K.dot(K.transpose(W), W)
-        dim1, dim2 = K.eval(K.shape(WW))  # number of neurons in the layer
-
-        # power method for approximating the dominant eigenvector:
-        o = K.ones([dim1, 1])  # initial values for the dominant eigenvector
-        main_eigenvect = K.dot(WW, o)
+        # Power method for approximating the dominant eigenvector:
+        power = 9  # Number of iterations of the power method.
+        o = K.ones([dim1, 1])  # Initial values for the dominant eigenvector.
+        main_eigenvect = K.dot(covariance, o)
        for n in range(power - 1):
-            main_eigenvect = K.dot(WW, main_eigenvect)
+            main_eigenvect = K.dot(covariance, main_eigenvect)
+        covariance_d = K.dot(covariance, main_eigenvect)

-        WWd = K.dot(WW, main_eigenvect)
-
-        # the corresponding dominant eigenvalue:
-        main_eigenval = (K.dot(K.transpose(WWd), main_eigenvect) /
+        # The corresponding dominant eigenvalue (Rayleigh quotient):
+        main_eigenval = (K.dot(K.transpose(covariance_d), main_eigenvect) /
                         K.dot(K.transpose(main_eigenvect), main_eigenvect))
-        # multiplied by the given regularization gain
-        regularized_loss = loss + (main_eigenval ** 0.5) * self.k
-
-        return K.in_train_phase(regularized_loss[0, 0], loss)
+        # Penalty is k * sqrt(eigenvalue); K.sum reduces it to a scalar.
+        regularization = (main_eigenval ** 0.5) * self.k
+        return K.sum(regularization)


-class WeightRegularizer(Regularizer):
+class L1L2Regularizer(Regularizer):

    def __init__(self, l1=0., l2=0.):
        self.l1 = K.cast_to_floatx(l1)
        self.l2 = K.cast_to_floatx(l2)
-        self.uses_learning_phase = True
-        self.p = None

-    def set_param(self, p):
-        if self.p is not None:
-            raise Exception('Regularizers cannot be reused. '
-                            'Instantiate one regularizer per layer.')
-        self.p = p
-
-    def __call__(self, loss):
-        if self.p is None:
-            raise Exception('Need to call `set_param` on '
-                            'WeightRegularizer instance '
-                            'before calling the instance. '
-                            'Check that you are not passing '
-                            'a WeightRegularizer instead of an '
-                            'ActivityRegularizer '
-                            '(i.e. activity_regularizer="l2" instead '
-                            'of activity_regularizer="activity_l2".')
-        regularized_loss = loss
+    def __call__(self, x):
+        regularization = 0
        if self.l1:
-            regularized_loss += K.sum(self.l1 * K.abs(self.p))
+            regularization += K.sum(self.l1 * K.abs(x))
        if self.l2:
-            regularized_loss += K.sum(self.l2 * K.square(self.p))
-        return K.in_train_phase(regularized_loss, loss)
+            regularization += K.sum(self.l2 * K.square(x))
+        return regularization

    def get_config(self):
        return {'name': self.__class__.__name__,
@ -100,61 +77,34 @@ class WeightRegularizer(Regularizer):
                'l2': float(self.l2)}


-class ActivityRegularizer(Regularizer):
+# Aliases.

-    def __init__(self, l1=0., l2=0.):
-        self.l1 = K.cast_to_floatx(l1)
-        self.l2 = K.cast_to_floatx(l2)
-        self.uses_learning_phase = True
-        self.layer = None
-
-    def set_layer(self, layer):
-        if self.layer is not None:
-            raise Exception('Regularizers cannot be reused')
-        self.layer = layer
-
-    def __call__(self, loss):
-        if self.layer is None:
-            raise Exception('Need to call `set_layer` on '
-                            'ActivityRegularizer instance '
-                            'before calling the instance.')
-        regularized_loss = loss
-        for i in range(len(self.layer.inbound_nodes)):
-            output = self.layer.get_output_at(i)
-            if self.l1:
-                regularized_loss += K.sum(self.l1 * K.abs(output))
-            if self.l2:
-                regularized_loss += K.sum(self.l2 * K.square(output))
-        return K.in_train_phase(regularized_loss, loss)
-
-    def get_config(self):
-        return {'name': self.__class__.__name__,
-                'l1': float(self.l1),
-                'l2': float(self.l2)}
+WeightRegularizer = L1L2Regularizer
+ActivityRegularizer = L1L2Regularizer


 def l1(l=0.01):
-    return WeightRegularizer(l1=l)
+    return L1L2Regularizer(l1=l)


 def l2(l=0.01):
-    return WeightRegularizer(l2=l)
+    return L1L2Regularizer(l2=l)


 def l1l2(l1=0.01, l2=0.01):
-    return WeightRegularizer(l1=l1, l2=l2)
+    return L1L2Regularizer(l1=l1, l2=l2)


 def activity_l1(l=0.01):
-    return ActivityRegularizer(l1=l)
+    return L1L2Regularizer(l1=l)


 def activity_l2(l=0.01):
-    return ActivityRegularizer(l2=l)
+    return L1L2Regularizer(l2=l)


 def activity_l1l2(l1=0.01, l2=0.01):
-    return ActivityRegularizer(l1=l1, l2=l2)
+    return L1L2Regularizer(l1=l1, l2=l2)


 def get(identifier, kwargs=None):
--- a/tests/keras/layers/test_recurrent.py
+++ b/tests/keras/layers/test_recurrent.py
@ -132,6 +132,12 @@ def test_regularizer(layer_class):
    layer.build(shape)
    output = layer(K.variable(np.ones(shape)))
    K.eval(output)
+    if layer_class == recurrent.SimpleRNN:
+        assert len(layer.losses) == 3
+    if layer_class == recurrent.GRU:
+        assert len(layer.losses) == 9
+    if layer_class == recurrent.LSTM:
+        assert len(layer.losses) == 12


@keras_test
--- a/tests/keras/layers/test_wrappers.py
+++ b/tests/keras/layers/test_wrappers.py
@ -76,6 +76,15 @@ def test_TimeDistributed():
    outer_model.fit(np.random.random((10, 3, 2)), np.random.random((10, 3, 3)), nb_epoch=1, batch_size=10)


+@keras_test
+def test_regularizers():
+    model = Sequential()
+    model.add(wrappers.TimeDistributed(core.Dense(2, W_regularizer='l1'), input_shape=(3, 4)))
+    model.add(core.Activation('relu'))
+    model.compile(optimizer='rmsprop', loss='mse')
+    assert len(model.losses) == 1
+
+
@keras_test
 def test_Bidirectional():
    rnn = recurrent.SimpleRNN
--- a/tests/keras/test_regularizers.py
+++ b/tests/keras/test_regularizers.py
@ -67,6 +67,7 @@ def test_W_reg():
                regularizers.l1l2()]:
        model = create_model(weight_reg=reg)
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+        assert len(model.losses) == 1
        model.fit(X_train, Y_train, batch_size=batch_size,
                  nb_epoch=nb_epoch, verbose=0)
        model.evaluate(X_test[test_ids, :], Y_test[test_ids, :], verbose=0)
@ -77,6 +78,7 @@ def test_A_reg():
    for reg in [regularizers.activity_l1(), regularizers.activity_l2()]:
        model = create_model(activity_reg=reg)
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+        assert len(model.losses) == 1
        model.fit(X_train, Y_train, batch_size=batch_size,
                  nb_epoch=nb_epoch, verbose=0)
        model.evaluate(X_test[test_ids, :], Y_test[test_ids, :], verbose=0)