Update all examples with new API

Makoto Matsuyama 2015-10-04 18:44:49 -07:00
parent 35d66d672b
commit 2bd4c295d6
17 changed files with 140 additions and 139 deletions
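All of the diffs below follow the same migration pattern: under the old API every layer was given both its input and output dimensions explicitly (for example Dense(784, 128)), while under the new API a layer only takes its output dimension and infers its input size from the previous layer; only the first layer of a Sequential model declares its input via input_shape (or input_dim). A minimal sketch of the pattern, using the dimensions from the MNIST MLP example further down (784 inputs, 128 hidden units, 10 classes):

from keras.models import Sequential
from keras.layers.core import Dense, Activation

# Old API: each layer named both its input and output dimensions, e.g.
#   model.add(Dense(784, 128))
#   model.add(Dense(128, 10))
# New API: only the first layer declares the input shape; every later
# layer infers its input size from the layer before it.
model = Sequential()
model.add(Dense(128, input_shape=(784,)))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('softmax'))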

@ -1,9 +1,8 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from keras.models import Sequential, slice_X
from keras.layers.core import Activation, Dense, RepeatVector
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
from keras.layers import recurrent
from sklearn.utils import shuffle
import numpy as np
"""
@ -25,18 +24,15 @@ and
http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
Theoretically it introduces shorter term dependencies between source and target.
Two digits inverted:
+ One layer JZS1 (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs
Three digits inverted:
+ One layer JZS1 (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs
Four digits inverted:
+ One layer JZS1 (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs
Five digits inverted:
+ One layer JZS1 (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs
@ -122,23 +118,32 @@ for i, sentence in enumerate(expected):
y[i] = ctable.encode(sentence, maxlen=DIGITS + 1)
# Shuffle (X, y) in unison as the later parts of X will almost all be larger digits
X, y = shuffle(X, y)
indices = np.arange(len(y))
np.random.shuffle(indices)
X = X[indices]
y = y[indices]
# Explicitly set apart 10% for validation data that we never train over
split_at = len(X) - len(X) / 10
(X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))
(y_train, y_val) = (y[:split_at], y[split_at:])
print(X_train.shape)
print(y_train.shape)
print('Build model...')
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE
model.add(RNN(len(chars), HIDDEN_SIZE))
# note: in a situation where your input sequences have a variable length,
# use input_shape=(None, nb_feature).
model.add(RNN(HIDDEN_SIZE, input_shape=(None, len(chars))))
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(DIGITS + 1))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in xrange(LAYERS):
model.add(RNN(HIDDEN_SIZE, HIDDEN_SIZE, return_sequences=True))
model.add(RNN(HIDDEN_SIZE, return_sequences=True))
# For each step of the output sequence, decide which character should be chosen
model.add(Dense(HIDDEN_SIZE, len(chars)))
model.add(TimeDistributedDense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
@ -148,7 +153,7 @@ for iteration in range(1, 200):
print()
print('-' * 50)
print('Iteration', iteration)
model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=1, validation_data=(X_val, y_val), show_accuracy=True)
model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=1, validation_data=(X_val, y_val), show_accuracy=True)
###
# Select 10 samples from the validation set at random so we can visualize errors
for i in xrange(10):

@ -181,15 +181,15 @@ print('Build model...')
sentrnn = Sequential()
sentrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE, mask_zero=True))
sentrnn.add(RNN(EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, return_sequences=False))
sentrnn.add(RNN(SENT_HIDDEN_SIZE, return_sequences=False))
qrnn = Sequential()
qrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE))
qrnn.add(RNN(EMBED_HIDDEN_SIZE, QUERY_HIDDEN_SIZE, return_sequences=False))
qrnn.add(RNN(QUERY_HIDDEN_SIZE, return_sequences=False))
model = Sequential()
model.add(Merge([sentrnn, qrnn], mode='concat'))
model.add(Dense(SENT_HIDDEN_SIZE + QUERY_HIDDEN_SIZE, vocab_size, activation='softmax'))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', class_mode='categorical')

@ -28,16 +28,10 @@ nb_classes = 10
nb_epoch = 200
data_augmentation = True
# shape of the image (SHAPE x SHAPE)
shapex, shapey = 32, 32
# number of convolutional filters to use at each layer
nb_filters = [32, 64]
# level of pooling to perform at each layer (POOL x POOL)
nb_pool = [2, 2]
# level of convolution to perform at each layer (CONV x CONV)
nb_conv = [3, 3]
# input image dimensions
img_rows, img_cols = 32, 32
# the CIFAR10 images are RGB
image_dimensions = 3
img_channels = 3
# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
@ -51,28 +45,26 @@ Y_test = np_utils.to_categorical(y_test, nb_classes)
model = Sequential()
model.add(Convolution2D(nb_filters[0], image_dimensions, nb_conv[0], nb_conv[0], border_mode='full'))
model.add(Convolution2D(32, 3, 3, border_mode='full',
input_shape=(img_channels, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters[0], nb_filters[0], nb_conv[0], nb_conv[0]))
model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool[0], nb_pool[0])))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Convolution2D(nb_filters[1], nb_filters[0], nb_conv[0], nb_conv[0], border_mode='full'))
model.add(Convolution2D(64, 3, 3, border_mode='full'))
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters[1], nb_filters[1], nb_conv[1], nb_conv[1]))
model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool[1], nb_pool[1])))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
# the image dimensions are the original dimensions divided by any pooling
# each pixel has a number of filters, determined by the last Convolution2D layer
model.add(Dense(nb_filters[-1] * (shapex / nb_pool[0] / nb_pool[1]) * (shapey / nb_pool[0] / nb_pool[1]), 512))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512, nb_classes))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
# let's train the model using SGD + momentum (how original).
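For reference, the flattened size that the old CIFAR10 code computed by hand, and that the new API now infers from Flatten(), works out as follows (a sketch of the arithmetic only; each 'full' conv followed by a 'valid' conv leaves the 32x32 spatial size unchanged, so only the two 2x2 poolings shrink it):

nb_filters = [32, 64]
nb_pool = [2, 2]
shapex, shapey = 32, 32
# 32x32 -> 16x16 after the first pooling -> 8x8 after the second,
# with 64 filters in the last conv layer
flat_size = nb_filters[-1] * (shapex // nb_pool[0] // nb_pool[1]) * (shapey // nb_pool[0] // nb_pool[1])
print(flat_size)  # 64 * 8 * 8 = 4096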

@ -1,7 +1,7 @@
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337) # for reproducibility
np.random.seed(1337) # for reproducibility
from keras.preprocessing import sequence
from keras.optimizers import RMSprop
@ -25,7 +25,7 @@ max_features = 5000
maxlen = 100
batch_size = 32
embedding_dims = 100
nb_filters = 250
nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 3
@ -47,35 +47,29 @@ model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features, embedding_dims))
model.add(Embedding(max_features, embedding_dims, max_length=maxlen))
model.add(Dropout(0.25))
# we add a Convolution1D, which will learn nb_filters
# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Convolution1D(input_dim=embedding_dims,
nb_filter=nb_filters,
model.add(Convolution1D(nb_filter=nb_filter,
filter_length=filter_length,
border_mode="valid",
activation="relu",
subsample_length=1))
# we use standard max pooling (halving the output of the previous layer):
model.add(MaxPooling1D(pool_length=2))
# We flatten the output of the conv layer, so that we can add a vanilla dense layer:
model.add(Flatten())
# Computing the output shape of a conv layer can be tricky;
# for a good tutorial, see: http://cs231n.github.io/convolutional-networks/
output_size = nb_filters * (((maxlen - filter_length) / 1) + 1) / 2
# We add a vanilla hidden layer:
model.add(Dense(output_size, hidden_dims))
model.add(Dense(hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(hidden_dims, 1))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', class_mode="binary")
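The output_size formula removed above is exactly the bookkeeping the new API now performs internally; with the values used in this example it evaluates as follows (a sketch of the arithmetic only):

maxlen = 100
filter_length = 3
nb_filter = 250
# 'valid' convolution with stride 1: 100 - 3 + 1 = 98 time steps,
# max pooling with pool_length=2 halves that to 49,
# so the Flatten() output has 250 * 49 = 12250 units
output_size = nb_filter * (((maxlen - filter_length) // 1) + 1) // 2
print(output_size)  # 12250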

@ -49,9 +49,9 @@ print('X_test shape:', X_test.shape)
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, 128)) # try using a GRU instead, for fun
model.add(LSTM(128)) # try using a GRU instead, for fun
model.add(Dropout(0.5))
model.add(Dense(128, 1))
model.add(Dense(1))
model.add(Activation('sigmoid'))
# try using different optimizers and different optimizer configs

@ -20,11 +20,11 @@ from sklearn.preprocessing import StandardScaler
Compatible with Python 2.7-3.4. Requires Scikit-Learn and Pandas.
Recommended to run on GPU:
Command: THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python kaggle_otto_nn.py
On EC2 g2.2xlarge instance: 19s/epoch. 6-7 minutes total training time.
Best validation score at epoch 21: 0.4881
Try it at home:
- with/without BatchNormalization (BatchNormalization helps!)
@ -78,7 +78,6 @@ def make_submission(y_prob, ids, encoder, fname):
f.write('\n')
print("Wrote submission to file {}.".format(fname))
print("Loading data...")
X, labels = load_data('train.csv', train=True)
X, scaler = preprocess_data(X)
@ -96,31 +95,29 @@ print(dims, 'dims')
print("Building model...")
model = Sequential()
model.add(Dense(dims, 512, init='glorot_uniform'))
model.add(PReLU((512,)))
model.add(Dense(512, input_shape=(dims,)))
model.add(PReLU())
model.add(BatchNormalization((512,)))
model.add(Dropout(0.5))
model.add(Dense(512, 512, init='glorot_uniform'))
model.add(PReLU((512,)))
model.add(BatchNormalization((512,)))
model.add(Dense(512))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(512, 512, init='glorot_uniform'))
model.add(PReLU((512,)))
model.add(BatchNormalization((512,)))
model.add(Dense(512))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(512, nb_classes, init='glorot_uniform'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam")
print("Training model...")
model.fit(X, y, nb_epoch=20, batch_size=128, validation_split=0.15)
print("Generating submission...")
proba = model.predict_proba(X_test)
make_submission(proba, ids, encoder, fname='keras-otto.csv')

@ -4,7 +4,8 @@ from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.datasets.data_utils import get_file
import numpy as np
import random, sys
import random
import sys
'''
Example script to generate text from Nietzsche's writings.
@ -15,7 +16,7 @@ import random, sys
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''
@ -34,7 +35,7 @@ step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
sentences.append(text[i : i + maxlen])
sentences.append(text[i: i + maxlen])
next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
@ -50,20 +51,21 @@ for i, sentence in enumerate(sentences):
# build the model: 2 stacked LSTMs
print('Build model...')
model = Sequential()
model.add(LSTM(len(chars), 512, return_sequences=True))
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, 512, return_sequences=False))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(512, len(chars)))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
# helper function to sample an index from a probability array
def sample(a, temperature=1.0):
a = np.log(a)/temperature
a = np.exp(a)/np.sum(np.exp(a))
return np.argmax(np.random.multinomial(1,a,1))
a = np.log(a) / temperature
a = np.exp(a) / np.sum(np.exp(a))
return np.argmax(np.random.multinomial(1, a, 1))
# train the model, output generated text after each iteration
for iteration in range(1, 60):
@ -79,7 +81,7 @@ for iteration in range(1, 60):
print('----- diversity:', diversity)
generated = ''
sentence = text[start_index : start_index + maxlen]
sentence = text[start_index: start_index + maxlen]
generated += sentence
print('----- Generating with seed: "' + sentence + '"')
sys.stdout.write(generated)
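The sample() helper above implements temperature sampling: log-probabilities are divided by the temperature before re-normalizing, so low temperatures sharpen the distribution (more conservative text) and high temperatures flatten it (more surprising text). A small self-contained usage sketch, with made-up probabilities for illustration:

import numpy as np

def sample(a, temperature=1.0):
    # same helper as above: rescale log-probabilities by the temperature,
    # re-normalize, then draw one index from the resulting distribution
    a = np.log(a) / temperature
    a = np.exp(a) / np.sum(np.exp(a))
    return np.argmax(np.random.multinomial(1, a, 1))

preds = np.array([0.7, 0.2, 0.1])
print(sample(preds, temperature=0.2))  # almost always 0
print(sample(preds, temperature=1.2))  # noticeably more random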

@ -22,20 +22,20 @@ batch_size = 128
nb_classes = 10
nb_epoch = 12
# shape of the image (SHAPE x SHAPE)
shapex, shapey = 28, 28
# input image dimensions
img_rows, img_cols = 28, 28
# number of convolutional filters to use
nb_filters = 32
# level of pooling to perform (POOL x POOL)
# size of pooling area for max pooling
nb_pool = 2
# level of convolution to perform (CONV x CONV)
# convolution kernel size
nb_conv = 3
# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(X_train.shape[0], 1, shapex, shapey)
X_test = X_test.reshape(X_test.shape[0], 1, shapex, shapey)
X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
X_train = X_train.astype("float32")
X_test = X_test.astype("float32")
X_train /= 255
@ -50,22 +50,20 @@ Y_test = np_utils.to_categorical(y_test, nb_classes)
model = Sequential()
model.add(Convolution2D(nb_filters, 1, nb_conv, nb_conv, border_mode='full'))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv,
border_mode='full',
input_shape=(1, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, nb_filters, nb_conv, nb_conv))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
# the resulting image after conv and pooling is the original shape
# divided by the pooling with a number of filters for each "pixel"
# (the number of filters is determined by the last Conv2D)
model.add(Dense(nb_filters * (shapex / nb_pool) * (shapey / nb_pool), 128))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(128, nb_classes))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adadelta')

@ -55,11 +55,12 @@ Y_test = np_utils.to_categorical(y_test, nb_classes)
print('Evaluate IRNN...')
model = Sequential()
model.add(SimpleRNN(input_dim=1, output_dim=hidden_units,
model.add(SimpleRNN(output_dim=hidden_units,
init=lambda shape: normal(shape, scale=0.001),
inner_init=lambda shape: identity(shape, scale=1.0),
activation='relu', truncate_gradient=BPTT_truncate))
model.add(Dense(hidden_units, nb_classes))
activation='relu', truncate_gradient=BPTT_truncate,
input_shape=(None, 1)))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
rmsprop = RMSprop(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=rmsprop)
@ -73,8 +74,8 @@ print('IRNN test accuracy:', scores[1])
print('Compare to LSTM...')
model = Sequential()
model.add(LSTM(1, hidden_units))
model.add(Dense(hidden_units, nb_classes))
model.add(LSTM(hidden_units, input_shape=(None, 1)))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
rmsprop = RMSprop(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=rmsprop)

@ -37,13 +37,13 @@ Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
model = Sequential()
model.add(Dense(784, 128))
model.add(Dense(128, input_shape=(784,)))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 128))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128, 10))
model.add(Dense(10))
model.add(Activation('softmax'))
rms = RMSprop()

@ -45,10 +45,10 @@ print('Y_test shape:', Y_test.shape)
print("Building model...")
model = Sequential()
model.add(Dense(max_words, 512))
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512, nb_classes))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

@ -160,7 +160,6 @@ word_index = tokenizer.word_index
reverse_word_index = dict([(v, k) for k, v in list(word_index.items())])
def embed_word(w):
i = word_index.get(w)
if (not i) or (i < skip_top) or (i >= max_features):

@ -25,15 +25,19 @@ class PReLU(MaskedLayer):
Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
http://arxiv.org/pdf/1502.01852v1.pdf
'''
def __init__(self, input_shape, init='zero', weights=None, **kwargs):
super(PReLU, self).__init__(**kwargs)
def __init__(self, init='zero', weights=None, **kwargs):
self.init = initializations.get(init)
self.initial_weights = weights
super(PReLU, self).__init__(**kwargs)
def build(self):
input_shape = self.input_shape[1:]
self.alphas = self.init(input_shape)
self.params = [self.alphas]
self.input_shape = input_shape
if weights is not None:
self.set_weights(weights)
if self.initial_weights is not None:
self.set_weights(self.initial_weights)
del self.initial_weights
def get_output(self, train):
X = self.get_input(train)
@ -43,7 +47,6 @@ class PReLU(MaskedLayer):
def get_config(self):
return {"name": self.__class__.__name__,
"input_shape": self.input_shape,
"init": self.init.__name__}
@ -55,19 +58,23 @@ class ParametricSoftplus(MaskedLayer):
Inferring Nonlinear Neuronal Computation Based on Physiologically Plausible Inputs
http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003143
'''
def __init__(self, input_shape, alpha_init=0.2,
beta_init=5.0, weights=None, **kwargs):
super(ParametricSoftplus, self).__init__(**kwargs)
def __init__(self, alpha_init=0.2, beta_init=5.0,
weights=None, **kwargs):
self.alpha_init = alpha_init
self.beta_init = beta_init
self.alphas = sharedX(alpha_init * np.ones(input_shape))
self.betas = sharedX(beta_init * np.ones(input_shape))
self.initial_weights = weights
super(ParametricSoftplus, self).__init__(**kwargs)
def build(self):
input_shape = self.input_shape[1:]
self.alphas = sharedX(self.alpha_init * np.ones(input_shape))
self.betas = sharedX(self.beta_init * np.ones(input_shape))
self.params = [self.alphas, self.betas]
self.input_shape = input_shape
if weights is not None:
self.set_weights(weights)
if self.initial_weights is not None:
self.set_weights(self.initial_weights)
del self.initial_weights
def get_output(self, train):
X = self.get_input(train)
@ -75,7 +82,6 @@ class ParametricSoftplus(MaskedLayer):
def get_config(self):
return {"name": self.__class__.__name__,
"input_shape": self.input_shape,
"alpha_init": self.alpha_init,
"beta_init": self.beta_init}

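Both activation layers above are refactored along the same lines: __init__ now only records the configuration (and any initial weights), while the parameters are created in build(), where self.input_shape is finally known because the layer has been connected to a model. A minimal sketch of a custom layer written against this pattern; the layer itself (a trainable per-feature bias) is hypothetical and only illustrates the structure:

from keras.layers.core import MaskedLayer
from keras.utils.theano_utils import shared_zeros

class LearnedBias(MaskedLayer):
    '''Hypothetical example layer: adds a trainable per-feature bias.'''
    def __init__(self, weights=None, **kwargs):
        # record configuration only; no shapes are known yet
        self.initial_weights = weights
        super(LearnedBias, self).__init__(**kwargs)

    def build(self):
        # called once the layer is connected, so self.input_shape is available
        input_shape = self.input_shape[1:]
        self.b = shared_zeros(input_shape)
        self.params = [self.b]
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def get_output(self, train=False):
        return self.get_input(train) + self.b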
@ -51,7 +51,7 @@ def pool_output_length(input_length, pool_size, ignore_border, stride):
class Convolution1D(Layer):
input_ndim = 3
def __init__(self, input_dim, nb_filter, filter_length,
def __init__(self, nb_filter, filter_length,
init='uniform', activation='linear', weights=None,
border_mode='valid', subsample_length=1,
W_regularizer=None, b_regularizer=None, activity_regularizer=None,
@ -82,9 +82,9 @@ class Convolution1D(Layer):
def build(self):
input_dim = self.input_shape[2]
self.input = T.tensor3()
self.W_shape = (nb_filter, input_dim, filter_length, 1)
self.W_shape = (self.nb_filter, input_dim, self.filter_length, 1)
self.W = self.init(self.W_shape)
self.b = shared_zeros((nb_filter,))
self.b = shared_zeros((self.nb_filter,))
self.params = [self.W, self.b]
self.regularizers = []
@ -190,9 +190,9 @@ class Convolution2D(Layer):
def build(self):
stack_size = self.input_shape[1]
self.input = T.tensor4()
self.W_shape = (nb_filter, stack_size, nb_row, nb_col)
self.W_shape = (self.nb_filter, stack_size, self.nb_row, self.nb_col)
self.W = self.init(self.W_shape)
self.b = shared_zeros((nb_filter,))
self.b = shared_zeros((self.nb_filter,))
self.params = [self.W, self.b]
self.regularizers = []

@ -21,7 +21,8 @@ class Layer(object):
def __init__(self, **kwargs):
if 'input_shape' in kwargs:
self.set_input_shape(kwargs['input_shape'])
self.params = []
if not hasattr(self, 'params'):
self.params = []
def init_updates(self):
self.updates = []
@ -59,7 +60,7 @@ class Layer(object):
elif hasattr(self, '_input_shape'):
return self._input_shape
else:
raise Exception('Layer is not connected.')
raise Exception('Layer is not connected. Did you forget to set "input_shape"?')
def set_input_shape(self, input_shape):
if type(input_shape) not in [tuple, list]:
@ -283,7 +284,7 @@ class Merge(Layer):
elif self.mode == 'concat':
output_shape = list(input_shapes[0])
for shape in input_shapes[1:]:
output_shape[self.concat_axis] += shape[concat_axis]
output_shape[self.concat_axis] += shape[self.concat_axis]
return tuple(output_shape)
def get_params(self):
@ -528,7 +529,7 @@ class Dense(Layer):
self.input = T.matrix()
self.W = self.init((input_dim, self.output_dim))
self.b = shared_zeros((self.output_dim))
self.b = shared_zeros((self.output_dim,))
self.params = [self.W, self.b]
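Two small notes on the core.py changes above. The hasattr guard in Layer.__init__ is there because passing input_shape can trigger build() (via set_input_shape) before the rest of __init__ runs, and build() may already have populated self.params; the guard avoids clobbering them. The Dense bias change is about tuple syntax: (self.output_dim) is just a parenthesized integer, while (self.output_dim,) is a one-element tuple, so the trailing comma makes the intended shape unambiguous (np.zeros happens to accept both). A quick illustration of that difference, as a sketch:

import numpy as np

output_dim = 128
print(type((output_dim)))             # int: parentheses alone do not make a tuple
print(type((output_dim,)))            # tuple: the trailing comma does
print(np.zeros((output_dim,)).shape)  # (128,)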

@ -19,38 +19,40 @@ class Embedding(Layer):
'''
input_ndim = 2
def __init__(self, input_dim, output_dim, init='uniform',
def __init__(self, input_dim, output_dim, init='uniform', max_length=None,
W_regularizer=None, activity_regularizer=None, W_constraint=None,
mask_zero=False, weights=None, **kwargs):
super(Embedding, self).__init__(**kwargs)
self.init = initializations.get(init)
self.input_dim = input_dim
self.output_dim = output_dim
self.input = T.imatrix()
self.W = self.init((self.input_dim, self.output_dim))
self.init = initializations.get(init)
self.max_length = max_length
self.mask_zero = mask_zero
self.params = [self.W]
self.W_constraint = constraints.get(W_constraint)
self.constraints = [self.W_constraint]
self.regularizers = []
self.W_regularizer = regularizers.get(W_regularizer)
self.activity_regularizer = regularizers.get(activity_regularizer)
self.initial_weights = weights
kwargs['input_shape'] = (self.input_dim,)
super(Embedding, self).__init__(**kwargs)
def build(self):
self.input = T.imatrix()
self.W = self.init((self.input_dim, self.output_dim))
self.params = [self.W]
self.regularizers = []
if self.W_regularizer:
self.W_regularizer.set_param(self.W)
self.regularizers.append(self.W_regularizer)
self.activity_regularizer = regularizers.get(activity_regularizer)
if self.activity_regularizer:
self.activity_regularizer.set_layer(self)
self.regularizers.append(self.activity_regularizer)
if weights is not None:
self.set_weights(weights)
if self.initial_weights is not None:
self.set_weights(self.initial_weights)
def get_output_mask(self, train=None):
X = self.get_input(train)
@ -61,7 +63,7 @@ class Embedding(Layer):
@property
def output_shape(self):
return (self.input_shape[0], None, self.output_dim)
return (self.input_shape[0], self.max_length, self.output_dim)
def get_output(self, train=False):
X = self.get_input(train)
@ -73,6 +75,8 @@ class Embedding(Layer):
"input_dim": self.input_dim,
"output_dim": self.output_dim,
"init": self.init.__name__,
"max_length": self.max_length,
"mask_zero": self.mask_zero,
"activity_regularizer": self.activity_regularizer.get_config() if self.activity_regularizer else None,
"W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
"W_constraint": self.W_constraint.get_config() if self.W_constraint else None}

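The new max_length argument on Embedding (together with the reworked output_shape property) is what lets the imdb_cnn example above drop its hand-computed output_size: when the sequence length is known, the time dimension of the Embedding output is fixed and downstream Flatten/Dense layers can infer their sizes; when it is left as None, the time dimension stays unknown. A minimal sketch using the constructor shown above, with the imdb_cnn numbers (5000-word vocabulary, 100-dimensional embeddings, sequences of length 100):

from keras.models import Sequential
from keras.layers.core import Flatten, Dense
from keras.layers.embeddings import Embedding

model = Sequential()
model.add(Embedding(5000, 100, max_length=100))  # sequence length now known downstream
model.add(Flatten())                             # flattened size (100 steps * 100 dims) is inferred
model.add(Dense(1))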
@ -21,6 +21,7 @@ class BatchNormalization(Layer):
self.epsilon = epsilon
self.mode = mode
self.momentum = momentum
self.initial_weights = weights
super(BatchNormalization, self).__init__(**kwargs)
def build(self):
@ -34,8 +35,9 @@ class BatchNormalization(Layer):
self.params = [self.gamma, self.beta]
self.running_mean = shared_zeros(input_shape)
self.running_std = shared_ones((input_shape))
if weights is not None:
self.set_weights(weights)
if self.initial_weights is not None:
self.set_weights(self.initial_weights)
del self.initial_weights
def get_weights(self):
return super(BatchNormalization, self).get_weights() + [self.running_mean.get_value(), self.running_std.get_value()]