diff --git a/examples/addition_rnn.py b/examples/addition_rnn.py
index bf74d8949..c7133a0e2 100644
--- a/examples/addition_rnn.py
+++ b/examples/addition_rnn.py
@@ -23,12 +23,10 @@ Four digits inverted:
 Five digits inverted:
 + One layer LSTM (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs
-
 '''
 from __future__ import print_function
 from keras.models import Sequential
-from keras.engine.training import slice_X
 from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent
 import numpy as np
 from six.moves import range
@@ -57,15 +55,15 @@ class CharacterTable(object):
             num_rows: Number of rows in the returned one hot encoding. This is
                 used to keep the # of rows for each data the same.
         """
-        X = np.zeros((num_rows, len(self.chars)))
+        x = np.zeros((num_rows, len(self.chars)))
         for i, c in enumerate(C):
-            X[i, self.char_indices[c]] = 1
-        return X
+            x[i, self.char_indices[c]] = 1
+        return x
-    def decode(self, X, calc_argmax=True):
+    def decode(self, x, calc_argmax=True):
         if calc_argmax:
-            X = X.argmax(axis=-1)
-        return ''.join(self.indices_char[x] for x in X)
+            x = x.argmax(axis=-1)
+        return ''.join(self.indices_char[x] for x in x)
 class colors:
@@ -80,7 +78,7 @@ INVERT = True
 # Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
 # int is DIGITS.
 MAXLEN = DIGITS + 1 + DIGITS
 # All the numbers, plus sign and space for padding.
 chars = '0123456789+ '
@@ -95,14 +93,14 @@ while len(questions) < TRAINING_SIZE:
                     for i in range(np.random.randint(1, DIGITS + 1))))
     a, b = f(), f()
     # Skip any addition questions we've already seen
-    # Also skip any such that X+Y == Y+X (hence the sorting).
+    # Also skip any such that x+y == y+x (hence the sorting).
     key = tuple(sorted((a, b)))
     if key in seen:
         continue
     seen.add(key)
     # Pad the data with spaces such that it is always MAXLEN.
     q = '{}+{}'.format(a, b)
     query = q + ' ' * (MAXLEN - len(q))
     ans = str(a + b)
     # Answers can be of maximum size DIGITS + 1.
     ans += ' ' * (DIGITS + 1 - len(ans))
@@ -115,31 +113,31 @@ while len(questions) < TRAINING_SIZE:
 print('Total addition questions:', len(questions))
 print('Vectorization...')
-X = np.zeros((len(questions), MAXLEN, len(chars)), dtype=np.bool)
+x = np.zeros((len(questions), MAXLEN, len(chars)), dtype=np.bool)
 y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=np.bool)
 for i, sentence in enumerate(questions):
-    X[i] = ctable.encode(sentence, MAXLEN)
+    x[i] = ctable.encode(sentence, MAXLEN)
 for i, sentence in enumerate(expected):
     y[i] = ctable.encode(sentence, DIGITS + 1)
-# Shuffle (X, y) in unison as the later parts of X will almost all be larger
+# Shuffle (x, y) in unison as the later parts of x will almost all be larger
 # digits.
 indices = np.arange(len(y))
 np.random.shuffle(indices)
-X = X[indices]
+x = x[indices]
 y = y[indices]
 # Explicitly set apart 10% for validation data that we never train over.
-split_at = len(X) - len(X) // 10
-(X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))
-(y_train, y_val) = (y[:split_at], y[split_at:])
+split_at = len(x) - len(x) // 10
+(x_train, x_val) = x[:split_at], x[split_at:]
+(y_train, y_val) = y[:split_at], y[split_at:]
 print('Training Data:')
-print(X_train.shape)
+print(x_train.shape)
 print(y_train.shape)
 print('Validation Data:')
-print(X_val.shape)
+print(x_val.shape)
 print(y_val.shape)
 # Try replacing GRU, or SimpleRNN.
@@ -153,7 +151,7 @@ model = Sequential()
 # "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE.
 # Note: In a situation where your input sequences have a variable length,
 # use input_shape=(None, num_feature).
 model.add(RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(chars))))
 # As the decoder RNN's input, repeatedly provide with the last hidden state of
 # RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum
 # length of output, e.g., when DIGITS=3, max output is 999+999=1998.
@@ -181,15 +179,15 @@ for iteration in range(1, 200):
     print()
     print('-' * 50)
     print('Iteration', iteration)
-    model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=1,
-              validation_data=(X_val, y_val))
+    model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=1,
+              validation_data=(x_val, y_val))
     # Select 10 samples from the validation set at random so we can visualize
     # errors.
     for i in range(10):
-        ind = np.random.randint(0, len(X_val))
-        rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]
-        preds = model.predict_classes(rowX, verbose=0)
-        q = ctable.decode(rowX[0])
+        ind = np.random.randint(0, len(x_val))
+        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
+        preds = model.predict_classes(rowx, verbose=0)
+        q = ctable.decode(rowx[0])
         correct = ctable.decode(rowy[0])
         guess = ctable.decode(preds[0], calc_argmax=False)
         print('Q', q[::-1] if INVERT else q)
diff --git a/examples/antirectifier.py b/examples/antirectifier.py
index 398175eb4..4a4269b62 100644
--- a/examples/antirectifier.py
+++ b/examples/antirectifier.py
@@ -52,11 +52,11 @@ class Antirectifier(Layer):
         shape[-1] *= 2
         return tuple(shape)
-    def call(self, x, mask=None):
-        x -= K.mean(x, axis=1, keepdims=True)
-        x = K.l2_normalize(x, axis=1)
-        pos = K.relu(x)
-        neg = K.relu(-x)
+    def call(self, inputs):
+        inputs -= K.mean(inputs, axis=1, keepdims=True)
+        inputs = K.l2_normalize(inputs, axis=1)
+        pos = K.relu(inputs)
+        neg = K.relu(-inputs)
         return K.concatenate([pos, neg], axis=1)
 # global parameters
@@ -65,16 +65,16 @@ num_classes = 10
 epochs = 40
 # the data, shuffled and split between train and test sets
-(X_train, y_train), (X_test, y_test) = mnist.load_data()
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
-X_train = X_train.reshape(60000, 784)
-X_test = X_test.reshape(10000, 784)
-X_train = X_train.astype('float32')
-X_test = X_test.astype('float32')
-X_train /= 255
-X_test /= 255
-print(X_train.shape[0], 'train samples')
-print(X_test.shape[0], 'test samples')
+x_train = x_train.reshape(60000, 784)
+x_test = x_test.reshape(10000, 784)
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
 # convert class vectors to binary class matrices
 Y_train = np_utils.to_categorical(y_train, num_classes)
@@ -97,9 +97,9 @@ model.compile(loss='categorical_crossentropy',
               metrics=['accuracy'])
 # train the model
-model.fit(X_train, Y_train,
+model.fit(x_train, Y_train,
           batch_size=batch_size, epochs=epochs,
-          verbose=1, validation_data=(X_test, Y_test))
+          verbose=1, validation_data=(x_test, Y_test))
 # next, compare with an equivalent network
 # with 2x bigger Dense layers and ReLU
diff --git a/examples/babi_rnn.py b/examples/babi_rnn.py
index 1c11a8445..c04db8250 100644
--- a/examples/babi_rnn.py
+++ b/examples/babi_rnn.py
@@ -12,7 +12,7 @@ QA2 - Two Supporting Facts | 20 | 50.0
 QA3 - Three Supporting Facts | 20 | 20.5
 QA4 - Two Arg. Relations | 61 | 62.9
 QA5 - Three Arg. Relations | 70 | 61.9
 QA6 - Yes/No Questions | 48 | 50.7
 QA7 - Counting | 49 | 78.9
 QA8 - Lists/Sets | 45 | 77.2
 QA9 - Simple Negation | 64 | 64.0
@@ -62,13 +62,12 @@ import re
 import tarfile
 import numpy as np
-np.random.seed(1337)  # for reproducibility
 from keras.utils.data_utils import get_file
 from keras.layers.embeddings import Embedding
-from keras.layers import Dense, Merge, Dropout, RepeatVector
+from keras import layers
 from keras.layers import recurrent
-from keras.models import Sequential
+from keras.models import Model
 from keras.preprocessing.sequence import pad_sequences
@@ -125,26 +124,26 @@ def get_stories(f, only_supporting=False, max_length=None):
 def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
-    X = []
-    Xq = []
-    Y = []
+    xs = []
+    xqs = []
+    ys = []
     for story, query, answer in data:
         x = [word_idx[w] for w in story]
         xq = [word_idx[w] for w in query]
         y = np.zeros(len(word_idx) + 1)  # let's not forget that index 0 is reserved
         y[word_idx[answer]] = 1
-        X.append(x)
-        Xq.append(xq)
-        Y.append(y)
-    return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y)
+        xs.append(x)
+        xqs.append(xq)
+        ys.append(y)
+    return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqs, maxlen=query_maxlen), np.array(ys)
 RNN = recurrent.LSTM
 EMBED_HIDDEN_SIZE = 50
 SENT_HIDDEN_SIZE = 100
 QUERY_HIDDEN_SIZE = 100
 BATCH_SIZE = 32
 EPOCHS = 40
 print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE))
 try:
     path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
@@ -172,40 +171,38 @@ word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
 story_maxlen = max(map(len, (x for x, _, _ in train + test)))
 query_maxlen = max(map(len, (x for _, x, _ in train + test)))
-X, Xq, Y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
-tX, tXq, tY = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)
+x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
+tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)
 print('vocab = {}'.format(vocab))
-print('X.shape = {}'.format(X.shape))
-print('Xq.shape = {}'.format(Xq.shape))
-print('Y.shape = {}'.format(Y.shape))
+print('x.shape = {}'.format(x.shape))
+print('xq.shape = {}'.format(xq.shape))
+print('y.shape = {}'.format(y.shape))
 print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))
 print('Build model...')
-sentrnn = Sequential()
-sentrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE,
-                      input_length=story_maxlen))
-sentrnn.add(Dropout(0.3))
+sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
+encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)
+encoded_sentence = layers.Dropout(0.3)(encoded_sentence)
-qrnn = Sequential()
-qrnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE,
-                   input_length=query_maxlen))
-qrnn.add(Dropout(0.3))
-qrnn.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
-qrnn.add(RepeatVector(story_maxlen))
+question = layers.Input(shape=(query_maxlen,), dtype='int32')
+encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
+encoded_question = layers.Dropout(0.3)(encoded_question)
+encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question)
+encoded_question = layers.RepeatVector(story_maxlen)(encoded_question)
-model = Sequential()
-model.add(Merge([sentrnn, qrnn], mode='sum'))
-model.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
-model.add(Dropout(0.3))
-model.add(Dense(vocab_size, activation='softmax'))
+merged = layers.add([encoded_sentence, encoded_question])
+merged = RNN(EMBED_HIDDEN_SIZE)(merged)
+merged = layers.Dropout(0.3)(merged)
+preds = layers.Dense(vocab_size, activation='softmax')(merged)
+model = Model([sentence, question], preds)
 model.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
 print('Training')
-model.fit([X, Xq], Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.05)
-loss, acc = model.evaluate([tX, tXq], tY, batch_size=BATCH_SIZE)
+model.fit([x, xq], y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.05)
+loss, acc = model.evaluate([tx, txq], ty, batch_size=BATCH_SIZE)
 print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))
diff --git a/examples/cifar10_cnn.py b/examples/cifar10_cnn.py
index 92a3ed308..f27265bd3 100644
--- a/examples/cifar10_cnn.py
+++ b/examples/cifar10_cnn.py
@@ -1,7 +1,7 @@
 '''Train a simple deep CNN on the CIFAR10 small images dataset.
 GPU run command with Theano backend (with TensorFlow, the GPU is automatically used):
     THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python cifar10_cnn.py
 It gets down to 0.65 test logloss in 25 epochs, and down to 0.55 after 50 epochs.
 (it's still underfitting at that point, though).
@@ -12,7 +12,7 @@ from keras.datasets import cifar10
 from keras.preprocessing.image import ImageDataGenerator
 from keras.models import Sequential
 from keras.layers import Dense, Dropout, Activation, Flatten
-from keras.layers import Convolution2D, MaxPooling2D
+from keras.layers import Conv2D, MaxPooling2D
 from keras.utils import np_utils
 batch_size = 32
@@ -26,28 +26,28 @@ img_rows, img_cols = 32, 32
 img_channels = 3
 # The data, shuffled and split between train and test sets:
-(X_train, y_train), (X_test, y_test) = cifar10.load_data()
-print('X_train shape:', X_train.shape)
-print(X_train.shape[0], 'train samples')
-print(X_test.shape[0], 'test samples')
+(x_train, y_train), (x_test, y_test) = cifar10.load_data()
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
 # Convert class vectors to binary class matrices.
-Y_train = np_utils.to_categorical(y_train, num_classes)
-Y_test = np_utils.to_categorical(y_test, num_classes)
+y_train = np_utils.to_categorical(y_train, num_classes)
+y_test = np_utils.to_categorical(y_test, num_classes)
 model = Sequential()
-model.add(Convolution2D(32, 3, 3, border_mode='same',
-                        input_shape=X_train.shape[1:]))
+model.add(Conv2D(32, (3, 3), padding='same',
+                 input_shape=x_train.shape[1:]))
 model.add(Activation('relu'))
-model.add(Convolution2D(32, 3, 3))
+model.add(Conv2D(32, (3, 3)))
 model.add(Activation('relu'))
 model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))
-model.add(Convolution2D(64, 3, 3, border_mode='same'))
+model.add(Conv2D(64, (3, 3), padding='same'))
 model.add(Activation('relu'))
-model.add(Convolution2D(64, 3, 3))
+model.add(Conv2D(64, (3, 3)))
 model.add(Activation('relu'))
 model.add(MaxPooling2D(pool_size=(2, 2)))
 model.add(Dropout(0.25))
@@ -64,17 +64,17 @@ model.compile(loss='categorical_crossentropy',
               optimizer='rmsprop',
               metrics=['accuracy'])
-X_train = X_train.astype('float32')
-X_test = X_test.astype('float32')
-X_train /= 255
-X_test /= 255
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
 if not data_augmentation:
     print('Not using data augmentation.')
-    model.fit(X_train, Y_train,
+    model.fit(x_train, y_train,
              batch_size=batch_size, epochs=epochs,
-              validation_data=(X_test, Y_test),
+              validation_data=(x_test, y_test),
              shuffle=True)
 else:
     print('Using real-time data augmentation.')
@@ -93,11 +93,11 @@ else:
     # Compute quantities required for featurewise normalization
     # (std, mean, and principal components if ZCA whitening is applied).
-    datagen.fit(X_train)
+    datagen.fit(x_train)
     # Fit the model on the batches generated by datagen.flow().
-    model.fit_generator(datagen.flow(X_train, Y_train,
+    model.fit_generator(datagen.flow(x_train, y_train,
                                      batch_size=batch_size),
-                        samples_per_epoch=X_train.shape[0],
+                        steps_per_epoch=x_train.shape[0] // batch_size,
                         epochs=epochs,
-                        validation_data=(X_test, Y_test))
+                        validation_data=(x_test, y_test))
diff --git a/examples/conv_lstm.py b/examples/conv_lstm.py
index 9b4e758ab..653f75481 100644
--- a/examples/conv_lstm.py
+++ b/examples/conv_lstm.py
@@ -3,7 +3,7 @@ This network is used to predict the next frame of an artificially
 generated movie which contains moving squares.
 """
 from keras.models import Sequential
-from keras.layers.convolutional import Convolution3D
+from keras.layers.convolutional import Conv3D
 from keras.layers.convolutional_recurrent import ConvLSTM2D
 from keras.layers.normalization import BatchNormalization
 import numpy as np
@@ -14,27 +14,26 @@ import pylab as plt
 # of identical shape.
 seq = Sequential()
-seq.add(ConvLSTM2D(filters=40, num_row=3, num_col=3,
+seq.add(ConvLSTM2D(filters=40, kernel_size=(3, 3),
                    input_shape=(None, 40, 40, 1),
-                   border_mode='same', return_sequences=True))
+                   padding='same', return_sequences=True))
 seq.add(BatchNormalization())
-seq.add(ConvLSTM2D(filters=40, num_row=3, num_col=3,
-                   border_mode='same', return_sequences=True))
+seq.add(ConvLSTM2D(filters=40, kernel_size=(3, 3),
+                   padding='same', return_sequences=True))
 seq.add(BatchNormalization())
-seq.add(ConvLSTM2D(filters=40, num_row=3, num_col=3,
-                   border_mode='same', return_sequences=True))
+seq.add(ConvLSTM2D(filters=40, kernel_size=(3, 3),
+                   padding='same', return_sequences=True))
 seq.add(BatchNormalization())
-seq.add(ConvLSTM2D(filters=40, num_row=3, num_col=3,
-                   border_mode='same', return_sequences=True))
+seq.add(ConvLSTM2D(filters=40, kernel_size=(3, 3),
+                   padding='same', return_sequences=True))
 seq.add(BatchNormalization())
-seq.add(Convolution3D(filters=1, kernel_dim1=1, kernel_dim2=3,
-                      kernel_dim3=3, activation='sigmoid',
-                      border_mode='same', data_format='channels_last'))
-
+seq.add(Conv3D(filters=1, kernel_size=(3, 3, 3),
+               activation='sigmoid',
+               padding='same', data_format='channels_last'))
 seq.compile(loss='binary_crossentropy', optimizer='adadelta')
diff --git a/examples/imdb_lstm.py b/examples/imdb_lstm.py
index a5f8ddeae..c83d5c6c7 100644
--- a/examples/imdb_lstm.py
+++ b/examples/imdb_lstm.py
@@ -12,7 +12,6 @@ from what you see with CNNs/MLPs/etc.
 '''
 from __future__ import print_function
 import numpy as np
-np.random.seed(1337)  # for reproducibility
 from keras.preprocessing import sequence
 from keras.models import Sequential
@@ -37,10 +36,9 @@ print('x_test shape:', x_test.shape)
 print('Build model...')
 model = Sequential()
-model.add(Embedding(max_features, 128, dropout=0.2))
-model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
-model.add(Dense(1))
-model.add(Activation('sigmoid'))
+model.add(Embedding(max_features, 128))
+model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
+model.add(Dense(1, activation='sigmoid'))
 # try using different optimizers and different optimizer configs
 model.compile(loss='binary_crossentropy',
diff --git a/keras/preprocessing/sequence.py b/keras/preprocessing/sequence.py
index 2b5064f66..ac26c9f53 100644
--- a/keras/preprocessing/sequence.py
+++ b/keras/preprocessing/sequence.py
@@ -33,7 +33,14 @@ def pad_sequences(sequences, maxlen=None, dtype='int32',
         ValueError: in case of invalid values for `truncating` or `padding`,
             or in case of invalid shape for a `sequences` entry.
     """
-    lengths = [len(s) for s in sequences]
+    if not hasattr(sequences, '__len__'):
+        raise ValueError('`sequences` must be iterable.')
+    lengths = []
+    for x in sequences:
+        if not hasattr(x, '__len__'):
+            raise ValueError('`sequences` must be a list of iterables. '
+                             'Found non-iterable: ' + str(x))
+        lengths.append(len(x))
     num_samples = len(sequences)
     if maxlen is None:
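
Note on the `pad_sequences` change above (illustration only, not part of the diff): a minimal sketch of the expected behavior, assuming a Keras build that includes this branch. Valid input is handled exactly as before; an entry that is not itself a sequence now fails fast with a descriptive ValueError instead of an opaque TypeError from `len()`.

    from keras.preprocessing.sequence import pad_sequences

    # Normal usage is unchanged: lists of lists are padded (pre-padded by default) to equal length.
    print(pad_sequences([[1, 2, 3], [4, 5]], maxlen=4))
    # [[0 1 2 3]
    #  [0 0 4 5]]

    # A non-iterable entry is now rejected up front with a clear message.
    try:
        pad_sequences([[1, 2, 3], 4])
    except ValueError as e:
        print(e)  # `sequences` must be a list of iterables. Found non-iterable: 4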