From aa826d684d97c7871daa65cdbaf19eaac32955c2 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Sat, 11 Mar 2017 19:44:29 -0800 Subject: [PATCH] Finish updating examples. --- examples/babi_memnn.py | 2 +- examples/conv_filter_visualization.py | 1 + examples/deep_dream.py | 37 +++++------ examples/imdb_bidirectional_lstm.py | 23 +++---- examples/imdb_cnn.py | 21 +++---- examples/imdb_cnn_lstm.py | 23 +++---- examples/imdb_fasttext.py | 37 ++++++----- examples/imdb_lstm.py | 3 +- examples/lstm_benchmark.py | 8 ++- examples/lstm_text_generation.py | 2 +- examples/mnist_acgan.py | 39 +++++++----- examples/mnist_hierarchical_rnn.py | 11 ++-- examples/mnist_siamese_graph.py | 7 ++- examples/mnist_sklearn_wrapper.py | 10 +-- examples/mnist_swwae.py | 91 +++++++++++++-------------- examples/neural_doodle.py | 10 +-- examples/stateful_lstm.py | 10 +-- 17 files changed, 170 insertions(+), 165 deletions(-) diff --git a/examples/babi_memnn.py b/examples/babi_memnn.py index f477d700e..7580de4da 100644 --- a/examples/babi_memnn.py +++ b/examples/babi_memnn.py @@ -12,8 +12,8 @@ References: Reaches 98.6% accuracy on task 'single_supporting_fact_10k' after 120 epochs. Time per epoch: 3s on CPU (core i7). ''' - from __future__ import print_function + from keras.models import Sequential from keras.layers.embeddings import Embedding from keras.layers import Activation, Dense, Merge, Permute, Dropout diff --git a/examples/conv_filter_visualization.py b/examples/conv_filter_visualization.py index 66d4af9ee..b85b86797 100644 --- a/examples/conv_filter_visualization.py +++ b/examples/conv_filter_visualization.py @@ -5,6 +5,7 @@ This script can run on CPU in a few minutes (with the TensorFlow backend). Results example: http://i.imgur.com/4nj4KjN.jpg ''' from __future__ import print_function + from scipy.misc import imsave import numpy as np import time diff --git a/examples/deep_dream.py b/examples/deep_dream.py index 84664177e..5022ac24b 100644 --- a/examples/deep_dream.py +++ b/examples/deep_dream.py @@ -15,6 +15,7 @@ If running on CPU, prefer the TensorFlow backend (much faster). Example results: http://i.imgur.com/FX6ROg9.jpg ''' from __future__ import print_function + from keras.preprocessing.image import load_img, img_to_array import numpy as np from scipy.misc import imsave @@ -57,21 +58,19 @@ saved_settings = { # the settings we will use in this experiment settings = saved_settings['dreamy'] -# util function to open, resize and format pictures into appropriate tensors - def preprocess_image(image_path): + # util function to open, resize and format pictures + # into appropriate tensors img = load_img(image_path, target_size=(img_height, img_width)) img = img_to_array(img) img = np.expand_dims(img, axis=0) img = vgg16.preprocess_input(img) return img -# util function to convert a tensor into a valid image - def deprocess_image(x): - + # util function to convert a tensor into a valid image if K.image_data_format() == 'channels_first': x = x.reshape((3, img_height, img_width)) x = x.transpose((1, 2, 0)) @@ -102,10 +101,9 @@ print('Model loaded.') # get the symbolic outputs of each "key" layer (we gave them unique names). 
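# For example, a loss term can later fetch a layer's features by name,
# e.g. layer_dict['conv5_1'].output (a sketch only; 'conv5_1' stands in
# for whichever layer names the settings dict above references).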
layer_dict = dict([(layer.name, layer) for layer in model.layers]) -# continuity loss util function - def continuity_loss(x): + # continuity loss util function assert K.ndim(x) == 4 if K.image_data_format() == 'channels_first': a = K.square(x[:, :, :img_height - 1, :img_width - 1] - @@ -162,15 +160,17 @@ def eval_loss_and_grads(x): grad_values = np.array(outs[1:]).flatten().astype('float64') return loss_value, grad_values -# this Evaluator class makes it possible -# to compute loss and gradients in one pass -# while retrieving them via two separate functions, -# "loss" and "grads". This is done because scipy.optimize -# requires separate functions for loss and gradients, -# but computing them separately would be inefficient. - class Evaluator(object): + """Loss and gradients evaluator. + + This Evaluator class makes it possible + to compute loss and gradients in one pass + while retrieving them via two separate functions, + "loss" and "grads". This is done because scipy.optimize + requires separate functions for loss and gradients, + but computing them separately would be inefficient. + """ def __init__(self): self.loss_value = None @@ -192,22 +192,23 @@ class Evaluator(object): evaluator = Evaluator() -# run scipy-based optimization (L-BFGS) over the pixels of the generated image +# Run scipy-based optimization (L-BFGS) over the pixels of the generated image # so as to minimize the loss x = preprocess_image(base_image_path) for i in range(5): print('Start of iteration', i) start_time = time.time() - # add a random jitter to the initial image. This will be reverted at decoding time + # Add a random jitter to the initial image. + # This will be reverted at decoding time random_jitter = (settings['jitter'] * 2) * (np.random.random(img_size) - 0.5) x += random_jitter - # run L-BFGS for 7 steps + # Run L-BFGS for 7 steps x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(), fprime=evaluator.grads, maxfun=7) print('Current loss value:', min_val) - # decode the dream and save it + # Decode the dream and save it x = x.reshape(img_size) x -= random_jitter img = deprocess_image(np.copy(x)) diff --git a/examples/imdb_bidirectional_lstm.py b/examples/imdb_bidirectional_lstm.py index 48ef039d3..f27402355 100644 --- a/examples/imdb_bidirectional_lstm.py +++ b/examples/imdb_bidirectional_lstm.py @@ -6,7 +6,6 @@ Time per epoch on CPU (Core i7): ~150s. 
from __future__ import print_function import numpy as np -np.random.seed(1337) # for reproducibility from keras.preprocessing import sequence from keras.models import Sequential @@ -15,19 +14,21 @@ from keras.datasets import imdb max_features = 20000 -maxlen = 100 # cut texts after this number of words (among top max_features most common words) +# cut texts after this number of words +# (among top max_features most common words) +maxlen = 100 batch_size = 32 print('Loading data...') -(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) -print(len(X_train), 'train sequences') -print(len(X_test), 'test sequences') +(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) +print(len(x_train), 'train sequences') +print(len(x_test), 'test sequences') print("Pad sequences (samples x time)") -X_train = sequence.pad_sequences(X_train, maxlen=maxlen) -X_test = sequence.pad_sequences(X_test, maxlen=maxlen) -print('X_train shape:', X_train.shape) -print('X_test shape:', X_test.shape) +x_train = sequence.pad_sequences(x_train, maxlen=maxlen) +x_test = sequence.pad_sequences(x_test, maxlen=maxlen) +print('x_train shape:', x_train.shape) +print('x_test shape:', x_test.shape) y_train = np.array(y_train) y_test = np.array(y_test) @@ -41,7 +42,7 @@ model.add(Dense(1, activation='sigmoid')) model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) print('Train...') -model.fit(X_train, y_train, +model.fit(x_train, y_train, batch_size=batch_size, epochs=4, - validation_data=[X_test, y_test]) + validation_data=[x_test, y_test]) diff --git a/examples/imdb_cnn.py b/examples/imdb_cnn.py index f78bd332a..1b54128aa 100644 --- a/examples/imdb_cnn.py +++ b/examples/imdb_cnn.py @@ -7,8 +7,6 @@ Gets to 0.89 test accuracy after 2 epochs. ''' from __future__ import print_function -import numpy as np -np.random.seed(1337) # for reproducibility from keras.preprocessing import sequence from keras.models import Sequential @@ -17,7 +15,6 @@ from keras.layers import Embedding from keras.layers import Conv1D, GlobalMaxPooling1D from keras.datasets import imdb - # set parameters: max_features = 5000 maxlen = 400 @@ -29,15 +26,15 @@ hidden_dims = 250 epochs = 2 print('Loading data...') -(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) -print(len(X_train), 'train sequences') -print(len(X_test), 'test sequences') +(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) +print(len(x_train), 'train sequences') +print(len(x_test), 'test sequences') print('Pad sequences (samples x time)') -X_train = sequence.pad_sequences(X_train, maxlen=maxlen) -X_test = sequence.pad_sequences(X_test, maxlen=maxlen) -print('X_train shape:', X_train.shape) -print('X_test shape:', X_test.shape) +x_train = sequence.pad_sequences(x_train, maxlen=maxlen) +x_test = sequence.pad_sequences(x_test, maxlen=maxlen) +print('x_train shape:', x_train.shape) +print('x_test shape:', x_test.shape) print('Build model...') model = Sequential() @@ -71,7 +68,7 @@ model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) -model.fit(X_train, y_train, +model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, - validation_data=(X_test, y_test)) + validation_data=(x_test, y_test)) diff --git a/examples/imdb_cnn_lstm.py b/examples/imdb_cnn_lstm.py index 5df5e12b9..4730fd732 100644 --- a/examples/imdb_cnn_lstm.py +++ b/examples/imdb_cnn_lstm.py @@ -4,8 +4,6 @@ classification task. 
Gets to 0.8498 test accuracy after 2 epochs. 41s/epoch on K520 GPU.
 '''
 from __future__ import print_function
-import numpy as np
-np.random.seed(1337)  # for reproducibility
 
 from keras.preprocessing import sequence
 from keras.models import Sequential
@@ -15,7 +13,6 @@ from keras.layers import LSTM
 from keras.layers import Conv1D, MaxPooling1D
 from keras.datasets import imdb
 
-
 # Embedding
 max_features = 20000
 maxlen = 100
@@ -40,15 +37,15 @@ Only 2 epochs are needed as the dataset is very small.
 '''
 
 print('Loading data...')
-(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
-print(len(X_train), 'train sequences')
-print(len(X_test), 'test sequences')
+(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
+print(len(x_train), 'train sequences')
+print(len(x_test), 'test sequences')
 
 print('Pad sequences (samples x time)')
-X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
-X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
-print('X_train shape:', X_train.shape)
-print('X_test shape:', X_test.shape)
+x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
+x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
+print('x_train shape:', x_train.shape)
+print('x_test shape:', x_test.shape)
 
 print('Build model...')
 
@@ -70,8 +67,8 @@ model.compile(loss='binary_crossentropy',
               metrics=['accuracy'])
 
 print('Train...')
-model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
-          validation_data=(X_test, y_test))
-score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
+model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
+          validation_data=(x_test, y_test))
+score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
 print('Test score:', score)
 print('Test accuracy:', acc)
diff --git a/examples/imdb_fasttext.py b/examples/imdb_fasttext.py
index 78abccb02..6a7e6ff37 100644
--- a/examples/imdb_fasttext.py
+++ b/examples/imdb_fasttext.py
@@ -7,12 +7,11 @@ https://arxiv.org/abs/1607.01759
 
 Results on IMDB datasets with uni and bi-gram embeddings:
     Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 cpu.
     Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTX 980M gpu.
 '''
 from __future__ import print_function
 import numpy as np
-np.random.seed(1337)  # for reproducibility
 
 from keras.preprocessing import sequence
 from keras.models import Sequential
@@ -73,17 +72,17 @@ embedding_dims = 50
 epochs = 5
 
 print('Loading data...')
-(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
-print(len(X_train), 'train sequences')
-print(len(X_test), 'test sequences')
-print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
-print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))
+(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
+print(len(x_train), 'train sequences')
+print(len(x_test), 'test sequences')
+print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
+print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
 
 if ngram_range > 1:
     print('Adding {}-gram features'.format(ngram_range))
     # Create set of unique n-gram from the training set.
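    # For instance, assuming the create_ngram_set helper defined earlier in
    # this file: create_ngram_set([1, 4, 9, 4], ngram_value=2) returns
    # {(1, 4), (4, 9), (9, 4)}.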
ngram_set = set() - for input_list in X_train: + for input_list in x_train: for i in range(2, ngram_range + 1): set_of_ngram = create_ngram_set(input_list, ngram_value=i) ngram_set.update(set_of_ngram) @@ -98,17 +97,17 @@ if ngram_range > 1: # max_features is the highest integer that could be found in the dataset. max_features = np.max(list(indice_token.keys())) + 1 - # Augmenting X_train and X_test with n-grams features - X_train = add_ngram(X_train, token_indice, ngram_range) - X_test = add_ngram(X_test, token_indice, ngram_range) - print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) - print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) + # Augmenting x_train and x_test with n-grams features + x_train = add_ngram(x_train, token_indice, ngram_range) + x_test = add_ngram(x_test, token_indice, ngram_range) + print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) + print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) print('Pad sequences (samples x time)') -X_train = sequence.pad_sequences(X_train, maxlen=maxlen) -X_test = sequence.pad_sequences(X_test, maxlen=maxlen) -print('X_train shape:', X_train.shape) -print('X_test shape:', X_test.shape) +x_train = sequence.pad_sequences(x_train, maxlen=maxlen) +x_test = sequence.pad_sequences(x_test, maxlen=maxlen) +print('x_train shape:', x_train.shape) +print('x_test shape:', x_test.shape) print('Build model...') model = Sequential() @@ -130,7 +129,7 @@ model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) -model.fit(X_train, y_train, +model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, - validation_data=(X_test, y_test)) + validation_data=(x_test, y_test)) diff --git a/examples/imdb_lstm.py b/examples/imdb_lstm.py index c83d5c6c7..bc51a396f 100644 --- a/examples/imdb_lstm.py +++ b/examples/imdb_lstm.py @@ -11,11 +11,10 @@ Some configurations won't converge. from what you see with CNNs/MLPs/etc. 
''' from __future__ import print_function -import numpy as np from keras.preprocessing import sequence from keras.models import Sequential -from keras.layers import Dense, Activation, Embedding +from keras.layers import Dense, Embedding from keras.layers import LSTM from keras.datasets import imdb diff --git a/examples/lstm_benchmark.py b/examples/lstm_benchmark.py index 329b3df7f..403fefdfc 100644 --- a/examples/lstm_benchmark.py +++ b/examples/lstm_benchmark.py @@ -43,9 +43,13 @@ for mode in modes: print('Testing mode: implementation={}'.format(mode)) model = Sequential() - model.add(Embedding(max_features, embedding_dim, input_length=max_length)) + model.add(Embedding(max_features, embedding_dim, + input_length=max_length)) model.add(Dropout(0.2)) - model.add(LSTM(embedding_dim, dropout=0.2, recurrent_dropout=0.2, implementation=mode)) + model.add(LSTM(embedding_dim, + dropout=0.2, + recurrent_dropout=0.2, + implementation=mode)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', diff --git a/examples/lstm_text_generation.py b/examples/lstm_text_generation.py index 6559c0165..912d91663 100644 --- a/examples/lstm_text_generation.py +++ b/examples/lstm_text_generation.py @@ -20,7 +20,7 @@ import numpy as np import random import sys -path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt") +path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt') text = open(path).read().lower() print('corpus length:', len(text)) diff --git a/examples/mnist_acgan.py b/examples/mnist_acgan.py index 87ca7466e..c85337d7a 100644 --- a/examples/mnist_acgan.py +++ b/examples/mnist_acgan.py @@ -6,8 +6,8 @@ MNIST dataset. See https://arxiv.org/abs/1610.09585 for more details. You should start to see reasonable images after ~5 epochs, and good images by ~15 epochs. You should use a GPU, as the convolution-heavy operations are -very slow on the CPU. Prefer the TensorFlow backend if you plan on iterating, as -the compilation time can be a blocker using Theano. +very slow on the CPU. Prefer the TensorFlow backend if you plan on iterating, +as the compilation time can be a blocker using Theano. 
Timings: @@ -33,9 +33,10 @@ from six.moves import range import keras.backend as K from keras.datasets import mnist -from keras.layers import Input, Dense, Reshape, Flatten, Embedding, merge, Dropout +from keras import layers +from keras.layers import Input, Dense, Reshape, Flatten, Embedding, Dropout from keras.layers.advanced_activations import LeakyReLU -from keras.layers.convolutional import UpSampling2D, Convolution2D +from keras.layers.convolutional import UpSampling2D, Conv2D from keras.models import Sequential, Model from keras.optimizers import Adam from keras.utils.generic_utils import Progbar @@ -57,17 +58,20 @@ def build_generator(latent_size): # upsample to (..., 14, 14) cnn.add(UpSampling2D(size=(2, 2))) - cnn.add(Convolution2D(256, 5, padding='same', - activation='relu', kernel_initializer='glorot_normal')) + cnn.add(Conv2D(256, 5, padding='same', + activation='relu', + kernel_initializer='glorot_normal')) # upsample to (..., 28, 28) cnn.add(UpSampling2D(size=(2, 2))) - cnn.add(Convolution2D(128, 5, padding='same', - activation='relu', kernel_initializer='glorot_normal')) + cnn.add(Conv2D(128, 5, padding='same', + activation='relu', + kernel_initializer='glorot_normal')) # take a channel axis reduction - cnn.add(Convolution2D(1, 2, padding='same', - activation='tanh', kernel_initializer='glorot_normal')) + cnn.add(Conv2D(1, 2, padding='same', + activation='tanh', + kernel_initializer='glorot_normal')) # this is the z space commonly refered to in GAN papers latent = Input(shape=(latent_size, )) @@ -80,7 +84,7 @@ def build_generator(latent_size): embeddings_initializer='glorot_normal')(image_class)) # hadamard product between z-space and a class conditional embedding - h = merge([latent, cls], mode='mul') + h = layers.multiply([latent, cls]) fake_image = cnn(h) @@ -92,20 +96,20 @@ def build_discriminator(): # the reference paper cnn = Sequential() - cnn.add(Convolution2D(32, 3, padding='same', strides=2, - input_shape=(1, 28, 28))) + cnn.add(Conv2D(32, 3, padding='same', strides=2, + input_shape=(1, 28, 28))) cnn.add(LeakyReLU()) cnn.add(Dropout(0.3)) - cnn.add(Convolution2D(64, 3, padding='same', strides=2)) + cnn.add(Conv2D(64, 3, padding='same', strides=2)) cnn.add(LeakyReLU()) cnn.add(Dropout(0.3)) - cnn.add(Convolution2D(128, 3, padding='same', strides=2)) + cnn.add(Conv2D(128, 3, padding='same', strides=2)) cnn.add(LeakyReLU()) cnn.add(Dropout(0.3)) - cnn.add(Convolution2D(256, 3, padding='same', strides=1)) + cnn.add(Conv2D(256, 3, padding='same', strides=1)) cnn.add(LeakyReLU()) cnn.add(Dropout(0.3)) @@ -224,7 +228,8 @@ if __name__ == '__main__': trick = np.ones(2 * batch_size) epoch_gen_loss.append(combined.train_on_batch( - [noise, sampled_labels.reshape((-1, 1))], [trick, sampled_labels])) + [noise, sampled_labels.reshape((-1, 1))], + [trick, sampled_labels])) print('\nTesting for epoch {}:'.format(epoch + 1)) diff --git a/examples/mnist_hierarchical_rnn.py b/examples/mnist_hierarchical_rnn.py index c543b8b4a..da279e08b 100644 --- a/examples/mnist_hierarchical_rnn.py +++ b/examples/mnist_hierarchical_rnn.py @@ -56,8 +56,8 @@ print(x_train.shape[0], 'train samples') print(x_test.shape[0], 'test samples') # Converts class vectors to binary class matrices. 
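# (For example, with num_classes=10 the label 3 becomes the one-hot row
# [0, 0, 0, 1, 0, 0, 0, 0, 0, 0].)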
-Y_train = keras.utils.to_categorical(y_train, num_classes) -Y_test = keras.utils.to_categorical(y_test, num_classes) +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) row, col, pixel = x_train.shape[1:] @@ -78,10 +78,11 @@ model.compile(loss='categorical_crossentropy', metrics=['accuracy']) # Training. -model.fit(x_train, Y_train, batch_size=batch_size, epochs=epochs, - verbose=1, validation_data=(x_test, Y_test)) +model.fit(x_train, y_train, + batch_size=batch_size, epochs=epochs, + verbose=1, validation_data=(x_test, y_test)) # Evaluation. -scores = model.evaluate(x_test, Y_test, verbose=0) +scores = model.evaluate(x_test, y_test, verbose=0) print('Test loss:', scores[0]) print('Test accuracy:', scores[1]) diff --git a/examples/mnist_siamese_graph.py b/examples/mnist_siamese_graph.py index 2affbb0b9..bb831bf4e 100644 --- a/examples/mnist_siamese_graph.py +++ b/examples/mnist_siamese_graph.py @@ -13,7 +13,6 @@ Gets to 99.5% test accuracy after 20 epochs. from __future__ import absolute_import from __future__ import print_function import numpy as np -np.random.seed(1337) # for reproducibility import random from keras.datasets import mnist @@ -38,7 +37,8 @@ def contrastive_loss(y_true, y_pred): http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf ''' margin = 1 - return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))) + return K.mean(y_true * K.square(y_pred) + + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))) def create_pairs(x, digit_indices): @@ -108,7 +108,8 @@ input_b = Input(shape=(input_dim,)) processed_a = base_network(input_a) processed_b = base_network(input_b) -distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b]) +distance = Lambda(euclidean_distance, + output_shape=eucl_dist_output_shape)([processed_a, processed_b]) model = Model([input_a, input_b], distance) diff --git a/examples/mnist_sklearn_wrapper.py b/examples/mnist_sklearn_wrapper.py index 926547e2d..567f53a6d 100644 --- a/examples/mnist_sklearn_wrapper.py +++ b/examples/mnist_sklearn_wrapper.py @@ -9,7 +9,7 @@ import keras from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Dropout, Activation, Flatten -from keras.layers import Convolution2D, MaxPooling2D +from keras.layers import Conv2D, MaxPooling2D from keras.wrappers.scikit_learn import KerasClassifier from keras import backend as K from sklearn.grid_search import GridSearchCV @@ -53,11 +53,11 @@ def make_model(dense_layer_sizes, filters, kernel_size, pool_size): ''' model = Sequential() - model.add(Convolution2D(filters, kernel_size, - padding='valid', - input_shape=input_shape)) + model.add(Conv2D(filters, kernel_size, + padding='valid', + input_shape=input_shape)) model.add(Activation('relu')) - model.add(Convolution2D(filters, kernel_size)) + model.add(Conv2D(filters, kernel_size)) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=pool_size)) model.add(Dropout(0.25)) diff --git a/examples/mnist_swwae.py b/examples/mnist_swwae.py index 505936928..e29dd711e 100644 --- a/examples/mnist_swwae.py +++ b/examples/mnist_swwae.py @@ -2,21 +2,21 @@ MNIST dataset. It exemplifies two influential methods that have been developed in the past few years. -The first is the idea of properly "unpooling." 
During any max pool, the
-exact location (the "where") of the maximal value in a pooled receptive field
+The first is the idea of properly 'unpooling.' During any max pool, the
+exact location (the 'where') of the maximal value in a pooled receptive field
 is lost, however it can be very useful in the overall reconstruction of an
-input image. Therefore, if the "where" is handed from the encoder
-to the corresponding decoder layer, features being decoded can be "placed" in
+input image. Therefore, if the 'where' is handed from the encoder
+to the corresponding decoder layer, features being decoded can be 'placed' in
 the right location, allowing for reconstructions of much higher fidelity.
 
 References:
 [1]
-"Visualizing and Understanding Convolutional Networks"
+'Visualizing and Understanding Convolutional Networks'
 Matthew D Zeiler, Rob Fergus
 https://arxiv.org/abs/1311.2901v3
 
 [2]
-"Stacked What-Where Auto-encoders"
+'Stacked What-Where Auto-encoders'
 Junbo Zhao, Michael Mathieu, Ross Goroshin, Yann LeCun
 https://arxiv.org/abs/1506.02351v8
 
@@ -34,42 +34,42 @@ applied as a bias because we know the MNIST digits are mapped to [0,1].
 
 References:
 [3]
-"Deep Residual Learning for Image Recognition"
+'Deep Residual Learning for Image Recognition'
 Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
 https://arxiv.org/abs/1512.03385v1
 
 [4]
-"Identity Mappings in Deep Residual Networks"
+'Identity Mappings in Deep Residual Networks'
 Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
 https://arxiv.org/abs/1603.05027v3
 '''
 from __future__ import print_function
 import numpy as np
-np.random.seed(1337)  # for reproducibility
 
 from keras.datasets import mnist
 from keras.models import Model
-from keras.layers import Activation, merge
-from keras.layers import UpSampling2D, Convolution2D, MaxPooling2D
+from keras.layers import Activation
+from keras.layers import UpSampling2D, Conv2D, MaxPooling2D
 from keras.layers import Input, BatchNormalization
 import matplotlib.pyplot as plt
 import keras.backend as K
+from keras import layers
 
 
 def convresblock(x, nfeats=8, ksize=3, nskipped=2):
     ''' The proposed residual block from [4]'''
-    y0 = Convolution2D(nfeats, ksize, ksize, border_mode='same')(x)
+    y0 = Conv2D(nfeats, ksize, padding='same')(x)
     y = y0
     for i in range(nskipped):
-        y = BatchNormalization(mode=0, axis=1)(y)
+        y = BatchNormalization(axis=1)(y)
         y = Activation('relu')(y)
-        y = Convolution2D(nfeats, ksize, ksize, border_mode='same')(y)
-    return merge([y0, y], mode='sum')
+        y = Conv2D(nfeats, ksize, padding='same')(y)
+    return layers.add([y0, y])
 
 
 def getwhere(x):
-    ''' Calculate the "where" mask that contains switches indicating which
+    ''' Calculate the 'where' mask that contains switches indicating which
     index contained the max value when MaxPool2D was applied. 
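     (A 1-D sketch: max-pooling y_prepool = [3., 5.] down to y_postpool = [5.]
     yields the mask [0., 1.], a one at the position that held the max.)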
Using the
     gradient of the sum is a nice trick to keep everything high level.'''
     y_prepool, y_postpool = x
@@ -89,17 +89,17 @@ K.set_image_data_format('channels_first')
 img_rows, img_cols = 28, 28
 
 # the data, shuffled and split between train and test sets
-(X_train, _), (X_test, _) = mnist.load_data()
+(x_train, _), (x_test, _) = mnist.load_data()
 
-X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
-X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
-X_train = X_train.astype('float32')
-X_test = X_test.astype('float32')
-X_train /= 255
-X_test /= 255
-print('X_train shape:', X_train.shape)
-print(X_train.shape[0], 'train samples')
-print(X_test.shape[0], 'test samples')
+x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
 
 # The size of the kernel used for the MaxPooling2D
 pool_size = 2
@@ -116,41 +116,41 @@ batch_size = 128
 
 if pool_size == 2:
     # if using a 5 layer net of pool_size = 2
-    X_train = np.pad(X_train, [[0, 0], [0, 0], [2, 2], [2, 2]],
+    x_train = np.pad(x_train, [[0, 0], [0, 0], [2, 2], [2, 2]],
                      mode='constant')
-    X_test = np.pad(X_test, [[0, 0], [0, 0], [2, 2], [2, 2]], mode='constant')
+    x_test = np.pad(x_test, [[0, 0], [0, 0], [2, 2], [2, 2]], mode='constant')
     nlayers = 5
 elif pool_size == 3:
     # if using a 3 layer net of pool_size = 3
-    X_train = X_train[:, :, :-1, :-1]
-    X_test = X_test[:, :, :-1, :-1]
+    x_train = x_train[:, :, :-1, :-1]
+    x_test = x_test[:, :, :-1, :-1]
     nlayers = 3
 else:
     import sys
-    sys.exit("Script supports pool_size of 2 and 3.")
+    sys.exit('Script supports pool_size of 2 and 3.')
 
 # Shape of input to train on (note that model is fully convolutional however)
-input_shape = X_train.shape[1:]
+input_shape = x_train.shape[1:]
 
 # The final list of the size of axis=1 for all layers, including input
 nfeats_all = [input_shape[0]] + nfeats
 
-# First build the encoder, all the while keeping track of the "where" masks
+# First build the encoder, all the while keeping track of the 'where' masks
 img_input = Input(shape=input_shape)
 
-# We push the "where" masks to the following list
+# We push the 'where' masks to the following list
 wheres = [None] * nlayers
 
 y = img_input
 for i in range(nlayers):
     y_prepool = convresblock(y, nfeats=nfeats_all[i + 1], ksize=ksize)
     y = MaxPooling2D(pool_size=(pool_sizes[i], pool_sizes[i]))(y_prepool)
-    wheres[i] = merge([y_prepool, y], mode=getwhere,
-                      output_shape=lambda x: x[0])
+    wheres[i] = layers.Lambda(getwhere,
+                              output_shape=lambda x: x[0])([y_prepool, y])
 
-# Now build the decoder, and use the stored "where" masks to place the features
+# Now build the decoder, and use the stored 'where' masks to place the features
 for i in range(nlayers):
     ind = nlayers - 1 - i
     y = UpSampling2D(size=(pool_sizes[ind], pool_sizes[ind]))(y)
-    y = merge([y, wheres[ind]], mode='mul')
+    y = layers.multiply([y, wheres[ind]])
     y = convresblock(y, nfeats=nfeats_all[ind], ksize=ksize)
 
 # Use hard_simgoid to clip range of reconstruction
@@ -161,16 +161,16 @@ model = Model(img_input, y)
 model.compile('adam', 'mse')
 
 # Fit the model
-model.fit(X_train, X_train, validation_data=(X_test, X_test),
+model.fit(x_train, x_train, validation_data=(x_test, x_test),
          batch_size=batch_size, epochs=epochs)
 
 # Plot
-X_recon = model.predict(X_test[:25]) 
-X_plot = np.concatenate((X_test[:25], X_recon), axis=1)
-X_plot = X_plot.reshape((5, 10, input_shape[-2], input_shape[-1]))
-X_plot = np.vstack([np.hstack(x) for x in X_plot])
+x_recon = model.predict(x_test[:25])
+x_plot = np.concatenate((x_test[:25], x_recon), axis=1)
+x_plot = x_plot.reshape((5, 10, input_shape[-2], input_shape[-1]))
+x_plot = np.vstack([np.hstack(x) for x in x_plot])
 plt.figure()
 plt.axis('off')
 plt.title('Test Samples: Originals/Reconstructions')
-plt.imshow(X_plot, interpolation='none', cmap='gray')
+plt.imshow(x_plot, interpolation='none', cmap='gray')
 plt.savefig('reconstructions.png')
diff --git a/examples/neural_doodle.py b/examples/neural_doodle.py
index 300db1fa8..c4133d8fe 100644
--- a/examples/neural_doodle.py
+++ b/examples/neural_doodle.py
@@ -122,7 +122,7 @@ def kmeans(xs, k):
     assert xs.ndim == 2
     try:
         from sklearn.cluster import k_means
-        _, labels, _ = k_means(xs.astype("float64"), k)
+        _, labels, _ = k_means(xs.astype('float64'), k)
     except ImportError:
         from scipy.cluster.vq import kmeans2
         _, labels = kmeans2(xs, k, missing='raise')
@@ -179,8 +179,8 @@ images = K.concatenate([style_image, target_image, content_image], axis=0)
 
 # Create tensor variables for masks
 raw_style_mask, raw_target_mask = load_mask_labels()
-style_mask = K.variable(raw_style_mask.astype("float32"))
-target_mask = K.variable(raw_target_mask.astype("float32"))
+style_mask = K.variable(raw_style_mask.astype('float32'))
+target_mask = K.variable(raw_target_mask.astype('float32'))
 masks = K.concatenate([style_mask, target_mask], axis=0)
 
 # index constants for images and tasks variables
@@ -191,13 +191,13 @@ STYLE, TARGET, CONTENT = 0, 1, 2
 image_model = vgg19.VGG19(include_top=False, input_tensor=images)
 
 # mask model as a series of pooling
-mask_input = Input(tensor=masks, shape=(None, None, None), name="mask_input")
+mask_input = Input(tensor=masks, shape=(None, None, None), name='mask_input')
 x = mask_input
 for layer in image_model.layers[1:]:
     name = 'mask_%s' % layer.name
     if 'conv' in layer.name:
         x = AveragePooling2D((3, 3), strides=(
-            1, 1), name=name, border_mode="same")(x)
+            1, 1), name=name, padding='same')(x)
     elif 'pool' in layer.name:
         x = AveragePooling2D((2, 2), name=name)(x)
 mask_model = Model(mask_input, x)
diff --git a/examples/stateful_lstm.py b/examples/stateful_lstm.py
index 268017901..6a6a7ac62 100644
--- a/examples/stateful_lstm.py
+++ b/examples/stateful_lstm.py
@@ -36,7 +36,7 @@ def gen_cosine_amp(amp=100, period=1000, x0=0, xn=50000, step=1, k=0.0001):
     return cos
 
 
-print('Generating Data')
+print('Generating Data...')
 cos = gen_cosine_amp()
 print('Input shape:', cos.shape)
 
@@ -44,13 +44,13 @@ expected_output = np.zeros((len(cos), 1))
 for i in range(len(cos) - lahead):
     expected_output[i, 0] = np.mean(cos[i + 1:i + lahead + 1])
 
-print('Output shape')
-print(expected_output.shape)
+print('Output shape:', expected_output.shape)
 
-print('Creating Model')
+print('Creating Model...')
 model = Sequential()
 model.add(LSTM(50,
-               batch_input_shape=(batch_size, tsteps, 1),
+               input_shape=(tsteps, 1),
+               batch_size=batch_size,
                return_sequences=True,
                stateful=True))
 model.add(LSTM(50,
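Note on the stateful_lstm.py change above: the example now passes
`input_shape` plus an explicit `batch_size` instead of the single
`batch_input_shape` tuple. A minimal self-contained sketch of the stateful
pattern this relies on (toy data and layer sizes are illustrative; the
example's own training loop is not shown in this hunk):

    import numpy as np
    from keras.layers import Dense, LSTM
    from keras.models import Sequential

    batch_size, tsteps, epochs = 25, 1, 3
    # Toy sequence-regression data, sized to a whole number of batches.
    x = np.random.random((batch_size * 40, tsteps, 1))
    y = np.random.random((batch_size * 40, 1))

    model = Sequential()
    model.add(LSTM(50,
                   input_shape=(tsteps, 1),
                   batch_size=batch_size,
                   stateful=True))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='rmsprop')

    for _ in range(epochs):
        # Keep samples in order so state carries across successive batches,
        # then clear the state between passes over the data.
        model.fit(x, y, batch_size=batch_size, epochs=1, shuffle=False)
        model.reset_states()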