diff --git a/examples/lstm_benchmark.py b/examples/lstm_benchmark.py
index eb462c571..91e5fc0eb 100644
--- a/examples/lstm_benchmark.py
+++ b/examples/lstm_benchmark.py
@@ -10,8 +10,10 @@ a little longer, but should require less peak memory.
 consume_less='gpu' concatenates the input, output and forget gate's weights
 into one, large matrix, resulting in faster computation time as the GPU can
 utilize more cores, at the expense of reduced regularization because the same
-dropout is shared across the gates. It should require similar memory usage as
-consume_less='mem'.
+dropout is shared across the gates.
+
+Note that the relative performance of the different `consume_less` modes
+can vary depending on your device, your model and the size of your data.
 '''

 import time
@@ -20,13 +22,13 @@ import matplotlib.pyplot as plt

 from keras.preprocessing import sequence
 from keras.models import Sequential
-from keras.layers import Embedding, BatchNormalization, Dense, LSTM
+from keras.layers import Embedding, Dense, LSTM
 from keras.datasets import imdb

 max_features = 20000
 max_length = 80
-embedding = 400
-batch_size = 256
+embedding_dim = 256
+batch_size = 128
 epochs = 10
 modes = ['cpu', 'mem', 'gpu']

@@ -38,32 +40,36 @@ X_test = sequence.pad_sequences(X_test, max_length)
 # Compile and train different models while meauring performance.
 results = []
 for mode in modes:
-    print("Testing mode: consume_less='{}'".format(mode))
+    print('Testing mode: consume_less="{}"'.format(mode))

     model = Sequential()
-    model.add(Embedding(max_features, embedding, input_length=max_length, dropout=0.2))
-    model.add(BatchNormalization())
-    model.add(LSTM(embedding, dropout_W=0.2, dropout_U=0.2, consume_less=mode))
+    model.add(Embedding(max_features, embedding_dim, input_length=max_length, dropout=0.2))
+    model.add(LSTM(embedding_dim, dropout_W=0.2, dropout_U=0.2, consume_less=mode))
     model.add(Dense(1, activation='sigmoid'))
-    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam',
+                  metrics=['accuracy'])

     start_time = time.time()
-    history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=epochs, validation_data=(X_test, y_test))
+    history = model.fit(X_train, y_train,
+                        batch_size=batch_size,
+                        nb_epoch=epochs,
+                        validation_data=(X_test, y_test))
     average_time_per_epoch = (time.time() - start_time) / epochs

     results.append((history, average_time_per_epoch))

 # Compare models' accuracy, loss and elapsed time per epoch.
 plt.style.use('ggplot')
-ax1 = plt.subplot2grid((2,2), (0,0))
+ax1 = plt.subplot2grid((2, 2), (0, 0))
 ax1.set_title('Accuracy')
 ax1.set_ylabel('Validation Accuracy')
 ax1.set_xlabel('Epochs')
-ax2 = plt.subplot2grid((2,2), (1,0))
+ax2 = plt.subplot2grid((2, 2), (1, 0))
 ax2.set_title('Loss')
 ax2.set_ylabel('Validation Loss')
 ax2.set_xlabel('Epochs')
-ax3 = plt.subplot2grid((2,2), (0,1), rowspan=2)
+ax3 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)
 ax3.set_title('Time')
 ax3.set_ylabel('Seconds')
 for mode, result in zip(modes, results):
@@ -71,6 +77,7 @@ for mode, result in zip(modes, results):
     ax2.plot(result[0].epoch, result[0].history['val_loss'], label=mode)
 ax1.legend()
 ax2.legend()
-ax3.bar(np.arange(len(results)), [x[1] for x in results], tick_label=modes, align='center')
+ax3.bar(np.arange(len(results)), [x[1] for x in results],
+        tick_label=modes, align='center')
 plt.tight_layout()
 plt.show()
diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py
index d624fdf35..38411f527 100644
--- a/keras/layers/recurrent.py
+++ b/keras/layers/recurrent.py
@@ -81,7 +81,9 @@ class Recurrent(Layer):
             is always unrolled, so this argument does not do anything.
             Unrolling can speed-up a RNN, although it tends to be more memory-intensive.
             Unrolling is only suitable for short sequences.
-        consume_less: one of "cpu", "mem", or "gpu" (LSTM only).
+        consume_less: one of "cpu", "mem", or "gpu".
+            Note that "gpu" mode is only available for LSTM.
+
             If set to "cpu", the RNN will use
             an implementation that uses fewer, larger matrix products,
             thus running faster on CPU but consuming more memory.
@@ -391,15 +393,15 @@
         return constants

     def get_config(self):
-        config = {"output_dim": self.output_dim,
-                  "init": self.init.__name__,
-                  "inner_init": self.inner_init.__name__,
-                  "activation": self.activation.__name__,
-                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                  "U_regularizer": self.U_regularizer.get_config() if self.U_regularizer else None,
-                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                  "dropout_W": self.dropout_W,
-                  "dropout_U": self.dropout_U}
+        config = {'output_dim': self.output_dim,
+                  'init': self.init.__name__,
+                  'inner_init': self.inner_init.__name__,
+                  'activation': self.activation.__name__,
+                  'W_regularizer': self.W_regularizer.get_config() if self.W_regularizer else None,
+                  'U_regularizer': self.U_regularizer.get_config() if self.U_regularizer else None,
+                  'b_regularizer': self.b_regularizer.get_config() if self.b_regularizer else None,
+                  'dropout_W': self.dropout_W,
+                  'dropout_U': self.dropout_U}
         base_config = super(SimpleRNN, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))

@@ -574,16 +576,16 @@
         return constants

     def get_config(self):
-        config = {"output_dim": self.output_dim,
-                  "init": self.init.__name__,
-                  "inner_init": self.inner_init.__name__,
-                  "activation": self.activation.__name__,
-                  "inner_activation": self.inner_activation.__name__,
-                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                  "U_regularizer": self.U_regularizer.get_config() if self.U_regularizer else None,
-                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                  "dropout_W": self.dropout_W,
-                  "dropout_U": self.dropout_U}
+        config = {'output_dim': self.output_dim,
+                  'init': self.init.__name__,
+                  'inner_init': self.inner_init.__name__,
+                  'activation': self.activation.__name__,
+                  'inner_activation': self.inner_activation.__name__,
+                  'W_regularizer': self.W_regularizer.get_config() if self.W_regularizer else None,
+                  'U_regularizer': self.U_regularizer.get_config() if self.U_regularizer else None,
+                  'b_regularizer': self.b_regularizer.get_config() if self.b_regularizer else None,
+                  'dropout_W': self.dropout_W,
+                  'dropout_U': self.dropout_U}
         base_config = super(GRU, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))

@@ -654,9 +656,9 @@
             self.states = [None, None]

         if self.consume_less == 'gpu':
-            self.W = self.init((self.input_dim, 4*self.output_dim),
+            self.W = self.init((self.input_dim, 4 * self.output_dim),
                                name='{}_W'.format(self.name))
-            self.U = self.inner_init((self.output_dim, 4*self.output_dim),
+            self.U = self.inner_init((self.output_dim, 4 * self.output_dim),
                                      name='{}_U'.format(self.name))

             self.b = K.variable(np.hstack((np.zeros(self.output_dim),
@@ -814,16 +816,16 @@
         return constants

     def get_config(self):
-        config = {"output_dim": self.output_dim,
-                  "init": self.init.__name__,
-                  "inner_init": self.inner_init.__name__,
-                  "forget_bias_init": self.forget_bias_init.__name__,
-                  "activation": self.activation.__name__,
-                  "inner_activation": self.inner_activation.__name__,
-                  "W_regularizer": self.W_regularizer.get_config() if self.W_regularizer else None,
-                  "U_regularizer": self.U_regularizer.get_config() if self.U_regularizer else None,
-                  "b_regularizer": self.b_regularizer.get_config() if self.b_regularizer else None,
-                  "dropout_W": self.dropout_W,
-                  "dropout_U": self.dropout_U}
+        config = {'output_dim': self.output_dim,
+                  'init': self.init.__name__,
+                  'inner_init': self.inner_init.__name__,
+                  'forget_bias_init': self.forget_bias_init.__name__,
+                  'activation': self.activation.__name__,
+                  'inner_activation': self.inner_activation.__name__,
+                  'W_regularizer': self.W_regularizer.get_config() if self.W_regularizer else None,
+                  'U_regularizer': self.U_regularizer.get_config() if self.U_regularizer else None,
+                  'b_regularizer': self.b_regularizer.get_config() if self.b_regularizer else None,
+                  'dropout_W': self.dropout_W,
+                  'dropout_U': self.dropout_U}
         base_config = super(LSTM, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
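
Usage note (not part of the patch): a minimal sketch, using only the Keras 1.x API already shown in the benchmark above, of how a consume_less mode is selected when an LSTM layer is built. The vocabulary and layer sizes here are illustrative, not taken from the patch.

    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense

    model = Sequential()
    model.add(Embedding(20000, 256, input_length=80, dropout=0.2))
    # Swap 'gpu' for 'cpu' or 'mem' to trade speed against peak memory,
    # as described in the example's docstring.
    model.add(LSTM(256, dropout_W=0.2, dropout_U=0.2, consume_less='gpu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])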