diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py
index 436b32238..478861cd6 100644
--- a/keras/layers/recurrent.py
+++ b/keras/layers/recurrent.py
@@ -416,7 +416,7 @@ class LSTM(Recurrent):
 
 
-class JZS1(Layer):
+class JZS1(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands of
         models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -484,23 +484,24 @@ class JZS1(Layer):
               xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_r, u_h):
+        h_mask_tm1 = mask_tm1 * h_tm1
         z = self.inner_activation(xz_t)
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
 
     def get_output(self, train):
         X = self.get_input(train)
-        X = X.dimshuffle((1, 0, 2))
         padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
+        X = X.dimshuffle((1, 0, 2))
 
         x_z = T.dot(X, self.W_z) + self.b_z
         x_r = T.dot(X, self.W_r) + self.b_r
         x_h = T.tanh(T.dot(X, self.Pmat)) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h, padded_mask],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
@@ -522,7 +523,7 @@ class JZS1(Layer):
 
 
-class JZS2(Layer):
+class JZS2(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands of
         models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -588,17 +589,19 @@ class JZS2(Layer):
         self.set_weights(weights)
 
     def _step(self,
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_z, u_r, u_h):
-        z = self.inner_activation(xz_t + T.dot(h_tm1, u_z))
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        h_mask_tm1 = mask_tm1 * h_tm1
+        z = self.inner_activation(xz_t + T.dot(h_mask_tm1, u_z))
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
 
     def get_output(self, train):
-        X = self.get_input(train)
+        X = self.get_input(train)
+        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
         X = X.dimshuffle((1, 0, 2))
 
         x_z = T.dot(X, self.W_z) + self.b_z
@@ -606,7 +609,7 @@ class JZS2(Layer):
         x_h = T.dot(X, self.W_h) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_z, self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
@@ -628,7 +631,7 @@ class JZS2(Layer):
 
 
-class JZS3(Layer):
+class JZS3(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands of
        models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -687,17 +690,19 @@ class JZS3(Layer):
         self.set_weights(weights)
 
     def _step(self,
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_z, u_r, u_h):
-        z = self.inner_activation(xz_t + T.dot(T.tanh(h_tm1), u_z))
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        h_mask_tm1 = mask_tm1 * h_tm1
+        z = self.inner_activation(xz_t + T.dot(T.tanh(h_mask_tm1), u_z))
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
 
     def get_output(self, train):
-        X = self.get_input(train)
+        X = self.get_input(train)
+        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
         X = X.dimshuffle((1, 0, 2))
 
         x_z = T.dot(X, self.W_z) + self.b_z
@@ -705,7 +710,7 @@ class JZS3(Layer):
         x_h = T.dot(X, self.W_h) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_z, self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
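Note on the change: JZS1 previously accepted mask_tm1 in _step but never applied it, while JZS2 and JZS3 did not receive the mask at all. All three now subclass Recurrent and apply the mask uniformly. The mask is built from the original (nb_samples, timesteps, input_dim) input before the X.dimshuffle((1, 0, 2)) transpose (get_padded_shuffled_mask handles the time-major shuffle itself), and pad=1 offsets it by one step so that _step sees the previous timestep's mask (mask_tm1) alongside h_tm1; h_mask_tm1 = mask_tm1 * h_tm1 then zeroes the carried state wherever the previous step was padding.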
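For reference, a minimal NumPy sketch of the masked update the new _step functions compute, shown for the JZS2 variant with sigmoid/tanh activations; jzs2_step and sigmoid are illustrative stand-ins, not part of the Keras API:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def jzs2_step(xz_t, xr_t, xh_t, mask_tm1, h_tm1, u_z, u_r, u_h):
    # Zero the carried state wherever the previous timestep was padding,
    # mirroring `h_mask_tm1 = mask_tm1 * h_tm1` in the diff above.
    h_mask_tm1 = mask_tm1 * h_tm1
    z = sigmoid(xz_t + h_mask_tm1 @ u_z)           # update gate
    r = sigmoid(xr_t + h_mask_tm1 @ u_r)           # reset gate
    hh_t = np.tanh(xh_t + (r * h_mask_tm1) @ u_h)  # candidate state
    # With mask_tm1 == 0 this reduces to tanh(xh_t) * sigmoid(xz_t):
    # the sequence effectively (re)starts at this timestep.
    return hh_t * z + h_mask_tm1 * (1 - z)

# Toy check: sample 2's previous step was padding, so its state is rebuilt
# from the current input alone and nothing leaks in from the padded prefix.
dim = 3
rng = np.random.RandomState(0)
u_z, u_r, u_h = rng.randn(3, dim, dim)
x = rng.randn(2, dim)
h_prev = rng.randn(2, dim)
mask = np.array([[1.0], [0.0]])  # shape (batch, 1), broadcasts over dim
h_t = jzs2_step(x, x, x, mask, h_prev, u_z, u_r, u_h)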