Add mask support to new recurrent layers

fchollet 2015-06-27 13:01:27 -07:00
parent 9c8e0d43f3
commit 5b6f56a040
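
The diff below threads a per-timestep mask through the three JZS layers: each _step now receives mask_tm1 (the mask of the previous timestep), and h_mask_tm1 = mask_tm1 * h_tm1 zeroes the hidden state flowing out of padded timesteps before it reaches the gates. A minimal NumPy sketch of the masked step, for orientation only (not the committed Theano code; sigmoid and np.tanh stand in for inner_activation and activation):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# JZS1-style step with the new mask argument. Shapes: xz_t/xr_t/xh_t/h_tm1 are
# (samples, output_dim), mask_tm1 is (samples, 1), u_r/u_h are (output_dim, output_dim).
def jzs1_step(xz_t, xr_t, xh_t, mask_tm1, h_tm1, u_r, u_h):
    h_mask_tm1 = mask_tm1 * h_tm1          # previous state is zeroed where the previous step was padding
    z = sigmoid(xz_t)
    r = sigmoid(xr_t + h_mask_tm1 @ u_r)
    hh_t = np.tanh(xh_t + (r * h_mask_tm1) @ u_h)
    return hh_t * z + h_mask_tm1 * (1 - z)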

@@ -416,7 +416,7 @@ class LSTM(Recurrent):
-class JZS1(Layer):
+class JZS1(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands
         of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -484,23 +484,24 @@ class JZS1(Layer):
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_r, u_h):
+        h_mask_tm1 = mask_tm1 * h_tm1
         z = self.inner_activation(xz_t)
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
     def get_output(self, train):
         X = self.get_input(train)
-        X = X.dimshuffle((1, 0, 2))
+        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
+        X = X.dimshuffle((1, 0, 2))
         x_z = T.dot(X, self.W_z) + self.b_z
         x_r = T.dot(X, self.W_r) + self.b_r
         x_h = T.tanh(T.dot(X, self.Pmat)) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
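
get_padded_shuffled_mask itself is not part of this diff; judging by its name and the pad=1 argument, it presumably reshapes the layer's (samples, timesteps) binary input mask to the (time, samples, 1) layout produced by X.dimshuffle((1, 0, 2)) and prepends one step of zeros, so that the mask slice handed to _step at time t is the mask of step t-1. A hypothetical NumPy approximation under those assumptions (the name padded_shuffled_mask and its exact behaviour here are illustrative, not the library's actual implementation):

import numpy as np

def padded_shuffled_mask(mask, pad=1):
    # mask: (samples, timesteps), assumed 1 over real data and 0 over padded positions
    mask = mask[:, :, None]                        # (samples, time, 1), broadcastable over features
    mask = np.transpose(mask, (1, 0, 2))           # (time, samples, 1), matching the shuffled inputs
    if pad > 0:
        padding = np.zeros((pad, mask.shape[1], 1), dtype=mask.dtype)
        mask = np.concatenate([padding, mask])     # prepend pad steps of zeros...
        mask = mask[:-pad]                         # ...so slice t lines up with the mask of step t-1
    return mask

In the Theano version the extra leading step presumably just shifts the sequence that scan iterates over, which is what makes mask_tm1 the mask of the previous timestep rather than the current one.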
@@ -522,7 +523,7 @@ class JZS1(Layer):
-class JZS2(Layer):
+class JZS2(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands
         of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -588,17 +589,19 @@ class JZS2(Layer):
             self.set_weights(weights)
     def _step(self,
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_z, u_r, u_h):
-        z = self.inner_activation(xz_t + T.dot(h_tm1, u_z))
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        h_mask_tm1 = mask_tm1 * h_tm1
+        z = self.inner_activation(xz_t + T.dot(h_mask_tm1, u_z))
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
     def get_output(self, train):
         X = self.get_input(train)
+        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
         X = X.dimshuffle((1, 0, 2))
         x_z = T.dot(X, self.W_z) + self.b_z
@@ -606,7 +609,7 @@ class JZS2(Layer):
         x_h = T.dot(X, self.W_h) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_z, self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
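
Mechanically, theano.scan walks the leading (time) axis of every tensor in sequences in lockstep, feeding each step's output back in as h_tm1 for the next step; outputs_info provides the all-zeros initial state and non_sequences are passed through unchanged. A rough pure-Python/NumPy equivalent of that loop, for illustration only (truncate_gradient has no analogue here):

import numpy as np

def scan_like(step, x_z, x_r, x_h, padded_mask, h0, non_sequences):
    # x_z/x_r/x_h: (time, samples, output_dim); padded_mask: (time, samples, 1)
    h_tm1 = h0                               # zeros, like alloc_zeros_matrix(samples, output_dim)
    outputs = []
    for t in range(x_z.shape[0]):            # time leads after X.dimshuffle((1, 0, 2))
        h_tm1 = step(x_z[t], x_r[t], x_h[t], padded_mask[t], h_tm1, *non_sequences)
        outputs.append(h_tm1)
    return np.stack(outputs)                 # (time, samples, output_dim)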
@@ -628,7 +631,7 @@ class JZS2(Layer):
-class JZS3(Layer):
+class JZS3(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands
         of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -687,17 +690,19 @@ class JZS3(Layer):
             self.set_weights(weights)
     def _step(self,
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_z, u_r, u_h):
-        z = self.inner_activation(xz_t + T.dot(T.tanh(h_tm1), u_z))
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        h_mask_tm1 = mask_tm1 * h_tm1
+        z = self.inner_activation(xz_t + T.dot(T.tanh(h_mask_tm1), u_z))
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
     def get_output(self, train):
         X = self.get_input(train)
+        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
         X = X.dimshuffle((1, 0, 2))
         x_z = T.dot(X, self.W_z) + self.b_z
@@ -705,7 +710,7 @@ class JZS3(Layer):
         x_h = T.dot(X, self.W_h) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_z, self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
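
A quick end-to-end sanity sketch (NumPy, illustrative only, not part of the commit) of what the pad=1 masking buys: because the state leaving a padded timestep is zeroed before it enters the next step, a left-padded sequence ends in the same hidden state as the unpadded one.

import numpy as np

rng = np.random.default_rng(0)
dim = 4
u_r, u_h = rng.normal(size=(dim, dim)), rng.normal(size=(dim, dim))
sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))

def step(x_t, mask_tm1, h_tm1):
    h_mask_tm1 = mask_tm1 * h_tm1            # zero the state carried out of padded steps
    z = sigmoid(x_t)
    r = sigmoid(x_t + h_mask_tm1 @ u_r)
    hh = np.tanh(x_t + (r * h_mask_tm1) @ u_h)
    return hh * z + h_mask_tm1 * (1 - z)

def run(x, mask):                             # x: (time, 1, dim); mask: 1 for data, 0 for padding
    shifted = np.concatenate([np.zeros((1, 1, 1)), mask])[:-1]   # the pad=1 shift: step t sees mask of t-1
    h = np.zeros((1, dim))
    for t in range(x.shape[0]):
        h = step(x[t], shifted[t], h)
    return h

x = rng.normal(size=(3, 1, dim))                              # three real timesteps
x_pad = np.concatenate([np.zeros((2, 1, dim)), x])            # two steps of left padding
m = np.ones((3, 1, 1))
m_pad = np.concatenate([np.zeros((2, 1, 1)), m])
print(np.allclose(run(x, m), run(x_pad, m_pad)))              # True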