Add mask support to new recurrent layers

fchollet 2015-06-27 13:01:27 -07:00
parent 9c8e0d43f3
commit 5b6f56a040

@@ -416,7 +416,7 @@ class LSTM(Recurrent):
-class JZS1(Layer):
+class JZS1(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands
         of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
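Note: all three JZS layers now subclass Recurrent, which supplies the mask plumbing used throughout this diff. For orientation, here is a hypothetical sketch of what the base class's get_padded_shuffled_mask helper does, inferred from its call sites below (the real implementation may differ in detail): it fetches the layer's input mask, reshuffles it to scan's (time, nb_samples, 1) layout, and with pad=1 left-pads one step of zeros so that step t of the scan receives the mask of step t-1.

    # Illustrative sketch only, not the actual Recurrent implementation.
    import theano.tensor as T

    def get_padded_shuffled_mask(self, train, X, pad=0):
        mask = self.get_input_mask(train)       # (nb_samples, time) or None
        if mask is None:
            mask = T.ones_like(X.sum(axis=-1))  # no mask: every step is valid
        mask = T.shape_padright(mask)           # (nb_samples, time, 1)
        mask = mask.dimshuffle(1, 0, 2)         # (time, nb_samples, 1) for scan
        if pad > 0:
            # shift right in time: step t will see the mask of step t - pad
            padding = T.zeros((pad, mask.shape[1], 1))
            mask = T.concatenate([padding, mask], axis=0)
        return mask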
@@ -484,23 +484,24 @@ class JZS1(Layer):
               xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_r, u_h):
+        h_mask_tm1 = mask_tm1 * h_tm1
         z = self.inner_activation(xz_t)
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
 
     def get_output(self, train):
         X = self.get_input(train)
-        X = X.dimshuffle((1, 0, 2))
         padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
+        X = X.dimshuffle((1, 0, 2))
 
         x_z = T.dot(X, self.W_z) + self.b_z
         x_r = T.dot(X, self.W_r) + self.b_r
         x_h = T.tanh(T.dot(X, self.Pmat)) + self.b_h
         outputs, updates = theano.scan(
             self._step,
             sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
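The new h_mask_tm1 line is the heart of the change: wherever the previous timestep was padding (mask 0), the previous hidden state is zeroed before it reaches the gates, so padding cannot leak state into the recurrence. A small NumPy re-enactment of the masked JZS1 step, assuming the default sigmoid/tanh activation pair:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def jzs1_step(xz_t, xr_t, xh_t, mask_tm1, h_tm1, u_r, u_h):
        h_mask_tm1 = mask_tm1 * h_tm1  # zero out state from padded steps
        z = sigmoid(xz_t)
        r = sigmoid(xr_t + h_mask_tm1.dot(u_r))
        hh_t = np.tanh(xh_t + (r * h_mask_tm1).dot(u_h))
        return hh_t * z + h_mask_tm1 * (1 - z)

    h_tm1 = np.ones((1, 3))
    u = np.eye(3)
    x = np.zeros((1, 3))
    print(jzs1_step(x, x, x, np.zeros((1, 1)), h_tm1, u, u))  # all zeros: state blocked
    print(jzs1_step(x, x, x, np.ones((1, 1)), h_tm1, u, u))   # non-zero: state enters the gates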
@@ -522,7 +523,7 @@ class JZS1(Layer):
-class JZS2(Layer):
+class JZS2(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands
         of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -588,17 +589,19 @@ class JZS2(Layer):
         self.set_weights(weights)
 
     def _step(self,
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_z, u_r, u_h):
-        z = self.inner_activation(xz_t + T.dot(h_tm1, u_z))
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        h_mask_tm1 = mask_tm1 * h_tm1
+        z = self.inner_activation(xz_t + T.dot(h_mask_tm1, u_z))
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
 
     def get_output(self, train):
         X = self.get_input(train)
+        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
         X = X.dimshuffle((1, 0, 2))
 
         x_z = T.dot(X, self.W_z) + self.b_z
@@ -606,7 +609,7 @@ class JZS2(Layer):
         x_h = T.dot(X, self.W_h) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_z, self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
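The mask travels through theano.scan as an ordinary sequence, which explains the signature change above: scan hands the step function its sequences first (in declaration order), then the recurrent state from outputs_info, then the non_sequences, so mask_tm1 must sit between xh_t and h_tm1. A minimal skeleton of the same pattern, separate from the Keras code:

    import theano
    import theano.tensor as T

    x = T.tensor3()  # (time, nb_samples, dim), as after X.dimshuffle((1, 0, 2))
    m = T.tensor3()  # (time, nb_samples, 1) padded mask
    u = T.matrix()   # recurrent weights

    def step(x_t, mask_tm1, h_tm1, u):
        h_mask_tm1 = mask_tm1 * h_tm1
        return T.tanh(x_t + T.dot(h_mask_tm1, u))

    h, _ = theano.scan(step,
                       sequences=[x, m],
                       outputs_info=T.unbroadcast(T.zeros((x.shape[1], u.shape[0])), 1),
                       non_sequences=[u])
    f = theano.function([x, m, u], h[-1])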
@@ -628,7 +631,7 @@ class JZS2(Layer):
-class JZS3(Layer):
+class JZS3(Recurrent):
     '''
         Evolved recurrent neural network architectures from the evaluation of thousands
         of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
@@ -687,17 +690,19 @@ class JZS3(Layer):
         self.set_weights(weights)
 
     def _step(self,
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_z, u_r, u_h):
-        z = self.inner_activation(xz_t + T.dot(T.tanh(h_tm1), u_z))
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = hh_t * z + h_tm1 * (1 - z)
+        h_mask_tm1 = mask_tm1 * h_tm1
+        z = self.inner_activation(xz_t + T.dot(T.tanh(h_mask_tm1), u_z))
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = hh_t * z + h_mask_tm1 * (1 - z)
         return h_t
 
     def get_output(self, train):
         X = self.get_input(train)
+        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
         X = X.dimshuffle((1, 0, 2))
 
         x_z = T.dot(X, self.W_z) + self.b_z
@@ -705,7 +710,7 @@ class JZS3(Layer):
         x_h = T.dot(X, self.W_h) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_z, self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
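With these changes the JZS layers honor masks end to end. A hypothetical usage sketch (layer sizes and the mask_zero flag on Embedding are illustrative assumptions, not taken from this commit):

    from keras.models import Sequential
    from keras.layers.embeddings import Embedding
    from keras.layers.recurrent import JZS1

    model = Sequential()
    model.add(Embedding(10000, 128, mask_zero=True))  # index 0 marks padding
    model.add(JZS1(128, 64))  # padded timesteps no longer leak into the state
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')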