Add mask support to new recurrent layers
This commit is contained in:
parent
9c8e0d43f3
commit
5b6f56a040
@@ -416,7 +416,7 @@ class LSTM(Recurrent):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
class JZS1(Layer):
|
class JZS1(Recurrent):
|
||||||
'''
|
'''
|
||||||
Evolved recurrent neural network architectures from the evaluation of thousands
|
Evolved recurrent neural network architectures from the evaluation of thousands
|
||||||
of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
|
of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
|
||||||
@@ -484,23 +484,24 @@ class JZS1(Layer):
|
|||||||
xz_t, xr_t, xh_t, mask_tm1,
|
xz_t, xr_t, xh_t, mask_tm1,
|
||||||
h_tm1,
|
h_tm1,
|
||||||
u_r, u_h):
|
u_r, u_h):
|
||||||
|
h_mask_tm1 = mask_tm1 * h_tm1
|
||||||
z = self.inner_activation(xz_t)
|
z = self.inner_activation(xz_t)
|
||||||
r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
|
r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
|
||||||
hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
|
hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
|
||||||
h_t = hh_t * z + h_tm1 * (1 - z)
|
h_t = hh_t * z + h_mask_tm1 * (1 - z)
|
||||||
return h_t
|
return h_t
|
||||||
|
|
||||||
def get_output(self, train):
|
def get_output(self, train):
|
||||||
X = self.get_input(train)
|
X = self.get_input(train)
|
||||||
X = X.dimshuffle((1, 0, 2))
|
|
||||||
padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
|
padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
|
||||||
|
X = X.dimshuffle((1, 0, 2))
|
||||||
|
|
||||||
x_z = T.dot(X, self.W_z) + self.b_z
|
x_z = T.dot(X, self.W_z) + self.b_z
|
||||||
x_r = T.dot(X, self.W_r) + self.b_r
|
x_r = T.dot(X, self.W_r) + self.b_r
|
||||||
x_h = T.tanh(T.dot(X, self.Pmat)) + self.b_h
|
x_h = T.tanh(T.dot(X, self.Pmat)) + self.b_h
|
||||||
outputs, updates = theano.scan(
|
outputs, updates = theano.scan(
|
||||||
self._step,
|
self._step,
|
||||||
sequences=[x_z, x_r, x_h, padded_mask],
|
sequences=[x_z, x_r, x_h, padded_mask],
|
||||||
outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
|
outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
|
||||||
non_sequences=[self.U_r, self.U_h],
|
non_sequences=[self.U_r, self.U_h],
|
||||||
truncate_gradient=self.truncate_gradient
|
truncate_gradient=self.truncate_gradient
|
||||||
@@ -522,7 +523,7 @@ class JZS1(Layer):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
class JZS2(Layer):
|
class JZS2(Recurrent):
|
||||||
'''
|
'''
|
||||||
Evolved recurrent neural network architectures from the evaluation of thousands
|
Evolved recurrent neural network architectures from the evaluation of thousands
|
||||||
of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
|
of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
|
||||||
@@ -588,17 +589,19 @@ class JZS2(Layer):
|
|||||||
self.set_weights(weights)
|
self.set_weights(weights)
|
||||||
|
|
||||||
def _step(self,
|
def _step(self,
|
||||||
xz_t, xr_t, xh_t,
|
xz_t, xr_t, xh_t, mask_tm1,
|
||||||
h_tm1,
|
h_tm1,
|
||||||
u_z, u_r, u_h):
|
u_z, u_r, u_h):
|
||||||
z = self.inner_activation(xz_t + T.dot(h_tm1, u_z))
|
h_mask_tm1 = mask_tm1 * h_tm1
|
||||||
r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
|
z = self.inner_activation(xz_t + T.dot(h_mask_tm1, u_z))
|
||||||
hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
|
r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
|
||||||
h_t = hh_t * z + h_tm1 * (1 - z)
|
hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
|
||||||
|
h_t = hh_t * z + h_mask_tm1 * (1 - z)
|
||||||
return h_t
|
return h_t
|
||||||
|
|
||||||
def get_output(self, train):
|
def get_output(self, train):
|
||||||
X = self.get_input(train)
|
X = self.get_input(train)
|
||||||
|
padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
|
||||||
X = X.dimshuffle((1, 0, 2))
|
X = X.dimshuffle((1, 0, 2))
|
||||||
|
|
||||||
x_z = T.dot(X, self.W_z) + self.b_z
|
x_z = T.dot(X, self.W_z) + self.b_z
|
||||||
@@ -606,7 +609,7 @@ class JZS2(Layer):
|
|||||||
x_h = T.dot(X, self.W_h) + self.b_h
|
x_h = T.dot(X, self.W_h) + self.b_h
|
||||||
outputs, updates = theano.scan(
|
outputs, updates = theano.scan(
|
||||||
self._step,
|
self._step,
|
||||||
sequences=[x_z, x_r, x_h],
|
sequences=[x_z, x_r, x_h, padded_mask],
|
||||||
outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
|
outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
|
||||||
non_sequences=[self.U_z, self.U_r, self.U_h],
|
non_sequences=[self.U_z, self.U_r, self.U_h],
|
||||||
truncate_gradient=self.truncate_gradient
|
truncate_gradient=self.truncate_gradient
|
||||||
@@ -628,7 +631,7 @@ class JZS2(Layer):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
class JZS3(Layer):
|
class JZS3(Recurrent):
|
||||||
'''
|
'''
|
||||||
Evolved recurrent neural network architectures from the evaluation of thousands
|
Evolved recurrent neural network architectures from the evaluation of thousands
|
||||||
of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
|
of models, serving as alternatives to LSTMs and GRUs. See Jozefowicz et al. 2015.
|
||||||
@@ -687,17 +690,19 @@ class JZS3(Layer):
|
|||||||
self.set_weights(weights)
|
self.set_weights(weights)
|
||||||
|
|
||||||
def _step(self,
|
def _step(self,
|
||||||
xz_t, xr_t, xh_t,
|
xz_t, xr_t, xh_t, mask_tm1,
|
||||||
h_tm1,
|
h_tm1,
|
||||||
u_z, u_r, u_h):
|
u_z, u_r, u_h):
|
||||||
z = self.inner_activation(xz_t + T.dot(T.tanh(h_tm1), u_z))
|
h_mask_tm1 = mask_tm1 * h_tm1
|
||||||
r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
|
z = self.inner_activation(xz_t + T.dot(T.tanh(h_mask_tm1), u_z))
|
||||||
hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
|
r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
|
||||||
h_t = hh_t * z + h_tm1 * (1 - z)
|
hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
|
||||||
|
h_t = hh_t * z + h_mask_tm1 * (1 - z)
|
||||||
return h_t
|
return h_t
|
||||||
|
|
||||||
def get_output(self, train):
|
def get_output(self, train):
|
||||||
X = self.get_input(train)
|
X = self.get_input(train)
|
||||||
|
padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
|
||||||
X = X.dimshuffle((1, 0, 2))
|
X = X.dimshuffle((1, 0, 2))
|
||||||
|
|
||||||
x_z = T.dot(X, self.W_z) + self.b_z
|
x_z = T.dot(X, self.W_z) + self.b_z
|
||||||
@@ -705,7 +710,7 @@ class JZS3(Layer):
|
|||||||
x_h = T.dot(X, self.W_h) + self.b_h
|
x_h = T.dot(X, self.W_h) + self.b_h
|
||||||
outputs, updates = theano.scan(
|
outputs, updates = theano.scan(
|
||||||
self._step,
|
self._step,
|
||||||
sequences=[x_z, x_r, x_h],
|
sequences=[x_z, x_r, x_h, padded_mask],
|
||||||
outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
|
outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
|
||||||
non_sequences=[self.U_z, self.U_r, self.U_h],
|
non_sequences=[self.U_z, self.U_r, self.U_h],
|
||||||
truncate_gradient=self.truncate_gradient
|
truncate_gradient=self.truncate_gradient
|
||||||
|
Loading…
Reference in New Issue
Block a user