# Dummy test data as input to an RNN. Sequences are 15 timesteps long; the first 10 timesteps
# are zeroed out so the Embedding layer (mask_zero=True) masks them. The model is trained to
# reproduce a particular input timestep: without masking it should be able to learn this, but
# when the relevant timestep is masked it should fail.
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, SimpleDeepRNN, LSTM, GRU
import theano
theano.config.exception_verbosity = 'high'

# (nb_samples, timesteps): integer indices in [1, 4]; 0 is reserved for masking
X = np.random.random_integers(1, 4, size=(500000, 15))

print("About to compile the first model")
model = Sequential()
model.add(Embedding(5, 4, mask_zero=True))
model.add(TimeDistributedDense(4, 4))  # obviously this is redundant. Just testing.
model.add(SimpleRNN(4, 4, activation='relu', return_sequences=True))
model.add(Dropout(0.5))
model.add(SimpleDeepRNN(4, 4, depth=2, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, 4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
              theano_mode=theano.compile.mode.FAST_RUN)
print("Compiled model")

W = model.get_weights()  # We'll save these so we can reset the model between tests

X[:, :10] = 0  # zero (i.e. mask) the first 10 timesteps
Xmask0 = X.copy()
Xmask0[:, 10] = 0  # additionally mask timestep 10, the first visible one
Xmask12 = X.copy()
Xmask12[:, 11] = 0  # additionally mask timesteps 11 and 12
Xmask12[:, 12] = 0

# One-hot targets: copy timestep 10 (the "0th" visible timestep) or timestep 11 (the "1st")
X0_onehot = np.zeros((X.shape[0], 4))
X1_onehot = np.zeros((X.shape[0], 4))
for i, row in enumerate(X):
    X0_onehot[i, row[10] - 1] = 1
    X1_onehot[i, row[11] - 1] = 1

# Uniform score: 4 options = ln(4) nats (2 bits).
# We should not do better than this when we mask out the part of the input
# that gives us the correct answer.
uniform_score = np.log(4)
batch_size = 512

# Train it to guess the 0th visible timestep
model.fit(X, X0_onehot, nb_epoch=1, batch_size=batch_size)
score = model.evaluate(X, X0_onehot, batch_size=batch_size)
if score > uniform_score * 0.9:
    raise Exception('Failed to learn to copy timestep 0, score %f' % score)

model.set_weights(W)

# Train without showing it the 0th timestep, to learn the 1st
model.fit(X[:, 1:], X1_onehot, nb_epoch=1, batch_size=batch_size)
score = model.evaluate(X[:, 1:], X1_onehot, batch_size=batch_size)
if score > uniform_score * 0.9:
    raise Exception('Failed to learn to copy timestep 1, score %f' % score)

model.set_weights(W)

# Train to guess the 0th timestep when it has been masked (should fail)
model.fit(Xmask0, X0_onehot, nb_epoch=1, batch_size=batch_size)
score = model.evaluate(Xmask0, X0_onehot, batch_size=batch_size)
if score < uniform_score * 0.9:
    raise Exception('Somehow learned to copy timestep 0 despite mask, score %f' % score)

model.set_weights(W)

# Train to guess the 1st timestep when the 0th has been masked (should succeed)
model.fit(Xmask0, X1_onehot, nb_epoch=1, batch_size=batch_size)
score = model.evaluate(Xmask0, X1_onehot, batch_size=batch_size)
if score > uniform_score * 0.9:
    raise Exception('Failed to learn to copy timestep 1 in masked model, score %f' % score)

model.set_weights(W)

# Finally, make sure the mask is actually blocking input: mask out timesteps 1 and 2
# and see if it can still learn timestep 0 (should fail)
model.fit(Xmask12, X0_onehot, nb_epoch=1, batch_size=batch_size)
score = model.evaluate(Xmask12, X0_onehot, batch_size=batch_size)
if score < uniform_score * 0.9:
    raise Exception('Somehow learned to copy timestep 0 despite masking 1, score %f' % score)
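# Quick sanity check on the uniform baseline used above (a sketch added for
# illustration, not part of the original checks): a predictor that assigns
# probability 1/4 to each of the 4 classes has categorical cross-entropy
# -ln(1/4) = ln(4) ~= 1.386 nats (i.e. 2 bits), whichever class is correct.
uniform_probs = np.full(4, 0.25)
target = np.array([1.0, 0.0, 0.0, 0.0])  # any one-hot target gives the same value
assert np.isclose(-np.sum(target * np.log(uniform_probs)), uniform_score)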
# Another testing approach: just initialize the models and make sure that
# prepending zeros doesn't affect their output
print("About to compile the second model")
model2 = Sequential()
model2.add(Embedding(5, 4, mask_zero=True))
model2.add(TimeDistributedDense(4, 4))
model2.add(Activation('time_distributed_softmax'))
model2.add(LSTM(4, 4, return_sequences=True))
model2.add(Activation('tanh'))
model2.add(GRU(4, 4, activation='softmax', return_sequences=True))
model2.add(SimpleDeepRNN(4, 4, depth=2, activation='relu', return_sequences=True))
model2.add(SimpleRNN(4, 4, activation='relu', return_sequences=True))
model2.compile(loss='categorical_crossentropy', optimizer='rmsprop',
               theano_mode=theano.compile.mode.FAST_RUN)
print("Compiled model2")

X2 = np.random.random_integers(1, 4, size=(2, 5))
y2 = np.random.random((X2.shape[0], X2.shape[1], 4))
ref = model2.predict(X2)
ref_eval = model2.evaluate(X2, y2)
mask = np.ones((y2.shape[0], y2.shape[1], 1))

for pre_zeros in range(1, 10):
    padded_X2 = np.concatenate((np.zeros((X2.shape[0], pre_zeros)), X2), axis=1)
    padded_mask = np.concatenate((np.zeros((mask.shape[0], pre_zeros, mask.shape[2])), mask), axis=1)
    padded_y2 = np.concatenate((np.zeros((y2.shape[0], pre_zeros, y2.shape[2])), y2), axis=1)
    pred = model2.predict(padded_X2)
    if not np.allclose(ref[:, -1, :], pred[:, -1, :]):
        raise Exception("Different result after left-padding %d zeros. Ref: %s, Pred: %s" %
                        (pre_zeros, ref, pred))
    pad_eval = model2.evaluate(padded_X2, padded_y2, weights=padded_mask)
    if not np.allclose([pad_eval], [ref_eval]):
        raise Exception("Got dissimilar categorical_crossentropy after left-padding %d zeros. "
                        "Ref: %f, Pred: %f" % (pre_zeros, ref_eval, pad_eval))
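# Why zero weights on the padded timesteps should leave the score unchanged
# (a sketch, assuming evaluate() reduces per-timestep losses with a weighted
# mean; the exact reduction inside Keras is not shown here):
losses = np.random.random((2, 5))                                   # per-timestep losses for the real steps
padded_losses = np.concatenate((np.zeros((2, 3)), losses), axis=1)  # 3 left-padded steps
w = np.concatenate((np.zeros((2, 3)), np.ones((2, 5))), axis=1)     # zero weight on padding
assert np.isclose((padded_losses * w).sum() / w.sum(), losses.mean())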