keras/examples/addition_rnn.py

# -*- coding: utf-8 -*-
from __future__ import print_function
from keras.models import Sequential, slice_X
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
from keras.layers import recurrent
import numpy as np

"""
An implementation of sequence to sequence learning for performing addition
Input: "535+61"
Output: "596"
Padding is handled by using a repeated sentinel character (space)

By default, the JZS1 recurrent neural network is used
JZS1 was an "evolved" recurrent neural network performing well on arithmetic benchmark in:
"An Empirical Exploration of Recurrent Network Architectures"
http://jmlr.org/proceedings/papers/v37/jozefowicz15.pdf

Input may optionally be inverted, shown to increase performance in many tasks in:
"Learning to Execute"
http://arxiv.org/abs/1410.4615
and
"Sequence to Sequence Learning with Neural Networks"
http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
Theoretically it introduces shorter term dependencies between source and target.

Two digits inverted:
+ One layer JZS1 (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs

Three digits inverted:
+ One layer JZS1 (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs

Four digits inverted:
+ One layer JZS1 (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs

Five digits inverted:
+ One layer JZS1 (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs

"""


class CharacterTable(object):
    """
    Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilties to their character output
    """
    def __init__(self, chars, maxlen):
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
        self.maxlen = maxlen

    def encode(self, C, maxlen=None):
        maxlen = maxlen if maxlen else self.maxlen
        X = np.zeros((maxlen, len(self.chars)))
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in X)


class colors:
    ok = '\033[92m'
    fail = '\033[91m'
    close = '\033[0m'

# Parameters for the model and dataset
TRAINING_SIZE = 50000
DIGITS = 3
INVERT = True
# Try replacing JZS1 with LSTM, GRU, or SimpleRNN
RNN = recurrent.JZS1
HIDDEN_SIZE = 128
BATCH_SIZE = 128
LAYERS = 1
MAXLEN = DIGITS + 1 + DIGITS

chars = '0123456789+ '
ctable = CharacterTable(chars, MAXLEN)

questions = []
expected = []
seen = set()
print('Generating data...')
while len(questions) < TRAINING_SIZE:
    f = lambda: int(''.join(np.random.choice(list('0123456789')) for i in xrange(np.random.randint(1, DIGITS + 1))))
    a, b = f(), f()
    # Skip any addition questions we've already seen
    # Also skip any such that X+Y == Y+X (hence the sorting)
    key = tuple(sorted((a, b)))
    if key in seen:
        continue
    seen.add(key)
    # Pad the data with spaces such that it is always MAXLEN
    q = '{}+{}'.format(a, b)
    query = q + ' ' * (MAXLEN - len(q))
    ans = str(a + b)
    # Answers can be of maximum size DIGITS + 1
    ans += ' ' * (DIGITS + 1 - len(ans))
    if INVERT:
        query = query[::-1]
    questions.append(query)
    expected.append(ans)
print('Total addition questions:', len(questions))

print('Vectorization...')
X = np.zeros((len(questions), MAXLEN, len(chars)), dtype=np.bool)
y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=np.bool)
for i, sentence in enumerate(questions):
    X[i] = ctable.encode(sentence, maxlen=MAXLEN)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, maxlen=DIGITS + 1)

# Shuffle (X, y) in unison as the later parts of X will almost all be larger digits
indices = np.arange(len(y))
np.random.shuffle(indices)
X = X[indices]
y = y[indices]
# Explicitly set apart 10% for validation data that we never train over
split_at = len(X) - len(X) / 10
(X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))
(y_train, y_val) = (y[:split_at], y[split_at:])

print(X_train.shape)
print(y_train.shape)

print('Build model...')
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE
# note: in a situation where your input sequences have a variable length,
# use input_shape=(None, nb_feature).
model.add(RNN(HIDDEN_SIZE, input_shape=(None, len(chars))))
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(DIGITS + 1))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in xrange(LAYERS):
    model.add(RNN(HIDDEN_SIZE, return_sequences=True))

# For each of step of the output sequence, decide which character should be chosen
model.add(TimeDistributedDense(len(chars)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model each generation and show predictions against the validation dataset
for iteration in range(1, 200):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=1, validation_data=(X_val, y_val), show_accuracy=True)
    ###
    # Select 10 samples from the validation set at random so we can visualize errors
    for i in xrange(10):
        ind = np.random.randint(0, len(X_val))
        rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowX, verbose=0)
        q = ctable.decode(rowX[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Q', q[::-1] if INVERT else q)
        print('T', correct)
        print(colors.ok + '☑' + colors.close if correct == guess else colors.fail + '☒' + colors.close, guess)
        print('---')
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`# -- coding: utf-8 --`
			`from __future__ import print_function`
			`from keras.models import Sequential, slice_X`
Update all examples with new API 2015-10-05 01:44:49 +00:00			`from keras.layers.core import Activation, TimeDistributedDense, RepeatVector`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`from keras.layers import recurrent`
			`import numpy as np`

			`"""`
			`An implementation of sequence to sequence learning for performing addition`
			`Input: "535+61"`
			`Output: "596"`
			`Padding is handled by using a repeated sentinel character (space)`

			`By default, the JZS1 recurrent neural network is used`
			`JZS1 was an "evolved" recurrent neural network performing well on arithmetic benchmark in:`
			`"An Empirical Exploration of Recurrent Network Architectures"`
			`http://jmlr.org/proceedings/papers/v37/jozefowicz15.pdf`

			`Input may optionally be inverted, shown to increase performance in many tasks in:`
			`"Learning to Execute"`
			`http://arxiv.org/abs/1410.4615`
			`and`
			`"Sequence to Sequence Learning with Neural Networks"`
			`http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf`
			`Theoretically it introduces shorter term dependencies between source and target.`

			`Two digits inverted:`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`+ One layer JZS1 (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs`

Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`Three digits inverted:`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`+ One layer JZS1 (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs`

Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`Four digits inverted:`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`+ One layer JZS1 (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs`

Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`Five digits inverted:`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`+ One layer JZS1 (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs`

Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`"""`


			`class CharacterTable(object):`
			`"""`
			`Given a set of characters:`
			`+ Encode them to a one hot integer representation`
			`+ Decode the one hot integer representation to their character output`
			`+ Decode a vector of probabilties to their character output`
			`"""`
			`def __init__(self, chars, maxlen):`
			`self.chars = sorted(set(chars))`
			`self.char_indices = dict((c, i) for i, c in enumerate(self.chars))`
			`self.indices_char = dict((i, c) for i, c in enumerate(self.chars))`
			`self.maxlen = maxlen`

			`def encode(self, C, maxlen=None):`
			`maxlen = maxlen if maxlen else self.maxlen`
			`X = np.zeros((maxlen, len(self.chars)))`
			`for i, c in enumerate(C):`
			`X[i, self.char_indices[c]] = 1`
			`return X`

			`def decode(self, X, calc_argmax=True):`
			`if calc_argmax:`
			`X = X.argmax(axis=-1)`
			`return ''.join(self.indices_char[x] for x in X)`

Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00
			`class colors:`
			`ok = '\033[92m'`
			`fail = '\033[91m'`
			`close = '\033[0m'`

Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`# Parameters for the model and dataset`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`TRAINING_SIZE = 50000`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`DIGITS = 3`
			`INVERT = True`
			`# Try replacing JZS1 with LSTM, GRU, or SimpleRNN`
			`RNN = recurrent.JZS1`
			`HIDDEN_SIZE = 128`
			`BATCH_SIZE = 128`
			`LAYERS = 1`
			`MAXLEN = DIGITS + 1 + DIGITS`

			`chars = '0123456789+ '`
			`ctable = CharacterTable(chars, MAXLEN)`

			`questions = []`
			`expected = []`
			`seen = set()`
			`print('Generating data...')`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`while len(questions) < TRAINING_SIZE:`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`f = lambda: int(''.join(np.random.choice(list('0123456789')) for i in xrange(np.random.randint(1, DIGITS + 1))))`
			`a, b = f(), f()`
			`# Skip any addition questions we've already seen`
			`# Also skip any such that X+Y == Y+X (hence the sorting)`
			`key = tuple(sorted((a, b)))`
			`if key in seen:`
			`continue`
			`seen.add(key)`
			`# Pad the data with spaces such that it is always MAXLEN`
			`q = '{}+{}'.format(a, b)`
			`query = q + ' ' * (MAXLEN - len(q))`
			`ans = str(a + b)`
			`# Answers can be of maximum size DIGITS + 1`
			`ans += ' ' * (DIGITS + 1 - len(ans))`
			`if INVERT:`
			`query = query[::-1]`
			`questions.append(query)`
			`expected.append(ans)`
			`print('Total addition questions:', len(questions))`

			`print('Vectorization...')`
			`X = np.zeros((len(questions), MAXLEN, len(chars)), dtype=np.bool)`
			`y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=np.bool)`
			`for i, sentence in enumerate(questions):`
			`X[i] = ctable.encode(sentence, maxlen=MAXLEN)`
			`for i, sentence in enumerate(expected):`
			`y[i] = ctable.encode(sentence, maxlen=DIGITS + 1)`

			`# Shuffle (X, y) in unison as the later parts of X will almost all be larger digits`
Update all examples with new API 2015-10-05 01:44:49 +00:00			`indices = np.arange(len(y))`
			`np.random.shuffle(indices)`
			`X = X[indices]`
			`y = y[indices]`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`# Explicitly set apart 10% for validation data that we never train over`
			`split_at = len(X) - len(X) / 10`
			`(X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))`
			`(y_train, y_val) = (y[:split_at], y[split_at:])`

Update all examples with new API 2015-10-05 01:44:49 +00:00			`print(X_train.shape)`
			`print(y_train.shape)`

Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`print('Build model...')`
			`model = Sequential()`
			`# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE`
Update all examples with new API 2015-10-05 01:44:49 +00:00			`# note: in a situation where your input sequences have a variable length,`
			`# use input_shape=(None, nb_feature).`
			`model.add(RNN(HIDDEN_SIZE, input_shape=(None, len(chars))))`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`# For the decoder's input, we repeat the encoded input for each time step`
			`model.add(RepeatVector(DIGITS + 1))`
			`# The decoder RNN could be multiple layers stacked or a single layer`
			`for _ in xrange(LAYERS):`
Update all examples with new API 2015-10-05 01:44:49 +00:00			`model.add(RNN(HIDDEN_SIZE, return_sequences=True))`

Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`# For each of step of the output sequence, decide which character should be chosen`
Update all examples with new API 2015-10-05 01:44:49 +00:00			`model.add(TimeDistributedDense(len(chars)))`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`model.add(Activation('softmax'))`

			`model.compile(loss='categorical_crossentropy', optimizer='adam')`

			`# Train the model each generation and show predictions against the validation dataset`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`for iteration in range(1, 200):`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`print()`
			`print('-' * 50)`
			`print('Iteration', iteration)`
Update all examples with new API 2015-10-05 01:44:49 +00:00			`model.fit(X_train, y_train, batch_size=BATCH_SIZE, nb_epoch=1, validation_data=(X_val, y_val), show_accuracy=True)`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`###`
			`# Select 10 samples from the validation set at random so we can visualize errors`
			`for i in xrange(10):`
			`ind = np.random.randint(0, len(X_val))`
			`rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]`
			`preds = model.predict_classes(rowX, verbose=0)`
			`q = ctable.decode(rowX[0])`
			`correct = ctable.decode(rowy[0])`
			`guess = ctable.decode(preds[0], calc_argmax=False)`
			`print('Q', q[::-1] if INVERT else q)`
			`print('T', correct)`
Touch-ups to addition RNN example 2015-08-18 00:57:20 +00:00			`print(colors.ok + '☑' + colors.close if correct == guess else colors.fail + '☒' + colors.close, guess)`
Example: Sequence to sequence learning for addition using RNNs 2015-08-17 11:42:54 +00:00			`print('---')`