Added support for CTC in both Theano and Tensorflow along with image OCR example. (#3436)

* Added CTC to Theano and Tensorflow backend along with image OCR example * Fixed python style issues, made data files remote, and made code more idiomatic to Keras * Fixed a couple more style issues brought up in the original PR * Reverted wrappers.py * Fixed potential training-on-validation issue and removed unused imports * Fixed PEP8 issue * Remaining PEP8 issues fixed
2016-08-16 13:25:26 -07:00 · 2016-08-16 13:25:26 -07:00 · e8190a8d8d
commit e8190a8d8d
parent 4e155139ca
4 changed files with 695 additions and 1 deletions
--- a/examples/image_ocr.py
+++ b/examples/image_ocr.py
@ -0,0 +1,442 @@
 '''This example uses a convolutional stack followed by a recurrent stack
 and a CTC logloss function to perform optical character recognition
 of generated text images. I have no evidence of whether it actually
 learns general shapes of text, or just is able to recognize all
 the different fonts thrown at it...the purpose is more to demonstrate CTC
 inside of Keras.  Note that the font list may need to be updated
 for the particular OS in use.
 This starts off with 4 letter words. After 10 or so epochs, CTC
 learns translational invariance, so longer words and groups of words
 with spaces are gradually fed in.  This gradual increase in difficulty
 is handled using the TextImageGenerator class which is both a generator
 class for test/train data and a Keras callback class. Every 10 epochs
 the wordlist that the generator draws from increases in difficulty.
 The table below shows normalized edit distance values. Theano uses
 a slightly different CTC implementation, so some Theano-specific
 hyperparameter tuning would be needed to get it to match Tensorflow.
            Norm. ED
 Epoch |   TF   |   TH
 ------------------------
    10   0.072    0.272
    20   0.032    0.115
    30   0.024    0.098
    40   0.023    0.108
 This requires cairo and editdistance packages:
 pip install cairocffi
 pip install editdistance
 Due to the use of a dummy loss function, Theano requires the following flags:
 on_unused_input='ignore'
 Created by Mike Henry
 https://github.com/mbhenry/
 '''
 import os
 import itertools
 import re
 import datetime
 import cairocffi as cairo
 import editdistance
 import numpy as np
 from scipy import ndimage
 import pylab
 from keras import backend as K
 from keras.layers.convolutional import Convolution2D, MaxPooling2D
 from keras.layers import Input, Layer, Dense, Activation, Flatten
 from keras.layers import Reshape, Lambda, merge, Permute, TimeDistributed
 from keras.models import Model
 from keras.layers.recurrent import GRU
 from keras.optimizers import SGD
 from keras.utils import np_utils
 from keras.utils.data_utils import get_file
 from keras.preprocessing import image
 import keras.callbacks
 OUTPUT_DIR = "image_ocr"
 np.random.seed(55)
 # this creates larger "blotches" of noise which look
 # more realistic than just adding gaussian noise
 # assumes greyscale with pixels ranging from 0 to 1
 def speckle(img):
    '''Add blurred random "blotch" noise to a greyscale image.

    A noise field with the same shape as `img` is drawn from a normal
    distribution, scaled by a random severity in [0, 0.6), smoothed with a
    Gaussian filter (sigma 1) and added to the image.  The sum is clamped
    to the [0, 1] range.

    # Arguments
        img: numpy array of pixel values, expected to lie in [0, 1].

    # Returns
        A new array of the same shape holding the noisy image.
    '''
    noise_scale = np.random.uniform(0, 0.6)
    raw_noise = np.random.randn(*img.shape) * noise_scale
    blotches = ndimage.gaussian_filter(raw_noise, 1)
    # clamp back into the valid greyscale range
    return np.clip(img + blotches, 0, 1)
 # paints the string in a random location within the bounding box
 # also uses a random font, a slight random rotation,
 # and a random amount of speckle noise
 def paint_text(text, w, h):
    '''Render `text` in black on a white w-by-h canvas and return a noisy array.

    A random font face and weight are chosen, the text is placed at a random
    offset that leaves a fixed border, and the resulting single-channel
    image is passed through `speckle` and `image.random_rotation`.

    # Arguments
        text: string to draw.
        w: canvas width in pixels.
        h: canvas height in pixels.

    # Returns
        Array of shape (1, h, w) as produced by the speckle/rotation steps.

    # Raises
        IOError: if the rendered text (plus the border used for random
            placement) does not fit inside the canvas.
    '''
    surface = cairo.ImageSurface(cairo.FORMAT_RGB24, w, h)
    with cairo.Context(surface) as context:
        context.set_source_rgb(1, 1, 1)  # White
        context.paint()
        # this font list works in Centos 7
        fonts = ['Century Schoolbook', 'Courier', 'STIX', 'URW Chancery L', 'FreeMono']
        context.select_font_face(np.random.choice(fonts), cairo.FONT_SLANT_NORMAL,
                                 np.random.choice([cairo.FONT_WEIGHT_BOLD, cairo.FONT_WEIGHT_NORMAL]))
        context.set_font_size(40)
        box = context.text_extents(text)
        # teach the RNN translational invariance by
        # fitting text box randomly on canvas, with some room to rotate
        border_w_h = (10, 16)
        max_shift_x = int(w - box[2] - border_w_h[0])
        max_shift_y = int(h - box[3] - border_w_h[1])
        # randint(0, n) needs n >= 1, so the border is part of the fit check;
        # otherwise a barely-fitting string surfaces as an unrelated ValueError
        if box[2] > w or box[3] > h or max_shift_x < 1 or max_shift_y < 1:
            raise IOError('Could not fit string into image. Max char count is too large for given image width.')
        top_left_x = np.random.randint(0, max_shift_x)
        top_left_y = np.random.randint(0, max_shift_y)
        context.move_to(top_left_x - int(box[0]), top_left_y - int(box[1]))
        context.set_source_rgb(0, 0, 0)
        context.show_text(text)
    buf = surface.get_data()
    a = np.frombuffer(buf, np.uint8)
    a.shape = (h, w, 4)
    a = a[:, :, 0]  # grab single channel
    # convert before scaling: an in-place `/=` on a uint8 array either fails
    # (true division) or floors every pixel to 0/1 (integer division)
    a = a.astype(np.float32) / 255
    a = np.expand_dims(a, 0)
    a = speckle(a)
    # rotation range depends on how far left the text starts
    a = image.random_rotation(a, 3 * (w - top_left_x) / w + 1)
    return a
 def shuffle_mats_or_lists(matrix_list, stop_ind=None):
    '''Shuffle several equally long sequences with one shared permutation.

    Only the first `stop_ind` positions are permuted; positions from
    `stop_ind` onwards keep their original order.  The inputs are not
    modified.

    # Arguments
        matrix_list: list of numpy arrays and/or Python lists, all of the
            same length.
        stop_ind: number of leading entries to shuffle.  Defaults to the
            full length.

    # Returns
        List with one reordered copy per input, in the same order as
        `matrix_list`.  An empty `matrix_list` yields an empty list.

    # Raises
        TypeError: if an element is neither a numpy array nor a list.
    '''
    if not matrix_list:
        return []
    len_val = len(matrix_list[0])
    assert all(len(i) == len_val for i in matrix_list)
    if stop_ind is None:
        stop_ind = len_val
    assert stop_ind <= len_val
    # materialise the ranges: a `range` object cannot be shuffled or
    # extended in place on Python 3
    order = list(range(stop_ind))
    np.random.shuffle(order)
    order += list(range(stop_ind, len_val))
    ret = []
    for mat in matrix_list:
        if isinstance(mat, np.ndarray):
            ret.append(mat[order])
        elif isinstance(mat, list):
            ret.append([mat[i] for i in order])
        else:
            raise TypeError('shuffle_mats_or_lists only supports numpy.array and list objects')
    return ret
 def text_to_labels(text, num_classes):
    '''Convert a string into a list of integer class labels.

    Lowercase letters map to 0..25 and a space maps to 26.  Any other
    character is silently dropped.  `num_classes` is accepted but not used.
    '''
    first_letter = ord('a')
    space_label = 26
    labels = []
    for ch in text:
        if ch == ' ':
            labels.append(space_label)
        elif 'a' <= ch <= 'z':
            labels.append(ord(ch) - first_letter)
    return labels
 # only a-z and space..probably not too difficult
 # to expand to uppercase and symbols
 def is_valid_str(in_str):
    '''Return True when `in_str` contains only lowercase a-z and spaces.

    The empty string is considered valid.
    '''
    return re.search(r'[^a-z\ ]', in_str) is None
 # Uses generator functions to supply train/test with
 # data. Image renderings of text are created on the fly
 # each time with random perturbations
 class TextImageGenerator(keras.callbacks.Callback):
    '''Supplies rendered-text batches and acts as a curriculum callback.

    The word list is split by index: entries before `val_split` feed
    `next_train`, entries from `val_split` onwards feed `next_val`.  As a
    callback it rebuilds the word list with longer strings at fixed epochs.

    # Arguments
        monogram_file: path to a text file with one word per line.
        bigram_file: path to a text file whose first two whitespace-separated
            columns on each line form a word pair.
        minibatch_size: number of samples per generated batch.
        img_w: width of each rendered image.
        img_h: height of each rendered image.
        downsample_width: value reported as `input_length` for every sample.
        val_split: index into the word list where validation data starts.
        absolute_max_string_len: width of the label matrix.
    '''

    def __init__(self, monogram_file, bigram_file, minibatch_size, img_w,
                 img_h, downsample_width, val_split,
                 absolute_max_string_len=16):
        self.minibatch_size = minibatch_size
        self.img_w = img_w
        self.img_h = img_h
        self.monogram_file = monogram_file
        self.bigram_file = bigram_file
        self.downsample_width = downsample_width
        self.val_split = val_split
        # the last class index is reserved for the CTC blank
        self.blank_label = self.get_output_size() - 1
        self.absolute_max_string_len = absolute_max_string_len

    def get_output_size(self):
        '''Number of output classes: 26 letters, space and the CTC blank.'''
        return 28

    # num_words can be independent of the epoch size due to the use of generators
    # as max_string_len grows, num_words can grow
    def build_word_list(self, num_words, max_string_len=None, mono_fraction=0.5):
        '''(Re)build the word list, label matrix and label lengths.

        # Arguments
            num_words: total number of strings to load; must be a multiple
                of the minibatch size.
            max_string_len: longest string to accept; None or -1 disables
                the filter.
            mono_fraction: fraction of `num_words` taken from the monogram
                file; the remainder is filled from the bigram file.

        # Raises
            IOError: if the two files cannot supply `num_words` strings.
        '''
        # None and -1 both mean "no length limit"; comparing None with an
        # int is an error on Python 3, so handle it before the bound check
        unlimited = max_string_len is None or max_string_len == -1
        assert unlimited or max_string_len <= self.absolute_max_string_len
        assert num_words % self.minibatch_size == 0
        assert (self.val_split * num_words) % self.minibatch_size == 0
        self.num_words = num_words
        self.string_list = []
        self.max_string_len = max_string_len
        # -1 marks unused label slots
        self.Y_data = np.ones([self.num_words, self.absolute_max_string_len]) * -1
        self.X_text = []
        self.Y_len = [0] * self.num_words

        # monogram file is sorted by frequency in english speech
        num_mono = int(self.num_words * mono_fraction)
        with open(self.monogram_file, 'rt') as f:
            for line in f:
                if len(self.string_list) == num_mono:
                    break
                word = line.rstrip()
                if unlimited or len(word) <= max_string_len:
                    self.string_list.append(word)

        # bigram file contains common word pairings in english speech
        with open(self.bigram_file, 'rt') as f:
            # stream the file; reading stops as soon as enough words are found
            for line in f:
                if len(self.string_list) == self.num_words:
                    break
                columns = line.lower().split()
                word = columns[0] + ' ' + columns[1]
                if is_valid_str(word) and (unlimited or len(word) <= max_string_len):
                    self.string_list.append(word)
        if len(self.string_list) != self.num_words:
            raise IOError('Could not pull enough words from supplied monogram and bigram files. ')

        for i, word in enumerate(self.string_list):
            self.Y_len[i] = len(word)
            self.Y_data[i, 0:len(word)] = text_to_labels(word, self.get_output_size())
            self.X_text.append(word)
        self.Y_len = np.expand_dims(np.array(self.Y_len), 1)

        self.cur_val_index = self.val_split
        self.cur_train_index = 0

    # each time an image is requested from train/val/test, a new random
    # painting of the text is performed
    def get_batch(self, index, size, train):
        '''Render `size` samples starting at word-list position `index`.

        # Returns
            Tuple (inputs, outputs) of dicts keyed by the model's input and
            output names.  `inputs` also carries 'source_str' for display.
        '''
        X_data = np.ones([size, 1, self.img_h, self.img_w])
        labels = np.ones([size, self.absolute_max_string_len])
        input_length = np.zeros([size, 1])
        label_length = np.zeros([size, 1])
        source_str = []
        for i in range(0, size):
            # Mix in some blank inputs.  This seems to be important for
            # achieving translational invariance
            if train and i > size - 4:
                X_data[i, 0, :, :] = paint_text('', self.img_w, self.img_h)
                labels[i, 0] = self.blank_label
                input_length[i] = self.downsample_width
                label_length[i] = 1
                source_str.append('')
            else:
                X_data[i, 0, :, :] = paint_text(self.X_text[index + i], self.img_w, self.img_h)
                labels[i, :] = self.Y_data[index + i]
                input_length[i] = self.downsample_width
                label_length[i] = self.Y_len[index + i]
                source_str.append(self.X_text[index + i])
        inputs = {'the_input': X_data,
                  'the_labels': labels,
                  'input_length': input_length,
                  'label_length': label_length,
                  'source_str': source_str  # used for visualization only
                  }
        outputs = {'ctc': np.zeros([size])}  # dummy data for dummy loss function
        return (inputs, outputs)

    def next_train(self):
        '''Endless generator over the training part of the word list.'''
        while 1:
            ret = self.get_batch(self.cur_train_index, self.minibatch_size, train=True)
            self.cur_train_index += self.minibatch_size
            if self.cur_train_index >= self.val_split:
                # wrap around using the configured batch size rather than a
                # hard-coded 32, then reshuffle only the training portion
                self.cur_train_index = self.cur_train_index % self.minibatch_size
                (self.X_text, self.Y_data, self.Y_len) = shuffle_mats_or_lists(
                    [self.X_text, self.Y_data, self.Y_len], self.val_split)
            yield ret

    def next_val(self):
        '''Endless generator over the validation part of the word list.'''
        while 1:
            ret = self.get_batch(self.cur_val_index, self.minibatch_size, train=False)
            self.cur_val_index += self.minibatch_size
            if self.cur_val_index >= self.num_words:
                # restart at the beginning of the validation range
                self.cur_val_index = self.val_split + self.cur_val_index % self.minibatch_size
            yield ret

    def on_train_begin(self, logs={}):
        # translational invariance seems to be the hardest thing
        # for the RNN to learn, so start with <= 4 letter words.
        self.build_word_list(16000, 4, 1)

    def on_epoch_begin(self, epoch, logs={}):
        # After 10 epochs, translational invariance should be learned
        # so start feeding longer words and eventually multiple words with spaces
        if epoch == 10:
            self.build_word_list(32000, 8, 1)
        if epoch == 20:
            self.build_word_list(32000, 8, 0.6)
        if epoch == 30:
            self.build_word_list(64000, 12, 0.5)
 # the actual loss calc occurs here despite it not being
 # an internal Keras loss function
 def ctc_lambda_func(args):
    '''Compute the CTC batch cost inside a Lambda layer.

    `args` is the 4-item sequence (y_pred, labels, input_length,
    label_length).  The first two time steps of `y_pred` are dropped before
    the cost is computed.
    '''
    softmax_out, truth, softmax_len, truth_len = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    trimmed = softmax_out[:, 2:, :]
    return K.ctc_batch_cost(truth, trimmed, softmax_len, truth_len)
 # For a real OCR application, this should be beam search with a dictionary
 # and language model.  For this example, best path is sufficient.
 def decode_batch(test_func, word_batch):
    '''Best-path decode a batch of images into strings.

    # Arguments
        test_func: callable taking `[word_batch]` and returning a sequence
            whose first item is the softmax output, indexed as
            (sample, time step, class).
        word_batch: batch of input images passed through to `test_func`.

    # Returns
        List with one decoded string per sample.
    '''
    softmax_out = test_func([word_batch])[0]
    first_letter = ord('a')
    decoded = []
    for sample in softmax_out:
        # skip the first two time steps, then take the best class per step
        best_path = list(np.argmax(sample[2:], 1))
        # collapse runs of the same class into a single symbol
        collapsed = [label for label, _ in itertools.groupby(best_path)]
        # 26 is space, 27 is CTC blank char
        chars = []
        for label in collapsed:
            if 0 <= label < 26:
                chars.append(chr(label + first_letter))
            elif label == 26:
                chars.append(' ')
        decoded.append(''.join(chars))
    return decoded
 class VizCallback(keras.callbacks.Callback):
    '''Saves weights, prints edit-distance stats and plots decodes each epoch.

    # Arguments
        test_func: callable mapping `[images]` to `[softmax_output]`, as
            consumed by `decode_batch`.
        text_img_gen: generator yielding (inputs, outputs) batches whose
            inputs contain 'the_input' and 'source_str'.
        num_display_words: number of samples drawn in the per-epoch figure.
    '''

    def __init__(self, test_func, text_img_gen, num_display_words=6):
        self.test_func = test_func
        self.output_dir = os.path.join(
            OUTPUT_DIR, datetime.datetime.now().strftime('%A, %d. %B %Y %I.%M%p'))
        self.text_img_gen = text_img_gen
        self.num_display_words = num_display_words
        # the directory name only has minute resolution, so a second run
        # within the same minute must not fail on an existing directory
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def show_edit_distance(self, num):
        '''Print mean and length-normalized edit distance over `num` samples.'''
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                truth = word_batch['source_str'][j]
                edit_dist = float(editdistance.eval(decoded_res[j], truth))
                mean_ed += edit_dist
                # an empty ground-truth string would divide by zero; fall
                # back to the raw distance in that case
                mean_norm_ed += edit_dist / max(len(truth), 1)
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))

    def on_epoch_end(self, epoch, logs={}):
        self.model.save_weights(os.path.join(self.output_dir, 'weights%02d.h5' % epoch))
        self.show_edit_distance(256)
        # plot the first few samples of a fresh batch with their decodes
        word_batch = next(self.text_img_gen)[0]
        res = decode_batch(self.test_func, word_batch['the_input'][0:self.num_display_words])
        for i in range(self.num_display_words):
            pylab.subplot(self.num_display_words, 1, i + 1)
            pylab.imshow(word_batch['the_input'][i, 0, :, :], cmap='Greys_r')
            pylab.xlabel('Truth = \'%s\' Decoded = \'%s\'' % (word_batch['source_str'][i], res[i]))
        fig = pylab.gcf()
        fig.set_size_inches(10, 12)
        pylab.savefig(os.path.join(self.output_dir, 'e%02d.png' % epoch))
        pylab.close()
 # Input Parameters
 img_h = 64
 img_w = 512
 nb_epoch = 50
 minibatch_size = 32
 words_per_epoch = 16000
 val_split = 0.2
 val_words = int(words_per_epoch * (val_split))

 # Network parameters
 conv_num_filters = 16
 filter_size = 3
 pool_size_1 = 4
 pool_size_2 = 2
 time_dense_size = 32
 rnn_size = 512
 # floor division keeps every layer dimension an int regardless of whether
 # `/` is integer or true division in the running interpreter
 time_steps = img_w // (pool_size_1 * pool_size_2)

 fdir = os.path.dirname(get_file('wordlists.tgz',
                                 origin='http://www.isosemi.com/datasets/wordlists.tgz', untar=True))

 # downsample_width drops 2 steps to match the 2 leading RNN outputs that
 # ctc_lambda_func discards; val_split is passed as an absolute word index
 img_gen = TextImageGenerator(monogram_file=os.path.join(fdir, 'wordlist_mono_clean.txt'),
                              bigram_file=os.path.join(fdir, 'wordlist_bi_clean.txt'),
                              minibatch_size=minibatch_size,
                              img_w=img_w,
                              img_h=img_h,
                              downsample_width=img_w // (pool_size_1 * pool_size_2) - 2,
                              val_split=words_per_epoch - val_words)

 act = 'relu'
 input_data = Input(name='the_input', shape=(1, img_h, img_w), dtype='float32')
 inner = Convolution2D(conv_num_filters, filter_size, filter_size, border_mode='same',
                       activation=act, input_shape=(1, img_h, img_w), name='conv1')(input_data)
 inner = MaxPooling2D(pool_size=(pool_size_1, pool_size_1), name='max1')(inner)
 inner = Convolution2D(conv_num_filters, filter_size, filter_size, border_mode='same',
                       activation=act, name='conv2')(inner)
 inner = MaxPooling2D(pool_size=(pool_size_2, pool_size_2), name='max2')(inner)

 # fold (filters, downsampled height) into one feature axis, then put the
 # downsampled width first so it serves as the time axis
 conv_to_rnn_dims = ((img_h // (pool_size_1 * pool_size_2)) * conv_num_filters,
                     img_w // (pool_size_1 * pool_size_2))
 inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)
 inner = Permute(dims=(2, 1), name='permute')(inner)

 # cuts down input size going into RNN:
 inner = TimeDistributed(Dense(time_dense_size, activation=act, name='dense1'))(inner)

 # Two layers of bidirectional GRUs
 # GRU seems to work as well, if not better than LSTM:
 gru_1 = GRU(rnn_size, return_sequences=True, name='gru1')(inner)
 gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, name='gru1_b')(inner)
 gru1_merged = merge([gru_1, gru_1b], mode='sum')
 gru_2 = GRU(rnn_size, return_sequences=True, name='gru2')(gru1_merged)
 gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True)(gru1_merged)

 # transforms RNN output to character activations:
 inner = TimeDistributed(Dense(img_gen.get_output_size(), name='dense2'))(merge([gru_2, gru_2b], mode='concat'))
 y_pred = Activation('softmax', name='softmax')(inner)
 Model(input=[input_data], output=y_pred).summary()

 labels = Input(name='the_labels', shape=[img_gen.absolute_max_string_len], dtype='float32')
 input_length = Input(name='input_length', shape=[1], dtype='int64')
 label_length = Input(name='label_length', shape=[1], dtype='int64')
 # Keras doesn't currently support loss funcs with extra parameters
 # so CTC loss is implemented in a lambda layer
 loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name="ctc")([y_pred, labels, input_length, label_length])

 lr = 0.03
 # clipnorm seems to speed up convergence
 clipnorm = 5
 sgd = SGD(lr=lr, decay=3e-7, momentum=0.9, nesterov=True, clipnorm=clipnorm)

 model = Model(input=[input_data, labels, input_length, label_length], output=[loss_out])

 # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
 model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)

 # captures output of softmax so we can decode the output during visualization
 test_func = K.function([input_data], [y_pred])

 viz_cb = VizCallback(test_func, img_gen.next_val())

 model.fit_generator(generator=img_gen.next_train(), samples_per_epoch=(words_per_epoch - val_words),
                     nb_epoch=nb_epoch, validation_data=img_gen.next_val(), nb_val_samples=val_words,
                     callbacks=[viz_cb, img_gen])
--- a/keras/backend/tensorflow_backend.py
+++ b/keras/backend/tensorflow_backend.py
@ -1586,3 +1586,112 @@ def random_binomial(shape, p=0.0, dtype=_FLOATX, seed=None):
    return tf.select(tf.random_uniform(shape, dtype=dtype, seed=seed) <= p,
                     tf.ones(shape, dtype=dtype),
                     tf.zeros(shape, dtype=dtype))
 # CTC
 # tensorflow has a native implementation, but it uses sparse tensors
 # and therefore requires a wrapper for Keras. The functions below convert
 # dense to sparse tensors and also wrap up the beam search code that is
 # in tensorflow's CTC implementation
 def ctc_label_dense_to_sparse(labels, label_lengths):
    '''Convert a dense label matrix into a tf.SparseTensor.

    For row b only the first `label_lengths[b]` columns of `labels` are kept.

    # Arguments
        labels: 2-D tensor of labels (indexed as [sample, position]).
        label_lengths: one length per row of `labels`; it is scanned
            element by element, so it must have a leading batch axis.

    # Returns
        tf.SparseTensor with int64 indices, the selected label values, and
        the (int64) shape of `labels` as its dense shape.
    '''
    # undocumented feature soon to be made public
    from tensorflow.python.ops import functional_ops
    label_shape = tf.shape(labels)
    num_batches_tns = tf.pack([label_shape[0]])
    max_num_labels_tns = tf.pack([label_shape[1]])
    # scan body: boolean row that is True for positions below the current
    # length; the previous state is ignored
    def range_less_than(previous_state, current_input):
        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
    dense_mask = functional_ops.scan(range_less_than, label_lengths,
                                     initializer=init, parallel_iterations=1)
    # drop the axis added by expand_dims inside the scan body
    dense_mask = dense_mask[:, 0, :]
    # column index of every cell, then only those inside the valid lengths
    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
                             label_shape)
    label_ind = tf.boolean_mask(label_array, dense_mask)
    # row index of every cell, filtered with the same mask
    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), 
                                                  max_num_labels_tns), tf.reverse(label_shape, [True])))
    batch_ind = tf.boolean_mask(batch_array, dense_mask)
    # pair up (row, column) into an (N, 2) index list
    indices = tf.transpose(tf.reshape(tf.concat(0, [batch_ind, label_ind]), [2,-1]))
    vals_sparse = tf.gather_nd(labels, indices)
    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
 def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    '''Runs CTC loss algorithm on each batch element.
    # Arguments
        y_true: tensor (samples, max_string_length) containing the truth labels
        y_pred: tensor (samples, time_steps, num_categories) containing the prediction,
                or output of the softmax
        input_length: tensor (samples,1) containing the sequence length for
                each batch item in y_pred
        label_length: tensor (samples,1) containing the sequence length for
                each batch item in y_true
    # Returns
        Tensor with shape (samples,1) containing the
            CTC loss of each element
    '''
    # Flatten (samples, 1) to (samples,).  A bare tf.squeeze removes every
    # size-1 axis, so a batch of one sample would collapse to a scalar.
    label_length = tf.to_int32(tf.reshape(label_length, [-1]))
    input_length = tf.to_int32(tf.reshape(input_length, [-1]))
    sparse_labels = tf.to_int32(ctc_label_dense_to_sparse(y_true, label_length))
    # the loss op takes time-major input; the epsilon guards log(0)
    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    return tf.expand_dims(tf.contrib.ctc.ctc_loss(inputs=y_pred,
                                                  labels=sparse_labels,
                                                  sequence_length=input_length), 1)
 def ctc_decode(y_pred, input_length, greedy=True, beam_width=None,
                dict_seq_lens=None, dict_values=None):
    '''Decodes the output of a softmax using either
       greedy (also known as best path) or a constrained dictionary
       search.
    # Arguments
        y_pred: tensor (samples, time_steps, num_categories) containing the prediction,
                or output of the softmax
        input_length: tensor (samples,1) containing the sequence length for
                each batch item in y_pred
        greedy:  perform much faster best-path search if true.  This does
                not use a dictionary
        beam_width:  if greedy is false and this value is not none, then
                the constrained dictionary search uses a beam of this width;
                when none, the decoder's own default width is used
        dict_seq_lens: the length of each element in the dict_values list
        dict_values:  list of lists representing the dictionary.
    # Returns
        Tuple (decoded_dense, log_prob): decoded_dense is a list with one
            dense tensor per sparse result returned by the decoder, padded
            with -1; log_prob is the second value returned by the decoder.
    '''
    # the decoders take time-major input; the epsilon guards log(0)
    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    # Flatten (samples, 1) to (samples,).  A bare tf.squeeze removes every
    # size-1 axis, so a batch of one sample would collapse to a scalar.
    input_length = tf.to_int32(tf.reshape(input_length, [-1]))

    if greedy:
        (decoded, log_prob) = tf.contrib.ctc.ctc_greedy_decoder(
            inputs=y_pred,
            sequence_length=input_length)
    else:
        # NOTE(review): dict_seq_lens/dict_values are forwarded unchanged;
        # confirm the installed decoder accepts these keyword arguments.
        decoder_kwargs = dict(inputs=y_pred,
                              sequence_length=input_length,
                              dict_seq_lens=dict_seq_lens,
                              dict_values=dict_values)
        # only override the decoder's default when a width was supplied
        # (the branches used to be swapped, so the width was never applied)
        if beam_width is not None:
            decoder_kwargs['beam_width'] = beam_width
        (decoded, log_prob) = tf.contrib.ctc.ctc_beam_search_decoder(**decoder_kwargs)

    decoded_dense = [tf.sparse_to_dense(st.indices, st.shape, st.values, default_value=-1)
                     for st in decoded]
    return (decoded_dense, log_prob)
--- a/keras/backend/theano_backend.py
+++ b/keras/backend/theano_backend.py
@ -1319,3 +1319,105 @@ def random_binomial(shape, p=0.0, dtype=_FLOATX, seed=None):
        seed = np.random.randint(1, 10e6)
    rng = RandomStreams(seed=seed)
    return rng.binomial(shape, p=p, dtype=dtype)
 # Theano implementation of CTC
 # Used with permission from Shawn Tan
 # https://github.com/shawntan/
 # Note that tensorflow's native CTC code is significantly
 # faster than this
 def ctc_interleave_blanks(Y):
    '''Return a vector of length 2 * len(Y) + 1 with Y at the odd positions.

    Every other position holds -1, which is the value used for the blank.
    '''
    num_labels = Y.shape[0]
    padded = T.alloc(-1, num_labels * 2 + 1)
    odd_positions = T.arange(num_labels) * 2 + 1
    return T.set_subtensor(padded[odd_positions], Y)
 def ctc_create_skip_idxs(Y):
    '''Return the odd indices i of Y for which Y[i] differs from Y[i + 2].

    Candidate indices are 1, 3, 5, ... with (len(Y) - 3) // 2 entries.
    '''
    num_candidates = (Y.shape[0] - 3) // 2
    candidates = T.arange(num_candidates) * 2 + 1
    differs_from_next = T.neq(Y[candidates], Y[candidates + 2])
    return candidates[differs_from_next.nonzero()]
 def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    '''Advance the log-probability vector by one time step.

    # Arguments
        skip_idxs: indices from which a transition two positions ahead is
            allowed (see ctc_create_skip_idxs).
        zeros: zero vector used as the template for new buffers.
        active: number of leading positions of log_p_prev that are in use.
        log_p_curr: log-probabilities of the current time step.
        log_p_prev: accumulated log-probabilities from the previous step.

    # Returns
        Tuple (active_next, log_p_next): the new active count (int32) and
        the updated log-probability vector.
    '''
    # only skips that start inside the active region can fire
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    # the active region grows by one position, or up to just past the
    # furthest skip target, capped at the vector length
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ), log_p_curr.shape[0]), 'int32')
    # subtract the max before exponentiating and add it back after the log
    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor
    # positions at or beyond active_next keep the value from `zeros`
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
 def ctc_path_probs(predict, Y, alpha=1e-4):
    '''Compute combined forward/backward log path scores and a validity mask.

    # Arguments
        predict: prediction matrix; its columns are selected with Y
            (presumably (time_steps, num_categories) -- confirm with caller).
        Y: label sequence used to index `predict`, as passed by ctc_cost
            (already interleaved with blanks there).
        alpha: smoothing weight mixed into the selected probabilities.

    # Returns
        Tuple (log_probs, mask), both indexed by (time step, label position).
    '''
    # smooth so that the log below never sees an exact zero
    smoothed_predict = (1 - alpha) * predict[:, Y] + alpha * np.float32(1.) / Y.shape[0]
    L = T.log(smoothed_predict)
    zeros = T.zeros_like(L[0])
    # NOTE(review): `base` is not referenced again in this function
    base = T.set_subtensor(zeros[:1], np.float32(1))
    log_first = zeros
    f_skip_idxs = ctc_create_skip_idxs(Y)
    b_skip_idxs = ctc_create_skip_idxs(Y[::-1])  # there should be a shortcut to calculating this
    # one scan step advances the forward and the backward pass together
    def step(log_f_curr, log_b_curr, f_active, log_f_prev, b_active, log_b_prev):
        f_active_next, log_f_next = ctc_update_log_p(f_skip_idxs, zeros, f_active, log_f_curr, log_f_prev)
        b_active_next, log_b_next = ctc_update_log_p(b_skip_idxs, zeros, b_active, log_b_curr, log_b_prev)
        return f_active_next, log_f_next, b_active_next, log_b_next
    # the backward pass consumes L reversed along both axes
    [f_active, log_f_probs, b_active, log_b_probs], _ = theano.scan(
        step, sequences=[L, L[::-1, ::-1]], outputs_info=[np.int32(1), log_first, np.int32(1), log_first])
    idxs = T.arange(L.shape[1]).dimshuffle('x', 0)
    # a cell is valid when inside the forward active region and inside the
    # (flipped back) backward active region
    mask = (idxs < f_active.dimshuffle(0, 'x')) & (idxs < b_active.dimshuffle(0, 'x'))[::-1, ::-1]
    # L is contained in both passes, so subtract it once
    log_probs = log_f_probs + log_b_probs[::-1, ::-1] - L
    return log_probs, mask
 def ctc_cost(predict, Y):
    '''Return the negative total log-probability for one sample.

    Y is interleaved with blanks, path scores are computed with
    ctc_path_probs, and the masked entries are summed with the maximum
    factored out before the exponential.
    '''
    blank_padded = ctc_interleave_blanks(Y)
    log_probs, mask = ctc_path_probs(predict, blank_padded)
    shift = T.max(log_probs)
    valid_probs = T.exp(log_probs - shift)[mask.nonzero()]
    total_log_prob = T.log(T.sum(valid_probs)) + shift
    return -total_log_prob
 # batchifies original CTC code
 def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    '''Runs CTC loss algorithm on each batch element.
    # Arguments
        y_true: tensor (samples, max_string_length) containing the truth labels
        y_pred: tensor (samples, time_steps, num_categories) containing the prediction,
                or output of the softmax
        input_length: tensor (samples,1) containing the sequence length for
                each batch item in y_pred
        label_length: tensor (samples,1) containing the sequence length for
                each batch item in y_true
    # Returns
        Tensor with shape (1,samples) containing the CTC loss of each
            element (the per-sample costs get a new leading axis)
    '''
    def ctc_step(labels_i, probs_i, num_steps_i, num_labels_i):
        # trim one sample to its real lengths before scoring it
        trimmed_probs = probs_i[0: num_steps_i[0]]
        trimmed_labels = labels_i[0:num_labels_i[0]]
        return ctc_cost(trimmed_probs, trimmed_labels)

    per_sample_cost, _ = theano.scan(
        fn=ctc_step,
        outputs_info=None,
        sequences=[y_true, y_pred, input_length, label_length]
    )
    return per_sample_cost.dimshuffle('x', 0)
--- a/tests/keras/backend/test_backends.py
+++ b/tests/keras/backend/test_backends.py
@ -581,6 +581,48 @@ class TestBackend(object):
        assert(np.max(rand) == 1)
        assert(np.min(rand) == 0)
    def test_ctc(self):
        '''Check ctc_batch_cost of both backends against reference losses.'''
        # simplified version of TensorFlow's test

        # dimensions are batch x time x categories
        labels = np.asarray([[0, 1, 2, 1, 0], [0, 1, 1, 0, -1]])
        inputs = np.asarray(
            [[[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553],
              [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436],
              [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688],
              [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533],
              [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]],
             [[0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508],
              [0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549],
              [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456],
              [0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345],
              [0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]]],
            dtype=np.float32)
        label_lens = np.asarray([5, 4]).reshape(-1, 1)
        input_lens = np.asarray([5, 5]).reshape(-1, 1)  # number of timesteps

        # the Theano and Tensorflow CTC code use different methods to ensure
        # numerical stability.  The Theano code subtracts out the max
        # before the final log, so the results are different but scale
        # identically and still train properly
        expected_tf = [3.34211, 5.42262]
        expected_th = [1.73308, 3.81351]

        # TensorFlow returns one loss per row
        tf_loss = KTF.eval(KTF.ctc_batch_cost(
            KTF.variable(labels, dtype="int32"),
            KTF.variable(inputs, dtype="float32"),
            KTF.variable(input_lens, dtype="int32"),
            KTF.variable(label_lens, dtype="int32")))
        assert_allclose(tf_loss[:, 0], expected_tf, atol=1e-05)

        # Theano returns the losses along the second axis
        th_loss = KTH.eval(KTH.ctc_batch_cost(
            KTH.variable(labels, dtype="int32"),
            KTH.variable(inputs, dtype="float32"),
            KTH.variable(input_lens, dtype="int32"),
            KTH.variable(label_lens, dtype="int32")))
        assert_allclose(th_loss[0, :], expected_th, atol=1e-05)
    def test_one_hot(self):
        input_length = 10
        nb_classes = 20
@ -591,6 +633,5 @@ class TestBackend(object):
            koh = K.eval(K.one_hot(K.variable(indices, dtype='int32'), nb_classes))
            assert np.all(koh == oh)
 # allow running this test module directly as a script
 if __name__ == '__main__':
    pytest.main([__file__])