'''This example demonstrates the use of fastText for text classification.

Based on Joulin et al.'s paper:

Bags of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759

Results on the IMDB dataset with uni- and bi-gram embeddings:
    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 CPU.
    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTX 980M GPU.
'''

from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    # zip the list against shifted copies of itself to form the n-gram tuples.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram values.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences


# Set parameters:
# ngram_range = 2 will add bi-gram features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
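
# Note (added for illustration): each review is already a list of word
# indices, e.g. x_train[0] typically begins [1, 14, 22, 16, ...], where
# index 1 is the default start-of-sequence marker in the Keras IMDB encoding.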

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create a set of unique n-grams from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}
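
    # Illustrative example with hypothetical values: if ngram_set were
    # {(42, 7), (7, 99)} with max_features = 20000, token_indice could be
    # {(42, 7): 20001, (7, 99): 20002} (set iteration order is arbitrary).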

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-gram features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
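
# Note (added for illustration): after padding/truncation both arrays are 2D
# (num_samples x maxlen); with the stock IMDB split that is (25000, 400),
# zero-padded at the front, which is the pad_sequences default.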

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid'))
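
# Optional sanity check (not part of the original example): print the
# architecture. With the default uni-gram settings the Embedding layer holds
# 20000 * 50 = 1,000,000 weights and the Dense layer adds 51 (50 + bias).
model.summary()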

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
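
# A short follow-up sketch (not part of the original example): report the
# final held-out performance with the standard Keras evaluate API.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)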