'''This example demonstrates the use of fastText for text classification.

Based on Joulin et al.'s paper:

Bags of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759

Results on the IMDB dataset with uni- and bi-gram embeddings:
    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 CPU.
    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTX 980M GPU.
'''

from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    # zip the list against shifted copies of itself to form the n-gram tuples.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram values.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences


# Set parameters:
# ngram_range = 2 will add bi-gram features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
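
# Note (added for illustration): each review is already a list of word
# indices, e.g. x_train[0] typically begins [1, 14, 22, 16, ...], where
# index 1 is the default start-of-sequence marker in the Keras IMDB encoding.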

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create a set of unique n-grams from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}
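
    # Illustrative example with hypothetical values: if ngram_set were
    # {(42, 7), (7, 99)} with max_features = 20000, token_indice could be
    # {(42, 7): 20001, (7, 99): 20002} (set iteration order is arbitrary).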

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-gram features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
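
# Note (added for illustration): after padding/truncation both arrays are 2D
# (num_samples x maxlen); with the stock IMDB split that is (25000, 400),
# zero-padded at the front, which is the pad_sequences default.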

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid'))
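
# Optional sanity check (not part of the original example): print the
# architecture. With the default uni-gram settings the Embedding layer holds
# 20000 * 50 = 1,000,000 weights and the Dense layer adds 51 (50 + bias).
model.summary()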

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
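
# A short follow-up sketch (not part of the original example): report the
# final held-out performance with the standard Keras evaluate API.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)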