fastText: adding n-gram embeddings for higher test_set accuracy (#3733)
* adding bi-gram embeddings for better test accuracy
* add arbitrary n-gram range
* fix typos
* fixing white spaces
* add comment
parent d9c4d8a76a
commit a375cb322f
@@ -5,8 +5,9 @@ Based on Joulin et al's paper:
 Bags of Tricks for Efficient Text Classification
 https://arxiv.org/abs/1607.01759
 
-Can achieve accuracy around 88% after 5 epochs in 70s.
-
+Results on IMDB datasets with uni and bi-gram embeddings:
+    Uni-gram: 0.8813 test accuracy after 5 epochs. 15s/epoch on i7 cpu.
+    Bi-gram : 0.9056 test accuracy after 5 epochs. 5s/epoch on GTX 1080 gpu.
 '''
 
 from __future__ import print_function
@@ -21,17 +22,87 @@ from keras.layers import AveragePooling1D
 from keras.datasets import imdb
 
 
-# set parameters:
+def create_ngram_set(input_list, ngram_value=2):
+    """
+    Extract a set of n-grams from a list of integers.
+
+    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
+    {(4, 9), (4, 1), (1, 4), (9, 4)}
+
+    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
+    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
+    """
+    return set(zip(*[input_list[i:] for i in range(ngram_value)]))
+
+
+def add_ngram(sequences, token_indice, ngram_range=2):
+    """
+    Augment the input list of lists (sequences) by appending n-gram values.
+
+    Example: adding bi-grams
+    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
+    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
+    >>> add_ngram(sequences, token_indice, ngram_range=2)
+    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
+
+    Example: adding tri-grams
+    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
+    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
+    >>> add_ngram(sequences, token_indice, ngram_range=3)
+    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
+    """
+    new_sequences = []
+    for input_list in sequences:
+        new_list = input_list[:]
+        for i in range(len(new_list) - ngram_range + 1):
+            for ngram_value in range(2, ngram_range + 1):
+                ngram = tuple(new_list[i:i + ngram_value])
+                if ngram in token_indice:
+                    new_list.append(token_indice[ngram])
+        new_sequences.append(new_list)
+
+    return new_sequences
+
+# Set parameters:
+# ngram_range = 2 will add bi-gram features
+ngram_range = 1
 max_features = 20000
 maxlen = 400
 batch_size = 32
-embedding_dims = 20
+embedding_dims = 50
 nb_epoch = 5
 
 print('Loading data...')
 (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
 print(len(X_train), 'train sequences')
 print(len(X_test), 'test sequences')
+print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
+print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))
+
+if ngram_range > 1:
+    print('Adding {}-gram features'.format(ngram_range))
+    # Create a set of unique n-grams from the training set.
+    ngram_set = set()
+    for input_list in X_train:
+        for i in range(2, ngram_range + 1):
+            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
+            ngram_set.update(set_of_ngram)
+
+    # Dictionary mapping n-gram token to a unique integer.
+    # Integer values are greater than max_features in order
+    # to avoid collision with existing features.
+    start_index = max_features + 1
+    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
+    indice_token = {token_indice[k]: k for k in token_indice}
+
+    # max_features is the highest integer that could be found in the dataset.
+    max_features = np.max(list(indice_token.keys())) + 1
+
+    # Augmenting X_train and X_test with n-gram features.
+    X_train = add_ngram(X_train, token_indice, ngram_range)
+    X_test = add_ngram(X_test, token_indice, ngram_range)
+    print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
+    print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))
 
 print('Pad sequences (samples x time)')
 X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
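
For readers who want to try the new helpers outside the example, the following is a small self-contained sketch that mirrors the if ngram_range > 1 branch above on a toy corpus. The helper bodies are copied from the diff (docstrings omitted); the toy sequences, the max_features_toy value, and all names ending in _toy are made up for illustration.

def create_ngram_set(input_list, ngram_value=2):
    # Copied from the diff above: sliding window of length ngram_value.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    # Copied from the diff above: append known n-gram ids to each sequence.
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences


# Toy, already integer-encoded "reviews" (values are illustrative only).
x_train_toy = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
max_features_toy = 10  # pretend uni-gram vocabulary size
ngram_range = 2        # add bi-grams, as the example does when ngram_range = 2

# Collect every bi-gram seen in the toy training set.
ngram_set = set()
for seq in x_train_toy:
    for n in range(2, ngram_range + 1):
        ngram_set.update(create_ngram_set(seq, ngram_value=n))

# Give each n-gram an integer id above max_features_toy so the new ids
# cannot collide with existing word ids (same idea as start_index above).
start_index = max_features_toy + 1
token_indice = {ngram: idx + start_index for idx, ngram in enumerate(ngram_set)}

# Each sequence now ends with the ids of the bi-grams it contains, so the
# Embedding layer learns a vector per bi-gram in addition to one per word.
print(add_ngram(x_train_toy, token_indice, ngram_range))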
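
The diff view is truncated after the padding step. For orientation only, here is a hedged sketch of the kind of fastText-style model the remainder of imdb_fasttext.py trains: an embedding over word and n-gram ids, averaged over time, feeding a single sigmoid unit. It reuses the variables defined above and the Keras 1.x-era API; the GlobalAveragePooling1D layer, the optimizer, and the fit arguments are illustrative assumptions, not the commit's exact lines (the script at this revision imports AveragePooling1D instead, as the hunk header shows).

# Sketch only -- the rest of the script is not shown in this diff view.
# Reuses max_features, embedding_dims, maxlen, batch_size, nb_epoch,
# X_train, y_train, X_test, y_test from the code above.
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

# X_test is padded the same way as X_train.
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

model = Sequential()
# One embedding vector per token id, covering word ids and the new n-gram ids.
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# Average the embeddings over the sequence; GlobalAveragePooling1D is a
# stand-in here for the pooling the example performs.
model.add(GlobalAveragePooling1D())
# Single sigmoid unit for the binary IMDB sentiment label.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=(X_test, y_test))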