diff --git a/examples/imdb_fasttext.py b/examples/imdb_fasttext.py
index 84c075198..9ee13a626 100644
--- a/examples/imdb_fasttext.py
+++ b/examples/imdb_fasttext.py
@@ -5,8 +5,9 @@ Based on Joulin et al's paper:
 Bags of Tricks for Efficient Text Classification
 https://arxiv.org/abs/1607.01759
 
-Can achieve accuracy around 88% after 5 epochs in 70s.
-
+Results on the IMDB dataset with uni- and bi-gram embeddings:
+    Uni-gram: 0.8813 test accuracy after 5 epochs. 15s/epoch on i7 CPU.
+    Bi-gram : 0.9056 test accuracy after 5 epochs. 5s/epoch on GTX 1080 GPU.
 '''
 
 from __future__ import print_function
@@ -21,17 +22,87 @@ from keras.layers import AveragePooling1D
 from keras.datasets import imdb
 
 
-# set parameters:
+def create_ngram_set(input_list, ngram_value=2):
+    """
+    Extract a set of n-grams from a list of integers.
+
+    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
+    {(4, 9), (4, 1), (1, 4), (9, 4)}
+
+    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
+    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
+    """
+    return set(zip(*[input_list[i:] for i in range(ngram_value)]))
+
+
+def add_ngram(sequences, token_indice, ngram_range=2):
+    """
+    Augment the input list of lists (sequences) by appending n-gram values.
+
+    Example: adding bi-gram
+    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
+    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
+    >>> add_ngram(sequences, token_indice, ngram_range=2)
+    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
+
+    Example: adding tri-gram
+    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
+    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
+    >>> add_ngram(sequences, token_indice, ngram_range=3)
+    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
+    """
+    new_sequences = []
+    for input_list in sequences:
+        new_list = input_list[:]
+        # Loop over n-gram sizes first and positions second; the reverse
+        # order would silently skip trailing n-grams when ngram_range > 2.
+        for ngram_value in range(2, ngram_range + 1):
+            for i in range(len(new_list) - ngram_value + 1):
+                ngram = tuple(new_list[i:i + ngram_value])
+                if ngram in token_indice:
+                    new_list.append(token_indice[ngram])
+        new_sequences.append(new_list)
+
+    return new_sequences
+
+
+# Set parameters:
+# ngram_range = 2 will add bi-gram features
+ngram_range = 1
 max_features = 20000
 maxlen = 400
 batch_size = 32
-embedding_dims = 20
+embedding_dims = 50
 nb_epoch = 5
 
 print('Loading data...')
 (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
 print(len(X_train), 'train sequences')
 print(len(X_test), 'test sequences')
+print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
+print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))
+
+if ngram_range > 1:
+    print('Adding {}-gram features'.format(ngram_range))
+    # Create a set of unique n-grams from the training set.
+    ngram_set = set()
+    for input_list in X_train:
+        for i in range(2, ngram_range + 1):
+            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
+            ngram_set.update(set_of_ngram)
+
+    # Dictionary mapping each n-gram token to a unique integer.
+    # Integer values are greater than max_features in order
+    # to avoid collision with existing features.
+    start_index = max_features + 1
+    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
+    indice_token = {token_indice[k]: k for k in token_indice}
+
+    # max_features is now the highest integer that can appear in the dataset.
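+    # (With bi-grams over a 20000-word vocabulary this can grow into the
+    # millions of features, so the Embedding layer grows accordingly.)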
+    max_features = np.max(list(indice_token.keys())) + 1
+
+    # Augment X_train and X_test with the n-gram features.
+    X_train = add_ngram(X_train, token_indice, ngram_range)
+    X_test = add_ngram(X_test, token_indice, ngram_range)
+    print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
+    print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))
 
 print('Pad sequences (samples x time)')
 X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
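
Reviewer note, appended after the patch: a minimal, self-contained sketch of how
create_ngram_set and add_ngram compose, runnable on its own. The toy vocabulary
size, the toy sequences, and the ids shown in the final comment are illustrative
assumptions rather than values from the IMDB data; the function bodies mirror
the ones added in the patch.

    # --- usage sketch (not part of the patch) ---
    from __future__ import print_function


    def create_ngram_set(input_list, ngram_value=2):
        # Slide a window of length `ngram_value` over the token list.
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))


    def add_ngram(sequences, token_indice, ngram_range=2):
        # Append the integer id of every known 2..ngram_range-gram occurrence.
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(new_list) - ngram_value + 1):
                    ngram = tuple(new_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)
        return new_sequences


    max_features = 10  # hypothetical uni-gram vocabulary size, not the real 20000
    X_train = [[1, 3, 4], [1, 3, 5]]  # two toy "documents"

    # Collect every bi-gram seen in training and give it an id above
    # max_features, exactly as the patch does for the real IMDB sequences.
    ngram_set = set()
    for seq in X_train:
        ngram_set.update(create_ngram_set(seq, ngram_value=2))
    token_indice = {ng: i + max_features + 1 for i, ng in enumerate(ngram_set)}

    print(add_ngram(X_train, token_indice, ngram_range=2))
    # Prints something like [[1, 3, 4, 11, 13], [1, 3, 5, 11, 12]]: each sequence
    # gains one id per known bi-gram occurrence; exact ids depend on set order.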