'''This example demonstrates the use of fasttext for text classification Based on Joulin et al's paper: Bags of Tricks for Efficient Text Classification https://arxiv.org/abs/1607.01759 Results on IMDB datasets with uni and bi-gram embeddings: Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 cpu. Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTX 980M gpu. ''' from __future__ import print_function import numpy as np np.random.seed(1337) # for reproducibility from keras.preprocessing import sequence from keras.models import Sequential from keras.layers import Dense from keras.layers import Embedding from keras.layers import GlobalAveragePooling1D from keras.datasets import imdb def create_ngram_set(input_list, ngram_value=2): """ Extract a set of n-grams from a list of integers. >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) {(4, 9), (4, 1), (1, 4), (9, 4)} >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] """ return set(zip(*[input_list[i:] for i in range(ngram_value)])) def add_ngram(sequences, token_indice, ngram_range=2): """ Augment the input list of list (sequences) by appending n-grams values. Example: adding bi-gram >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} >>> add_ngram(sequences, token_indice, ngram_range=2) [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] Example: adding tri-gram >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} >>> add_ngram(sequences, token_indice, ngram_range=3) [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]] """ new_sequences = [] for input_list in sequences: new_list = input_list[:] for i in range(len(new_list) - ngram_range + 1): for ngram_value in range(2, ngram_range + 1): ngram = tuple(new_list[i:i + ngram_value]) if ngram in token_indice: new_list.append(token_indice[ngram]) new_sequences.append(new_list) return new_sequences # Set parameters: # ngram_range = 2 will add bi-grams features ngram_range = 1 max_features = 20000 maxlen = 400 batch_size = 32 embedding_dims = 50 epochs = 5 print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) if ngram_range > 1: print('Adding {}-gram features'.format(ngram_range)) # Create set of unique n-gram from the training set. ngram_set = set() for input_list in X_train: for i in range(2, ngram_range + 1): set_of_ngram = create_ngram_set(input_list, ngram_value=i) ngram_set.update(set_of_ngram) # Dictionary mapping n-gram token to a unique integer. # Integer values are greater than max_features in order # to avoid collision with existing features. start_index = max_features + 1 token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} indice_token = {token_indice[k]: k for k in token_indice} # max_features is the highest integer that could be found in the dataset. max_features = np.max(list(indice_token.keys())) + 1 # Augmenting X_train and X_test with n-grams features X_train = add_ngram(X_train, token_indice, ngram_range) X_test = add_ngram(X_test, token_indice, ngram_range) print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) print('Pad sequences (samples x time)') X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) print('Build model...') model = Sequential() # we start off with an efficient embedding layer which maps # our vocab indices into embedding_dims dimensions model.add(Embedding(max_features, embedding_dims, input_length=maxlen)) # we add a GlobalAveragePooling1D, which will average the embeddings # of all words in the document model.add(GlobalAveragePooling1D()) # We project onto a single unit output layer, and squash it with a sigmoid: model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test))