From f1a95869ebad98db11aba463e7dab031de6dcba0 Mon Sep 17 00:00:00 2001
From: Minkoo Seo
Date: Tue, 7 Feb 2017 03:53:33 +0900
Subject: [PATCH] Fix off-by-one error in WE example script

Tokenizer returns sequence values in the range [0, nb_words). In this
example, MAX_NB_WORDS is 20000 and the data's maximum value is 19999.
There is no need to use 'nb_words + 1'.
---
 examples/pretrained_word_embeddings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/pretrained_word_embeddings.py b/examples/pretrained_word_embeddings.py
index a58af7eaf..b70e94540 100644
--- a/examples/pretrained_word_embeddings.py
+++ b/examples/pretrained_word_embeddings.py
@@ -102,9 +102,9 @@ print('Preparing embedding matrix.')
 
 # prepare embedding matrix
 nb_words = min(MAX_NB_WORDS, len(word_index))
-embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
+embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
 for word, i in word_index.items():
-    if i > MAX_NB_WORDS:
+    if i >= MAX_NB_WORDS:
         continue
     embedding_vector = embeddings_index.get(word)
     if embedding_vector is not None:
@@ -113,7 +113,7 @@ for word, i in word_index.items():
 
 # load pre-trained word embeddings into an Embedding layer
 # note that we set trainable = False so as to keep the embeddings fixed
-embedding_layer = Embedding(nb_words + 1,
+embedding_layer = Embedding(nb_words,
                             EMBEDDING_DIM,
                             weights=[embedding_matrix],
                             input_length=MAX_SEQUENCE_LENGTH,
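
For context (not part of the patch): a minimal sketch of the Tokenizer behaviour the commit message relies on. The toy corpus and the small MAX_NB_WORDS cap are made up for illustration; the constructor argument is nb_words in the Keras release this patch targets and num_words in later releases.

from keras.preprocessing.text import Tokenizer

MAX_NB_WORDS = 5  # tiny cap, illustration only
texts = ['the cat sat on the mat', 'the dog sat on the log']  # toy corpus

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)  # num_words= in newer Keras
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Word indices start at 1 and any index >= MAX_NB_WORDS is dropped from the
# sequences, so an embedding matrix with MAX_NB_WORDS rows (row 0 reserved
# for padding) is large enough and no '+ 1' row is needed.
print(max(i for seq in sequences for i in seq))  # at most MAX_NB_WORDS - 1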