"""IMDB sentiment classification dataset."""
|
|
|
|
import json
|
|
|
|
import numpy as np
|
|
|
|
from keras_core.api_export import keras_core_export
|
|
from keras_core.utils.file_utils import get_file
|
|
from keras_core.utils.python_utils import remove_long_seq
|
|
|
|
|
|
@keras_core_export("keras_core.datasets.imdb.load_data")
def load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
    **kwargs,
):
"""Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
|
|
|
|
This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment
|
|
(positive/negative). Reviews have been preprocessed, and each review is
|
|
encoded as a list of word indexes (integers).
|
|
For convenience, words are indexed by overall frequency in the dataset,
|
|
so that for instance the integer "3" encodes the 3rd most frequent word in
|
|
the data. This allows for quick filtering operations such as:
|
|
"only consider the top 10,000 most
|
|
common words, but eliminate the top 20 most common words".
|
|
|
|
As a convention, "0" does not stand for a specific word, but instead is used
|
|
to encode the pad token.
|
|
|
|
Args:
|
|
path: where to cache the data (relative to `~/.keras/dataset`).
|
|
num_words: integer or None. Words are
|
|
ranked by how often they occur (in the training set) and only
|
|
the `num_words` most frequent words are kept. Any less frequent word
|
|
will appear as `oov_char` value in the sequence data. If None,
|
|
all words are kept. Defaults to `None`.
|
|
skip_top: skip the top N most frequently occurring words
|
|
(which may not be informative). These words will appear as
|
|
`oov_char` value in the dataset. When 0, no words are
|
|
skipped. Defaults to `0`.
|
|
maxlen: int or None. Maximum sequence length.
|
|
Any longer sequence will be truncated. None, means no truncation.
|
|
Defaults to `None`.
|
|
seed: int. Seed for reproducible data shuffling.
|
|
start_char: int. The start of a sequence will be marked with this
|
|
character. 0 is usually the padding character. Defaults to `1`.
|
|
oov_char: int. The out-of-vocabulary character.
|
|
Words that were cut out because of the `num_words` or
|
|
`skip_top` limits will be replaced with this character.
|
|
index_from: int. Index actual words with this index and higher.
|
|
|
|
Returns:
|
|
Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
|
|
|
|
**`x_train`, `x_test`**: lists of sequences, which are lists of indexes
|
|
(integers). If the num_words argument was specific, the maximum
|
|
possible index value is `num_words - 1`. If the `maxlen` argument was
|
|
specified, the largest possible sequence length is `maxlen`.
|
|
|
|
**`y_train`, `y_test`**: lists of integer labels (1 or 0).
|
|
|
|
**Note**: The 'out of vocabulary' character is only used for
|
|
words that were present in the training set but are not included
|
|
because they're not making the `num_words` cut here.
|
|
Words that were not seen in the training set but are in the test set
|
|
have simply been skipped.
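
    Example:

    A minimal usage sketch, using the `keras_core.datasets.imdb` path under
    which this function is exported (sequence counts assume the default
    `maxlen=None`):

    ```python
    # Keep the 10,000 most frequent words; rarer words are replaced by
    # `oov_char` in the returned sequences.
    (x_train, y_train), (x_test, y_test) = keras_core.datasets.imdb.load_data(
        num_words=10000
    )
    # Each review is a list of word indices; each label is 0 (negative)
    # or 1 (positive).
    print(len(x_train), len(x_test))  # 25000 25000
    print(y_train[0])  # 0 or 1
    ```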
    """
    origin_folder = (
        "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
    )
    path = get_file(
        fname=path,
        origin=origin_folder + "imdb.npz",
        file_hash=(  # noqa: E501
            "69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f"
        ),
    )
    with np.load(path, allow_pickle=True) as f:
        x_train, labels_train = f["x_train"], f["y_train"]
        x_test, labels_test = f["x_test"], f["y_test"]

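    # Shuffle each split with a seeded RNG so that repeated calls with
    # the same `seed` produce the same ordering.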
    rng = np.random.RandomState(seed)
    indices = np.arange(len(x_train))
    rng.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    rng.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

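    # Shift every word index up by `index_from` (prepending `start_char`
    # when given) so that low indices remain reserved for special tokens.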
    if start_char is not None:
        x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
        x_test = [[start_char] + [w + index_from for w in x] for x in x_test]
    elif index_from:
        x_train = [[w + index_from for w in x] for x in x_train]
        x_test = [[w + index_from for w in x] for x in x_test]

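    # Drop sequences longer than `maxlen`, together with their labels.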
    if maxlen:
        x_train, labels_train = remove_long_seq(maxlen, x_train, labels_train)
        x_test, labels_test = remove_long_seq(maxlen, x_test, labels_test)
        if not x_train or not x_test:
            raise ValueError(
                "After filtering for sequences shorter than maxlen="
                f"{str(maxlen)}, no sequence was kept. Increase maxlen."
            )

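    # Recombine the splits so that the vocabulary filtering below is
    # applied uniformly to train and test data.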
    xs = x_train + x_test
    labels = np.concatenate([labels_train, labels_test])

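    # Without an explicit `num_words`, keep the full vocabulary.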
    if not num_words:
        num_words = max(max(x) for x in xs)

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        xs = [
            [w if (skip_top <= w < num_words) else oov_char for w in x]
            for x in xs
        ]
    else:
        xs = [[w for w in x if skip_top <= w < num_words] for x in xs]

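    # Split the filtered sequences back into the train and test partitions.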
    idx = len(x_train)
    x_train, y_train = np.array(xs[:idx], dtype="object"), labels[:idx]
    x_test, y_test = np.array(xs[idx:], dtype="object"), labels[idx:]
    return (x_train, y_train), (x_test, y_test)


@keras_core_export("keras_core.datasets.imdb.get_word_index")
def get_word_index(path="imdb_word_index.json"):
"""Retrieves a dict mapping words to their index in the IMDB dataset.
|
|
|
|
Args:
|
|
path: where to cache the data (relative to `~/.keras/dataset`).
|
|
|
|
Returns:
|
|
The word index dictionary. Keys are word strings, values are their
|
|
index.
|
|
|
|
Example:
|
|
|
|
```python
|
|
# Use the default parameters to keras.datasets.imdb.load_data
|
|
start_char = 1
|
|
oov_char = 2
|
|
index_from = 3
|
|
# Retrieve the training sequences.
|
|
(x_train, _), _ = keras.datasets.imdb.load_data(
|
|
start_char=start_char, oov_char=oov_char, index_from=index_from
|
|
)
|
|
# Retrieve the word index file mapping words to indices
|
|
word_index = keras.datasets.imdb.get_word_index()
|
|
# Reverse the word index to obtain a dict mapping indices to words
|
|
# And add `index_from` to indices to sync with `x_train`
|
|
inverted_word_index = dict(
|
|
(i + index_from, word) for (word, i) in word_index.items()
|
|
)
|
|
# Update `inverted_word_index` to include `start_char` and `oov_char`
|
|
inverted_word_index[start_char] = "[START]"
|
|
inverted_word_index[oov_char] = "[OOV]"
|
|
# Decode the first sequence in the dataset
|
|
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0])
|
|
```
|
|
"""
|
|
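    # Download the word-index JSON (or reuse a previously cached copy).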
    origin_folder = (
        "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
    )
    path = get_file(
        fname=path,
        origin=origin_folder + "imdb_word_index.json",
        file_hash="bfafd718b763782e994055a2d397834f",
    )
    with open(path) as f:
        return json.load(f)