Minor backwards compat fixes

Francois Chollet 2023-06-21 21:44:47 -07:00
parent 58a870eba8
commit e9d0e84281
10 changed files with 147 additions and 565 deletions

@@ -1,291 +0,0 @@
"""
Title: Image classification with Vision Transformer
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
Date created: 2021/01/18
Last modified: 2021/01/18
Description: Implementing the Vision Transformer (ViT) model for image classification.
Accelerator: GPU
"""
"""
## Introduction
This example implements the [Vision Transformer (ViT)](https://arxiv.org/abs/2010.11929)
model by Alexey Dosovitskiy et al. for image classification,
and demonstrates it on the CIFAR-100 dataset.
The ViT model applies the Transformer architecture with self-attention to sequences of
image patches, without using convolution layers.
"""
"""
## Setup
"""
import numpy as np
import tensorflow as tf
import keras_core as keras
from keras_core import layers
"""
## Prepare the data
"""
num_classes = 100
input_shape = (32, 32, 3)
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()
print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")
"""
## Configure the hyperparameters
"""
learning_rate = 0.001
weight_decay = 0.0001
batch_size = 256
num_epochs = 100
image_size = 72 # We'll resize input images to this size
patch_size = 6 # Size of the patches to be extracted from the input images
num_patches = (image_size // patch_size) ** 2
projection_dim = 64
num_heads = 4
transformer_units = [
projection_dim * 2,
projection_dim,
] # Size of the transformer layers
transformer_layers = 8
mlp_head_units = [2048, 1024] # Size of the dense layers of the final classifier
"""
## Use data augmentation
"""
data_augmentation = keras.Sequential(
[
layers.Normalization(),
layers.Resizing(image_size, image_size),
layers.RandomFlip("horizontal"),
layers.RandomRotation(factor=0.02),
layers.RandomZoom(height_factor=0.2, width_factor=0.2),
],
name="data_augmentation",
)
# Compute the mean and the variance of the training data for normalization.
data_augmentation.layers[0].adapt(x_train)
"""
## Implement multilayer perceptron (MLP)
"""
def mlp(x, hidden_units, dropout_rate):
for units in hidden_units:
x = layers.Dense(units, activation=tf.nn.gelu)(x)
x = layers.Dropout(dropout_rate)(x)
return x
"""
## Implement patch creation as a layer
"""
class Patches(layers.Layer):
def __init__(self, patch_size):
super().__init__()
self.patch_size = patch_size
def call(self, images):
batch_size = tf.shape(images)[0]
patches = tf.image.extract_patches(
images=images,
sizes=[1, self.patch_size, self.patch_size, 1],
strides=[1, self.patch_size, self.patch_size, 1],
rates=[1, 1, 1, 1],
padding="VALID",
)
patch_dims = patches.shape[-1]
patches = tf.reshape(patches, [batch_size, -1, patch_dims])
return patches
"""
Let's display patches for a sample image
"""
import matplotlib.pyplot as plt
plt.figure(figsize=(4, 4))
image = x_train[np.random.choice(range(x_train.shape[0]))]
plt.imshow(image.astype("uint8"))
plt.axis("off")
resized_image = tf.image.resize(
tf.convert_to_tensor([image]), size=(image_size, image_size)
)
patches = Patches(patch_size)(resized_image)
print(f"Image size: {image_size} X {image_size}")
print(f"Patch size: {patch_size} X {patch_size}")
print(f"Patches per image: {patches.shape[1]}")
print(f"Elements per patch: {patches.shape[-1]}")
n = int(np.sqrt(patches.shape[1]))
plt.figure(figsize=(4, 4))
for i, patch in enumerate(patches[0]):
ax = plt.subplot(n, n, i + 1)
patch_img = tf.reshape(patch, (patch_size, patch_size, 3))
plt.imshow(patch_img.numpy().astype("uint8"))
plt.axis("off")
"""
## Implement the patch encoding layer
The `PatchEncoder` layer will linearly transform a patch by projecting it into a
vector of size `projection_dim`. In addition, it adds a learnable position
embedding to the projected vector.
"""
class PatchEncoder(layers.Layer):
def __init__(self, num_patches, projection_dim):
super().__init__()
self.num_patches = num_patches
self.projection = layers.Dense(units=projection_dim)
self.position_embedding = layers.Embedding(
input_dim=num_patches, output_dim=projection_dim
)
def call(self, patch):
positions = tf.range(start=0, limit=self.num_patches, delta=1)
encoded = self.projection(patch) + self.position_embedding(positions)
return encoded
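"""
As a quick, illustrative sanity check (not required for the model below), encoding the
patches extracted from the sample image above should yield a
`(1, num_patches, projection_dim)` tensor.
"""
encoded_patches_demo = PatchEncoder(num_patches, projection_dim)(patches)
print(f"Encoded patches shape: {encoded_patches_demo.shape}")  # e.g. (1, 144, 64)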
"""
## Build the ViT model
The ViT model consists of multiple Transformer blocks,
which use the `layers.MultiHeadAttention` layer as a self-attention mechanism
applied to the sequence of patches. The Transformer blocks produce a
`[batch_size, num_patches, projection_dim]` tensor, which is processed by a
classifier head with softmax to produce the final class probabilities.
Unlike the technique described in the [paper](https://arxiv.org/abs/2010.11929),
which prepends a learnable embedding to the sequence of encoded patches to serve
as the image representation, all the outputs of the final Transformer block are
reshaped with `layers.Flatten()` and used as the image
representation input to the classifier head.
Note that the `layers.GlobalAveragePooling1D` layer
could also be used instead to aggregate the outputs of the Transformer block,
especially when the number of patches and the projection dimensions are large.
"""
def create_vit_classifier():
inputs = layers.Input(shape=input_shape)
# Augment data.
augmented = data_augmentation(inputs)
# Create patches.
patches = Patches(patch_size)(augmented)
# Encode patches.
encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)
# Create multiple layers of the Transformer block.
for _ in range(transformer_layers):
# Layer normalization 1.
x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
# Create a multi-head attention layer.
attention_output = layers.MultiHeadAttention(
num_heads=num_heads, key_dim=projection_dim, dropout=0.1
)(x1, x1)
# Skip connection 1.
x2 = layers.Add()([attention_output, encoded_patches])
# Layer normalization 2.
x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
# MLP.
x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
# Skip connection 2.
encoded_patches = layers.Add()([x3, x2])
# Flatten into a [batch_size, num_patches * projection_dim] representation tensor.
representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
representation = layers.Flatten()(representation)
representation = layers.Dropout(0.5)(representation)
# Add MLP.
features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
# Classify outputs.
logits = layers.Dense(num_classes)(features)
# Create the Keras model.
model = keras.Model(inputs=inputs, outputs=logits)
return model
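"""
As noted above, a `layers.GlobalAveragePooling1D` layer could replace `layers.Flatten`
as the aggregation step. Here is a minimal, illustrative comparison of the two options
on a symbolic patch tensor (the model above keeps `Flatten`):
"""
demo_patches = keras.Input(shape=(num_patches, projection_dim))
print("Flatten:", layers.Flatten()(demo_patches).shape)  # (None, num_patches * projection_dim)
print("GlobalAveragePooling1D:", layers.GlobalAveragePooling1D()(demo_patches).shape)  # (None, projection_dim)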
"""
## Compile, train, and evaluate the model
"""
def run_experiment(model):
optimizer = keras.optimizers.AdamW(
learning_rate=learning_rate, weight_decay=weight_decay
)
model.compile(
optimizer=optimizer,
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[
keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
],
)
checkpoint_filepath = "/tmp/checkpoint"
checkpoint_callback = keras.callbacks.ModelCheckpoint(
checkpoint_filepath,
monitor="val_accuracy",
save_best_only=True,
save_weights_only=True,
)
history = model.fit(
x=x_train,
y=y_train,
batch_size=batch_size,
epochs=num_epochs,
validation_split=0.1,
callbacks=[checkpoint_callback],
)
model.load_weights(checkpoint_filepath)
_, accuracy, top_5_accuracy = model.evaluate(x_test, y_test)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%")
return history
vit_classifier = create_vit_classifier()
history = run_experiment(vit_classifier)
"""
After 100 epochs, the ViT model achieves around 55% accuracy and
82% top-5 accuracy on the test data. These are not competitive results on the CIFAR-100 dataset,
as a ResNet50V2 trained from scratch on the same data can achieve 67% accuracy.
Note that the state of the art results reported in the
[paper](https://arxiv.org/abs/2010.11929) are achieved by pre-training the ViT model using
the JFT-300M dataset, then fine-tuning it on the target dataset. To improve the model quality
without pre-training, you can try to train the model for more epochs, use a larger number of
Transformer layers, resize the input images, change the patch size, or increase the projection dimensions.
Besides, as mentioned in the paper, the quality of the model is affected not only by architecture choices,
but also by parameters such as the learning rate schedule, optimizer, weight decay, etc.
In practice, it's recommended to fine-tune a ViT model
that was pre-trained using a large, high-resolution dataset.
"""

@@ -1,229 +0,0 @@
"""
Title: MixUp augmentation for image classification
Author: [Sayak Paul](https://twitter.com/RisingSayak)
Date created: 2021/03/06
Last modified: 2021/03/06
Description: Data augmentation using the mixup technique for image classification.
Accelerator: GPU
"""
"""
## Introduction
"""
"""
_mixup_ is a *domain-agnostic* data augmentation technique proposed in [mixup: Beyond Empirical Risk Minimization](https://arxiv.org/abs/1710.09412)
by Zhang et al. It's implemented with the following formulas:
![](https://i.ibb.co/DRyHYww/image.png)
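Written out in the notation used later in this example, these formulas are
`new_x = lambda * x1 + (1 - lambda) * x2` and `new_y = lambda * y1 + (1 - lambda) * y2`,
where `(x1, y1)` and `(x2, y2)` are two examples drawn at random from the training data.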
(Note that the lambda values lie within the [0, 1] range and are sampled from the
[Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution).)
The technique is quite systematically named. We are literally mixing up the features and
their corresponding labels. Implementation-wise it's simple. Neural networks are prone
to [memorizing corrupt labels](https://arxiv.org/abs/1611.03530). mixup relaxes this by
combining different features with one another (the same happens for the labels too) so that
a network does not get overconfident about the relationship between the features and
their labels.
mixup is especially useful when we are not sure about selecting a set of augmentation
transforms for a given dataset (medical imaging datasets, for example). mixup can be
extended to a variety of data modalities such as computer vision, natural language
processing, speech, and so on.
"""
"""
## Setup
"""
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras_core import layers
import keras_core as keras
"""
## Prepare the dataset
In this example, we will be using the [FashionMNIST](https://github.com/zalandoresearch/fashion-mnist) dataset. But this same recipe can
be used for other classification datasets as well.
"""
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
x_train = x_train.astype("float32") / 255.0
x_train = np.reshape(x_train, (-1, 28, 28, 1))
y_train = tf.one_hot(y_train, 10)
x_test = x_test.astype("float32") / 255.0
x_test = np.reshape(x_test, (-1, 28, 28, 1))
y_test = tf.one_hot(y_test, 10)
"""
## Define hyperparameters
"""
AUTO = tf.data.AUTOTUNE
BATCH_SIZE = 64
EPOCHS = 10
"""
## Convert the data into TensorFlow `Dataset` objects
"""
# Put aside a few samples to create our validation set
val_samples = 2000
x_val, y_val = x_train[:val_samples], y_train[:val_samples]
new_x_train, new_y_train = x_train[val_samples:], y_train[val_samples:]
train_ds_one = (
tf.data.Dataset.from_tensor_slices((new_x_train, new_y_train))
.shuffle(BATCH_SIZE * 100)
.batch(BATCH_SIZE)
)
train_ds_two = (
tf.data.Dataset.from_tensor_slices((new_x_train, new_y_train))
.shuffle(BATCH_SIZE * 100)
.batch(BATCH_SIZE)
)
# Because we will be mixing up the images and their corresponding labels, we will be
# combining two shuffled datasets from the same training data.
train_ds = tf.data.Dataset.zip((train_ds_one, train_ds_two))
val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(BATCH_SIZE)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(BATCH_SIZE)
"""
## Define the mixup technique function
To perform the mixup routine, we create new virtual datasets using the training data from
the same dataset, and apply a lambda value within the [0, 1] range sampled from a [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution)
such that, for example, `new_x = lambda * x1 + (1 - lambda) * x2` (where
`x1` and `x2` are images) and the same equation is applied to the labels as well.
"""
def sample_beta_distribution(size, concentration_0=0.2, concentration_1=0.2):
gamma_1_sample = tf.random.gamma(shape=[size], alpha=concentration_1)
gamma_2_sample = tf.random.gamma(shape=[size], alpha=concentration_0)
return gamma_1_sample / (gamma_1_sample + gamma_2_sample)
def mix_up(ds_one, ds_two, alpha=0.2):
# Unpack two datasets
images_one, labels_one = ds_one
images_two, labels_two = ds_two
batch_size = tf.shape(images_one)[0]
# Sample lambda and reshape it to do the mixup
l = sample_beta_distribution(batch_size, alpha, alpha)
x_l = tf.reshape(l, (batch_size, 1, 1, 1))
y_l = tf.reshape(l, (batch_size, 1))
# Perform mixup on both images and labels by combining a pair of images/labels
# (one from each dataset) into one image/label
images = images_one * x_l + images_two * (1 - x_l)
labels = labels_one * y_l + labels_two * (1 - y_l)
return (images, labels)
"""
**Note** that here, we are combining two images to create a single one. Theoretically,
we can combine as many as we want, but that comes at an increased computation cost. In
certain cases, it may not help improve performance either.
"""
"""
## Visualize the new augmented dataset
"""
# First create the new dataset using our `mix_up` utility
train_ds_mu = train_ds.map(
lambda ds_one, ds_two: mix_up(ds_one, ds_two, alpha=0.2), num_parallel_calls=AUTO
)
# Let's preview 9 samples from the dataset
sample_images, sample_labels = next(iter(train_ds_mu))
plt.figure(figsize=(10, 10))
for i, (image, label) in enumerate(zip(sample_images[:9], sample_labels[:9])):
ax = plt.subplot(3, 3, i + 1)
plt.imshow(image.numpy().squeeze())
print(label.numpy().tolist())
plt.axis("off")
"""
## Model building
"""
def get_training_model():
model = keras.Sequential(
[
layers.Conv2D(16, (5, 5), activation="relu", input_shape=(28, 28, 1)),
layers.MaxPooling2D(pool_size=(2, 2)),
layers.Conv2D(32, (5, 5), activation="relu"),
layers.MaxPooling2D(pool_size=(2, 2)),
layers.Dropout(0.2),
layers.GlobalAveragePooling2D(),
layers.Dense(128, activation="relu"),
layers.Dense(10, activation="softmax"),
]
)
return model
"""
For the sake of reproducibility, we serialize the initial random weights of our shallow
network.
"""
initial_model = get_training_model()
initial_model.save_weights("initial_weights.weights.h5")
"""
## 1. Train the model with the mixed up dataset
"""
model = get_training_model()
model.load_weights("initial_weights.weights.h5")
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_ds_mu, validation_data=val_ds, epochs=EPOCHS)
_, test_acc = model.evaluate(test_ds)
print("Test accuracy: {:.2f}%".format(test_acc * 100))
"""
## 2. Train the model *without* the mixed up dataset
"""
model = get_training_model()
model.load_weights("initial_weights.weights.h5")
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# Notice that we are NOT using the mixed up dataset here
model.fit(train_ds_one, validation_data=val_ds, epochs=EPOCHS)
_, test_acc = model.evaluate(test_ds)
print("Test accuracy: {:.2f}%".format(test_acc * 100))
"""
Readers are encouraged to try out mixup on different datasets from different domains and
experiment with the lambda parameter. You are strongly advised to check out the
[original paper](https://arxiv.org/abs/1710.09412) as well - the authors present several ablation studies on mixup
showing how it can improve generalization, as well as their results on combining
more than two images to create a single one.
"""
"""
## Notes
* With mixup, you can create synthetic examples without incurring high computational
costs, which is especially helpful when you lack a large dataset.
* [Label smoothing](https://www.pyimagesearch.com/2019/12/30/label-smoothing-with-keras-tensorflow-and-deep-learning/) and mixup usually do not work well together because label smoothing
already modifies the hard labels by some factor.
* mixup does not work well when you are using [Supervised Contrastive
Learning](https://arxiv.org/abs/2004.11362) (SCL) since SCL expects the true labels
during its pre-training phase.
* A few other benefits of mixup include (as described in the [paper](https://arxiv.org/abs/1710.09412)) robustness to
adversarial examples and stabilized GAN (Generative Adversarial Networks) training.
* There are a number of data augmentation techniques that extend mixup such as
[CutMix](https://arxiv.org/abs/1905.04899) and [AugMix](https://arxiv.org/abs/1912.02781).
"""

@@ -30,9 +30,8 @@ def categorical(logits, num_samples, dtype="int32", seed=None):
logits = convert_to_tensor(logits)
dtype = to_torch_dtype(dtype)
generator = torch_seed_generator(seed, device=get_device())
probs = torch.softmax(logits, dim=-1)
return torch.multinomial(
probs,
logits,
num_samples,
replacement=True,
generator=generator,

@@ -77,65 +77,81 @@ class CallbackList(Callback):
callback.set_model(model)
def on_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_batch_begin(batch, logs=logs)
def on_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_batch_end(batch, logs=logs)
def on_epoch_begin(self, epoch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_epoch_begin(epoch, logs)
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_epoch_end(epoch, logs)
def on_train_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_batch_begin(batch, logs=logs)
def on_train_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_batch_end(batch, logs=logs)
def on_test_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_batch_begin(batch, logs=logs)
def on_test_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_batch_end(batch, logs=logs)
def on_predict_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_batch_begin(batch, logs=logs)
def on_predict_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_batch_end(batch, logs=logs)
def on_train_begin(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_begin(logs)
def on_train_end(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_end(logs)
def on_test_begin(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_begin(logs)
def on_test_end(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_end(logs)
def on_predict_begin(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_begin(logs)
def on_predict_end(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_end(logs)

@@ -82,6 +82,7 @@ class Dense(Layer):
self.bias_constraint = constraints.get(bias_constraint)
self.input_spec = InputSpec(min_ndim=2)
self.supports_masking = True
def build(self, input_shape):
input_dim = input_shape[-1]

@@ -248,6 +248,9 @@ class Layer(BackendLayer, Operation):
self._trainable = trainable
self._losses = []
self._call_signature_parameters = [
p.name for p in inspect.signature(self.call).parameters.values()
]
self._supports_masking = not utils.is_default(self.compute_mask)
# Whether to automatically convert (+ auto-cast) inputs to `call()`.
self._convert_input_args = True
@@ -255,9 +258,6 @@ class Layer(BackendLayer, Operation):
self._allow_non_tensor_positional_args = False
# Dict of shapes that were used to call `build()`.
self._build_shapes_dict = None
self._call_signature_parameters = [
p.name for p in inspect.signature(self.call).parameters.values()
]
self._initializer_tracker()
@tracking.no_automatic_dependency_tracking
@@ -606,31 +606,27 @@ class Layer(BackendLayer, Operation):
##############################
# 6. Populate mask argument(s)
if self.supports_masking:
if len(call_spec.tensor_arguments_dict) == 1:
if (
"mask" in call_spec.argument_names
and call_spec.arguments_dict["mask"] is None
):
arg_name = list(call_spec.tensor_arguments_dict.keys())[0]
only_tensor_arg = call_spec.tensor_arguments_dict[arg_name]
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None),
only_tensor_arg,
)
kwargs["mask"] = mask
elif len(call_spec.tensor_arguments_dict) > 1:
for k, v in call_spec.tensor_arguments_dict.items():
expected_mask_arg_name = f"{k}_mask"
if expected_mask_arg_name in call_spec.argument_names:
if (
call_spec.arguments_dict[expected_mask_arg_name]
is None
):
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None), v
)
kwargs[expected_mask_arg_name] = mask
if len(call_spec.tensor_arguments_dict) == 1:
if (
"mask" in call_spec.argument_names
and call_spec.arguments_dict["mask"] is None
):
arg_name = list(call_spec.tensor_arguments_dict.keys())[0]
only_tensor_arg = call_spec.tensor_arguments_dict[arg_name]
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None),
only_tensor_arg,
)
kwargs["mask"] = mask
elif len(call_spec.tensor_arguments_dict) > 1:
for k, v in call_spec.tensor_arguments_dict.items():
expected_mask_arg_name = f"{k}_mask"
if expected_mask_arg_name in call_spec.argument_names:
if call_spec.arguments_dict[expected_mask_arg_name] is None:
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None), v
)
kwargs[expected_mask_arg_name] = mask
####################
# 7. Call the layer.
@@ -651,16 +647,22 @@ class Layer(BackendLayer, Operation):
if backend.is_tensor(output):
self.add_loss(self.activity_regularizer(output))
# Set masks on outputs,
# provided only the first positional input arg and its mask.
# TODO: consider extending this to all args and kwargs.
previous_mask = getattr(call_spec.first_arg, "_keras_mask", None)
if self.supports_masking:
# Set masks on outputs,
# provided only the first positional input arg and its mask.
# TODO: consider extending this to all args and kwargs.
previous_mask = getattr(
call_spec.first_arg, "_keras_mask", None
)
self._set_mask_metadata(
call_spec.first_arg, outputs, previous_mask
)
elif previous_mask is not None:
warnings.warn(
f"Layer '{self.name}' (of type {self.__class__.__name__}) "
"was passed an input with a mask attached to it. "
"However, this layer does not support masking and will "
"therefore destroy the mask information. Downstream "
"layers will not see the mask."
)
finally:
# Destroy call context if we created it
self._maybe_reset_call_context()

@@ -11,8 +11,6 @@ convert_to_tensor
convert_to_numpy
"""
import numpy as np
from keras_core import backend
from keras_core.api_export import keras_core_export
from keras_core.backend import KerasTensor
@@ -298,7 +296,9 @@ def convert_to_tensor(x, dtype=None):
def convert_to_numpy(x):
"""Convert a tensor to a NumPy array."""
if any_symbolic_tensors((x,)):
# This will raise a `ValueError` defined in the `KerasTensor` class. We
# trigger it rather than duplicate it here.
return np.array(x)
raise ValueError(
"A symbolic tensor (usually the result of applying layers or "
"operations to a `keras.Input`), cannot be converted to a numpy "
"array. There is no concrete value for the input."
)
return backend.convert_to_numpy(x)

@@ -71,7 +71,8 @@ class Operation:
f"'{self.name}' (of type {self.__class__.__name__}). "
f"Either the `{self.__class__.__name__}.call()` method "
f"is incorrect, or you need to implement the "
f"`{self.__class__.__name__}.compute_output_spec()` method. "
f"`{self.__class__.__name__}.compute_output_spec() / "
"compute_output_shape()` method. "
f"Error encountered:\n\n{e}"
)
raise new_e.with_traceback(e.__traceback__) from None

@@ -48,9 +48,8 @@ class RandomTest(testing.TestCase, parameterized.TestCase):
)
def test_categorical(self, seed, num_samples, batch_size):
np.random.seed(seed)
# Create logits that definitely favors the batch index after a softmax
# is applied. Without a softmax, this would be close to random.
logits = np.eye(batch_size) * 1e5 + 1e6
# Definitively favor the batch index.
logits = np.eye(batch_size) * 1e9
res = random.categorical(logits, num_samples, seed=seed)
# Outputs should have shape `(batch_size, num_samples)`, where each
# output index matches the batch index.

@@ -347,3 +347,87 @@ class TestTrainer(testing.TestCase, parameterized.TestCase):
x1, x2 = np.random.rand(2, 3, 4)
out = model.predict({"a": x1, "b": x2})
self.assertEqual(out.shape, (3, 4))
def test_callback_methods_keys(self):
class CustomCallback(Callback):
def on_train_begin(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_train_end(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == [
"loss",
"mean_absolute_error",
"val_loss",
"val_mean_absolute_error",
]
def on_epoch_begin(self, epoch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_epoch_end(self, epoch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == [
"loss",
"mean_absolute_error",
"val_loss",
"val_mean_absolute_error",
]
def on_test_begin(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_test_end(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["loss", "mean_absolute_error"]
def on_predict_begin(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_predict_end(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_train_batch_begin(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_train_batch_end(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["loss", "mean_absolute_error"]
def on_test_batch_begin(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_test_batch_end(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["loss", "mean_absolute_error"]
def on_predict_batch_begin(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_predict_batch_end(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["outputs"]
model = ExampleModel(units=3)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
x = np.ones((16, 4))
y = np.zeros((16, 3))
x_test = np.ones((16, 4))
y_test = np.zeros((16, 3))
model.fit(
x,
y,
callbacks=[CustomCallback()],
batch_size=4,
validation_data=(x_test, y_test),
)
model.evaluate(x_test, y_test, batch_size=4)
model.predict(x_test, batch_size=4)