Minor backwards compat fixes

Francois Chollet 2023-06-21 21:44:47 -07:00
parent 58a870eba8
commit e9d0e84281
10 changed files with 147 additions and 565 deletions

@@ -1,291 +0,0 @@
"""
Title: Image classification with Vision Transformer
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
Date created: 2021/01/18
Last modified: 2021/01/18
Description: Implementing the Vision Transformer (ViT) model for image classification.
Accelerator: GPU
"""
"""
## Introduction
This example implements the [Vision Transformer (ViT)](https://arxiv.org/abs/2010.11929)
model by Alexey Dosovitskiy et al. for image classification,
and demonstrates it on the CIFAR-100 dataset.
The ViT model applies the Transformer architecture with self-attention to sequences of
image patches, without using convolution layers.
"""
"""
## Setup
"""
import numpy as np
import tensorflow as tf
import keras_core as keras
from keras_core import layers
"""
## Prepare the data
"""
num_classes = 100
input_shape = (32, 32, 3)
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()
print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")
"""
## Configure the hyperparameters
"""
learning_rate = 0.001
weight_decay = 0.0001
batch_size = 256
num_epochs = 100
image_size = 72 # We'll resize input images to this size
patch_size = 6 # Size of the patches to be extracted from the input images
num_patches = (image_size // patch_size) ** 2
projection_dim = 64
num_heads = 4
transformer_units = [
projection_dim * 2,
projection_dim,
] # Size of the transformer layers
transformer_layers = 8
mlp_head_units = [2048, 1024] # Size of the dense layers of the final classifier
"""
## Use data augmentation
"""
data_augmentation = keras.Sequential(
[
layers.Normalization(),
layers.Resizing(image_size, image_size),
layers.RandomFlip("horizontal"),
layers.RandomRotation(factor=0.02),
layers.RandomZoom(height_factor=0.2, width_factor=0.2),
],
name="data_augmentation",
)
# Compute the mean and the variance of the training data for normalization.
data_augmentation.layers[0].adapt(x_train)
"""
## Implement multilayer perceptron (MLP)
"""
def mlp(x, hidden_units, dropout_rate):
for units in hidden_units:
x = layers.Dense(units, activation=tf.nn.gelu)(x)
x = layers.Dropout(dropout_rate)(x)
return x
"""
## Implement patch creation as a layer
"""
class Patches(layers.Layer):
def __init__(self, patch_size):
super().__init__()
self.patch_size = patch_size
def call(self, images):
batch_size = tf.shape(images)[0]
patches = tf.image.extract_patches(
images=images,
sizes=[1, self.patch_size, self.patch_size, 1],
strides=[1, self.patch_size, self.patch_size, 1],
rates=[1, 1, 1, 1],
padding="VALID",
)
patch_dims = patches.shape[-1]
patches = tf.reshape(patches, [batch_size, -1, patch_dims])
return patches
"""
Let's display patches for a sample image
"""
import matplotlib.pyplot as plt
plt.figure(figsize=(4, 4))
image = x_train[np.random.choice(range(x_train.shape[0]))]
plt.imshow(image.astype("uint8"))
plt.axis("off")
resized_image = tf.image.resize(
tf.convert_to_tensor([image]), size=(image_size, image_size)
)
patches = Patches(patch_size)(resized_image)
print(f"Image size: {image_size} X {image_size}")
print(f"Patch size: {patch_size} X {patch_size}")
print(f"Patches per image: {patches.shape[1]}")
print(f"Elements per patch: {patches.shape[-1]}")
n = int(np.sqrt(patches.shape[1]))
plt.figure(figsize=(4, 4))
for i, patch in enumerate(patches[0]):
ax = plt.subplot(n, n, i + 1)
patch_img = tf.reshape(patch, (patch_size, patch_size, 3))
plt.imshow(patch_img.numpy().astype("uint8"))
plt.axis("off")
"""
## Implement the patch encoding layer
The `PatchEncoder` layer will linearly transform a patch by projecting it into a
vector of size `projection_dim`. In addition, it adds a learnable position
embedding to the projected vector.
"""
class PatchEncoder(layers.Layer):
def __init__(self, num_patches, projection_dim):
super().__init__()
self.num_patches = num_patches
self.projection = layers.Dense(units=projection_dim)
self.position_embedding = layers.Embedding(
input_dim=num_patches, output_dim=projection_dim
)
def call(self, patch):
positions = tf.range(start=0, limit=self.num_patches, delta=1)
encoded = self.projection(patch) + self.position_embedding(positions)
return encoded
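"""
As a quick, illustrative sanity check (not required for the model below), encoding the
patches extracted from the sample image above should yield a
`(1, num_patches, projection_dim)` tensor.
"""
encoded_patches_demo = PatchEncoder(num_patches, projection_dim)(patches)
print(f"Encoded patches shape: {encoded_patches_demo.shape}")  # e.g. (1, 144, 64)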
"""
## Build the ViT model
The ViT model consists of multiple Transformer blocks,
which use the `layers.MultiHeadAttention` layer as a self-attention mechanism
applied to the sequence of patches. The Transformer blocks produce a
`[batch_size, num_patches, projection_dim]` tensor, which is processed by a
classifier head with softmax to produce the final class probabilities.
Unlike the technique described in the [paper](https://arxiv.org/abs/2010.11929),
which prepends a learnable embedding to the sequence of encoded patches to serve
as the image representation, all the outputs of the final Transformer block are
reshaped with `layers.Flatten()` and used as the image
representation input to the classifier head.
Note that the `layers.GlobalAveragePooling1D` layer
could also be used instead to aggregate the outputs of the Transformer block,
especially when the number of patches and the projection dimensions are large.
"""
def create_vit_classifier():
inputs = layers.Input(shape=input_shape)
# Augment data.
augmented = data_augmentation(inputs)
# Create patches.
patches = Patches(patch_size)(augmented)
# Encode patches.
encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)
# Create multiple layers of the Transformer block.
for _ in range(transformer_layers):
# Layer normalization 1.
x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
# Create a multi-head attention layer.
attention_output = layers.MultiHeadAttention(
num_heads=num_heads, key_dim=projection_dim, dropout=0.1
)(x1, x1)
# Skip connection 1.
x2 = layers.Add()([attention_output, encoded_patches])
# Layer normalization 2.
x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
# MLP.
x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
# Skip connection 2.
encoded_patches = layers.Add()([x3, x2])
# Flatten into a [batch_size, num_patches * projection_dim] representation tensor.
representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
representation = layers.Flatten()(representation)
representation = layers.Dropout(0.5)(representation)
# Add MLP.
features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
# Classify outputs.
logits = layers.Dense(num_classes)(features)
# Create the Keras model.
model = keras.Model(inputs=inputs, outputs=logits)
return model
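"""
As noted above, a `layers.GlobalAveragePooling1D` layer could replace `layers.Flatten`
as the aggregation step. Here is a minimal, illustrative comparison of the two options
on a symbolic patch tensor (the model above keeps `Flatten`):
"""
demo_patches = keras.Input(shape=(num_patches, projection_dim))
print("Flatten:", layers.Flatten()(demo_patches).shape)  # (None, num_patches * projection_dim)
print("GlobalAveragePooling1D:", layers.GlobalAveragePooling1D()(demo_patches).shape)  # (None, projection_dim)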
"""
## Compile, train, and evaluate the model
"""
def run_experiment(model):
optimizer = keras.optimizers.AdamW(
learning_rate=learning_rate, weight_decay=weight_decay
)
model.compile(
optimizer=optimizer,
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[
keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
],
)
checkpoint_filepath = "/tmp/checkpoint"
checkpoint_callback = keras.callbacks.ModelCheckpoint(
checkpoint_filepath,
monitor="val_accuracy",
save_best_only=True,
save_weights_only=True,
)
history = model.fit(
x=x_train,
y=y_train,
batch_size=batch_size,
epochs=num_epochs,
validation_split=0.1,
callbacks=[checkpoint_callback],
)
model.load_weights(checkpoint_filepath)
_, accuracy, top_5_accuracy = model.evaluate(x_test, y_test)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%")
return history
vit_classifier = create_vit_classifier()
history = run_experiment(vit_classifier)
"""
After 100 epochs, the ViT model achieves around 55% accuracy and
82% top-5 accuracy on the test data. These are not competitive results on the CIFAR-100 dataset,
as a ResNet50V2 trained from scratch on the same data can achieve 67% accuracy.
Note that the state of the art results reported in the
[paper](https://arxiv.org/abs/2010.11929) are achieved by pre-training the ViT model using
the JFT-300M dataset, then fine-tuning it on the target dataset. To improve the model quality
without pre-training, you can try to train the model for more epochs, use a larger number of
Transformer layers, resize the input images, change the patch size, or increase the projection dimensions.
Besides, as mentioned in the paper, the quality of the model is affected not only by architecture choices,
but also by parameters such as the learning rate schedule, optimizer, weight decay, etc.
In practice, it's recommended to fine-tune a ViT model
that was pre-trained using a large, high-resolution dataset.
"""

@@ -1,229 +0,0 @@
"""
Title: MixUp augmentation for image classification
Author: [Sayak Paul](https://twitter.com/RisingSayak)
Date created: 2021/03/06
Last modified: 2021/03/06
Description: Data augmentation using the mixup technique for image classification.
Accelerator: GPU
"""
"""
## Introduction
"""
"""
_mixup_ is a *domain-agnostic* data augmentation technique proposed in [mixup: Beyond Empirical Risk Minimization](https://arxiv.org/abs/1710.09412)
by Zhang et al. It's implemented with the following formulas:
![](https://i.ibb.co/DRyHYww/image.png)
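Written out in the notation used later in this example, these formulas are
`new_x = lambda * x1 + (1 - lambda) * x2` and `new_y = lambda * y1 + (1 - lambda) * y2`,
where `(x1, y1)` and `(x2, y2)` are two examples drawn at random from the training data.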
(Note that the lambda values lie within the [0, 1] range and are sampled from the
[Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution).)
The technique is quite systematically named. We are literally mixing up the features and
their corresponding labels. Implementation-wise it's simple. Neural networks are prone
to [memorizing corrupt labels](https://arxiv.org/abs/1611.03530). mixup relaxes this by
combining different features with one another (the same happens for the labels too) so that
a network does not get overconfident about the relationship between the features and
their labels.
mixup is especially useful when we are not sure about selecting a set of augmentation
transforms for a given dataset (medical imaging datasets, for example). mixup can be
extended to a variety of data modalities such as computer vision, natural language
processing, speech, and so on.
"""
"""
## Setup
"""
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras_core import layers
import keras_core as keras
"""
## Prepare the dataset
In this example, we will be using the [FashionMNIST](https://github.com/zalandoresearch/fashion-mnist) dataset. But this same recipe can
be used for other classification datasets as well.
"""
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
x_train = x_train.astype("float32") / 255.0
x_train = np.reshape(x_train, (-1, 28, 28, 1))
y_train = tf.one_hot(y_train, 10)
x_test = x_test.astype("float32") / 255.0
x_test = np.reshape(x_test, (-1, 28, 28, 1))
y_test = tf.one_hot(y_test, 10)
"""
## Define hyperparameters
"""
AUTO = tf.data.AUTOTUNE
BATCH_SIZE = 64
EPOCHS = 10
"""
## Convert the data into TensorFlow `Dataset` objects
"""
# Put aside a few samples to create our validation set
val_samples = 2000
x_val, y_val = x_train[:val_samples], y_train[:val_samples]
new_x_train, new_y_train = x_train[val_samples:], y_train[val_samples:]
train_ds_one = (
tf.data.Dataset.from_tensor_slices((new_x_train, new_y_train))
.shuffle(BATCH_SIZE * 100)
.batch(BATCH_SIZE)
)
train_ds_two = (
tf.data.Dataset.from_tensor_slices((new_x_train, new_y_train))
.shuffle(BATCH_SIZE * 100)
.batch(BATCH_SIZE)
)
# Because we will be mixing up the images and their corresponding labels, we will be
# combining two shuffled datasets from the same training data.
train_ds = tf.data.Dataset.zip((train_ds_one, train_ds_two))
val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(BATCH_SIZE)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(BATCH_SIZE)
"""
## Define the mixup technique function
To perform the mixup routine, we create new virtual datasets using the training data from
the same dataset, and apply a lambda value within the [0, 1] range sampled from a [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution)
such that, for example, `new_x = lambda * x1 + (1 - lambda) * x2` (where
`x1` and `x2` are images) and the same equation is applied to the labels as well.
"""
def sample_beta_distribution(size, concentration_0=0.2, concentration_1=0.2):
gamma_1_sample = tf.random.gamma(shape=[size], alpha=concentration_1)
gamma_2_sample = tf.random.gamma(shape=[size], alpha=concentration_0)
return gamma_1_sample / (gamma_1_sample + gamma_2_sample)
def mix_up(ds_one, ds_two, alpha=0.2):
# Unpack two datasets
images_one, labels_one = ds_one
images_two, labels_two = ds_two
batch_size = tf.shape(images_one)[0]
# Sample lambda and reshape it to do the mixup
l = sample_beta_distribution(batch_size, alpha, alpha)
x_l = tf.reshape(l, (batch_size, 1, 1, 1))
y_l = tf.reshape(l, (batch_size, 1))
# Perform mixup on both images and labels by combining a pair of images/labels
# (one from each dataset) into one image/label
images = images_one * x_l + images_two * (1 - x_l)
labels = labels_one * y_l + labels_two * (1 - y_l)
return (images, labels)
"""
**Note** that here, we are combining two images to create a single one. Theoretically,
we can combine as many as we want, but that comes at an increased computation cost. In
certain cases, it may not help improve performance either.
"""
"""
## Visualize the new augmented dataset
"""
# First create the new dataset using our `mix_up` utility
train_ds_mu = train_ds.map(
lambda ds_one, ds_two: mix_up(ds_one, ds_two, alpha=0.2), num_parallel_calls=AUTO
)
# Let's preview 9 samples from the dataset
sample_images, sample_labels = next(iter(train_ds_mu))
plt.figure(figsize=(10, 10))
for i, (image, label) in enumerate(zip(sample_images[:9], sample_labels[:9])):
ax = plt.subplot(3, 3, i + 1)
plt.imshow(image.numpy().squeeze())
print(label.numpy().tolist())
plt.axis("off")
"""
## Model building
"""
def get_training_model():
model = keras.Sequential(
[
layers.Conv2D(16, (5, 5), activation="relu", input_shape=(28, 28, 1)),
layers.MaxPooling2D(pool_size=(2, 2)),
layers.Conv2D(32, (5, 5), activation="relu"),
layers.MaxPooling2D(pool_size=(2, 2)),
layers.Dropout(0.2),
layers.GlobalAveragePooling2D(),
layers.Dense(128, activation="relu"),
layers.Dense(10, activation="softmax"),
]
)
return model
"""
For the sake of reproducibility, we serialize the initial random weights of our shallow
network.
"""
initial_model = get_training_model()
initial_model.save_weights("initial_weights.weights.h5")
"""
## 1. Train the model with the mixed up dataset
"""
model = get_training_model()
model.load_weights("initial_weights.weights.h5")
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_ds_mu, validation_data=val_ds, epochs=EPOCHS)
_, test_acc = model.evaluate(test_ds)
print("Test accuracy: {:.2f}%".format(test_acc * 100))
"""
## 2. Train the model *without* the mixed up dataset
"""
model = get_training_model()
model.load_weights("initial_weights.weights.h5")
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# Notice that we are NOT using the mixed up dataset here
model.fit(train_ds_one, validation_data=val_ds, epochs=EPOCHS)
_, test_acc = model.evaluate(test_ds)
print("Test accuracy: {:.2f}%".format(test_acc * 100))
"""
Readers are encouraged to try out mixup on different datasets from different domains and
experiment with the lambda parameter. You are strongly advised to check out the
[original paper](https://arxiv.org/abs/1710.09412) as well - the authors present several ablation studies on mixup
showing how it can improve generalization, as well as their results on combining
more than two images to create a single one.
"""
"""
## Notes
* With mixup, you can create synthetic examples without incurring high computational
costs, which is especially helpful when you lack a large dataset.
* [Label smoothing](https://www.pyimagesearch.com/2019/12/30/label-smoothing-with-keras-tensorflow-and-deep-learning/) and mixup usually do not work well together because label smoothing
already modifies the hard labels by some factor.
* mixup does not work well when you are using [Supervised Contrastive
Learning](https://arxiv.org/abs/2004.11362) (SCL) since SCL expects the true labels
during its pre-training phase.
* A few other benefits of mixup include (as described in the [paper](https://arxiv.org/abs/1710.09412)) robustness to
adversarial examples and stabilized GAN (Generative Adversarial Networks) training.
* There are a number of data augmentation techniques that extend mixup such as
[CutMix](https://arxiv.org/abs/1905.04899) and [AugMix](https://arxiv.org/abs/1912.02781).
"""

@@ -30,9 +30,8 @@ def categorical(logits, num_samples, dtype="int32", seed=None):
logits = convert_to_tensor(logits)
dtype = to_torch_dtype(dtype)
generator = torch_seed_generator(seed, device=get_device())
probs = torch.softmax(logits, dim=-1)
return torch.multinomial(
probs,
logits,
num_samples,
replacement=True,
generator=generator,

@@ -77,65 +77,81 @@ class CallbackList(Callback):
callback.set_model(model)
def on_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_batch_begin(batch, logs=logs)
def on_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_batch_end(batch, logs=logs)
def on_epoch_begin(self, epoch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_epoch_begin(epoch, logs)
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_epoch_end(epoch, logs)
def on_train_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_batch_begin(batch, logs=logs)
def on_train_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_batch_end(batch, logs=logs)
def on_test_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_batch_begin(batch, logs=logs)
def on_test_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_batch_end(batch, logs=logs)
def on_predict_batch_begin(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_batch_begin(batch, logs=logs)
def on_predict_batch_end(self, batch, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_batch_end(batch, logs=logs)
def on_train_begin(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_begin(logs)
def on_train_end(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_train_end(logs)
def on_test_begin(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_begin(logs)
def on_test_end(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_test_end(logs)
def on_predict_begin(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_begin(logs)
def on_predict_end(self, logs=None):
logs = logs or {}
for callback in self.callbacks:
callback.on_predict_end(logs)

@@ -82,6 +82,7 @@ class Dense(Layer):
self.bias_constraint = constraints.get(bias_constraint)
self.input_spec = InputSpec(min_ndim=2)
self.supports_masking = True
def build(self, input_shape):
input_dim = input_shape[-1]

@@ -248,6 +248,9 @@ class Layer(BackendLayer, Operation):
self._trainable = trainable
self._losses = []
self._call_signature_parameters = [
p.name for p in inspect.signature(self.call).parameters.values()
]
self._supports_masking = not utils.is_default(self.compute_mask)
# Whether to automatically convert (+ auto-cast) inputs to `call()`.
self._convert_input_args = True
@@ -255,9 +258,6 @@ class Layer(BackendLayer, Operation):
self._allow_non_tensor_positional_args = False
# Dict of shapes that were used to call `build()`.
self._build_shapes_dict = None
self._call_signature_parameters = [
p.name for p in inspect.signature(self.call).parameters.values()
]
self._initializer_tracker()
@tracking.no_automatic_dependency_tracking
@@ -606,31 +606,27 @@ class Layer(BackendLayer, Operation):
##############################
# 6. Populate mask argument(s)
if self.supports_masking:
if len(call_spec.tensor_arguments_dict) == 1:
if (
"mask" in call_spec.argument_names
and call_spec.arguments_dict["mask"] is None
):
arg_name = list(call_spec.tensor_arguments_dict.keys())[0]
only_tensor_arg = call_spec.tensor_arguments_dict[arg_name]
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None),
only_tensor_arg,
)
kwargs["mask"] = mask
elif len(call_spec.tensor_arguments_dict) > 1:
for k, v in call_spec.tensor_arguments_dict.items():
expected_mask_arg_name = f"{k}_mask"
if expected_mask_arg_name in call_spec.argument_names:
if (
call_spec.arguments_dict[expected_mask_arg_name]
is None
):
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None), v
)
kwargs[expected_mask_arg_name] = mask
if len(call_spec.tensor_arguments_dict) == 1:
if (
"mask" in call_spec.argument_names
and call_spec.arguments_dict["mask"] is None
):
arg_name = list(call_spec.tensor_arguments_dict.keys())[0]
only_tensor_arg = call_spec.tensor_arguments_dict[arg_name]
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None),
only_tensor_arg,
)
kwargs["mask"] = mask
elif len(call_spec.tensor_arguments_dict) > 1:
for k, v in call_spec.tensor_arguments_dict.items():
expected_mask_arg_name = f"{k}_mask"
if expected_mask_arg_name in call_spec.argument_names:
if call_spec.arguments_dict[expected_mask_arg_name] is None:
mask = nest.map_structure(
lambda x: getattr(x, "_keras_mask", None), v
)
kwargs[expected_mask_arg_name] = mask
####################
# 7. Call the layer.
@@ -651,16 +647,22 @@ class Layer(BackendLayer, Operation):
if backend.is_tensor(output):
self.add_loss(self.activity_regularizer(output))
# Set masks on outputs,
# provided only the first positional input arg and its mask.
# TODO: consider extending this to all args and kwargs.
previous_mask = getattr(call_spec.first_arg, "_keras_mask", None)
if self.supports_masking:
# Set masks on outputs,
# provided only the first positional input arg and its mask.
# TODO: consider extending this to all args and kwargs.
previous_mask = getattr(
call_spec.first_arg, "_keras_mask", None
)
self._set_mask_metadata(
call_spec.first_arg, outputs, previous_mask
)
elif previous_mask is not None:
warnings.warn(
f"Layer '{self.name}' (of type {self.__class__.__name__}) "
"was passed an input with a mask attached to it. "
"However, this layer does not support masking and will "
"therefore destroy the mask information. Downstream "
"layers will not see the mask."
)
finally:
# Destroy call context if we created it
self._maybe_reset_call_context()

@@ -11,8 +11,6 @@ convert_to_tensor
convert_to_numpy
"""
import numpy as np
from keras_core import backend
from keras_core.api_export import keras_core_export
from keras_core.backend import KerasTensor
@@ -298,7 +296,9 @@ def convert_to_tensor(x, dtype=None):
def convert_to_numpy(x):
"""Convert a tensor to a NumPy array."""
if any_symbolic_tensors((x,)):
# This will raise a `ValueError` defined in the `KerasTensor` class. We
# trigger it rather than duplicate it here.
return np.array(x)
raise ValueError(
"A symbolic tensor (usually the result of applying layers or "
"operations to a `keras.Input`), cannot be converted to a numpy "
"array. There is no concrete value for the input."
)
return backend.convert_to_numpy(x)

@@ -71,7 +71,8 @@ class Operation:
f"'{self.name}' (of type {self.__class__.__name__}). "
f"Either the `{self.__class__.__name__}.call()` method "
f"is incorrect, or you need to implement the "
f"`{self.__class__.__name__}.compute_output_spec()` method. "
f"`{self.__class__.__name__}.compute_output_spec() / "
"compute_output_shape()` method. "
f"Error encountered:\n\n{e}"
)
raise new_e.with_traceback(e.__traceback__) from None

@@ -48,9 +48,8 @@ class RandomTest(testing.TestCase, parameterized.TestCase):
)
def test_categorical(self, seed, num_samples, batch_size):
np.random.seed(seed)
# Create logits that definitely favors the batch index after a softmax
# is applied. Without a softmax, this would be close to random.
logits = np.eye(batch_size) * 1e5 + 1e6
# Definitively favor the batch index.
logits = np.eye(batch_size) * 1e9
res = random.categorical(logits, num_samples, seed=seed)
# Outputs should have shape `(batch_size, num_samples)`, where each
# output index matches the batch index.

@@ -347,3 +347,87 @@ class TestTrainer(testing.TestCase, parameterized.TestCase):
x1, x2 = np.random.rand(2, 3, 4)
out = model.predict({"a": x1, "b": x2})
self.assertEqual(out.shape, (3, 4))
def test_callback_methods_keys(self):
class CustomCallback(Callback):
def on_train_begin(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_train_end(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == [
"loss",
"mean_absolute_error",
"val_loss",
"val_mean_absolute_error",
]
def on_epoch_begin(self, epoch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_epoch_end(self, epoch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == [
"loss",
"mean_absolute_error",
"val_loss",
"val_mean_absolute_error",
]
def on_test_begin(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_test_end(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["loss", "mean_absolute_error"]
def on_predict_begin(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_predict_end(self, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_train_batch_begin(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_train_batch_end(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["loss", "mean_absolute_error"]
def on_test_batch_begin(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_test_batch_end(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["loss", "mean_absolute_error"]
def on_predict_batch_begin(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == []
def on_predict_batch_end(self, batch, logs=None):
keys = sorted(list(logs.keys()))
assert keys == ["outputs"]
model = ExampleModel(units=3)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
x = np.ones((16, 4))
y = np.zeros((16, 3))
x_test = np.ones((16, 4))
y_test = np.zeros((16, 3))
model.fit(
x,
y,
callbacks=[CustomCallback()],
batch_size=4,
validation_data=(x_test, y_test),
)
model.evaluate(x_test, y_test, batch_size=4)
model.predict(x_test, batch_size=4)