diff --git a/examples/keras_io/structured_data/deep_neural_decision_forests.py b/examples/keras_io/structured_data/deep_neural_decision_forests.py
new file mode 100644
index 000000000..e9a6ea833
--- /dev/null
+++ b/examples/keras_io/structured_data/deep_neural_decision_forests.py
@@ -0,0 +1,484 @@
+"""
+Title: Classification with Neural Decision Forests
+Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
+Date created: 2021/01/15
+Last modified: 2021/01/15
+Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+This example provides an implementation of the
+[Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529)
+model introduced by P. Kontschieder et al. for structured data classification.
+It demonstrates how to build a stochastic and differentiable decision tree model,
+train it end-to-end, and unify decision trees with deep representation learning.
+
+## The dataset
+
+This example uses the
+[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income)
+provided by the
+[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).
+The task is binary classification
+to predict whether a person is likely to be making over USD 50,000 a year.
+
+The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features
+and 9 categorical features.
+"""
+
+"""
+## Setup
+"""
+
+import keras_core as keras
+from keras_core import layers
+from keras_core.layers import StringLookup
+from keras_core import ops
+
+
+from tensorflow import data as tf_data
+import numpy as np
+import pandas as pd
+
+import math
+
+
+_dtype = "float32"
+
+"""
+## Prepare the data
+"""
+
+CSV_HEADER = [
+    "age",
+    "workclass",
+    "fnlwgt",
+    "education",
+    "education_num",
+    "marital_status",
+    "occupation",
+    "relationship",
+    "race",
+    "gender",
+    "capital_gain",
+    "capital_loss",
+    "hours_per_week",
+    "native_country",
+    "income_bracket",
+]
+
+train_data_url = (
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
+)
+train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)
+
+test_data_url = (
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
+)
+test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)
+
+print(f"Train dataset shape: {train_data.shape}")
+print(f"Test dataset shape: {test_data.shape}")
+
+"""
+Remove the first record (because it is not a valid data example) and a trailing
+'dot' in the class labels.
+"""
+
+test_data = test_data[1:]
+test_data.income_bracket = test_data.income_bracket.apply(
+    lambda value: value.replace(".", "")
+)
+
+"""
+We store the training and test data splits locally as CSV files.
+"""
+
+train_data_file = "train_data.csv"
+test_data_file = "test_data.csv"
+
+train_data.to_csv(train_data_file, index=False, header=False)
+test_data.to_csv(test_data_file, index=False, header=False)
+
+"""
+## Define dataset metadata
+
+Here, we define the metadata of the dataset that will be useful for reading and parsing
+and encoding input features.
+"""
+
+# A list of the numerical feature names.
+NUMERIC_FEATURE_NAMES = [
+    "age",
+    "education_num",
+    "capital_gain",
+    "capital_loss",
+    "hours_per_week",
+]
+# A dictionary of the categorical features and their vocabulary.
+CATEGORICAL_FEATURES_WITH_VOCABULARY = {
+    "workclass": sorted(list(train_data["workclass"].unique())),
+    "education": sorted(list(train_data["education"].unique())),
+    "marital_status": sorted(list(train_data["marital_status"].unique())),
+    "occupation": sorted(list(train_data["occupation"].unique())),
+    "relationship": sorted(list(train_data["relationship"].unique())),
+    "race": sorted(list(train_data["race"].unique())),
+    "gender": sorted(list(train_data["gender"].unique())),
+    "native_country": sorted(list(train_data["native_country"].unique())),
+}
+# A list of the columns to ignore from the dataset.
+IGNORE_COLUMN_NAMES = ["fnlwgt"]
+# A list of the categorical feature names.
+CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
+# A list of all the input features.
+FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
+# A list of column default values for each feature.
+COLUMN_DEFAULTS = [
+    [0.0]
+    if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES
+    else ["NA"]
+    for feature_name in CSV_HEADER
+]
+# The name of the target feature.
+TARGET_FEATURE_NAME = "income_bracket"
+# A list of the labels of the target features.
+TARGET_LABELS = [" <=50K", " >50K"]
+
+"""
+## Create `tf_data.Dataset` objects for training and validation
+
+We create an input function to read and parse the file, and convert features and labels
+into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets)
+for training and validation. We also preprocess the input by mapping the target label
+to an index.
+"""
+
+
+target_label_lookup = StringLookup(
+    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
+)
+
+
+lookup_dict = {}
+for feature_name in CATEGORICAL_FEATURE_NAMES:
+    vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
+    # Create a lookup to convert string values to integer indices.
+    # Since we are not using a mask token, nor expecting any out of vocabulary
+    # (oov) token, we set mask_token to None and num_oov_indices to 0.
+    lookup = StringLookup(
+        vocabulary=vocabulary, mask_token=None, num_oov_indices=0
+    )
+    lookup_dict[feature_name] = lookup
+
+
+def encode_categorical(batch_x, batch_y):
+    # Replace each categorical string column by its integer vocabulary index.
+    for name, lookup_layer in lookup_dict.items():
+        batch_x[name] = lookup_layer(batch_x[name])
+    return batch_x, batch_y
+
+
+def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
+    dataset = (
+        tf_data.experimental.make_csv_dataset(
+            csv_file_path,
+            batch_size=batch_size,
+            column_names=CSV_HEADER,
+            column_defaults=COLUMN_DEFAULTS,
+            label_name=TARGET_FEATURE_NAME,
+            num_epochs=1,
+            header=False,
+            na_value="?",
+            shuffle=shuffle,
+        )
+        .map(lambda features, target: (features, target_label_lookup(target)))
+        .map(encode_categorical)
+    )
+
+    return dataset.cache()
+
+
+"""
+## Create model inputs
+"""
+
+
+def create_model_inputs():
+    """Build one scalar `Input` per feature, keyed by feature name."""
+    # Numeric features are floats; every other feature arrives as an int32
+    # index (already mapped from its string value by the dataset pipeline).
+    return {
+        name: layers.Input(
+            name=name,
+            shape=(),
+            dtype=_dtype if name in NUMERIC_FEATURE_NAMES else "int32",
+        )
+        for name in FEATURE_NAMES
+    }
+
+
+"""
+## Encode input features
+"""
+
+
+def encode_inputs(inputs):
+    """Embed categorical inputs and concatenate them with the numeric ones."""
+    encoded_features = []
+    for feature_name in inputs:
+        if feature_name in CATEGORICAL_FEATURE_NAMES:
+            # Size the embedding from this feature's own lookup layer rather
+            # than from whichever `lookup` the setup loop happened to leave
+            # behind at module level.
+            vocabulary_size = lookup_dict[feature_name].vocabulary_size()
+            embedding_dims = int(math.sqrt(vocabulary_size))
+            # Create an embedding layer with the specified dimensions.
+            embedding = layers.Embedding(
+                input_dim=vocabulary_size, output_dim=embedding_dims
+            )
+            # Convert the index values to embedding representations.
+            encoded_feature = embedding(inputs[feature_name])
+        else:
+            # Use the numerical features as-is.
+            encoded_feature = inputs[feature_name]
+            if inputs[feature_name].shape[-1] is None:
+                encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
+
+        encoded_features.append(encoded_feature)
+
+    encoded_features = layers.concatenate(encoded_features)
+    return encoded_features
+
+
+"""
+## Deep Neural Decision Tree
+
+A neural decision tree model has two sets of weights to learn. The first set is `pi`,
+which represents the probability distribution of the classes in the tree leaves.
+The second set is the weights of the routing layer `decision_fn`, which represents the probability
+of going to each leaf. The forward pass of the model works as follows:
+
+1. The model expects input `features` as a single vector encoding all the features of an instance
+in the batch. This vector can be generated from a Convolution Neural Network (CNN) applied to images
+or dense transformations applied to structured data features.
+2. The model first applies a `used_features_mask` to randomly select a subset of input features to use.
+3. Then, the model computes the probabilities (`mu`) for the input instances to reach the tree leaves
+by iteratively performing a *stochastic* routing throughout the tree levels.
+4. Finally, the probabilities of reaching the leaves are combined by the class probabilities at the
+leaves to produce the final `outputs`.
+"""
+
+
+class NeuralDecisionTree(keras.Model):
+    def __init__(self, depth, num_features, used_features_rate, num_classes):
+        super().__init__()
+        self.depth = depth
+        self.num_leaves = 2**depth
+        self.num_classes = num_classes
+
+        # Create a mask for the randomly selected features.
+        num_used_features = int(num_features * used_features_rate)
+        one_hot = np.eye(num_features)
+        sampled_feature_indices = np.random.choice(
+            np.arange(num_features), num_used_features, replace=False
+        )
+        self.used_features_mask = ops.convert_to_tensor(
+            one_hot[sampled_feature_indices], dtype=_dtype
+        )
+
+        # Initialize the weights of the classes in leaves.
+        self.pi = self.add_weight(
+            initializer="random_normal",
+            shape=[self.num_leaves, self.num_classes],
+            dtype=_dtype,
+            trainable=True,
+        )
+
+        # Initialize the stochastic routing layer.
+        self.decision_fn = layers.Dense(
+            units=self.num_leaves, activation="sigmoid", name="decision"
+        )
+
+    def call(self, features):
+        batch_size = ops.shape(features)[0]
+
+        # Apply the feature mask to the input features.
+        features = ops.matmul(
+            features, ops.transpose(self.used_features_mask)
+        )  # [batch_size, num_used_features]
+        # Compute the routing probabilities.
+        decisions = ops.expand_dims(
+            self.decision_fn(features), axis=2
+        )  # [batch_size, num_leaves, 1]
+        # Concatenate the routing probabilities with their complements.
+        decisions = layers.concatenate(
+            [decisions, 1 - decisions], axis=2
+        )  # [batch_size, num_leaves, 2]
+
+        mu = ops.ones([batch_size, 1, 1])
+
+        begin_idx = 1
+        end_idx = 2
+        # Traverse the tree in breadth-first order.
+        for level in range(self.depth):
+            mu = ops.reshape(
+                mu, [batch_size, -1, 1]
+            )  # [batch_size, 2 ** level, 1]
+            mu = ops.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
+            level_decisions = decisions[
+                :, begin_idx:end_idx, :
+            ]  # [batch_size, 2 ** level, 2]
+            mu = mu * level_decisions  # [batch_size, 2**level, 2]
+            begin_idx = end_idx
+            end_idx = begin_idx + 2 ** (level + 1)
+
+        mu = ops.reshape(
+            mu, [batch_size, self.num_leaves]
+        )  # [batch_size, num_leaves]
+        probabilities = keras.activations.softmax(
+            self.pi
+        )  # [num_leaves, num_classes]
+        outputs = ops.matmul(mu, probabilities)  # [batch_size, num_classes]
+        return outputs
+
+
+"""
+## Deep Neural Decision Forest
+
+The neural decision forest model consists of a set of neural decision trees that are
+trained simultaneously. The output of the forest model is the average outputs of its trees.
+"""
+
+
+class NeuralDecisionForest(keras.Model):
+    """Ensemble of `NeuralDecisionTree`s whose outputs are averaged."""
+
+    def __init__(
+        self, num_trees, depth, num_features, used_features_rate, num_classes
+    ):
+        super().__init__()
+        # Keep the class count on the instance so `call` does not depend on a
+        # module-level `num_classes` global.
+        self.num_classes = num_classes
+        # Each tree will have its own randomly selected input features to use.
+        self.ensemble = [
+            NeuralDecisionTree(
+                depth, num_features, used_features_rate, num_classes
+            )
+            for _ in range(num_trees)
+        ]
+
+    def call(self, inputs):
+        # Start from a [batch_size, num_classes] matrix of zeros, add every
+        # tree's prediction, then divide by the ensemble size to average.
+        outputs = ops.zeros([ops.shape(inputs)[0], self.num_classes])
+        for tree in self.ensemble:
+            outputs += tree(inputs)
+        outputs /= len(self.ensemble)
+        return outputs
+
+
+"""
+Finally, let's set up the code that will train and evaluate the model.
+"""
+
+learning_rate = 0.01
+batch_size = 265
+num_epochs = 10
+
+
+def run_experiment(model):
+    model.compile(
+        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
+        loss=keras.losses.SparseCategoricalCrossentropy(),
+        metrics=[keras.metrics.SparseCategoricalAccuracy()],
+    )
+
+    print("Start training the model...")
+    train_dataset = get_dataset_from_csv(
+        train_data_file, shuffle=True, batch_size=batch_size
+    )
+
+    model.fit(train_dataset, epochs=num_epochs)
+    print("Model training finished")
+
+    print("Evaluating the model on the test data...")
+    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)
+
+    _, accuracy = model.evaluate(test_dataset)
+    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
+
+
+"""
+## Experiment 1: train a decision tree model
+
+In this experiment, we train a single neural decision tree model
+where we use all input features.
+"""
+
+num_trees = 10
+depth = 10
+used_features_rate = 1.0
+num_classes = len(TARGET_LABELS)
+
+
+def create_tree_model():
+    inputs = create_model_inputs()
+    features = encode_inputs(inputs)
+    features = layers.BatchNormalization()(features)
+    num_features = features.shape[1]
+
+    tree = NeuralDecisionTree(
+        depth, num_features, used_features_rate, num_classes
+    )
+
+    outputs = tree(features)
+    model = keras.Model(inputs=inputs, outputs=outputs)
+    return model
+
+
+tree_model = create_tree_model()
+run_experiment(tree_model)
+
+
+"""
+## Experiment 2: train a forest model
+
+In this experiment, we train a neural decision forest with `num_trees` trees
+where each tree uses randomly selected 50% of the input features. You can control the number
+of features to be used in each tree by setting the `used_features_rate` variable.
+In addition, we set the depth to 5 instead of 10 compared to the previous experiment.
+"""
+
+num_trees = 25
+depth = 5
+used_features_rate = 0.5
+
+
+def create_forest_model():
+    inputs = create_model_inputs()
+    features = encode_inputs(inputs)
+    features = layers.BatchNormalization()(features)
+    num_features = features.shape[1]
+
+    forest_model = NeuralDecisionForest(
+        num_trees, depth, num_features, used_features_rate, num_classes
+    )
+
+    outputs = forest_model(features)
+    model = keras.Model(inputs=inputs, outputs=outputs)
+    return model
+
+
+forest_model = create_forest_model()
+
+run_experiment(forest_model)
+
+
+"""
+You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/neural-decision-forest)
+and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/Neural-Decision-Forest).
+"""
diff --git a/examples/keras_io/tensorflow/vision/simsiam.py b/examples/keras_io/tensorflow/vision/simsiam.py
new file mode 100644
index 000000000..b9671407e
--- /dev/null
+++ b/examples/keras_io/tensorflow/vision/simsiam.py
@@ -0,0 +1,443 @@
+"""
+Title: Self-supervised contrastive learning with SimSiam
+Author: [Sayak Paul](https://twitter.com/RisingSayak)
+Date created: 2021/03/19
+Last modified: 2021/03/20
+Description: Implementation of a self-supervised learning method for computer vision.
+Accelerator: GPU
+"""
+"""
+Self-supervised learning (SSL) is an interesting branch of study in the field of
+representation learning. SSL systems try to formulate a supervised signal from a corpus
+of unlabeled data points.  An example is we train a deep neural network to predict the
+next word from a given set of words. In literature, these tasks are known as *pretext
+tasks* or *auxiliary tasks*. If we [train such a network](https://arxiv.org/abs/1801.06146) on a huge dataset (such as
+the [Wikipedia text corpus](https://www.corpusdata.org/wikipedia.asp)) it learns very effective
+representations that transfer well to downstream tasks. Language models like
+[BERT](https://arxiv.org/abs/1810.04805), [GPT-3](https://arxiv.org/abs/2005.14165),
+[ELMo](https://allennlp.org/elmo) all benefit from this.
+
+Much like the language models we can train computer vision models using similar
+approaches. To make things work in computer vision, we need to formulate the learning
+tasks such that the underlying model (a deep neural network) is able to make sense of the
+semantic information present in vision data. One such task is to train a model to _contrast_
+between two different versions of the same image. The hope is that in this way the model
+will learn representations where similar images are grouped as close together as possible
+while dissimilar images are further apart.
+
+In this example, we will be implementing one such system called **SimSiam** proposed in
+[Exploring Simple Siamese Representation Learning](https://arxiv.org/abs/2011.10566). It
+is implemented as the following:
+
+1. We create two different versions of the same dataset with a stochastic data
+augmentation pipeline. Note that the random initialization seed needs to be the same
+when creating these versions.
+2. We take a ResNet without any classification head (**backbone**) and we add a shallow
+fully-connected network (**projection head**) on top of it. Collectively, this is known
+as the **encoder**.
+3. We pass the output of the encoder through a **predictor** which is again a shallow
+fully-connected network having an
+[AutoEncoder](https://en.wikipedia.org/wiki/Autoencoder) like structure.
+4. We then train our encoder to maximize the cosine similarity between the two different
+versions of our dataset.
+"""
+
+"""
+## Setup
+"""
+
+import os
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+
+from keras_core import layers
+from keras_core import regularizers
+import keras_core as keras
+import tensorflow as tf
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+"""
+## Define hyperparameters
+"""
+
+AUTO = tf.data.AUTOTUNE
+BATCH_SIZE = 128
+EPOCHS = 5
+CROP_TO = 32
+SEED = 26
+
+PROJECT_DIM = 2048
+LATENT_DIM = 512
+WEIGHT_DECAY = 0.0005
+
+"""
+## Load the CIFAR-10 dataset
+"""
+
+(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
+print(f"Total training examples: {len(x_train)}")
+print(f"Total test examples: {len(x_test)}")
+
+"""
+## Defining our data augmentation pipeline
+
+As studied in [SimCLR](https://arxiv.org/abs/2002.05709) having the right data
+augmentation pipeline is critical for SSL systems to work effectively in computer vision.
+Two particular augmentation transforms that seem to matter the most are: 1.) Random
+resized crops and 2.) Color distortions. Most of the other SSL systems for computer
+vision (such as [BYOL](https://arxiv.org/abs/2006.07733),
+[MoCoV2](https://arxiv.org/abs/2003.04297), [SwAV](https://arxiv.org/abs/2006.09882),
+etc.) include these in their training pipelines.
+"""
+
+
+def flip_random_crop(image):
+    # With random crops we also apply horizontal flipping.
+    image = tf.image.random_flip_left_right(image)
+    image = tf.image.random_crop(image, (CROP_TO, CROP_TO, 3))
+    return image
+
+
+def color_jitter(x, strength=(0.4, 0.4, 0.4, 0.1)):
+    """Randomly perturb brightness, contrast, saturation and hue of `x`."""
+    x = tf.image.random_brightness(x, max_delta=0.8 * strength[0])
+    x = tf.image.random_contrast(
+        x, lower=1 - 0.8 * strength[1], upper=1 + 0.8 * strength[1]
+    )
+    x = tf.image.random_saturation(
+        x, lower=1 - 0.8 * strength[2], upper=1 + 0.8 * strength[2]
+    )
+    x = tf.image.random_hue(x, max_delta=0.2 * strength[3])
+    # Affine transformations can disturb the natural range of
+    # RGB images, hence this is needed.
+    return tf.clip_by_value(x, 0, 255)
+
+
+def color_drop(x):
+    x = tf.image.rgb_to_grayscale(x)
+    x = tf.tile(x, [1, 1, 3])
+    return x
+
+
+def random_apply(func, x, p):
+    if tf.random.uniform([], minval=0, maxval=1) < p:
+        return func(x)
+    else:
+        return x
+
+
+def custom_augment(image):
+    # As discussed in the SimCLR paper, the series of augmentation
+    # transformations (except for random crops) need to be applied
+    # randomly to impose translational invariance.
+    image = flip_random_crop(image)
+    image = random_apply(color_jitter, image, p=0.8)
+    image = random_apply(color_drop, image, p=0.2)
+    return image
+
+
+"""
+It should be noted that an augmentation pipeline is generally dependent on various
+properties of the dataset we are dealing with. For example, if images in the dataset are
+heavily object-centric then taking random crops with a very high probability may hurt the
+training performance.
+
+Let's now apply our augmentation pipeline to our dataset and visualize a few outputs.
+"""
+
+"""
+## Convert the data into TensorFlow `Dataset` objects
+
+Here we create two different versions of our dataset *without* any ground-truth labels.
+"""
+
+ssl_ds_one = tf.data.Dataset.from_tensor_slices(x_train)
+ssl_ds_one = (
+    ssl_ds_one.shuffle(1024, seed=SEED)
+    .map(custom_augment, num_parallel_calls=AUTO)
+    .batch(BATCH_SIZE)
+    .prefetch(AUTO)
+)
+
+ssl_ds_two = tf.data.Dataset.from_tensor_slices(x_train)
+ssl_ds_two = (
+    ssl_ds_two.shuffle(1024, seed=SEED)
+    .map(custom_augment, num_parallel_calls=AUTO)
+    .batch(BATCH_SIZE)
+    .prefetch(AUTO)
+)
+
+# We then zip both of these datasets.
+ssl_ds = tf.data.Dataset.zip((ssl_ds_one, ssl_ds_two))
+
+# Visualize a few augmented images.
+sample_images_one = next(iter(ssl_ds_one))
+plt.figure(figsize=(10, 10))
+for n in range(25):
+    ax = plt.subplot(5, 5, n + 1)
+    plt.imshow(sample_images_one[n].numpy().astype("int"))
+    plt.axis("off")
+plt.show()
+
+# Ensure that the different versions of the dataset actually contain
+# identical images.
+sample_images_two = next(iter(ssl_ds_two))
+plt.figure(figsize=(10, 10))
+for n in range(25):
+    ax = plt.subplot(5, 5, n + 1)
+    plt.imshow(sample_images_two[n].numpy().astype("int"))
+    plt.axis("off")
+plt.show()
+
+"""
+Notice that the images in `sample_images_one` and `sample_images_two` are essentially
+the same but are augmented differently.
+"""
+
+"""
+## Defining the encoder and the predictor
+
+We use an implementation of ResNet20 that is specifically configured for the CIFAR10
+dataset. The code is taken from the
+[keras-idiomatic-programmer](https://github.com/GoogleCloudPlatform/keras-idiomatic-programmer/blob/master/zoo/resnet/resnet_cifar10_v2.py) repository. The hyperparameters of
+these architectures have been referred from Section 3 and Appendix A of [the original
+paper](https://arxiv.org/abs/2011.10566).
+"""
+
+"""shell
+wget -q https://shorturl.at/QS369 -O resnet_cifar10_v2.py
+"""
+
+import resnet_cifar10_v2
+
+N = 2
+DEPTH = N * 9 + 2
+NUM_BLOCKS = ((DEPTH - 2) // 9) - 1
+
+
+def get_encoder():
+    # Input and backbone.
+    inputs = layers.Input((CROP_TO, CROP_TO, 3))
+    x = layers.Rescaling(scale=1.0 / 127.5, offset=-1)(inputs)
+    x = resnet_cifar10_v2.stem(x)
+    x = resnet_cifar10_v2.learner(x, NUM_BLOCKS)
+    x = layers.GlobalAveragePooling2D(name="backbone_pool")(x)
+
+    # Projection head.
+    x = layers.Dense(
+        PROJECT_DIM,
+        use_bias=False,
+        kernel_regularizer=regularizers.l2(WEIGHT_DECAY),
+    )(x)
+    x = layers.BatchNormalization()(x)
+    x = layers.ReLU()(x)
+    x = layers.Dense(
+        PROJECT_DIM,
+        use_bias=False,
+        kernel_regularizer=regularizers.l2(WEIGHT_DECAY),
+    )(x)
+    outputs = layers.BatchNormalization()(x)
+    return keras.Model(inputs, outputs, name="encoder")
+
+
+def get_predictor():
+    model = keras.Sequential(
+        [
+            # Note the AutoEncoder-like structure.
+            layers.Input((PROJECT_DIM,)),
+            layers.Dense(
+                LATENT_DIM,
+                use_bias=False,
+                kernel_regularizer=regularizers.l2(WEIGHT_DECAY),
+            ),
+            layers.ReLU(),
+            layers.BatchNormalization(),
+            layers.Dense(PROJECT_DIM),
+        ],
+        name="predictor",
+    )
+    return model
+
+
+"""
+## Defining the (pre-)training loop
+
+One of the main reasons behind training networks with these kinds of approaches is to
+utilize the learned representations for downstream tasks like classification. This is why
+this particular training phase is also referred to as _pre-training_.
+
+We start by defining the loss function.
+"""
+
+
+def compute_loss(p, z):
+    # The authors of SimSiam emphasize the impact of
+    # the `stop_gradient` operator in the paper as it
+    # has an important role in the overall optimization.
+    z = tf.stop_gradient(z)
+    p = tf.math.l2_normalize(p, axis=1)
+    z = tf.math.l2_normalize(z, axis=1)
+    # Negative cosine similarity (minimizing this is
+    # equivalent to maximizing the similarity).
+    return -tf.reduce_mean(tf.reduce_sum((p * z), axis=1))
+
+
+"""
+We then define our training loop by overriding the `train_step()` function of the
+`keras.Model` class.
+"""
+
+
+class SimSiam(keras.Model):
+    def __init__(self, encoder, predictor):
+        super().__init__()
+        self.encoder = encoder
+        self.predictor = predictor
+        self.loss_tracker = keras.metrics.Mean(name="loss")
+
+    @property
+    def metrics(self):
+        return [self.loss_tracker]
+
+    def train_step(self, data):
+        # Unpack the data.
+        ds_one, ds_two = data
+
+        # Forward pass through the encoder and predictor.
+        with tf.GradientTape() as tape:
+            z1, z2 = self.encoder(ds_one), self.encoder(ds_two)
+            p1, p2 = self.predictor(z1), self.predictor(z2)
+            # Note that here we are enforcing the network to match
+            # the representations of two differently augmented batches
+            # of data.
+            loss = compute_loss(p1, z2) / 2 + compute_loss(p2, z1) / 2
+
+        # Compute gradients and update the parameters.
+        learnable_params = (
+            self.encoder.trainable_variables
+            + self.predictor.trainable_variables
+        )
+        gradients = tape.gradient(loss, learnable_params)
+        self.optimizer.apply_gradients(zip(gradients, learnable_params))
+
+        # Monitor loss.
+        self.loss_tracker.update_state(loss)
+        return {"loss": self.loss_tracker.result()}
+
+
+"""
+## Pre-training our networks
+
+In the interest of this example, we will train the model for only 5 epochs. In reality,
+this should at least be 100 epochs.
+"""
+
+# Create a cosine decay learning scheduler.
+num_training_samples = len(x_train)
+steps = EPOCHS * (num_training_samples // BATCH_SIZE)
+lr_decayed_fn = keras.optimizers.schedules.CosineDecay(
+    initial_learning_rate=0.03, decay_steps=steps
+)
+
+# Create an early stopping callback.
+early_stopping = keras.callbacks.EarlyStopping(
+    monitor="loss", patience=5, restore_best_weights=True
+)
+
+# Compile model and start training.
+simsiam = SimSiam(get_encoder(), get_predictor())
+simsiam.compile(optimizer=keras.optimizers.SGD(lr_decayed_fn, momentum=0.6))
+history = simsiam.fit(ssl_ds, epochs=EPOCHS, callbacks=[early_stopping])
+
+# Visualize the training progress of the model.
+plt.plot(history.history["loss"])
+plt.grid()
+plt.title("Negative Cosine Similarity")
+plt.show()
+
+"""
+If your solution gets very close to -1 (minimum value of our loss) very quickly with a
+different dataset and a different backbone architecture that is likely because of
+*representation collapse*. It is a phenomenon where the encoder yields similar output for
+all the images. In that case additional hyperparameter tuning is required especially in
+the following areas:
+
+* Strength of the color distortions and their probabilities.
+* Learning rate and its schedule.
+* Architecture of both the backbone and their projection head.
+
+"""
+
+"""
+## Evaluating our SSL method
+
+The most popularly used method to evaluate a SSL method in computer vision (or any other
+pre-training method as such) is to learn a linear classifier on the frozen features of
+the trained backbone model (in this case it is ResNet20) and evaluate the classifier on
+unseen images. Other methods include
+[fine-tuning](https://keras.io/guides/transfer_learning/) on the source dataset or even a
+target dataset with 5% or 10% labels present. Practically, we can use the backbone model
+for any downstream task such as semantic segmentation, object detection, and so on where
+the backbone models are usually pre-trained with *pure supervised learning*.
+"""
+
+# We first create labeled `Dataset` objects.
+train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+
+# Then we shuffle, batch, and prefetch this dataset for performance. We
+# also apply random resized crops as an augmentation but only to the
+# training set.
+train_ds = (
+    train_ds.shuffle(1024)
+    .map(lambda x, y: (flip_random_crop(x), y), num_parallel_calls=AUTO)
+    .batch(BATCH_SIZE)
+    .prefetch(AUTO)
+)
+test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTO)
+
+# Extract the backbone ResNet20.
+backbone = keras.Model(
+    simsiam.encoder.input, simsiam.encoder.get_layer("backbone_pool").output
+)
+
+# We then create our linear classifier and train it.
+backbone.trainable = False
+inputs = layers.Input((CROP_TO, CROP_TO, 3))
+x = backbone(inputs, training=False)
+outputs = layers.Dense(10, activation="softmax")(x)
+linear_model = keras.Model(inputs, outputs, name="linear_model")
+
+# Compile model and start training.
+linear_model.compile(
+    loss="sparse_categorical_crossentropy",
+    metrics=["accuracy"],
+    optimizer=keras.optimizers.SGD(lr_decayed_fn, momentum=0.9),
+)
+history = linear_model.fit(
+    train_ds, validation_data=test_ds, epochs=EPOCHS, callbacks=[early_stopping]
+)
+_, test_acc = linear_model.evaluate(test_ds)
+print("Test accuracy: {:.2f}%".format(test_acc * 100))
+
+"""
+
+## Notes
+* More data and longer pre-training schedule benefit SSL in general.
+* SSL is particularly helpful when you have access to only very limited *labeled*
+training data but you can manage to build a large corpus of unlabeled data. Recently,
+using an SSL method called [SwAV](https://arxiv.org/abs/2006.09882), a group of
+researchers at Facebook trained a [RegNet](https://arxiv.org/abs/2003.13678) on 2 Billion
+images. They were able to achieve downstream performance very close to those achieved by
+pure supervised pre-training. For some downstream tasks, their method even outperformed
+the supervised counterparts. You can check out [their
+paper](https://arxiv.org/pdf/2103.01988.pdf) to know the details.
+* If you are interested to understand why contrastive SSL helps networks learn meaningful
+representations, you can check out the following resources:
+   * [Self-supervised learning: The dark matter of
+intelligence](https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/)
+   * [Understanding self-supervised learning using controlled datasets with known
+structure](https://sslneuips20.github.io/files/CameraReadys%203-77/64/CameraReady/Understanding_self_supervised_learning.pdf)
+
+"""
diff --git a/examples/keras_io/tensorflow/vision/visualizing_what_convnets_learn.py b/examples/keras_io/tensorflow/vision/visualizing_what_convnets_learn.py
new file mode 100644
index 000000000..3b1233c16
--- /dev/null
+++ b/examples/keras_io/tensorflow/vision/visualizing_what_convnets_learn.py
@@ -0,0 +1,209 @@
+"""
+Title: Visualizing what convnets learn
+Author: [fchollet](https://twitter.com/fchollet)
+Date created: 2020/05/29
+Last modified: 2020/05/29
+Description: Displaying the visual patterns that convnet filters respond to.
+Accelerator: GPU
+"""
+"""
+## Introduction
+
+In this example, we look into what sort of visual patterns image classification models
+learn. We'll be using the `ResNet50V2` model, trained on the ImageNet dataset.
+
+Our process is simple: we will create input images that maximize the activation of
+specific filters in a target layer (picked somewhere in the middle of the model: layer
+`conv3_block4_out`). Such images represent a visualization of the
+pattern that the filter responds to.
+"""
+
+"""
+## Setup
+"""
+
+import os
+
+os.environ["KERAS_BACKEND"] = "tensorflow"
+
+import keras_core as keras
+
+
+import numpy as np
+import tensorflow as tf
+
+# The dimensions of our input image
+img_width = 180
+img_height = 180
+# Our target layer: we will visualize the filters from this layer.
+# See `model.summary()` for list of layer names, if you want to change this.
+layer_name = "conv3_block4_out"
+
+"""
+## Build a feature extraction model
+"""
+
+# Build a ResNet50V2 model loaded with pre-trained ImageNet weights
+model = keras.applications.ResNet50V2(weights="imagenet", include_top=False)
+
+# Set up a model that returns the activation values for our target layer
+layer = model.get_layer(name=layer_name)
+feature_extractor = keras.Model(inputs=model.inputs, outputs=layer.output)
+
+"""
+## Set up the gradient ascent process
+
+The "loss" we will maximize is simply the mean of the activation of a specific filter in
+our target layer. To avoid border effects, we exclude border pixels.
+"""
+
+
+def compute_loss(input_image, filter_index):
+    activation = feature_extractor(input_image)
+    # We avoid border artifacts by only involving non-border pixels in the loss.
+    filter_activation = activation[:, 2:-2, 2:-2, filter_index]
+    return tf.reduce_mean(filter_activation)
+
+
+"""
+Our gradient ascent function simply computes the gradients of the loss above
+with regard to the input image, and updates the input image so as to move it
+towards a state that will activate the target filter more strongly.
+"""
+
+
+@tf.function
+def gradient_ascent_step(img, filter_index, learning_rate):
+    with tf.GradientTape() as tape:
+        tape.watch(img)
+        loss = compute_loss(img, filter_index)
+    # Compute gradients.
+    grads = tape.gradient(loss, img)
+    # Normalize gradients.
+    grads = tf.math.l2_normalize(grads)
+    img += learning_rate * grads
+    return loss, img
+
+
+"""
+## Set up the end-to-end filter visualization loop
+
+Our process is as follows:
+
+- Start from a random image that is close to "all gray" (i.e. visually neutral)
+- Repeatedly apply the gradient ascent step function defined above
+- Convert the resulting input image back to a displayable form, by normalizing it,
+center-cropping it, and restricting it to the [0, 255] range.
+"""
+
+
+def initialize_image():
+    # We start from a gray image with some random noise
+    img = tf.random.uniform((1, img_width, img_height, 3))
+    # ResNet50V2 expects inputs in the range [-1, +1].
+    # Here we scale our random inputs to [-0.125, +0.125]
+    return (img - 0.5) * 0.25
+
+
+def visualize_filter(filter_index):
+    # We run gradient ascent for 30 steps
+    iterations = 30
+    learning_rate = 10.0
+    img = initialize_image()
+    for iteration in range(iterations):
+        loss, img = gradient_ascent_step(img, filter_index, learning_rate)
+
+    # Decode the resulting input image
+    img = deprocess_image(img[0].numpy())
+    return loss, img
+
+
+def deprocess_image(img):
+    # Normalize array: center on 0., ensure std is 0.15
+    img -= img.mean()
+    img /= img.std() + 1e-5
+    img *= 0.15
+
+    # Center crop: drop 25 pixels from each border
+    img = img[25:-25, 25:-25, :]
+
+    # Shift to center on 0.5, then clip to [0, 1]
+    img += 0.5
+    img = np.clip(img, 0, 1)
+
+    # Convert to a uint8 array in the [0, 255] range
+    img *= 255
+    img = np.clip(img, 0, 255).astype("uint8")
+    return img
+
+
+"""
+Let's try it out with filter 0 in the target layer:
+"""
+
+from IPython.display import Image, display
+
+loss, img = visualize_filter(0)
+keras.utils.save_img("0.png", img)
+
+"""
+This is what an input that maximizes the response of filter 0 in the target layer would
+look like:
+"""
+
+display(Image("0.png"))
+
+"""
+## Visualize the first 64 filters in the target layer
+
+Now, let's make a 8x8 grid of the first 64 filters
+in the target layer to get a feel for the range
+of different visual patterns that the model has learned.
+"""
+
+# Compute image inputs that maximize per-filter activations
+# for the first 64 filters of our target layer
+all_imgs = []
+for filter_index in range(64):
+    print("Processing filter %d" % (filter_index,))
+    loss, img = visualize_filter(filter_index)
+    all_imgs.append(img)
+
+# Build a black picture with enough space for
+# our 8 x 8 filters of size 130 x 130, with a 5px margin in between
+margin = 5
+n = 8
+cropped_width = img_width - 25 * 2
+cropped_height = img_height - 25 * 2
+width = n * cropped_width + (n - 1) * margin
+height = n * cropped_height + (n - 1) * margin
+stitched_filters = np.zeros((width, height, 3))
+
+# Fill the picture with our saved filters
+for i in range(n):
+    for j in range(n):
+        img = all_imgs[i * n + j]
+        stitched_filters[
+            (cropped_width + margin) * i : (cropped_width + margin) * i + cropped_width,
+            (cropped_height + margin) * j : (cropped_height + margin) * j
+            + cropped_height,
+            :,
+        ] = img
+keras.utils.save_img("stiched_filters.png", stitched_filters)
+
+from IPython.display import Image, display
+
+display(Image("stiched_filters.png"))
+
+"""
+Image classification models see the world by decomposing their inputs over a "vector
+basis" of texture filters such as these.
+
+See also
+[this old blog post](https://blog.keras.io/how-convolutional-neural-networks-see-the-world.html)
+for analysis and interpretation.
+
+Example available on HuggingFace.
+
+[![Generic badge](https://img.shields.io/badge/🤗%20Spaces-What%20Convnets%20Learn-black.svg)](https://huggingface.co/spaces/keras-io/what-convnets-learn)
+"""
diff --git a/examples/keras_io/vision/image_classifier.py b/examples/keras_io/vision/image_classifier.py
new file mode 100644
index 000000000..d681446dc
--- /dev/null
+++ b/examples/keras_io/vision/image_classifier.py
@@ -0,0 +1,695 @@
+# -*- coding: utf-8 -*-
+"""
+Author: [lukewood](https://lukewood.xyz)
+Date created: 03/28/2023
+Last modified: 07/25/2023
+Description: Use KerasCV to train powerful image classifiers.
+"""
+
+"""
+## Introduction
+
+Classification is the process of predicting a categorical label for a given
+input image.
+While classification is a relatively straightforward computer vision task,
+modern approaches still are built of several complex components.
+Luckily, KerasCV provides APIs to construct commonly used components.
+
+This guide demonstrates KerasCV's modular approach to solving image
+classification problems at three levels of complexity:
+
+- Inference with a pretrained classifier
+- Fine-tuning a pretrained backbone
+- Training an image classifier from scratch
+
+## Multi-Backend Support
+
+KerasCV's `ImageClassifier` model supports several backends like JAX, PyTorch,
+and TensorFlow with the help of `keras_core`. To enable multi-backend support
+in KerasCV, set the `KERAS_CV_MULTI_BACKEND` environment variable. We can
+then switch between different backends by setting the `KERAS_BACKEND`
+environment variable. Currently, `"tensorflow"`, `"jax"`, and `"torch"` are
+supported.
+
+This demonstration uses the JAX backend.
+"""
+
+import os
+
+os.environ["KERAS_CV_MULTI_BACKEND"] = "1"
+os.environ["KERAS_BACKEND"] = "jax"
+
+import json
+import math
+import keras_cv
+import keras_core as keras
+from keras_core import ops
+from keras_core import losses
+from keras_core import optimizers
+from keras_core.optimizers import schedules
+from keras_core import metrics
+import tensorflow as tf
+from tensorflow import data as tf_data
+import tensorflow_datasets as tfds
+import numpy as np
+
+"""## Inference with a pretrained classifier
+
+Let's get started with the simplest KerasCV API: a pretrained classifier.
+In this example, we will construct a classifier that was
+pretrained on the ImageNet dataset.
+We'll use this model to solve the age old "Cat or Dog" problem.
+
+The highest level module in KerasCV is a *task*. A *task* is a `keras.Model`
+consisting of a (generally pretrained) backbone model and task-specific
+layers. Here's an example using `keras_cv.models.ImageClassifier` with an
+EfficientNetV2B0 Backbone.
+
+EfficientNetV2B0 is a great starting model when constructing an image
+classification pipeline.
+This architecture manages to achieve high accuracy, while using a
+parameter count of 7M.
+If an EfficientNetV2B0 is not powerful enough for the task you are hoping to
+solve, be sure to check out
+[KerasCV's other available Backbones](https://github.com/keras-team/keras-cv/tree/master/keras_cv/models/backbones)!
+"""
+
+classifier = keras_cv.models.ImageClassifier.from_preset(
+    "efficientnetv2_b0_imagenet_classifier"
+)
+
+"""You may notice a small deviation from the old `keras.applications` API;
+where you would construct the class with
+`EfficientNetV2B0(weights="imagenet")`. While the old API was great for
+classification, it did not scale effectively to other use cases that required
+complex architectures, like object detection and semantic segmentation.
+
+Now that our classifier is built, let's apply it to this cute cat picture!
+"""
+
+filepath = keras.utils.get_file(origin="https://i.imgur.com/9i63gLN.jpg")
+image = keras.utils.load_img(filepath)
+image = np.array(image)
+keras_cv.visualization.plot_image_gallery(
+    image[None, ...], rows=1, cols=1, value_range=(0, 255), show=True, scale=4
+)
+
+"""Next, let's get some predictions from our classifier:"""
+
+predictions = classifier.predict(np.expand_dims(image, axis=0))
+
+"""Predictions come in the form of softmax-ed category rankings.
+We can find the index of the top classes using a simple argsort function:
+"""
+
+top_classes = predictions[0].argsort(axis=-1)
+
+"""In order to decode the class mappings, we can construct a mapping from
+category indices to ImageNet class names.
+For convenience, I've stored the ImageNet class mapping in a GitHub gist.
+Let's download and load it now.
+"""
+
+classes = keras.utils.get_file(
+    origin="https://gist.githubusercontent.com/LukeWood/62eebcd5c5c4a4d0e0b7845780f76d55/raw/fde63e5e4c09e2fa0a3436680f436bdcb8325aac/ImagenetClassnames.json"
+)
+with open(classes, "rb") as f:
+    classes = json.load(f)
+
+"""Now we can simply look up the class names via index:"""
+
+top_two = [classes[str(i)] for i in top_classes[-2:]]
+print("Top two classes are:", top_two)
+
+"""Great!  Both of these appear to be correct!
+However, one of the classes is "Velvet".
+We're trying to classify Cats VS Dogs.
+We don't care about the velvet blanket!
+
+Ideally, we'd have a classifier that only performs computation to determine if
+an image is a cat or a dog, and has all of its resources dedicated to this
+task. This can be solved by fine tuning our own classifier.
+
+# Fine tuning a pretrained classifier
+
+When labeled images specific to our task are available, fine-tuning a custom
+classifier can improve performance.
+If we want to train a Cats vs Dogs Classifier, using explicitly labeled Cat vs
+Dog data should perform better than the generic classifier!
+For many tasks, no relevant pretrained model
+will be available (e.g., categorizing images specific to your application).
+
+First, let's get started by loading some data:
+"""
+
+BATCH_SIZE = 32
+IMAGE_SIZE = (224, 224)
+AUTOTUNE = tf_data.AUTOTUNE
+tfds.disable_progress_bar()
+
+data, dataset_info = tfds.load(
+    "cats_vs_dogs",
+    with_info=True,
+    as_supervised=True
+)
+train_steps_per_epoch = (
+    dataset_info.splits["train"].num_examples // BATCH_SIZE
+)
+train_dataset = data["train"]
+
+num_classes = dataset_info.features["label"].num_classes
+
+resizing = keras_cv.layers.Resizing(
+    IMAGE_SIZE[0], IMAGE_SIZE[1], crop_to_aspect_ratio=True
+)
+encoder = keras.layers.CategoryEncoding(num_classes, "one_hot", dtype="int32")
+
+
+def preprocess_inputs(image, label):
+    # Statically resize images as we only iterate the dataset once.
+    return resizing(image), encoder(label)
+
+
+# Shuffle the dataset to increase diversity of batches.
+# 10*BATCH_SIZE follows the assumption that bigger machines can handle bigger
+# shuffle buffers.
+train_dataset = train_dataset.shuffle(
+    10 * BATCH_SIZE, reshuffle_each_iteration=True
+).map(preprocess_inputs, num_parallel_calls=AUTOTUNE)
+train_dataset = train_dataset.batch(BATCH_SIZE)
+
+images = next(iter(train_dataset.take(1)))[0]
+keras_cv.visualization.plot_image_gallery(images, value_range=(0, 255))
+
+"""Meow!
+
+Next let's construct our model.
+The use of imagenet in the preset name indicates that the backbone was
+pretrained on the ImageNet dataset.
+Pretrained backbones extract more information from our labeled examples by
+leveraging patterns extracted from potentially much larger datasets.
+
+Next, let's put together our classifier:
+"""
+
+model = keras_cv.models.ImageClassifier.from_preset(
+    "efficientnetv2_b0_imagenet", num_classes=2
+)
+model.compile(
+    loss="categorical_crossentropy",
+    optimizer=keras.optimizers.SGD(learning_rate=0.01),
+    metrics=["accuracy"],
+)
+
+"""Here our classifier is just a simple `keras.Sequential`.
+All that is left to do is call `model.fit()`:
+"""
+
+model.fit(train_dataset)
+
+"""Let's look at how our model performs after the fine tuning:"""
+
+predictions = model.predict(np.expand_dims(image, axis=0))
+
+classes = {0: "cat", 1: "dog"}
+print("Top class is:", classes[predictions[0].argmax()])
+
+"""Awesome - looks like the model correctly classified the image.
+
+# Train a Classifier from Scratch
+
+Now that we've gotten our hands dirty with classification, let's take on one
+last task: training a classification model from scratch!
+A standard benchmark for image classification is the ImageNet dataset, however
+due to licensing constraints we will use the CalTech 101 image classification
+dataset in this tutorial.
+While we use the simpler CalTech 101 dataset in this guide, the same training
+template may be used on ImageNet to achieve near state-of-the-art scores.
+
+Let's start out by tackling data loading:
+"""
+
+NUM_CLASSES = 101
+# Change epochs to 100~ to fully train.
+EPOCHS = 1
+
+encoder = keras.layers.CategoryEncoding(NUM_CLASSES, "one_hot", dtype="int32")
+
+
+def package_inputs(image, label):
+    return {"images": image, "labels": encoder(label)}
+
+
+train_ds, eval_ds = tfds.load(
+    "caltech101", split=["train", "test"], as_supervised="true"
+)
+train_ds = train_ds.map(package_inputs, num_parallel_calls=tf_data.AUTOTUNE)
+eval_ds = eval_ds.map(package_inputs, num_parallel_calls=tf_data.AUTOTUNE)
+
+train_ds = train_ds.shuffle(BATCH_SIZE * 16)
+
+"""The CalTech101 dataset has different sizes for every image, so we use the
+`ragged_batch()` API to batch them together while maintaining each individual
+image's shape information.
+"""
+
+train_ds = train_ds.ragged_batch(BATCH_SIZE)
+eval_ds = eval_ds.ragged_batch(BATCH_SIZE)
+
+batch = next(iter(train_ds.take(1)))
+image_batch = batch["images"]
+label_batch = batch["labels"]
+
+keras_cv.visualization.plot_image_gallery(
+    image_batch.to_tensor(),
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""## Data Augmentation
+
+In our previous finetuning example, we performed a static resizing operation
+and did not utilize any image augmentation.
+This is because a single pass over the training set was sufficient to achieve
+decent results.
+When training to solve a more difficult task, you'll want to include data
+augmentation in your data pipeline.
+
+Data augmentation is a technique to make your model robust to changes in input
+data such as lighting, cropping, and orientation.
+KerasCV includes some of the most useful augmentations in the
+`keras_cv.layers` API.
+Creating an optimal pipeline of augmentations is an art, but in this section
+of the guide we'll offer some tips on best practices for classification.
+
+One caveat to be aware of with image data augmentation is that you must be
+careful to not shift your augmented data distribution too far from the
+original data distribution.
+The goal is to prevent overfitting and increase generalization,
+but samples that lie completely out of the data distribution simply add noise
+to the training process.
+
+The first augmentation we'll use is `RandomFlip`.
+This augmentation behaves more or less how you'd expect: it either flips the
+image or not.
+While this augmentation is useful in CalTech101 and ImageNet, it should be
+noted that it should not be used on tasks where the data distribution is not
+vertical mirror invariant.
+An example of a dataset where this occurs is MNIST hand written digits.
+Flipping a `6` over the
+vertical axis will make the digit appear more like a `7` than a `6`, but the
+label will still show a `6`.
+"""
+
+random_flip = keras_cv.layers.RandomFlip()
+augmenters = [random_flip]
+
+image_batch = random_flip(image_batch)
+keras_cv.visualization.plot_image_gallery(
+    image_batch.to_tensor(),
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""Half of the images have been flipped!
+
+The next augmentation we'll use is `RandomCropAndResize`.
+This operation selects a random subset of the image, then resizes it to the
+provided target size.
+By using this augmentation, we force our classifier to become spatially
+invariant.
+Additionally, this layer accepts an `aspect_ratio_factor` which can be used to
+distort the aspect ratio of the image.
+While this can improve model performance, it should be used with caution.
+It is very easy for an aspect ratio distortion to shift a sample too far from
+the original training set's data distribution.
+Remember - the goal of data augmentation is to produce more training samples
+that align with the data distribution of your training set!
+
+`RandomCropAndResize` also can handle `tf.RaggedTensor` inputs.  In the
+CalTech101 image dataset images come in a wide variety of sizes.
+As such they cannot easily be batched together into a dense training batch.
+Luckily, `RandomCropAndResize` handles the Ragged -> Dense conversion process
+for you!
+
+Let's add a `RandomCropAndResize` to our set of augmentations:
+"""
+
+crop_and_resize = keras_cv.layers.RandomCropAndResize(
+    target_size=IMAGE_SIZE,
+    crop_area_factor=(0.8, 1.0),
+    aspect_ratio_factor=(0.9, 1.1),
+)
+augmenters += [crop_and_resize]
+
+image_batch = crop_and_resize(image_batch)
+keras_cv.visualization.plot_image_gallery(
+    image_batch,
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""Great!  We are now working with a batch of dense images.
+Next up, let's include some spatial and color-based jitter to our training set.
+This will allow us to produce a classifier that is robust to lighting
+flickers, shadows, and more.
+
+There are limitless ways to augment an image by altering color and spatial
+features, but perhaps the most battle tested technique is
+[`RandAugment`](https://arxiv.org/abs/1909.13719).
+`RandAugment` is actually a set of 10 different augmentations:
+`AutoContrast`, `Equalize`, `Solarize`, `RandomColorJitter`, `RandomContrast`,
+`RandomBrightness`, `ShearX`, `ShearY`, `TranslateX` and `TranslateY`.
+On each call, `augmentations_per_image` augmenters are sampled for each image,
+and random magnitude factors are sampled for each.
+These augmentations are then applied sequentially.
+
+KerasCV makes tuning these parameters easy using the `augmentations_per_image`
+and `magnitude` parameters!
+Let's take it for a spin:
+"""
+
+rand_augment = keras_cv.layers.RandAugment(
+    augmentations_per_image=3,
+    magnitude=0.3,
+    value_range=(0, 255),
+)
+augmenters += [rand_augment]
+
+image_batch = rand_augment(image_batch)
+keras_cv.visualization.plot_image_gallery(
+    image_batch,
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""Looks great; but we're not done yet!
+What if an image is missing one critical feature of a class?  For example,
+what if a leaf is blocking the view of a cat's ear, but our classifier
+learned to classify cats simply by observing their ears?
+
+One easy approach to tackling this is to use `RandomCutout`, which randomly
+strips out a sub-section of the image:
+"""
+
+random_cutout = keras_cv.layers.RandomCutout(
+    width_factor=0.4, height_factor=0.4
+)
+keras_cv.visualization.plot_image_gallery(
+    random_cutout(image_batch),
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""While this tackles the problem reasonably well, it can cause the classifier
+to develop responses to borders between features and black pixel areas caused
+by the cutout.
+
+[`CutMix`](https://arxiv.org/abs/1905.04899) solves the same issue by using
+a more complex (and more effective) technique.
+Instead of replacing the cut-out areas with black pixels, `CutMix` replaces
+these regions with regions of other images sampled from within your training
+set!
+Following this replacement, the image's classification label is updated to be
+a blend of the original and mixed image's class label.
+
+What does this look like in practice?  Let's check it out:
+"""
+
+cut_mix = keras_cv.layers.CutMix()
+# CutMix needs to modify both images and labels
+inputs = {"images": image_batch, "labels": tf.cast(label_batch, "float32")}
+
+keras_cv.visualization.plot_image_gallery(
+    cut_mix(inputs)["images"],
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""Let's hold off from adding it to our augmenter for a minute - more on that
+soon!
+
+Next, let's look into `MixUp()`.
+Unfortunately, while `MixUp()` has been empirically shown to *substantially*
+improve both the robustness and the generalization of the trained model,
+it is not well-understood why such improvement occurs... but
+a little alchemy never hurt anyone!
+
+`MixUp()` works by sampling two images from a batch, then proceeding to
+literally blend together their pixel intensities as well as their
+classification labels.
+
+Let's see it in action:
+"""
+
+mix_up = keras_cv.layers.MixUp()
+# MixUp needs to modify both images and labels
+inputs = {"images": image_batch, "labels": tf.cast(label_batch, "float32")}
+
+keras_cv.visualization.plot_image_gallery(
+    mix_up(inputs)["images"],
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""If you look closely, you'll see that the images have been blended together.
+
+Instead of applying `CutMix()` and `MixUp()` to every image, we instead pick
+one or the other to apply to each batch.
+This can be expressed using `keras_cv.layers.RandomChoice()`
+"""
+
+cut_mix_or_mix_up = keras_cv.layers.RandomChoice(
+    [cut_mix, mix_up], batchwise=True
+)
+augmenters += [cut_mix_or_mix_up]
+
+"""Now let's apply our final augmenter to the training data:"""
+
+augmenter = keras_cv.layers.Augmenter(augmenters)
+train_ds = train_ds.map(augmenter, num_parallel_calls=tf_data.AUTOTUNE)
+
+image_batch = next(iter(train_ds.take(1)))["images"]
+keras_cv.visualization.plot_image_gallery(
+    image_batch,
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""We also need to resize our evaluation set to get dense batches of the image
+size expected by our model. We use the deterministic
+`keras_cv.layers.Resizing` in this case to avoid adding noise to our
+evaluation metric.
+"""
+
+inference_resizing = keras_cv.layers.Resizing(
+    IMAGE_SIZE[0], IMAGE_SIZE[1], crop_to_aspect_ratio=True
+)
+eval_ds = eval_ds.map(inference_resizing, num_parallel_calls=tf_data.AUTOTUNE)
+
+inference_resizing = keras_cv.layers.Resizing(
+    IMAGE_SIZE[0], IMAGE_SIZE[1], crop_to_aspect_ratio=True
+)
+eval_ds = eval_ds.map(inference_resizing, num_parallel_calls=tf_data.AUTOTUNE)
+
+image_batch = next(iter(eval_ds.take(1)))["images"]
+keras_cv.visualization.plot_image_gallery(
+    image_batch,
+    rows=3,
+    cols=3,
+    value_range=(0, 255),
+    show=True,
+)
+
+"""Finally, let's unpackage our datasets and prepare to pass them to
+`model.fit()`, which accepts a tuple of `(images, labels)`.
+"""
+
+def unpackage_dict(inputs):
+    return inputs["images"], inputs["labels"]
+
+
+train_ds = train_ds.map(unpackage_dict, num_parallel_calls=tf_data.AUTOTUNE)
+eval_ds = eval_ds.map(unpackage_dict, num_parallel_calls=tf_data.AUTOTUNE)
+
+"""Data augmentation is by far the hardest piece of training a modern
+classifier.
+Congratulations on making it this far!
+
+## Optimizer Tuning
+
+To achieve optimal performance, we need to use a learning rate schedule
+instead of a single learning rate. While we won't go into detail on the
+Cosine decay with warmup schedule used here, [you can read more about it
+here](https://scorrea92.medium.com/cosine-learning-rate-decay-e8b50aa455b).
+"""
+
+def lr_warmup_cosine_decay(
+    global_step,
+    warmup_steps,
+    hold=0,
+    total_steps=0,
+    start_lr=0.0,
+    target_lr=1e-2,
+):
+    # Cosine decay
+    learning_rate = (
+        0.5
+        * target_lr
+        * (
+            1
+            + ops.cos(
+                math.pi
+                * ops.convert_to_tensor(
+                    global_step - warmup_steps - hold, dtype="float32"
+                )
+                / ops.convert_to_tensor(
+                    total_steps - warmup_steps - hold, dtype="float32"
+                )
+            )
+        )
+    )
+
+    warmup_lr = (target_lr * (global_step / warmup_steps))
+
+    if hold > 0:
+        learning_rate = ops.where(
+            global_step > warmup_steps + hold, learning_rate, target_lr
+        )
+
+    learning_rate = ops.where(
+        global_step < warmup_steps, warmup_lr, learning_rate
+    )
+    return learning_rate
+
+
+class WarmUpCosineDecay(
+    schedules.LearningRateSchedule
+):
+    def __init__(
+        self, warmup_steps, total_steps, hold, start_lr=0.0, target_lr=1e-2
+    ):
+        super().__init__()
+        self.start_lr = start_lr
+        self.target_lr = target_lr
+        self.warmup_steps = warmup_steps
+        self.total_steps = total_steps
+        self.hold = hold
+
+    def __call__(self, step):
+        lr = lr_warmup_cosine_decay(
+            global_step=step,
+            total_steps=self.total_steps,
+            warmup_steps=self.warmup_steps,
+            start_lr=self.start_lr,
+            target_lr=self.target_lr,
+            hold=self.hold,
+        )
+
+        return ops.where(step > self.total_steps, 0.0, lr)
+
+"""![WarmUpCosineDecay schedule](https://i.imgur.com/YCr5pII.png)
+
+The schedule looks as we expect.
+
+Next let's construct this optimizer:
+"""
+
+total_images = 9000
+total_steps = (total_images // BATCH_SIZE) * EPOCHS
+warmup_steps = int(0.1 * total_steps)
+hold_steps = int(0.45 * total_steps)
+schedule = WarmUpCosineDecay(
+    start_lr=0.05,
+    target_lr=1e-2,
+    warmup_steps=warmup_steps,
+    total_steps=total_steps,
+    hold=hold_steps,
+)
+optimizer = optimizers.SGD(
+    weight_decay=5e-4,
+    learning_rate=schedule,
+    momentum=0.9,
+)
+
+"""At long last, we can now build our model and call `fit()`!
+`keras_cv.models.ResNet18V2Backbone()` is a convenience alias for
+`keras_cv.models.ResNetV2Backbone.from_preset('resnet18_v2')`.
+Note that this preset does not come with any pretrained weights.
+"""
+
+backbone = keras_cv.models.ResNet18V2Backbone()
+model = keras.Sequential(
+    [
+        backbone,
+        keras.layers.GlobalMaxPooling2D(),
+        keras.layers.Dropout(rate=0.5),
+        keras.layers.Dense(101, activation="softmax"),
+    ]
+)
+
+"""Since the labels produced by MixUp() and CutMix() are somewhat artificial,
+we employ label smoothing to prevent the model from overfitting to artifacts
+of this augmentation process.
+"""
+
+loss = losses.CategoricalCrossentropy(label_smoothing=0.1)
+
+"""Let's compile our model:"""
+
+model.compile(
+    loss=loss,
+    optimizer=optimizer,
+    metrics=[
+        metrics.CategoricalAccuracy(),
+        metrics.TopKCategoricalAccuracy(k=5),
+    ],
+)
+
+"""and finally call fit()."""
+
+model.fit(
+    train_ds,
+    epochs=EPOCHS,
+    validation_data=eval_ds,
+)
+
+"""Congratulations!  You now know how to train a powerful image classifier
+from scratch in KerasCV.
+Depending on the availability of labeled data for your application, training
+from scratch may or may not be more powerful than using transfer learning in
+addition to the data augmentations discussed above. For smaller datasets,
+pretrained models generally produce high accuracy and faster convergence.
+
+## Conclusions
+
+While image classification is perhaps the simplest problem in computer vision,
+the modern landscape has numerous complex components.
+Luckily, KerasCV offers robust, production-grade APIs to make assembling most
+of these components possible in one line of code.
+Through the use of KerasCV's `ImageClassifier` API, pretrained weights, and
+KerasCV data augmentations you can assemble everything you need to train a
+powerful classifier in a few hundred lines of code!
+
+As a follow up exercise, give the following a try:
+
+- Fine tune a KerasCV classifier on your own dataset
+- Learn more about [KerasCV's data augmentations](https://keras.io/guides/keras_cv/cut_mix_mix_up_and_rand_augment/)
+- Check out how we train our models on [ImageNet](https://github.com/keras-team/keras-cv/blob/master/examples/training/classification/imagenet/basic_training.py)
+"""
diff --git a/keras_core/callbacks/learning_rate_scheduler.py b/keras_core/callbacks/learning_rate_scheduler.py
index 6218f8e07..a72c44661 100644
--- a/keras_core/callbacks/learning_rate_scheduler.py
+++ b/keras_core/callbacks/learning_rate_scheduler.py
@@ -76,4 +76,6 @@ class LearningRateScheduler(Callback):
 
     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
-        logs["learning_rate"] = self.model.optimizer.learning_rate.value
+        logs["learning_rate"] = float(
+            backend.convert_to_numpy(self.model.optimizer.learning_rate)
+        )
diff --git a/keras_core/callbacks/learning_rate_scheduler_test.py b/keras_core/callbacks/learning_rate_scheduler_test.py
index 568446e6a..633e2e68b 100644
--- a/keras_core/callbacks/learning_rate_scheduler_test.py
+++ b/keras_core/callbacks/learning_rate_scheduler_test.py
@@ -107,3 +107,18 @@ class LearningRateSchedulerTest(testing.TestCase):
                 callbacks=[lr_scheduler],
                 epochs=2,
             )
+
+    @pytest.mark.requires_trainable_backend
+    def test_learning_rate_in_history(self):
+        lr_scheduler = callbacks.LearningRateScheduler(lambda step, lr: 0.5)
+
+        history = self.model.fit(
+            self.x_train,
+            self.y_train,
+            callbacks=[lr_scheduler],
+            epochs=1,
+        )
+
+        self.assertTrue("learning_rate" in history.history)
+        self.assertEqual(type(history.history["learning_rate"][0]), float)
+        self.assertEqual(history.history["learning_rate"][0], 0.5)
diff --git a/keras_core/callbacks/tensorboard_test.py b/keras_core/callbacks/tensorboard_test.py
index eecad42b5..ca41a7069 100644
--- a/keras_core/callbacks/tensorboard_test.py
+++ b/keras_core/callbacks/tensorboard_test.py
@@ -391,7 +391,6 @@ class TestTensorBoardV2(testing.TestCase):
             },
         )
         expected_image_summaries = {
-            _ObservedSummary(logdir=train_dir, tag="image"),
             _ObservedSummary(logdir=train_dir, tag="bias/image"),
             _ObservedSummary(logdir=train_dir, tag="kernel/image"),
         }
diff --git a/keras_core/layers/activations/elu_test.py b/keras_core/layers/activations/elu_test.py
index 6b85c1903..77c13ac4f 100644
--- a/keras_core/layers/activations/elu_test.py
+++ b/keras_core/layers/activations/elu_test.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-import tensorflow as tf
 
 from keras_core import testing
 from keras_core.layers.activations import elu
@@ -21,11 +20,12 @@ class ELUTest(testing.TestCase):
         )
 
     def test_correctness(self):
+        def np_elu(x, alpha=1.0):
+            return np.where(x > 0, x, alpha * np.expm1(x))
+
-        x = np.random.random((2, 2, 5))
+        x = np.random.randn(2, 2, 5)  # both signs, so `alpha` is exercised
         elu_layer = elu.ELU()
-        tf_elu_layer = tf.keras.layers.ELU()
-        self.assertAllClose(elu_layer(x), tf_elu_layer(x))
+        self.assertAllClose(elu_layer(x), np_elu(x))
 
         elu_layer = elu.ELU(alpha=0.7)
-        tf_elu_layer = tf.keras.layers.ELU(alpha=0.7)
-        self.assertAllClose(elu_layer(x), tf_elu_layer(x))
+        self.assertAllClose(elu_layer(x), np_elu(x, alpha=0.7))
diff --git a/keras_core/layers/activations/prelu_test.py b/keras_core/layers/activations/prelu_test.py
index ea4f79559..73666c774 100644
--- a/keras_core/layers/activations/prelu_test.py
+++ b/keras_core/layers/activations/prelu_test.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-import tensorflow as tf
 
 from keras_core import testing
 from keras_core.layers.activations import prelu
@@ -22,6 +21,9 @@ class PReLUTest(testing.TestCase):
         )
 
     def test_prelu_correctness(self):
+        def np_prelu(x, alpha):
+            return np.where(x > 0, x, alpha * x)  # select: no 0 * inf -> nan
+
         inputs = np.random.randn(2, 10, 5, 3)
         prelu_layer = prelu.PReLU(
             alpha_initializer="glorot_uniform",
@@ -29,18 +31,9 @@ class PReLUTest(testing.TestCase):
             alpha_constraint="non_neg",
             shared_axes=(1, 2),
         )
-        tf_prelu_layer = tf.keras.layers.PReLU(
-            alpha_initializer="glorot_uniform",
-            alpha_regularizer="l1",
-            alpha_constraint="non_neg",
-            shared_axes=(1, 2),
-        )
-
         prelu_layer.build(inputs.shape)
-        tf_prelu_layer.build(inputs.shape)
 
         weights = np.random.random((1, 1, 3))
         prelu_layer.alpha.assign(weights)
-        tf_prelu_layer.alpha.assign(weights)
-
-        self.assertAllClose(prelu_layer(inputs), tf_prelu_layer(inputs))
+        ref_out = np_prelu(inputs, weights)
+        self.assertAllClose(prelu_layer(inputs), ref_out)
diff --git a/keras_core/layers/attention/multi_head_attention.py b/keras_core/layers/attention/multi_head_attention.py
index bb127b77e..37f93258d 100644
--- a/keras_core/layers/attention/multi_head_attention.py
+++ b/keras_core/layers/attention/multi_head_attention.py
@@ -4,6 +4,7 @@ import string
 
 import numpy as np
 
+from keras_core import backend
 from keras_core import constraints
 from keras_core import initializers
 from keras_core import ops
@@ -115,6 +116,8 @@ class MultiHeadAttention(Layer):
         self.supports_masking = True
         self._num_heads = num_heads
         self._key_dim = key_dim
+        # Cache 1.0 / math.sqrt(self._key_dim).
+        self._inverse_sqrt_key_dim = None
         self._value_dim = value_dim if value_dim else key_dim
         self._dropout = dropout
         self._use_bias = use_bias
@@ -311,6 +314,9 @@ class MultiHeadAttention(Layer):
         )
         self._softmax = Softmax(axis=norm_axes)
         self._dropout_layer = Dropout(rate=self._dropout)
+        self._inverse_sqrt_key_dim = backend.convert_to_tensor(
+            1.0 / math.sqrt(float(self._key_dim))
+        )
 
     def _masked_softmax(self, attention_scores, attention_mask=None):
         # Normalize the attention scores to probabilities.
@@ -355,7 +361,11 @@ class MultiHeadAttention(Layer):
         # Note: Applying scalar multiply at the smaller end of einsum improves
         # XLA performance, but may introduce slight numeric differences in
         # the Transformer attention head.
-        query = ops.multiply(query, 1.0 / math.sqrt(float(self._key_dim)))
+        # The cached scale is built without an explicit dtype: cast it to
+        # the query dtype so both operands of the multiply agree.
+        query = ops.multiply(
+            query, ops.cast(self._inverse_sqrt_key_dim, query.dtype)
+        )
 
         # Take the dot product between "query" and "key" to get the raw
         # attention scores.
diff --git a/keras_core/layers/core/dense.py b/keras_core/layers/core/dense.py
index d7d70324c..d2b50fb30 100644
--- a/keras_core/layers/core/dense.py
+++ b/keras_core/layers/core/dense.py
@@ -87,12 +87,14 @@ class Dense(Layer):
     def build(self, input_shape):
         input_dim = input_shape[-1]
         self.kernel = self.add_weight(
+            name="kernel",
             shape=(input_dim, self.units),
             initializer=self.kernel_initializer,
             regularizer=self.kernel_regularizer,
         )
         if self.use_bias:
             self.bias = self.add_weight(
+                name="bias",
                 shape=(self.units,),
                 initializer=self.bias_initializer,
                 regularizer=self.bias_regularizer,
diff --git a/keras_core/layers/pooling/global_average_pooling_test.py b/keras_core/layers/pooling/global_average_pooling_test.py
index 76f5afcb2..60a832742 100644
--- a/keras_core/layers/pooling/global_average_pooling_test.py
+++ b/keras_core/layers/pooling/global_average_pooling_test.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-import tensorflow as tf
 from absl.testing import parameterized
 
 from keras_core import layers
@@ -95,21 +94,30 @@ class GlobalAveragePoolingCorrectnessTest(
         ("channels_last", False),
         ("channels_last", True),
         ("channels_first", False),
+        ("channels_first", True),
     )
     def test_global_average_pooling1d(self, data_format, keepdims):
-        inputs = np.arange(24, dtype="float32").reshape((2, 3, 4))
+        def np_gap1d(x, data_format, keepdims, mask=None):
+            channels_last = data_format == "channels_last"
+            steps_axis = 1 if channels_last else 2
+            if mask is None:
+                res = np.mean(x, axis=steps_axis)
+            else:
+                mask = np.expand_dims(mask, 2 if channels_last else 1)
+                # Out-of-place product so the caller's array is not mutated.
+                total = np.sum(x * mask, axis=steps_axis)
+                res = total / np.sum(mask, axis=steps_axis)
+            if keepdims:
+                res = np.expand_dims(res, axis=steps_axis)
+            return res
 
+        inputs = np.arange(24, dtype="float32").reshape((2, 3, 4))
         layer = layers.GlobalAveragePooling1D(
             data_format=data_format,
             keepdims=keepdims,
         )
-        tf_keras_layer = tf.keras.layers.GlobalAveragePooling1D(
-            data_format=data_format,
-            keepdims=keepdims,
-        )
-
         outputs = layer(inputs)
-        expected = tf_keras_layer(inputs)
+        expected = np_gap1d(inputs, data_format, keepdims)
         self.assertAllClose(outputs, expected)
 
         if data_format == "channels_last":
@@ -117,47 +125,53 @@ class GlobalAveragePoolingCorrectnessTest(
         else:
             mask = np.array([[1, 1, 0, 0], [0, 1, 0, 1]], dtype="int32")
         outputs = layer(inputs, mask)
-        expected = tf_keras_layer(inputs, mask)
+        expected = np_gap1d(inputs, data_format, keepdims, mask)
         self.assertAllClose(outputs, expected)
 
     @parameterized.parameters(
         ("channels_last", False),
         ("channels_last", True),
         ("channels_first", False),
+        ("channels_first", True),
     )
     def test_global_average_pooling2d(self, data_format, keepdims):
-        inputs = np.arange(96, dtype="float32").reshape((2, 3, 4, 4))
+        def np_gap2d(x, data_format, keepdims):
+            # NumPy reference. Only the spatial axes are reduced: a bare
+            # `squeeze()` would also drop a size-1 batch or channel axis.
+            if data_format == "channels_last":
+                return np.mean(x, axis=(1, 2), keepdims=keepdims)
+            return np.mean(x, axis=(2, 3), keepdims=keepdims)
 
+        inputs = np.arange(96, dtype="float32").reshape((2, 3, 4, 4))
         layer = layers.GlobalAveragePooling2D(
             data_format=data_format,
             keepdims=keepdims,
         )
-        tf_keras_layer = tf.keras.layers.GlobalAveragePooling2D(
-            data_format=data_format,
-            keepdims=keepdims,
-        )
-
         outputs = layer(inputs)
-        expected = tf_keras_layer(inputs)
+        expected = np_gap2d(inputs, data_format, keepdims)
         self.assertAllClose(outputs, expected)
 
     @parameterized.parameters(
         ("channels_last", False),
         ("channels_last", True),
         ("channels_first", False),
+        ("channels_first", True),
     )
     def test_global_average_pooling3d(self, data_format, keepdims):
-        inputs = np.arange(360, dtype="float32").reshape((2, 3, 3, 5, 4))
+        def np_gap3d(x, data_format, keepdims):
+            # NumPy reference. Only the spatial axes are reduced: a bare
+            # `squeeze()` would also drop a size-1 batch or channel axis.
+            if data_format == "channels_last":
+                spatial_axes = (1, 2, 3)
+            else:
+                spatial_axes = (2, 3, 4)
+            return np.mean(x, axis=spatial_axes, keepdims=keepdims)
 
+        inputs = np.arange(360, dtype="float32").reshape((2, 3, 3, 5, 4))
         layer = layers.GlobalAveragePooling3D(
             data_format=data_format,
             keepdims=keepdims,
         )
-        tf_keras_layer = tf.keras.layers.GlobalAveragePooling3D(
-            data_format=data_format,
-            keepdims=keepdims,
-        )
-
         outputs = layer(inputs)
-        expected = tf_keras_layer(inputs)
+        expected = np_gap3d(inputs, data_format, keepdims)
         self.assertAllClose(outputs, expected)
diff --git a/keras_core/ops/image.py b/keras_core/ops/image.py
index a7af118b8..7351a7425 100644
--- a/keras_core/ops/image.py
+++ b/keras_core/ops/image.py
@@ -271,6 +271,8 @@ class ExtractPatches(Operation):
         data_format="channels_last",
     ):
         super().__init__()
+        if isinstance(size, int):
+            size = (size, size)
         self.size = size
         self.strides = strides
         self.dilation_rate = dilation_rate
@@ -348,14 +350,16 @@ def extract_patches(
 
     Examples:
 
-    >>> image = np.random.random((1, 20, 20, 3)) # batch of 2 RGB images
+    >>> image = np.random.random(
+    ...     (2, 20, 20, 3)
+    ... ).astype("float32") # batch of 2 RGB images
     >>> patches = keras_core.ops.image.extract_patches(image, (5, 5))
     >>> patches.shape
-    (1, 4, 4, 75)
-    >>> image = np.random.random((20, 20, 3)) # batch of 2 RGB images
+    (2, 4, 4, 75)
+    >>> image = np.random.random((20, 20, 3)).astype("float32") # 1 RGB image
     >>> patches = keras_core.ops.image.extract_patches(image, (3, 3), (1, 1))
     >>> patches.shape
-    (4, 4, 75)
+    (18, 18, 27)
     """
     if any_symbolic_tensors((image,)):
         return ExtractPatches(
diff --git a/keras_core/ops/image_test.py b/keras_core/ops/image_test.py
index 6452df04a..1e4d05880 100644
--- a/keras_core/ops/image_test.py
+++ b/keras_core/ops/image_test.py
@@ -31,6 +31,8 @@ class ImageOpsDynamicShapeTest(testing.TestCase):
         p_h, p_w = 5, 5
         out = kimage.extract_patches(x, (p_h, p_w))
         self.assertEqual(out.shape, (None, 4, 4, 75))
+        out = kimage.extract_patches(x, 5)
+        self.assertEqual(out.shape, (None, 4, 4, 75))
 
 
 class ImageOpsStaticShapeTest(testing.TestCase):
@@ -50,6 +52,8 @@ class ImageOpsStaticShapeTest(testing.TestCase):
         p_h, p_w = 5, 5
         out = kimage.extract_patches(x, (p_h, p_w))
         self.assertEqual(out.shape, (4, 4, 75))
+        out = kimage.extract_patches(x, 5)
+        self.assertEqual(out.shape, (4, 4, 75))
 
 
 AFFINE_TRANSFORM_INTERPOLATIONS = {  # map to order
@@ -310,9 +314,7 @@ class ImageOpsCorrectnessTest(testing.TestCase, parameterized.TestCase):
             and backend.backend() == "tensorflow"
             and dilation_rate > 1
         ):
-            pytest.skip(
-                "dilation_rate>1 with strides>1 than not supported with TF"
-            )
+            pytest.skip("dilation_rate>1 with strides>1 not supported with TF")
         if data_format == "channels_first":
             image = np.random.uniform(size=(1, 3, 20, 20))
         else: