From f6cb39615821fbbe18a864b49db7cc2b3a274386 Mon Sep 17 00:00:00 2001
From: Francois Chollet
Date: Sat, 10 Jun 2023 11:58:52 -0700
Subject: [PATCH] Merge branch 'main' of github.com:keras-team/keras-core

---
 .../keras_io/tensorflow/vision/captcha_ocr.py |  342 +++++++++
 ...ject_detection_using_vision_transformer.py |  503 +++++++++++++
 .../vision/semisupervised_simclr.py           |  673 ++++++++++++++++++
 .../tensorflow/vision/swim_transformers.py    |  547 ++++++++++++++
 .../timeseries_classification_from_scratch.py |  226 ++++++
 keras_core/backend/jax/numpy.py               |    2 +
 6 files changed, 2293 insertions(+)
 create mode 100644 examples/keras_io/tensorflow/vision/captcha_ocr.py
 create mode 100644 examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py
 create mode 100644 examples/keras_io/tensorflow/vision/semisupervised_simclr.py
 create mode 100644 examples/keras_io/tensorflow/vision/swim_transformers.py
 create mode 100755 examples/keras_io/timeseries/timeseries_classification_from_scratch.py

diff --git a/examples/keras_io/tensorflow/vision/captcha_ocr.py b/examples/keras_io/tensorflow/vision/captcha_ocr.py
new file mode 100644
index 000000000..5126d7fff
--- /dev/null
+++ b/examples/keras_io/tensorflow/vision/captcha_ocr.py
@@ -0,0 +1,342 @@
+"""
+Title: OCR model for reading Captchas
+Author: [A_K_Nain](https://twitter.com/A_K_Nain)
+Date created: 2020/06/14
+Last modified: 2020/06/26
+Description: How to implement an OCR model using CNNs, RNNs and CTC loss.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+This example demonstrates a simple OCR model built with the Functional API. Apart from
+combining CNN and RNN, it also illustrates how you can instantiate a new layer
+and use it as an "Endpoint layer" for implementing CTC loss. For a detailed
+guide to layer subclassing, please check out
+[this page](https://keras.io/guides/making_new_layers_and_models_via_subclassing/)
+in the developer guides.
+"""
+
+"""
+## Setup
+"""
+
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+
+from pathlib import Path
+from collections import Counter
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+
+
+"""
+## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)
+Let's download the data.
+"""
+
+
+"""shell
+curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
+unzip -qq captcha_images_v2.zip
+"""
+
+
+"""
+The dataset contains 1040 captcha files as `png` images. The label for each sample is a string,
+the name of the file (minus the file extension).
+We will map each character in the string to an integer for training the model. Similarly,
+we will need to map the predictions of the model back to strings. For this purpose,
+we will maintain two dictionaries, mapping characters to integers, and integers to characters,
+respectively.
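+
+As an illustration, the round trip with `keras.layers.StringLookup` (the utility
+used below) might look like this; the vocabulary here is a made-up example:
+
+```python
+vocab = ["2", "3", "b", "n"]  # example character set
+to_num = layers.StringLookup(vocabulary=vocab, mask_token=None)
+to_char = layers.StringLookup(
+    vocabulary=to_num.get_vocabulary(), mask_token=None, invert=True
+)
+ids = to_num(tf.strings.unicode_split("b2n3", input_encoding="UTF-8"))
+text = tf.strings.reduce_join(to_char(ids)).numpy().decode("utf-8")  # "b2n3"
+```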
+""" + + +# Path to the data directory +data_dir = Path("./captcha_images_v2/") + +# Get list of all the images +images = sorted(list(map(str, list(data_dir.glob("*.png"))))) +labels = [img.split(os.path.sep)[-1].split(".png")[0] for img in images] +characters = set(char for label in labels for char in label) +characters = sorted(list(characters)) + +print("Number of images found: ", len(images)) +print("Number of labels found: ", len(labels)) +print("Number of unique characters: ", len(characters)) +print("Characters present: ", characters) + +# Batch size for training and validation +batch_size = 16 + +# Desired image dimensions +img_width = 200 +img_height = 50 + +# Factor by which the image is going to be downsampled +# by the convolutional blocks. We will be using two +# convolution blocks and each block will have +# a pooling layer which downsample the features by a factor of 2. +# Hence total downsampling factor would be 4. +downsample_factor = 4 + +# Maximum length of any captcha in the dataset +max_length = max([len(label) for label in labels]) + + +""" +## Preprocessing +""" + + +# Mapping characters to integers +char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None) + +# Mapping integers back to original characters +num_to_char = layers.StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True +) + + +def split_data(images, labels, train_size=0.9, shuffle=True): + # 1. Get the total size of the dataset + size = len(images) + # 2. Make an indices array and shuffle it, if required + indices = np.arange(size) + if shuffle: + np.random.shuffle(indices) + # 3. Get the size of training samples + train_samples = int(size * train_size) + # 4. Split data into training and validation sets + x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]] + x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]] + return x_train, x_valid, y_train, y_valid + + +# Splitting data into training and validation sets +x_train, x_valid, y_train, y_valid = split_data(np.array(images), np.array(labels)) + + +def encode_single_sample(img_path, label): + # 1. Read image + img = tf.io.read_file(img_path) + # 2. Decode and convert to grayscale + img = tf.io.decode_png(img, channels=1) + # 3. Convert to float32 in [0, 1] range + img = tf.image.convert_image_dtype(img, tf.float32) + # 4. Resize to the desired size + img = tf.image.resize(img, [img_height, img_width]) + # 5. Transpose the image because we want the time + # dimension to correspond to the width of the image. + img = tf.transpose(img, perm=[1, 0, 2]) + # 6. Map the characters in label to numbers + label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8")) + # 7. 
Return a dict as our model expects two inputs
+    return {"image": img, "label": label}
+
+
+"""
+## Create `Dataset` objects
+"""
+
+
+train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+train_dataset = (
+    train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
+    .batch(batch_size)
+    .prefetch(buffer_size=tf.data.AUTOTUNE)
+)
+
+validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
+validation_dataset = (
+    validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
+    .batch(batch_size)
+    .prefetch(buffer_size=tf.data.AUTOTUNE)
+)
+
+"""
+## Visualize the data
+"""
+
+
+_, ax = plt.subplots(4, 4, figsize=(10, 5))
+for batch in train_dataset.take(1):
+    images = batch["image"]
+    labels = batch["label"]
+    for i in range(16):
+        img = (images[i] * 255).numpy().astype("uint8")
+        label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode("utf-8")
+        ax[i // 4, i % 4].imshow(img[:, :, 0].T, cmap="gray")
+        ax[i // 4, i % 4].set_title(label)
+        ax[i // 4, i % 4].axis("off")
+plt.show()
+
+"""
+## Model
+"""
+
+
+class CTCLayer(layers.Layer):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.loss_fn = keras.backend.ctc_batch_cost
+
+    def call(self, y_true, y_pred):
+        # Compute the training-time loss value and add it
+        # to the layer using `self.add_loss()`.
+        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
+        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
+        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
+
+        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+
+        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
+        self.add_loss(loss)
+
+        # At test time, just return the computed predictions
+        return y_pred
+
+
+def build_model():
+    # Inputs to the model
+    input_img = layers.Input(
+        shape=(img_width, img_height, 1), name="image", dtype="float32"
+    )
+    labels = layers.Input(name="label", shape=(None,), dtype="float32")
+
+    # First conv block
+    x = layers.Conv2D(
+        32,
+        (3, 3),
+        activation="relu",
+        kernel_initializer="he_normal",
+        padding="same",
+        name="Conv1",
+    )(input_img)
+    x = layers.MaxPooling2D((2, 2), name="pool1")(x)
+
+    # Second conv block
+    x = layers.Conv2D(
+        64,
+        (3, 3),
+        activation="relu",
+        kernel_initializer="he_normal",
+        padding="same",
+        name="Conv2",
+    )(x)
+    x = layers.MaxPooling2D((2, 2), name="pool2")(x)
+
+    # We have used two max-pooling layers with pool size and strides of 2.
+    # Hence, downsampled feature maps are 4x smaller. The number of
+    # filters in the last layer is 64. 
Reshape accordingly before + # passing the output to the RNN part of the model + new_shape = ((img_width // 4), (img_height // 4) * 64) + x = layers.Reshape(target_shape=new_shape, name="reshape")(x) + x = layers.Dense(64, activation="relu", name="dense1")(x) + x = layers.Dropout(0.2)(x) + + # RNNs + x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x) + x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x) + + # Output layer + x = layers.Dense( + len(char_to_num.get_vocabulary()) + 1, activation="softmax", name="dense2" + )(x) + + # Add CTC layer for calculating CTC loss at each step + output = CTCLayer(name="ctc_loss")(labels, x) + + # Define the model + model = keras.models.Model( + inputs=[input_img, labels], outputs=output, name="ocr_model_v1" + ) + # Optimizer + opt = keras.optimizers.Adam() + # Compile the model and return + model.compile(optimizer=opt) + return model + + +# Get the model +model = build_model() +model.summary() + +""" +## Training +""" + + +epochs = 1 +early_stopping_patience = 10 +# Add early stopping +early_stopping = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True +) + +# Train the model +history = model.fit( + train_dataset, + validation_data=validation_dataset, + epochs=epochs, + callbacks=[early_stopping], +) + + +""" +## Inference + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ocr-for-captcha) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ocr-for-captcha). +""" + + +# Get the prediction model by extracting layers till the output layer +prediction_model = keras.models.Model( + model.get_layer(name="image").input, model.get_layer(name="dense2").output +) +prediction_model.summary() + + +# A utility function to decode the output of the network +def decode_batch_predictions(pred): + input_len = np.ones(pred.shape[0]) * pred.shape[1] + # Use greedy search. 
For complex tasks, you can use beam search.
+    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
+        :, :max_length
+    ]
+    # Iterate over the results and get back the text
+    output_text = []
+    for res in results:
+        res = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8")
+        output_text.append(res)
+    return output_text
+
+
+# Let's check results on some validation samples
+for batch in validation_dataset.take(1):
+    batch_images = batch["image"]
+    batch_labels = batch["label"]
+
+    preds = prediction_model.predict(batch_images)
+    pred_texts = decode_batch_predictions(preds)
+
+    orig_texts = []
+    for label in batch_labels:
+        label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
+        orig_texts.append(label)
+
+    _, ax = plt.subplots(4, 4, figsize=(15, 5))
+    for i in range(len(pred_texts)):
+        img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8)
+        img = img.T
+        title = f"Prediction: {pred_texts[i]}"
+        ax[i // 4, i % 4].imshow(img, cmap="gray")
+        ax[i // 4, i % 4].set_title(title)
+        ax[i // 4, i % 4].axis("off")
+plt.show()
\ No newline at end of file
diff --git a/examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py b/examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py
new file mode 100644
index 000000000..4e4636b96
--- /dev/null
+++ b/examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py
@@ -0,0 +1,503 @@
+"""
+Title: Object detection with Vision Transformers
+Author: [Karan V. Dave](https://www.linkedin.com/in/karan-dave-811413164/)
+Date created: 2022/03/27
+Last modified: 2022/03/27
+Description: A simple Keras implementation of object detection using Vision Transformers.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+The [Vision Transformer (ViT)](https://arxiv.org/abs/2010.11929)
+architecture by Alexey Dosovitskiy et al.
+demonstrates that a pure transformer applied directly to sequences of image
+patches can perform well on image recognition tasks.
+
+In this Keras example, we extend the idea to object detection: we implement a ViT
+and train it on the
+[Caltech 101 dataset](http://www.vision.caltech.edu/datasets/)
+to detect an airplane in the given image.
+
+This example requires TensorFlow 2.4 or higher.
+"""
+
+"""
+## Imports and setup
+"""
+
+import numpy as np
+import tensorflow as tf
+import keras_core as keras
+from keras_core import layers
+import matplotlib.pyplot as plt
+import cv2
+import os
+import scipy.io
+import shutil
+
+"""
+## Prepare dataset
+
+We use the [Caltech 101 Dataset](https://data.caltech.edu/records/mzrjq-6wc02).
+""" + +# Path to images and annotations +path_images = "./101_ObjectCategories/airplanes/" +path_annot = "./Annotations/Airplanes_Side_2/" + +path_to_downloaded_file = keras.utils.get_file( + fname="caltech_101_zipped", + origin="https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip", + extract=True, + archive_format="zip", # downloaded file format + cache_dir="./", # cache and extract in current directory +) + +# Extracting tar files found inside main zip file +shutil.unpack_archive("./datasets/caltech-101/101_ObjectCategories.tar.gz", "./") +shutil.unpack_archive("./datasets/caltech-101/Annotations.tar", "./") + +# list of paths to images and annotations +image_paths = [ + f for f in os.listdir(path_images) if os.path.isfile(os.path.join(path_images, f)) +] +annot_paths = [ + f for f in os.listdir(path_annot) if os.path.isfile(os.path.join(path_annot, f)) +] + +image_paths.sort() +annot_paths.sort() + +image_size = 224 # resize input images to this size + +images, targets = [], [] + +# loop over the annotations and images, preprocess them and store in lists +for i in range(0, len(annot_paths)): + # Access bounding box coordinates + annot = scipy.io.loadmat(path_annot + annot_paths[i])["box_coord"][0] + + top_left_x, top_left_y = annot[2], annot[0] + bottom_right_x, bottom_right_y = annot[3], annot[1] + + image = keras.utils.load_img( + path_images + image_paths[i], + ) + (w, h) = image.size[:2] + + # resize train set images + if i < int(len(annot_paths) * 0.8): + # resize image if it is for training dataset + image = image.resize((image_size, image_size)) + + # convert image to array and append to list + images.append(keras.utils.img_to_array(image)) + + # apply relative scaling to bounding boxes as per given image and append to list + targets.append( + ( + float(top_left_x) / w, + float(top_left_y) / h, + float(bottom_right_x) / w, + float(bottom_right_y) / h, + ) + ) + +# Convert the list to numpy array, split to train and test dataset +(x_train), (y_train) = ( + np.asarray(images[: int(len(images) * 0.8)]), + np.asarray(targets[: int(len(targets) * 0.8)]), +) +(x_test), (y_test) = ( + np.asarray(images[int(len(images) * 0.8) :]), + np.asarray(targets[int(len(targets) * 0.8) :]), +) + +""" +## Implement multilayer-perceptron (MLP) + +We use the code from the Keras example +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/) +as a reference. 
+""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=tf.nn.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Implement the patch creation layer +""" + + +class Patches(layers.Layer): + def __init__(self, patch_size): + super().__init__() + self.patch_size = patch_size + + # Override function to avoid error while saving model + def get_config(self): + config = super().get_config().copy() + config.update( + { + "input_shape": input_shape, + "patch_size": patch_size, + "num_patches": num_patches, + "projection_dim": projection_dim, + "num_heads": num_heads, + "transformer_units": transformer_units, + "transformer_layers": transformer_layers, + "mlp_head_units": mlp_head_units, + } + ) + return config + + def call(self, images): + batch_size = tf.shape(images)[0] + patches = tf.image.extract_patches( + images=images, + sizes=[1, self.patch_size, self.patch_size, 1], + strides=[1, self.patch_size, self.patch_size, 1], + rates=[1, 1, 1, 1], + padding="VALID", + ) + # return patches + return tf.reshape(patches, [batch_size, -1, patches.shape[-1]]) + + +""" +## Display patches for an input image +""" + +patch_size = 32 # Size of the patches to be extracted from the input images + +plt.figure(figsize=(4, 4)) +plt.imshow(x_train[0].astype("uint8")) +plt.axis("off") + +patches = Patches(patch_size)(tf.convert_to_tensor([x_train[0]])) +print(f"Image size: {image_size} X {image_size}") +print(f"Patch size: {patch_size} X {patch_size}") +print(f"{patches.shape[1]} patches per image \n{patches.shape[-1]} elements per patch") + + +n = int(np.sqrt(patches.shape[1])) +plt.figure(figsize=(4, 4)) +for i, patch in enumerate(patches[0]): + ax = plt.subplot(n, n, i + 1) + patch_img = tf.reshape(patch, (patch_size, patch_size, 3)) + plt.imshow(patch_img.numpy().astype("uint8")) + plt.axis("off") + +""" +## Implement the patch encoding layer + +The `PatchEncoder` layer linearly transforms a patch by projecting it into a +vector of size `projection_dim`. It also adds a learnable position +embedding to the projected vector. +""" + + +class PatchEncoder(layers.Layer): + def __init__(self, num_patches, projection_dim): + super().__init__() + self.num_patches = num_patches + self.projection = layers.Dense(units=projection_dim) + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + # Override function to avoid error while saving model + def get_config(self): + config = super().get_config().copy() + config.update( + { + "input_shape": input_shape, + "patch_size": patch_size, + "num_patches": num_patches, + "projection_dim": projection_dim, + "num_heads": num_heads, + "transformer_units": transformer_units, + "transformer_layers": transformer_layers, + "mlp_head_units": mlp_head_units, + } + ) + return config + + def call(self, patch): + positions = tf.range(start=0, limit=self.num_patches, delta=1) + encoded = self.projection(patch) + self.position_embedding(positions) + return encoded + + +""" +## Build the ViT model + +The ViT model has multiple Transformer blocks. +The `MultiHeadAttention` layer is used for self-attention, +applied to the sequence of image patches. The encoded patches (skip connection) +and self-attention layer outputs are normalized and fed into a +multilayer perceptron (MLP). +The model outputs four dimensions representing +the bounding box coordinates of an object. 
+""" + + +def create_vit_object_detector( + input_shape, + patch_size, + num_patches, + projection_dim, + num_heads, + transformer_units, + transformer_layers, + mlp_head_units, +): + inputs = layers.Input(shape=input_shape) + # Create patches + patches = Patches(patch_size)(inputs) + # Encode patches + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + # Create multiple layers of the Transformer block. + for _ in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = layers.Add()([x3, x2]) + + # Create a [batch_size, projection_dim] tensor. + representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + representation = layers.Flatten()(representation) + representation = layers.Dropout(0.3)(representation) + # Add MLP. + features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.3) + + bounding_box = layers.Dense(4)( + features + ) # Final four neurons that output bounding box + + # return Keras model. + return keras.Model(inputs=inputs, outputs=bounding_box) + + +""" +## Run the experiment +""" + + +def run_experiment(model, learning_rate, weight_decay, batch_size, num_epochs): + optimizer = keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ) + + # Compile model. + model.compile(optimizer=optimizer, loss=keras.losses.MeanSquaredError()) + + checkpoint_filepath = "logs/model.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_loss", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[ + checkpoint_callback, + keras.callbacks.EarlyStopping(monitor="val_loss", patience=10), + ], + ) + + return history + + +input_shape = (image_size, image_size, 3) # input image shape +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 32 +num_epochs = 1 +num_patches = (image_size // patch_size) ** 2 +projection_dim = 64 +num_heads = 4 +# Size of the transformer layers +transformer_units = [ + projection_dim * 2, + projection_dim, +] +transformer_layers = 4 +mlp_head_units = [2048, 1024, 512, 64, 32] # Size of the dense layers + + +history = [] +num_patches = (image_size // patch_size) ** 2 + +vit_object_detector = create_vit_object_detector( + input_shape, + patch_size, + num_patches, + projection_dim, + num_heads, + transformer_units, + transformer_layers, + mlp_head_units, +) + +# Train model +history = run_experiment( + vit_object_detector, learning_rate, weight_decay, batch_size, num_epochs +) + + +""" +## Evaluate the model +""" + +import matplotlib.patches as patches + +# Saves the model in current path +vit_object_detector.save("vit_object_detector.keras", save_format="keras") + + +# To calculate IoU (intersection over union, given two bounding boxes) +def bounding_box_intersection_over_union(box_predicted, box_truth): + # get (x, y) coordinates of intersection of bounding boxes + top_x_intersect = max(box_predicted[0], box_truth[0]) + 
top_y_intersect = max(box_predicted[1], box_truth[1])
+    bottom_x_intersect = min(box_predicted[2], box_truth[2])
+    bottom_y_intersect = min(box_predicted[3], box_truth[3])
+
+    # calculate area of the intersection bb (bounding box)
+    intersection_area = max(0, bottom_x_intersect - top_x_intersect + 1) * max(
+        0, bottom_y_intersect - top_y_intersect + 1
+    )
+
+    # calculate area of the prediction bb and ground-truth bb
+    box_predicted_area = (box_predicted[2] - box_predicted[0] + 1) * (
+        box_predicted[3] - box_predicted[1] + 1
+    )
+    box_truth_area = (box_truth[2] - box_truth[0] + 1) * (
+        box_truth[3] - box_truth[1] + 1
+    )
+
+    # calculate intersection over union by taking the intersection
+    # area and dividing it by the sum of predicted bb and ground truth
+    # bb areas, minus the intersection area
+
+    # return IoU
+    return intersection_area / float(
+        box_predicted_area + box_truth_area - intersection_area
+    )
+
+
+i, mean_iou = 0, 0
+
+# Compare results for 10 images in the test set
+for input_image in x_test[:10]:
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 15))
+    im = input_image
+
+    # Display the image
+    ax1.imshow(im.astype("uint8"))
+    ax2.imshow(im.astype("uint8"))
+
+    input_image = cv2.resize(
+        input_image, (image_size, image_size), interpolation=cv2.INTER_AREA
+    )
+    input_image = np.expand_dims(input_image, axis=0)
+    preds = vit_object_detector.predict(input_image)[0]
+
+    (h, w) = im.shape[0:2]
+
+    top_left_x, top_left_y = int(preds[0] * w), int(preds[1] * h)
+
+    bottom_right_x, bottom_right_y = int(preds[2] * w), int(preds[3] * h)
+
+    box_predicted = [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
+    # Create the bounding box
+    rect = patches.Rectangle(
+        (top_left_x, top_left_y),
+        bottom_right_x - top_left_x,
+        bottom_right_y - top_left_y,
+        facecolor="none",
+        edgecolor="red",
+        linewidth=1,
+    )
+    # Add the bounding box to the image
+    ax1.add_patch(rect)
+    ax1.set_xlabel(
+        "Predicted: "
+        + str(top_left_x)
+        + ", "
+        + str(top_left_y)
+        + ", "
+        + str(bottom_right_x)
+        + ", "
+        + str(bottom_right_y)
+    )
+
+    top_left_x, top_left_y = int(y_test[i][0] * w), int(y_test[i][1] * h)
+
+    bottom_right_x, bottom_right_y = int(y_test[i][2] * w), int(y_test[i][3] * h)
+
+    box_truth = top_left_x, top_left_y, bottom_right_x, bottom_right_y
+
+    mean_iou += bounding_box_intersection_over_union(box_predicted, box_truth)
+    # Create the bounding box
+    rect = patches.Rectangle(
+        (top_left_x, top_left_y),
+        bottom_right_x - top_left_x,
+        bottom_right_y - top_left_y,
+        facecolor="none",
+        edgecolor="red",
+        linewidth=1,
+    )
+    # Add the bounding box to the image
+    ax2.add_patch(rect)
+    ax2.set_xlabel(
+        "Target: "
+        + str(top_left_x)
+        + ", "
+        + str(top_left_y)
+        + ", "
+        + str(bottom_right_x)
+        + ", "
+        + str(bottom_right_y)
+        + "\n"
+        + "IoU: "
+        + str(bounding_box_intersection_over_union(box_predicted, box_truth))
+    )
+    i = i + 1
+
+print("mean_iou: " + str(mean_iou / len(x_test[:10])))
+plt.show()
+
+"""
+This example demonstrates that a pure Transformer can be trained
+to predict the bounding boxes of an object in a given image,
+thus extending the use of Transformers to object detection tasks.
+The model can be improved further by tuning hyperparameters and pre-training.
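+
+As a side note, since the model was saved with custom layers, reloading it for
+inference requires passing those classes along (a minimal sketch, assuming the
+`Patches` and `PatchEncoder` definitions above are in scope):
+
+```python
+loaded_model = keras.models.load_model(
+    "vit_object_detector.keras",
+    custom_objects={"Patches": Patches, "PatchEncoder": PatchEncoder},
+)
+preds = loaded_model.predict(input_image)  # four normalized corner coordinates
+```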
+""" \ No newline at end of file diff --git a/examples/keras_io/tensorflow/vision/semisupervised_simclr.py b/examples/keras_io/tensorflow/vision/semisupervised_simclr.py new file mode 100644 index 000000000..c4c61da86 --- /dev/null +++ b/examples/keras_io/tensorflow/vision/semisupervised_simclr.py @@ -0,0 +1,673 @@ +""" +Title: Semi-supervised image classification using contrastive pretraining with SimCLR +Author: [András Béres](https://www.linkedin.com/in/andras-beres-789190210) +Date created: 2021/04/24 +Last modified: 2021/04/24 +Description: Contrastive pretraining with SimCLR for semi-supervised image classification on the STL-10 dataset. +Accelerator: GPU +""" +""" +## Introduction + +### Semi-supervised learning + +Semi-supervised learning is a machine learning paradigm that deals with +**partially labeled datasets**. When applying deep learning in the real world, +one usually has to gather a large dataset to make it work well. However, while +the cost of labeling scales linearly with the dataset size (labeling each +example takes a constant time), model performance only scales +[sublinearly](https://arxiv.org/abs/2001.08361) with it. This means that +labeling more and more samples becomes less and less cost-efficient, while +gathering unlabeled data is generally cheap, as it is usually readily available +in large quantities. + +Semi-supervised learning offers to solve this problem by only requiring a +partially labeled dataset, and by being label-efficient by utilizing the +unlabeled examples for learning as well. + +In this example, we will pretrain an encoder with contrastive learning on the +[STL-10](https://ai.stanford.edu/~acoates/stl10/) semi-supervised dataset using +no labels at all, and then fine-tune it using only its labeled subset. + +### Contrastive learning + +On the highest level, the main idea behind contrastive learning is to **learn +representations that are invariant to image augmentations** in a self-supervised +manner. One problem with this objective is that it has a trivial degenerate +solution: the case where the representations are constant, and do not depend at all on the +input images. + +Contrastive learning avoids this trap by modifying the objective in the +following way: it pulls representations of augmented versions/views of the same +image closer to each other (contracting positives), while simultaneously pushing +different images away from each other (contrasting negatives) in representation +space. + +One such contrastive approach is [SimCLR](https://arxiv.org/abs/2002.05709), +which essentially identifies the core components needed to optimize this +objective, and can achieve high performance by scaling this simple approach. + +Another approach is [SimSiam](https://arxiv.org/abs/2011.10566) +([Keras example](https://keras.io/examples/vision/simsiam/)), +whose main difference from +SimCLR is that the former does not use any negatives in its loss. Therefore, it does not +explicitly prevent the trivial solution, and, instead, avoids it implicitly by +architecture design (asymmetric encoding paths using a predictor network and +batch normalization (BatchNorm) are applied in the final layers). + +For further reading about SimCLR, check out +[the official Google AI blog post](https://ai.googleblog.com/2020/04/advancing-self-supervised-and-semi.html), +and for an overview of self-supervised learning across both vision and language +check out +[this blog post](https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/). 
+""" + +""" +## Setup +""" + +# Make sure we are able to handle large datasets +import resource +low, high = resource.getrlimit(resource.RLIMIT_NOFILE) +resource.setrlimit(resource.RLIMIT_NOFILE, (high, high)) + +import math +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow_datasets as tfds + +import keras_core as keras +from keras_core import layers + +""" +## Hyperparameter setup +""" +# Dataset hyperparameters +unlabeled_dataset_size = 100000 +labeled_dataset_size = 5000 +image_size = 96 +image_channels = 3 + +# Algorithm hyperparameters +num_epochs = 1 +batch_size = 525 # Corresponds to 200 steps per epoch +width = 128 +temperature = 0.1 +# Stronger augmentations for contrastive, weaker ones for supervised training +contrastive_augmentation = {"min_area": 0.25, "brightness": 0.6, "jitter": 0.2} +classification_augmentation = {"min_area": 0.75, "brightness": 0.3, "jitter": 0.1} + +""" +## Dataset + +During training we will simultaneously load a large batch of unlabeled images along with a +smaller batch of labeled images. +""" + + +def prepare_dataset(): + # Labeled and unlabeled samples are loaded synchronously + # with batch sizes selected accordingly + steps_per_epoch = (unlabeled_dataset_size + labeled_dataset_size) // batch_size + unlabeled_batch_size = unlabeled_dataset_size // steps_per_epoch + labeled_batch_size = labeled_dataset_size // steps_per_epoch + print( + f"batch size is {unlabeled_batch_size} (unlabeled) + {labeled_batch_size} (labeled)" + ) + + # Turning off shuffle to lower resource usage + unlabeled_train_dataset = ( + tfds.load("stl10", split="unlabelled", as_supervised=True, shuffle_files=False) + .shuffle(buffer_size=10 * unlabeled_batch_size) + .batch(unlabeled_batch_size) + ) + labeled_train_dataset = ( + tfds.load("stl10", split="train", as_supervised=True, shuffle_files=False) + .shuffle(buffer_size=10 * labeled_batch_size) + .batch(labeled_batch_size) + ) + test_dataset = ( + tfds.load("stl10", split="test", as_supervised=True) + .batch(batch_size) + .prefetch(buffer_size=tf.data.AUTOTUNE) + ) + + # Labeled and unlabeled datasets are zipped together + train_dataset = tf.data.Dataset.zip( + (unlabeled_train_dataset, labeled_train_dataset) + ).prefetch(buffer_size=tf.data.AUTOTUNE) + + return train_dataset, labeled_train_dataset, test_dataset + + +# Load STL10 dataset +train_dataset, labeled_train_dataset, test_dataset = prepare_dataset() + +""" +## Image augmentations + +The two most important image augmentations for contrastive learning are the +following: + +- Cropping: forces the model to encode different parts of the same image +similarly, we implement it with the +[RandomTranslation](https://keras.io/api/layers/preprocessing_layers/image_preprocessing/random_translation/) +and +[RandomZoom](https://keras.io/api/layers/preprocessing_layers/image_preprocessing/random_zoom/) +layers +- Color jitter: prevents a trivial color histogram-based solution to the task by +distorting color histograms. A principled way to implement that is by affine +transformations in color space. + +In this example we use random horizontal flips as well. Stronger augmentations +are applied for contrastive learning, along with weaker ones for supervised +classification to avoid overfitting on the few labeled examples. + +We implement random color jitter as a custom preprocessing layer. 
Using
+preprocessing layers for data augmentation has the following two advantages:
+
+- The data augmentation will run on GPU in batches, so the training will not be
+bottlenecked by the data pipeline in environments with constrained CPU
+resources (such as a Colab Notebook, or a personal machine)
+- Deployment is easier as the data preprocessing pipeline is encapsulated in the
+model, and does not have to be reimplemented when deploying it
+"""
+
+
+# Distorts the color distributions of images
+class RandomColorAffine(layers.Layer):
+    def __init__(self, brightness=0, jitter=0, **kwargs):
+        super().__init__(**kwargs)
+
+        self.brightness = brightness
+        self.jitter = jitter
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({"brightness": self.brightness, "jitter": self.jitter})
+        return config
+
+    def call(self, images, training=True):
+        if training:
+            batch_size = tf.shape(images)[0]
+
+            # Same for all colors
+            brightness_scales = 1 + tf.random.uniform(
+                (batch_size, 1, 1, 1), minval=-self.brightness, maxval=self.brightness
+            )
+            # Different for all colors
+            jitter_matrices = tf.random.uniform(
+                (batch_size, 1, 3, 3), minval=-self.jitter, maxval=self.jitter
+            )
+
+            color_transforms = (
+                tf.eye(3, batch_shape=[batch_size, 1]) * brightness_scales
+                + jitter_matrices
+            )
+            images = tf.clip_by_value(tf.matmul(images, color_transforms), 0, 1)
+        return images
+
+
+# Image augmentation module
+def get_augmenter(min_area, brightness, jitter):
+    zoom_factor = 1.0 - math.sqrt(min_area)
+    return keras.Sequential(
+        [
+            keras.Input(shape=(image_size, image_size, image_channels)),
+            layers.Rescaling(1 / 255, dtype="uint8"),
+            layers.RandomFlip("horizontal"),
+            layers.RandomTranslation(zoom_factor / 2, zoom_factor / 2),
+            layers.RandomZoom((-zoom_factor, 0.0), (-zoom_factor, 0.0)),
+            RandomColorAffine(brightness, jitter),
+        ]
+    )
+
+
+def visualize_augmentations(num_images):
+    # Sample a batch from a dataset
+    images = next(iter(train_dataset))[0][0][:num_images]
+
+    # Apply augmentations
+    augmented_images = zip(
+        images,
+        get_augmenter(**classification_augmentation)(images),
+        get_augmenter(**contrastive_augmentation)(images),
+        get_augmenter(**contrastive_augmentation)(images),
+    )
+    row_titles = [
+        "Original:",
+        "Weakly augmented:",
+        "Strongly augmented:",
+        "Strongly augmented:",
+    ]
+    plt.figure(figsize=(num_images * 2.2, 4 * 2.2), dpi=100)
+    for column, image_row in enumerate(augmented_images):
+        for row, image in enumerate(image_row):
+            plt.subplot(4, num_images, row * num_images + column + 1)
+            plt.imshow(image)
+            if column == 0:
+                plt.title(row_titles[row], loc="left")
+            plt.axis("off")
+    plt.tight_layout()
+
+
+visualize_augmentations(num_images=8)
+
+"""
+## Encoder architecture
+"""
+
+
+# Define the encoder architecture
+def get_encoder():
+    return keras.Sequential(
+        [
+            keras.Input(shape=(image_size, image_size, image_channels)),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Flatten(),
+            layers.Dense(width, activation="relu"),
+        ],
+        name="encoder",
+    )
+
+
+"""
+## Supervised baseline model
+
+A baseline supervised model is trained using random initialization.
+""" + +# Baseline supervised training with random initialization +baseline_model = keras.Sequential( + [ + keras.Input(shape=(image_size, image_size, image_channels)), + get_augmenter(**classification_augmentation), + get_encoder(), + layers.Dense(10), + ], + name="baseline_model", +) +baseline_model.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")], +) + +baseline_history = baseline_model.fit( + labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset +) + +print( + "Maximal validation accuracy: {:.2f}%".format( + max(baseline_history.history["val_acc"]) * 100 + ) +) + +""" +## Self-supervised model for contrastive pretraining + +We pretrain an encoder on unlabeled images with a contrastive loss. +A nonlinear projection head is attached to the top of the encoder, as it +improves the quality of representations of the encoder. + +We use the InfoNCE/NT-Xent/N-pairs loss, which can be interpreted in the +following way: + +1. We treat each image in the batch as if it had its own class. +2. Then, we have two examples (a pair of augmented views) for each "class". +3. Each view's representation is compared to every possible pair's one (for both + augmented versions). +4. We use the temperature-scaled cosine similarity of compared representations as + logits. +5. Finally, we use categorical cross-entropy as the "classification" loss + +The following two metrics are used for monitoring the pretraining performance: + +- [Contrastive accuracy (SimCLR Table 5)](https://arxiv.org/abs/2002.05709): +Self-supervised metric, the ratio of cases in which the representation of an +image is more similar to its differently augmented version's one, than to the +representation of any other image in the current batch. Self-supervised +metrics can be used for hyperparameter tuning even in the case when there are +no labeled examples. +- [Linear probing accuracy](https://arxiv.org/abs/1603.08511): Linear probing is +a popular metric to evaluate self-supervised classifiers. It is computed as +the accuracy of a logistic regression classifier trained on top of the +encoder's features. In our case, this is done by training a single dense layer +on top of the frozen encoder. Note that contrary to traditional approach where +the classifier is trained after the pretraining phase, in this example we +train it during pretraining. This might slightly decrease its accuracy, but +that way we can monitor its value during training, which helps with +experimentation and debugging. + +Another widely used supervised metric is the +[KNN accuracy](https://arxiv.org/abs/1805.01978), which is the accuracy of a KNN +classifier trained on top of the encoder's features, which is not implemented in +this example. 
+""" + + +# Define the contrastive model with model-subclassing +class ContrastiveModel(keras.Model): + def __init__(self): + super().__init__() + + self.temperature = temperature + self.contrastive_augmenter = get_augmenter(**contrastive_augmentation) + self.classification_augmenter = get_augmenter(**classification_augmentation) + self.encoder = get_encoder() + # Non-linear MLP as projection head + self.projection_head = keras.Sequential( + [ + keras.Input(shape=(width,)), + layers.Dense(width, activation="relu"), + layers.Dense(width), + ], + name="projection_head", + ) + # Single dense layer for linear probing + self.linear_probe = keras.Sequential( + [layers.Input(shape=(width,)), layers.Dense(10)], name="linear_probe" + ) + + self.encoder.summary() + self.projection_head.summary() + self.linear_probe.summary() + + def compile(self, contrastive_optimizer, probe_optimizer, **kwargs): + super().compile(**kwargs) + + self.contrastive_optimizer = contrastive_optimizer + self.probe_optimizer = probe_optimizer + + # self.contrastive_loss will be defined as a method + self.probe_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + + self.contrastive_loss_tracker = keras.metrics.Mean(name="c_loss") + self.contrastive_accuracy = keras.metrics.SparseCategoricalAccuracy( + name="c_acc" + ) + self.probe_loss_tracker = keras.metrics.Mean(name="p_loss") + self.probe_accuracy = keras.metrics.SparseCategoricalAccuracy(name="p_acc") + + @property + def metrics(self): + return [ + self.contrastive_loss_tracker, + self.contrastive_accuracy, + self.probe_loss_tracker, + self.probe_accuracy, + ] + + def contrastive_loss(self, projections_1, projections_2): + # InfoNCE loss (information noise-contrastive estimation) + # NT-Xent loss (normalized temperature-scaled cross entropy) + + # Cosine similarity: the dot product of the l2-normalized feature vectors + projections_1 = tf.math.l2_normalize(projections_1, axis=1) + projections_2 = tf.math.l2_normalize(projections_2, axis=1) + similarities = ( + tf.matmul(projections_1, projections_2, transpose_b=True) / self.temperature + ) + + # The similarity between the representations of two augmented views of the + # same image should be higher than their similarity with other views + batch_size = tf.shape(projections_1)[0] + contrastive_labels = tf.range(batch_size) + self.contrastive_accuracy.update_state(contrastive_labels, similarities) + self.contrastive_accuracy.update_state( + contrastive_labels, tf.transpose(similarities) + ) + + # The temperature-scaled similarities are used as logits for cross-entropy + # a symmetrized version of the loss is used here + loss_1_2 = keras.losses.sparse_categorical_crossentropy( + contrastive_labels, similarities, from_logits=True + ) + loss_2_1 = keras.losses.sparse_categorical_crossentropy( + contrastive_labels, tf.transpose(similarities), from_logits=True + ) + return (loss_1_2 + loss_2_1) / 2 + + def train_step(self, data): + (unlabeled_images, _), (labeled_images, labels) = data + + # Both labeled and unlabeled images are used, without labels + images = tf.concat((unlabeled_images, labeled_images), axis=0) + # Each image is augmented twice, differently + augmented_images_1 = self.contrastive_augmenter(images, training=True) + augmented_images_2 = self.contrastive_augmenter(images, training=True) + with tf.GradientTape() as tape: + features_1 = self.encoder(augmented_images_1, training=True) + features_2 = self.encoder(augmented_images_2, training=True) + # The representations are passed through a 
projection MLP
+            projections_1 = self.projection_head(features_1, training=True)
+            projections_2 = self.projection_head(features_2, training=True)
+            contrastive_loss = self.contrastive_loss(projections_1, projections_2)
+        gradients = tape.gradient(
+            contrastive_loss,
+            self.encoder.trainable_weights + self.projection_head.trainable_weights,
+        )
+        self.contrastive_optimizer.apply_gradients(
+            zip(
+                gradients,
+                self.encoder.trainable_weights
+                + self.projection_head.trainable_weights,
+            )
+        )
+        self.contrastive_loss_tracker.update_state(contrastive_loss)
+
+        # Labels are only used in evaluation for an on-the-fly logistic regression
+        preprocessed_images = self.classification_augmenter(
+            labeled_images, training=True
+        )
+        with tf.GradientTape() as tape:
+            # the encoder is used in inference mode here to avoid regularization
+            # and updating the batch normalization parameters if they are used
+            features = self.encoder(preprocessed_images, training=False)
+            class_logits = self.linear_probe(features, training=True)
+            probe_loss = self.probe_loss(labels, class_logits)
+        gradients = tape.gradient(probe_loss, self.linear_probe.trainable_weights)
+        self.probe_optimizer.apply_gradients(
+            zip(gradients, self.linear_probe.trainable_weights)
+        )
+        self.probe_loss_tracker.update_state(probe_loss)
+        self.probe_accuracy.update_state(labels, class_logits)
+
+        return {m.name: m.result() for m in self.metrics}
+
+    def test_step(self, data):
+        labeled_images, labels = data
+
+        # For testing, the components are used with a training=False flag
+        preprocessed_images = self.classification_augmenter(
+            labeled_images, training=False
+        )
+        features = self.encoder(preprocessed_images, training=False)
+        class_logits = self.linear_probe(features, training=False)
+        probe_loss = self.probe_loss(labels, class_logits)
+        self.probe_loss_tracker.update_state(probe_loss)
+        self.probe_accuracy.update_state(labels, class_logits)
+
+        # Only the probe metrics are logged at test time
+        return {m.name: m.result() for m in self.metrics[2:]}
+
+
+# Contrastive pretraining
+pretraining_model = ContrastiveModel()
+pretraining_model.compile(
+    contrastive_optimizer=keras.optimizers.Adam(),
+    probe_optimizer=keras.optimizers.Adam(),
+)
+
+pretraining_history = pretraining_model.fit(
+    train_dataset, epochs=num_epochs, validation_data=test_dataset
+)
+print(
+    "Maximal validation accuracy: {:.2f}%".format(
+        max(pretraining_history.history["val_p_acc"]) * 100
+    )
+)
+
+"""
+## Supervised finetuning of the pretrained encoder
+
+We then finetune the encoder on the labeled examples by attaching
+a single randomly initialized fully connected classification layer on top of it.
+""" + +# Supervised finetuning of the pretrained encoder +finetuning_model = keras.Sequential( + [ + layers.Input(shape=(image_size, image_size, image_channels)), + get_augmenter(**classification_augmentation), + pretraining_model.encoder, + layers.Dense(10), + ], + name="finetuning_model", +) +finetuning_model.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")], +) + +finetuning_history = finetuning_model.fit( + labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset +) +print( + "Maximal validation accuracy: {:.2f}%".format( + max(finetuning_history.history["val_acc"]) * 100 + ) +) + +""" +## Comparison against the baseline +""" + + +# The classification accuracies of the baseline and the pretraining + finetuning process: +def plot_training_curves(pretraining_history, finetuning_history, baseline_history): + for metric_key, metric_name in zip(["acc", "loss"], ["accuracy", "loss"]): + plt.figure(figsize=(8, 5), dpi=100) + plt.plot( + baseline_history.history[f"val_{metric_key}"], label="supervised baseline" + ) + plt.plot( + pretraining_history.history[f"val_p_{metric_key}"], + label="self-supervised pretraining", + ) + plt.plot( + finetuning_history.history[f"val_{metric_key}"], + label="supervised finetuning", + ) + plt.legend() + plt.title(f"Classification {metric_name} during training") + plt.xlabel("epochs") + plt.ylabel(f"validation {metric_name}") + + +plot_training_curves(pretraining_history, finetuning_history, baseline_history) + +""" +By comparing the training curves, we can see that when using contrastive +pretraining, a higher validation accuracy can be reached, paired with a lower +validation loss, which means that the pretrained network was able to generalize +better when seeing only a small amount of labeled examples. +""" + +""" +## Improving further + +### Architecture + +The experiment in the original paper demonstrated that increasing the width and depth of the +models improves performance at a higher rate than for supervised learning. Also, +using a [ResNet-50](https://keras.io/api/applications/resnet/#resnet50-function) +encoder is quite standard in the literature. However keep in mind, that more +powerful models will not only increase training time but will also require more +memory and will limit the maximal batch size you can use. + +It has [been](https://arxiv.org/abs/1905.09272) +[reported](https://arxiv.org/abs/1911.05722) that the usage of BatchNorm layers +could sometimes degrade performance, as it introduces an intra-batch dependency +between samples, which is why I did not have used them in this example. In my +experiments however, using BatchNorm, especially in the projection head, +improves performance. + +### Hyperparameters + +The hyperparameters used in this example have been tuned manually for this task and +architecture. Therefore, without changing them, only marginal gains can be expected +from further hyperparameter tuning. + +However for a different task or model architecture these would need tuning, so +here are my notes on the most important ones: + +- **Batch size**: since the objective can be interpreted as a classification +over a batch of images (loosely speaking), the batch size is actually a more +important hyperparameter than usual. The higher, the better. 
- **Temperature**: the temperature defines the "softness" of the softmax
+distribution that is used in the cross-entropy loss, and is an important
+hyperparameter. Lower values generally lead to a higher contrastive accuracy.
+A recent trick (in [ALIGN](https://arxiv.org/abs/2102.05918)) is to learn
+the temperature's value as well (which can be done by defining it as a
+`tf.Variable`, and applying gradients on it; see the sketch at the end of this
+example). Even though this provides a good baseline value, in my experiments
+the learned temperature was somewhat lower than optimal, as it is optimized
+with respect to the contrastive loss, which is not a
+perfect proxy for representation quality.
+- **Image augmentation strength**: during pretraining, stronger augmentations
+increase the difficulty of the task, however, past a point, too strong
+augmentations will degrade performance. During finetuning, stronger
+augmentations reduce overfitting, while in my experience too strong
+augmentations decrease the performance gains from pretraining. The whole data
+augmentation pipeline can be seen as an important hyperparameter of the
+algorithm; implementations of other custom image augmentation layers in Keras
+can be found in
+[this repository](https://github.com/beresandras/image-augmentation-layers-keras).
+- **Learning rate schedule**: a constant schedule is used here, but it is
+quite common in the literature to use a
+[cosine decay schedule](https://www.tensorflow.org/api_docs/python/tf/keras/experimental/CosineDecay),
+which can further improve performance.
+- **Optimizer**: Adam is used in this example, as it provides good performance
+with default parameters. SGD with momentum requires more tuning, however it
+could slightly increase performance.
+"""
+
+"""
+## Related works
+
+Other instance-level (image-level) contrastive learning methods:
+
+- [MoCo](https://arxiv.org/abs/1911.05722)
+([v2](https://arxiv.org/abs/2003.04297),
+[v3](https://arxiv.org/abs/2104.02057)): uses a momentum-encoder as well,
+whose weights are an exponential moving average of the target encoder
+- [SwAV](https://arxiv.org/abs/2006.09882): uses clustering instead of pairwise
+comparison
+- [BarlowTwins](https://arxiv.org/abs/2103.03230): uses a cross
+correlation-based objective instead of pairwise comparison
+
+Keras implementations of **MoCo** and **BarlowTwins** can be found in
+[this repository](https://github.com/beresandras/contrastive-classification-keras),
+which includes a Colab notebook.
+
+There is also a new line of works, which optimize a similar objective, but
+without the use of any negatives:
+
+- [BYOL](https://arxiv.org/abs/2006.07733): momentum-encoder + no negatives
+- [SimSiam](https://arxiv.org/abs/2011.10566)
+([Keras example](https://keras.io/examples/vision/simsiam/)):
+no momentum-encoder + no negatives
+
+In my experience, these methods are more brittle (they can collapse to a constant
+representation; I could not get them to work using this encoder architecture).
+Even though they are generally more dependent on the
+[model](https://generallyintelligent.ai/understanding-self-supervised-contrastive-learning.html)
+[architecture](https://arxiv.org/abs/2010.10241), they can improve
+performance at smaller batch sizes.
+
+You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/semi-supervised-classification-simclr)
+and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/semi-supervised-classification).
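+
+Finally, here is a rough sketch of the learnable-temperature trick mentioned in
+the hyperparameters section above. The names are illustrative, and note that the
+new variable must also be included in the weights passed to `tape.gradient` and
+to the contrastive optimizer, otherwise it will never be updated:
+
+```python
+# Inside ContrastiveModel.__init__ (sketch):
+self.log_temperature = tf.Variable(tf.math.log(temperature), trainable=True)
+
+# Inside contrastive_loss (sketch), replace the fixed temperature with:
+temperature = tf.exp(self.log_temperature)  # log-scale parametrization stays positive
+similarities = (
+    tf.matmul(projections_1, projections_2, transpose_b=True) / temperature
+)
+```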
+""" \ No newline at end of file diff --git a/examples/keras_io/tensorflow/vision/swim_transformers.py b/examples/keras_io/tensorflow/vision/swim_transformers.py new file mode 100644 index 000000000..ed1e72ed1 --- /dev/null +++ b/examples/keras_io/tensorflow/vision/swim_transformers.py @@ -0,0 +1,547 @@ +""" +Title: Image classification with Swin Transformers +Author: [Rishit Dagli](https://twitter.com/rishit_dagli) +Date created: 2021/09/08 +Last modified: 2021/09/08 +Description: Image classification using Swin Transformers, a general-purpose backbone for computer vision. +Accelerator: GPU +""" +""" +This example implements [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +by Liu et al. for image classification, and demonstrates it on the +[CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html). + +Swin Transformer (**S**hifted **Win**dow Transformer) can serve as a general-purpose backbone +for computer vision. Swin Transformer is a hierarchical Transformer whose +representations are computed with _shifted windows_. The shifted window scheme +brings greater efficiency by limiting self-attention computation to +non-overlapping local windows while also allowing for cross-window connections. +This architecture has the flexibility to model information at various scales and has +a linear computational complexity with respect to image size. + +This example requires TensorFlow 2.5 or higher. +""" + +""" +## Setup +""" + +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf +import keras_core as keras +from keras_core import layers + +""" +## Prepare the data + +We load the CIFAR-100 dataset through `tf.keras.datasets`, +normalize the images, and convert the integer labels to one-hot encoded vectors. +""" + +num_classes = 100 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() +x_train, x_test = x_train / 255.0, x_test / 255.0 +y_train = keras.utils.numerical_utils.to_categorical(y_train, num_classes) +y_test = keras.utils.numerical_utils.to_categorical(y_test, num_classes) +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +plt.figure(figsize=(10, 10)) +for i in range(25): + plt.subplot(5, 5, i + 1) + plt.xticks([]) + plt.yticks([]) + plt.grid(False) + plt.imshow(x_train[i]) +plt.show() + +""" +## Configure the hyperparameters + +A key parameter to pick is the `patch_size`, the size of the input patches. +In order to use each pixel as an individual input, you can set `patch_size` to `(1, 1)`. +Below, we take inspiration from the original paper settings +for training on ImageNet-1K, keeping most of the original settings for this example. 
+""" + +patch_size = (2, 2) # 2-by-2 sized patches +dropout_rate = 0.03 # Dropout rate +num_heads = 8 # Attention heads +embed_dim = 64 # Embedding dimension +num_mlp = 256 # MLP layer size +qkv_bias = True # Convert embedded patches to query, key, and values with a learnable additive value +window_size = 2 # Size of attention window +shift_size = 1 # Size of shifting window +image_dimension = 32 # Initial image size + +num_patch_x = input_shape[0] // patch_size[0] +num_patch_y = input_shape[1] // patch_size[1] + +learning_rate = 1e-3 +batch_size = 128 +num_epochs = 1 +validation_split = 0.1 +weight_decay = 0.0001 +label_smoothing = 0.1 + +""" +## Helper functions + +We create two helper functions to help us get a sequence of +patches from the image, merge patches, and apply dropout. +""" + + +def window_partition(x, window_size): + _, height, width, channels = x.shape + patch_num_y = height // window_size + patch_num_x = width // window_size + x = tf.reshape( + x, shape=(-1, patch_num_y, window_size, patch_num_x, window_size, channels) + ) + x = tf.transpose(x, (0, 1, 3, 2, 4, 5)) + windows = tf.reshape(x, shape=(-1, window_size, window_size, channels)) + return windows + + +def window_reverse(windows, window_size, height, width, channels): + patch_num_y = height // window_size + patch_num_x = width // window_size + x = tf.reshape( + windows, + shape=(-1, patch_num_y, patch_num_x, window_size, window_size, channels), + ) + x = tf.transpose(x, perm=(0, 1, 3, 2, 4, 5)) + x = tf.reshape(x, shape=(-1, height, width, channels)) + return x + + +class DropPath(layers.Layer): + def __init__(self, drop_prob=None, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prob + + def call(self, x): + input_shape = tf.shape(x) + batch_size = input_shape[0] + rank = x.shape.rank + shape = (batch_size,) + (1,) * (rank - 1) + random_tensor = (1 - self.drop_prob) + tf.random.uniform(shape, dtype=x.dtype) + path_mask = tf.floor(random_tensor) + output = tf.math.divide(x, 1 - self.drop_prob) * path_mask + return output + + +""" +## Window based multi-head self-attention + +Usually Transformers perform global self-attention, where the relationships between +a token and all other tokens are computed. The global computation leads to quadratic +complexity with respect to the number of tokens. Here, as the [original paper](https://arxiv.org/abs/2103.14030) +suggests, we compute self-attention within local windows, in a non-overlapping manner. +Global self-attention leads to quadratic computational complexity in the number of patches, +whereas window-based self-attention leads to linear complexity and is easily scalable. 
+""" + + +class WindowAttention(layers.Layer): + def __init__( + self, dim, window_size, num_heads, qkv_bias=True, dropout_rate=0.0, **kwargs + ): + super().__init__(**kwargs) + self.dim = dim + self.window_size = window_size + self.num_heads = num_heads + self.scale = (dim // num_heads) ** -0.5 + self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) + self.dropout = layers.Dropout(dropout_rate) + self.proj = layers.Dense(dim) + + def build(self, input_shape): + num_window_elements = (2 * self.window_size[0] - 1) * ( + 2 * self.window_size[1] - 1 + ) + self.relative_position_bias_table = self.add_weight( + shape=(num_window_elements, self.num_heads), + initializer=tf.initializers.Zeros(), + trainable=True, + ) + coords_h = np.arange(self.window_size[0]) + coords_w = np.arange(self.window_size[1]) + coords_matrix = np.meshgrid(coords_h, coords_w, indexing="ij") + coords = np.stack(coords_matrix) + coords_flatten = coords.reshape(2, -1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + + self.relative_position_index = tf.Variable( + initial_value=lambda: tf.convert_to_tensor(relative_position_index), trainable=False + ) + + def call(self, x, mask=None): + _, size, channels = x.shape + head_dim = channels // self.num_heads + x_qkv = self.qkv(x) + x_qkv = tf.reshape(x_qkv, shape=(-1, size, 3, self.num_heads, head_dim)) + x_qkv = tf.transpose(x_qkv, perm=(2, 0, 3, 1, 4)) + q, k, v = x_qkv[0], x_qkv[1], x_qkv[2] + q = q * self.scale + k = tf.transpose(k, perm=(0, 1, 3, 2)) + attn = q @ k + + num_window_elements = self.window_size[0] * self.window_size[1] + relative_position_index_flat = tf.reshape( + self.relative_position_index, shape=(-1,) + ) + relative_position_bias = tf.gather( + self.relative_position_bias_table, relative_position_index_flat + ) + relative_position_bias = tf.reshape( + relative_position_bias, shape=(num_window_elements, num_window_elements, -1) + ) + relative_position_bias = tf.transpose(relative_position_bias, perm=(2, 0, 1)) + attn = attn + tf.expand_dims(relative_position_bias, axis=0) + + if mask is not None: + nW = mask.shape[0] + mask_float = tf.cast( + tf.expand_dims(tf.expand_dims(mask, axis=1), axis=0), tf.float32 + ) + attn = ( + tf.reshape(attn, shape=(-1, nW, self.num_heads, size, size)) + + mask_float + ) + attn = tf.reshape(attn, shape=(-1, self.num_heads, size, size)) + attn = keras.activations.softmax(attn, axis=-1) + else: + attn = keras.activations.softmax(attn, axis=-1) + attn = self.dropout(attn) + + x_qkv = attn @ v + x_qkv = tf.transpose(x_qkv, perm=(0, 2, 1, 3)) + x_qkv = tf.reshape(x_qkv, shape=(-1, size, channels)) + x_qkv = self.proj(x_qkv) + x_qkv = self.dropout(x_qkv) + return x_qkv + + +""" +## The complete Swin Transformer model + +Finally, we put together the complete Swin Transformer by replacing the standard multi-head +attention (MHA) with shifted windows attention. As suggested in the +original paper, we create a model comprising of a shifted window-based MHA +layer, followed by a 2-layer MLP with GELU nonlinearity in between, applying +`LayerNormalization` before each MSA layer and each MLP, and a residual +connection after each of these layers. + +Notice that we only create a simple MLP with 2 Dense and +2 Dropout layers. 
+
+
+class SwinTransformer(layers.Layer):
+    def __init__(
+        self,
+        dim,
+        num_patch,
+        num_heads,
+        window_size=7,
+        shift_size=0,
+        num_mlp=1024,
+        qkv_bias=True,
+        dropout_rate=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.dim = dim  # number of input dimensions
+        self.num_patch = num_patch  # number of embedded patches
+        self.num_heads = num_heads  # number of attention heads
+        self.window_size = window_size  # size of window
+        self.shift_size = shift_size  # size of window shift
+        self.num_mlp = num_mlp  # number of MLP nodes
+
+        self.norm1 = layers.LayerNormalization(epsilon=1e-5)
+        self.attn = WindowAttention(
+            dim,
+            window_size=(self.window_size, self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            dropout_rate=dropout_rate,
+        )
+        self.drop_path = DropPath(dropout_rate)
+        self.norm2 = layers.LayerNormalization(epsilon=1e-5)
+
+        self.mlp = keras.Sequential(
+            [
+                layers.Dense(num_mlp),
+                layers.Activation(keras.activations.gelu),
+                layers.Dropout(dropout_rate),
+                layers.Dense(dim),
+                layers.Dropout(dropout_rate),
+            ]
+        )
+
+        if min(self.num_patch) < self.window_size:
+            self.shift_size = 0
+            self.window_size = min(self.num_patch)
+
+    def build(self, input_shape):
+        if self.shift_size == 0:
+            self.attn_mask = None
+        else:
+            height, width = self.num_patch
+            h_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            w_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            mask_array = np.zeros((1, height, width, 1))
+            count = 0
+            for h in h_slices:
+                for w in w_slices:
+                    mask_array[:, h, w, :] = count
+                    count += 1
+            mask_array = tf.convert_to_tensor(mask_array)
+
+            # mask array to windows
+            mask_windows = window_partition(mask_array, self.window_size)
+            mask_windows = tf.reshape(
+                mask_windows, shape=[-1, self.window_size * self.window_size]
+            )
+            attn_mask = tf.expand_dims(mask_windows, axis=1) - tf.expand_dims(
+                mask_windows, axis=2
+            )
+            attn_mask = tf.where(attn_mask != 0, -100.0, attn_mask)
+            attn_mask = tf.where(attn_mask == 0, 0.0, attn_mask)
+            self.attn_mask = tf.Variable(initial_value=attn_mask, trainable=False)
+
+    def call(self, x):
+        height, width = self.num_patch
+        _, num_patches_before, channels = x.shape
+        x_skip = x
+        x = self.norm1(x)
+        x = tf.reshape(x, shape=(-1, height, width, channels))
+        if self.shift_size > 0:
+            shifted_x = tf.roll(
+                x, shift=[-self.shift_size, -self.shift_size], axis=[1, 2]
+            )
+        else:
+            shifted_x = x
+
+        x_windows = window_partition(shifted_x, self.window_size)
+        x_windows = tf.reshape(
+            x_windows, shape=(-1, self.window_size * self.window_size, channels)
+        )
+        attn_windows = self.attn(x_windows, mask=self.attn_mask)
+
+        attn_windows = tf.reshape(
+            attn_windows, shape=(-1, self.window_size, self.window_size, channels)
+        )
+        shifted_x = window_reverse(
+            attn_windows, self.window_size, height, width, channels
+        )
+        if self.shift_size > 0:
+            x = tf.roll(
+                shifted_x, shift=[self.shift_size, self.shift_size], axis=[1, 2]
+            )
+        else:
+            x = shifted_x
+
+        x = tf.reshape(x, shape=(-1, height * width, channels))
+        x = self.drop_path(x)
+        x = x_skip + x
+        x_skip = x
+        x = self.norm2(x)
+        x = self.mlp(x)
+        x = self.drop_path(x)
+        x = x_skip + x
+        return x
+
+
+"""
+## Model training and evaluation
+
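+Before building the full training pipeline, here is a short, purely illustrative
+shape check of a single Swin block (the `demo_` names below are ours):
+"""
+
+# One Swin block on a dummy batch: 16x16 = 256 patch embeddings of dimension 64.
+# The block preserves the (batch, num_patches, channels) shape.
+demo_block = SwinTransformer(
+    dim=64, num_patch=(16, 16), num_heads=4, window_size=2, shift_size=1
+)
+demo_tokens = tf.random.normal((2, 16 * 16, 64))
+print(demo_block(demo_tokens).shape)  # -> (2, 256, 64)
+
+"""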
+### Extract and embed patches
+
+We first create three layers that extract, embed, and merge patches from the
+input images; the Swin Transformer blocks defined above will then operate on
+the resulting patch embeddings.
+"""
+
+
+class PatchExtract(layers.Layer):
+    def __init__(self, patch_size, **kwargs):
+        super().__init__(**kwargs)
+        self.patch_size_x = patch_size[0]
+        self.patch_size_y = patch_size[1]
+
+    def call(self, images):
+        batch_size = tf.shape(images)[0]
+        patches = tf.image.extract_patches(
+            images=images,
+            sizes=(1, self.patch_size_x, self.patch_size_y, 1),
+            strides=(1, self.patch_size_x, self.patch_size_y, 1),
+            rates=(1, 1, 1, 1),
+            padding="VALID",
+        )
+        patch_dim = patches.shape[-1]
+        patch_num = patches.shape[1]
+        return tf.reshape(patches, (batch_size, patch_num * patch_num, patch_dim))
+
+
+class PatchEmbedding(layers.Layer):
+    def __init__(self, num_patch, embed_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.num_patch = num_patch
+        self.proj = layers.Dense(embed_dim)
+        self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim)
+
+    def call(self, patch):
+        pos = tf.range(start=0, limit=self.num_patch, delta=1)
+        return self.proj(patch) + self.pos_embed(pos)
+
+
+class PatchMerging(keras.layers.Layer):
+    def __init__(self, num_patch, embed_dim):
+        super().__init__()
+        self.num_patch = num_patch
+        self.embed_dim = embed_dim
+        self.linear_trans = layers.Dense(2 * embed_dim, use_bias=False)
+
+    def call(self, x):
+        height, width = self.num_patch
+        _, _, C = x.shape
+        x = tf.reshape(x, shape=(-1, height, width, C))
+        x0 = x[:, 0::2, 0::2, :]
+        x1 = x[:, 1::2, 0::2, :]
+        x2 = x[:, 0::2, 1::2, :]
+        x3 = x[:, 1::2, 1::2, :]
+        x = tf.concat((x0, x1, x2, x3), axis=-1)
+        x = tf.reshape(x, shape=(-1, (height // 2) * (width // 2), 4 * C))
+        return self.linear_trans(x)
+
+
+"""
+### Build the model
+
+We put together the Swin Transformer model.
+"""
+
+inputs = layers.Input(input_shape)
+x = layers.RandomCrop(image_dimension, image_dimension)(inputs)
+x = layers.RandomFlip("horizontal")(x)
+x = PatchExtract(patch_size)(x)
+x = PatchEmbedding(num_patch_x * num_patch_y, embed_dim)(x)
+x = SwinTransformer(
+    dim=embed_dim,
+    num_patch=(num_patch_x, num_patch_y),
+    num_heads=num_heads,
+    window_size=window_size,
+    shift_size=0,
+    num_mlp=num_mlp,
+    qkv_bias=qkv_bias,
+    dropout_rate=dropout_rate,
+)(x)
+x = SwinTransformer(
+    dim=embed_dim,
+    num_patch=(num_patch_x, num_patch_y),
+    num_heads=num_heads,
+    window_size=window_size,
+    shift_size=shift_size,
+    num_mlp=num_mlp,
+    qkv_bias=qkv_bias,
+    dropout_rate=dropout_rate,
+)(x)
+x = PatchMerging((num_patch_x, num_patch_y), embed_dim=embed_dim)(x)
+x = layers.GlobalAveragePooling1D()(x)
+outputs = layers.Dense(num_classes, activation="softmax")(x)
+
+"""
+### Train on CIFAR-100
+
+We train the model on CIFAR-100. Here, `num_epochs` is set to 1 so that this
+example runs quickly; set it to 40 (as in the original version of this example)
+to reproduce the results discussed below. In practice, you should train for
+about 150 epochs to reach convergence.
+"""
+
+model = keras.Model(inputs, outputs)
+model.compile(
+    loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing),
+    optimizer=keras.optimizers.AdamW(
+        learning_rate=learning_rate, weight_decay=weight_decay
+    ),
+    metrics=[
+        keras.metrics.CategoricalAccuracy(name="accuracy"),
+        keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
+    ],
+)
+
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=batch_size,
+    epochs=num_epochs,
+    validation_split=validation_split,
+)
+
+"""
+Let's visualize the training progress of the model.
+""" + +plt.plot(history.history["loss"], label="train_loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.xlabel("Epochs") +plt.ylabel("Loss") +plt.title("Train and Validation Losses Over Epochs", fontsize=14) +plt.legend() +plt.grid() +plt.show() + +""" +Let's display the final results of the training on CIFAR-100. +""" + +loss, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) +print(f"Test loss: {round(loss, 2)}") +print(f"Test accuracy: {round(accuracy * 100, 2)}%") +print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + +""" +The Swin Transformer model we just trained has just 152K parameters, and it gets +us to ~75% test top-5 accuracy within just 40 epochs without any signs of overfitting +as well as seen in above graph. This means we can train this network for longer +(perhaps with a bit more regularization) and obtain even better performance. +This performance can further be improved by additional techniques like cosine +decay learning rate schedule, other data augmentation techniques. While experimenting, +I tried training the model for 150 epochs with a slightly higher dropout and greater +embedding dimensions which pushes the performance to ~72% test accuracy on CIFAR-100 +as you can see in the screenshot. + +![Results of training for longer](https://i.imgur.com/9vnQesZ.png) + +The authors present a top-1 accuracy of 87.3% on ImageNet. The authors also present +a number of experiments to study how input sizes, optimizers etc. affect the final +performance of this model. The authors further present using this model for object detection, +semantic segmentation and instance segmentation as well and report competitive results +for these. You are strongly advised to also check out the +[original paper](https://arxiv.org/abs/2103.14030). + +This example takes inspiration from the official +[PyTorch](https://github.com/microsoft/Swin-Transformer) and +[TensorFlow](https://github.com/VcampSoldiers/Swin-Transformer-Tensorflow) implementations. +""" \ No newline at end of file diff --git a/examples/keras_io/timeseries/timeseries_classification_from_scratch.py b/examples/keras_io/timeseries/timeseries_classification_from_scratch.py new file mode 100755 index 000000000..73564a35c --- /dev/null +++ b/examples/keras_io/timeseries/timeseries_classification_from_scratch.py @@ -0,0 +1,226 @@ +""" +Title: Timeseries classification from scratch +Author: [hfawaz](https://github.com/hfawaz/) +Date created: 2020/07/21 +Last modified: 2021/07/16 +Description: Training a timeseries classifier from scratch on the FordA dataset from the UCR/UEA archive. +Accelerator: GPU +""" +""" +## Introduction + +This example shows how to do timeseries classification from scratch, starting from raw +CSV timeseries files on disk. We demonstrate the workflow on the FordA dataset from the +[UCR/UEA archive](https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/). + +""" + +""" +## Setup + +""" +import keras_core as keras +import numpy as np +import matplotlib.pyplot as plt + +""" +## Load the data: the FordA dataset + +### Dataset description + +The dataset we are using here is called FordA. +The data comes from the UCR archive. +The dataset contains 3601 training instances and another 1320 testing instances. +Each timeseries corresponds to a measurement of engine noise captured by a motor sensor. +For this task, the goal is to automatically detect the presence of a specific issue with +the engine. The problem is a balanced binary classification task. 
+The full description of this dataset can be found [here](http://www.j-wichard.de/publications/FordPaper.pdf).
+
+### Read the TSV data
+
+We will use the `FordA_TRAIN` file for training and the
+`FordA_TEST` file for testing. The simplicity of this dataset
+allows us to demonstrate effectively how to use ConvNets for timeseries classification.
+In these files, the first column corresponds to the label.
+"""
+
+
+def readucr(filename):
+    data = np.loadtxt(filename, delimiter="\t")
+    y = data[:, 0]
+    x = data[:, 1:]
+    return x, y.astype(int)
+
+
+root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"
+
+x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv")
+x_test, y_test = readucr(root_url + "FordA_TEST.tsv")
+
+"""
+## Visualize the data
+
+Here we visualize one timeseries example for each class in the dataset.
+
+"""
+
+classes = np.unique(np.concatenate((y_train, y_test), axis=0))
+
+plt.figure()
+for c in classes:
+    c_x_train = x_train[y_train == c]
+    plt.plot(c_x_train[0], label="class " + str(c))
+plt.legend(loc="best")
+plt.show()
+plt.close()
+
+"""
+## Standardize the data
+
+Our timeseries are already of equal length (500 steps each). However, their
+values can span quite different ranges, which is not ideal for a neural network;
+in general, we should seek to normalize the input values.
+For this specific dataset, the data is already z-normalized: each timeseries sample
+has a mean equal to zero and a standard deviation equal to one. This type of
+normalization is very common for timeseries classification problems, see
+[Bagnall et al. (2016)](https://link.springer.com/article/10.1007/s10618-016-0483-9).
+
+Note that the timeseries data used here are univariate, meaning we only have one channel
+per timeseries example.
+We will therefore reshape the timeseries with NumPy so that each sample has an
+explicit channel dimension.
+This will allow us to construct a model that is easily applicable to multivariate time
+series.
+"""
+
+x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
+x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))
+
+"""
+Finally, in order to use `sparse_categorical_crossentropy`, we will have to count
+the number of classes beforehand.
+"""
+
+num_classes = len(np.unique(y_train))
+
+"""
+Now we shuffle the training set because we will be using the `validation_split` option
+later when training: `validation_split` takes the last samples of the arrays, so
+shuffling avoids a biased validation set.
+"""
+
+idx = np.random.permutation(len(x_train))
+x_train = x_train[idx]
+y_train = y_train[idx]
+
+"""
+Standardize the labels to positive integers.
+The expected labels will then be 0 and 1.
+"""
+
+y_train[y_train == -1] = 0
+y_test[y_test == -1] = 0
+
+"""
+## Build a model
+
+We build a Fully Convolutional Neural Network originally proposed in
+[this paper](https://arxiv.org/abs/1611.06455).
+The implementation is based on the TF 2 version provided
+[here](https://github.com/hfawaz/dl-4-tsc/).
+The following hyperparameters (`kernel_size`, `filters`, the use of
+`BatchNormalization`) were found via random search using
+[KerasTuner](https://github.com/keras-team/keras-tuner).
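+
+Before defining the model, a quick, purely illustrative sanity check on the
+data we have prepared so far (shapes and label values):
+"""
+
+# Inputs should be (num_samples, 500, 1); labels should now be 0 or 1.
+print("x_train shape:", x_train.shape)
+print("classes:", np.unique(y_train), "- num_classes:", num_classes)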
+
+
+def make_model(input_shape):
+    input_layer = keras.layers.Input(input_shape)
+
+    conv1 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(input_layer)
+    conv1 = keras.layers.BatchNormalization()(conv1)
+    conv1 = keras.layers.ReLU()(conv1)
+
+    conv2 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv1)
+    conv2 = keras.layers.BatchNormalization()(conv2)
+    conv2 = keras.layers.ReLU()(conv2)
+
+    conv3 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv2)
+    conv3 = keras.layers.BatchNormalization()(conv3)
+    conv3 = keras.layers.ReLU()(conv3)
+
+    gap = keras.layers.GlobalAveragePooling1D()(conv3)
+
+    output_layer = keras.layers.Dense(num_classes, activation="softmax")(gap)
+
+    return keras.models.Model(inputs=input_layer, outputs=output_layer)
+
+
+model = make_model(input_shape=x_train.shape[1:])
+keras.utils.plot_model(model, show_shapes=True)
+
+"""
+## Train the model
+
+"""
+
+epochs = 500
+batch_size = 32
+
+callbacks = [
+    keras.callbacks.ModelCheckpoint(
+        "best_model.keras", save_best_only=True, monitor="val_loss"
+    ),
+    keras.callbacks.ReduceLROnPlateau(
+        monitor="val_loss", factor=0.5, patience=20, min_lr=0.0001
+    ),
+    keras.callbacks.EarlyStopping(monitor="val_loss", patience=50, verbose=1),
+]
+model.compile(
+    optimizer="adam",
+    loss="sparse_categorical_crossentropy",
+    metrics=["sparse_categorical_accuracy"],
+)
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=batch_size,
+    epochs=epochs,
+    callbacks=callbacks,
+    validation_split=0.2,
+    verbose=1,
+)
+
+"""
+## Evaluate model on test data
+"""
+
+model = keras.models.load_model("best_model.keras")
+
+test_loss, test_acc = model.evaluate(x_test, y_test)
+
+print("Test accuracy", test_acc)
+print("Test loss", test_loss)
+
+"""
+## Plot the model's training and validation accuracy
+"""
+
+metric = "sparse_categorical_accuracy"
+plt.figure()
+plt.plot(history.history[metric])
+plt.plot(history.history["val_" + metric])
+plt.title("model " + metric)
+plt.ylabel(metric, fontsize="large")
+plt.xlabel("epoch", fontsize="large")
+plt.legend(["train", "val"], loc="best")
+plt.show()
+plt.close()
+
+"""
+Training accuracy reaches almost 0.95 after 100 epochs. However, the validation
+accuracy shows that the network still benefits from further training: both the
+validation and the training accuracy approach 0.97 around epoch 200. Beyond that
+point, continued training makes the validation accuracy start to decrease while
+the training accuracy keeps increasing: the model begins to overfit.
+"""
diff --git a/keras_core/backend/jax/numpy.py b/keras_core/backend/jax/numpy.py
index 178941f48..db22ad19b 100644
--- a/keras_core/backend/jax/numpy.py
+++ b/keras_core/backend/jax/numpy.py
@@ -471,6 +471,8 @@ def tan(x):
 
 
 def tensordot(x1, x2, axes=2):
+    x1 = convert_to_tensor(x1)
+    x2 = convert_to_tensor(x2)
     return jnp.tensordot(x1, x2, axes=axes)