From f6cb39615821fbbe18a864b49db7cc2b3a274386 Mon Sep 17 00:00:00 2001
From: Francois Chollet
Date: Sat, 10 Jun 2023 11:58:52 -0700
Subject: [PATCH] Merge branch 'main' of github.com:keras-team/keras-core

---
 .../keras_io/tensorflow/vision/captcha_ocr.py |  342 +++++++++
 ...ject_detection_using_vision_transformer.py |  503 +++++++++++++
 .../vision/semisupervised_simclr.py           |  673 ++++++++++++++++++
 .../tensorflow/vision/swim_transformers.py    |  547 ++++++++++++++
 .../timeseries_classification_from_scratch.py |  226 ++++++
 keras_core/backend/jax/numpy.py               |    2 +
 6 files changed, 2293 insertions(+)
 create mode 100644 examples/keras_io/tensorflow/vision/captcha_ocr.py
 create mode 100644 examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py
 create mode 100644 examples/keras_io/tensorflow/vision/semisupervised_simclr.py
 create mode 100644 examples/keras_io/tensorflow/vision/swim_transformers.py
 create mode 100755 examples/keras_io/timeseries/timeseries_classification_from_scratch.py

diff --git a/examples/keras_io/tensorflow/vision/captcha_ocr.py b/examples/keras_io/tensorflow/vision/captcha_ocr.py
new file mode 100644
index 000000000..5126d7fff
--- /dev/null
+++ b/examples/keras_io/tensorflow/vision/captcha_ocr.py
@@ -0,0 +1,342 @@
+"""
+Title: OCR model for reading Captchas
+Author: [A_K_Nain](https://twitter.com/A_K_Nain)
+Date created: 2020/06/14
+Last modified: 2020/06/26
+Description: How to implement an OCR model using CNNs, RNNs and CTC loss.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+This example demonstrates a simple OCR model built with the Functional API. Apart from
+combining CNN and RNN, it also illustrates how you can instantiate a new layer
+and use it as an "Endpoint layer" for implementing CTC loss. For a detailed
+guide to layer subclassing, please check out
+[this page](https://keras.io/guides/making_new_layers_and_models_via_subclassing/)
+in the developer guides.
+"""
+
+"""
+## Setup
+"""
+
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+
+from pathlib import Path
+from collections import Counter
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+
+
+"""
+## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)
+Let's download the data.
+"""
+
+
+"""shell
+curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
+unzip -qq captcha_images_v2.zip
+"""
+
+
+"""
+The dataset contains 1040 captcha files as `png` images. The label for each sample is a string,
+the name of the file (minus the file extension).
+We will map each character in the string to an integer for training the model. Similarly,
+we will need to map the predictions of the model back to strings. For this purpose,
+we will maintain two dictionaries, mapping characters to integers, and integers to characters,
+respectively.
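+
+As an illustration, the round trip with `keras.layers.StringLookup` (the utility
+used below) might look like this; the vocabulary here is a made-up example:
+
+```python
+vocab = ["2", "3", "b", "n"]  # example character set
+to_num = layers.StringLookup(vocabulary=vocab, mask_token=None)
+to_char = layers.StringLookup(
+    vocabulary=to_num.get_vocabulary(), mask_token=None, invert=True
+)
+ids = to_num(tf.strings.unicode_split("b2n3", input_encoding="UTF-8"))
+text = tf.strings.reduce_join(to_char(ids)).numpy().decode("utf-8")  # "b2n3"
+```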
+""" + + +# Path to the data directory +data_dir = Path("./captcha_images_v2/") + +# Get list of all the images +images = sorted(list(map(str, list(data_dir.glob("*.png"))))) +labels = [img.split(os.path.sep)[-1].split(".png")[0] for img in images] +characters = set(char for label in labels for char in label) +characters = sorted(list(characters)) + +print("Number of images found: ", len(images)) +print("Number of labels found: ", len(labels)) +print("Number of unique characters: ", len(characters)) +print("Characters present: ", characters) + +# Batch size for training and validation +batch_size = 16 + +# Desired image dimensions +img_width = 200 +img_height = 50 + +# Factor by which the image is going to be downsampled +# by the convolutional blocks. We will be using two +# convolution blocks and each block will have +# a pooling layer which downsample the features by a factor of 2. +# Hence total downsampling factor would be 4. +downsample_factor = 4 + +# Maximum length of any captcha in the dataset +max_length = max([len(label) for label in labels]) + + +""" +## Preprocessing +""" + + +# Mapping characters to integers +char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None) + +# Mapping integers back to original characters +num_to_char = layers.StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True +) + + +def split_data(images, labels, train_size=0.9, shuffle=True): + # 1. Get the total size of the dataset + size = len(images) + # 2. Make an indices array and shuffle it, if required + indices = np.arange(size) + if shuffle: + np.random.shuffle(indices) + # 3. Get the size of training samples + train_samples = int(size * train_size) + # 4. Split data into training and validation sets + x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]] + x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]] + return x_train, x_valid, y_train, y_valid + + +# Splitting data into training and validation sets +x_train, x_valid, y_train, y_valid = split_data(np.array(images), np.array(labels)) + + +def encode_single_sample(img_path, label): + # 1. Read image + img = tf.io.read_file(img_path) + # 2. Decode and convert to grayscale + img = tf.io.decode_png(img, channels=1) + # 3. Convert to float32 in [0, 1] range + img = tf.image.convert_image_dtype(img, tf.float32) + # 4. Resize to the desired size + img = tf.image.resize(img, [img_height, img_width]) + # 5. Transpose the image because we want the time + # dimension to correspond to the width of the image. + img = tf.transpose(img, perm=[1, 0, 2]) + # 6. Map the characters in label to numbers + label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8")) + # 7. 
Return a dict as our model expects two inputs
+    return {"image": img, "label": label}
+
+
+"""
+## Create `Dataset` objects
+"""
+
+
+train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+train_dataset = (
+    train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
+    .batch(batch_size)
+    .prefetch(buffer_size=tf.data.AUTOTUNE)
+)
+
+validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
+validation_dataset = (
+    validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
+    .batch(batch_size)
+    .prefetch(buffer_size=tf.data.AUTOTUNE)
+)
+
+"""
+## Visualize the data
+"""
+
+
+_, ax = plt.subplots(4, 4, figsize=(10, 5))
+for batch in train_dataset.take(1):
+    images = batch["image"]
+    labels = batch["label"]
+    for i in range(16):
+        img = (images[i] * 255).numpy().astype("uint8")
+        label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode("utf-8")
+        ax[i // 4, i % 4].imshow(img[:, :, 0].T, cmap="gray")
+        ax[i // 4, i % 4].set_title(label)
+        ax[i // 4, i % 4].axis("off")
+plt.show()
+
+"""
+## Model
+"""
+
+
+class CTCLayer(layers.Layer):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.loss_fn = keras.backend.ctc_batch_cost
+
+    def call(self, y_true, y_pred):
+        # Compute the training-time loss value and add it
+        # to the layer using `self.add_loss()`.
+        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
+        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
+        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
+
+        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+
+        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
+        self.add_loss(loss)
+
+        # At test time, just return the computed predictions
+        return y_pred
+
+
+def build_model():
+    # Inputs to the model
+    input_img = layers.Input(
+        shape=(img_width, img_height, 1), name="image", dtype="float32"
+    )
+    labels = layers.Input(name="label", shape=(None,), dtype="float32")
+
+    # First conv block
+    x = layers.Conv2D(
+        32,
+        (3, 3),
+        activation="relu",
+        kernel_initializer="he_normal",
+        padding="same",
+        name="Conv1",
+    )(input_img)
+    x = layers.MaxPooling2D((2, 2), name="pool1")(x)
+
+    # Second conv block
+    x = layers.Conv2D(
+        64,
+        (3, 3),
+        activation="relu",
+        kernel_initializer="he_normal",
+        padding="same",
+        name="Conv2",
+    )(x)
+    x = layers.MaxPooling2D((2, 2), name="pool2")(x)
+
+    # We have used two max-pooling layers with pool size and strides of 2.
+    # Hence, downsampled feature maps are 4x smaller. The number of
+    # filters in the last layer is 64. 
Reshape accordingly before + # passing the output to the RNN part of the model + new_shape = ((img_width // 4), (img_height // 4) * 64) + x = layers.Reshape(target_shape=new_shape, name="reshape")(x) + x = layers.Dense(64, activation="relu", name="dense1")(x) + x = layers.Dropout(0.2)(x) + + # RNNs + x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x) + x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x) + + # Output layer + x = layers.Dense( + len(char_to_num.get_vocabulary()) + 1, activation="softmax", name="dense2" + )(x) + + # Add CTC layer for calculating CTC loss at each step + output = CTCLayer(name="ctc_loss")(labels, x) + + # Define the model + model = keras.models.Model( + inputs=[input_img, labels], outputs=output, name="ocr_model_v1" + ) + # Optimizer + opt = keras.optimizers.Adam() + # Compile the model and return + model.compile(optimizer=opt) + return model + + +# Get the model +model = build_model() +model.summary() + +""" +## Training +""" + + +epochs = 1 +early_stopping_patience = 10 +# Add early stopping +early_stopping = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True +) + +# Train the model +history = model.fit( + train_dataset, + validation_data=validation_dataset, + epochs=epochs, + callbacks=[early_stopping], +) + + +""" +## Inference + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ocr-for-captcha) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ocr-for-captcha). +""" + + +# Get the prediction model by extracting layers till the output layer +prediction_model = keras.models.Model( + model.get_layer(name="image").input, model.get_layer(name="dense2").output +) +prediction_model.summary() + + +# A utility function to decode the output of the network +def decode_batch_predictions(pred): + input_len = np.ones(pred.shape[0]) * pred.shape[1] + # Use greedy search. 
For complex tasks, you can use beam search.
+    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
+        :, :max_length
+    ]
+    # Iterate over the results and get back the text
+    output_text = []
+    for res in results:
+        res = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8")
+        output_text.append(res)
+    return output_text
+
+
+# Let's check results on some validation samples
+for batch in validation_dataset.take(1):
+    batch_images = batch["image"]
+    batch_labels = batch["label"]
+
+    preds = prediction_model.predict(batch_images)
+    pred_texts = decode_batch_predictions(preds)
+
+    orig_texts = []
+    for label in batch_labels:
+        label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
+        orig_texts.append(label)
+
+    _, ax = plt.subplots(4, 4, figsize=(15, 5))
+    for i in range(len(pred_texts)):
+        img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8)
+        img = img.T
+        title = f"Prediction: {pred_texts[i]}"
+        ax[i // 4, i % 4].imshow(img, cmap="gray")
+        ax[i // 4, i % 4].set_title(title)
+        ax[i // 4, i % 4].axis("off")
+plt.show()
\ No newline at end of file
diff --git a/examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py b/examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py
new file mode 100644
index 000000000..4e4636b96
--- /dev/null
+++ b/examples/keras_io/tensorflow/vision/object_detection_using_vision_transformer.py
@@ -0,0 +1,503 @@
+"""
+Title: Object detection with Vision Transformers
+Author: [Karan V. Dave](https://www.linkedin.com/in/karan-dave-811413164/)
+Date created: 2022/03/27
+Last modified: 2022/03/27
+Description: A simple Keras implementation of object detection using Vision Transformers.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+The [Vision Transformer (ViT)](https://arxiv.org/abs/2010.11929)
+architecture by Alexey Dosovitskiy et al.
+demonstrates that a pure transformer applied directly to sequences of image
+patches can perform well on image recognition tasks.
+
+In this Keras example, we extend the idea to object detection: we implement a ViT
+and train it on the
+[Caltech 101 dataset](http://www.vision.caltech.edu/datasets/)
+to detect an airplane in the given image.
+
+This example requires TensorFlow 2.4 or higher.
+"""
+
+"""
+## Imports and setup
+"""
+
+import numpy as np
+import tensorflow as tf
+import keras_core as keras
+from keras_core import layers
+import matplotlib.pyplot as plt
+import cv2
+import os
+import scipy.io
+import shutil
+
+"""
+## Prepare dataset
+
+We use the [Caltech 101 Dataset](https://data.caltech.edu/records/mzrjq-6wc02).
+""" + +# Path to images and annotations +path_images = "./101_ObjectCategories/airplanes/" +path_annot = "./Annotations/Airplanes_Side_2/" + +path_to_downloaded_file = keras.utils.get_file( + fname="caltech_101_zipped", + origin="https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip", + extract=True, + archive_format="zip", # downloaded file format + cache_dir="./", # cache and extract in current directory +) + +# Extracting tar files found inside main zip file +shutil.unpack_archive("./datasets/caltech-101/101_ObjectCategories.tar.gz", "./") +shutil.unpack_archive("./datasets/caltech-101/Annotations.tar", "./") + +# list of paths to images and annotations +image_paths = [ + f for f in os.listdir(path_images) if os.path.isfile(os.path.join(path_images, f)) +] +annot_paths = [ + f for f in os.listdir(path_annot) if os.path.isfile(os.path.join(path_annot, f)) +] + +image_paths.sort() +annot_paths.sort() + +image_size = 224 # resize input images to this size + +images, targets = [], [] + +# loop over the annotations and images, preprocess them and store in lists +for i in range(0, len(annot_paths)): + # Access bounding box coordinates + annot = scipy.io.loadmat(path_annot + annot_paths[i])["box_coord"][0] + + top_left_x, top_left_y = annot[2], annot[0] + bottom_right_x, bottom_right_y = annot[3], annot[1] + + image = keras.utils.load_img( + path_images + image_paths[i], + ) + (w, h) = image.size[:2] + + # resize train set images + if i < int(len(annot_paths) * 0.8): + # resize image if it is for training dataset + image = image.resize((image_size, image_size)) + + # convert image to array and append to list + images.append(keras.utils.img_to_array(image)) + + # apply relative scaling to bounding boxes as per given image and append to list + targets.append( + ( + float(top_left_x) / w, + float(top_left_y) / h, + float(bottom_right_x) / w, + float(bottom_right_y) / h, + ) + ) + +# Convert the list to numpy array, split to train and test dataset +(x_train), (y_train) = ( + np.asarray(images[: int(len(images) * 0.8)]), + np.asarray(targets[: int(len(targets) * 0.8)]), +) +(x_test), (y_test) = ( + np.asarray(images[int(len(images) * 0.8) :]), + np.asarray(targets[int(len(targets) * 0.8) :]), +) + +""" +## Implement multilayer-perceptron (MLP) + +We use the code from the Keras example +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/) +as a reference. 
+""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=tf.nn.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Implement the patch creation layer +""" + + +class Patches(layers.Layer): + def __init__(self, patch_size): + super().__init__() + self.patch_size = patch_size + + # Override function to avoid error while saving model + def get_config(self): + config = super().get_config().copy() + config.update( + { + "input_shape": input_shape, + "patch_size": patch_size, + "num_patches": num_patches, + "projection_dim": projection_dim, + "num_heads": num_heads, + "transformer_units": transformer_units, + "transformer_layers": transformer_layers, + "mlp_head_units": mlp_head_units, + } + ) + return config + + def call(self, images): + batch_size = tf.shape(images)[0] + patches = tf.image.extract_patches( + images=images, + sizes=[1, self.patch_size, self.patch_size, 1], + strides=[1, self.patch_size, self.patch_size, 1], + rates=[1, 1, 1, 1], + padding="VALID", + ) + # return patches + return tf.reshape(patches, [batch_size, -1, patches.shape[-1]]) + + +""" +## Display patches for an input image +""" + +patch_size = 32 # Size of the patches to be extracted from the input images + +plt.figure(figsize=(4, 4)) +plt.imshow(x_train[0].astype("uint8")) +plt.axis("off") + +patches = Patches(patch_size)(tf.convert_to_tensor([x_train[0]])) +print(f"Image size: {image_size} X {image_size}") +print(f"Patch size: {patch_size} X {patch_size}") +print(f"{patches.shape[1]} patches per image \n{patches.shape[-1]} elements per patch") + + +n = int(np.sqrt(patches.shape[1])) +plt.figure(figsize=(4, 4)) +for i, patch in enumerate(patches[0]): + ax = plt.subplot(n, n, i + 1) + patch_img = tf.reshape(patch, (patch_size, patch_size, 3)) + plt.imshow(patch_img.numpy().astype("uint8")) + plt.axis("off") + +""" +## Implement the patch encoding layer + +The `PatchEncoder` layer linearly transforms a patch by projecting it into a +vector of size `projection_dim`. It also adds a learnable position +embedding to the projected vector. +""" + + +class PatchEncoder(layers.Layer): + def __init__(self, num_patches, projection_dim): + super().__init__() + self.num_patches = num_patches + self.projection = layers.Dense(units=projection_dim) + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + # Override function to avoid error while saving model + def get_config(self): + config = super().get_config().copy() + config.update( + { + "input_shape": input_shape, + "patch_size": patch_size, + "num_patches": num_patches, + "projection_dim": projection_dim, + "num_heads": num_heads, + "transformer_units": transformer_units, + "transformer_layers": transformer_layers, + "mlp_head_units": mlp_head_units, + } + ) + return config + + def call(self, patch): + positions = tf.range(start=0, limit=self.num_patches, delta=1) + encoded = self.projection(patch) + self.position_embedding(positions) + return encoded + + +""" +## Build the ViT model + +The ViT model has multiple Transformer blocks. +The `MultiHeadAttention` layer is used for self-attention, +applied to the sequence of image patches. The encoded patches (skip connection) +and self-attention layer outputs are normalized and fed into a +multilayer perceptron (MLP). +The model outputs four dimensions representing +the bounding box coordinates of an object. 
+""" + + +def create_vit_object_detector( + input_shape, + patch_size, + num_patches, + projection_dim, + num_heads, + transformer_units, + transformer_layers, + mlp_head_units, +): + inputs = layers.Input(shape=input_shape) + # Create patches + patches = Patches(patch_size)(inputs) + # Encode patches + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + # Create multiple layers of the Transformer block. + for _ in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = layers.Add()([x3, x2]) + + # Create a [batch_size, projection_dim] tensor. + representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + representation = layers.Flatten()(representation) + representation = layers.Dropout(0.3)(representation) + # Add MLP. + features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.3) + + bounding_box = layers.Dense(4)( + features + ) # Final four neurons that output bounding box + + # return Keras model. + return keras.Model(inputs=inputs, outputs=bounding_box) + + +""" +## Run the experiment +""" + + +def run_experiment(model, learning_rate, weight_decay, batch_size, num_epochs): + optimizer = keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ) + + # Compile model. + model.compile(optimizer=optimizer, loss=keras.losses.MeanSquaredError()) + + checkpoint_filepath = "logs/model.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_loss", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[ + checkpoint_callback, + keras.callbacks.EarlyStopping(monitor="val_loss", patience=10), + ], + ) + + return history + + +input_shape = (image_size, image_size, 3) # input image shape +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 32 +num_epochs = 1 +num_patches = (image_size // patch_size) ** 2 +projection_dim = 64 +num_heads = 4 +# Size of the transformer layers +transformer_units = [ + projection_dim * 2, + projection_dim, +] +transformer_layers = 4 +mlp_head_units = [2048, 1024, 512, 64, 32] # Size of the dense layers + + +history = [] +num_patches = (image_size // patch_size) ** 2 + +vit_object_detector = create_vit_object_detector( + input_shape, + patch_size, + num_patches, + projection_dim, + num_heads, + transformer_units, + transformer_layers, + mlp_head_units, +) + +# Train model +history = run_experiment( + vit_object_detector, learning_rate, weight_decay, batch_size, num_epochs +) + + +""" +## Evaluate the model +""" + +import matplotlib.patches as patches + +# Saves the model in current path +vit_object_detector.save("vit_object_detector.keras", save_format="keras") + + +# To calculate IoU (intersection over union, given two bounding boxes) +def bounding_box_intersection_over_union(box_predicted, box_truth): + # get (x, y) coordinates of intersection of bounding boxes + top_x_intersect = max(box_predicted[0], box_truth[0]) + 
top_y_intersect = max(box_predicted[1], box_truth[1])
+    bottom_x_intersect = min(box_predicted[2], box_truth[2])
+    bottom_y_intersect = min(box_predicted[3], box_truth[3])
+
+    # calculate area of the intersection bb (bounding box)
+    intersection_area = max(0, bottom_x_intersect - top_x_intersect + 1) * max(
+        0, bottom_y_intersect - top_y_intersect + 1
+    )
+
+    # calculate area of the prediction bb and ground-truth bb
+    box_predicted_area = (box_predicted[2] - box_predicted[0] + 1) * (
+        box_predicted[3] - box_predicted[1] + 1
+    )
+    box_truth_area = (box_truth[2] - box_truth[0] + 1) * (
+        box_truth[3] - box_truth[1] + 1
+    )
+
+    # calculate intersection over union by taking the intersection
+    # area and dividing it by the sum of predicted bb and ground truth
+    # bb areas, minus the intersection area
+
+    # return IoU
+    return intersection_area / float(
+        box_predicted_area + box_truth_area - intersection_area
+    )
+
+
+i, mean_iou = 0, 0
+
+# Compare results for 10 images in the test set
+for input_image in x_test[:10]:
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 15))
+    im = input_image
+
+    # Display the image
+    ax1.imshow(im.astype("uint8"))
+    ax2.imshow(im.astype("uint8"))
+
+    input_image = cv2.resize(
+        input_image, (image_size, image_size), interpolation=cv2.INTER_AREA
+    )
+    input_image = np.expand_dims(input_image, axis=0)
+    preds = vit_object_detector.predict(input_image)[0]
+
+    (h, w) = im.shape[0:2]
+
+    top_left_x, top_left_y = int(preds[0] * w), int(preds[1] * h)
+
+    bottom_right_x, bottom_right_y = int(preds[2] * w), int(preds[3] * h)
+
+    box_predicted = [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
+    # Create the bounding box
+    rect = patches.Rectangle(
+        (top_left_x, top_left_y),
+        bottom_right_x - top_left_x,
+        bottom_right_y - top_left_y,
+        facecolor="none",
+        edgecolor="red",
+        linewidth=1,
+    )
+    # Add the bounding box to the image
+    ax1.add_patch(rect)
+    ax1.set_xlabel(
+        "Predicted: "
+        + str(top_left_x)
+        + ", "
+        + str(top_left_y)
+        + ", "
+        + str(bottom_right_x)
+        + ", "
+        + str(bottom_right_y)
+    )
+
+    top_left_x, top_left_y = int(y_test[i][0] * w), int(y_test[i][1] * h)
+
+    bottom_right_x, bottom_right_y = int(y_test[i][2] * w), int(y_test[i][3] * h)
+
+    box_truth = top_left_x, top_left_y, bottom_right_x, bottom_right_y
+
+    mean_iou += bounding_box_intersection_over_union(box_predicted, box_truth)
+    # Create the bounding box
+    rect = patches.Rectangle(
+        (top_left_x, top_left_y),
+        bottom_right_x - top_left_x,
+        bottom_right_y - top_left_y,
+        facecolor="none",
+        edgecolor="red",
+        linewidth=1,
+    )
+    # Add the bounding box to the image
+    ax2.add_patch(rect)
+    ax2.set_xlabel(
+        "Target: "
+        + str(top_left_x)
+        + ", "
+        + str(top_left_y)
+        + ", "
+        + str(bottom_right_x)
+        + ", "
+        + str(bottom_right_y)
+        + "\n"
+        + "IoU: "
+        + str(bounding_box_intersection_over_union(box_predicted, box_truth))
+    )
+    i = i + 1
+
+print("mean_iou: " + str(mean_iou / len(x_test[:10])))
+plt.show()
+
+"""
+This example demonstrates that a pure Transformer can be trained
+to predict the bounding boxes of an object in a given image,
+thus extending the use of Transformers to object detection tasks.
+The model can be improved further by tuning hyperparameters and pre-training.
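+
+As a side note, since the model was saved with custom layers, reloading it for
+inference requires passing those classes along (a minimal sketch, assuming the
+`Patches` and `PatchEncoder` definitions above are in scope):
+
+```python
+loaded_model = keras.models.load_model(
+    "vit_object_detector.keras",
+    custom_objects={"Patches": Patches, "PatchEncoder": PatchEncoder},
+)
+preds = loaded_model.predict(input_image)  # four normalized corner coordinates
+```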
+""" \ No newline at end of file diff --git a/examples/keras_io/tensorflow/vision/semisupervised_simclr.py b/examples/keras_io/tensorflow/vision/semisupervised_simclr.py new file mode 100644 index 000000000..c4c61da86 --- /dev/null +++ b/examples/keras_io/tensorflow/vision/semisupervised_simclr.py @@ -0,0 +1,673 @@ +""" +Title: Semi-supervised image classification using contrastive pretraining with SimCLR +Author: [András Béres](https://www.linkedin.com/in/andras-beres-789190210) +Date created: 2021/04/24 +Last modified: 2021/04/24 +Description: Contrastive pretraining with SimCLR for semi-supervised image classification on the STL-10 dataset. +Accelerator: GPU +""" +""" +## Introduction + +### Semi-supervised learning + +Semi-supervised learning is a machine learning paradigm that deals with +**partially labeled datasets**. When applying deep learning in the real world, +one usually has to gather a large dataset to make it work well. However, while +the cost of labeling scales linearly with the dataset size (labeling each +example takes a constant time), model performance only scales +[sublinearly](https://arxiv.org/abs/2001.08361) with it. This means that +labeling more and more samples becomes less and less cost-efficient, while +gathering unlabeled data is generally cheap, as it is usually readily available +in large quantities. + +Semi-supervised learning offers to solve this problem by only requiring a +partially labeled dataset, and by being label-efficient by utilizing the +unlabeled examples for learning as well. + +In this example, we will pretrain an encoder with contrastive learning on the +[STL-10](https://ai.stanford.edu/~acoates/stl10/) semi-supervised dataset using +no labels at all, and then fine-tune it using only its labeled subset. + +### Contrastive learning + +On the highest level, the main idea behind contrastive learning is to **learn +representations that are invariant to image augmentations** in a self-supervised +manner. One problem with this objective is that it has a trivial degenerate +solution: the case where the representations are constant, and do not depend at all on the +input images. + +Contrastive learning avoids this trap by modifying the objective in the +following way: it pulls representations of augmented versions/views of the same +image closer to each other (contracting positives), while simultaneously pushing +different images away from each other (contrasting negatives) in representation +space. + +One such contrastive approach is [SimCLR](https://arxiv.org/abs/2002.05709), +which essentially identifies the core components needed to optimize this +objective, and can achieve high performance by scaling this simple approach. + +Another approach is [SimSiam](https://arxiv.org/abs/2011.10566) +([Keras example](https://keras.io/examples/vision/simsiam/)), +whose main difference from +SimCLR is that the former does not use any negatives in its loss. Therefore, it does not +explicitly prevent the trivial solution, and, instead, avoids it implicitly by +architecture design (asymmetric encoding paths using a predictor network and +batch normalization (BatchNorm) are applied in the final layers). + +For further reading about SimCLR, check out +[the official Google AI blog post](https://ai.googleblog.com/2020/04/advancing-self-supervised-and-semi.html), +and for an overview of self-supervised learning across both vision and language +check out +[this blog post](https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/). 
+""" + +""" +## Setup +""" + +# Make sure we are able to handle large datasets +import resource +low, high = resource.getrlimit(resource.RLIMIT_NOFILE) +resource.setrlimit(resource.RLIMIT_NOFILE, (high, high)) + +import math +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow_datasets as tfds + +import keras_core as keras +from keras_core import layers + +""" +## Hyperparameter setup +""" +# Dataset hyperparameters +unlabeled_dataset_size = 100000 +labeled_dataset_size = 5000 +image_size = 96 +image_channels = 3 + +# Algorithm hyperparameters +num_epochs = 1 +batch_size = 525 # Corresponds to 200 steps per epoch +width = 128 +temperature = 0.1 +# Stronger augmentations for contrastive, weaker ones for supervised training +contrastive_augmentation = {"min_area": 0.25, "brightness": 0.6, "jitter": 0.2} +classification_augmentation = {"min_area": 0.75, "brightness": 0.3, "jitter": 0.1} + +""" +## Dataset + +During training we will simultaneously load a large batch of unlabeled images along with a +smaller batch of labeled images. +""" + + +def prepare_dataset(): + # Labeled and unlabeled samples are loaded synchronously + # with batch sizes selected accordingly + steps_per_epoch = (unlabeled_dataset_size + labeled_dataset_size) // batch_size + unlabeled_batch_size = unlabeled_dataset_size // steps_per_epoch + labeled_batch_size = labeled_dataset_size // steps_per_epoch + print( + f"batch size is {unlabeled_batch_size} (unlabeled) + {labeled_batch_size} (labeled)" + ) + + # Turning off shuffle to lower resource usage + unlabeled_train_dataset = ( + tfds.load("stl10", split="unlabelled", as_supervised=True, shuffle_files=False) + .shuffle(buffer_size=10 * unlabeled_batch_size) + .batch(unlabeled_batch_size) + ) + labeled_train_dataset = ( + tfds.load("stl10", split="train", as_supervised=True, shuffle_files=False) + .shuffle(buffer_size=10 * labeled_batch_size) + .batch(labeled_batch_size) + ) + test_dataset = ( + tfds.load("stl10", split="test", as_supervised=True) + .batch(batch_size) + .prefetch(buffer_size=tf.data.AUTOTUNE) + ) + + # Labeled and unlabeled datasets are zipped together + train_dataset = tf.data.Dataset.zip( + (unlabeled_train_dataset, labeled_train_dataset) + ).prefetch(buffer_size=tf.data.AUTOTUNE) + + return train_dataset, labeled_train_dataset, test_dataset + + +# Load STL10 dataset +train_dataset, labeled_train_dataset, test_dataset = prepare_dataset() + +""" +## Image augmentations + +The two most important image augmentations for contrastive learning are the +following: + +- Cropping: forces the model to encode different parts of the same image +similarly, we implement it with the +[RandomTranslation](https://keras.io/api/layers/preprocessing_layers/image_preprocessing/random_translation/) +and +[RandomZoom](https://keras.io/api/layers/preprocessing_layers/image_preprocessing/random_zoom/) +layers +- Color jitter: prevents a trivial color histogram-based solution to the task by +distorting color histograms. A principled way to implement that is by affine +transformations in color space. + +In this example we use random horizontal flips as well. Stronger augmentations +are applied for contrastive learning, along with weaker ones for supervised +classification to avoid overfitting on the few labeled examples. + +We implement random color jitter as a custom preprocessing layer. 
Using
+preprocessing layers for data augmentation has the following two advantages:
+
+- The data augmentation will run on GPU in batches, so the training will not be
+bottlenecked by the data pipeline in environments with constrained CPU
+resources (such as a Colab Notebook, or a personal machine)
+- Deployment is easier as the data preprocessing pipeline is encapsulated in the
+model, and does not have to be reimplemented when deploying it
+"""
+
+
+# Distorts the color distributions of images
+class RandomColorAffine(layers.Layer):
+    def __init__(self, brightness=0, jitter=0, **kwargs):
+        super().__init__(**kwargs)
+
+        self.brightness = brightness
+        self.jitter = jitter
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({"brightness": self.brightness, "jitter": self.jitter})
+        return config
+
+    def call(self, images, training=True):
+        if training:
+            batch_size = tf.shape(images)[0]
+
+            # Same for all colors
+            brightness_scales = 1 + tf.random.uniform(
+                (batch_size, 1, 1, 1), minval=-self.brightness, maxval=self.brightness
+            )
+            # Different for all colors
+            jitter_matrices = tf.random.uniform(
+                (batch_size, 1, 3, 3), minval=-self.jitter, maxval=self.jitter
+            )
+
+            color_transforms = (
+                tf.eye(3, batch_shape=[batch_size, 1]) * brightness_scales
+                + jitter_matrices
+            )
+            images = tf.clip_by_value(tf.matmul(images, color_transforms), 0, 1)
+        return images
+
+
+# Image augmentation module
+def get_augmenter(min_area, brightness, jitter):
+    zoom_factor = 1.0 - math.sqrt(min_area)
+    return keras.Sequential(
+        [
+            keras.Input(shape=(image_size, image_size, image_channels)),
+            layers.Rescaling(1 / 255, dtype="uint8"),
+            layers.RandomFlip("horizontal"),
+            layers.RandomTranslation(zoom_factor / 2, zoom_factor / 2),
+            layers.RandomZoom((-zoom_factor, 0.0), (-zoom_factor, 0.0)),
+            RandomColorAffine(brightness, jitter),
+        ]
+    )
+
+
+def visualize_augmentations(num_images):
+    # Sample a batch from a dataset
+    images = next(iter(train_dataset))[0][0][:num_images]
+
+    # Apply augmentations
+    augmented_images = zip(
+        images,
+        get_augmenter(**classification_augmentation)(images),
+        get_augmenter(**contrastive_augmentation)(images),
+        get_augmenter(**contrastive_augmentation)(images),
+    )
+    row_titles = [
+        "Original:",
+        "Weakly augmented:",
+        "Strongly augmented:",
+        "Strongly augmented:",
+    ]
+    plt.figure(figsize=(num_images * 2.2, 4 * 2.2), dpi=100)
+    for column, image_row in enumerate(augmented_images):
+        for row, image in enumerate(image_row):
+            plt.subplot(4, num_images, row * num_images + column + 1)
+            plt.imshow(image)
+            if column == 0:
+                plt.title(row_titles[row], loc="left")
+            plt.axis("off")
+    plt.tight_layout()
+
+
+visualize_augmentations(num_images=8)
+
+"""
+## Encoder architecture
+"""
+
+
+# Define the encoder architecture
+def get_encoder():
+    return keras.Sequential(
+        [
+            keras.Input(shape=(image_size, image_size, image_channels)),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"),
+            layers.Flatten(),
+            layers.Dense(width, activation="relu"),
+        ],
+        name="encoder",
+    )
+
+
+"""
+## Supervised baseline model
+
+A baseline supervised model is trained using random initialization.
+""" + +# Baseline supervised training with random initialization +baseline_model = keras.Sequential( + [ + keras.Input(shape=(image_size, image_size, image_channels)), + get_augmenter(**classification_augmentation), + get_encoder(), + layers.Dense(10), + ], + name="baseline_model", +) +baseline_model.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")], +) + +baseline_history = baseline_model.fit( + labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset +) + +print( + "Maximal validation accuracy: {:.2f}%".format( + max(baseline_history.history["val_acc"]) * 100 + ) +) + +""" +## Self-supervised model for contrastive pretraining + +We pretrain an encoder on unlabeled images with a contrastive loss. +A nonlinear projection head is attached to the top of the encoder, as it +improves the quality of representations of the encoder. + +We use the InfoNCE/NT-Xent/N-pairs loss, which can be interpreted in the +following way: + +1. We treat each image in the batch as if it had its own class. +2. Then, we have two examples (a pair of augmented views) for each "class". +3. Each view's representation is compared to every possible pair's one (for both + augmented versions). +4. We use the temperature-scaled cosine similarity of compared representations as + logits. +5. Finally, we use categorical cross-entropy as the "classification" loss + +The following two metrics are used for monitoring the pretraining performance: + +- [Contrastive accuracy (SimCLR Table 5)](https://arxiv.org/abs/2002.05709): +Self-supervised metric, the ratio of cases in which the representation of an +image is more similar to its differently augmented version's one, than to the +representation of any other image in the current batch. Self-supervised +metrics can be used for hyperparameter tuning even in the case when there are +no labeled examples. +- [Linear probing accuracy](https://arxiv.org/abs/1603.08511): Linear probing is +a popular metric to evaluate self-supervised classifiers. It is computed as +the accuracy of a logistic regression classifier trained on top of the +encoder's features. In our case, this is done by training a single dense layer +on top of the frozen encoder. Note that contrary to traditional approach where +the classifier is trained after the pretraining phase, in this example we +train it during pretraining. This might slightly decrease its accuracy, but +that way we can monitor its value during training, which helps with +experimentation and debugging. + +Another widely used supervised metric is the +[KNN accuracy](https://arxiv.org/abs/1805.01978), which is the accuracy of a KNN +classifier trained on top of the encoder's features, which is not implemented in +this example. 
+""" + + +# Define the contrastive model with model-subclassing +class ContrastiveModel(keras.Model): + def __init__(self): + super().__init__() + + self.temperature = temperature + self.contrastive_augmenter = get_augmenter(**contrastive_augmentation) + self.classification_augmenter = get_augmenter(**classification_augmentation) + self.encoder = get_encoder() + # Non-linear MLP as projection head + self.projection_head = keras.Sequential( + [ + keras.Input(shape=(width,)), + layers.Dense(width, activation="relu"), + layers.Dense(width), + ], + name="projection_head", + ) + # Single dense layer for linear probing + self.linear_probe = keras.Sequential( + [layers.Input(shape=(width,)), layers.Dense(10)], name="linear_probe" + ) + + self.encoder.summary() + self.projection_head.summary() + self.linear_probe.summary() + + def compile(self, contrastive_optimizer, probe_optimizer, **kwargs): + super().compile(**kwargs) + + self.contrastive_optimizer = contrastive_optimizer + self.probe_optimizer = probe_optimizer + + # self.contrastive_loss will be defined as a method + self.probe_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + + self.contrastive_loss_tracker = keras.metrics.Mean(name="c_loss") + self.contrastive_accuracy = keras.metrics.SparseCategoricalAccuracy( + name="c_acc" + ) + self.probe_loss_tracker = keras.metrics.Mean(name="p_loss") + self.probe_accuracy = keras.metrics.SparseCategoricalAccuracy(name="p_acc") + + @property + def metrics(self): + return [ + self.contrastive_loss_tracker, + self.contrastive_accuracy, + self.probe_loss_tracker, + self.probe_accuracy, + ] + + def contrastive_loss(self, projections_1, projections_2): + # InfoNCE loss (information noise-contrastive estimation) + # NT-Xent loss (normalized temperature-scaled cross entropy) + + # Cosine similarity: the dot product of the l2-normalized feature vectors + projections_1 = tf.math.l2_normalize(projections_1, axis=1) + projections_2 = tf.math.l2_normalize(projections_2, axis=1) + similarities = ( + tf.matmul(projections_1, projections_2, transpose_b=True) / self.temperature + ) + + # The similarity between the representations of two augmented views of the + # same image should be higher than their similarity with other views + batch_size = tf.shape(projections_1)[0] + contrastive_labels = tf.range(batch_size) + self.contrastive_accuracy.update_state(contrastive_labels, similarities) + self.contrastive_accuracy.update_state( + contrastive_labels, tf.transpose(similarities) + ) + + # The temperature-scaled similarities are used as logits for cross-entropy + # a symmetrized version of the loss is used here + loss_1_2 = keras.losses.sparse_categorical_crossentropy( + contrastive_labels, similarities, from_logits=True + ) + loss_2_1 = keras.losses.sparse_categorical_crossentropy( + contrastive_labels, tf.transpose(similarities), from_logits=True + ) + return (loss_1_2 + loss_2_1) / 2 + + def train_step(self, data): + (unlabeled_images, _), (labeled_images, labels) = data + + # Both labeled and unlabeled images are used, without labels + images = tf.concat((unlabeled_images, labeled_images), axis=0) + # Each image is augmented twice, differently + augmented_images_1 = self.contrastive_augmenter(images, training=True) + augmented_images_2 = self.contrastive_augmenter(images, training=True) + with tf.GradientTape() as tape: + features_1 = self.encoder(augmented_images_1, training=True) + features_2 = self.encoder(augmented_images_2, training=True) + # The representations are passed through a 
projection MLP
+            projections_1 = self.projection_head(features_1, training=True)
+            projections_2 = self.projection_head(features_2, training=True)
+            contrastive_loss = self.contrastive_loss(projections_1, projections_2)
+        gradients = tape.gradient(
+            contrastive_loss,
+            self.encoder.trainable_weights + self.projection_head.trainable_weights,
+        )
+        self.contrastive_optimizer.apply_gradients(
+            zip(
+                gradients,
+                self.encoder.trainable_weights
+                + self.projection_head.trainable_weights,
+            )
+        )
+        self.contrastive_loss_tracker.update_state(contrastive_loss)
+
+        # Labels are only used in evaluation for an on-the-fly logistic regression
+        preprocessed_images = self.classification_augmenter(
+            labeled_images, training=True
+        )
+        with tf.GradientTape() as tape:
+            # the encoder is used in inference mode here to avoid regularization
+            # and updating the batch normalization parameters if they are used
+            features = self.encoder(preprocessed_images, training=False)
+            class_logits = self.linear_probe(features, training=True)
+            probe_loss = self.probe_loss(labels, class_logits)
+        gradients = tape.gradient(probe_loss, self.linear_probe.trainable_weights)
+        self.probe_optimizer.apply_gradients(
+            zip(gradients, self.linear_probe.trainable_weights)
+        )
+        self.probe_loss_tracker.update_state(probe_loss)
+        self.probe_accuracy.update_state(labels, class_logits)
+
+        return {m.name: m.result() for m in self.metrics}
+
+    def test_step(self, data):
+        labeled_images, labels = data
+
+        # For testing, the components are used with a training=False flag
+        preprocessed_images = self.classification_augmenter(
+            labeled_images, training=False
+        )
+        features = self.encoder(preprocessed_images, training=False)
+        class_logits = self.linear_probe(features, training=False)
+        probe_loss = self.probe_loss(labels, class_logits)
+        self.probe_loss_tracker.update_state(probe_loss)
+        self.probe_accuracy.update_state(labels, class_logits)
+
+        # Only the probe metrics are logged at test time
+        return {m.name: m.result() for m in self.metrics[2:]}
+
+
+# Contrastive pretraining
+pretraining_model = ContrastiveModel()
+pretraining_model.compile(
+    contrastive_optimizer=keras.optimizers.Adam(),
+    probe_optimizer=keras.optimizers.Adam(),
+)
+
+pretraining_history = pretraining_model.fit(
+    train_dataset, epochs=num_epochs, validation_data=test_dataset
+)
+print(
+    "Maximal validation accuracy: {:.2f}%".format(
+        max(pretraining_history.history["val_p_acc"]) * 100
+    )
+)
+
+"""
+## Supervised finetuning of the pretrained encoder
+
+We then finetune the encoder on the labeled examples by attaching
+a single randomly initialized fully connected classification layer on top of it.
+""" + +# Supervised finetuning of the pretrained encoder +finetuning_model = keras.Sequential( + [ + layers.Input(shape=(image_size, image_size, image_channels)), + get_augmenter(**classification_augmentation), + pretraining_model.encoder, + layers.Dense(10), + ], + name="finetuning_model", +) +finetuning_model.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")], +) + +finetuning_history = finetuning_model.fit( + labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset +) +print( + "Maximal validation accuracy: {:.2f}%".format( + max(finetuning_history.history["val_acc"]) * 100 + ) +) + +""" +## Comparison against the baseline +""" + + +# The classification accuracies of the baseline and the pretraining + finetuning process: +def plot_training_curves(pretraining_history, finetuning_history, baseline_history): + for metric_key, metric_name in zip(["acc", "loss"], ["accuracy", "loss"]): + plt.figure(figsize=(8, 5), dpi=100) + plt.plot( + baseline_history.history[f"val_{metric_key}"], label="supervised baseline" + ) + plt.plot( + pretraining_history.history[f"val_p_{metric_key}"], + label="self-supervised pretraining", + ) + plt.plot( + finetuning_history.history[f"val_{metric_key}"], + label="supervised finetuning", + ) + plt.legend() + plt.title(f"Classification {metric_name} during training") + plt.xlabel("epochs") + plt.ylabel(f"validation {metric_name}") + + +plot_training_curves(pretraining_history, finetuning_history, baseline_history) + +""" +By comparing the training curves, we can see that when using contrastive +pretraining, a higher validation accuracy can be reached, paired with a lower +validation loss, which means that the pretrained network was able to generalize +better when seeing only a small amount of labeled examples. +""" + +""" +## Improving further + +### Architecture + +The experiment in the original paper demonstrated that increasing the width and depth of the +models improves performance at a higher rate than for supervised learning. Also, +using a [ResNet-50](https://keras.io/api/applications/resnet/#resnet50-function) +encoder is quite standard in the literature. However keep in mind, that more +powerful models will not only increase training time but will also require more +memory and will limit the maximal batch size you can use. + +It has [been](https://arxiv.org/abs/1905.09272) +[reported](https://arxiv.org/abs/1911.05722) that the usage of BatchNorm layers +could sometimes degrade performance, as it introduces an intra-batch dependency +between samples, which is why I did not have used them in this example. In my +experiments however, using BatchNorm, especially in the projection head, +improves performance. + +### Hyperparameters + +The hyperparameters used in this example have been tuned manually for this task and +architecture. Therefore, without changing them, only marginal gains can be expected +from further hyperparameter tuning. + +However for a different task or model architecture these would need tuning, so +here are my notes on the most important ones: + +- **Batch size**: since the objective can be interpreted as a classification +over a batch of images (loosely speaking), the batch size is actually a more +important hyperparameter than usual. The higher, the better. 
- **Temperature**: the temperature defines the "softness" of the softmax
+distribution that is used in the cross-entropy loss, and is an important
+hyperparameter. Lower values generally lead to a higher contrastive accuracy.
+A recent trick (in [ALIGN](https://arxiv.org/abs/2102.05918)) is to learn
+the temperature's value as well (which can be done by defining it as a
+`tf.Variable`, and applying gradients on it; see the sketch at the end of this
+example). Even though this provides a good baseline value, in my experiments
+the learned temperature was somewhat lower than optimal, as it is optimized
+with respect to the contrastive loss, which is not a
+perfect proxy for representation quality.
+- **Image augmentation strength**: during pretraining, stronger augmentations
+increase the difficulty of the task, however, past a point, too strong
+augmentations will degrade performance. During finetuning, stronger
+augmentations reduce overfitting, while in my experience too strong
+augmentations decrease the performance gains from pretraining. The whole data
+augmentation pipeline can be seen as an important hyperparameter of the
+algorithm; implementations of other custom image augmentation layers in Keras
+can be found in
+[this repository](https://github.com/beresandras/image-augmentation-layers-keras).
+- **Learning rate schedule**: a constant schedule is used here, but it is
+quite common in the literature to use a
+[cosine decay schedule](https://www.tensorflow.org/api_docs/python/tf/keras/experimental/CosineDecay),
+which can further improve performance.
+- **Optimizer**: Adam is used in this example, as it provides good performance
+with default parameters. SGD with momentum requires more tuning, however it
+could slightly increase performance.
+"""
+
+"""
+## Related works
+
+Other instance-level (image-level) contrastive learning methods:
+
+- [MoCo](https://arxiv.org/abs/1911.05722)
+([v2](https://arxiv.org/abs/2003.04297),
+[v3](https://arxiv.org/abs/2104.02057)): uses a momentum-encoder as well,
+whose weights are an exponential moving average of the target encoder
+- [SwAV](https://arxiv.org/abs/2006.09882): uses clustering instead of pairwise
+comparison
+- [BarlowTwins](https://arxiv.org/abs/2103.03230): uses a cross
+correlation-based objective instead of pairwise comparison
+
+Keras implementations of **MoCo** and **BarlowTwins** can be found in
+[this repository](https://github.com/beresandras/contrastive-classification-keras),
+which includes a Colab notebook.
+
+There is also a new line of works, which optimize a similar objective, but
+without the use of any negatives:
+
+- [BYOL](https://arxiv.org/abs/2006.07733): momentum-encoder + no negatives
+- [SimSiam](https://arxiv.org/abs/2011.10566)
+([Keras example](https://keras.io/examples/vision/simsiam/)):
+no momentum-encoder + no negatives
+
+In my experience, these methods are more brittle (they can collapse to a constant
+representation; I could not get them to work using this encoder architecture).
+Even though they are generally more dependent on the
+[model](https://generallyintelligent.ai/understanding-self-supervised-contrastive-learning.html)
+[architecture](https://arxiv.org/abs/2010.10241), they can improve
+performance at smaller batch sizes.
+
+You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/semi-supervised-classification-simclr)
+and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/semi-supervised-classification).
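+
+Finally, here is a rough sketch of the learnable-temperature trick mentioned in
+the hyperparameters section above. The names are illustrative, and note that the
+new variable must also be included in the weights passed to `tape.gradient` and
+to the contrastive optimizer, otherwise it will never be updated:
+
+```python
+# Inside ContrastiveModel.__init__ (sketch):
+self.log_temperature = tf.Variable(tf.math.log(temperature), trainable=True)
+
+# Inside contrastive_loss (sketch), replace the fixed temperature with:
+temperature = tf.exp(self.log_temperature)  # log-scale parametrization stays positive
+similarities = (
+    tf.matmul(projections_1, projections_2, transpose_b=True) / temperature
+)
+```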
+""" \ No newline at end of file diff --git a/examples/keras_io/tensorflow/vision/swim_transformers.py b/examples/keras_io/tensorflow/vision/swim_transformers.py new file mode 100644 index 000000000..ed1e72ed1 --- /dev/null +++ b/examples/keras_io/tensorflow/vision/swim_transformers.py @@ -0,0 +1,547 @@ +""" +Title: Image classification with Swin Transformers +Author: [Rishit Dagli](https://twitter.com/rishit_dagli) +Date created: 2021/09/08 +Last modified: 2021/09/08 +Description: Image classification using Swin Transformers, a general-purpose backbone for computer vision. +Accelerator: GPU +""" +""" +This example implements [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +by Liu et al. for image classification, and demonstrates it on the +[CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html). + +Swin Transformer (**S**hifted **Win**dow Transformer) can serve as a general-purpose backbone +for computer vision. Swin Transformer is a hierarchical Transformer whose +representations are computed with _shifted windows_. The shifted window scheme +brings greater efficiency by limiting self-attention computation to +non-overlapping local windows while also allowing for cross-window connections. +This architecture has the flexibility to model information at various scales and has +a linear computational complexity with respect to image size. + +This example requires TensorFlow 2.5 or higher. +""" + +""" +## Setup +""" + +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf +import keras_core as keras +from keras_core import layers + +""" +## Prepare the data + +We load the CIFAR-100 dataset through `tf.keras.datasets`, +normalize the images, and convert the integer labels to one-hot encoded vectors. +""" + +num_classes = 100 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() +x_train, x_test = x_train / 255.0, x_test / 255.0 +y_train = keras.utils.numerical_utils.to_categorical(y_train, num_classes) +y_test = keras.utils.numerical_utils.to_categorical(y_test, num_classes) +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +plt.figure(figsize=(10, 10)) +for i in range(25): + plt.subplot(5, 5, i + 1) + plt.xticks([]) + plt.yticks([]) + plt.grid(False) + plt.imshow(x_train[i]) +plt.show() + +""" +## Configure the hyperparameters + +A key parameter to pick is the `patch_size`, the size of the input patches. +In order to use each pixel as an individual input, you can set `patch_size` to `(1, 1)`. +Below, we take inspiration from the original paper settings +for training on ImageNet-1K, keeping most of the original settings for this example. 
+""" + +patch_size = (2, 2) # 2-by-2 sized patches +dropout_rate = 0.03 # Dropout rate +num_heads = 8 # Attention heads +embed_dim = 64 # Embedding dimension +num_mlp = 256 # MLP layer size +qkv_bias = True # Convert embedded patches to query, key, and values with a learnable additive value +window_size = 2 # Size of attention window +shift_size = 1 # Size of shifting window +image_dimension = 32 # Initial image size + +num_patch_x = input_shape[0] // patch_size[0] +num_patch_y = input_shape[1] // patch_size[1] + +learning_rate = 1e-3 +batch_size = 128 +num_epochs = 1 +validation_split = 0.1 +weight_decay = 0.0001 +label_smoothing = 0.1 + +""" +## Helper functions + +We create two helper functions to help us get a sequence of +patches from the image, merge patches, and apply dropout. +""" + + +def window_partition(x, window_size): + _, height, width, channels = x.shape + patch_num_y = height // window_size + patch_num_x = width // window_size + x = tf.reshape( + x, shape=(-1, patch_num_y, window_size, patch_num_x, window_size, channels) + ) + x = tf.transpose(x, (0, 1, 3, 2, 4, 5)) + windows = tf.reshape(x, shape=(-1, window_size, window_size, channels)) + return windows + + +def window_reverse(windows, window_size, height, width, channels): + patch_num_y = height // window_size + patch_num_x = width // window_size + x = tf.reshape( + windows, + shape=(-1, patch_num_y, patch_num_x, window_size, window_size, channels), + ) + x = tf.transpose(x, perm=(0, 1, 3, 2, 4, 5)) + x = tf.reshape(x, shape=(-1, height, width, channels)) + return x + + +class DropPath(layers.Layer): + def __init__(self, drop_prob=None, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prob + + def call(self, x): + input_shape = tf.shape(x) + batch_size = input_shape[0] + rank = x.shape.rank + shape = (batch_size,) + (1,) * (rank - 1) + random_tensor = (1 - self.drop_prob) + tf.random.uniform(shape, dtype=x.dtype) + path_mask = tf.floor(random_tensor) + output = tf.math.divide(x, 1 - self.drop_prob) * path_mask + return output + + +""" +## Window based multi-head self-attention + +Usually Transformers perform global self-attention, where the relationships between +a token and all other tokens are computed. The global computation leads to quadratic +complexity with respect to the number of tokens. Here, as the [original paper](https://arxiv.org/abs/2103.14030) +suggests, we compute self-attention within local windows, in a non-overlapping manner. +Global self-attention leads to quadratic computational complexity in the number of patches, +whereas window-based self-attention leads to linear complexity and is easily scalable. 
+""" + + +class WindowAttention(layers.Layer): + def __init__( + self, dim, window_size, num_heads, qkv_bias=True, dropout_rate=0.0, **kwargs + ): + super().__init__(**kwargs) + self.dim = dim + self.window_size = window_size + self.num_heads = num_heads + self.scale = (dim // num_heads) ** -0.5 + self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) + self.dropout = layers.Dropout(dropout_rate) + self.proj = layers.Dense(dim) + + def build(self, input_shape): + num_window_elements = (2 * self.window_size[0] - 1) * ( + 2 * self.window_size[1] - 1 + ) + self.relative_position_bias_table = self.add_weight( + shape=(num_window_elements, self.num_heads), + initializer=tf.initializers.Zeros(), + trainable=True, + ) + coords_h = np.arange(self.window_size[0]) + coords_w = np.arange(self.window_size[1]) + coords_matrix = np.meshgrid(coords_h, coords_w, indexing="ij") + coords = np.stack(coords_matrix) + coords_flatten = coords.reshape(2, -1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + + self.relative_position_index = tf.Variable( + initial_value=lambda: tf.convert_to_tensor(relative_position_index), trainable=False + ) + + def call(self, x, mask=None): + _, size, channels = x.shape + head_dim = channels // self.num_heads + x_qkv = self.qkv(x) + x_qkv = tf.reshape(x_qkv, shape=(-1, size, 3, self.num_heads, head_dim)) + x_qkv = tf.transpose(x_qkv, perm=(2, 0, 3, 1, 4)) + q, k, v = x_qkv[0], x_qkv[1], x_qkv[2] + q = q * self.scale + k = tf.transpose(k, perm=(0, 1, 3, 2)) + attn = q @ k + + num_window_elements = self.window_size[0] * self.window_size[1] + relative_position_index_flat = tf.reshape( + self.relative_position_index, shape=(-1,) + ) + relative_position_bias = tf.gather( + self.relative_position_bias_table, relative_position_index_flat + ) + relative_position_bias = tf.reshape( + relative_position_bias, shape=(num_window_elements, num_window_elements, -1) + ) + relative_position_bias = tf.transpose(relative_position_bias, perm=(2, 0, 1)) + attn = attn + tf.expand_dims(relative_position_bias, axis=0) + + if mask is not None: + nW = mask.shape[0] + mask_float = tf.cast( + tf.expand_dims(tf.expand_dims(mask, axis=1), axis=0), tf.float32 + ) + attn = ( + tf.reshape(attn, shape=(-1, nW, self.num_heads, size, size)) + + mask_float + ) + attn = tf.reshape(attn, shape=(-1, self.num_heads, size, size)) + attn = keras.activations.softmax(attn, axis=-1) + else: + attn = keras.activations.softmax(attn, axis=-1) + attn = self.dropout(attn) + + x_qkv = attn @ v + x_qkv = tf.transpose(x_qkv, perm=(0, 2, 1, 3)) + x_qkv = tf.reshape(x_qkv, shape=(-1, size, channels)) + x_qkv = self.proj(x_qkv) + x_qkv = self.dropout(x_qkv) + return x_qkv + + +""" +## The complete Swin Transformer model + +Finally, we put together the complete Swin Transformer by replacing the standard multi-head +attention (MHA) with shifted windows attention. As suggested in the +original paper, we create a model comprising of a shifted window-based MHA +layer, followed by a 2-layer MLP with GELU nonlinearity in between, applying +`LayerNormalization` before each MSA layer and each MLP, and a residual +connection after each of these layers. + +Notice that we only create a simple MLP with 2 Dense and +2 Dropout layers. 
+
+
+class SwinTransformer(layers.Layer):
+    def __init__(
+        self,
+        dim,
+        num_patch,
+        num_heads,
+        window_size=7,
+        shift_size=0,
+        num_mlp=1024,
+        qkv_bias=True,
+        dropout_rate=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.dim = dim  # number of input dimensions
+        self.num_patch = num_patch  # number of embedded patches
+        self.num_heads = num_heads  # number of attention heads
+        self.window_size = window_size  # size of window
+        self.shift_size = shift_size  # size of window shift
+        self.num_mlp = num_mlp  # number of MLP nodes
+
+        self.norm1 = layers.LayerNormalization(epsilon=1e-5)
+        self.attn = WindowAttention(
+            dim,
+            window_size=(self.window_size, self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            dropout_rate=dropout_rate,
+        )
+        self.drop_path = DropPath(dropout_rate)
+        self.norm2 = layers.LayerNormalization(epsilon=1e-5)
+
+        self.mlp = keras.Sequential(
+            [
+                layers.Dense(num_mlp),
+                layers.Activation(keras.activations.gelu),
+                layers.Dropout(dropout_rate),
+                layers.Dense(dim),
+                layers.Dropout(dropout_rate),
+            ]
+        )
+
+        if min(self.num_patch) < self.window_size:
+            self.shift_size = 0
+            self.window_size = min(self.num_patch)
+
+    def build(self, input_shape):
+        if self.shift_size == 0:
+            self.attn_mask = None
+        else:
+            height, width = self.num_patch
+            h_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            w_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            mask_array = np.zeros((1, height, width, 1))
+            count = 0
+            for h in h_slices:
+                for w in w_slices:
+                    mask_array[:, h, w, :] = count
+                    count += 1
+            mask_array = tf.convert_to_tensor(mask_array)
+
+            # mask array to windows
+            mask_windows = window_partition(mask_array, self.window_size)
+            mask_windows = tf.reshape(
+                mask_windows, shape=[-1, self.window_size * self.window_size]
+            )
+            attn_mask = tf.expand_dims(mask_windows, axis=1) - tf.expand_dims(
+                mask_windows, axis=2
+            )
+            attn_mask = tf.where(attn_mask != 0, -100.0, attn_mask)
+            attn_mask = tf.where(attn_mask == 0, 0.0, attn_mask)
+            self.attn_mask = tf.Variable(initial_value=attn_mask, trainable=False)
+
+    def call(self, x):
+        height, width = self.num_patch
+        _, num_patches_before, channels = x.shape
+        x_skip = x
+        x = self.norm1(x)
+        x = tf.reshape(x, shape=(-1, height, width, channels))
+        if self.shift_size > 0:
+            shifted_x = tf.roll(
+                x, shift=[-self.shift_size, -self.shift_size], axis=[1, 2]
+            )
+        else:
+            shifted_x = x
+
+        x_windows = window_partition(shifted_x, self.window_size)
+        x_windows = tf.reshape(
+            x_windows, shape=(-1, self.window_size * self.window_size, channels)
+        )
+        attn_windows = self.attn(x_windows, mask=self.attn_mask)
+
+        attn_windows = tf.reshape(
+            attn_windows, shape=(-1, self.window_size, self.window_size, channels)
+        )
+        shifted_x = window_reverse(
+            attn_windows, self.window_size, height, width, channels
+        )
+        if self.shift_size > 0:
+            x = tf.roll(
+                shifted_x, shift=[self.shift_size, self.shift_size], axis=[1, 2]
+            )
+        else:
+            x = shifted_x
+
+        x = tf.reshape(x, shape=(-1, height * width, channels))
+        x = self.drop_path(x)
+        x = x_skip + x
+        x_skip = x
+        x = self.norm2(x)
+        x = self.mlp(x)
+        x = self.drop_path(x)
+        x = x_skip + x
+        return x
+
+
+"""
+## Model training and evaluation
+
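+Before building the full training pipeline, here is a short, purely illustrative
+shape check of a single Swin block (the `demo_` names below are ours):
+"""
+
+# One Swin block on a dummy batch: 16x16 = 256 patch embeddings of dimension 64.
+# The block preserves the (batch, num_patches, channels) shape.
+demo_block = SwinTransformer(
+    dim=64, num_patch=(16, 16), num_heads=4, window_size=2, shift_size=1
+)
+demo_tokens = tf.random.normal((2, 16 * 16, 64))
+print(demo_block(demo_tokens).shape)  # -> (2, 256, 64)
+
+"""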
+### Extract and embed patches
+
+We first create three layers that extract, embed, and merge patches from the
+input images; the Swin Transformer blocks defined above will then operate on
+the resulting patch embeddings.
+"""
+
+
+class PatchExtract(layers.Layer):
+    def __init__(self, patch_size, **kwargs):
+        super().__init__(**kwargs)
+        self.patch_size_x = patch_size[0]
+        self.patch_size_y = patch_size[1]
+
+    def call(self, images):
+        batch_size = tf.shape(images)[0]
+        patches = tf.image.extract_patches(
+            images=images,
+            sizes=(1, self.patch_size_x, self.patch_size_y, 1),
+            strides=(1, self.patch_size_x, self.patch_size_y, 1),
+            rates=(1, 1, 1, 1),
+            padding="VALID",
+        )
+        patch_dim = patches.shape[-1]
+        patch_num = patches.shape[1]
+        return tf.reshape(patches, (batch_size, patch_num * patch_num, patch_dim))
+
+
+class PatchEmbedding(layers.Layer):
+    def __init__(self, num_patch, embed_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.num_patch = num_patch
+        self.proj = layers.Dense(embed_dim)
+        self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim)
+
+    def call(self, patch):
+        pos = tf.range(start=0, limit=self.num_patch, delta=1)
+        return self.proj(patch) + self.pos_embed(pos)
+
+
+class PatchMerging(keras.layers.Layer):
+    def __init__(self, num_patch, embed_dim):
+        super().__init__()
+        self.num_patch = num_patch
+        self.embed_dim = embed_dim
+        self.linear_trans = layers.Dense(2 * embed_dim, use_bias=False)
+
+    def call(self, x):
+        height, width = self.num_patch
+        _, _, C = x.shape
+        x = tf.reshape(x, shape=(-1, height, width, C))
+        x0 = x[:, 0::2, 0::2, :]
+        x1 = x[:, 1::2, 0::2, :]
+        x2 = x[:, 0::2, 1::2, :]
+        x3 = x[:, 1::2, 1::2, :]
+        x = tf.concat((x0, x1, x2, x3), axis=-1)
+        x = tf.reshape(x, shape=(-1, (height // 2) * (width // 2), 4 * C))
+        return self.linear_trans(x)
+
+
+"""
+### Build the model
+
+We put together the Swin Transformer model.
+"""
+
+inputs = layers.Input(input_shape)
+x = layers.RandomCrop(image_dimension, image_dimension)(inputs)
+x = layers.RandomFlip("horizontal")(x)
+x = PatchExtract(patch_size)(x)
+x = PatchEmbedding(num_patch_x * num_patch_y, embed_dim)(x)
+x = SwinTransformer(
+    dim=embed_dim,
+    num_patch=(num_patch_x, num_patch_y),
+    num_heads=num_heads,
+    window_size=window_size,
+    shift_size=0,
+    num_mlp=num_mlp,
+    qkv_bias=qkv_bias,
+    dropout_rate=dropout_rate,
+)(x)
+x = SwinTransformer(
+    dim=embed_dim,
+    num_patch=(num_patch_x, num_patch_y),
+    num_heads=num_heads,
+    window_size=window_size,
+    shift_size=shift_size,
+    num_mlp=num_mlp,
+    qkv_bias=qkv_bias,
+    dropout_rate=dropout_rate,
+)(x)
+x = PatchMerging((num_patch_x, num_patch_y), embed_dim=embed_dim)(x)
+x = layers.GlobalAveragePooling1D()(x)
+outputs = layers.Dense(num_classes, activation="softmax")(x)
+
+"""
+### Train on CIFAR-100
+
+We train the model on CIFAR-100. Here, `num_epochs` is set to 1 so that this
+example runs quickly; set it to 40 (as in the original version of this example)
+to reproduce the results discussed below. In practice, you should train for
+about 150 epochs to reach convergence.
+"""
+
+model = keras.Model(inputs, outputs)
+model.compile(
+    loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing),
+    optimizer=keras.optimizers.AdamW(
+        learning_rate=learning_rate, weight_decay=weight_decay
+    ),
+    metrics=[
+        keras.metrics.CategoricalAccuracy(name="accuracy"),
+        keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
+    ],
+)
+
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=batch_size,
+    epochs=num_epochs,
+    validation_split=validation_split,
+)
+
+"""
+Let's visualize the training progress of the model.
+""" + +plt.plot(history.history["loss"], label="train_loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.xlabel("Epochs") +plt.ylabel("Loss") +plt.title("Train and Validation Losses Over Epochs", fontsize=14) +plt.legend() +plt.grid() +plt.show() + +""" +Let's display the final results of the training on CIFAR-100. +""" + +loss, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) +print(f"Test loss: {round(loss, 2)}") +print(f"Test accuracy: {round(accuracy * 100, 2)}%") +print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + +""" +The Swin Transformer model we just trained has just 152K parameters, and it gets +us to ~75% test top-5 accuracy within just 40 epochs without any signs of overfitting +as well as seen in above graph. This means we can train this network for longer +(perhaps with a bit more regularization) and obtain even better performance. +This performance can further be improved by additional techniques like cosine +decay learning rate schedule, other data augmentation techniques. While experimenting, +I tried training the model for 150 epochs with a slightly higher dropout and greater +embedding dimensions which pushes the performance to ~72% test accuracy on CIFAR-100 +as you can see in the screenshot. + +![Results of training for longer](https://i.imgur.com/9vnQesZ.png) + +The authors present a top-1 accuracy of 87.3% on ImageNet. The authors also present +a number of experiments to study how input sizes, optimizers etc. affect the final +performance of this model. The authors further present using this model for object detection, +semantic segmentation and instance segmentation as well and report competitive results +for these. You are strongly advised to also check out the +[original paper](https://arxiv.org/abs/2103.14030). + +This example takes inspiration from the official +[PyTorch](https://github.com/microsoft/Swin-Transformer) and +[TensorFlow](https://github.com/VcampSoldiers/Swin-Transformer-Tensorflow) implementations. +""" \ No newline at end of file diff --git a/examples/keras_io/timeseries/timeseries_classification_from_scratch.py b/examples/keras_io/timeseries/timeseries_classification_from_scratch.py new file mode 100755 index 000000000..73564a35c --- /dev/null +++ b/examples/keras_io/timeseries/timeseries_classification_from_scratch.py @@ -0,0 +1,226 @@ +""" +Title: Timeseries classification from scratch +Author: [hfawaz](https://github.com/hfawaz/) +Date created: 2020/07/21 +Last modified: 2021/07/16 +Description: Training a timeseries classifier from scratch on the FordA dataset from the UCR/UEA archive. +Accelerator: GPU +""" +""" +## Introduction + +This example shows how to do timeseries classification from scratch, starting from raw +CSV timeseries files on disk. We demonstrate the workflow on the FordA dataset from the +[UCR/UEA archive](https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/). + +""" + +""" +## Setup + +""" +import keras_core as keras +import numpy as np +import matplotlib.pyplot as plt + +""" +## Load the data: the FordA dataset + +### Dataset description + +The dataset we are using here is called FordA. +The data comes from the UCR archive. +The dataset contains 3601 training instances and another 1320 testing instances. +Each timeseries corresponds to a measurement of engine noise captured by a motor sensor. +For this task, the goal is to automatically detect the presence of a specific issue with +the engine. The problem is a balanced binary classification task. 
+The full description of this dataset can be found [here](http://www.j-wichard.de/publications/FordPaper.pdf).
+
+### Read the TSV data
+
+We will use the `FordA_TRAIN` file for training and the
+`FordA_TEST` file for testing. The simplicity of this dataset
+allows us to demonstrate effectively how to use ConvNets for timeseries classification.
+In these files, the first column corresponds to the label.
+"""
+
+
+def readucr(filename):
+    data = np.loadtxt(filename, delimiter="\t")
+    y = data[:, 0]
+    x = data[:, 1:]
+    return x, y.astype(int)
+
+
+root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"
+
+x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv")
+x_test, y_test = readucr(root_url + "FordA_TEST.tsv")
+
+"""
+## Visualize the data
+
+Here we visualize one timeseries example for each class in the dataset.
+
+"""
+
+classes = np.unique(np.concatenate((y_train, y_test), axis=0))
+
+plt.figure()
+for c in classes:
+    c_x_train = x_train[y_train == c]
+    plt.plot(c_x_train[0], label="class " + str(c))
+plt.legend(loc="best")
+plt.show()
+plt.close()
+
+"""
+## Standardize the data
+
+Our timeseries are already of equal length (500 steps each). However, their
+values can span quite different ranges, which is not ideal for a neural network;
+in general, we should seek to normalize the input values.
+For this specific dataset, the data is already z-normalized: each timeseries sample
+has a mean equal to zero and a standard deviation equal to one. This type of
+normalization is very common for timeseries classification problems, see
+[Bagnall et al. (2016)](https://link.springer.com/article/10.1007/s10618-016-0483-9).
+
+Note that the timeseries data used here are univariate, meaning we only have one channel
+per timeseries example.
+We will therefore reshape the timeseries with NumPy so that each sample has an
+explicit channel dimension.
+This will allow us to construct a model that is easily applicable to multivariate time
+series.
+"""
+
+x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
+x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))
+
+"""
+Finally, in order to use `sparse_categorical_crossentropy`, we will have to count
+the number of classes beforehand.
+"""
+
+num_classes = len(np.unique(y_train))
+
+"""
+Now we shuffle the training set because we will be using the `validation_split` option
+later when training: `validation_split` takes the last samples of the arrays, so
+shuffling avoids a biased validation set.
+"""
+
+idx = np.random.permutation(len(x_train))
+x_train = x_train[idx]
+y_train = y_train[idx]
+
+"""
+Standardize the labels to positive integers.
+The expected labels will then be 0 and 1.
+"""
+
+y_train[y_train == -1] = 0
+y_test[y_test == -1] = 0
+
+"""
+## Build a model
+
+We build a Fully Convolutional Neural Network originally proposed in
+[this paper](https://arxiv.org/abs/1611.06455).
+The implementation is based on the TF 2 version provided
+[here](https://github.com/hfawaz/dl-4-tsc/).
+The following hyperparameters (`kernel_size`, `filters`, the use of
+`BatchNormalization`) were found via random search using
+[KerasTuner](https://github.com/keras-team/keras-tuner).
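+
+Before defining the model, a quick, purely illustrative sanity check on the
+data we have prepared so far (shapes and label values):
+"""
+
+# Inputs should be (num_samples, 500, 1); labels should now be 0 or 1.
+print("x_train shape:", x_train.shape)
+print("classes:", np.unique(y_train), "- num_classes:", num_classes)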
+
+
+def make_model(input_shape):
+    input_layer = keras.layers.Input(input_shape)
+
+    conv1 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(input_layer)
+    conv1 = keras.layers.BatchNormalization()(conv1)
+    conv1 = keras.layers.ReLU()(conv1)
+
+    conv2 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv1)
+    conv2 = keras.layers.BatchNormalization()(conv2)
+    conv2 = keras.layers.ReLU()(conv2)
+
+    conv3 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv2)
+    conv3 = keras.layers.BatchNormalization()(conv3)
+    conv3 = keras.layers.ReLU()(conv3)
+
+    gap = keras.layers.GlobalAveragePooling1D()(conv3)
+
+    output_layer = keras.layers.Dense(num_classes, activation="softmax")(gap)
+
+    return keras.models.Model(inputs=input_layer, outputs=output_layer)
+
+
+model = make_model(input_shape=x_train.shape[1:])
+keras.utils.plot_model(model, show_shapes=True)
+
+"""
+## Train the model
+
+"""
+
+epochs = 500
+batch_size = 32
+
+callbacks = [
+    keras.callbacks.ModelCheckpoint(
+        "best_model.keras", save_best_only=True, monitor="val_loss"
+    ),
+    keras.callbacks.ReduceLROnPlateau(
+        monitor="val_loss", factor=0.5, patience=20, min_lr=0.0001
+    ),
+    keras.callbacks.EarlyStopping(monitor="val_loss", patience=50, verbose=1),
+]
+model.compile(
+    optimizer="adam",
+    loss="sparse_categorical_crossentropy",
+    metrics=["sparse_categorical_accuracy"],
+)
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=batch_size,
+    epochs=epochs,
+    callbacks=callbacks,
+    validation_split=0.2,
+    verbose=1,
+)
+
+"""
+## Evaluate model on test data
+"""
+
+model = keras.models.load_model("best_model.keras")
+
+test_loss, test_acc = model.evaluate(x_test, y_test)
+
+print("Test accuracy", test_acc)
+print("Test loss", test_loss)
+
+"""
+## Plot the model's training and validation accuracy
+"""
+
+metric = "sparse_categorical_accuracy"
+plt.figure()
+plt.plot(history.history[metric])
+plt.plot(history.history["val_" + metric])
+plt.title("model " + metric)
+plt.ylabel(metric, fontsize="large")
+plt.xlabel("epoch", fontsize="large")
+plt.legend(["train", "val"], loc="best")
+plt.show()
+plt.close()
+
+"""
+Training accuracy reaches almost 0.95 after 100 epochs. However, the validation
+accuracy shows that the network still benefits from further training: both the
+validation and the training accuracy approach 0.97 around epoch 200. Beyond that
+point, continued training makes the validation accuracy start to decrease while
+the training accuracy keeps increasing: the model begins to overfit.
+"""
diff --git a/keras_core/backend/jax/numpy.py b/keras_core/backend/jax/numpy.py
index 178941f48..db22ad19b 100644
--- a/keras_core/backend/jax/numpy.py
+++ b/keras_core/backend/jax/numpy.py
@@ -471,6 +471,8 @@ def tan(x):
 
 
 def tensordot(x1, x2, axes=2):
+    x1 = convert_to_tensor(x1)
+    x2 = convert_to_tensor(x2)
     return jnp.tensordot(x1, x2, axes=axes)