diff --git a/examples/keras_io/structured_data/deep_neural_decision_forests.py b/examples/keras_io/structured_data/deep_neural_decision_forests.py new file mode 100644 index 000000000..e9a6ea833 --- /dev/null +++ b/examples/keras_io/structured_data/deep_neural_decision_forests.py @@ -0,0 +1,484 @@ +""" +Title: Classification with Neural Decision Forests +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/01/15 +Last modified: 2021/01/15 +Description: How to train differentiable decision trees for end-to-end learning in deep neural networks. +Accelerator: GPU +""" + +""" +## Introduction + +This example provides an implementation of the +[Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529) +model introduced by P. Kontschieder et al. for structured data classification. +It demonstrates how to build a stochastic and differentiable decision tree model, +train it end-to-end, and unify decision trees with deep representation learning. + +## The dataset + +This example uses the +[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income) +provided by the +[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). +The task is binary classification +to predict whether a person is likely to be making over USD 50,000 a year. + +The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features +and 9 categorical features. 
+""" + +""" +## Setup +""" + +import keras_core as keras +from keras_core import layers +from keras_core.layers import StringLookup +from keras_core import ops + + +from tensorflow import data as tf_data +import numpy as np +import pandas as pd + +import math + + +_dtype = "float32" + +""" +## Prepare the data +""" + +CSV_HEADER = [ + "age", + "workclass", + "fnlwgt", + "education", + "education_num", + "marital_status", + "occupation", + "relationship", + "race", + "gender", + "capital_gain", + "capital_loss", + "hours_per_week", + "native_country", + "income_bracket", +] + +train_data_url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" +) +train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER) + +test_data_url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test" +) +test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER) + +print(f"Train dataset shape: {train_data.shape}") +print(f"Test dataset shape: {test_data.shape}") + +""" +Remove the first record (because it is not a valid data example) and a trailing +'dot' in the class labels. +""" + +test_data = test_data[1:] +test_data.income_bracket = test_data.income_bracket.apply( + lambda value: value.replace(".", "") +) + +""" +We store the training and test data splits locally as CSV files. +""" + +train_data_file = "train_data.csv" +test_data_file = "test_data.csv" + +train_data.to_csv(train_data_file, index=False, header=False) +test_data.to_csv(test_data_file, index=False, header=False) + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for reading and parsing +and encoding input features. +""" + +# A list of the numerical feature names. +NUMERIC_FEATURE_NAMES = [ + "age", + "education_num", + "capital_gain", + "capital_loss", + "hours_per_week", +] +# A dictionary of the categorical features and their vocabulary. 
+CATEGORICAL_FEATURES_WITH_VOCABULARY = { + "workclass": sorted(list(train_data["workclass"].unique())), + "education": sorted(list(train_data["education"].unique())), + "marital_status": sorted(list(train_data["marital_status"].unique())), + "occupation": sorted(list(train_data["occupation"].unique())), + "relationship": sorted(list(train_data["relationship"].unique())), + "race": sorted(list(train_data["race"].unique())), + "gender": sorted(list(train_data["gender"].unique())), + "native_country": sorted(list(train_data["native_country"].unique())), +} +# A list of the columns to ignore from the dataset. +IGNORE_COLUMN_NAMES = ["fnlwgt"] +# A list of the categorical feature names. +CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()) +# A list of all the input features. +FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES +# A list of column default values for each feature. +COLUMN_DEFAULTS = [ + [0.0] + if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES + else ["NA"] + for feature_name in CSV_HEADER +] +# The name of the target feature. +TARGET_FEATURE_NAME = "income_bracket" +# A list of the labels of the target features. +TARGET_LABELS = [" <=50K", " >50K"] + +""" +## Create `tf_data.Dataset` objects for training and validation + +We create an input function to read and parse the file, and convert features and labels +into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets) +for training and validation. We also preprocess the input by mapping the target label +to an index. +""" + + +target_label_lookup = StringLookup( + vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0 +) + + +lookup_dict = {} +for feature_name in CATEGORICAL_FEATURE_NAMES: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert a string values to an integer indices. 
+    # Since we are not using a mask token, nor expecting any out of vocabulary
+    # (oov) token, we set mask_token to None and num_oov_indices to 0.
+    lookup = StringLookup(
+        vocabulary=vocabulary, mask_token=None, num_oov_indices=0
+    )
+    lookup_dict[feature_name] = lookup
+
+
+def encode_categorical(batch_x, batch_y):
+    """Replace the categorical features of a batch with integer indices.
+
+    Every feature named in `CATEGORICAL_FEATURE_NAMES` is overwritten, in
+    place, with the output of its `StringLookup` layer from `lookup_dict`.
+
+    Args:
+        batch_x: Mapping of feature name -> batched feature values.
+        batch_y: Batch of targets; returned unchanged.
+
+    Returns:
+        The `(batch_x, batch_y)` pair.
+    """
+    for feature_name in CATEGORICAL_FEATURE_NAMES:
+        batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name])
+
+    return batch_x, batch_y
+
+
+def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
+    """Build a cached, batched dataset of encoded features and label indices.
+
+    The CSV is read for a single epoch without a header row, `"?"` is treated
+    as a missing value, the target column is mapped through
+    `target_label_lookup`, and categorical features are encoded with
+    `encode_categorical`.
+
+    Args:
+        csv_file_path: Path of the CSV file to read.
+        shuffle: Forwarded to `make_csv_dataset`.
+        batch_size: Number of records per batch.
+
+    Returns:
+        The resulting dataset after `.cache()`.
+    """
+    dataset = (
+        tf_data.experimental.make_csv_dataset(
+            csv_file_path,
+            batch_size=batch_size,
+            column_names=CSV_HEADER,
+            column_defaults=COLUMN_DEFAULTS,
+            label_name=TARGET_FEATURE_NAME,
+            num_epochs=1,
+            header=False,
+            na_value="?",
+            shuffle=shuffle,
+        )
+        .map(lambda features, target: (features, target_label_lookup(target)))
+        .map(encode_categorical)
+    )
+
+    return dataset.cache()
+
+
+"""
+## Create model inputs
+"""
+
+
+def create_model_inputs():
+    """Create one `layers.Input` (with `shape=()`) per entry of `FEATURE_NAMES`.
+
+    Numerical features use `_dtype`; all other (categorical) features use
+    `"int32"` because they are fed as lookup indices.
+
+    Returns:
+        Dictionary of feature name -> `layers.Input`.
+    """
+    inputs = {}
+    for feature_name in FEATURE_NAMES:
+        if feature_name in NUMERIC_FEATURE_NAMES:
+            inputs[feature_name] = layers.Input(
+                name=feature_name, shape=(), dtype=_dtype
+            )
+        else:
+            inputs[feature_name] = layers.Input(
+                name=feature_name, shape=(), dtype="int32"
+            )
+    return inputs
+
+
+"""
+## Encode input features
+"""
+
+
+def encode_inputs(inputs):
+    """Turn the model inputs into a single concatenated feature tensor.
+
+    Categorical inputs (integer indices) are embedded with an output size of
+    `int(sqrt(vocabulary_size))`; numerical inputs are used as-is, with a
+    trailing axis added when their last dimension is `None`.
+
+    Args:
+        inputs: Mapping of feature name -> `layers.Input`, as returned by
+            `create_model_inputs()`.
+
+    Returns:
+        The result of `layers.concatenate` over all encoded features.
+    """
+    encoded_features = []
+    for feature_name in inputs:
+        if feature_name in CATEGORICAL_FEATURE_NAMES:
+            # The inputs already hold integer indices, so only the size of
+            # this feature's own vocabulary is needed. It must come from
+            # `lookup_dict[feature_name]`: the module-level `lookup` variable
+            # still refers to whichever feature was processed last.
+            vocabulary_size = lookup_dict[feature_name].vocabulary_size()
+            value_index = inputs[feature_name]
+            embedding_dims = int(math.sqrt(vocabulary_size))
+            # Create an embedding layer with the specified dimensions.
+            embedding = layers.Embedding(
+                input_dim=vocabulary_size, output_dim=embedding_dims
+            )
+            # Convert the index values to embedding representations.
+            encoded_feature = embedding(value_index)
+        else:
+            # Use the numerical features as-is.
+            encoded_feature = inputs[feature_name]
+            if inputs[feature_name].shape[-1] is None:
+                encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
+
+        encoded_features.append(encoded_feature)
+
+    encoded_features = layers.concatenate(encoded_features)
+    return encoded_features
+
+
+"""
+## Deep Neural Decision Tree
+
+A neural decision tree model has two sets of weights to learn. The first set is `pi`,
+which represents the probability distribution of the classes in the tree leaves.
+The second set is the weights of the routing layer `decision_fn`, which represents the probability
+of going to each leaf. The forward pass of the model works as follows:
+
+1. The model expects input `features` as a single vector encoding all the features of an instance
+in the batch. This vector can be generated from a Convolutional Neural Network (CNN) applied to images
+or dense transformations applied to structured data features.
+2. The model first applies a `used_features_mask` to randomly select a subset of input features to use.
+3. Then, the model computes the probabilities (`mu`) for the input instances to reach the tree leaves
+by iteratively performing a *stochastic* routing throughout the tree levels.
+4. Finally, the probabilities of reaching the leaves are combined by the class probabilities at the
+leaves to produce the final `outputs`.
+"""
+
+
+class NeuralDecisionTree(keras.Model):
+    def __init__(self, depth, num_features, used_features_rate, num_classes):
+        super().__init__()
+        self.depth = depth
+        self.num_leaves = 2**depth
+        self.num_classes = num_classes
+
+        # Create a mask for the randomly selected features.
+        # `int(...)` truncates, so at least `1 / num_features` is needed as
+        # `used_features_rate` to keep a single feature.
+        num_used_features = int(num_features * used_features_rate)
+        one_hot = np.eye(num_features)
+        sampled_feature_indices = np.random.choice(
+            np.arange(num_features), num_used_features, replace=False
+        )
+        # Rows of the identity matrix: [num_used_features, num_features].
+        self.used_features_mask = ops.convert_to_tensor(
+            one_hot[sampled_feature_indices], dtype=_dtype
+        )
+
+        # Initialize the weights of the classes in leaves.
+        self.pi = self.add_weight(
+            initializer="random_normal",
+            shape=[self.num_leaves, self.num_classes],
+            dtype=_dtype,
+            trainable=True,
+        )
+
+        # Initialize the stochastic routing layer.
+        self.decision_fn = layers.Dense(
+            units=self.num_leaves, activation="sigmoid", name="decision"
+        )
+
+    def call(self, features):
+        """Route a batch through the tree and return class probabilities.
+
+        Args:
+            features: Tensor whose last axis has `num_features` entries
+                (it is multiplied by the transposed feature mask).
+
+        Returns:
+            Tensor of shape `[batch_size, num_classes]`.
+        """
+        batch_size = ops.shape(features)[0]
+
+        # Apply the feature mask to the input features.
+        features = ops.matmul(
+            features, ops.transpose(self.used_features_mask)
+        )  # [batch_size, num_used_features]
+        # Compute the routing probabilities.
+        decisions = ops.expand_dims(
+            self.decision_fn(features), axis=2
+        )  # [batch_size, num_leaves, 1]
+        # Concatenate the routing probabilities with their complements.
+        decisions = layers.concatenate(
+            [decisions, 1 - decisions], axis=2
+        )  # [batch_size, num_leaves, 2]
+
+        mu = ops.ones([batch_size, 1, 1])
+
+        # Level `l` reads decision units [2**l, 2**(l + 1)); unit 0 is unused.
+        begin_idx = 1
+        end_idx = 2
+        # Traverse the tree in breadth-first order.
+        for level in range(self.depth):
+            mu = ops.reshape(
+                mu, [batch_size, -1, 1]
+            )  # [batch_size, 2 ** level, 1]
+            mu = ops.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
+            level_decisions = decisions[
+                :, begin_idx:end_idx, :
+            ]  # [batch_size, 2 ** level, 2]
+            mu = mu * level_decisions  # [batch_size, 2**level, 2]
+            begin_idx = end_idx
+            end_idx = begin_idx + 2 ** (level + 1)
+
+        mu = ops.reshape(
+            mu, [batch_size, self.num_leaves]
+        )  # [batch_size, num_leaves]
+        probabilities = keras.activations.softmax(
+            self.pi
+        )  # [num_leaves, num_classes]
+        outputs = ops.matmul(mu, probabilities)  # [batch_size, num_classes]
+        return outputs
+
+
+"""
+## Deep Neural Decision Forest
+
+The neural decision forest model consists of a set of neural decision trees that are
+trained simultaneously. The output of the forest model is the average outputs of its trees.
+"""
+
+
+class NeuralDecisionForest(keras.Model):
+    """Ensemble of `NeuralDecisionTree` models whose outputs are averaged.
+
+    Args:
+        num_trees: Number of trees in the ensemble.
+        depth: Depth passed to every tree.
+        num_features: Size of the feature vector fed to every tree.
+        used_features_rate: Fraction of features each tree samples.
+        num_classes: Number of output classes.
+    """
+
+    def __init__(
+        self, num_trees, depth, num_features, used_features_rate, num_classes
+    ):
+        super().__init__()
+        # Keep the class count on the instance so that `call` does not depend
+        # on a module-level variable that happens to share the name.
+        self.num_classes = num_classes
+        self.ensemble = []
+        # Initialize the ensemble by adding NeuralDecisionTree instances.
+        # Each tree will have its own randomly selected input features to use.
+        for _ in range(num_trees):
+            self.ensemble.append(
+                NeuralDecisionTree(
+                    depth, num_features, used_features_rate, num_classes
+                )
+            )
+
+    def call(self, inputs):
+        """Return the mean of the per-tree outputs, `[batch_size, num_classes]`."""
+        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
+        batch_size = ops.shape(inputs)[0]
+        outputs = ops.zeros([batch_size, self.num_classes])
+
+        # Aggregate the outputs of trees in the ensemble.
+        for tree in self.ensemble:
+            outputs += tree(inputs)
+        # Divide the outputs by the ensemble size to get the average.
+        outputs /= len(self.ensemble)
+        return outputs
+
+
+"""
+Finally, let's set up the code that will train and evaluate the model.
+"""
+
+learning_rate = 0.01
+batch_size = 265
+num_epochs = 10
+
+
+def run_experiment(model):
+    """Compile, train and evaluate `model`, printing the test accuracy.
+
+    Reads the module-level `learning_rate`, `batch_size`, `num_epochs`,
+    `train_data_file` and `test_data_file`.
+
+    Args:
+        model: Keras model that accepts the feature dictionary produced by
+            `get_dataset_from_csv`.
+    """
+    model.compile(
+        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
+        loss=keras.losses.SparseCategoricalCrossentropy(),
+        metrics=[keras.metrics.SparseCategoricalAccuracy()],
+    )
+
+    print("Start training the model...")
+    train_dataset = get_dataset_from_csv(
+        train_data_file, shuffle=True, batch_size=batch_size
+    )
+
+    model.fit(train_dataset, epochs=num_epochs)
+    print("Model training finished")
+
+    print("Evaluating the model on the test data...")
+    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)
+
+    # `evaluate` returns the loss followed by the compiled metric.
+    _, accuracy = model.evaluate(test_dataset)
+    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
+
+
+"""
+## Experiment 1: train a decision tree model
+
+In this experiment, we train a single neural decision tree model
+where we use all input features.
+"""
+
+# NOTE: `num_trees` is not used by the single-tree experiment below.
+num_trees = 10
+depth = 10
+used_features_rate = 1.0
+num_classes = len(TARGET_LABELS)
+
+
+def create_tree_model():
+    """Build a functional model: inputs -> encoding -> batch norm -> one tree.
+
+    Uses the module-level `depth`, `used_features_rate` and `num_classes`
+    as they are when this function is called.
+    """
+    inputs = create_model_inputs()
+    features = encode_inputs(inputs)
+    features = layers.BatchNormalization()(features)
+    # Width of the concatenated feature vector.
+    num_features = features.shape[1]
+
+    tree = NeuralDecisionTree(
+        depth, num_features, used_features_rate, num_classes
+    )
+
+    outputs = tree(features)
+    model = keras.Model(inputs=inputs, outputs=outputs)
+    return model
+
+
+tree_model = create_tree_model()
+run_experiment(tree_model)
+
+
+"""
+## Experiment 2: train a forest model
+
+In this experiment, we train a neural decision forest with `num_trees` trees
+where each tree uses randomly selected 50% of the input features. You can control the number
+of features to be used in each tree by setting the `used_features_rate` variable.
+In addition, we set the depth to 5 instead of 10 compared to the previous experiment.
+"""
+
+num_trees = 25
+depth = 5
+used_features_rate = 0.5
+
+
+def create_forest_model():
+    """Build a functional model: inputs -> encoding -> batch norm -> forest.
+
+    Uses the module-level `num_trees`, `depth`, `used_features_rate` and
+    `num_classes` as they are when this function is called.
+    """
+    inputs = create_model_inputs()
+    features = encode_inputs(inputs)
+    features = layers.BatchNormalization()(features)
+    # Width of the concatenated feature vector.
+    num_features = features.shape[1]
+
+    forest_model = NeuralDecisionForest(
+        num_trees, depth, num_features, used_features_rate, num_classes
+    )
+
+    outputs = forest_model(features)
+    model = keras.Model(inputs=inputs, outputs=outputs)
+    return model
+
+
+forest_model = create_forest_model()
+
+run_experiment(forest_model)
+
+
+"""
+You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/neural-decision-forest)
+and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/Neural-Decision-Forest).
+"""
diff --git a/examples/keras_io/tensorflow/vision/simsiam.py b/examples/keras_io/tensorflow/vision/simsiam.py
new file mode 100644
index 000000000..b9671407e
--- /dev/null
+++ b/examples/keras_io/tensorflow/vision/simsiam.py
@@ -0,0 +1,443 @@
+"""
+Title: Self-supervised contrastive learning with SimSiam
+Author: [Sayak Paul](https://twitter.com/RisingSayak)
+Date created: 2021/03/19
+Last modified: 2021/03/20
+Description: Implementation of a self-supervised learning method for computer vision.
+Accelerator: GPU
+"""
+"""
+Self-supervised learning (SSL) is an interesting branch of study in the field of
+representation learning. SSL systems try to formulate a supervised signal from a corpus
+of unlabeled data points. An example is we train a deep neural network to predict the
+next word from a given set of words. In literature, these tasks are known as *pretext
+tasks* or *auxiliary tasks*. If we [train such a network](https://arxiv.org/abs/1801.06146) on a huge dataset (such as
+the [Wikipedia text corpus](https://www.corpusdata.org/wikipedia.asp)) it learns very effective
+representations that transfer well to downstream tasks.
Language models like +[BERT](https://arxiv.org/abs/1810.04805), [GPT-3](https://arxiv.org/abs/2005.14165), +[ELMo](https://allennlp.org/elmo) all benefit from this. + +Much like the language models, we can train computer vision models using similar +approaches. To make things work in computer vision, we need to formulate the learning +tasks such that the underlying model (a deep neural network) is able to make sense of the +semantic information present in vision data. One such task is to train a model to _contrast_ +between two different versions of the same image. The hope is that in this way the model +will learn representations where similar images are grouped as closely together as possible +while dissimilar images are further away. + +In this example, we will be implementing one such system called **SimSiam** proposed in +[Exploring Simple Siamese Representation Learning](https://arxiv.org/abs/2011.10566). It +is implemented as follows: + +1. We create two different versions of the same dataset with a stochastic data +augmentation pipeline. Note that the random initialization seed needs to be the same +when creating these versions. +2. We take a ResNet without any classification head (**backbone**) and we add a shallow +fully-connected network (**projection head**) on top of it. Collectively, this is known +as the **encoder**. +3. We pass the output of the encoder through a **predictor** which is again a shallow +fully-connected network having an +[AutoEncoder](https://en.wikipedia.org/wiki/Autoencoder) like structure. +4. We then train our encoder to maximize the cosine similarity between the two different +versions of our dataset. 
+"""
+
+"""
+## Setup
+"""
+
+import os
+os.environ['KERAS_BACKEND'] = 'tensorflow'
+
+from keras_core import layers
+from keras_core import regularizers
+import keras_core as keras
+import tensorflow as tf
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+"""
+## Define hyperparameters
+"""
+
+AUTO = tf.data.AUTOTUNE
+BATCH_SIZE = 128
+EPOCHS = 5
+CROP_TO = 32
+SEED = 26
+
+PROJECT_DIM = 2048
+LATENT_DIM = 512
+WEIGHT_DECAY = 0.0005
+
+"""
+## Load the CIFAR-10 dataset
+"""
+
+(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
+print(f"Total training examples: {len(x_train)}")
+print(f"Total test examples: {len(x_test)}")
+
+"""
+## Defining our data augmentation pipeline
+
+As studied in [SimCLR](https://arxiv.org/abs/2002.05709) having the right data
+augmentation pipeline is critical for SSL systems to work effectively in computer vision.
+Two particular augmentation transforms that seem to matter the most are: 1.) Random
+resized crops and 2.) Color distortions. Most of the other SSL systems for computer
+vision (such as [BYOL](https://arxiv.org/abs/2006.07733),
+[MoCoV2](https://arxiv.org/abs/2003.04297), [SwAV](https://arxiv.org/abs/2006.09882),
+etc.) include these in their training pipelines.
+"""
+
+
+def flip_random_crop(image):
+    """Randomly flip `image` left/right, then take a random `CROP_TO` x `CROP_TO` x 3 crop."""
+    # With random crops we also apply horizontal flipping.
+    image = tf.image.random_flip_left_right(image)
+    image = tf.image.random_crop(image, (CROP_TO, CROP_TO, 3))
+    return image
+
+
+def color_jitter(x, strength=(0.4, 0.4, 0.4, 0.1)):
+    """Apply random brightness, contrast, saturation and hue changes to `x`.
+
+    Args:
+        x: Image tensor to distort.
+        strength: Four factors, indexed in order for brightness, contrast,
+            saturation and hue. The default is an immutable tuple so that it
+            cannot be modified between calls; any indexable sequence of four
+            numbers is accepted.
+
+    Returns:
+        The distorted image, clipped to `[0, 255]`.
+    """
+    x = tf.image.random_brightness(x, max_delta=0.8 * strength[0])
+    x = tf.image.random_contrast(
+        x, lower=1 - 0.8 * strength[1], upper=1 + 0.8 * strength[1]
+    )
+    x = tf.image.random_saturation(
+        x, lower=1 - 0.8 * strength[2], upper=1 + 0.8 * strength[2]
+    )
+    x = tf.image.random_hue(x, max_delta=0.2 * strength[3])
+    # Affine transformations can disturb the natural range of
+    # RGB images, hence this is needed.
+    x = tf.clip_by_value(x, 0, 255)
+    return x
+
+
+def color_drop(x):
+    """Convert `x` to grayscale and repeat the single channel three times."""
+    x = tf.image.rgb_to_grayscale(x)
+    x = tf.tile(x, [1, 1, 3])
+    return x
+
+
+def random_apply(func, x, p):
+    """Return `func(x)` when a uniform sample in [0, 1) is below `p`, else `x`."""
+    if tf.random.uniform([], minval=0, maxval=1) < p:
+        return func(x)
+    else:
+        return x
+
+
+def custom_augment(image):
+    """Random flip/crop, then color jitter (p=0.8) and color drop (p=0.2)."""
+    # As discussed in the SimCLR paper, the series of augmentation
+    # transformations (except for random crops) need to be applied
+    # randomly to impose translational invariance.
+    image = flip_random_crop(image)
+    image = random_apply(color_jitter, image, p=0.8)
+    image = random_apply(color_drop, image, p=0.2)
+    return image
+
+
+"""
+It should be noted that an augmentation pipeline is generally dependent on various
+properties of the dataset we are dealing with. For example, if images in the dataset are
+heavily object-centric then taking random crops with a very high probability may hurt the
+training performance.
+
+Let's now apply our augmentation pipeline to our dataset and visualize a few outputs.
+"""
+
+"""
+## Convert the data into TensorFlow `Dataset` objects
+
+Here we create two different versions of our dataset *without* any ground-truth labels.
+"""
+
+# Both pipelines shuffle with the same seed so that they yield the same
+# images in the same order; only the augmentations differ.
+ssl_ds_one = tf.data.Dataset.from_tensor_slices(x_train)
+ssl_ds_one = (
+    ssl_ds_one.shuffle(1024, seed=SEED)
+    .map(custom_augment, num_parallel_calls=AUTO)
+    .batch(BATCH_SIZE)
+    .prefetch(AUTO)
+)
+
+ssl_ds_two = tf.data.Dataset.from_tensor_slices(x_train)
+ssl_ds_two = (
+    ssl_ds_two.shuffle(1024, seed=SEED)
+    .map(custom_augment, num_parallel_calls=AUTO)
+    .batch(BATCH_SIZE)
+    .prefetch(AUTO)
+)
+
+# We then zip both of these datasets.
+ssl_ds = tf.data.Dataset.zip((ssl_ds_one, ssl_ds_two))
+
+# Visualize a few augmented images.
+sample_images_one = next(iter(ssl_ds_one))
+plt.figure(figsize=(10, 10))
+for n in range(25):
+    ax = plt.subplot(5, 5, n + 1)
+    plt.imshow(sample_images_one[n].numpy().astype("int"))
+    plt.axis("off")
+plt.show()
+
+# Ensure that the different versions of the dataset actually contain
+# identical images.
+sample_images_two = next(iter(ssl_ds_two))
+plt.figure(figsize=(10, 10))
+for n in range(25):
+    ax = plt.subplot(5, 5, n + 1)
+    plt.imshow(sample_images_two[n].numpy().astype("int"))
+    plt.axis("off")
+plt.show()
+
+"""
+Notice that the images in `sample_images_one` and `sample_images_two` are essentially
+the same but are augmented differently.
+"""
+
+"""
+## Defining the encoder and the predictor
+
+We use an implementation of ResNet20 that is specifically configured for the CIFAR10
+dataset. The code is taken from the
+[keras-idiomatic-programmer](https://github.com/GoogleCloudPlatform/keras-idiomatic-programmer/blob/master/zoo/resnet/resnet_cifar10_v2.py) repository. The hyperparameters of
+these architectures have been referred from Section 3 and Appendix A of [the original
+paper](https://arxiv.org/abs/2011.10566).
+"""
+
+"""shell
+wget -q https://shorturl.at/QS369 -O resnet_cifar10_v2.py
+"""
+
+import resnet_cifar10_v2
+
+N = 2
+DEPTH = N * 9 + 2
+NUM_BLOCKS = ((DEPTH - 2) // 9) - 1
+
+
+def get_encoder():
+    """Build the encoder: rescaling, ResNet backbone, pooling and projection head.
+
+    The pooling layer is named `"backbone_pool"` so that the backbone can be
+    extracted from the encoder by layer name later on.
+
+    Returns:
+        A `keras.Model` named `"encoder"` mapping
+        `(CROP_TO, CROP_TO, 3)` images to `PROJECT_DIM` features.
+    """
+    # Input and backbone.
+    inputs = layers.Input((CROP_TO, CROP_TO, 3))
+    # Maps pixel values from [0, 255] to [-1, 1].
+    x = layers.Rescaling(scale=1.0 / 127.5, offset=-1)(inputs)
+    x = resnet_cifar10_v2.stem(x)
+    x = resnet_cifar10_v2.learner(x, NUM_BLOCKS)
+    x = layers.GlobalAveragePooling2D(name="backbone_pool")(x)
+
+    # Projection head.
+    x = layers.Dense(
+        PROJECT_DIM,
+        use_bias=False,
+        kernel_regularizer=regularizers.l2(WEIGHT_DECAY),
+    )(x)
+    x = layers.BatchNormalization()(x)
+    x = layers.ReLU()(x)
+    x = layers.Dense(
+        PROJECT_DIM,
+        use_bias=False,
+        kernel_regularizer=regularizers.l2(WEIGHT_DECAY),
+    )(x)
+    outputs = layers.BatchNormalization()(x)
+    return keras.Model(inputs, outputs, name="encoder")
+
+
+def get_predictor():
+    """Build the predictor: `PROJECT_DIM` -> `LATENT_DIM` -> `PROJECT_DIM`.
+
+    Returns:
+        A `keras.Sequential` model named `"predictor"`.
+    """
+    model = keras.Sequential(
+        [
+            # Note the AutoEncoder-like structure.
+            layers.Input((PROJECT_DIM,)),
+            layers.Dense(
+                LATENT_DIM,
+                use_bias=False,
+                kernel_regularizer=regularizers.l2(WEIGHT_DECAY),
+            ),
+            layers.ReLU(),
+            layers.BatchNormalization(),
+            layers.Dense(PROJECT_DIM),
+        ],
+        name="predictor",
+    )
+    return model
+
+
+"""
+## Defining the (pre-)training loop
+
+One of the main reasons behind training networks with these kinds of approaches is to
+utilize the learned representations for downstream tasks like classification. This is why
+this particular training phase is also referred to as _pre-training_.
+
+We start by defining the loss function.
+"""
+
+
+def compute_loss(p, z):
+    """Return the mean negative cosine similarity between `p` and `z`.
+
+    Gradients are not propagated through `z`. Both arguments are
+    L2-normalized along axis 1 before the dot product is taken.
+
+    Args:
+        p: Predictor outputs for one view.
+        z: Encoder outputs for the other view.
+    """
+    # The authors of SimSiam emphasize the impact of
+    # the `stop_gradient` operator in the paper as it
+    # has an important role in the overall optimization.
+    z = tf.stop_gradient(z)
+    p = tf.math.l2_normalize(p, axis=1)
+    z = tf.math.l2_normalize(z, axis=1)
+    # Negative cosine similarity (minimizing this is
+    # equivalent to maximizing the similarity).
+    return -tf.reduce_mean(tf.reduce_sum((p * z), axis=1))
+
+
+"""
+We then define our training loop by overriding the `train_step()` function of the
+`keras.Model` class.
+"""
+
+
+class SimSiam(keras.Model):
+    """Trains an encoder and a predictor with the symmetric SimSiam loss.
+
+    Args:
+        encoder: Model mapping a batch of images to representations.
+        predictor: Model mapping encoder outputs to predictions.
+    """
+
+    def __init__(self, encoder, predictor):
+        super().__init__()
+        self.encoder = encoder
+        self.predictor = predictor
+        self.loss_tracker = keras.metrics.Mean(name="loss")
+
+    @property
+    def metrics(self):
+        """Metrics reported by this model: only the loss tracker."""
+        return [self.loss_tracker]
+
+    def train_step(self, data):
+        """Run one optimization step on a pair of augmented batches.
+
+        Args:
+            data: Pair `(ds_one, ds_two)` of differently augmented batches.
+
+        Returns:
+            Dictionary with the running mean of the loss under `"loss"`.
+        """
+        # Unpack the data.
+        ds_one, ds_two = data
+
+        # Forward pass through the encoder and predictor.
+        with tf.GradientTape() as tape:
+            z1, z2 = self.encoder(ds_one), self.encoder(ds_two)
+            p1, p2 = self.predictor(z1), self.predictor(z2)
+            # Note that here we are enforcing the network to match
+            # the representations of two differently augmented batches
+            # of data.
+            # NOTE(review): the L2 `kernel_regularizer` penalties declared on
+            # the encoder/predictor layers are not added to `loss` here;
+            # confirm whether that is intended.
+            loss = compute_loss(p1, z2) / 2 + compute_loss(p2, z1) / 2
+
+        # Compute gradients and update the parameters.
+        learnable_params = (
+            self.encoder.trainable_variables
+            + self.predictor.trainable_variables
+        )
+        gradients = tape.gradient(loss, learnable_params)
+        self.optimizer.apply_gradients(zip(gradients, learnable_params))
+
+        # Monitor loss.
+        self.loss_tracker.update_state(loss)
+        return {"loss": self.loss_tracker.result()}
+
+
+"""
+## Pre-training our networks
+
+In the interest of this example, we will train the model for only 5 epochs. In reality,
+this should at least be 100 epochs.
+"""
+
+# Create a cosine decay learning scheduler.
+num_training_samples = len(x_train)
+steps = EPOCHS * (num_training_samples // BATCH_SIZE)
+lr_decayed_fn = keras.optimizers.schedules.CosineDecay(
+    initial_learning_rate=0.03, decay_steps=steps
+)
+
+# Create an early stopping callback.
+early_stopping = keras.callbacks.EarlyStopping(
+    monitor="loss", patience=5, restore_best_weights=True
+)
+
+# Compile model and start training.
+simsiam = SimSiam(get_encoder(), get_predictor())
+simsiam.compile(optimizer=keras.optimizers.SGD(lr_decayed_fn, momentum=0.6))
+history = simsiam.fit(ssl_ds, epochs=EPOCHS, callbacks=[early_stopping])
+
+# Visualize the training progress of the model.
+plt.plot(history.history["loss"])
+plt.grid()
+plt.title("Negative Cosine Similarity")
+plt.show()
+
+"""
+If your solution gets very close to -1 (minimum value of our loss) very quickly with a
+different dataset and a different backbone architecture that is likely because of
+*representation collapse*.
It is a phenomenon where the encoder yields similar output for +all the images. In that case additional hyperparameter tuning is required especially in +the following areas: + +* Strength of the color distortions and their probabilities. +* Learning rate and its schedule. +* Architecture of both the backbone and their projection head. + +""" + +""" +## Evaluating our SSL method + +The most popularly used method to evaluate a SSL method in computer vision (or any other +pre-training method as such) is to learn a linear classifier on the frozen features of +the trained backbone model (in this case it is ResNet20) and evaluate the classifier on +unseen images. Other methods include +[fine-tuning](https://keras.io/guides/transfer_learning/) on the source dataset or even a +target dataset with 5% or 10% labels present. Practically, we can use the backbone model +for any downstream task such as semantic segmentation, object detection, and so on where +the backbone models are usually pre-trained with *pure supervised learning*. +""" + +# We first create labeled `Dataset` objects. +train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + +# Then we shuffle, batch, and prefetch this dataset for performance. We +# also apply random resized crops as an augmentation but only to the +# training set. +train_ds = ( + train_ds.shuffle(1024) + .map(lambda x, y: (flip_random_crop(x), y), num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) +test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTO) + +# Extract the backbone ResNet20. +backbone = keras.Model( + simsiam.encoder.input, simsiam.encoder.get_layer("backbone_pool").output +) + +# We then create our linear classifier and train it. 
+backbone.trainable = False +inputs = layers.Input((CROP_TO, CROP_TO, 3)) +x = backbone(inputs, training=False) +outputs = layers.Dense(10, activation="softmax")(x) +linear_model = keras.Model(inputs, outputs, name="linear_model") + +# Compile model and start training. +linear_model.compile( + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + optimizer=keras.optimizers.SGD(lr_decayed_fn, momentum=0.9), +) +history = linear_model.fit( + train_ds, validation_data=test_ds, epochs=EPOCHS, callbacks=[early_stopping] +) +_, test_acc = linear_model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_acc * 100)) + +""" + +## Notes +* More data and longer pre-training schedule benefit SSL in general. +* SSL is particularly helpful when you only have access to very limited *labeled* +training data but you can manage to build a large corpus of unlabeled data. Recently, +using an SSL method called [SwAV](https://arxiv.org/abs/2006.09882), a group of +researchers at Facebook trained a [RegNet](https://arxiv.org/abs/2003.13678) on 2 Billion +images. They were able to achieve downstream performance very close to that achieved by +pure supervised pre-training. For some downstream tasks, their method even outperformed +the supervised counterparts. You can check out [their +paper](https://arxiv.org/pdf/2103.01988.pdf) to know the details. 
+* If you are interested to understand why contrastive SSL helps networks learn meaningful +representations, you can check out the following resources: + * [Self-supervised learning: The dark matter of +intelligence](https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/) + * [Understanding self-supervised learning using controlled datasets with known +structure](https://sslneuips20.github.io/files/CameraReadys%203-77/64/CameraReady/Understanding_self_supervised_learning.pdf) + +""" diff --git a/examples/keras_io/tensorflow/vision/visualizing_what_convnets_learn.py b/examples/keras_io/tensorflow/vision/visualizing_what_convnets_learn.py new file mode 100644 index 000000000..3b1233c16 --- /dev/null +++ b/examples/keras_io/tensorflow/vision/visualizing_what_convnets_learn.py @@ -0,0 +1,209 @@ +""" +Title: Visualizing what convnets learn +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/05/29 +Last modified: 2020/05/29 +Description: Displaying the visual patterns that convnet filters respond to. +Accelerator: GPU +""" +""" +## Introduction + +In this example, we look into what sort of visual patterns image classification models +learn. We'll be using the `ResNet50V2` model, trained on the ImageNet dataset. + +Our process is simple: we will create input images that maximize the activation of +specific filters in a target layer (picked somewhere in the middle of the model: layer +`conv3_block4_out`). Such images represent a visualization of the +pattern that the filter responds to. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras_core as keras + + +import numpy as np +import tensorflow as tf + +# The dimensions of our input image +img_width = 180 +img_height = 180 +# Our target layer: we will visualize the filters from this layer. +# See `model.summary()` for list of layer names, if you want to change this. 
@tf.function
def gradient_ascent_step(img, filter_index, learning_rate):
    """Run one gradient-ascent update on `img` for the given filter.

    Returns a `(loss, img)` pair: the loss computed by `compute_loss` on the
    incoming image, and the image after stepping along the L2-normalized
    gradient scaled by `learning_rate`.
    """
    with tf.GradientTape() as tape:
        # Watch `img` explicitly so the tape records operations applied to it.
        tape.watch(img)
        loss = compute_loss(img, filter_index)
    # Gradient of the loss w.r.t. the image, rescaled to unit L2 norm.
    normalized_grads = tf.math.l2_normalize(tape.gradient(loss, img))
    step = learning_rate * normalized_grads
    img += step
    return loss, img
def deprocess_image(img):
    """Convert an optimized input image into a displayable uint8 RGB array.

    The image is standardized (zero mean, standard deviation 0.15), center
    cropped by 25 pixels on every side of the first two axes, shifted to be
    centered on 0.5, clipped to [0, 1] and finally rescaled to [0, 255].

    Args:
        img: Array-like indexed as `[rows, cols, channels]`. It is NOT
            modified; integer inputs are converted to float32 first.

    Returns:
        A new `uint8` NumPy array whose first two axes are 50 pixels smaller
        than those of `img`.
    """
    img = np.asarray(img)
    if np.issubdtype(img.dtype, np.floating):
        # Work on a private copy so the in-place arithmetic below never
        # leaks into the caller's array (the crop below is only a view).
        img = img.copy()
    else:
        # In-place float arithmetic is not possible on integer arrays.
        img = img.astype("float32")

    # Normalize array: center on 0., ensure std is 0.15. The epsilon guards
    # against division by zero for a constant image.
    img -= img.mean()
    img /= img.std() + 1e-5
    img *= 0.15

    # Center crop
    img = img[25:-25, 25:-25, :]

    # Clip to [0, 1]
    img += 0.5
    img = np.clip(img, 0, 1)

    # Convert to RGB array
    img *= 255
    img = np.clip(img, 0, 255).astype("uint8")
    return img
# Compute image inputs that maximize per-filter activations
# for the first 64 filters of our target layer
all_imgs = []
for filter_index in range(64):
    print("Processing filter %d" % (filter_index,))
    loss, img = visualize_filter(filter_index)
    all_imgs.append(img)

# Build a black picture with enough space for our 8 x 8 grid of filter
# images, with a 5px margin in between. Each tile is the input size minus
# 25px on every side (130 x 130 when `img_width` and `img_height` are 180).
margin = 5
n = 8
cropped_width = img_width - 25 * 2
cropped_height = img_height - 25 * 2
# (n - 1) margins: tiles are separated from each other, not from the border.
width = n * cropped_width + (n - 1) * margin
height = n * cropped_height + (n - 1) * margin
stitched_filters = np.zeros((width, height, 3))

# Fill the picture with our saved filters; tile (i, j) holds filter i * n + j.
for i in range(n):
    for j in range(n):
        img = all_imgs[i * n + j]
        stitched_filters[
            (cropped_width + margin) * i : (cropped_width + margin) * i + cropped_width,
            (cropped_height + margin) * j : (cropped_height + margin) * j
            + cropped_height,
            :,
        ] = img
keras.utils.save_img("stiched_filters.png", stitched_filters)

from IPython.display import Image, display

display(Image("stiched_filters.png"))
+""" + +""" +## Introduction + +Classification is the process of predicting a categorical label for a given +input image. +While classification is a relatively straightforward computer vision task, +modern approaches still are built of several complex components. +Luckily, KerasCV provides APIs to construct commonly used components. + +This guide demonstrates KerasCV's modular approach to solving image +classification problems at three levels of complexity: + +- Inference with a pretrained classifier +- Fine-tuning a pretrained backbone +- Training a image classifier from scratch + +## Multi-Backend Support + +KerasCV's `ImageClassifier` model supports several backends like JAX, PyTorch, +and TensorFlow with the help of `keras_core`. To enable multi-backend support +in KerasCV, set the `KERAS_CV_MULTI_BACKEND` environment variable. We can +then switch between different backends by setting the `KERAS_BACKEND` +environment variable. Currently, `"tensorflow"`, `"jax"`, and `"torch"` are +supported. + +This demonstration uses the Jax backend. +""" + +import os + +os.environ["KERAS_CV_MULTI_BACKEND"] = "1" +os.environ["KERAS_BACKEND"] = "jax" + +import json +import math +import keras_cv +import keras_core as keras +from keras_core import ops +from keras_core import losses +from keras_core import optimizers +from keras_core.optimizers import schedules +from keras_core import metrics +import tensorflow as tf +from tensorflow import data as tf_data +import tensorflow_datasets as tfds +import numpy as np + +"""## Inference with a pretrained classifier + +Let's get started with the simplest KerasCV API: a pretrained classifier. +In this example, we will construct a classifier that was +pretrained on the ImageNet dataset. +We'll use this model to solve the age old "Cat or Dog" problem. + +The highest level module in KerasCV is a *task*. A *task* is a `keras.Model` +consisting of a (generally pretrained) backbone model and task-specific +layers. 
Here's an example using `keras_cv.models.ImageClassifier` with an +EfficientNetV2B0 Backbone. + +EfficientNetV2B0 is a great starting model when constructing an image +classification pipeline. +This architecture manages to achieve high accuracy, while using a +parameter count of 7M. +If an EfficientNetV2B0 is not powerful enough for the task you are hoping to +solve, be sure to check out +[KerasCV's other available Backbones](https://github.com/keras-team/keras-cv/tree/master/keras_cv/models/backbones)! +""" + +classifier = keras_cv.models.ImageClassifier.from_preset( + "efficientnetv2_b0_imagenet_classifier" +) + +"""You may notice a small deviation from the old `keras.applications` API; +where you would construct the class with +`EfficientNetV2B0(weights="imagenet")`. While the old API was great for +classification, it did not scale effectively to other use cases that required +complex architectures, like object detection and semantic segmentation. + +Now that our classifier is built, let's apply it to this cute cat picture! +""" + +filepath = keras.utils.get_file(origin="https://i.imgur.com/9i63gLN.jpg") +image = keras.utils.load_img(filepath) +image = np.array(image) +keras_cv.visualization.plot_image_gallery( + image[None, ...], rows=1, cols=1, value_range=(0, 255), show=True, scale=4 +) + +"""Next, let's get some predictions from our classifier:""" + +predictions = classifier.predict(np.expand_dims(image, axis=0)) + +"""Predictions come in the form of softmax-ed category rankings. +We can find the index of the top classes using a simple argsort function: +""" + +top_classes = predictions[0].argsort(axis=-1) + +"""In order to decode the class mappings, we can construct a mapping from +category indices to ImageNet class names. +For convenience, I've stored the ImageNet class mapping in a GitHub gist. +Let's download and load it now.
def preprocess_inputs(image, label):
    """Resize `image` with the `resizing` layer and encode `label` with `encoder`.

    Returns the `(resized_image, encoded_label)` pair.
    """
    # Statically resize images as we only iterate the dataset once.
    return resizing(image), encoder(label)
NUM_CLASSES = 101
# Change epochs to 100~ to fully train.
EPOCHS = 1

# Turns integer class ids into one-hot vectors of length NUM_CLASSES.
encoder = keras.layers.CategoryEncoding(NUM_CLASSES, "one_hot", dtype="int32")


def package_inputs(image, label):
    """Pack an `(image, label)` pair into an `images`/`labels` dict.

    The label is passed through `encoder` on the way.
    """
    return {"images": image, "labels": encoder(label)}


# `as_supervised` is a boolean flag: pass `True`, not the string "true"
# (which only worked because any non-empty string is truthy).
train_ds, eval_ds = tfds.load(
    "caltech101", split=["train", "test"], as_supervised=True
)
train_ds = train_ds.map(package_inputs, num_parallel_calls=tf_data.AUTOTUNE)
eval_ds = eval_ds.map(package_inputs, num_parallel_calls=tf_data.AUTOTUNE)

train_ds = train_ds.shuffle(BATCH_SIZE * 16)
+Creating an optimal pipeline of augmentations is an art, but in this section +of the guide we'll offer some tips on best practices for classification. + +One caveat to be aware of with image data augmentation is that you must be +careful to not shift your augmented data distribution too far from the +original data distribution. +The goal is to prevent overfitting and increase generalization, +but samples that lie completely out of the data distribution simply add noise +to the training process. + +The first augmentation we'll use is `RandomFlip`. +This augmentation behaves more or less how you'd expect: it either flips the +image or not. +While this augmentation is useful in CalTech101 and ImageNet, it should be +noted that it should not be used on tasks where the data distribution is not +vertical mirror invariant. +An example of a dataset where this occurs is MNIST hand written digits. +Flipping a `6` over the +vertical axis will make the digit appear more like a `7` than a `6`, but the +label will still show a `6`. +""" + +random_flip = keras_cv.layers.RandomFlip() +augmenters = [random_flip] + +image_batch = random_flip(image_batch) +keras_cv.visualization.plot_image_gallery( + image_batch.to_tensor(), + rows=3, + cols=3, + value_range=(0, 255), + show=True, +) + +"""Half of the images have been flipped! + +The next augmentation we'll use is `RandomCropAndResize`. +This operation selects a random subset of the image, then resizes it to the +provided target size. +By using this augmentation, we force our classifier to become spatially +invariant. +Additionally, this layer accepts an `aspect_ratio_factor` which can be used to +distort the aspect ratio of the image. +While this can improve model performance, it should be used with caution. +It is very easy for an aspect ratio distortion to shift a sample too far from +the original training set's data distribution. 
+Remember - the goal of data augmentation is to produce more training samples +that align with the data distribution of your training set! + +`RandomCropAndResize` also can handle `tf.RaggedTensor` inputs. In the +CalTech101 image dataset images come in a wide variety of sizes. +As such they cannot easily be batched together into a dense training batch. +Luckily, `RandomCropAndResize` handles the Ragged -> Dense conversion process +for you! + +Let's add a `RandomCropAndResize` to our set of augmentations: +""" + +crop_and_resize = keras_cv.layers.RandomCropAndResize( + target_size=IMAGE_SIZE, + crop_area_factor=(0.8, 1.0), + aspect_ratio_factor=(0.9, 1.1), +) +augmenters += [crop_and_resize] + +image_batch = crop_and_resize(image_batch) +keras_cv.visualization.plot_image_gallery( + image_batch, + rows=3, + cols=3, + value_range=(0, 255), + show=True, +) + +"""Great! We are now working with a batch of dense images. +Next up, lets include some spatial and color-based jitter to our training set. +This will allow us to produce a classifier that is robust to lighting +flickers, shadows, and more. + +There are limitless ways to augment an image by altering color and spatial +features, but perhaps the most battle tested technique is +[`RandAugment`](https://arxiv.org/abs/1909.13719). +`RandAugment` is actually a set of 10 different augmentations: +`AutoContrast`, `Equalize`, `Solarize`, `RandomColorJitter`, `RandomContrast`, +`RandomBrightness`, `ShearX`, `ShearY`, `TranslateX` and `TranslateY`. +At inference time, `num_augmentations` augmenters are sampled for each image, +and random magnitude factors are sampled for each. +These augmentations are then applied sequentially. + +KerasCV makes tuning these parameters easy using the `augmentations_per_image` +and `magnitude` parameters! 
+Let's take it for a spin: +""" + +rand_augment = keras_cv.layers.RandAugment( + augmentations_per_image=3, + magnitude=0.3, + value_range=(0, 255), +) +augmenters += [rand_augment] + +image_batch = rand_augment(image_batch) +keras_cv.visualization.plot_image_gallery( + image_batch, + rows=3, + cols=3, + value_range=(0, 255), + show=True, +) + +"""Looks great; but we're not done yet! +What if an image is missing one critical feature of a class? For example, +what if a leaf is blocking the view of a cat's ear, but our classifier +learned to classify cats simply by observing their ears? + +One easy approach to tackling this is to use `RandomCutout`, which randomly +strips out a sub-section of the image: +""" + +random_cutout = keras_cv.layers.RandomCutout( + width_factor=0.4, height_factor=0.4 +) +keras_cv.visualization.plot_image_gallery( + random_cutout(image_batch), + rows=3, + cols=3, + value_range=(0, 255), + show=True, +) + +"""While this tackles the problem reasonably well, it can cause the classifier +to develop responses to borders between features and black pixel areas caused +by the cutout. + +[`CutMix`](https://arxiv.org/abs/1905.04899) solves the same issue by using +a more complex (and more effective) technique. +Instead of replacing the cut-out areas with black pixels, `CutMix` replaces +these regions with regions of other images sampled from within your training +set! +Following this replacement, the image's classification label is updated to be +a blend of the original and mixed image's class label. + +What does this look like in practice? Let's check it out: +""" + +cut_mix = keras_cv.layers.CutMix() +# CutMix needs to modify both images and labels +inputs = {"images": image_batch, "labels": tf.cast(label_batch, "float32")} + +keras_cv.visualization.plot_image_gallery( + cut_mix(inputs)["images"], + rows=3, + cols=3, + value_range=(0, 255), + show=True, +) + +"""Let's hold off from adding it to our augmenter for a minute - more on that +soon! 
+ +Next, let's look into `MixUp()`. +Unfortunately, while `MixUp()` has been empirically shown to *substantially* +improve both the robustness and the generalization of the trained model, +it is not well-understood why such improvement occurs... but +a little alchemy never hurt anyone! + +`MixUp()` works by sampling two images from a batch, then proceeding to +literally blend together their pixel intensities as well as their +classification labels. + +Let's see it in action: +""" + +mix_up = keras_cv.layers.MixUp() +# MixUp needs to modify both images and labels +inputs = {"images": image_batch, "labels": tf.cast(label_batch, "float32")} + +keras_cv.visualization.plot_image_gallery( + mix_up(inputs)["images"], + rows=3, + cols=3, + value_range=(0, 255), + show=True, +) + +"""If you look closely, you'll see that the images have been blended together. + +Instead of applying `CutMix()` and `MixUp()` to every image, we instead pick +one or the other to apply to each batch. +This can be expressed using `keras_cv.layers.RandomChoice()` +""" + +cut_mix_or_mix_up = keras_cv.layers.RandomChoice( + [cut_mix, mix_up], batchwise=True +) +augmenters += [cut_mix_or_mix_up] + +"""Now let's apply our final augmenter to the training data:""" + +augmenter = keras_cv.layers.Augmenter(augmenters) +train_ds = train_ds.map(augmenter, num_parallel_calls=tf_data.AUTOTUNE) + +image_batch = next(iter(train_ds.take(1)))["images"] +keras_cv.visualization.plot_image_gallery( + image_batch, + rows=3, + cols=3, + value_range=(0, 255), + show=True, +) + +"""We also need to resize our evaluation set to get dense batches of the image +size expected by our model. We use the deterministic +`keras_cv.layers.Resizing` in this case to avoid adding noise to our +evaluation metric. 
# Deterministic resize (no random augmentation) so that the evaluation set
# yields dense batches of IMAGE_SIZE images.
inference_resizing = keras_cv.layers.Resizing(
    IMAGE_SIZE[0], IMAGE_SIZE[1], crop_to_aspect_ratio=True
)
# Map the resize exactly once. The layer was previously created and mapped
# twice in a row, which resized already-resized images a second time.
eval_ds = eval_ds.map(inference_resizing, num_parallel_calls=tf_data.AUTOTUNE)

image_batch = next(iter(eval_ds.take(1)))["images"]
keras_cv.visualization.plot_image_gallery(
    image_batch,
    rows=3,
    cols=3,
    value_range=(0, 255),
    show=True,
)
def lr_warmup_cosine_decay(
    global_step,
    warmup_steps,
    hold=0,
    total_steps=0,
    start_lr=0.0,
    target_lr=1e-2,
):
    """Learning rate at `global_step` for a warmup / hold / cosine-decay schedule.

    The rate ramps linearly from 0 to `target_lr` over `warmup_steps`, stays
    at `target_lr` for `hold` further steps, then follows half a cosine wave
    from `target_lr` down to 0 at `total_steps`.

    Args:
        global_step: Current step (number or tensor).
        warmup_steps: Number of linear warmup steps. `0` disables warmup.
        hold: Number of steps to hold `target_lr` after warmup.
        total_steps: Step at which the cosine decay reaches 0.
        start_lr: Accepted for backward compatibility; currently unused (the
            warmup always starts from 0).
        target_lr: Peak learning rate.

    Returns:
        The learning rate for `global_step`.
    """
    # Cosine decay
    learning_rate = (
        0.5
        * target_lr
        * (
            1
            + ops.cos(
                math.pi
                * ops.convert_to_tensor(
                    global_step - warmup_steps - hold, dtype="float32"
                )
                / ops.convert_to_tensor(
                    total_steps - warmup_steps - hold, dtype="float32"
                )
            )
        )
    )

    if hold > 0:
        # Until the hold period is over, stay at the peak rate.
        learning_rate = ops.where(
            global_step > warmup_steps + hold, learning_rate, target_lr
        )

    if warmup_steps > 0:
        # Only compute the warmup ramp when there is one; this avoids a
        # division by zero for `warmup_steps == 0`.
        warmup_lr = target_lr * (global_step / warmup_steps)
        learning_rate = ops.where(
            global_step < warmup_steps, warmup_lr, learning_rate
        )
    return learning_rate


class WarmUpCosineDecay(schedules.LearningRateSchedule):
    """Learning rate schedule wrapping `lr_warmup_cosine_decay`.

    Args:
        warmup_steps: Number of linear warmup steps.
        total_steps: Total number of steps; the rate is 0 past this point.
        hold: Number of steps to hold `target_lr` after warmup.
        start_lr: Stored and forwarded to `lr_warmup_cosine_decay`.
        target_lr: Peak learning rate.
    """

    def __init__(
        self, warmup_steps, total_steps, hold, start_lr=0.0, target_lr=1e-2
    ):
        super().__init__()
        self.start_lr = start_lr
        self.target_lr = target_lr
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.hold = hold

    def __call__(self, step):
        lr = lr_warmup_cosine_decay(
            global_step=step,
            total_steps=self.total_steps,
            warmup_steps=self.warmup_steps,
            start_lr=self.start_lr,
            target_lr=self.target_lr,
            hold=self.hold,
        )

        # Past the end of the schedule the learning rate is pinned to 0.
        return ops.where(step > self.total_steps, 0.0, lr)

    def get_config(self):
        # Mirrors the constructor arguments so the schedule can be
        # serialized and rebuilt from its config.
        return {
            "warmup_steps": self.warmup_steps,
            "total_steps": self.total_steps,
            "hold": self.hold,
            "start_lr": self.start_lr,
            "target_lr": self.target_lr,
        }
# NOTE: the guide text around this snippet mentions
# `EfficientNetV2B0Backbone`, but the backbone built here is a ResNet18V2.
backbone = keras_cv.models.ResNet18V2Backbone()
model = keras.Sequential(
    [
        backbone,
        keras.layers.GlobalMaxPooling2D(),
        keras.layers.Dropout(rate=0.5),
        # One output per class; NUM_CLASSES is also the one-hot label width
        # used by `encoder`, so the two cannot drift apart.
        keras.layers.Dense(NUM_CLASSES, activation="softmax"),
    ]
)
def on_epoch_end(self, epoch, logs=None):
    """Record the optimizer's current learning rate in `logs`.

    Args:
        epoch: Index of the epoch that just ended (unused).
        logs: Dict of epoch metrics; updated in place with a
            `"learning_rate"` entry holding a Python float.
    """
    # Only substitute a fresh dict when none was given. `logs or {}` would
    # also replace an empty dict passed by the caller, silently dropping
    # the entry written below.
    if logs is None:
        logs = {}
    logs["learning_rate"] = float(
        backend.convert_to_numpy(self.model.optimizer.learning_rate)
    )
def np_elu(x, alpha=1.0):
    """NumPy reference ELU: `x` if `x > 0`, else `alpha * (exp(x) - 1)`.

    Args:
        x: Array-like input.
        alpha: Scale of the negative branch.

    Returns:
        NumPy array of the same shape as `x`.
    """
    x = np.asarray(x)
    # Evaluate exp() only on the non-positive part. The mask-multiply form
    # `(x <= 0) * alpha * (exp(x) - 1)` overflows for large positive `x`
    # and yields `0 * inf = nan`.
    negative_branch = alpha * (np.exp(np.minimum(x, 0)) - 1)
    return np.where(x > 0, x, negative_branch)
alpha_initializer="glorot_uniform", - alpha_regularizer="l1", - alpha_constraint="non_neg", - shared_axes=(1, 2), - ) - prelu_layer.build(inputs.shape) - tf_prelu_layer.build(inputs.shape) weights = np.random.random((1, 1, 3)) prelu_layer.alpha.assign(weights) - tf_prelu_layer.alpha.assign(weights) - - self.assertAllClose(prelu_layer(inputs), tf_prelu_layer(inputs)) + ref_out = np_prelu(inputs, weights) + self.assertAllClose(prelu_layer(inputs), ref_out) diff --git a/keras_core/layers/attention/multi_head_attention.py b/keras_core/layers/attention/multi_head_attention.py index bb127b77e..37f93258d 100644 --- a/keras_core/layers/attention/multi_head_attention.py +++ b/keras_core/layers/attention/multi_head_attention.py @@ -4,6 +4,7 @@ import string import numpy as np +from keras_core import backend from keras_core import constraints from keras_core import initializers from keras_core import ops @@ -115,6 +116,8 @@ class MultiHeadAttention(Layer): self.supports_masking = True self._num_heads = num_heads self._key_dim = key_dim + # Cache 1.0 / math.sqrt(self._key_dim). + self._inverse_sqrt_key_dim = None self._value_dim = value_dim if value_dim else key_dim self._dropout = dropout self._use_bias = use_bias @@ -311,6 +314,9 @@ class MultiHeadAttention(Layer): ) self._softmax = Softmax(axis=norm_axes) self._dropout_layer = Dropout(rate=self._dropout) + self._inverse_sqrt_key_dim = backend.convert_to_tensor( + 1.0 / math.sqrt(float(self._key_dim)) + ) def _masked_softmax(self, attention_scores, attention_mask=None): # Normalize the attention scores to probabilities. @@ -355,7 +361,7 @@ class MultiHeadAttention(Layer): # Note: Applying scalar multiply at the smaller end of einsum improves # XLA performance, but may introduce slight numeric differences in # the Transformer attention head. 
- query = ops.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) + query = ops.multiply(query, self._inverse_sqrt_key_dim) # Take the dot product between "query" and "key" to get the raw # attention scores. diff --git a/keras_core/layers/core/dense.py b/keras_core/layers/core/dense.py index d7d70324c..d2b50fb30 100644 --- a/keras_core/layers/core/dense.py +++ b/keras_core/layers/core/dense.py @@ -87,12 +87,14 @@ class Dense(Layer): def build(self, input_shape): input_dim = input_shape[-1] self.kernel = self.add_weight( + name="kernel", shape=(input_dim, self.units), initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, ) if self.use_bias: self.bias = self.add_weight( + name="bias", shape=(self.units,), initializer=self.bias_initializer, regularizer=self.bias_regularizer, diff --git a/keras_core/layers/pooling/global_average_pooling_test.py b/keras_core/layers/pooling/global_average_pooling_test.py index 76f5afcb2..60a832742 100644 --- a/keras_core/layers/pooling/global_average_pooling_test.py +++ b/keras_core/layers/pooling/global_average_pooling_test.py @@ -1,6 +1,5 @@ import numpy as np import pytest -import tensorflow as tf from absl.testing import parameterized from keras_core import layers @@ -95,21 +94,30 @@ class GlobalAveragePoolingCorrectnessTest( ("channels_last", False), ("channels_last", True), ("channels_first", False), + ("channels_first", True), ) def test_global_average_pooling1d(self, data_format, keepdims): - inputs = np.arange(24, dtype="float32").reshape((2, 3, 4)) + def np_gap1d(x, data_format, keepdims, mask=None): + steps_axis = 1 if data_format == "channels_last" else 2 + if mask is not None: + mask = np.expand_dims( + mask, 2 if data_format == "channels_last" else 1 + ) + x *= mask + res = np.sum(x, axis=steps_axis) / np.sum(mask, axis=steps_axis) + else: + res = np.mean(x, axis=steps_axis) + if keepdims: + res = np.expand_dims(res, axis=steps_axis) + return res + inputs = np.arange(24, dtype="float32").reshape((2, 
3, 4)) layer = layers.GlobalAveragePooling1D( data_format=data_format, keepdims=keepdims, ) - tf_keras_layer = tf.keras.layers.GlobalAveragePooling1D( - data_format=data_format, - keepdims=keepdims, - ) - outputs = layer(inputs) - expected = tf_keras_layer(inputs) + expected = np_gap1d(inputs, data_format, keepdims) self.assertAllClose(outputs, expected) if data_format == "channels_last": @@ -117,47 +125,53 @@ class GlobalAveragePoolingCorrectnessTest( else: mask = np.array([[1, 1, 0, 0], [0, 1, 0, 1]], dtype="int32") outputs = layer(inputs, mask) - expected = tf_keras_layer(inputs, mask) + expected = np_gap1d(inputs, data_format, keepdims, mask) self.assertAllClose(outputs, expected) @parameterized.parameters( ("channels_last", False), ("channels_last", True), ("channels_first", False), + ("channels_first", True), ) def test_global_average_pooling2d(self, data_format, keepdims): - inputs = np.arange(96, dtype="float32").reshape((2, 3, 4, 4)) + def np_gap2d(x, data_format, keepdims): + steps_axis = [1, 2] if data_format == "channels_last" else [2, 3] + res = np.apply_over_axes(np.mean, x, steps_axis) + if not keepdims: + res = res.squeeze() + return res + inputs = np.arange(96, dtype="float32").reshape((2, 3, 4, 4)) layer = layers.GlobalAveragePooling2D( data_format=data_format, keepdims=keepdims, ) - tf_keras_layer = tf.keras.layers.GlobalAveragePooling2D( - data_format=data_format, - keepdims=keepdims, - ) - outputs = layer(inputs) - expected = tf_keras_layer(inputs) + expected = np_gap2d(inputs, data_format, keepdims) self.assertAllClose(outputs, expected) @parameterized.parameters( ("channels_last", False), ("channels_last", True), ("channels_first", False), + ("channels_first", True), ) def test_global_average_pooling3d(self, data_format, keepdims): - inputs = np.arange(360, dtype="float32").reshape((2, 3, 3, 5, 4)) + def np_gap3d(x, data_format, keepdims): + steps_axis = ( + [1, 2, 3] if data_format == "channels_last" else [2, 3, 4] + ) + res = 
np.apply_over_axes(np.mean, x, steps_axis) + if not keepdims: + res = res.squeeze() + return res + inputs = np.arange(360, dtype="float32").reshape((2, 3, 3, 5, 4)) layer = layers.GlobalAveragePooling3D( data_format=data_format, keepdims=keepdims, ) - tf_keras_layer = tf.keras.layers.GlobalAveragePooling3D( - data_format=data_format, - keepdims=keepdims, - ) - outputs = layer(inputs) - expected = tf_keras_layer(inputs) + expected = np_gap3d(inputs, data_format, keepdims) self.assertAllClose(outputs, expected) diff --git a/keras_core/ops/image.py b/keras_core/ops/image.py index a7af118b8..7351a7425 100644 --- a/keras_core/ops/image.py +++ b/keras_core/ops/image.py @@ -271,6 +271,8 @@ class ExtractPatches(Operation): data_format="channels_last", ): super().__init__() + if isinstance(size, int): + size = (size, size) self.size = size self.strides = strides self.dilation_rate = dilation_rate @@ -348,14 +350,16 @@ def extract_patches( Examples: - >>> image = np.random.random((1, 20, 20, 3)) # batch of 2 RGB images + >>> image = np.random.random( + ... (2, 20, 20, 3) + ... 
).astype("float32") # batch of 2 RGB images >>> patches = keras_core.ops.image.extract_patches(image, (5, 5)) >>> patches.shape - (1, 4, 4, 75) - >>> image = np.random.random((20, 20, 3)) # batch of 2 RGB images + (2, 4, 4, 75) + >>> image = np.random.random((20, 20, 3)).astype("float32") # 1 RGB image >>> patches = keras_core.ops.image.extract_patches(image, (3, 3), (1, 1)) >>> patches.shape - (4, 4, 75) + (18, 18, 27) """ if any_symbolic_tensors((image,)): return ExtractPatches( diff --git a/keras_core/ops/image_test.py b/keras_core/ops/image_test.py index 6452df04a..1e4d05880 100644 --- a/keras_core/ops/image_test.py +++ b/keras_core/ops/image_test.py @@ -31,6 +31,8 @@ class ImageOpsDynamicShapeTest(testing.TestCase): p_h, p_w = 5, 5 out = kimage.extract_patches(x, (p_h, p_w)) self.assertEqual(out.shape, (None, 4, 4, 75)) + out = kimage.extract_patches(x, 5) + self.assertEqual(out.shape, (None, 4, 4, 75)) class ImageOpsStaticShapeTest(testing.TestCase): @@ -50,6 +52,8 @@ class ImageOpsStaticShapeTest(testing.TestCase): p_h, p_w = 5, 5 out = kimage.extract_patches(x, (p_h, p_w)) self.assertEqual(out.shape, (4, 4, 75)) + out = kimage.extract_patches(x, 5) + self.assertEqual(out.shape, (4, 4, 75)) AFFINE_TRANSFORM_INTERPOLATIONS = { # map to order @@ -310,9 +314,7 @@ class ImageOpsCorrectnessTest(testing.TestCase, parameterized.TestCase): and backend.backend() == "tensorflow" and dilation_rate > 1 ): - pytest.skip( - "dilation_rate>1 with strides>1 than not supported with TF" - ) + pytest.skip("dilation_rate>1 with strides>1 not supported with TF") if data_format == "channels_first": image = np.random.uniform(size=(1, 3, 20, 20)) else: