From 336c6a042bf5ca508ea3c3ee0fba2624d7f9c99f Mon Sep 17 00:00:00 2001 From: Sayed Qaiser Ali <66676360+sqali@users.noreply.github.com> Date: Tue, 18 Jul 2023 21:57:40 +0530 Subject: [PATCH] Update symbolic_arguments.py (#513) * Update symbolic_arguments.py Added validations to __init__ function * Update symbolic_arguments.py Removed the # TODO as requested --- benchmarks/torch_ctl_benchmark/README.md | 9 +- examples/keras_io/tensorflow/vision/cct.py | 400 ++++++++++++++++++ .../{tensorflow => }/vision/autoencoder.py | 0 .../backend/torch/optimizers/__init__.py | 1 + .../torch/optimizers/torch_optimizer.py | 24 ++ .../backend/torch/optimizers/torch_sgd.py | 43 ++ keras_core/layers/core/input_layer_test.py | 88 ++++ keras_core/layers/layer.py | 82 ++-- .../layers/preprocessing/random_crop.py | 148 ++----- keras_core/layers/rnn/gru.py | 7 +- keras_core/layers/rnn/lstm.py | 7 +- keras_core/ops/symbolic_arguments.py | 2 +- keras_core/optimizers/optimizer.py | 8 +- keras_core/optimizers/sgd_test.py | 8 +- 14 files changed, 664 insertions(+), 163 deletions(-) create mode 100644 examples/keras_io/tensorflow/vision/cct.py rename examples/keras_io/{tensorflow => }/vision/autoencoder.py (100%) create mode 100644 keras_core/backend/torch/optimizers/__init__.py create mode 100644 keras_core/backend/torch/optimizers/torch_optimizer.py create mode 100644 keras_core/backend/torch/optimizers/torch_sgd.py create mode 100644 keras_core/layers/core/input_layer_test.py diff --git a/benchmarks/torch_ctl_benchmark/README.md b/benchmarks/torch_ctl_benchmark/README.md index 977ae1d72..fa7cc6566 100644 --- a/benchmarks/torch_ctl_benchmark/README.md +++ b/benchmarks/torch_ctl_benchmark/README.md @@ -1,9 +1,10 @@ # Benchmark the performance of torch custom training loop -This directory contains benchmarks to compare the performance between Keras and -Torch while using Torch custom training loop. The benchmark purpose is to -understand the performance diff resulting from the modeling API choice (Keras -or Torch). +This directory contains benchmarks to compare the performance of a Keras model +and a equivalent Torch model while using the same Torch custom training loop. + +The benchmark purpose is to understand the performance diff resulting from the +modeling API choice (Keras or Torch). To run the benchmark, use the command below and change to your target: diff --git a/examples/keras_io/tensorflow/vision/cct.py b/examples/keras_io/tensorflow/vision/cct.py new file mode 100644 index 000000000..99a0008b9 --- /dev/null +++ b/examples/keras_io/tensorflow/vision/cct.py @@ -0,0 +1,400 @@ +""" +Title: Compact Convolutional Transformers +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Converted to Keras Core by: [Muhammad Anas Raza](https://anasrz.com) +Date created: 2021/06/30 +Last modified: 2023/07/17 +Description: Compact Convolutional Transformers for efficient image classification. +Accelerator: GPU +""" +""" +As discussed in the [Vision Transformers (ViT)](https://arxiv.org/abs/2010.11929) paper, +a Transformer-based architecture for vision typically requires a larger dataset than +usual, as well as a longer pre-training schedule. [ImageNet-1k](http://imagenet.org/) +(which has about a million images) is considered to fall under the medium-sized data regime with +respect to ViTs. This is primarily because, unlike CNNs, ViTs (or a typical +Transformer-based architecture) do not have well-informed inductive biases (such as +convolutions for processing images). 
This begs the question: can't we combine the +benefits of convolution and the benefits of Transformers +in a single network architecture? These benefits include parameter-efficiency, and +self-attention to process long-range and global dependencies (interactions between +different regions in an image). + +In [Escaping the Big Data Paradigm with Compact Transformers](https://arxiv.org/abs/2104.05704), +Hassani et al. present an approach for doing exactly this. They proposed the +**Compact Convolutional Transformer** (CCT) architecture. In this example, we will work on an +implementation of CCT and we will see how well it performs on the CIFAR-10 dataset. + +If you are unfamiliar with the concept of self-attention or Transformers, you can read +[this chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11/r-3/312) +from François Chollet's book *Deep Learning with Python*. This example uses +code snippets from another example, +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/). + + +""" + + +""" +## Imports +""" + +import os +os.environ["KERAS_BACKEND"] = "tensorflow" + +from keras_core import layers +import keras_core as keras + +import matplotlib.pyplot as plt +import tensorflow as tf +import numpy as np + +""" +## Hyperparameters and constants +""" + +positional_emb = True +conv_layers = 2 +projection_dim = 128 + +num_heads = 2 +transformer_units = [ + projection_dim, + projection_dim, +] +transformer_layers = 2 +stochastic_depth_rate = 0.1 + +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 128 +num_epochs = 30 +image_size = 32 + +""" +## Load CIFAR-10 dataset +""" + +num_classes = 10 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() + +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) + +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +""" +## The CCT tokenizer + +The first recipe introduced by the CCT authors is the tokenizer for processing the +images. In a standard ViT, images are organized into uniform *non-overlapping* patches. +This eliminates the boundary-level information present in between different patches. This +is important for a neural network to effectively exploit the locality information. The +figure below presents an illustration of how images are organized into patches. + +![](https://i.imgur.com/IkBK9oY.png) + +We already know that convolutions are quite good at exploiting locality information. So, +based on this, the authors introduce an all-convolution mini-network to produce image +patches. +""" + + +class CCTTokenizer(layers.Layer): + def __init__( + self, + kernel_size=3, + stride=1, + padding=1, + pooling_kernel_size=3, + pooling_stride=2, + num_conv_layers=conv_layers, + num_output_channels=[64, 128], + positional_emb=positional_emb, + **kwargs, + ): + super().__init__(**kwargs) + + # This is our tokenizer. 
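+        # It stacks `num_conv_layers` blocks, each a Conv2D (ReLU activation,
+        # "he_normal" initializer, no bias) followed by zero padding and max
+        # pooling, progressively downsampling the image into a feature map
+        # that `call()` then flattens into a sequence of tokens.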
+ self.conv_model = keras.Sequential() + for i in range(num_conv_layers): + self.conv_model.add( + layers.Conv2D( + num_output_channels[i], + kernel_size, + stride, + padding="valid", + use_bias=False, + activation="relu", + kernel_initializer="he_normal", + ) + ) + self.conv_model.add(layers.ZeroPadding2D(padding)) + self.conv_model.add( + layers.MaxPool2D(pooling_kernel_size, pooling_stride, "same") + ) + + self.positional_emb = positional_emb + + def call(self, images): + outputs = self.conv_model(images) + # After passing the images through our mini-network the spatial dimensions + # are flattened to form sequences. + reshaped = tf.reshape( + outputs, + (-1, tf.shape(outputs)[1] * tf.shape(outputs)[2], tf.shape(outputs)[-1]), + ) + return reshaped + + def positional_embedding(self, image_size): + # Positional embeddings are optional in CCT. Here, we calculate + # the number of sequences and initialize an `Embedding` layer to + # compute the positional embeddings later. + if self.positional_emb: + dummy_inputs = tf.ones((1, image_size, image_size, 3)) + dummy_outputs = self.call(dummy_inputs) + sequence_length = tf.shape(dummy_outputs)[1] + projection_dim = tf.shape(dummy_outputs)[-1] + + embed_layer = layers.Embedding( + input_dim=sequence_length, output_dim=projection_dim + ) + return embed_layer, sequence_length + else: + return None + +""" +## Sequence Pooling +Another recipe introduced in CCT is attention pooling or sequence pooling. In ViT, only +the feature map corresponding to the class token is pooled and is then used for the +subsequent classification task (or any other downstream task). +""" + +class SequencePooling(layers.Layer): + def __init__(self): + super().__init__() + self.attention = layers.Dense(1) + + def call(self, x): + attention_weights = tf.nn.softmax(self.attention(x), axis=1) + weighted_representation = tf.matmul( + attention_weights, x, transpose_a=True + ) + return tf.squeeze(weighted_representation, -2) + + +""" +## Stochastic depth for regularization + +[Stochastic depth](https://arxiv.org/abs/1603.09382) is a regularization technique that +randomly drops a set of layers. During inference, the layers are kept as they are. It is +very much similar to [Dropout](https://jmlr.org/papers/v15/srivastava14a.html) but only +that it operates on a block of layers rather than individual nodes present inside a +layer. In CCT, stochastic depth is used just before the residual blocks of a Transformers +encoder. +""" + + +# Referred from: github.com:rwightman/pytorch-image-models. +class StochasticDepth(layers.Layer): + def __init__(self, drop_prop, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prop + + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_prob + shape = (tf.shape(x)[0],) + (1,) * (tf.shape(x).shape[0] - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +""" +## MLP for the Transformers encoder +""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=tf.nn.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Data augmentation + +In the [original paper](https://arxiv.org/abs/2104.05704), the authors use +[AutoAugment](https://arxiv.org/abs/1805.09501) to induce stronger regularization. For +this example, we will be using the standard geometric augmentations like random cropping +and flipping. +""" + +# Note the rescaling layer. 
These layers have pre-defined inference behavior. +data_augmentation = keras.Sequential( + [ + layers.Rescaling(scale=1.0 / 255), + layers.RandomCrop(image_size, image_size), + layers.RandomFlip("horizontal"), + ], + name="data_augmentation", +) + +""" +## The final CCT model + +In CCT, outputs from the Transformers encoder are weighted and then passed on to the final task-specific layer (in +this example, we do classification). +""" + + + + +def create_cct_model( + image_size=image_size, + input_shape=input_shape, + num_heads=num_heads, + projection_dim=projection_dim, + transformer_units=transformer_units, +): + inputs = layers.Input(input_shape) + + # Augment data. + augmented = data_augmentation(inputs) + + # Encode patches. + cct_tokenizer = CCTTokenizer() + encoded_patches = cct_tokenizer(augmented) + + # Apply positional embedding. + if positional_emb: + pos_embed, seq_length = cct_tokenizer.positional_embedding(image_size) + positions = tf.range(start=0, limit=seq_length, delta=1) + position_embeddings = pos_embed(positions) + encoded_patches += position_embeddings + + # Calculate Stochastic Depth probabilities. + dpr = [x for x in np.linspace(0, stochastic_depth_rate, transformer_layers)] + + # Create multiple layers of the Transformer block. + for i in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + + # Skip connection 1. + attention_output = StochasticDepth(dpr[i])(attention_output) + x2 = layers.Add()([attention_output, encoded_patches]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-5)(x2) + + # MLP. + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + + # Skip connection 2. + x3 = StochasticDepth(dpr[i])(x3) + encoded_patches = layers.Add()([x3, x2]) + + # Apply sequence pooling. + representation = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + weighted_representation = SequencePooling()(representation) + + # Classify outputs. + logits = layers.Dense(num_classes)(weighted_representation) + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=logits) + return model + + +""" +## Model training and evaluation +""" + + +def run_experiment(model): + optimizer = keras.optimizers.AdamW(learning_rate=0.001, weight_decay=0.0001) + + model.compile( + optimizer=optimizer, + loss=keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=0.1 + ), + metrics=[ + keras.metrics.CategoricalAccuracy(name="accuracy"), + keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + checkpoint_filepath = "/tmp/checkpoint.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[checkpoint_callback], + ) + + model.load_weights(checkpoint_filepath) + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + return history + + +cct_model = create_cct_model() +history = run_experiment(cct_model) + +""" +Let's now visualize the training progress of the model. 
+""" + +plt.plot(history.history["loss"], label="train_loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.xlabel("Epochs") +plt.ylabel("Loss") +plt.title("Train and Validation Losses Over Epochs", fontsize=14) +plt.legend() +plt.grid() +plt.show() + +""" +The CCT model we just trained has just **0.4 million** parameters, and it gets us to +~78% top-1 accuracy within 30 epochs. The plot above shows no signs of overfitting as +well. This means we can train this network for longer (perhaps with a bit more +regularization) and may obtain even better performance. This performance can further be +improved by additional recipes like cosine decay learning rate schedule, other data augmentation +techniques like [AutoAugment](https://arxiv.org/abs/1805.09501), +[MixUp](https://arxiv.org/abs/1710.09412) or +[Cutmix](https://arxiv.org/abs/1905.04899). With these modifications, the authors present +95.1% top-1 accuracy on the CIFAR-10 dataset. The authors also present a number of +experiments to study how the number of convolution blocks, Transformers layers, etc. +affect the final performance of CCTs. + +For a comparison, a ViT model takes about **4.7 million** parameters and **100 +epochs** of training to reach a top-1 accuracy of 78.22% on the CIFAR-10 dataset. You can +refer to +[this notebook](https://colab.research.google.com/gist/sayakpaul/1a80d9f582b044354a1a26c5cb3d69e5/image_classification_with_vision_transformer.ipynb) +to know about the experimental setup. + +The authors also demonstrate the performance of Compact Convolutional Transformers on +NLP tasks and they report competitive results there. +""" diff --git a/examples/keras_io/tensorflow/vision/autoencoder.py b/examples/keras_io/vision/autoencoder.py similarity index 100% rename from examples/keras_io/tensorflow/vision/autoencoder.py rename to examples/keras_io/vision/autoencoder.py diff --git a/keras_core/backend/torch/optimizers/__init__.py b/keras_core/backend/torch/optimizers/__init__.py new file mode 100644 index 000000000..1b7d9c306 --- /dev/null +++ b/keras_core/backend/torch/optimizers/__init__.py @@ -0,0 +1 @@ +from keras_core.backend.torch.optimizers.torch_optimizer import TorchOptimizer diff --git a/keras_core/backend/torch/optimizers/torch_optimizer.py b/keras_core/backend/torch/optimizers/torch_optimizer.py new file mode 100644 index 000000000..6cdd91ed1 --- /dev/null +++ b/keras_core/backend/torch/optimizers/torch_optimizer.py @@ -0,0 +1,24 @@ +import torch + +from keras_core.optimizers.base_optimizer import BaseOptimizer + + +class TorchOptimizer(BaseOptimizer): + def __new__(cls, *args, **kwargs): + # Import locally to avoid circular imports. 
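+        # `__new__` swaps in a torch-native implementation when one is
+        # registered below (e.g. `optimizers.SGD` -> `torch_sgd.SGD`), so the
+        # fused `torch._foreach_*` update path is used transparently.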
+ from keras_core import optimizers + from keras_core.backend.torch.optimizers import torch_sgd + + OPTIMIZERS = {optimizers.SGD: torch_sgd.SGD} + if cls in OPTIMIZERS: + return OPTIMIZERS[cls](*args, **kwargs) + return super().__new__(cls) + + def _apply_weight_decay(self, variables): + if self.weight_decay is None: + return + + torch._foreach_mul_( + [v.value for v in variables if self._use_weight_decay(v)], + 1 - self.weight_decay * self._get_current_learning_rate(), + ) diff --git a/keras_core/backend/torch/optimizers/torch_sgd.py b/keras_core/backend/torch/optimizers/torch_sgd.py new file mode 100644 index 000000000..8bcee460e --- /dev/null +++ b/keras_core/backend/torch/optimizers/torch_sgd.py @@ -0,0 +1,43 @@ +import torch + +from keras_core import optimizers + + +class SGD(optimizers.SGD): + def _internal_apply_gradients(self, grads_and_vars): + grads, trainable_variables = zip(*grads_and_vars) + + self._parallel_update_step( + grads, + [v.value for v in trainable_variables], + self._get_current_learning_rate(), + ) + self.iterations.assign(self.iterations + 1) + + def _parallel_update_step( + self, + grads, + variables, + learning_rate, + ): + if self.momentum != 0: + bufs = [ + self.momentums[self._get_variable_index(variable.value)] + for variable in variables + ] + + for i in range(len(bufs)): + if bufs[i] is None: + bufs[i] = torch.clone(grads[i]).detach() + + torch._foreach_mul_(bufs, self.momentum) + torch._foreach_add_(bufs, grads, alpha=-learning_rate) + + if self.nesterov: + torch._foreach_add_(variables, grads, alpha=-learning_rate) + torch._foreach_add_(variables, bufs, alpha=self.momentum) + else: + torch._foreach_add_(variables, bufs) + + else: + torch._foreach_add_(variables, grads, alpha=-learning_rate) diff --git a/keras_core/layers/core/input_layer_test.py b/keras_core/layers/core/input_layer_test.py new file mode 100644 index 000000000..437864782 --- /dev/null +++ b/keras_core/layers/core/input_layer_test.py @@ -0,0 +1,88 @@ +import numpy as np + +from keras_core import testing +from keras_core.backend import KerasTensor +from keras_core.layers import InputLayer + + +class InputLayerTest(testing.TestCase): + # Testing happy path for layer without input tensor + def test_input_basic(self): + input_shape = (2, 3) + batch_size = 4 + dtype = "float32" + ndim = len(tuple((batch_size,) + input_shape)) + + values = InputLayer( + shape=input_shape, batch_size=batch_size, dtype=dtype + ) + + self.assertEqual(values.dtype, dtype) + self.assertEqual(values.batch_shape[0], batch_size) + self.assertEqual(values.batch_shape[1:], input_shape) + self.assertEqual(values.trainable, True) + self.assertIsInstance(values.output, KerasTensor) + self.assertEqual(values.output.ndim, ndim) + self.assertEqual(values.output.dtype, dtype) + + # Testing shape is not None and batch_shape is not None condition + def test_input_error1(self): + input_shape = (2, 3) + + with self.assertRaisesRegex( + ValueError, "cannot pass both `shape` and `batch_shape`" + ): + InputLayer(shape=input_shape, batch_shape=input_shape) + + # Testing batch_size is not None and batch_shape is not None + def test_input_error2(self): + input_shape = (2, 3) + batch_size = 4 + + with self.assertRaisesRegex( + ValueError, "cannot pass both `batch_size` and `batch_shape`" + ): + InputLayer(batch_size=batch_size, batch_shape=input_shape) + + # Testing shape is None and batch_shape is None + def test_input_error3(self): + with self.assertRaisesRegex(ValueError, "pass a `shape` argument."): + InputLayer(shape=None, 
batch_shape=None) + + # Testing Input tensor is not Keras tensor + def test_input_tensor_error(self): + input_shape = (2, 3) + batch_size = 4 + input_tensor = np.zeros(input_shape) + + with self.assertRaisesRegex( + ValueError, "Argument `input_tensor` must be a KerasTensor" + ): + InputLayer( + shape=input_shape, + batch_size=batch_size, + input_tensor=input_tensor, + ) + + # Testing happy path for layer with input tensor + def testing_input_tensor(self): + input_shape = (2, 3) + batch_size = 4 + dtype = "float32" + input_tensor = KerasTensor(shape=input_shape, dtype=dtype) + + values = InputLayer( + shape=input_shape, + batch_size=batch_size, + input_tensor=input_tensor, + dtype=dtype, + ) + + self.assertEqual(values.dtype, dtype) + self.assertEqual(values.batch_shape[0], batch_size) + self.assertEqual(values.batch_shape[1:], input_shape) + self.assertEqual(values.trainable, True) + self.assertIsInstance(values.output, KerasTensor) + self.assertEqual(values.output, input_tensor) + self.assertEqual(values.output.ndim, input_tensor.ndim) + self.assertEqual(values.output.dtype, dtype) diff --git a/keras_core/layers/layer.py b/keras_core/layers/layer.py index 4080fbff8..a7fdf0c11 100644 --- a/keras_core/layers/layer.py +++ b/keras_core/layers/layer.py @@ -84,8 +84,8 @@ class Layer(BackendLayer, Operation): Attributes: name: The name of the layer (string). - dtype: The dtype of the layer's weights. - variable_dtype: Dtype of the layer's variables. + dtype: Dtype of the layer's weights. Alias of `layer.variable_dtype`. + variable_dtype: Dtype of the layer's weights. compute_dtype: The dtype of the layer's computations. Layers automatically cast inputs to this dtype, which causes the computations and output to also be in this dtype. @@ -374,21 +374,19 @@ class Layer(BackendLayer, Operation): constraint=None, name=None, ): - # TODO: handle layout - self._check_super_called() - initializer = initializers.get(initializer) - variable = backend.Variable( - initializer=initializer, + """Add a weight variable to the layer. + + Alias of `add_weight()`. + """ + return self.add_weight( shape=shape, - dtype=dtype or self.variable_dtype, + initializer=initializer, + dtype=dtype, trainable=trainable, + regularizer=regularizer, + constraint=constraint, name=name, ) - # Will be added to layer.losses - variable.regularizer = regularizer - variable.constraint = constraint - self._track_variable(variable) - return variable def add_weight( self, @@ -402,8 +400,6 @@ class Layer(BackendLayer, Operation): ): """Add a weight variable to the layer. - Alias of `add_variable()`. - Args: shape: Shape tuple for the variable. Must be fully-defined (no `None` entries). @@ -422,15 +418,21 @@ class Layer(BackendLayer, Operation): name: String name of the variable. Useful for debugging purposes. """ - return self.add_variable( - shape=shape, + # TODO: handle layout + self._check_super_called() + initializer = initializers.get(initializer) + variable = backend.Variable( initializer=initializer, - dtype=dtype, + shape=shape, + dtype=dtype or self.variable_dtype, trainable=trainable, - regularizer=regularizer, - constraint=constraint, name=name, ) + # Will be added to layer.losses + variable.regularizer = regularizer + variable.constraint = constraint + self._track_variable(variable) + return variable @property def trainable(self): @@ -459,9 +461,13 @@ class Layer(BackendLayer, Operation): @property def variables(self): - # Return only weights/rng state/metric variables - # of all Layers, recursively. - # Also deduplicate them. 
+ """List of all layer state, including metric variables and random seeds. + + This extends `layer.weights` to include all state used by the layer + including state for metrics and `SeedGenerator`s. + """ + # Return all `Variables` associate with the layer including metrics + # and random seeds. Also deduplicate them. variables = [] seen_ids = set() for v in self._trainable_variables + self._non_trainable_variables: @@ -481,20 +487,32 @@ class Layer(BackendLayer, Operation): @property def trainable_variables(self): + """List of all trainable layer state. + + This is equivalent to `layer.trainable_weights`. + """ if not self.trainable: return [] return [v for v in self.variables if v.trainable] @property def non_trainable_variables(self): + """List of all non-trainable layer state. + + This extends `layer.non_trainable_weights` to include all state used by + the layer including state for metrics and `SeedGenerator`s. + """ if not self.trainable: return self.variables return [v for v in self.variables if not v.trainable] @property def weights(self): - """List of weight variables of the layer.""" - # Return only "own weights" of all Layers, recursively. + """List of all weight variables of the layer. + + Unlike, `layer.variables` this excludes metric state and random seeds. + """ + # Return only `Variables` directly owned by layers and sub-layers. # Also deduplicate them. weights = [] seen_ids = set() @@ -511,10 +529,9 @@ class Layer(BackendLayer, Operation): @property def trainable_weights(self): - """List of trainable weight variables of the layer. + """List of all trainable weight variables of the layer. - These are the weights that get updated by the optimizer - during training. + These are the weights that get updated by the optimizer during training. """ if not self.trainable: return [] @@ -522,10 +539,11 @@ class Layer(BackendLayer, Operation): @property def non_trainable_weights(self): - """List of non-trainable weight variables of the layer. + """List of all non-trainable weight variables of the layer. - Non-trainable weights may include batch normalization statistics, - metric variables, or RNG seed variables. + These are the weights that should not be updated by the optimizer during + training. Unlike, `layer.non_trainable_variables` this excludes metric + state and random seeds. """ if not self.trainable: return self.weights @@ -555,7 +573,7 @@ class Layer(BackendLayer, Operation): @property def dtype(self): - """The dtype of the state (weights) of the layer.""" + """Alias of `layer.variable_dtype`.""" return self.variable_dtype @property diff --git a/keras_core/layers/preprocessing/random_crop.py b/keras_core/layers/preprocessing/random_crop.py index 5a1889413..4d1653de3 100644 --- a/keras_core/layers/preprocessing/random_crop.py +++ b/keras_core/layers/preprocessing/random_crop.py @@ -1,15 +1,14 @@ +import numpy as np + from keras_core import backend -from keras_core import ops from keras_core.api_export import keras_core_export -from keras_core.layers.preprocessing.tf_data_layer import TFDataLayer -from keras_core.random.seed_generator import SeedGenerator +from keras_core.layers.layer import Layer from keras_core.utils import backend_utils -from keras_core.utils import image_utils -from keras_core.utils import dtype_utils +from keras_core.utils.module_utils import tensorflow as tf @keras_core_export("keras_core.layers.RandomCrop") -class RandomCrop(TFDataLayer): +class RandomCrop(Layer): """A preprocessing layer which randomly crops images during training. 
During training, this layer will randomly choose a location to crop images @@ -53,128 +52,41 @@ class RandomCrop(TFDataLayer): `name` and `dtype`. """ - def __init__( - self, height, width, seed=None, data_format=None, name=None, **kwargs - ): + def __init__(self, height, width, seed=None, name=None, **kwargs): + if not tf.available: + raise ImportError( + "Layer RandomCrop requires TensorFlow. " + "Install it via `pip install tensorflow`." + ) + super().__init__(name=name, **kwargs) - self.height = height - self.width = width self.seed = seed or backend.random.make_default_seed() - self.seed_generator = SeedGenerator(seed) - self.data_format = backend.standardize_data_format(data_format) - - if self.data_format == "channels_first": - self.heigh_axis = -2 - self.width_axis = -1 - elif self.data_format == "channels_last": - self.height_axis = -3 - self.width_axis = -2 - + self.layer = tf.keras.layers.RandomCrop( + height=height, + width=width, + seed=self.seed, + name=name, + ) self.supports_masking = False self.supports_jit = False self._convert_input_args = False self._allow_non_tensor_positional_args = True def call(self, inputs, training=True): - inputs = self.backend.cast(inputs, self.compute_dtype) - input_shape = self.backend.shape(inputs) - is_batched = len(input_shape) > 3 - inputs = ( - self.backend.numpy.expand_dims(inputs, axis=0) - if not is_batched - else inputs - ) - - h_diff = input_shape[self.height_axis] - self.height - w_diff = input_shape[self.width_axis] - self.width - - def random_crop(): - # input_dtype_max = (2 ** dtype_utils.dtype_size(inputs.dtype)) - 1 - input_height, input_width = ( - input_shape[self.height_axis], - input_shape[self.width_axis], - ) - - h_start = self.backend.cast( - ops.random.uniform( - (), - 0, - maxval=float(input_height - self.height + 1), - dtype=inputs.dtype, - seed=self.seed_generator, - ), - h_diff.dtype, - ) - w_start = self.backend.cast( - ops.random.uniform( - (), - 0, - maxval=float(input_width - self.width + 1), - dtype=inputs.dtype, - seed=self.seed_generator, - ), - h_diff.dtype, - ) - # rands = ops.random.uniform( - # [2], 0, input_dtype_max, inputs.dtype, seed=self.seed_generator - # ) - # original_dtype = h_diff.dtype - # h_start = self.backend.cast( - # rands[0] % self.backend.cast((h_diff + 1), self.compute_dtype), - # original_dtype, - # ) - # w_start = self.backend.cast( - # rands[1] % self.backend.cast((w_diff + 1), self.compute_dtype), - # original_dtype, - # ) - if self.data_format == "channels_last": - return inputs[ - :, - h_start : h_start + self.height, - w_start : w_start + self.width, - ] - else: - return inputs[ - :, - :, - h_start : h_start + self.height, - w_start : w_start + self.width, - ] - - def resize(): - outputs = image_utils.smart_resize( - inputs, - [self.height, self.width], - data_format=self.data_format, - backend_module=self.backend, - ) - # smart_resize will always output float32, so we need to re-cast. 
- return self.backend.cast(outputs, self.compute_dtype) - - outputs = self.backend.cond( - self.backend.numpy.all((training, h_diff >= 0, w_diff >= 0)), - random_crop, - resize, - ) - - if self.backend != "tensorflow" and not backend_utils.in_tf_graph(): - outputs = self.backend.convert_to_tensor(outputs) + if not isinstance(inputs, (tf.Tensor, np.ndarray, list, tuple)): + inputs = tf.convert_to_tensor(backend.convert_to_numpy(inputs)) + outputs = self.layer.call(inputs, training=training) + if ( + backend.backend() != "tensorflow" + and not backend_utils.in_tf_graph() + ): + outputs = backend.convert_to_tensor(outputs) return outputs - def compute_output_shape(self, input_shape, *args, **kwargs): - input_shape = list(input_shape) - input_shape[self.height_axis] = self.height - input_shape[self.width_axis] = self.width - return tuple(input_shape) + def compute_output_shape(self, input_shape): + return tuple(self.layer.compute_output_shape(input_shape)) def get_config(self): - config = super().get_config() - config.update( - { - "height": self.height, - "width": self.width, - "seed": self.seed, - "data_format": self.data_format, - } - ) + config = self.layer.get_config() + config.update({"seed": self.seed}) return config diff --git a/keras_core/layers/rnn/gru.py b/keras_core/layers/rnn/gru.py index 7712fae0a..f9a582eb3 100644 --- a/keras_core/layers/rnn/gru.py +++ b/keras_core/layers/rnn/gru.py @@ -522,7 +522,7 @@ class GRU(RNN): # implementation of the inner GRU loop. In the case of # TF for instance, it will leverage cuDNN when feasible, and # it will raise NotImplementedError otherwise. - return backend.gru( + out = backend.gru( sequences, initial_state, mask, @@ -536,6 +536,11 @@ class GRU(RNN): unroll=self.unroll, reset_after=self.cell.reset_after, ) + # We disable jit_compile for the model in this case, + # since cuDNN ops aren't XLA compatible. + if backend.backend() == "tensorflow": + self.supports_jit = False + return out except NotImplementedError: pass return super().inner_loop( diff --git a/keras_core/layers/rnn/lstm.py b/keras_core/layers/rnn/lstm.py index bcd8efdf6..cf5bf3d27 100644 --- a/keras_core/layers/rnn/lstm.py +++ b/keras_core/layers/rnn/lstm.py @@ -502,7 +502,7 @@ class LSTM(RNN): # implementation of the inner LSTM loop. In the case of # TF for instance, it will leverage cuDNN when feasible, and # it will raise NotImplementedError otherwise. - return backend.lstm( + out = backend.lstm( sequences, initial_state[0], initial_state[1], @@ -516,6 +516,11 @@ class LSTM(RNN): go_backwards=self.go_backwards, unroll=self.unroll, ) + # We disable jit_compile for the model in this case, + # since cuDNN ops aren't XLA compatible. 
+ if backend.backend() == "tensorflow": + self.supports_jit = False + return out except NotImplementedError: pass return super().inner_loop( diff --git a/keras_core/ops/symbolic_arguments.py b/keras_core/ops/symbolic_arguments.py index 3ae7a86b4..4fdf9ed41 100644 --- a/keras_core/ops/symbolic_arguments.py +++ b/keras_core/ops/symbolic_arguments.py @@ -5,7 +5,7 @@ from keras_core.backend import KerasTensor class SymbolicArguments: def __init__(self, *args, **kwargs): - # TODO: validation + self.args = tree.map_structure(lambda x: x, args) self.kwargs = tree.map_structure(lambda x: x, kwargs) self._flat_arguments = tree.flatten((self.args, self.kwargs)) diff --git a/keras_core/optimizers/optimizer.py b/keras_core/optimizers/optimizer.py index 4ef62e133..643229815 100644 --- a/keras_core/optimizers/optimizer.py +++ b/keras_core/optimizers/optimizer.py @@ -3,9 +3,13 @@ from keras_core.api_export import keras_core_export from keras_core.optimizers import base_optimizer if backend.backend() == "tensorflow": - from keras_core.backend.tensorflow import optimizer as tf_optimizer + from keras_core.backend.tensorflow.optimizer import TFOptimizer - BackendOptimizer = tf_optimizer.TFOptimizer + BackendOptimizer = TFOptimizer +elif backend.backend() == "torch": + from keras_core.backend.torch.optimizers import TorchOptimizer + + BackendOptimizer = TorchOptimizer else: BackendOptimizer = base_optimizer.BaseOptimizer diff --git a/keras_core/optimizers/sgd_test.py b/keras_core/optimizers/sgd_test.py index f40683131..191ed3c1d 100644 --- a/keras_core/optimizers/sgd_test.py +++ b/keras_core/optimizers/sgd_test.py @@ -21,7 +21,7 @@ class SGDTest(testing.TestCase): def test_single_step(self): optimizer = SGD(learning_rate=0.5) self.assertEqual(len(optimizer.variables), 2) - grads = np.array([1.0, 6.0, 7.0, 2.0]) + grads = ops.array([1.0, 6.0, 7.0, 2.0]) vars = backend.Variable([1.0, 2.0, 3.0, 4.0]) optimizer.build([vars]) optimizer.apply_gradients(zip([grads], [vars])) @@ -32,7 +32,7 @@ class SGDTest(testing.TestCase): def test_weight_decay(self): grads, var1, var2, var3 = ( - np.zeros(()), + ops.zeros(()), backend.Variable(2.0), backend.Variable(2.0, name="exclude"), backend.Variable(2.0), @@ -56,8 +56,8 @@ class SGDTest(testing.TestCase): optimizer = SGD(nesterov=True) x = backend.Variable(np.ones([10])) - grads = np.arange(0.1, 1.1, 0.1) - first_grads = np.full((10,), 0.01) + grads = ops.arange(0.1, 1.1, 0.1) + first_grads = ops.full((10,), 0.01) # fmt: off golden = np.array(