Sync OSS keras to head.

PiperOrigin-RevId: 347838100
Scott Zhu 2020-12-16 09:34:29 -08:00 committed by TensorFlower Gardener
parent af1a2eb1f5
commit f0c0c877ba
10 changed files with 408 additions and 350 deletions

@@ -1,7 +1,10 @@
# Description:
#   Contains the Keras engine API (internal TensorFlow version).

# buildifier: disable=same-origin-load
load("@org_keras//keras:keras.bzl", "tf_py_test")

# buildifier: disable=same-origin-load
load("@org_keras//keras:keras.bzl", "cuda_py_test")

package(

@@ -27,8 +27,8 @@ import shutil
from absl.testing import parameterized
import numpy as np
from tensorflow.core.protobuf import rewriter_config_pb2

import keras
from tensorflow.python.framework import test_util as tf_test_util
from keras import combinations
from keras import keras_parameterized
from keras import testing_utils

@@ -12,8 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Locally-connected layers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -59,79 +58,61 @@ class LocallyConnected1D(Layer):
  ```
  Arguments:
    filters: Integer, the dimensionality of the output space (i.e. the number
      of output filters in the convolution).
    kernel_size: An integer or tuple/list of a single integer, specifying the
      length of the 1D convolution window.
    strides: An integer or tuple/list of a single integer, specifying the
      stride length of the convolution.
    padding: Currently only supports `"valid"` (case-insensitive). `"same"`
      may be supported in the future. `"valid"` means no padding.
    data_format: A string, one of `channels_last` (default) or
      `channels_first`. The ordering of the dimensions in the inputs.
      `channels_last` corresponds to inputs with shape `(batch, length,
      channels)` while `channels_first` corresponds to inputs with shape
      `(batch, channels, length)`. It defaults to the `image_data_format`
      value found in your Keras config file at `~/.keras/keras.json`. If you
      never set it, then it will be "channels_last".
    activation: Activation function to use. If you don't specify anything, no
      activation is applied (i.e. "linear" activation: `a(x) = x`).
    use_bias: Boolean, whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix.
    bias_initializer: Initializer for the bias vector.
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix.
    bias_regularizer: Regularizer function applied to the bias vector.
    activity_regularizer: Regularizer function applied to the output of the
      layer (its "activation").
    kernel_constraint: Constraint function applied to the kernel matrix.
    bias_constraint: Constraint function applied to the bias vector.
    implementation: implementation mode, either `1`, `2`, or `3`. `1` loops
      over input spatial locations to perform the forward pass. It is
      memory-efficient but performs a lot of (small) ops. `2` stores layer
      weights in a dense but sparsely-populated 2D matrix and implements the
      forward pass as a single matrix-multiply. It uses a lot of RAM but
      performs few (large) ops. `3` stores layer weights in a sparse tensor
      and implements the forward pass as a single sparse matrix-multiply.
      How to choose:
        `1`: large, dense models,
        `2`: small models,
        `3`: large, sparse models, where "large" stands for large
          input/output activations (i.e. many `filters`, `input_filters`,
          large `input_size`, `output_size`), and "sparse" stands for few
          connections between inputs and outputs, i.e. small ratio `filters *
          input_filters * kernel_size / (input_size * strides)`, where inputs
          to and outputs of the layer are assumed to have shapes `(input_size,
          input_filters)`, `(output_size, filters)` respectively. It is
          recommended to benchmark each in the setting of interest to pick the
          most efficient one (in terms of speed and memory usage). Correct
          choice of implementation can lead to dramatic speed improvements
          (e.g. 50X), potentially at the expense of RAM. Also, only
          `padding="valid"` is supported by `implementation=1`.
  Input shape:
    3D tensor with shape: `(batch_size, steps, input_dim)`
  Output shape:
    3D tensor with shape: `(batch_size, new_steps, filters)`. The `steps` value
      might have changed due to padding or strides.
  """

  def __init__(self,
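To make the shape rules above concrete, here is a minimal usage sketch (the layer and arguments follow the docstring above; the input shapes and `implementation` choice are illustrative, not part of the original diff):

```python
import numpy as np
import tensorflow as tf

# Unshared 1D convolution: each of the `new_steps` output positions has its
# own kernel, so weights are not reused along the sequence.
layer = tf.keras.layers.LocallyConnected1D(
    filters=4, kernel_size=3, strides=1, padding='valid',
    implementation=2)  # dense-but-masked matmul; a reasonable default for small models

x = np.random.rand(8, 10, 6).astype('float32')  # (batch, steps, input_dim)
y = layer(x)
print(y.shape)  # (8, 8, 4): new_steps = 10 - 3 + 1 with "valid" padding
```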
@@ -158,8 +139,8 @@ class LocallyConnected1D(Layer):
    self.padding = conv_utils.normalize_padding(padding)
    if self.padding != 'valid' and implementation == 1:
      raise ValueError('Invalid border mode for LocallyConnected1D '
                       '(only "valid" is supported if implementation is 1): ' +
                       padding)
    self.data_format = conv_utils.normalize_data_format(data_format)
    self.activation = activations.get(activation)
    self.use_bias = use_bias
@@ -181,10 +162,13 @@ class LocallyConnected1D(Layer):
    input_dim, input_length = input_shape[2], input_shape[1]

    if input_dim is None:
      raise ValueError(
          'Axis 2 of input should be fully-defined. '
          'Found shape:', input_shape)
    self.output_length = conv_utils.conv_output_length(input_length,
                                                       self.kernel_size[0],
                                                       self.padding,
                                                       self.strides[0])

    if self.implementation == 1:
      self.kernel_shape = (self.output_length, self.kernel_size[0] * input_dim,
@@ -199,17 +183,18 @@ class LocallyConnected1D(Layer):
    elif self.implementation == 2:
      if self.data_format == 'channels_first':
        self.kernel_shape = (input_dim, input_length, self.filters,
                             self.output_length)
      else:
        self.kernel_shape = (input_length, input_dim, self.output_length,
                             self.filters)

      self.kernel = self.add_weight(
          shape=self.kernel_shape,
          initializer=self.kernel_initializer,
          name='kernel',
          regularizer=self.kernel_regularizer,
          constraint=self.kernel_constraint)

      self.kernel_mask = get_locallyconnected_mask(
          input_shape=(input_length,),
@@ -231,8 +216,7 @@ class LocallyConnected1D(Layer):
              padding=self.padding,
              filters_in=input_dim,
              filters_out=self.filters,
              data_format=self.data_format))

      self.kernel = self.add_weight(
          shape=(len(self.kernel_idxs),),
@@ -242,8 +226,8 @@ class LocallyConnected1D(Layer):
          constraint=self.kernel_constraint)
    else:
      raise ValueError('Unrecognized implementation mode: %d.' %
                       self.implementation)

    if self.use_bias:
      self.bias = self.add_weight(
@@ -291,8 +275,8 @@ class LocallyConnected1D(Layer):
                                self.compute_output_shape(inputs.shape))
    else:
      raise ValueError('Unrecognized implementation mode: %d.' %
                       self.implementation)

    if self.use_bias:
      output = K.bias_add(output, self.bias, data_format=self.data_format)
@@ -366,87 +350,71 @@ class LocallyConnected2D(Layer):
  ```
  Arguments:
    filters: Integer, the dimensionality of the output space (i.e. the number
      of output filters in the convolution).
    kernel_size: An integer or tuple/list of 2 integers, specifying the width
      and height of the 2D convolution window. Can be a single integer to
      specify the same value for all spatial dimensions.
    strides: An integer or tuple/list of 2 integers, specifying the strides of
      the convolution along the width and height. Can be a single integer to
      specify the same value for all spatial dimensions.
    padding: Currently only supports `"valid"` (case-insensitive). `"same"`
      will be supported in future. `"valid"` means no padding.
    data_format: A string, one of `channels_last` (default) or
      `channels_first`. The ordering of the dimensions in the inputs.
      `channels_last` corresponds to inputs with shape `(batch, height, width,
      channels)` while `channels_first` corresponds to inputs with shape
      `(batch, channels, height, width)`. It defaults to the
      `image_data_format` value found in your Keras config file at
      `~/.keras/keras.json`. If you never set it, then it will be
      "channels_last".
    activation: Activation function to use. If you don't specify anything, no
      activation is applied (i.e. "linear" activation: `a(x) = x`).
    use_bias: Boolean, whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix.
    bias_initializer: Initializer for the bias vector.
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix.
    bias_regularizer: Regularizer function applied to the bias vector.
    activity_regularizer: Regularizer function applied to the output of the
      layer (its "activation").
    kernel_constraint: Constraint function applied to the kernel matrix.
    bias_constraint: Constraint function applied to the bias vector.
    implementation: implementation mode, either `1`, `2`, or `3`. `1` loops
      over input spatial locations to perform the forward pass. It is
      memory-efficient but performs a lot of (small) ops. `2` stores layer
      weights in a dense but sparsely-populated 2D matrix and implements the
      forward pass as a single matrix-multiply. It uses a lot of RAM but
      performs few (large) ops. `3` stores layer weights in a sparse tensor
      and implements the forward pass as a single sparse matrix-multiply.
      How to choose:
        `1`: large, dense models,
        `2`: small models,
        `3`: large, sparse models, where "large" stands for large
          input/output activations (i.e. many `filters`, `input_filters`,
          large `np.prod(input_size)`, `np.prod(output_size)`), and "sparse"
          stands for few connections between inputs and outputs, i.e. small
          ratio `filters * input_filters * np.prod(kernel_size) /
          (np.prod(input_size) * np.prod(strides))`, where inputs to and
          outputs of the layer are assumed to have shapes `input_size +
          (input_filters,)`, `output_size + (filters,)` respectively. It is
          recommended to benchmark each in the setting of interest to pick the
          most efficient one (in terms of speed and memory usage). Correct
          choice of implementation can lead to dramatic speed improvements
          (e.g. 50X), potentially at the expense of RAM. Also, only
          `padding="valid"` is supported by `implementation=1`.
  Input shape:
    4D tensor with shape: `(samples, channels, rows, cols)` if
      data_format='channels_first'
    or 4D tensor with shape: `(samples, rows, cols, channels)` if
      data_format='channels_last'.
  Output shape:
    4D tensor with shape: `(samples, filters, new_rows, new_cols)` if
      data_format='channels_first'
    or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if
      data_format='channels_last'. `rows` and `cols` values might have changed
      due to padding.
  """

  def __init__(self,
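As a quick check of the output-shape rule stated above, a short sketch for the `channels_last` case (shapes chosen arbitrarily, not taken from the diff):

```python
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.LocallyConnected2D(
    filters=3, kernel_size=(3, 3), data_format='channels_last')

x = np.random.rand(2, 8, 8, 4).astype('float32')  # (samples, rows, cols, channels)
y = layer(x)
# "valid" padding: new_rows = 8 - 3 + 1 = 6 and new_cols = 6
print(y.shape)  # (2, 6, 6, 3)
```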
@@ -473,8 +441,8 @@ class LocallyConnected2D(Layer):
    self.padding = conv_utils.normalize_padding(padding)
    if self.padding != 'valid' and implementation == 1:
      raise ValueError('Invalid border mode for LocallyConnected2D '
                       '(only "valid" is supported if implementation is 1): ' +
                       padding)
    self.data_format = conv_utils.normalize_data_format(data_format)
    self.activation = activations.get(activation)
    self.use_bias = use_bias
@@ -509,10 +477,8 @@ class LocallyConnected2D(Layer):
    self.output_col = output_col
    if self.implementation == 1:
      self.kernel_shape = (output_row * output_col, self.kernel_size[0] *
                           self.kernel_size[1] * input_filter, self.filters)

      self.kernel = self.add_weight(
          shape=self.kernel_shape,
@@ -523,17 +489,18 @@ class LocallyConnected2D(Layer):
    elif self.implementation == 2:
      if self.data_format == 'channels_first':
        self.kernel_shape = (input_filter, input_row, input_col, self.filters,
                             self.output_row, self.output_col)
      else:
        self.kernel_shape = (input_row, input_col, input_filter,
                             self.output_row, self.output_col, self.filters)

      self.kernel = self.add_weight(
          shape=self.kernel_shape,
          initializer=self.kernel_initializer,
          name='kernel',
          regularizer=self.kernel_regularizer,
          constraint=self.kernel_constraint)

      self.kernel_mask = get_locallyconnected_mask(
          input_shape=(input_row, input_col),
@@ -555,8 +522,7 @@ class LocallyConnected2D(Layer):
              padding=self.padding,
              filters_in=input_filter,
              filters_out=self.filters,
              data_format=self.data_format))

      self.kernel = self.add_weight(
          shape=(len(self.kernel_idxs),),
@@ -566,8 +532,8 @@ class LocallyConnected2D(Layer):
          constraint=self.kernel_constraint)
    else:
      raise ValueError('Unrecognized implementation mode: %d.' %
                       self.implementation)

    if self.use_bias:
      self.bias = self.add_weight(
@@ -619,8 +585,8 @@ class LocallyConnected2D(Layer):
                                self.compute_output_shape(inputs.shape))
    else:
      raise ValueError('Unrecognized implementation mode: %d.' %
                       self.implementation)

    if self.use_bias:
      output = K.bias_add(output, self.bias, data_format=self.data_format)
@@ -686,10 +652,10 @@ def get_locallyconnected_mask(input_shape, kernel_shape, strides, padding,
  `strides`, `padding` and `data_format`.

  Arguments:
    input_shape: tuple of size N: `(d_in1, ..., d_inN)` spatial shape of the
      input.
    kernel_shape: tuple of size N, spatial shape of the convolutional kernel /
      receptive field.
    strides: tuple of size N, strides along each spatial dimension.
    padding: type of padding, string `"same"` or `"valid"`.
    data_format: a string, `"channels_first"` or `"channels_last"`.
@@ -709,8 +675,7 @@ def get_locallyconnected_mask(input_shape, kernel_shape, strides, padding,
      input_shape=input_shape,
      kernel_shape=kernel_shape,
      strides=strides,
      padding=padding)

  ndims = int(mask.ndim / 2)
@@ -739,34 +704,26 @@ def local_conv_matmul(inputs, kernel, kernel_mask, output_shape):
  reshapes to make `inputs` and `kernel` 2-D and `output` (N+2)-D.

  Arguments:
    inputs: (N+2)-D tensor with shape `(batch_size, channels_in, d_in1, ...,
      d_inN)` or `(batch_size, d_in1, ..., d_inN, channels_in)`.
    kernel: the unshared weights for N-D convolution,
      an (N+2)-D tensor of shape: `(d_in1, ..., d_inN, channels_in, d_out2,
      ..., d_outN, channels_out)` or `(channels_in, d_in1, ..., d_inN,
      channels_out, d_out2, ..., d_outN)`, with the ordering of channels
      and spatial dimensions matching that of the input. Each entry is the
      weight between a particular input and output location, similarly to
      a fully-connected weight matrix.
    kernel_mask: a float 0/1 mask tensor of shape: `(d_in1, ..., d_inN, 1,
      d_out2, ..., d_outN, 1)` or `(1, d_in1, ..., d_inN, 1, d_out2, ...,
      d_outN)`, with the ordering of singleton and spatial dimensions matching
      that of the input. Mask represents the connectivity pattern of the layer
      and is precomputed elsewhere based on layer parameters: stride, padding,
      and the receptive field shape.
    output_shape: a tuple of (N+2) elements representing the output shape:
      `(batch_size, channels_out, d_out1, ..., d_outN)` or `(batch_size,
      d_out1, ..., d_outN, channels_out)`, with the ordering of channels and
      spatial dimensions matching that of the input.

  Returns:
    Output (N+2)-D tensor with shape `output_shape`.
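The masked-matmul idea described above (implementation mode `2`) can be illustrated with plain NumPy. This is a hedged sketch of the concept only, not the Keras code path; the names (`mask`, `kernel`, the flattening order) are local to the example:

```python
import numpy as np

batch, d_in, c_in, d_out, c_out, k = 2, 5, 3, 3, 4, 3

# Dense but sparsely-populated kernel: one weight per (input location,
# output location) pair, zeroed wherever the output position's receptive
# field does not cover the input position.
kernel = np.random.rand(d_in * c_in, d_out * c_out)
mask = np.zeros((d_in, d_out))
for j in range(d_out):                         # "valid" padding, stride 1
  mask[j:j + k, j] = 1.0
mask = np.kron(mask, np.ones((c_in, c_out)))   # replicate over channel pairs

inputs = np.random.rand(batch, d_in * c_in)    # flattened (d_in1, channels_in)
outputs = inputs @ (kernel * mask)             # the single matrix-multiply
print(outputs.reshape(batch, d_out, c_out).shape)  # (2, 3, 4)
```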
@@ -777,8 +734,9 @@ def local_conv_matmul(inputs, kernel, kernel_mask, output_shape):
  kernel = make_2d(kernel, split_dim=K.ndim(kernel) // 2)

  output_flat = tf.compat.v1.sparse_matmul(inputs_flat, kernel, b_is_sparse=True)
  output = K.reshape(output_flat, [
      K.shape(output_flat)[0],
  ] + output_shape.as_list()[1:])
  return output
@@ -810,14 +768,16 @@ def local_conv_sparse_matmul(inputs, kernel, kernel_idxs, kernel_shape,
  """
  inputs_flat = K.reshape(inputs, (K.shape(inputs)[0], -1))
  output_flat = tf.raw_ops.SparseTensorDenseMatMul(
      a_indices=kernel_idxs,
      a_values=kernel,
      a_shape=kernel_shape,
      b=inputs_flat,
      adjoint_b=True)
  output_flat_transpose = K.transpose(output_flat)

  output_reshaped = K.reshape(output_flat_transpose, [
      K.shape(output_flat_transpose)[0],
  ] + output_shape.as_list()[1:])
  return output_reshaped
@@ -830,7 +790,7 @@ def make_2d(tensor, split_dim):
  Arguments:
    tensor: a tensor of shape `(d0, ..., d(N-1))`.
    split_dim: an integer from 1 to N-1, index of the dimension to group
      dimensions before (excluding) and after (including).

  Returns:
    Tensor of shape
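A small sketch of the `split_dim` grouping described above (the shapes are hypothetical; this simply mirrors what reshaping to 2-D produces):

```python
import numpy as np

t = np.zeros((2, 3, 4, 5))      # (d0, d1, d2, d3)
split_dim = 2                   # group dims before (excluding) and after (including)
t2d = t.reshape(2 * 3, 4 * 5)   # dims [0, split_dim) -> rows, [split_dim, N) -> cols
print(t2d.shape)                # (6, 20)
```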

@@ -28,8 +28,8 @@ import time
from absl.testing import parameterized
import numpy as np
from tensorflow.core.protobuf import rewriter_config_pb2

import keras
from tensorflow.python.framework import test_util as tf_test_util
from keras import keras_parameterized
from keras import testing_utils
from keras.layers import recurrent as rnn_v1

@@ -26,6 +26,7 @@ from absl.testing import parameterized
import numpy as np

import keras
from tensorflow.python.framework import test_util as tf_test_util
from keras import combinations
from keras import keras_parameterized
from keras import testing_utils
@@ -33,8 +34,6 @@ from keras.engine import base_layer_utils
from keras.layers import core
from keras.layers.rnn_cell_wrapper_v2 import ResidualWrapper
from keras.utils import generic_utils
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.training.tracking import util as trackable_util
@@ -653,7 +652,7 @@ class BidirectionalTest(tf.test.TestCase, parameterized.TestCase):
      model.compile(loss='mse', optimizer='sgd')
      model.fit(x, y, epochs=1, batch_size=1)

    if tf.executing_eagerly():
      run_test()
    else:
      tf_test_util.enable_output_all_intermediates(run_test)()

@@ -12,8 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Built-in loss functions."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -85,8 +84,8 @@ class Loss(object):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op.
    """
    losses_utils.ReductionV2.validate(reduction)
@@ -115,15 +114,15 @@ class Loss(object):
        sparse loss functions such as sparse categorical crossentropy where
        shape = `[batch_size, d0, .. dN-1]`
      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`
      sample_weight: Optional `sample_weight` acts as a coefficient for the
        loss. If a scalar is provided, then the loss is simply scaled by the
        given value. If `sample_weight` is a tensor of size `[batch_size]`, then
        the total loss for each sample of the batch is rescaled by the
        corresponding element in the `sample_weight` vector. If the shape of
        `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be broadcasted to
        this shape), then each loss element of `y_pred` is scaled by the
        corresponding value of `sample_weight`. (Note on `dN-1`: all loss
        functions reduce by 1 dimension, usually axis=-1.)

    Returns:
      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has
@@ -223,8 +222,8 @@ class LossFunctionWrapper(Loss):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: (Optional) name for the loss.
      **kwargs: The keyword arguments that are passed on to `fn`.
    """
@@ -243,8 +242,7 @@ class LossFunctionWrapper(Loss):
      Loss values per sample.
    """
    if tf.is_tensor(y_pred) and tf.is_tensor(y_true):
      y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true)

    ag_fn = autograph.tf_convert(self.fn, ag_ctx.control_status_ctx())
    return ag_fn(y_true, y_pred, **self._fn_kwargs)
@@ -307,8 +305,8 @@ class MeanSquaredError(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'mean_squared_error'.
    """
    super(MeanSquaredError, self).__init__(
@@ -366,8 +364,8 @@ class MeanAbsoluteError(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'mean_absolute_error'.
    """
    super(MeanAbsoluteError, self).__init__(
@@ -426,8 +424,8 @@ class MeanAbsolutePercentageError(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to
        'mean_absolute_percentage_error'.
    """
@@ -487,8 +485,8 @@ class MeanSquaredLogarithmicError(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to
        'mean_squared_logarithmic_error'.
    """
@@ -500,44 +498,64 @@ class MeanSquaredLogarithmicError(LossFunctionWrapper):
class BinaryCrossentropy(LossFunctionWrapper):
  """Computes the cross-entropy loss between true labels and predicted labels.

  Use this cross-entropy loss for binary (0 or 1) classification applications.
  The loss function requires the following inputs:

  - `y_true` (true label): This is either 0 or 1.
  - `y_pred` (predicted value): This is the model's prediction, i.e., a single
    floating-point value which either represents a
    [logit](https://en.wikipedia.org/wiki/Logit) (i.e., value in [-inf, inf]
    when `from_logits=True`) or a probability (i.e., value in [0., 1.] when
    `from_logits=False`).

  **Recommended Usage:** (set `from_logits=True`)

  With `tf.keras` API:

  ```python
  model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    ....
  )
  ```

  As a standalone function:

  >>> # Example 1: (batch_size = 1, number of samples = 4)
  >>> y_true = [0, 1, 0, 0]
  >>> y_pred = [-18.6, 0.51, 2.94, -12.8]
  >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  >>> bce(y_true, y_pred).numpy()
  0.865

  >>> # Example 2: (batch_size = 2, number of samples = 4)
  >>> y_true = [[0, 1], [0, 0]]
  >>> y_pred = [[-18.6, 0.51], [2.94, -12.8]]
  >>> # Using default 'auto'/'sum_over_batch_size' reduction type.
  >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  >>> bce(y_true, y_pred).numpy()
  0.865
  >>> # Using 'sample_weight' attribute
  >>> bce(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy()
  0.243
  >>> # Using 'sum' reduction type.
  >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True,
  ...     reduction=tf.keras.losses.Reduction.SUM)
  >>> bce(y_true, y_pred).numpy()
  1.730
  >>> # Using 'none' reduction type.
  >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True,
  ...     reduction=tf.keras.losses.Reduction.NONE)
  >>> bce(y_true, y_pred).numpy()
  array([0.235, 1.496], dtype=float32)

  **Default Usage:** (set `from_logits=False`)

  >>> # Make the following updates to the above "Recommended Usage" section
  >>> # 1. Set `from_logits=False`
  >>> tf.keras.losses.BinaryCrossentropy() # OR ...(from_logits=False)
  >>> # 2. Update `y_pred` to use probabilities instead of logits
  >>> y_pred = [0.6, 0.3, 0.2, 0.8] # OR [[0.6, 0.3], [0.2, 0.8]]
  """

  def __init__(self,
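The `0.865` value quoted in the examples above can be reproduced by hand from the standard sigmoid cross-entropy formula; a short NumPy check (a sketch independent of the Keras implementation):

```python
import numpy as np

y_true = np.array([0., 1., 0., 0.])
logits = np.array([-18.6, 0.51, 2.94, -12.8])

# Numerically stable sigmoid cross-entropy per sample:
#   loss = max(z, 0) - z * y + log(1 + exp(-|z|))
per_sample = (np.maximum(logits, 0) - logits * y_true
              + np.log1p(np.exp(-np.abs(logits))))
print(per_sample.round(3))  # approximately [0.   , 0.47 , 2.992, 0.   ]
print(per_sample.mean())    # approximately 0.865
```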
@@ -563,8 +581,8 @@ class BinaryCrossentropy(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: (Optional) Name for the op. Defaults to 'binary_crossentropy'.
    """
    super(BinaryCrossentropy, self).__init__(
@@ -633,9 +651,9 @@ class CategoricalCrossentropy(LossFunctionWrapper):
        default, we assume that `y_pred` encodes a probability distribution.
        **Note - Using from_logits=True is more numerically stable.**
      label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
        meaning the confidence on label values is relaxed. For example, if
        `0.1`, use `0.1 / num_classes` for non-target labels and
        `0.9 + 0.1 / num_classes` for target labels.
      reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
        loss. Default value is `AUTO`. `AUTO` indicates that the reduction
        option will be determined by the usage context. For almost all cases
@@ -643,8 +661,8 @@ class CategoricalCrossentropy(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'categorical_crossentropy'.
    """
    super(CategoricalCrossentropy, self).__init__(
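The label-smoothing rule described above amounts to mixing the one-hot targets with a uniform distribution; a short sketch of the arithmetic (the values here are illustrative, not part of the diff):

```python
import numpy as np

num_classes = 3
label_smoothing = 0.1
y_true = np.array([0., 1., 0.])   # one-hot target

y_smooth = y_true * (1. - label_smoothing) + label_smoothing / num_classes
print(y_smooth)  # [0.0333..., 0.9333..., 0.0333...]
# target class -> 0.9 + 0.1 / num_classes, non-target classes -> 0.1 / num_classes
```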
@@ -720,8 +738,8 @@ class SparseCategoricalCrossentropy(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to
        'sparse_categorical_crossentropy'.
    """
@@ -784,8 +802,8 @@ class Hinge(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'hinge'.
    """
    super(Hinge, self).__init__(hinge, name=name, reduction=reduction)
@@ -845,8 +863,8 @@ class SquaredHinge(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'squared_hinge'.
    """
    super(SquaredHinge, self).__init__(
@@ -905,8 +923,8 @@ class CategoricalHinge(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'categorical_hinge'.
    """
    super(CategoricalHinge, self).__init__(
@@ -962,8 +980,8 @@ class Poisson(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'poisson'.
    """
    super(Poisson, self).__init__(poisson, name=name, reduction=reduction)
@@ -1019,8 +1037,8 @@ class LogCosh(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'log_cosh'.
    """
    super(LogCosh, self).__init__(log_cosh, name=name, reduction=reduction)
@@ -1079,8 +1097,8 @@ class KLDivergence(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'kl_divergence'.
    """
    super(KLDivergence, self).__init__(
@@ -1147,20 +1165,17 @@ class Huber(LossFunctionWrapper):
        `tf.distribute.Strategy`, outside of built-in training loops such as
        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
        will raise an error. Please see this custom training [tutorial](
          https://www.tensorflow.org/tutorials/distribute/custom_training) for
          more details.
      name: Optional name for the op. Defaults to 'huber_loss'.
    """
    super(Huber, self).__init__(
        huber, name=name, reduction=reduction, delta=delta)


@keras_export('keras.metrics.mean_squared_error', 'keras.metrics.mse',
              'keras.metrics.MSE', 'keras.losses.mean_squared_error',
              'keras.losses.mse', 'keras.losses.MSE')
@tf.__internal__.dispatch.add_dispatch_support
def mean_squared_error(y_true, y_pred):
  """Computes the mean squared error between labels and predictions.
@@ -1191,12 +1206,9 @@ def mean_squared_error(y_true, y_pred):
  return K.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)


@keras_export('keras.metrics.mean_absolute_error', 'keras.metrics.mae',
              'keras.metrics.MAE', 'keras.losses.mean_absolute_error',
              'keras.losses.mae', 'keras.losses.MAE')
@tf.__internal__.dispatch.add_dispatch_support
def mean_absolute_error(y_true, y_pred):
  """Computes the mean absolute error between labels and predictions.
@@ -1225,11 +1237,9 @@ def mean_absolute_error(y_true, y_pred):

@keras_export('keras.metrics.mean_absolute_percentage_error',
              'keras.metrics.mape', 'keras.metrics.MAPE',
              'keras.losses.mean_absolute_percentage_error',
              'keras.losses.mape', 'keras.losses.MAPE')
@tf.__internal__.dispatch.add_dispatch_support
def mean_absolute_percentage_error(y_true, y_pred):
  """Computes the mean absolute percentage error between `y_true` and `y_pred`.
@@ -1262,11 +1272,9 @@ def mean_absolute_percentage_error(y_true, y_pred):

@keras_export('keras.metrics.mean_squared_logarithmic_error',
              'keras.metrics.msle', 'keras.metrics.MSLE',
              'keras.losses.mean_squared_logarithmic_error',
              'keras.losses.msle', 'keras.losses.MSLE')
@tf.__internal__.dispatch.add_dispatch_support
def mean_squared_logarithmic_error(y_true, y_pred):
  """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
@@ -1511,7 +1519,9 @@ def categorical_crossentropy(y_true,
    y_pred: Tensor of predicted targets.
    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
      we assume that `y_pred` encodes a probability distribution.
    label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For
      example, if `0.1`, use `0.1 / num_classes` for non-target labels
      and `0.9 + 0.1 / num_classes` for target labels.

  Returns:
    Categorical crossentropy loss value.
@ -1582,7 +1592,9 @@ def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0):
y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.
from_logits: Whether `y_pred` is expected to be a logits tensor. By default, from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
we assume that `y_pred` encodes a probability distribution. we assume that `y_pred` encodes a probability distribution.
label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. label_smoothing: Float in [0, 1]. If > `0` then smooth the labels by
squeezing them towards 0.5. That is, using `1. - 0.5 * label_smoothing`
for the target class and `0.5 * label_smoothing` for the non-target class.
Returns: Returns:
Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`.
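Likewise for the binary case, the squeeze towards 0.5 described above reduces to a one-liner; a small sketch (helper name is illustrative):

def smooth_binary_labels(y_true, label_smoothing):
  # 1 -> 1 - 0.5 * label_smoothing, 0 -> 0.5 * label_smoothing.
  return y_true * (1. - label_smoothing) + 0.5 * label_smoothing

print(smooth_binary_labels(1., 0.1), smooth_binary_labels(0., 0.1))  # 0.95 0.05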
@ -1602,12 +1614,9 @@ def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0):
@keras_export('keras.metrics.kl_divergence', @keras_export('keras.metrics.kl_divergence',
'keras.metrics.kullback_leibler_divergence', 'keras.metrics.kullback_leibler_divergence', 'keras.metrics.kld',
'keras.metrics.kld', 'keras.metrics.KLD', 'keras.losses.kl_divergence',
'keras.metrics.KLD', 'keras.losses.kullback_leibler_divergence', 'keras.losses.kld',
'keras.losses.kl_divergence',
'keras.losses.kullback_leibler_divergence',
'keras.losses.kld',
'keras.losses.KLD') 'keras.losses.KLD')
@tf.__internal__.dispatch.add_dispatch_support @tf.__internal__.dispatch.add_dispatch_support
def kl_divergence(y_true, y_pred): def kl_divergence(y_true, y_pred):
@ -69,12 +69,11 @@ class AutoCastVariable(tf.Variable, core.Tensor):
called. called.
""" """
def __init__(self, variable, op=None): def __init__(self, variable):
"""Creates an AutoCastVariable instance. """Creates an AutoCastVariable instance.
Args: Args:
variable: A floating-point resource variable to wrap. variable: A floating-point resource variable to wrap.
op: Optional operation of this variable.
Raises: Raises:
ValueError: If `variable` is not a floating-point resource variable ValueError: If `variable` is not a floating-point resource variable
@ -86,7 +85,11 @@ class AutoCastVariable(tf.Variable, core.Tensor):
raise ValueError('variable must be a floating point variable but has ' raise ValueError('variable must be a floating point variable but has '
'type: %s' % variable.dtype.name) 'type: %s' % variable.dtype.name)
self._variable = variable self._variable = variable
self._op = op # 'delegate' means AutoCastVariable.op returns self._variable.op, which will
# raise an AttributeError in Eager (as intended). If set to any other value,
# AutoCastVariable.op returns that value instead, which is used to set the
# op attribute in AutoCastVariable.assign().
self._op = 'delegate'
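The string sentinel is used instead of `None` because `None` is itself a legitimate stored value (the op of an eager assign). A TensorFlow-free sketch of the pattern, with hypothetical names:

class _OpDelegatingWrapper(object):
  """Illustrative stand-in for the sentinel pattern described above."""

  def __init__(self, wrapped):
    self._wrapped = wrapped
    self._op = 'delegate'  # sentinel: fall through to the wrapped object

  @property
  def op(self):
    if self._op == 'delegate':
      return self._wrapped.op  # may raise AttributeError, as intended in eager
    return self._op  # explicitly set value, possibly None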
def _should_cast(self): def _should_cast(self):
"""Returns True if this variable should be casted when accessed.""" """Returns True if this variable should be casted when accessed."""
@ -211,10 +214,18 @@ class AutoCastVariable(tf.Variable, core.Tensor):
use_locking=None, use_locking=None,
name=None, name=None,
read_value=True): read_value=True):
# TODO(b/146181571): This logic can be simplified once
# DistributedVariable.assign returns a DistributedVariable. Currently for
# MirroredStrategy, it returns a Mirrored value.
if tf.compat.v1.executing_eagerly_outside_functions(): if tf.compat.v1.executing_eagerly_outside_functions():
assign_op = update_fn(value, use_locking, name, False) assign_op = update_fn(value, use_locking, name, False)
if read_value: if read_value:
return create_autocast_variable(self._variable, op=assign_op) # We create a new AutoCastVariable with the same underlying tf.Variable.
# The new AutoCastVariable is identical except that the 'op' attribute is
# defined. This matches the behavior of tf.Variable.assign.
var = create_autocast_variable(self._variable)
var._op = assign_op # pylint:disable=protected-access
return var
return assign_op return assign_op
# Fallback to wrapping the returned variable in graph mode if possible # Fallback to wrapping the returned variable in graph mode if possible
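An eager-mode sketch of what this looks like from the caller's side (assuming the OSS package path `keras.mixed_precision.autocast_variable`, as used in the test below):

import tensorflow as tf
from keras.mixed_precision import autocast_variable

v = tf.Variable(1., dtype=tf.float32)
av = autocast_variable.create_autocast_variable(v)

new_av = av.assign(2.)
# `new_av` is a fresh AutoCastVariable sharing the same underlying tf.Variable;
# in eager mode its `.op` is None (matching tf.Variable.assign), while inside a
# tf.function or in graph mode it carries the assign op.
print(v.numpy(), new_av.op)  # 2.0 None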
@ -310,9 +321,9 @@ class AutoCastVariable(tf.Variable, core.Tensor):
@property @property
def op(self): def op(self):
if self._op is not None: if self._op == 'delegate':
return self._op return self._variable.op
return self._variable.op return self._op
def _as_graph_element(self): def _as_graph_element(self):
graph_element = self._variable._as_graph_element() # pylint:disable=protected-access graph_element = self._variable._as_graph_element() # pylint:disable=protected-access
@ -481,7 +492,7 @@ tf.register_tensor_conversion_function(AutoCastVariable,
AutoCastVariable._dense_var_to_tensor) # pylint:disable=protected-access AutoCastVariable._dense_var_to_tensor) # pylint:disable=protected-access
def create_autocast_variable(variable, op=None): def create_autocast_variable(variable):
"""Creates an AutoCastVariable that wraps another variable. """Creates an AutoCastVariable that wraps another variable.
This typically just returns `AutoCastVariable(variable)`. But, if the variable This typically just returns `AutoCastVariable(variable)`. But, if the variable
@ -493,14 +504,13 @@ def create_autocast_variable(variable, op=None):
Args: Args:
variable: A floating-point resource variable to wrap. variable: A floating-point resource variable to wrap.
op: Optional operation of this variable.
Returns: Returns:
An AutoCastVariable that wraps the variable. An AutoCastVariable that wraps the variable.
""" """
if not isinstance(variable, (distribute_values.DistributedVariable, if not isinstance(variable, (distribute_values.DistributedVariable,
ps_distribute_values.AggregatingVariable)): ps_distribute_values.AggregatingVariable)):
return AutoCastVariable(variable, op=op) return AutoCastVariable(variable)
class AutoCastDistributedVariable(AutoCastVariable, variable.__class__): class AutoCastDistributedVariable(AutoCastVariable, variable.__class__):
"""An AutoCastVariable that also subclasses from variable.__class__. """An AutoCastVariable that also subclasses from variable.__class__.
@ -523,7 +533,7 @@ def create_autocast_variable(variable, op=None):
).format(v=self) ).format(v=self)
# pylint: enable=missing-format-attribute # pylint: enable=missing-format-attribute
return AutoCastDistributedVariable(variable, op=op) return AutoCastDistributedVariable(variable)
class enable_auto_cast_variables(object): # pylint:disable=invalid-name class enable_auto_cast_variables(object): # pylint:disable=invalid-name
@ -26,7 +26,14 @@ from absl.testing import parameterized
import numpy as np import numpy as np
from tensorflow.python.distribute import test_util from tensorflow.python.distribute import test_util
from keras.mixed_precision import autocast_variable from keras.mixed_precision import autocast_variable
from keras.optimizer_v2 import adadelta
from keras.optimizer_v2 import adagrad
from keras.optimizer_v2 import adam
from keras.optimizer_v2 import adamax
from keras.optimizer_v2 import ftrl
from keras.optimizer_v2 import gradient_descent as gradient_descent_v2 from keras.optimizer_v2 import gradient_descent as gradient_descent_v2
from keras.optimizer_v2 import nadam
from keras.optimizer_v2 import rmsprop
maybe_distribute = tf.__internal__.test.combinations.combine(distribution=[ maybe_distribute = tf.__internal__.test.combinations.combine(distribution=[
tf.__internal__.distribute.combinations.default_strategy, tf.__internal__.distribute.combinations.default_strategy,
@ -335,11 +342,28 @@ class AutoCastVariableTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllClose(5., self.evaluate(run_assign())) self.assertAllClose(5., self.evaluate(run_assign()))
@tf.__internal__.distribute.combinations.generate(maybe_distribute) @tf.__internal__.distribute.combinations.generate(maybe_distribute)
def test_assign_op(self, distribution): def test_op_attribute(self, distribution):
with distribution.scope(): with distribution.scope():
x = get_var(0., tf.float32) x = get_var(0., tf.float32)
x = autocast_variable.create_autocast_variable(x) x = autocast_variable.create_autocast_variable(x)
# Variable.op raises an AttributeError in eager mode and is an op in graph
# mode. Variable.assign(...).op is None in eager mode and an op in graph
# mode or a tf.function. We test that this is also true of AutoCastVariable.
if tf.executing_eagerly():
with self.assertRaisesRegex(
AttributeError,
'Tensor.op is meaningless when eager execution is enabled'):
x.op # pylint: disable=pointless-statement
self.assertIsNone(x.assign(1.0).op)
self.assertIsNone(x.assign_add(1.0).op)
self.assertIsNone(x.assign_sub(1.0).op)
else:
self.assertIsNotNone(x.op)
self.assertIsNotNone(x.assign(1.0).op)
self.assertIsNotNone(x.assign_add(1.0).op)
self.assertIsNotNone(x.assign_sub(1.0).op)
@tf.function @tf.function
def func(): def func():
self.assertIsNotNone(x.assign(1.0).op) self.assertIsNotNone(x.assign(1.0).op)
@ -486,25 +510,51 @@ class AutoCastVariableTest(tf.test.TestCase, parameterized.TestCase):
'dtype_to_cast_to=float32 ' 'dtype_to_cast_to=float32 '
'inner_variable=MirroredVariable.*>') 'inner_variable=MirroredVariable.*>')
@parameterized.named_parameters( @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(
('v1', tf.compat.v1.train.GradientDescentOptimizer), optimizer_class=[
('v2', gradient_descent_v2.SGD)) adadelta.Adadelta,
def test_optimizer(self, optimizer_class): adagrad.Adagrad,
adam.Adam,
adamax.Adamax,
ftrl.Ftrl,
gradient_descent_v2.SGD,
nadam.Nadam,
rmsprop.RMSprop,
tf.compat.v1.train.GradientDescentOptimizer
],
use_tf_function=[False, True]))
def test_optimizer(self, optimizer_class, use_tf_function):
if use_tf_function and not tf.executing_eagerly():
self.skipTest('Test does not support graph mode with tf.function')
x = get_var(1., tf.float32) x = get_var(1., tf.float32)
x = autocast_variable.create_autocast_variable(x) x = autocast_variable.create_autocast_variable(x)
opt = optimizer_class(1.) y = get_var(1., tf.float32)
opt = optimizer_class(learning_rate=1.)
@tf.function
def f(): def f():
opt.minimize(lambda: x + 1., var_list=[x]) # Minimize both the AutoCastVariable and the normal tf.Variable. Both
# variables should be updated to the same value.
op = opt.minimize(lambda: x + y, var_list=[x, y])
return None if tf.compat.v1.executing_eagerly_outside_functions() else op
if use_tf_function:
f = tf.function(f)
if tf.executing_eagerly(): if tf.executing_eagerly():
f() f()
else: else:
op = f() # pylint: disable=assignment-from-no-return op = f()
self.evaluate(tf.compat.v1.global_variables_initializer()) self.evaluate(tf.compat.v1.global_variables_initializer())
self.evaluate(op) self.evaluate(op)
self.assertEqual(self.evaluate(x), 0) # Assert the AutoCastVariable has changed from its initial value.
self.assertNotEqual(self.evaluate(x), 1.)
# Assert the AutoCastVariable is updated correctly by comparing it to the
# normal variable.
self.assertAlmostEqual(self.evaluate(x), self.evaluate(y))
if optimizer_class in (gradient_descent_v2.SGD,
tf.compat.v1.train.GradientDescentOptimizer):
# With SGD, the variables decrease by exactly 1.
self.assertEqual(self.evaluate(x), 0)
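Why the final assertion can be exact: with loss x + y the gradient w.r.t. each variable is 1, so one vanilla SGD step at learning_rate=1 moves each variable from 1.0 to 0.0. Written out:

learning_rate = 1.
x0, grad = 1., 1.            # initial value and d(x + y)/dx
x1 = x0 - learning_rate * grad
assert x1 == 0.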
if __name__ == '__main__': if __name__ == '__main__':
@ -139,7 +139,7 @@ def load(path, compile=True, options=None): # pylint: disable=redefined-builtin
# Recreate layers and metrics using the info stored in the metadata. # Recreate layers and metrics using the info stored in the metadata.
keras_loader = KerasObjectLoader(metadata, object_graph_def) keras_loader = KerasObjectLoader(metadata, object_graph_def)
keras_loader.load_layers() keras_loader.load_layers(compile=compile)
# Generate a dictionary of all loaded nodes. # Generate a dictionary of all loaded nodes.
nodes_to_load = {'root': None} nodes_to_load = {'root': None}
@ -364,7 +364,7 @@ class KerasObjectLoader(object):
obj_child, child_proto, child_id) obj_child, child_proto, child_id)
self.loaded_nodes[child_id] = obj_child, setter self.loaded_nodes[child_id] = obj_child, setter
def load_layers(self): def load_layers(self, compile=True): # pylint: disable=redefined-builtin
"""Load all layer nodes from the metadata.""" """Load all layer nodes from the metadata."""
# Load metrics after models and layers, since it's likely that models # Load metrics after models and layers, since it's likely that models
# and layers will create the metric when initialized (this avoids wasting # and layers will create the metric when initialized (this avoids wasting
@ -380,9 +380,20 @@ class KerasObjectLoader(object):
node_metadata.metadata) node_metadata.metadata)
for node_metadata in metric_list: for node_metadata in metric_list:
self.loaded_nodes[node_metadata.node_id] = self._load_layer( try:
node_metadata.node_id, node_metadata.identifier, self.loaded_nodes[node_metadata.node_id] = self._load_layer(
node_metadata.metadata) node_metadata.node_id, node_metadata.identifier,
node_metadata.metadata)
except ValueError:
# Metrics are only needed when the model is compiled later. When
# `compile=False`, we ignore errors raised while loading custom metrics
# until custom metrics are serialized properly (b/135550038).
if compile:
raise
logging.warning('Unable to restore custom metric. Please ensure that '
'the layer implements `get_config` and `from_config` '
'when saving. In addition, please use the '
'`custom_objects` arg when calling `load_model()`.')
def _load_layer(self, node_id, identifier, metadata): def _load_layer(self, node_id, identifier, metadata):
"""Load a single layer from a SavedUserObject proto.""" """Load a single layer from a SavedUserObject proto."""
@ -1142,6 +1142,22 @@ class MetricTest(tf.test.TestCase, parameterized.TestCase):
self._test_metric_save_and_load( self._test_metric_save_and_load(
metric, self._save_model_dir(), 1, test_sample_weight=False) metric, self._save_model_dir(), 1, test_sample_weight=False)
@keras_parameterized.run_with_all_model_types
def test_custom_metric_model(self):
class CustomMetric(keras.metrics.MeanSquaredError):
pass
model = testing_utils.get_small_mlp(1, 4, input_dim=3)
model.compile(loss='mse', optimizer='rmsprop', metrics=[CustomMetric()])
saved_model_dir = self._save_model_dir()
tf.saved_model.save(model, saved_model_dir)
with self.assertRaisesRegex(ValueError, 'custom_objects'):
keras_load.load(saved_model_dir)
keras_load.load(saved_model_dir, compile=False)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()