Realtime Compositor: Implement Keying node

This patch implements the Keying node for the realtime compositor. To
ease the implementation, some morphological operators were moved into
the algorithms module, and a mechanism to steal data between results
was added to the Result class.

Pull Request: https://projects.blender.org/blender/blender/pulls/108393
Omar Emara authored and committed 2023-06-24 13:02:33 +02:00
parent ec428c3f7f
commit c9e6399fe1
20 changed files with 781 additions and 129 deletions

@ -68,10 +68,14 @@ set(SRC
COM_texture_pool.hh
COM_utilities.hh
algorithms/intern/algorithm_parallel_reduction.cc
algorithms/intern/morphological_distance.cc
algorithms/intern/morphological_distance_feather.cc
algorithms/intern/parallel_reduction.cc
algorithms/intern/smaa.cc
algorithms/intern/symmetric_separable_blur.cc
algorithms/COM_algorithm_morphological_distance.hh
algorithms/COM_algorithm_morphological_distance_feather.hh
algorithms/COM_algorithm_parallel_reduction.hh
algorithms/COM_algorithm_smaa.hh
algorithms/COM_algorithm_symmetric_separable_blur.hh
@ -135,6 +139,11 @@ set(GLSL_SRC
shaders/compositor_glare_streaks_filter.glsl
shaders/compositor_id_mask.glsl
shaders/compositor_image_crop.glsl
shaders/compositor_keying_compute_image.glsl
shaders/compositor_keying_compute_matte.glsl
shaders/compositor_keying_extract_chroma.glsl
shaders/compositor_keying_replace_chroma.glsl
shaders/compositor_keying_tweak_matte.glsl
shaders/compositor_map_uv.glsl
shaders/compositor_morphological_distance.glsl
shaders/compositor_morphological_distance_feather.glsl
@ -239,6 +248,7 @@ set(SRC_SHADER_CREATE_INFOS
shaders/infos/compositor_glare_info.hh
shaders/infos/compositor_id_mask_info.hh
shaders/infos/compositor_image_crop_info.hh
shaders/infos/compositor_keying_info.hh
shaders/infos/compositor_map_uv_info.hh
shaders/infos/compositor_morphological_distance_feather_info.hh
shaders/infos/compositor_morphological_distance_info.hh

@ -164,6 +164,20 @@ class Result {
* the discussion above for more information. */
void pass_through(Result &target);
/* Steal the allocated data from the given source result and assign it to this result, then
* remove any references to the data from the source result. It is assumed that:
*
* - Both results are of the same type.
* - This result is not allocated but the source result is allocated.
* - Neither of the results is a proxy one, that is, has a master result.
*
* This is different from proxy results and the pass_through mechanism in that it can be used on
* temporary results. It is most useful in multi-step compositor operations where some steps are
* optional; in that case, the intermediate results can be temporary results whose data is
* eventually stolen by the actual output of the operation. See the uses of the method for
* practical examples. */
void steal_data(Result &source);
/* Transform the result by the given transformation. This effectively pre-multiplies the given
* transformation by the current transformation of the domain of the result. */
void transform(const float3x3 &transformation);
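
To illustrate the steal_data() contract documented above, here is a minimal sketch of a multi-step operation. The helper name and the omitted dispatch are hypothetical; only Result::Temporary(), allocate_texture() and steal_data() are APIs from this patch.

#include "COM_context.hh"
#include "COM_result.hh"

namespace blender::realtime_compositor {

/* Hypothetical two-step operation: the first step writes into a temporary result, and the
 * unallocated output then takes over that texture instead of copying it. */
static void example_two_step_operation(Context &context, Result &input, Result &output)
{
  Result intermediate = Result::Temporary(ResultType::Float, context.texture_pool());
  intermediate.allocate_texture(input.domain());
  /* ... dispatch a shader that fills `intermediate` from `input` ... */

  /* `output` was not allocated; it takes over the temporary's texture, and the temporary no
   * longer references it, so it is not released separately. */
  output.steal_data(intermediate);
}

}  // namespace blender::realtime_compositor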

@ -0,0 +1,18 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "COM_context.hh"
#include "COM_result.hh"
namespace blender::realtime_compositor {
/* Dilate or erode the given input using a morphological operator with a circular structuring
* element of radius equivalent to the absolute value of the given distance parameter. A positive
* distance corresponds to a dilate operator, while a negative distance corresponds to an erode
* operator. */
void morphological_distance(Context &context, Result &input, Result &output, int distance);
} // namespace blender::realtime_compositor
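
A hedged usage sketch (hypothetical caller, not part of the patch): the sign of the distance selects between the dilate and erode shaders, and the algorithm allocates the output from the input's domain.

#include "COM_algorithm_morphological_distance.hh"
#include "COM_context.hh"
#include "COM_result.hh"

namespace blender::realtime_compositor {

/* Hypothetical helper: dilate a mask by 5 pixels and erode a copy of it by 3 pixels. */
static void dilate_and_erode_mask(Context &context, Result &mask, Result &dilated, Result &eroded)
{
  morphological_distance(context, mask, dilated, 5);  /* Positive distance: dilate. */
  morphological_distance(context, mask, eroded, -3);  /* Negative distance: erode. */
}

}  // namespace blender::realtime_compositor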

@ -0,0 +1,22 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "DNA_scene_types.h"
#include "COM_context.hh"
#include "COM_result.hh"
namespace blender::realtime_compositor {
/* Dilate or erode the given input using a morphological inverse distance operation evaluated at
* the given falloff. The radius of the structuring element is equivalent to the absolute value of
* the given distance parameter. A positive distance corresponds to a dilate operator, while a
* negative distance corresponds to an erode operator. See the implementation and shader for more
* information. */
void morphological_distance_feather(
Context &context, Result &input, Result &output, int distance, int falloff_type = PROP_SMOOTH);
} // namespace blender::realtime_compositor

@ -6,6 +6,8 @@
#include "BLI_math_vector_types.hh"
#include "DNA_scene_types.h"
#include "COM_context.hh"
#include "COM_result.hh"
@ -22,8 +24,8 @@ void symmetric_separable_blur(Context &context,
Result &input,
Result &output,
float2 radius,
int filter_type,
bool extend_bounds,
bool gamma_correct);
int filter_type = R_FILTER_GAUSS,
bool extend_bounds = false,
bool gamma_correct = false);
} // namespace blender::realtime_compositor

@ -0,0 +1,46 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_math_base.hh"
#include "GPU_shader.h"
#include "GPU_texture.h"
#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_utilities.hh"
#include "COM_algorithm_morphological_distance.hh"
namespace blender::realtime_compositor {
static const char *get_shader_name(int distance)
{
if (distance > 0) {
return "compositor_morphological_distance_dilate";
}
return "compositor_morphological_distance_erode";
}
void morphological_distance(Context &context, Result &input, Result &output, int distance)
{
GPUShader *shader = context.shader_manager().get(get_shader_name(distance));
GPU_shader_bind(shader);
/* Pass the absolute value of the distance. We have specialized shaders for each sign. */
GPU_shader_uniform_1i(shader, "radius", math::abs(distance));
input.bind_as_texture(shader, "input_tx");
output.allocate_texture(input.domain());
output.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, input.domain().size);
GPU_shader_unbind();
output.unbind_as_image();
input.unbind_as_texture();
}
} // namespace blender::realtime_compositor

@ -0,0 +1,107 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_math_base.hh"
#include "BLI_math_vector_types.hh"
#include "GPU_shader.h"
#include "GPU_texture.h"
#include "COM_algorithm_symmetric_separable_blur.hh"
#include "COM_context.hh"
#include "COM_morphological_distance_feather_weights.hh"
#include "COM_result.hh"
#include "COM_utilities.hh"
namespace blender::realtime_compositor {
static const char *get_shader_name(int distance)
{
if (distance > 0) {
return "compositor_morphological_distance_feather_dilate";
}
return "compositor_morphological_distance_feather_erode";
}
static Result horizontal_pass(Context &context, Result &input, int distance, int falloff_type)
{
GPUShader *shader = context.shader_manager().get(get_shader_name(distance));
GPU_shader_bind(shader);
input.bind_as_texture(shader, "input_tx");
const MorphologicalDistanceFeatherWeights &weights =
context.cache_manager().morphological_distance_feather_weights.get(falloff_type,
math::abs(distance));
weights.bind_weights_as_texture(shader, "weights_tx");
weights.bind_distance_falloffs_as_texture(shader, "falloffs_tx");
/* We allocate an output image of a transposed size, that is, with a height equivalent to the
* width of the input and vice versa. This is done as a performance optimization. The shader
* will process the image horizontally and write it to the intermediate output transposed. Then
* the vertical pass will execute the same horizontal pass shader, but since its input is
* transposed, it will effectively do a vertical pass and write to the output transposed,
* effectively undoing the transposition in the horizontal pass. This is done to improve
* spatial cache locality in the shader and to avoid having two separate shaders for each of
* the passes. */
const Domain domain = input.domain();
const int2 transposed_domain = int2(domain.size.y, domain.size.x);
Result output = Result::Temporary(ResultType::Float, context.texture_pool());
output.allocate_texture(transposed_domain);
output.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, domain.size);
GPU_shader_unbind();
input.unbind_as_texture();
weights.unbind_weights_as_texture();
weights.unbind_distance_falloffs_as_texture();
output.unbind_as_image();
return output;
}
static void vertical_pass(Context &context,
Result &original_input,
Result &horizontal_pass_result,
Result &output,
int distance,
int falloff_type)
{
GPUShader *shader = context.shader_manager().get(get_shader_name(distance));
GPU_shader_bind(shader);
horizontal_pass_result.bind_as_texture(shader, "input_tx");
const MorphologicalDistanceFeatherWeights &weights =
context.cache_manager().morphological_distance_feather_weights.get(falloff_type,
math::abs(distance));
weights.bind_weights_as_texture(shader, "weights_tx");
weights.bind_distance_falloffs_as_texture(shader, "falloffs_tx");
const Domain domain = original_input.domain();
output.allocate_texture(domain);
output.bind_as_image(shader, "output_img");
/* Notice that the domain is transposed, see the note on the horizontal pass function for more
* information on the reasoning behind this. */
compute_dispatch_threads_at_least(shader, int2(domain.size.y, domain.size.x));
GPU_shader_unbind();
horizontal_pass_result.unbind_as_texture();
weights.unbind_weights_as_texture();
weights.unbind_distance_falloffs_as_texture();
output.unbind_as_image();
}
void morphological_distance_feather(
Context &context, Result &input, Result &output, int distance, int falloff_type)
{
Result horizontal_pass_result = horizontal_pass(context, input, distance, falloff_type);
vertical_pass(context, input, horizontal_pass_result, output, distance, falloff_type);
horizontal_pass_result.release();
}
} // namespace blender::realtime_compositor
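
The transposed intermediate used by the horizontal and vertical passes above can be illustrated with a small CPU-side sketch (illustration only, not part of the patch, and using a plain box blur instead of the feather weights): a single routine blurs along rows and writes its result transposed, so applying it twice blurs rows and then columns, and the double transposition restores the original orientation.

#include <vector>

/* Blur each row of a width x height image and write the result transposed, i.e. the blurred
 * pixel (x, y) lands at position (y, x) of a height x width output. */
std::vector<float> blur_rows_into_transposed(const std::vector<float> &image,
                                             int width,
                                             int height,
                                             int radius)
{
  std::vector<float> transposed(image.size(), 0.0f);
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      float sum = 0.0f;
      int count = 0;
      for (int i = -radius; i <= radius; i++) {
        const int sx = x + i;
        if (sx >= 0 && sx < width) {
          sum += image[y * width + sx];
          count++;
        }
      }
      /* Transposed write: row x of the output, column y. */
      transposed[x * height + y] = sum / float(count);
    }
  }
  return transposed;
}

/* First call: horizontal pass with a transposed output. Second call on the height x width
 * intermediate: the same row blur now runs along the original columns and transposes the image
 * back to width x height. */
std::vector<float> separable_box_blur(const std::vector<float> &image,
                                      int width,
                                      int height,
                                      int radius)
{
  const std::vector<float> horizontal = blur_rows_into_transposed(image, width, height, radius);
  return blur_rows_into_transposed(horizontal, height, width, radius);
}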

@ -10,6 +10,7 @@
#include "GPU_texture.h"
#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_utilities.hh"
#include "COM_algorithm_symmetric_separable_blur.hh"
@ -18,6 +19,15 @@
namespace blender::realtime_compositor {
static const char *get_blur_shader(ResultType type)
{
if (type == ResultType::Float) {
return "compositor_symmetric_separable_blur_float";
}
return "compositor_symmetric_separable_blur_color";
}
static Result horizontal_pass(Context &context,
Result &input,
float radius,
@ -25,7 +35,7 @@ static Result horizontal_pass(Context &context,
bool extend_bounds,
bool gamma_correct)
{
GPUShader *shader = context.shader_manager().get("compositor_symmetric_separable_blur");
GPUShader *shader = context.shader_manager().get(get_blur_shader(input.type()));
GPU_shader_bind(shader);
GPU_shader_uniform_1b(shader, "extend_bounds", extend_bounds);
@ -53,7 +63,7 @@ static Result horizontal_pass(Context &context,
* pass. */
const int2 transposed_domain = int2(domain.size.y, domain.size.x);
Result output = Result::Temporary(ResultType::Color, context.texture_pool());
Result output = Result::Temporary(input.type(), context.texture_pool());
output.allocate_texture(transposed_domain);
output.bind_as_image(shader, "output_img");
@ -76,7 +86,7 @@ static void vertical_pass(Context &context,
bool extend_bounds,
bool gamma_correct)
{
GPUShader *shader = context.shader_manager().get("compositor_symmetric_separable_blur");
GPUShader *shader = context.shader_manager().get(get_blur_shader(original_input.type()));
GPU_shader_bind(shader);
GPU_shader_uniform_1b(shader, "extend_bounds", extend_bounds);

@ -2,6 +2,7 @@
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_assert.h"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
@ -133,6 +134,33 @@ void Result::pass_through(Result &target)
target.master_ = this;
}
void Result::steal_data(Result &source)
{
BLI_assert(type_ == source.type_);
BLI_assert(!is_allocated() && source.is_allocated());
BLI_assert(master_ == nullptr && source.master_ == nullptr);
is_single_value_ = source.is_single_value_;
texture_ = source.texture_;
texture_pool_ = source.texture_pool_;
domain_ = source.domain_;
switch (type_) {
case ResultType::Float:
float_value_ = source.float_value_;
break;
case ResultType::Vector:
vector_value_ = source.vector_value_;
break;
case ResultType::Color:
color_value_ = source.color_value_;
break;
}
source.texture_ = nullptr;
source.texture_pool_ = nullptr;
}
void Result::transform(const float3x3 &transformation)
{
domain_.transform(transformation);
@ -235,6 +263,7 @@ void Result::release()
reference_count_--;
if (reference_count_ == 0) {
texture_pool_->release(texture_);
texture_ = nullptr;
}
}

@ -0,0 +1,21 @@
#pragma BLENDER_REQUIRE(common_math_lib.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
vec4 key = texture_load(key_tx, texel);
vec4 color = texture_load(input_tx, texel);
float matte = texture_load(matte_tx, texel).x;
/* Multiply the image by the matte, premultiplying the alpha. */
color *= matte;
/* Color despill. */
ivec3 key_argmax = argmax(key.rgb);
float weighted_average = mix(color[key_argmax.y], color[key_argmax.z], despill_balance);
color[key_argmax.x] -= (color[key_argmax.x] - weighted_average) * despill_factor;
imageStore(output_img, texel, color);
}
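
Spelled out, with p, a and b being the channel indices argmax.x, argmax.y and argmax.z of the key colour as documented for argmax below, the despill above computes:

  average = mix(c_a, c_b, balance) = (1 - balance) * c_a + balance * c_b
  c_p <- c_p - factor * (c_p - average)

so with a despill factor of 1 the channel c_p is pulled all the way to the weighted average of the other two channels.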

@ -0,0 +1,32 @@
#pragma BLENDER_REQUIRE(common_math_lib.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_common_color_utils.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
float compute_saturation(vec4 color, ivec3 argmax)
{
float weighted_average = mix(color[argmax.y], color[argmax.z], key_balance);
return (color[argmax.x] - weighted_average) * abs(1.0 - weighted_average);
}
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
vec4 input_color = texture_load(input_tx, texel);
/* We assume that the keying screen will not be overexposed in the image, so if the input
* brightness is high, we assume the pixel is opaque. */
if (min_v3(input_color) > 1.0f) {
imageStore(output_img, texel, vec4(1.0));
return;
}
vec4 key_color = texture_load(key_tx, texel);
ivec3 key_argmax = argmax(key_color.rgb);
float input_saturation = compute_saturation(input_color, key_argmax);
float key_saturation = compute_saturation(key_color, key_argmax);
float matte = 1.0f - clamp(input_saturation / key_saturation, 0.0, 1.0);
imageStore(output_img, texel, vec4(matte));
}
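
In the same notation, the saturation measure and the matte computed above are:

  saturation(c) = (c_p - mix(c_a, c_b, balance)) * abs(1 - mix(c_a, c_b, balance))
  matte = 1 - clamp(saturation(input) / saturation(key), 0, 1)

so a pixel whose saturation reaches that of the key colour itself is keyed out (matte 0), a pixel with no such saturation stays opaque (matte 1), and pixels brighter than 1 in all channels are forced opaque by the early return above.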

@ -0,0 +1,12 @@
#pragma BLENDER_REQUIRE(gpu_shader_common_color_utils.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
vec4 color_ycca;
rgba_to_ycca_itu_709(texture_load(input_tx, texel), color_ycca);
imageStore(output_img, texel, color_ycca);
}

@ -0,0 +1,17 @@
#pragma BLENDER_REQUIRE(gpu_shader_common_color_utils.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
vec4 color_ycca;
rgba_to_ycca_itu_709(texture_load(input_tx, texel), color_ycca);
color_ycca.yz = texture_load(new_chroma_tx, texel).yz;
vec4 color_rgba;
ycca_to_rgba_itu_709(color_ycca, color_rgba);
imageStore(output_img, texel, color_rgba);
}

@ -0,0 +1,54 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
float matte = texture_load(input_matte_tx, texel).x;
/* Search the neighbourhood around the current matte value and identify whether it lies along the
* edges of the matte. This needs to be computed only when we need to compute the edges output
* or tweak the levels of the matte. */
bool is_edge = false;
if (compute_edges || black_level != 0.0 || white_level != 1.0) {
/* Count the number of neighbours whose matte is sufficiently similar to the current matte,
* as controlled by the edge_tolerance factor. */
int count = 0;
for (int j = -edge_search_radius; j <= edge_search_radius; j++) {
for (int i = -edge_search_radius; i <= edge_search_radius; i++) {
float neighbour_matte = texture_load(input_matte_tx, texel + ivec2(i, j)).x;
count += int(distance(matte, neighbour_matte) < edge_tolerance);
}
}
/* If the number of neighbours that are sufficiently similar to the center matte is less than
* 90% of the total number of neighbours, then the variance is high in that area and it is
* considered an edge. */
is_edge = count < ((edge_search_radius * 2 + 1) * (edge_search_radius * 2 + 1)) * 0.9;
}
float tweaked_matte = matte;
/* Remap the matte using the black and white levels, but only for areas that are not on the edge
* of the matte to preserve details. Also check that the levels are not equal to avoid division
* by zero. */
if (!is_edge && white_level != black_level) {
tweaked_matte = clamp((matte - black_level) / (white_level - black_level), 0.0, 1.0);
}
/* Exclude unwanted areas using the provided garbage matte; a value of 1 means unwanted, so
* invert the garbage matte and take the minimum. */
if (apply_garbage_matte) {
float garbage_matte = texture_load(garbage_matte_tx, texel).x;
tweaked_matte = min(tweaked_matte, 1.0 - garbage_matte);
}
/* Include wanted areas that were incorrectly keyed using the provided core matte. */
if (apply_core_matte) {
float core_matte = texture_load(core_matte_tx, texel).x;
tweaked_matte = max(tweaked_matte, core_matte);
}
imageStore(output_matte_img, texel, vec4(tweaked_matte));
imageStore(output_edges_img, texel, vec4(is_edge ? 1.0 : 0.0));
}
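
As a concrete example of the edge test above (numbers for illustration only): with an edge search radius of 3, the window holds (2 * 3 + 1)^2 = 49 samples, so the texel is flagged as an edge when fewer than 0.9 * 49 = 44.1 of them, that is 44 or fewer, are within edge_tolerance of the center matte.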

@ -0,0 +1,57 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "gpu_shader_create_info.hh"
GPU_SHADER_CREATE_INFO(compositor_keying_extract_chroma)
.local_group_size(16, 16)
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_keying_extract_chroma.glsl")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_keying_replace_chroma)
.local_group_size(16, 16)
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "new_chroma_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_keying_replace_chroma.glsl")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_keying_compute_matte)
.local_group_size(16, 16)
.push_constant(Type::FLOAT, "key_balance")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "key_tx")
.image(0, GPU_R16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_keying_compute_matte.glsl")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_keying_tweak_matte)
.local_group_size(16, 16)
.push_constant(Type::BOOL, "compute_edges")
.push_constant(Type::BOOL, "apply_core_matte")
.push_constant(Type::BOOL, "apply_garbage_matte")
.push_constant(Type::INT, "edge_search_radius")
.push_constant(Type::FLOAT, "edge_tolerance")
.push_constant(Type::FLOAT, "black_level")
.push_constant(Type::FLOAT, "white_level")
.sampler(0, ImageType::FLOAT_2D, "input_matte_tx")
.sampler(1, ImageType::FLOAT_2D, "garbage_matte_tx")
.sampler(2, ImageType::FLOAT_2D, "core_matte_tx")
.image(0, GPU_R16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_matte_img")
.image(1, GPU_R16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_edges_img")
.compute_source("compositor_keying_tweak_matte.glsl")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_keying_compute_image)
.local_group_size(16, 16)
.push_constant(Type::FLOAT, "despill_factor")
.push_constant(Type::FLOAT, "despill_balance")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "matte_tx")
.sampler(2, ImageType::FLOAT_2D, "key_tx")
.image(0, GPU_R16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_keying_compute_image.glsl")
.do_static_compilation(true);

@ -4,13 +4,21 @@
#include "gpu_shader_create_info.hh"
GPU_SHADER_CREATE_INFO(compositor_symmetric_separable_blur)
GPU_SHADER_CREATE_INFO(compositor_symmetric_separable_blur_shared)
.local_group_size(16, 16)
.push_constant(Type::BOOL, "extend_bounds")
.push_constant(Type::BOOL, "gamma_correct_input")
.push_constant(Type::BOOL, "gamma_uncorrect_output")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_1D, "weights_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_symmetric_separable_blur.glsl")
.compute_source("compositor_symmetric_separable_blur.glsl");
GPU_SHADER_CREATE_INFO(compositor_symmetric_separable_blur_float)
.additional_info("compositor_symmetric_separable_blur_shared")
.image(0, GPU_R16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_symmetric_separable_blur_color)
.additional_info("compositor_symmetric_separable_blur_shared")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.do_static_compilation(true);

@ -54,6 +54,20 @@ mat2 rot2_from_angle(float a)
return mat2(c, -s, s, c);
}
/* Computes the full argmax of the given vector, that is, the index of the greatest component will
* be in the returned x component, the index of the smallest component will be in the returned z
* component, and the index of the middle component will be in the returned y component.
*
* This is computed by utilizing the fact that booleans are converted to the integers 0 and 1 for
* false and true respectively. So if we compare every component against every other component
* using the greaterThan comparator and sum the results, each component receives the number of
* components it is greater than: 2 for the greatest component, 1 for the middle component, and 0
* for the smallest component. */
ivec3 argmax(vec3 v)
{
return ivec3(greaterThan(v, v.xxx)) + ivec3(greaterThan(v, v.yyy)) +
ivec3(greaterThan(v, v.zzz));
}
#define min3(a, b, c) min(a, min(b, c))
#define min4(a, b, c, d) min(a, min3(b, c, d))
#define min5(a, b, c, d, e) min(a, min4(b, c, d, e))

@ -19,7 +19,8 @@
#include "GPU_state.h"
#include "GPU_texture.h"
#include "COM_morphological_distance_feather_weights.hh"
#include "COM_algorithm_morphological_distance.hh"
#include "COM_algorithm_morphological_distance_feather.hh"
#include "COM_node_operation.hh"
#include "COM_utilities.hh"
@ -175,33 +176,7 @@ class DilateErodeOperation : public NodeOperation {
void execute_distance()
{
GPUShader *shader = shader_manager().get(get_morphological_distance_shader_name());
GPU_shader_bind(shader);
/* Pass the absolute value of the distance. We have specialized shaders for each sign. */
GPU_shader_uniform_1i(shader, "radius", math::abs(get_distance()));
const Result &input_mask = get_input("Mask");
input_mask.bind_as_texture(shader, "input_tx");
const Domain domain = compute_domain();
Result &output_mask = get_result("Mask");
output_mask.allocate_texture(domain);
output_mask.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, domain.size);
GPU_shader_unbind();
output_mask.unbind_as_image();
input_mask.unbind_as_texture();
}
const char *get_morphological_distance_shader_name()
{
if (get_distance() > 0) {
return "compositor_morphological_distance_dilate";
}
return "compositor_morphological_distance_erode";
morphological_distance(context(), get_input("Mask"), get_result("Mask"), get_distance());
}
/* ------------------------------------------
@ -244,87 +219,11 @@ class DilateErodeOperation : public NodeOperation {
void execute_distance_feather()
{
GPUTexture *horizontal_pass_result = execute_distance_feather_horizontal_pass();
execute_distance_feather_vertical_pass(horizontal_pass_result);
}
GPUTexture *execute_distance_feather_horizontal_pass()
{
GPUShader *shader = shader_manager().get(get_morphological_distance_feather_shader_name());
GPU_shader_bind(shader);
const Result &input_image = get_input("Mask");
input_image.bind_as_texture(shader, "input_tx");
const MorphologicalDistanceFeatherWeights &weights =
context().cache_manager().morphological_distance_feather_weights.get(
node_storage(bnode()).falloff, math::abs(get_distance()));
weights.bind_weights_as_texture(shader, "weights_tx");
weights.bind_distance_falloffs_as_texture(shader, "falloffs_tx");
/* We allocate an output image of a transposed size, that is, with a height equivalent to the
* width of the input and vice versa. This is done as a performance optimization. The shader
* will process the image horizontally and write it to the intermediate output transposed. Then
* the vertical pass will execute the same horizontal pass shader, but since its input is
* transposed, it will effectively do a vertical pass and write to the output transposed,
* effectively undoing the transposition in the horizontal pass. This is done to improve
* spatial cache locality in the shader and to avoid having two separate shaders for each of
* the passes. */
const Domain domain = compute_domain();
const int2 transposed_domain = int2(domain.size.y, domain.size.x);
GPUTexture *horizontal_pass_result = texture_pool().acquire_color(transposed_domain);
const int image_unit = GPU_shader_get_sampler_binding(shader, "output_img");
GPU_texture_image_bind(horizontal_pass_result, image_unit);
compute_dispatch_threads_at_least(shader, domain.size);
GPU_shader_unbind();
input_image.unbind_as_texture();
weights.unbind_weights_as_texture();
weights.unbind_distance_falloffs_as_texture();
GPU_texture_image_unbind(horizontal_pass_result);
return horizontal_pass_result;
}
void execute_distance_feather_vertical_pass(GPUTexture *horizontal_pass_result)
{
GPUShader *shader = shader_manager().get(get_morphological_distance_feather_shader_name());
GPU_shader_bind(shader);
GPU_memory_barrier(GPU_BARRIER_TEXTURE_FETCH);
const int texture_image_unit = GPU_shader_get_sampler_binding(shader, "input_tx");
GPU_texture_bind(horizontal_pass_result, texture_image_unit);
const MorphologicalDistanceFeatherWeights &weights =
context().cache_manager().morphological_distance_feather_weights.get(
node_storage(bnode()).falloff, math::abs(get_distance()));
weights.bind_weights_as_texture(shader, "weights_tx");
weights.bind_distance_falloffs_as_texture(shader, "falloffs_tx");
const Domain domain = compute_domain();
Result &output_image = get_result("Mask");
output_image.allocate_texture(domain);
output_image.bind_as_image(shader, "output_img");
/* Notice that the domain is transposed, see the note on the horizontal pass method for more
* information on the reasoning behind this. */
compute_dispatch_threads_at_least(shader, int2(domain.size.y, domain.size.x));
GPU_shader_unbind();
output_image.unbind_as_image();
weights.unbind_weights_as_texture();
weights.unbind_distance_falloffs_as_texture();
GPU_texture_unbind(horizontal_pass_result);
}
const char *get_morphological_distance_feather_shader_name()
{
if (get_distance() > 0) {
return "compositor_morphological_distance_feather_dilate";
}
return "compositor_morphological_distance_feather_erode";
morphological_distance_feather(context(),
get_input("Mask"),
get_result("Mask"),
get_distance(),
node_storage(bnode()).falloff);
}
/* ---------------

@ -7,13 +7,22 @@
*/
#include "BLI_math_base.h"
#include "BLI_math_vector_types.hh"
#include "DNA_movieclip_types.h"
#include "DNA_scene_types.h"
#include "UI_interface.h"
#include "UI_resources.h"
#include "GPU_shader.h"
#include "GPU_texture.h"
#include "COM_algorithm_morphological_distance.hh"
#include "COM_algorithm_morphological_distance_feather.hh"
#include "COM_algorithm_symmetric_separable_blur.hh"
#include "COM_node_operation.hh"
#include "COM_utilities.hh"
#include "node_composite_util.hh"
@ -21,12 +30,18 @@
namespace blender::nodes::node_composite_keying_cc {
NODE_STORAGE_FUNCS(NodeKeyingData)
static void cmp_node_keying_declare(NodeDeclarationBuilder &b)
{
b.add_input<decl::Color>("Image").default_value({0.8f, 0.8f, 0.8f, 1.0f});
b.add_input<decl::Color>("Key Color").default_value({1.0f, 1.0f, 1.0f, 1.0f});
b.add_input<decl::Float>("Garbage Matte").hide_value();
b.add_input<decl::Float>("Core Matte").hide_value();
b.add_input<decl::Color>("Image")
.default_value({0.8f, 0.8f, 0.8f, 1.0f})
.compositor_domain_priority(0);
b.add_input<decl::Color>("Key Color")
.default_value({1.0f, 1.0f, 1.0f, 1.0f})
.compositor_domain_priority(1);
b.add_input<decl::Float>("Garbage Matte").hide_value().compositor_domain_priority(2);
b.add_input<decl::Float>("Core Matte").hide_value().compositor_domain_priority(3);
b.add_output<decl::Color>("Image");
b.add_output<decl::Float>("Matte");
b.add_output<decl::Float>("Edges");
@ -72,10 +87,277 @@ class KeyingOperation : public NodeOperation {
void execute() override
{
get_input("Image").pass_through(get_result("Image"));
get_result("Matte").allocate_invalid();
get_result("Edges").allocate_invalid();
context().set_info_message("Viewport compositor setup not fully supported");
Result blurred_input = compute_blurred_input();
Result matte = compute_matte(blurred_input);
blurred_input.release();
/* This also computes the edges output if needed. */
Result tweaked_matte = compute_tweaked_matte(matte);
matte.release();
Result &output_image = get_result("Image");
Result &output_matte = get_result("Matte");
if (output_image.should_compute() || output_matte.should_compute()) {
Result blurred_matte = compute_blurred_matte(tweaked_matte);
tweaked_matte.release();
Result morphed_matte = compute_morphed_matte(blurred_matte);
blurred_matte.release();
Result feathered_matte = compute_feathered_matte(morphed_matte);
morphed_matte.release();
if (output_image.should_compute()) {
compute_image(feathered_matte);
}
if (output_matte.should_compute()) {
output_matte.steal_data(feathered_matte);
}
else {
feathered_matte.release();
}
}
}
Result compute_blurred_input()
{
/* No blur needed, return the original input. We also increment the reference count of the
* input because the caller will release it after the call, and we want to extend its life
* since it is now returned as the output. */
const float blur_size = node_storage(bnode()).blur_pre;
if (blur_size == 0.0f) {
Result output = get_input("Image");
output.increment_reference_count();
return output;
}
Result chroma = extract_input_chroma();
Result blurred_chroma = Result::Temporary(ResultType::Color, context().texture_pool());
symmetric_separable_blur(context(), chroma, blurred_chroma, float2(blur_size), R_FILTER_BOX);
chroma.release();
Result blurred_input = replace_input_chroma(blurred_chroma);
blurred_chroma.release();
return blurred_input;
}
Result extract_input_chroma()
{
GPUShader *shader = context().shader_manager().get("compositor_keying_extract_chroma");
GPU_shader_bind(shader);
Result &input = get_input("Image");
input.bind_as_texture(shader, "input_tx");
Result output = Result::Temporary(ResultType::Color, context().texture_pool());
output.allocate_texture(input.domain());
output.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, input.domain().size);
GPU_shader_unbind();
input.unbind_as_texture();
output.unbind_as_image();
return output;
}
Result replace_input_chroma(Result &new_chroma)
{
GPUShader *shader = context().shader_manager().get("compositor_keying_replace_chroma");
GPU_shader_bind(shader);
Result &input = get_input("Image");
input.bind_as_texture(shader, "input_tx");
new_chroma.bind_as_texture(shader, "new_chroma_tx");
Result output = Result::Temporary(ResultType::Color, context().texture_pool());
output.allocate_texture(input.domain());
output.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, input.domain().size);
GPU_shader_unbind();
input.unbind_as_texture();
new_chroma.unbind_as_texture();
output.unbind_as_image();
return output;
}
Result compute_matte(Result &input)
{
GPUShader *shader = context().shader_manager().get("compositor_keying_compute_matte");
GPU_shader_bind(shader);
GPU_shader_uniform_1f(shader, "key_balance", node_storage(bnode()).screen_balance);
input.bind_as_texture(shader, "input_tx");
Result &key_color = get_input("Key Color");
key_color.bind_as_texture(shader, "key_tx");
Result output = Result::Temporary(ResultType::Float, context().texture_pool());
output.allocate_texture(input.domain());
output.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, input.domain().size);
GPU_shader_unbind();
input.unbind_as_texture();
key_color.unbind_as_texture();
output.unbind_as_image();
return output;
}
Result compute_tweaked_matte(Result &input_matte)
{
Result &output_edges = get_result("Edges");
const float black_level = node_storage(bnode()).clip_black;
const float white_level = node_storage(bnode()).clip_white;
const bool core_matte_exists = node().input_by_identifier("Core Matte")->is_logically_linked();
const bool garbage_matte_exists =
node().input_by_identifier("Garbage Matte")->is_logically_linked();
/* The edges output is not needed and the matte is not tweaked, so return the original matte.
* We also increment the reference count of the input because the caller will release it after
* the call, and we want to extend its life since it is now returned as the output. */
if (!output_edges.should_compute() && (black_level == 0.0f && white_level == 1.0f) &&
!core_matte_exists && !garbage_matte_exists)
{
Result output_matte = input_matte;
input_matte.increment_reference_count();
return output_matte;
}
GPUShader *shader = context().shader_manager().get("compositor_keying_tweak_matte");
GPU_shader_bind(shader);
GPU_shader_uniform_1b(shader, "compute_edges", output_edges.should_compute());
GPU_shader_uniform_1b(shader, "apply_core_matte", core_matte_exists);
GPU_shader_uniform_1b(shader, "apply_garbage_matte", garbage_matte_exists);
GPU_shader_uniform_1i(shader, "edge_search_radius", node_storage(bnode()).edge_kernel_radius);
GPU_shader_uniform_1f(shader, "edge_tolerance", node_storage(bnode()).edge_kernel_tolerance);
GPU_shader_uniform_1f(shader, "black_level", black_level);
GPU_shader_uniform_1f(shader, "white_level", white_level);
input_matte.bind_as_texture(shader, "input_matte_tx");
Result &garbage_matte = get_input("Garbage Matte");
garbage_matte.bind_as_texture(shader, "garbage_matte_tx");
Result &core_matte = get_input("Core Matte");
core_matte.bind_as_texture(shader, "core_matte_tx");
Result output_matte = Result::Temporary(ResultType::Float, context().texture_pool());
output_matte.allocate_texture(input_matte.domain());
output_matte.bind_as_image(shader, "output_matte_img");
output_edges.allocate_texture(input_matte.domain());
output_edges.bind_as_image(shader, "output_edges_img");
compute_dispatch_threads_at_least(shader, input_matte.domain().size);
GPU_shader_unbind();
input_matte.unbind_as_texture();
garbage_matte.unbind_as_texture();
core_matte.unbind_as_texture();
output_matte.unbind_as_image();
output_edges.unbind_as_image();
return output_matte;
}
Result compute_blurred_matte(Result &input_matte)
{
const float blur_size = node_storage(bnode()).blur_post;
/* No blur needed, return the original matte. We also increment the reference count of the
* input because the caller will release it after the call, and we want to extend its life
* since it is now returned as the output. */
if (blur_size == 0.0f) {
Result output_matte = input_matte;
input_matte.increment_reference_count();
return output_matte;
}
Result blurred_matte = Result::Temporary(ResultType::Float, context().texture_pool());
symmetric_separable_blur(context(), input_matte, blurred_matte, float2(blur_size));
return blurred_matte;
}
Result compute_morphed_matte(Result &input_matte)
{
const int distance = node_storage(bnode()).dilate_distance;
/* No morphology needed, return the original matte. We also increment the reference count of
* the input because the caller will release it after the call, and we want to extend its life
* since it is now returned as the output. */
if (distance == 0) {
Result output_matte = input_matte;
input_matte.increment_reference_count();
return output_matte;
}
Result morphed_matte = Result::Temporary(ResultType::Float, context().texture_pool());
morphological_distance(context(), input_matte, morphed_matte, distance);
return morphed_matte;
}
Result compute_feathered_matte(Result &input_matte)
{
const int distance = node_storage(bnode()).feather_distance;
/* No feathering needed, return the original matte. We also increment the reference count of
* the input because the caller will release it after the call, and we want to extend its life
* since it is now returned as the output. */
if (distance == 0) {
Result output_matte = input_matte;
input_matte.increment_reference_count();
return output_matte;
}
Result feathered_matte = Result::Temporary(ResultType::Float, context().texture_pool());
morphological_distance_feather(
context(), input_matte, feathered_matte, distance, node_storage(bnode()).feather_falloff);
return feathered_matte;
}
void compute_image(Result &matte)
{
GPUShader *shader = context().shader_manager().get("compositor_keying_compute_image");
GPU_shader_bind(shader);
GPU_shader_uniform_1f(shader, "despill_factor", node_storage(bnode()).despill_factor);
GPU_shader_uniform_1f(shader, "despill_balance", node_storage(bnode()).despill_balance);
Result &input = get_input("Image");
input.bind_as_texture(shader, "input_tx");
Result &key = get_input("Key Color");
key.bind_as_texture(shader, "key_tx");
matte.bind_as_texture(shader, "matte_tx");
Result &output = get_result("Image");
output.allocate_texture(matte.domain());
output.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, input.domain().size);
GPU_shader_unbind();
input.unbind_as_texture();
key.unbind_as_texture();
matte.unbind_as_texture();
output.unbind_as_image();
}
};
@ -99,8 +381,6 @@ void register_node_type_cmp_keying()
node_type_storage(
&ntype, "NodeKeyingData", node_free_standard_storage, node_copy_standard_storage);
ntype.get_compositor_operation = file_ns::get_compositor_operation;
ntype.realtime_compositor_unsupported_message = N_(
"Node not supported in the Viewport compositor");
nodeRegisterType(&ntype);
}