Realtime Compositor: Implement Classic Kuwahara

This patch implements the Classic Kuwahara node for the Realtime Compositor.

A naive O(radius^2) implementation is used for radii up to 5 pixels, and a
constant O(1) implementation based on summed area tables is used for higher
radii at the cost of building and storing the tables.

This is different from the CPU implementation in that it computes the variance
as the average of the variance of each of the individual channels. This is done
to avoid computing yet another SAT table for luminance. The CPU implementation
will be adapted to match this in a future commit.

The SAT implementation is based on the algorithm described in:

Nehab, Diego, et al. "GPU-efficient recursive filtering and summed-area tables."

Additionally, the Result class now allows full precision texture allocation, which
was necessary for storing the SAT tables.

Pull Request: https://projects.blender.org/blender/blender/pulls/109292
This commit is contained in:
Omar Emara 2023-07-19 14:04:18 +02:00 committed by Omar Emara
parent 4c72dc98c2
commit 940558f9ac
17 changed files with 751 additions and 72 deletions

@ -66,12 +66,14 @@ set(SRC
algorithms/intern/morphological_distance_feather.cc algorithms/intern/morphological_distance_feather.cc
algorithms/intern/parallel_reduction.cc algorithms/intern/parallel_reduction.cc
algorithms/intern/smaa.cc algorithms/intern/smaa.cc
algorithms/intern/summed_area_table.cc
algorithms/intern/symmetric_separable_blur.cc algorithms/intern/symmetric_separable_blur.cc
algorithms/COM_algorithm_morphological_distance.hh algorithms/COM_algorithm_morphological_distance.hh
algorithms/COM_algorithm_morphological_distance_feather.hh algorithms/COM_algorithm_morphological_distance_feather.hh
algorithms/COM_algorithm_parallel_reduction.hh algorithms/COM_algorithm_parallel_reduction.hh
algorithms/COM_algorithm_smaa.hh algorithms/COM_algorithm_smaa.hh
algorithms/COM_algorithm_summed_area_table.hh
algorithms/COM_algorithm_symmetric_separable_blur.hh algorithms/COM_algorithm_symmetric_separable_blur.hh
cached_resources/intern/cached_mask.cc cached_resources/intern/cached_mask.cc
@ -140,6 +142,7 @@ set(GLSL_SRC
shaders/compositor_keying_extract_chroma.glsl shaders/compositor_keying_extract_chroma.glsl
shaders/compositor_keying_replace_chroma.glsl shaders/compositor_keying_replace_chroma.glsl
shaders/compositor_keying_tweak_matte.glsl shaders/compositor_keying_tweak_matte.glsl
shaders/compositor_kuwahara_classic.glsl
shaders/compositor_map_uv.glsl shaders/compositor_map_uv.glsl
shaders/compositor_morphological_distance.glsl shaders/compositor_morphological_distance.glsl
shaders/compositor_morphological_distance_feather.glsl shaders/compositor_morphological_distance_feather.glsl
@ -158,6 +161,10 @@ set(GLSL_SRC
shaders/compositor_smaa_edge_detection.glsl shaders/compositor_smaa_edge_detection.glsl
shaders/compositor_smaa_neighborhood_blending.glsl shaders/compositor_smaa_neighborhood_blending.glsl
shaders/compositor_split_viewer.glsl shaders/compositor_split_viewer.glsl
shaders/compositor_summed_area_table_compute_complete_blocks.glsl
shaders/compositor_summed_area_table_compute_complete_x_prologues.glsl
shaders/compositor_summed_area_table_compute_complete_y_prologues.glsl
shaders/compositor_summed_area_table_compute_incomplete_prologues.glsl
shaders/compositor_sun_beams.glsl shaders/compositor_sun_beams.glsl
shaders/compositor_symmetric_blur.glsl shaders/compositor_symmetric_blur.glsl
shaders/compositor_symmetric_blur_variable_size.glsl shaders/compositor_symmetric_blur_variable_size.glsl
@ -196,6 +203,7 @@ set(GLSL_SRC
shaders/library/gpu_shader_compositor_separate_combine.glsl shaders/library/gpu_shader_compositor_separate_combine.glsl
shaders/library/gpu_shader_compositor_set_alpha.glsl shaders/library/gpu_shader_compositor_set_alpha.glsl
shaders/library/gpu_shader_compositor_store_output.glsl shaders/library/gpu_shader_compositor_store_output.glsl
shaders/library/gpu_shader_compositor_summed_area_table_lib.glsl
shaders/library/gpu_shader_compositor_texture_utilities.glsl shaders/library/gpu_shader_compositor_texture_utilities.glsl
shaders/library/gpu_shader_compositor_type_conversion.glsl shaders/library/gpu_shader_compositor_type_conversion.glsl
) )
@ -245,6 +253,7 @@ set(SRC_SHADER_CREATE_INFOS
shaders/infos/compositor_id_mask_info.hh shaders/infos/compositor_id_mask_info.hh
shaders/infos/compositor_image_crop_info.hh shaders/infos/compositor_image_crop_info.hh
shaders/infos/compositor_keying_info.hh shaders/infos/compositor_keying_info.hh
shaders/infos/compositor_kuwahara_info.hh
shaders/infos/compositor_map_uv_info.hh shaders/infos/compositor_map_uv_info.hh
shaders/infos/compositor_morphological_distance_feather_info.hh shaders/infos/compositor_morphological_distance_feather_info.hh
shaders/infos/compositor_morphological_distance_info.hh shaders/infos/compositor_morphological_distance_info.hh
@ -261,6 +270,7 @@ set(SRC_SHADER_CREATE_INFOS
shaders/infos/compositor_screen_lens_distortion_info.hh shaders/infos/compositor_screen_lens_distortion_info.hh
shaders/infos/compositor_smaa_info.hh shaders/infos/compositor_smaa_info.hh
shaders/infos/compositor_split_viewer_info.hh shaders/infos/compositor_split_viewer_info.hh
shaders/infos/compositor_summed_area_table_info.hh
shaders/infos/compositor_sun_beams_info.hh shaders/infos/compositor_sun_beams_info.hh
shaders/infos/compositor_symmetric_blur_info.hh shaders/infos/compositor_symmetric_blur_info.hh
shaders/infos/compositor_symmetric_blur_variable_size_info.hh shaders/infos/compositor_symmetric_blur_variable_size_info.hh

@ -25,6 +25,11 @@ enum class ResultType : uint8_t {
Color, Color,
}; };
enum class ResultPrecision : uint8_t {
Full,
Half,
};
/* ------------------------------------------------------------------------------------------------ /* ------------------------------------------------------------------------------------------------
* Result * Result
* *
@ -59,8 +64,11 @@ enum class ResultType : uint8_t {
* pass_through method, see that method for more details. */ * pass_through method, see that method for more details. */
class Result { class Result {
private: private:
/* The base type of the texture or the type of the single value. */ /* The base type of the result's texture or single value. */
ResultType type_; ResultType type_;
/* The precision of the result's texture, host-side single values are always stored using full
* precision. */
ResultPrecision precision_ = ResultPrecision::Half;
/* If true, the result is a single value, otherwise, the result is a texture. */ /* If true, the result is a single value, otherwise, the result is a texture. */
bool is_single_value_; bool is_single_value_;
/* A GPU texture storing the result data. This will be a 1x1 texture if the result is a single /* A GPU texture storing the result data. This will be a 1x1 texture if the result is a single
@ -103,14 +111,18 @@ class Result {
Result *master_ = nullptr; Result *master_ = nullptr;
public: public:
/* Construct a result of the given type with the given texture pool that will be used to allocate /* Construct a result of the given type and precision with the given texture pool that will be
* and release the result's texture. */ * used to allocate and release the result's texture. */
Result(ResultType type, TexturePool &texture_pool); Result(ResultType type,
TexturePool &texture_pool,
ResultPrecision precision = ResultPrecision::Half);
/* Identical to the standard constructor but initializes the reference count to 1. This is useful /* Identical to the standard constructor but initializes the reference count to 1. This is useful
* to construct temporary results that are created and released by the developer manually, which * to construct temporary results that are created and released by the developer manually, which
* are typically used in operations that need temporary intermediate results. */ * are typically used in operations that need temporary intermediate results. */
static Result Temporary(ResultType type, TexturePool &texture_pool); static Result Temporary(ResultType type,
TexturePool &texture_pool,
ResultPrecision precision = ResultPrecision::Half);
/* Declare the result to be a texture result, allocate a texture of an appropriate type with /* Declare the result to be a texture result, allocate a texture of an appropriate type with
* the size of the given domain from the result's texture pool, and set the domain of the result * the size of the given domain from the result's texture pool, and set the domain of the result
@ -267,6 +279,10 @@ class Result {
/* Returns a reference to the domain of the result. See the Domain class. */ /* Returns a reference to the domain of the result. See the Domain class. */
const Domain &domain() const; const Domain &domain() const;
private:
/* Returns the appropriate texture format based on the result's type and precision. */
eGPUTextureFormat get_texture_format() const;
}; };
} // namespace blender::realtime_compositor } // namespace blender::realtime_compositor

@ -59,16 +59,6 @@ class TexturePool {
* be uncleared and possibly contains garbage data. */ * be uncleared and possibly contains garbage data. */
GPUTexture *acquire(int2 size, eGPUTextureFormat format); GPUTexture *acquire(int2 size, eGPUTextureFormat format);
/* Shorthand for acquire with GPU_RGBA16F format. */
GPUTexture *acquire_color(int2 size);
/* Shorthand for acquire with GPU_RGBA16F format. Identical to acquire_color because vectors are
* 4D, and are thus stored in RGBA textures. */
GPUTexture *acquire_vector(int2 size);
/* Shorthand for acquire with GPU_R16F format. */
GPUTexture *acquire_float(int2 size);
/* Put the texture back into the pool, potentially to be acquired later by another user. Expects /* Put the texture back into the pool, potentially to be acquired later by another user. Expects
* the texture to be one that was acquired using the same texture pool. */ * the texture to be one that was acquired using the same texture pool. */
void release(GPUTexture *texture); void release(GPUTexture *texture);

@ -0,0 +1,29 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "COM_context.hh"
#include "COM_result.hh"
namespace blender::realtime_compositor {
/* Possible operations to apply on pixels before computing the summed area table. The Square
 * operation, for instance, can be useful to compute image variance from the sum of squares. */
enum class SummedAreaTableOperation : uint8_t {
  Identity,
  Square,
};

/* Computes a summed area table from the given input and writes the table to the given output. A
 * summed area table is an image where each pixel contains the sum of all pixels in the area down
 * and to its left toward the zero index, including the pixel itself. This table is particularly
 * useful to accelerate filters that require averaging large rectangular areas of the input, like
 * a box filter. */
void summed_area_table(Context &context,
                       Result &input,
                       Result &output,
                       SummedAreaTableOperation operation = SummedAreaTableOperation::Identity);
} // namespace blender::realtime_compositor

@ -0,0 +1,227 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "BLI_assert.h"
#include "BLI_math_base.hh"
#include "BLI_math_vector.hh"
#include "BLI_math_vector_types.hh"

#include "GPU_compute.h"
#include "GPU_shader.h"
#include "GPU_texture.h"

#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_utilities.hh"

#include "COM_algorithm_summed_area_table.hh"
namespace blender::realtime_compositor {
/* ------------------------------------------------------------------------------------------------
* Summed Area Table
*
* An implementation of the summed area table algorithm from the paper:
*
* Nehab, Diego, et al. "GPU-efficient recursive filtering and summed-area tables."
*
* This file is a straightforward implementation of each of the four passes described in
* Algorithm SAT in section 6 of the paper. Note that we use Blender's convention of first
* quadrant images, so we call prologues horizontal or X prologues, and we call transposed
* prologues vertical or Y prologues. See each of the functions for more details. */
/* Returns the name of the shader variant that applies the given operation on the input pixels
 * before accumulating them into the incomplete prologues. */
static const char *get_compute_incomplete_prologues_shader(SummedAreaTableOperation operation)
{
  switch (operation) {
    case SummedAreaTableOperation::Identity:
      return "compositor_summed_area_table_compute_incomplete_prologues_identity";
    case SummedAreaTableOperation::Square:
      return "compositor_summed_area_table_compute_incomplete_prologues_square";
  }

  /* Flowing off the end of a value-returning function is undefined behavior, so guard against
   * invalid enum values, matching the pattern used by Result::get_texture_format. */
  BLI_assert_unreachable();
  return nullptr;
}
/* Computes the horizontal and vertical incomplete prologues from the given input using equations
 * (42) and (43) to implement the first pass of Algorithm SAT. Those equations accumulatively sum
 * each row in each block, writing the final sum to the X incomplete block, then sum each column in
 * the X accumulatively summed block, writing the final sum to the Y incomplete block. The output
 * is the prologues along the horizontal and vertical directions, where the accumulation axis is
 * stored along the vertical axis, so the X prologues are stored transposed for better cache
 * locality. */
static void compute_incomplete_prologues(Context &context,
                                         Result &input,
                                         SummedAreaTableOperation operation,
                                         Result &incomplete_x_prologues,
                                         Result &incomplete_y_prologues)
{
  GPUShader *shader = context.shader_manager().get(
      get_compute_incomplete_prologues_shader(operation));
  GPU_shader_bind(shader);

  input.bind_as_texture(shader, "input_tx");

  /* The input is processed in square blocks, one thread group per block. The group size is
   * assumed to match the work group size declared in the shader create info -- confirm there if
   * it is ever changed. */
  const int2 group_size = int2(16);
  const int2 input_size = input.domain().size;
  const int2 number_of_groups = math::divide_ceil(input_size, group_size);

  /* The X prologues are stored transposed, one row of prologues per column of blocks, hence the
   * swapped size components. */
  incomplete_x_prologues.allocate_texture(Domain(int2(input_size.y, number_of_groups.x)));
  incomplete_x_prologues.bind_as_image(shader, "incomplete_x_prologues_img");

  incomplete_y_prologues.allocate_texture(Domain(int2(input_size.x, number_of_groups.y)));
  incomplete_y_prologues.bind_as_image(shader, "incomplete_y_prologues_img");

  /* One thread group per block of the input. */
  GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);

  /* Unbind everything that was bound above. */
  GPU_shader_unbind();
  input.unbind_as_texture();
  incomplete_x_prologues.unbind_as_image();
  incomplete_y_prologues.unbind_as_image();
}
/* Computes the complete X prologues and their sum from the incomplete X prologues using equation
 * (44) to implement the second pass of Algorithm SAT. That equation simply sums the incomplete
 * prologue and all incomplete prologues before it, writing the sum to the complete prologue. Then,
 * each of the complete prologues is summed using parallel reduction writing the sum to the output
 * sum for each block. The shader runs in parallel vertically, but serially horizontally. Note that
 * the input incomplete X prologues and output complete X prologues are stored transposed for
 * better cache locality, but the output sum is stored straight, not transposed. */
static void compute_complete_x_prologues(Context &context,
                                         Result &input,
                                         Result &incomplete_x_prologues,
                                         Result &complete_x_prologues,
                                         Result &complete_x_prologues_sum)
{
  GPUShader *shader = context.shader_manager().get(
      "compositor_summed_area_table_compute_complete_x_prologues");
  GPU_shader_bind(shader);

  incomplete_x_prologues.bind_as_texture(shader, "incomplete_x_prologues_tx");

  /* Group size is assumed to match the work group size declared in the shader create info. */
  const int2 group_size = int2(16);
  const int2 input_size = input.domain().size;
  const int2 number_of_groups = math::divide_ceil(input_size, group_size);

  /* The complete prologues have the same (transposed) layout as the incomplete ones. */
  complete_x_prologues.allocate_texture(incomplete_x_prologues.domain());
  complete_x_prologues.bind_as_image(shader, "complete_x_prologues_img");

  /* One sum per block, stored straight (not transposed). */
  complete_x_prologues_sum.allocate_texture(Domain(number_of_groups));
  complete_x_prologues_sum.bind_as_image(shader, "complete_x_prologues_sum_img");

  /* Parallel vertically, serial horizontally, hence the single-axis dispatch over the vertical
   * group count. */
  GPU_compute_dispatch(shader, number_of_groups.y, 1, 1);

  /* Unbind everything that was bound above. */
  GPU_shader_unbind();
  incomplete_x_prologues.unbind_as_texture();
  complete_x_prologues.unbind_as_image();
  complete_x_prologues_sum.unbind_as_image();
}
/* Computes the complete Y prologues from the incomplete Y prologues using equation (45) to
 * implement the third pass of Algorithm SAT. That equation simply sums the incomplete prologue and
 * all incomplete prologues before it, then adds the sum of the complete X prologue for the same
 * block, writing the sum to the complete prologue. The shader runs in parallel horizontally, but
 * serially vertically. */
static void compute_complete_y_prologues(Context &context,
                                         Result &input,
                                         Result &incomplete_y_prologues,
                                         Result &complete_x_prologues_sum,
                                         Result &complete_y_prologues)
{
  GPUShader *shader = context.shader_manager().get(
      "compositor_summed_area_table_compute_complete_y_prologues");
  GPU_shader_bind(shader);

  incomplete_y_prologues.bind_as_texture(shader, "incomplete_y_prologues_tx");
  complete_x_prologues_sum.bind_as_texture(shader, "complete_x_prologues_sum_tx");

  /* Group size is assumed to match the work group size declared in the shader create info. */
  const int2 group_size = int2(16);
  const int2 input_size = input.domain().size;
  const int2 number_of_groups = math::divide_ceil(input_size, group_size);

  complete_y_prologues.allocate_texture(incomplete_y_prologues.domain());
  complete_y_prologues.bind_as_image(shader, "complete_y_prologues_img");

  /* Parallel horizontally, serial vertically, hence the single-axis dispatch over the horizontal
   * group count. */
  GPU_compute_dispatch(shader, number_of_groups.x, 1, 1);

  /* Unbind everything that was bound above. */
  GPU_shader_unbind();
  incomplete_y_prologues.unbind_as_texture();
  complete_x_prologues_sum.unbind_as_texture();
  complete_y_prologues.unbind_as_image();
}
/* Returns the name of the shader variant that applies the given operation on the input pixels
 * before accumulating them into the complete blocks. */
static const char *get_compute_complete_blocks_shader(SummedAreaTableOperation operation)
{
  switch (operation) {
    case SummedAreaTableOperation::Identity:
      return "compositor_summed_area_table_compute_complete_blocks_identity";
    case SummedAreaTableOperation::Square:
      return "compositor_summed_area_table_compute_complete_blocks_square";
  }

  /* Flowing off the end of a value-returning function is undefined behavior, so guard against
   * invalid enum values, matching the pattern used by Result::get_texture_format. */
  BLI_assert_unreachable();
  return nullptr;
}
/* Computes the final summed area table blocks from the complete X and Y prologues using equation
 * (41) to implement the fourth pass of Algorithm SAT. That equation simply uses an intermediate
 * shared memory to cascade the accumulation of rows and then columns in each block using the
 * prologues as initial values and writes each step of the latter accumulation to the output. */
static void compute_complete_blocks(Context &context,
                                    Result &input,
                                    Result &complete_x_prologues,
                                    Result &complete_y_prologues,
                                    SummedAreaTableOperation operation,
                                    Result &output)
{
  GPUShader *shader = context.shader_manager().get(get_compute_complete_blocks_shader(operation));
  GPU_shader_bind(shader);

  input.bind_as_texture(shader, "input_tx");
  complete_x_prologues.bind_as_texture(shader, "complete_x_prologues_tx");
  complete_y_prologues.bind_as_texture(shader, "complete_y_prologues_tx");

  /* The output table covers the full input domain. The extra boolean argument to bind_as_image
   * differs from the other binds in this file -- presumably it enables read access for the
   * in-shader cascade; confirm against the Result::bind_as_image signature. */
  output.allocate_texture(input.domain());
  output.bind_as_image(shader, "output_img", true);

  /* Group size is assumed to match the work group size declared in the shader create info. */
  const int2 group_size = int2(16);
  const int2 input_size = input.domain().size;
  const int2 number_of_groups = math::divide_ceil(input_size, group_size);

  /* One thread group per block of the input. */
  GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);

  /* Unbind everything that was bound above. */
  GPU_shader_unbind();
  input.unbind_as_texture();
  complete_x_prologues.unbind_as_texture();
  complete_y_prologues.unbind_as_texture();
  output.unbind_as_image();
}
/* Runs the four passes of Algorithm SAT in sequence, allocating full precision intermediate
 * results and releasing each as soon as the last pass that reads it has been dispatched. */
void summed_area_table(Context &context,
                       Result &input,
                       Result &output,
                       SummedAreaTableOperation operation)
{
  /* First pass: per-block incomplete prologues along both axes. Full precision is used because
   * the accumulated sums can far exceed the range of half floats. */
  Result incomplete_x_prologues = Result::Temporary(
      ResultType::Color, context.texture_pool(), ResultPrecision::Full);
  Result incomplete_y_prologues = Result::Temporary(
      ResultType::Color, context.texture_pool(), ResultPrecision::Full);
  compute_incomplete_prologues(
      context, input, operation, incomplete_x_prologues, incomplete_y_prologues);

  /* Second pass: complete X prologues and their per-block sums. */
  Result complete_x_prologues = Result::Temporary(
      ResultType::Color, context.texture_pool(), ResultPrecision::Full);
  Result complete_x_prologues_sum = Result::Temporary(
      ResultType::Color, context.texture_pool(), ResultPrecision::Full);
  compute_complete_x_prologues(
      context, input, incomplete_x_prologues, complete_x_prologues, complete_x_prologues_sum);
  /* The incomplete X prologues are no longer needed after the second pass. */
  incomplete_x_prologues.release();

  /* Third pass: complete Y prologues, which fold in the X prologue sums. */
  Result complete_y_prologues = Result::Temporary(
      ResultType::Color, context.texture_pool(), ResultPrecision::Full);
  compute_complete_y_prologues(
      context, input, incomplete_y_prologues, complete_x_prologues_sum, complete_y_prologues);
  incomplete_y_prologues.release();
  complete_x_prologues_sum.release();

  /* Fourth pass: assemble the final table into the output from the complete prologues. */
  compute_complete_blocks(
      context, input, complete_x_prologues, complete_y_prologues, operation, output);
  complete_x_prologues.release();
  complete_y_prologues.release();
}
} // namespace blender::realtime_compositor

@ -16,19 +16,46 @@
namespace blender::realtime_compositor { namespace blender::realtime_compositor {
Result::Result(ResultType type, TexturePool &texture_pool) Result::Result(ResultType type, TexturePool &texture_pool, ResultPrecision precision)
: type_(type), texture_pool_(&texture_pool) : type_(type), precision_(precision), texture_pool_(&texture_pool)
{ {
} }
Result Result::Temporary(ResultType type, TexturePool &texture_pool) Result Result::Temporary(ResultType type, TexturePool &texture_pool, ResultPrecision precision)
{ {
Result result = Result(type, texture_pool); Result result = Result(type, texture_pool, precision);
result.set_initial_reference_count(1); result.set_initial_reference_count(1);
result.reset(); result.reset();
return result; return result;
} }
eGPUTextureFormat Result::get_texture_format() const
{
  /* Floats are stored in single channel textures, while vectors and colors share 4-channel RGBA
   * textures since vectors are 4D. The precision selects between 16-bit and 32-bit variants. */
  const bool is_full_precision = precision_ == ResultPrecision::Full;
  switch (type_) {
    case ResultType::Float:
      return is_full_precision ? GPU_R32F : GPU_R16F;
    case ResultType::Vector:
    case ResultType::Color:
      return is_full_precision ? GPU_RGBA32F : GPU_RGBA16F;
  }

  BLI_assert_unreachable();
  return GPU_RGBA32F;
}
void Result::allocate_texture(Domain domain) void Result::allocate_texture(Domain domain)
{ {
/* The result is not actually needed, so allocate a dummy single value texture instead. See the /* The result is not actually needed, so allocate a dummy single value texture instead. See the
@ -40,17 +67,7 @@ void Result::allocate_texture(Domain domain)
} }
is_single_value_ = false; is_single_value_ = false;
switch (type_) { texture_ = texture_pool_->acquire(domain.size, get_texture_format());
case ResultType::Float:
texture_ = texture_pool_->acquire_float(domain.size);
break;
case ResultType::Vector:
texture_ = texture_pool_->acquire_vector(domain.size);
break;
case ResultType::Color:
texture_ = texture_pool_->acquire_color(domain.size);
break;
}
domain_ = domain; domain_ = domain;
} }
@ -59,17 +76,7 @@ void Result::allocate_single_value()
is_single_value_ = true; is_single_value_ = true;
/* Single values are stored in 1x1 textures as well as the single value members. */ /* Single values are stored in 1x1 textures as well as the single value members. */
const int2 texture_size{1, 1}; const int2 texture_size{1, 1};
switch (type_) { texture_ = texture_pool_->acquire(texture_size, get_texture_format());
case ResultType::Float:
texture_ = texture_pool_->acquire_float(texture_size);
break;
case ResultType::Vector:
texture_ = texture_pool_->acquire_vector(texture_size);
break;
case ResultType::Color:
texture_ = texture_pool_->acquire_color(texture_size);
break;
}
domain_ = Domain::identity(); domain_ = Domain::identity();
} }

@ -57,22 +57,6 @@ GPUTexture *TexturePool::acquire(int2 size, eGPUTextureFormat format)
return allocate_texture(size, format); return allocate_texture(size, format);
} }
GPUTexture *TexturePool::acquire_color(int2 size)
{
return acquire(size, GPU_RGBA16F);
}
GPUTexture *TexturePool::acquire_vector(int2 size)
{
/* Vectors are 4D, and are thus stored in RGBA textures. */
return acquire(size, GPU_RGBA16F);
}
GPUTexture *TexturePool::acquire_float(int2 size)
{
return acquire(size, GPU_R16F);
}
void TexturePool::release(GPUTexture *texture) void TexturePool::release(GPUTexture *texture)
{ {
textures_.lookup(TexturePoolKey(texture)).append(texture); textures_.lookup(TexturePoolKey(texture)).append(texture);

@ -0,0 +1,60 @@
#pragma BLENDER_REQUIRE(common_math_lib.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_compositor_summed_area_table_lib.glsl)
/* Classic Kuwahara filter: compute the mean and variance of the four quadrants around the current
 * pixel and output the mean color of the quadrant with the least variance. */
void main()
{
  ivec2 texel = ivec2(gl_GlobalInvocationID.xy);

  /* Per-quadrant first and second raw moments of the color, from which the mean and variance of
   * each quadrant are derived below. */
  vec4 mean_of_squared_color_of_quadrants[4] = vec4[](vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0));
  vec4 mean_of_color_of_quadrants[4] = vec4[](vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0));

  /* Compute the above statistics for each of the quadrants around the current pixel. */
  for (int q = 0; q < 4; q++) {
    /* A fancy expression to compute the sign of the quadrant q. It maps q in [0, 3] to the four
     * diagonal directions (-1, -1), (1, -1), (-1, 1), and (1, 1). */
    ivec2 sign = ivec2((q % 2) * 2 - 1, ((q / 2) * 2 - 1));

    /* The quadrant is the window extending radius pixels from the current texel along the
     * direction of the quadrant sign on each axis, so each quadrant covers up to
     * (radius + 1) x (radius + 1) pixels including the current one. */
    ivec2 lower_bound = texel - ivec2(sign.x > 0 ? 0 : radius, sign.y > 0 ? 0 : radius);
    ivec2 upper_bound = texel + ivec2(sign.x < 0 ? 0 : radius, sign.y < 0 ? 0 : radius);

    /* Limit the quadrants to the image bounds. */
    ivec2 image_bound = imageSize(output_img) - ivec2(1);
    ivec2 corrected_lower_bound = min(image_bound, max(ivec2(0), lower_bound));
    ivec2 corrected_upper_bound = min(image_bound, max(ivec2(0), upper_bound));
    ivec2 region_size = corrected_upper_bound - corrected_lower_bound + ivec2(1);
    int quadrant_pixel_count = region_size.x * region_size.y;

#if defined(SUMMED_AREA_TABLE)
    /* O(1) path: fetch the sums from the precomputed summed area tables. Note that the
     * uncorrected bounds are passed; presumably summed_area_table_sum clamps to the table bounds
     * itself -- TODO confirm against gpu_shader_compositor_summed_area_table_lib.glsl. */
    mean_of_color_of_quadrants[q] = summed_area_table_sum(table_tx, lower_bound, upper_bound);
    mean_of_squared_color_of_quadrants[q] = summed_area_table_sum(
        squared_table_tx, lower_bound, upper_bound);
#else
    /* Naive O(radius^2) path: directly accumulate the colors and squared colors of the quadrant.
     * Out-of-bounds loads fall back to the given zero color, so they do not perturb the sums. */
    for (int j = 0; j <= radius; j++) {
      for (int i = 0; i <= radius; i++) {
        vec4 color = texture_load(input_tx, texel + ivec2(i, j) * sign, vec4(0.0));
        mean_of_color_of_quadrants[q] += color;
        mean_of_squared_color_of_quadrants[q] += color * color;
      }
    }
#endif

    /* Divide the sums by the number of in-bounds pixels to turn them into means. */
    mean_of_color_of_quadrants[q] /= quadrant_pixel_count;
    mean_of_squared_color_of_quadrants[q] /= quadrant_pixel_count;
  }

  /* Find the quadrant which has the minimum variance. */
  float minimum_variance = FLT_MAX;
  vec4 mean_color_of_chosen_quadrant = mean_of_color_of_quadrants[0];
  for (int q = 0; q < 4; q++) {
    vec4 color_mean = mean_of_color_of_quadrants[q];
    vec4 squared_color_mean = mean_of_squared_color_of_quadrants[q];
    /* Per-channel variance using the identity Var(X) = E[X^2] - E[X]^2. */
    vec4 color_variance = squared_color_mean - color_mean * color_mean;
    /* Combine the RGB channel variances into a single scalar for comparison. */
    float variance = dot(color_variance.rgb, vec3(1.0));
    if (variance < minimum_variance) {
      minimum_variance = variance;
      mean_color_of_chosen_quadrant = color_mean;
    }
  }

  imageStore(output_img, texel, mean_color_of_chosen_quadrant);
}

@ -0,0 +1,34 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
/* An intermediate shared memory where the result of X accumulation will be stored. */
shared vec4 block[gl_WorkGroupSize.x][gl_WorkGroupSize.y];

/* See the compute_complete_blocks function for a description of this shader. */
void main()
{
  /* Accumulate the block along the horizontal direction starting from the X prologue value,
   * writing each accumulation step to the intermediate shared memory. Only the first thread of
   * each row does this serial accumulation; the rest wait at the barrier below. */
  if (gl_LocalInvocationID.x == 0) {
    /* The X prologues are stored transposed, hence the swapped texel components. */
    ivec2 x_prologue_texel = ivec2(gl_GlobalInvocationID.y, gl_WorkGroupID.x);
    vec4 x_accumulated_color = texture_load(complete_x_prologues_tx, x_prologue_texel, vec4(0.0));
    for (int i = 0; i < gl_WorkGroupSize.x; i++) {
      ivec2 texel = ivec2(gl_WorkGroupID.x * gl_WorkGroupSize.x + i, gl_GlobalInvocationID.y);
      /* OPERATION is substituted by the shader variant, e.g. identity or square. */
      x_accumulated_color += OPERATION(texture_load(input_tx, texel, vec4(0.0)));
      block[i][gl_LocalInvocationID.y] = x_accumulated_color;
    }
  }

  /* Make sure the result of X accumulation is completely done. */
  barrier();

  /* Accumulate the block along the vertical direction starting from the Y prologue value,
   * writing each accumulation step to the output image. Only the first thread of each column
   * does this serial accumulation. */
  if (gl_LocalInvocationID.y == 0) {
    ivec2 y_prologue_texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y);
    vec4 y_accumulated_color = texture_load(complete_y_prologues_tx, y_prologue_texel, vec4(0.0));
    for (int i = 0; i < gl_WorkGroupSize.y; i++) {
      y_accumulated_color += block[gl_LocalInvocationID.x][i];
      ivec2 texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y * gl_WorkGroupSize.y + i);
      imageStore(output_img, texel, y_accumulated_color);
    }
  }
}

@ -0,0 +1,53 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
/* A shared memory to sum the prologues using parallel reduction. See the parallel reduction shader
 * "compositor_parallel_reduction.glsl" for more information. */
shared vec4 complete_prologue[gl_WorkGroupSize.x];

/* See the compute_complete_x_prologues function for a description of this shader. */
void main()
{
  /* Note that the X prologues are stored transposed, hence the horizontal dispatch domain, even
   * though, conceptually, the dispatch domain covers the vertical axis of the image. */
  int x = int(gl_GlobalInvocationID.x);

  /* Serially accumulate the incomplete prologues along the vertical axis, writing every partial
   * sum, which is a complete prologue, to the output. */
  vec4 accumulated_color = vec4(0.0);
  for (int y = 0; y < texture_size(incomplete_x_prologues_tx).y; y++) {
    accumulated_color += texture_load(incomplete_x_prologues_tx, ivec2(x, y), vec4(0.0));
    imageStore(complete_x_prologues_img, ivec2(x, y), accumulated_color);

    if (gl_WorkGroupID.x == 0) {
      /* Note that the first row of sums is the result of summing the prologues of a virtual block
       * that is before the first row of blocks and we assume that those prologues are all zeros,
       * so we set the sum to zero in that case. This is implemented by setting the sums of the
       * first vertical workgroup to zero, while later workgroups are summed as usual and
       * stored starting from the second row. */
      imageStore(complete_x_prologues_sum_img, ivec2(y, 0), vec4(0.0));
    }

    /* A parallel reduction loop to sum the prologues. This is exactly the same as the parallel
     * reduction loop in the shader "compositor_parallel_reduction.glsl", see that shader for
     * more information. */
    complete_prologue[gl_LocalInvocationIndex] = accumulated_color;
    for (uint stride = gl_WorkGroupSize.x / 2; stride > 0; stride /= 2) {
      barrier();
      if (gl_LocalInvocationIndex >= stride) {
        continue;
      }

      complete_prologue[gl_LocalInvocationIndex] =
          complete_prologue[gl_LocalInvocationIndex] +
          complete_prologue[gl_LocalInvocationIndex + stride];
    }

    /* Ensure the reduction is finished before the first thread reads the total below, and before
     * the next iteration overwrites the shared memory. */
    barrier();

    if (gl_LocalInvocationIndex == 0) {
      /* Note that we store using a transposed texel, but that is only to undo the transposition
       * mentioned above. Also note that we start from the second row because the first row is
       * set to zero as mentioned above. */
      vec4 sum = complete_prologue[0];
      imageStore(complete_x_prologues_sum_img, ivec2(y, gl_WorkGroupID.x + 1), sum);
    }
  }
}

@ -0,0 +1,14 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
/* See the compute_complete_y_prologues function for a description of this shader. */
void main()
{
  int column = int(gl_GlobalInvocationID.x);

  /* Serially accumulate along the vertical axis, folding in the complete X prologue sum of each
   * block row, and write every partial sum, which is a complete Y prologue, to the output. */
  vec4 running_sum = vec4(0.0);
  for (int row = 0; row < texture_size(incomplete_y_prologues_tx).y; row++) {
    vec4 incomplete_prologue = texture_load(incomplete_y_prologues_tx, ivec2(column, row));
    vec4 x_prologues_sum = texture_load(complete_x_prologues_sum_tx, ivec2(gl_WorkGroupID.x, row));
    running_sum += incomplete_prologue + x_prologues_sum;
    imageStore(complete_y_prologues_img, ivec2(column, row), running_sum);
  }
}

@ -0,0 +1,52 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
/* An intermediate shared memory where the result of X accumulation will be stored. */
shared vec4 block[gl_WorkGroupSize.x][gl_WorkGroupSize.y];

/* See the compute_incomplete_prologues function for a description of this shader. */
void main()
{
  /* Accumulate the block along the horizontal direction writing each accumulation step to the
   * intermediate shared memory block, and writing the final accumulated value to the suitable
   * prologue. */
  if (gl_LocalInvocationID.x == 0) {
    /* Only the first invocation in each row serially scans that row of the work group's block.
     * OPERATION is defined per shader variant (identity or squaring) and out of bound loads
     * fall back to zero. */
    vec4 x_accumulated_color = vec4(0.0);
    for (int i = 0; i < gl_WorkGroupSize.x; i++) {
      ivec2 texel = ivec2(gl_WorkGroupID.x * gl_WorkGroupSize.x + i, gl_GlobalInvocationID.y);
      x_accumulated_color += OPERATION(texture_load(input_tx, texel, vec4(0.0)));
      block[i][gl_LocalInvocationID.y] = x_accumulated_color;
    }

    /* Note that the first column of prologues is the result of accumulating a virtual block that
     * is before the first column of blocks and we assume that this block is all zeros, so we set
     * the prologue to zero as well. This is implemented by writing starting from the second column
     * and writing zero to the first column, hence the plus one in the write_texel. */
    /* NOTE(review): the write texel swaps the y coordinate into x, that is, prologues are stored
     * transposed — presumably so the next pass can scan them along rows; confirm against the
     * compute_incomplete_prologues host function. */
    ivec2 write_texel = ivec2(gl_GlobalInvocationID.y, gl_WorkGroupID.x + 1);
    imageStore(incomplete_x_prologues_img, write_texel, x_accumulated_color);
    if (gl_WorkGroupID.x == 0) {
      imageStore(incomplete_x_prologues_img, ivec2(write_texel.x, 0), vec4(0.0));
    }
  }

  /* Make sure the result of X accumulation is completely done. */
  barrier();

  /* Accumulate the block along the vertical direction writing the final accumulated value to the
   * suitable prologue. */
  if (gl_LocalInvocationID.y == 0) {
    /* Only the first invocation in each column sums the shared partial X results down that
     * column. */
    vec4 y_accumulated_color = vec4(0.0);
    for (int i = 0; i < gl_WorkGroupSize.y; i++) {
      y_accumulated_color += block[gl_LocalInvocationID.x][i];
    }

    /* Note that the first row of prologues is the result of accumulating a virtual block that is
     * before the first row of blocks and we assume that this block is all zeros, so we set the
     * prologue to zero as well. This is implemented by writing starting from the second row and
     * writing zero to the first row, hence the plus one in the write_texel. */
    ivec2 write_texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y + 1);
    imageStore(incomplete_y_prologues_img, write_texel, y_accumulated_color);
    if (gl_WorkGroupID.y == 0) {
      imageStore(incomplete_y_prologues_img, ivec2(write_texel.x, 0), vec4(0.0));
    }
  }
}

@ -0,0 +1,23 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "gpu_shader_create_info.hh"
/* Configuration shared by both Classic Kuwahara variants below: a 16x16 work group, the filter
 * radius as a push constant, and a half float color output image. */
GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_shared)
    .local_group_size(16, 16)
    .push_constant(Type::INT, "radius")
    .image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
    .compute_source("compositor_kuwahara_classic.glsl");

/* Naive variant that samples the input directly, used for small radii. */
GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic)
    .additional_info("compositor_kuwahara_classic_shared")
    .sampler(0, ImageType::FLOAT_2D, "input_tx")
    .do_static_compilation(true);

/* Constant time variant that reads precomputed summed area tables of the input and its square
 * instead of the input itself, used for large radii. */
GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table)
    .additional_info("compositor_kuwahara_classic_shared")
    .define("SUMMED_AREA_TABLE")
    .sampler(0, ImageType::FLOAT_2D, "table_tx")
    .sampler(1, ImageType::FLOAT_2D, "squared_table_tx")
    .do_static_compilation(true);

@ -0,0 +1,56 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "gpu_shader_create_info.hh"
/* First pass: accumulate each work group's block independently, writing the per-block X and Y
 * prologues. The prologue images use full float precision since they store running sums. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_incomplete_prologues_shared)
    .local_group_size(16, 16)
    .sampler(0, ImageType::FLOAT_2D, "input_tx")
    .image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_x_prologues_img")
    .image(1, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_y_prologues_img")
    .compute_source("compositor_summed_area_table_compute_incomplete_prologues.glsl");

/* Variant accumulating the input values as is, producing a standard summed area table. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_incomplete_prologues_identity)
    .additional_info("compositor_summed_area_table_compute_incomplete_prologues_shared")
    .define("OPERATION(value)", "value")
    .do_static_compilation(true);

/* Variant accumulating the square of the input values, used for variance computation. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_incomplete_prologues_square)
    .additional_info("compositor_summed_area_table_compute_incomplete_prologues_shared")
    .define("OPERATION(value)", "value * value")
    .do_static_compilation(true);

/* Second pass: turn the incomplete X prologues into complete ones and also produce their sums. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_x_prologues)
    .local_group_size(16)
    .sampler(0, ImageType::FLOAT_2D, "incomplete_x_prologues_tx")
    .image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "complete_x_prologues_img")
    .image(1, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "complete_x_prologues_sum_img")
    .compute_source("compositor_summed_area_table_compute_complete_x_prologues.glsl")
    .do_static_compilation(true);

/* Third pass: complete the Y prologues using the sums produced by the previous pass. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_y_prologues)
    .local_group_size(16)
    .sampler(0, ImageType::FLOAT_2D, "incomplete_y_prologues_tx")
    .sampler(1, ImageType::FLOAT_2D, "complete_x_prologues_sum_tx")
    .image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "complete_y_prologues_img")
    .compute_source("compositor_summed_area_table_compute_complete_y_prologues.glsl")
    .do_static_compilation(true);

/* Final pass: combine the block accumulations with the complete prologues to produce the final
 * table. The output is READ_WRITE since the shader reads back partial results it wrote. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_shared)
    .local_group_size(16, 16)
    .sampler(0, ImageType::FLOAT_2D, "input_tx")
    .sampler(1, ImageType::FLOAT_2D, "complete_x_prologues_tx")
    .sampler(2, ImageType::FLOAT_2D, "complete_y_prologues_tx")
    .image(0, GPU_RGBA32F, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "output_img")
    .compute_source("compositor_summed_area_table_compute_complete_blocks.glsl");

/* Identity and square variants mirroring the incomplete prologues variants above; the same
 * OPERATION must be used across both passes for a consistent table. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_identity)
    .additional_info("compositor_summed_area_table_compute_complete_blocks_shared")
    .define("OPERATION(value)", "value")
    .do_static_compilation(true);

GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_square)
    .additional_info("compositor_summed_area_table_compute_complete_blocks_shared")
    .define("OPERATION(value)", "value * value")
    .do_static_compilation(true);

@ -0,0 +1,46 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
/* Computes the sum of the rectangular region defined by the given lower and upper bounds from the
* given summed area table. It is assumed that the given upper bound is larger than the given lower
* bound, otherwise, undefined behavior is invoked. Looking at the diagram below, in order to
* compute the sum of area X, we sample the table at each of the corners of the area X, to get:
*
* Upper Right -> A + B + C + X (1)
* Upper Left -> A + B (2)
* Lower Right -> B + C (3)
* Lower Left -> B (4)
*
* We start from (1) and subtract (2) and (3) to get rid of A and C to get:
*
* (A + B + C + X) - (A + B) - (B + C) = (X - B)
*
* To get rid of B, we add (4) to get:
*
* (X - B) + B = X
*
* ^
* |
* +-------+-----+
* | | |
* | A | X |
* | | |
* +-------+-----+
* | | |
* | B | C |
* | | |
* o-------+-----+------>
*
* The aforementioned equation eliminates the edges between regions X, C, and A since they get
* subtracted with C and A. To avoid this, we subtract 1 from the lower bound and fallback to zero
* for out of bound sampling. */
vec4 summed_area_table_sum(sampler2D table, ivec2 lower_bound, ivec2 upper_bound)
{
  /* Shift the lower bound one pixel down-left so the region's own edge pixels are included, and
   * clamp the upper bound to the table extent. Out of bound loads fall back to zero, as the
   * derivation in the comment above requires. */
  ivec2 low = lower_bound - ivec2(1);
  ivec2 high = min(texture_size(table) - ivec2(1), upper_bound);

  /* Sample the table at the four corners of the region. */
  vec4 upper_right = texture_load(table, high, vec4(0.0));
  vec4 upper_left = texture_load(table, ivec2(low.x, high.y), vec4(0.0));
  vec4 lower_right = texture_load(table, ivec2(high.x, low.y), vec4(0.0));
  vec4 lower_left = texture_load(table, low, vec4(0.0));

  /* Inclusion-exclusion over the four corners, see the diagram above. */
  return (upper_right + lower_left) - (upper_left + lower_right);
}

@ -97,11 +97,12 @@ class DilateErodeOperation : public NodeOperation {
void execute_step() void execute_step()
{ {
GPUTexture *horizontal_pass_result = execute_step_horizontal_pass(); Result horizontal_pass_result = execute_step_horizontal_pass();
execute_step_vertical_pass(horizontal_pass_result); execute_step_vertical_pass(horizontal_pass_result);
horizontal_pass_result.release();
} }
GPUTexture *execute_step_horizontal_pass() Result execute_step_horizontal_pass()
{ {
GPUShader *shader = shader_manager().get(get_morphological_step_shader_name()); GPUShader *shader = shader_manager().get(get_morphological_step_shader_name());
GPU_shader_bind(shader); GPU_shader_bind(shader);
@ -123,20 +124,20 @@ class DilateErodeOperation : public NodeOperation {
const Domain domain = compute_domain(); const Domain domain = compute_domain();
const int2 transposed_domain = int2(domain.size.y, domain.size.x); const int2 transposed_domain = int2(domain.size.y, domain.size.x);
GPUTexture *horizontal_pass_result = texture_pool().acquire_color(transposed_domain); Result horizontal_pass_result = Result::Temporary(ResultType::Color, texture_pool());
const int image_unit = GPU_shader_get_sampler_binding(shader, "output_img"); horizontal_pass_result.allocate_texture(transposed_domain);
GPU_texture_image_bind(horizontal_pass_result, image_unit); horizontal_pass_result.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, domain.size); compute_dispatch_threads_at_least(shader, domain.size);
GPU_shader_unbind(); GPU_shader_unbind();
input_mask.unbind_as_texture(); input_mask.unbind_as_texture();
GPU_texture_image_unbind(horizontal_pass_result); horizontal_pass_result.unbind_as_image();
return horizontal_pass_result; return horizontal_pass_result;
} }
void execute_step_vertical_pass(GPUTexture *horizontal_pass_result) void execute_step_vertical_pass(Result &horizontal_pass_result)
{ {
GPUShader *shader = shader_manager().get(get_morphological_step_shader_name()); GPUShader *shader = shader_manager().get(get_morphological_step_shader_name());
GPU_shader_bind(shader); GPU_shader_bind(shader);
@ -144,9 +145,7 @@ class DilateErodeOperation : public NodeOperation {
/* Pass the absolute value of the distance. We have specialized shaders for each sign. */ /* Pass the absolute value of the distance. We have specialized shaders for each sign. */
GPU_shader_uniform_1i(shader, "radius", math::abs(get_distance())); GPU_shader_uniform_1i(shader, "radius", math::abs(get_distance()));
GPU_memory_barrier(GPU_BARRIER_TEXTURE_FETCH); horizontal_pass_result.bind_as_texture(shader, "input_tx");
const int texture_image_unit = GPU_shader_get_sampler_binding(shader, "input_tx");
GPU_texture_bind(horizontal_pass_result, texture_image_unit);
const Domain domain = compute_domain(); const Domain domain = compute_domain();
Result &output_mask = get_result("Mask"); Result &output_mask = get_result("Mask");
@ -158,8 +157,8 @@ class DilateErodeOperation : public NodeOperation {
compute_dispatch_threads_at_least(shader, int2(domain.size.y, domain.size.x)); compute_dispatch_threads_at_least(shader, int2(domain.size.y, domain.size.x));
GPU_shader_unbind(); GPU_shader_unbind();
horizontal_pass_result.unbind_as_texture();
output_mask.unbind_as_image(); output_mask.unbind_as_image();
GPU_texture_unbind(horizontal_pass_result);
} }
const char *get_morphological_step_shader_name() const char *get_morphological_step_shader_name()

@ -6,13 +6,16 @@
* \ingroup cmpnodes * \ingroup cmpnodes
*/ */
#include "COM_node_operation.hh"
#include "RNA_access.h" #include "RNA_access.h"
#include "UI_interface.h" #include "UI_interface.h"
#include "UI_resources.h" #include "UI_resources.h"
#include "COM_node_operation.hh"
#include "COM_utilities.hh"
#include "COM_algorithm_summed_area_table.hh"
#include "node_composite_util.hh" #include "node_composite_util.hh"
/* **************** Kuwahara ******************** */ /* **************** Kuwahara ******************** */
@ -62,6 +65,84 @@ class ConvertKuwaharaOperation : public NodeOperation {
using NodeOperation::NodeOperation; using NodeOperation::NodeOperation;
void execute() override void execute() override
{
if (get_input("Image").is_single_value()) {
get_input("Image").pass_through(get_result("Image"));
return;
}
if (node_storage(bnode()).variation == CMP_NODE_KUWAHARA_ANISOTROPIC) {
execute_anisotropic();
}
else {
execute_classic();
}
}
void execute_classic()
{
/* For high radii, we accelerate the filter using a summed area table, making the filter
* execute in constant time as opposed to the trivial quadratic complexity. */
if (node_storage(bnode()).size > 5) {
execute_classic_summed_area_table();
return;
}
GPUShader *shader = shader_manager().get("compositor_kuwahara_classic");
GPU_shader_bind(shader);
GPU_shader_uniform_1i(shader, "radius", node_storage(bnode()).size);
const Result &input_image = get_input("Image");
input_image.bind_as_texture(shader, "input_tx");
const Domain domain = compute_domain();
Result &output_image = get_result("Image");
output_image.allocate_texture(domain);
output_image.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, domain.size);
input_image.unbind_as_texture();
output_image.unbind_as_image();
GPU_shader_unbind();
}
void execute_classic_summed_area_table()
{
  /* Compute summed area tables of the input and of its square, from which the mean and variance
   * of any rectangular window can be derived in constant time. Full precision is used because
   * the accumulated sums exceed half float range/precision. */
  Result table = Result::Temporary(ResultType::Color, texture_pool(), ResultPrecision::Full);
  summed_area_table(context(), get_input("Image"), table);

  Result squared_table = Result::Temporary(
      ResultType::Color, texture_pool(), ResultPrecision::Full);
  summed_area_table(
      context(), get_input("Image"), squared_table, SummedAreaTableOperation::Square);

  GPUShader *shader = shader_manager().get("compositor_kuwahara_classic_summed_area_table");
  GPU_shader_bind(shader);

  GPU_shader_uniform_1i(shader, "radius", node_storage(bnode()).size);

  table.bind_as_texture(shader, "table_tx");
  squared_table.bind_as_texture(shader, "squared_table_tx");

  const Domain domain = compute_domain();
  Result &output_image = get_result("Image");
  output_image.allocate_texture(domain);
  output_image.bind_as_image(shader, "output_img");

  compute_dispatch_threads_at_least(shader, domain.size);

  table.unbind_as_texture();
  squared_table.unbind_as_texture();
  output_image.unbind_as_image();
  GPU_shader_unbind();

  /* Return the intermediate tables to the texture pool now that the dispatch consumed them. */
  table.release();
  squared_table.release();
}
void execute_anisotropic()
{ {
get_input("Image").pass_through(get_result("Image")); get_input("Image").pass_through(get_result("Image"));
context().set_info_message("Viewport compositor setup not fully supported"); context().set_info_message("Viewport compositor setup not fully supported");
@ -88,8 +169,6 @@ void register_node_type_cmp_kuwahara()
node_type_storage( node_type_storage(
&ntype, "NodeKuwaharaData", node_free_standard_storage, node_copy_standard_storage); &ntype, "NodeKuwaharaData", node_free_standard_storage, node_copy_standard_storage);
ntype.get_compositor_operation = file_ns::get_compositor_operation; ntype.get_compositor_operation = file_ns::get_compositor_operation;
ntype.realtime_compositor_unsupported_message = N_(
"Node not supported in the Viewport compositor");
nodeRegisterType(&ntype); nodeRegisterType(&ntype);
} }