Compositor: Unify Blur operation across CPU and GPU

This patch unifies the implementation of the separable Blur operation across CPU and GPU. The difference is due to the fact that the CPU code ignores pixels that are across the image boundary and adjusts the weights accordingly, while the GPU code assumes extended boundaries. Like the rest of the Blur nodes the CPU implementation is adjusted to assume extended boundaries until an option to change the boundary condition is added. The alpha version of this operation will be handled in a separate patch. Pull Request: https://projects.blender.org/blender/blender/pulls/118517
2024-02-21 07:31:14 +01:00 · 2024-02-21 07:31:14 +01:00 · c409f2f7c6
commit c409f2f7c6
parent 69920a0680
1 changed files with 17 additions and 47 deletions
--- a/source/blender/compositor/operations/COM_GaussianBlurBaseOperation.cc
+++ b/source/blender/compositor/operations/COM_GaussianBlurBaseOperation.cc
@ -81,59 +81,29 @@ void GaussianBlurBaseOperation::update_memory_buffer_partial(MemoryBuffer *outpu
                                                             const rcti &area,
                                                             Span<MemoryBuffer *> inputs)
 {
+  const int2 unit_offset = dimension_ == eDimension::X ? int2(1, 0) : int2(0, 1);
  MemoryBuffer *input = inputs[IMAGE_INPUT_INDEX];
-  const rcti &input_rect = input->get_rect();
-  BuffersIterator<float> it = output->iterate_with({input}, area);
-
-  int min_input_coord = -1;
-  int max_input_coord = -1;
-  int elem_stride = -1;
-  std::function<int()> get_current_coord;
-  switch (dimension_) {
-    case eDimension::X:
-      min_input_coord = input_rect.xmin;
-      max_input_coord = input_rect.xmax;
-      elem_stride = input->elem_stride;
-      get_current_coord = [&] { return it.x; };
-      break;
-    case eDimension::Y:
-      min_input_coord = input_rect.ymin;
-      max_input_coord = input_rect.ymax;
-      elem_stride = input->row_stride;
-      get_current_coord = [&] { return it.y; };
-      break;
-  }
-
-  for (; !it.is_end(); ++it) {
-    const int coord = get_current_coord();
-    const int coord_min = max_ii(coord - filtersize_, min_input_coord);
-    const int coord_max = min_ii(coord + filtersize_ + 1, max_input_coord);
-
-    float ATTR_ALIGN(16) color_accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-    float multiplier_accum = 0.0f;
-
-    const int step = QualityStepHelper::get_step();
-    const float *in = it.in(0) + (intptr_t(coord_min) - coord) * elem_stride;
-    const int in_stride = elem_stride * step;
-    int gauss_idx = (coord_min - coord) + filtersize_;
-    const int gauss_end = gauss_idx + (coord_max - coord_min);
+  for (BuffersIterator<float> it = output->iterate_with({input}, area); !it.is_end(); ++it) {
+    alignas(16) float4 accumulated_color = float4(0.0f);
 #if BLI_HAVE_SSE2
-    __m128 accum_r = _mm_load_ps(color_accum);
-    for (; gauss_idx < gauss_end; in += in_stride, gauss_idx += step) {
-      __m128 reg_a = _mm_load_ps(in);
-      reg_a = _mm_mul_ps(reg_a, gausstab_sse_[gauss_idx]);
-      accum_r = _mm_add_ps(accum_r, reg_a);
-      multiplier_accum += gausstab_[gauss_idx];
+    __m128 accumulated_color_sse = _mm_setzero_ps();
+    for (int i = -filtersize_; i <= filtersize_; i++) {
+      const int2 offset = unit_offset * i;
+      __m128 weight = gausstab_sse_[i + filtersize_];
+      __m128 color = _mm_load_ps(input->get_elem_clamped(it.x + offset.x, it.y + offset.y));
+      __m128 weighted_color = _mm_mul_ps(color, weight);
+      accumulated_color_sse = _mm_add_ps(accumulated_color_sse, weighted_color);
    }
-    _mm_store_ps(color_accum, accum_r);
+    _mm_store_ps(accumulated_color, accumulated_color_sse);
 #else
-    for (; gauss_idx < gauss_end; in += in_stride, gauss_idx += step) {
-      const float multiplier = gausstab_[gauss_idx];
-      madd_v4_v4fl(color_accum, in, multiplier);
-      multiplier_accum += multiplier;
+    for (int i = -filtersize_; i <= filtersize_; i++) {
+      const int2 offset = unit_offset * i;
+      const float weight = gausstab_[i + filtersize_];
+      const float4 color = input->get_elem_clamped(it.x + offset.x, it.y + offset.y);
+      accumulated_color += color * weight;
    }
 #endif
-    mul_v4_v4fl(it.out, color_accum, 1.0f / multiplier_accum);
+    copy_v4_v4(it.out, accumulated_color);
  }
 }