From d4bf23771d9a33a780d648619d4151cc848eac2a Mon Sep 17 00:00:00 2001
From: Omar Emara <mail@OmarEmara.dev>
Date: Fri, 17 May 2024 12:45:21 +0200
Subject: [PATCH] Compositor: Optimize Fog Glow Glare node

This patches optimizes the Fog Glow Glare node to be about 25x faster
for 4K images. This is mainly achieved by utilizing the FFTW library and
multi-threading support code. Further improvements are still possible by
caching kernels, but the CPU compositor does not support caching yet.

The old Hartley transform was removed, so the node no longer works when
FFTW is disabled as a build time option, much like the OIDN node. A new
BLI library was introduced for FFTW, it includes some helper routines
relevant for FFTW as well as an initialization routine that sets up
multithreading using TBB as well as thread safety.

Build system support for threaded FFTW was also added, which defines the
relevant variables to detect threading support as well as add the
relevant libraries.

We do not currently have the threaded FFTW libs in our precompiled libs,
so the threading code is disabled until the libs lands in the coming
weeks. So currently, the code is only about 9x faster.

The only functional change is that the kernel is now odd sized, which
should produce more accurate results, but the final result is almost
identical and mostly undetectable.

The plan is to port this to the GPU as well similar to how we implement
OIDN until we have a GPU FFT implementation. GPU compositor can also do
caching, so it should be faster, being able to compute a 4K image in
under half a second.

Pull Request: https://projects.blender.org/blender/blender/pulls/121653
---
 CMakeLists.txt                                |   2 +-
 build_files/cmake/Modules/FindFftw3.cmake     |  20 +
 source/blender/blenlib/BLI_fftw.hh            |  22 +
 source/blender/blenlib/CMakeLists.txt         |  15 +
 source/blender/blenlib/intern/fftw.cc         |  93 +++
 source/blender/compositor/CMakeLists.txt      |  10 +
 .../operations/COM_GlareFogGlowOperation.cc   | 598 ++++++------------
 source/blender/nodes/composite/CMakeLists.txt |   5 +
 .../composite/nodes/node_composite_glare.cc   |   8 +-
 9 files changed, 371 insertions(+), 402 deletions(-)
 create mode 100644 source/blender/blenlib/BLI_fftw.hh
 create mode 100644 source/blender/blenlib/intern/fftw.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78918ba085f..22cadad665b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -306,7 +306,7 @@ option(WITH_IK_SOLVER "\
 Enable Legacy IK solver (only disable for development)"
   ON
 )
-option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke, ocean sim, and audio effects)" ON)
+option(WITH_FFTW3 "Enable FFTW3 support (Used for smoke, ocean sim, glare, and audio effects)" ON)
 option(WITH_PUGIXML "Enable PugiXML support (Used for OpenImageIO, Grease Pencil SVG export)" ON)
 option(WITH_BULLET "Enable Bullet (Physics Engine)" ON)
 option(WITH_SYSTEM_BULLET "\
diff --git a/build_files/cmake/Modules/FindFftw3.cmake b/build_files/cmake/Modules/FindFftw3.cmake
index 5950c2f61c6..3a13d35f368 100644
--- a/build_files/cmake/Modules/FindFftw3.cmake
+++ b/build_files/cmake/Modules/FindFftw3.cmake
@@ -11,9 +11,11 @@
 #  FFTW3_ROOT_DIR, The base directory to search for Fftw3.
 #                    This can also be an environment variable.
 #  FFTW3_FOUND, If false, do not try to use Fftw3.
+#  WITH_FFTW3_THREADS_F_SUPPORT, if true, single precision Fftw3 supports threads.
 #
 # also defined, but not for general use are
 #  FFTW3_LIBRARY_F, where to find the Fftw3 library (single precision float).
+#  FFTW3_LIBRARY_THREADS_F, where to find the Fftw3 threads library (single precision float).
 #  FFTW3_LIBRARY_D, where to find the Fftw3 library (double precision float).
 
 # If `FFTW3_ROOT_DIR` was defined in the environment, use it.
@@ -49,6 +51,15 @@ find_library(FFTW3_LIBRARY_F
     lib64 lib
   )
 
+find_library(FFTW3_LIBRARY_THREADS_F
+  NAMES
+    fftw3f_threads
+  HINTS
+    ${_fftw3_SEARCH_DIRS}
+  PATH_SUFFIXES
+    lib64 lib
+  )
+
 find_library(FFTW3_LIBRARY_D
   NAMES
     fftw3
@@ -60,6 +71,9 @@ find_library(FFTW3_LIBRARY_D
 
 list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_F}")
 list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_D}")
+if(FFTW3_LIBRARY_THREADS_F)
+  list(APPEND _FFTW3_LIBRARIES "${FFTW3_LIBRARY_THREADS_F}")
+endif()
 
 # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE if
 # all listed variables are TRUE
@@ -70,12 +84,18 @@ find_package_handle_standard_args(Fftw3 DEFAULT_MSG
 if(FFTW3_FOUND)
   set(FFTW3_LIBRARIES ${_FFTW3_LIBRARIES})
   set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
+  if(FFTW3_LIBRARY_THREADS_F)
+    set(WITH_FFTW3_THREADS_F_SUPPORT ON)
+  else()
+    set(WITH_FFTW3_THREADS_F_SUPPORT OFF)
+  endif()
 endif()
 
 
 mark_as_advanced(
   FFTW3_INCLUDE_DIR
   FFTW3_LIBRARY_F
+  FFTW3_LIBRARY_THREADS_F
   FFTW3_LIBRARY_D
 )
 
diff --git a/source/blender/blenlib/BLI_fftw.hh b/source/blender/blenlib/BLI_fftw.hh
new file mode 100644
index 00000000000..f04a9627658
--- /dev/null
+++ b/source/blender/blenlib/BLI_fftw.hh
@@ -0,0 +1,22 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "BLI_math_vector_types.hh"
+
+#pragma once
+
+namespace blender::fftw {
+
+/* FFTW's real to complex and complex to real transforms are more efficient when their input has a
+ * specific size. This function finds the most optimal size that is more than or equal the given
+ * size. The input data can then be zero padded to the optimal size for better performance. See
+ * Section 4.3.3 Real-data DFTs in the FFTW manual for more information. */
+int optimal_size_for_real_transform(int size);
+int2 optimal_size_for_real_transform(int2 size);
+
+/* Initialize the float variant of FFTW. This essentially setup the multithreading hooks to enable
+ * multithreading using TBB's parallel_for and makes the FFTW planner thread safe. */
+void initialize_float();
+
+}  // namespace blender::fftw
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index 412044ecabb..59cef66f814 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -65,6 +65,7 @@ set(SRC
   intern/easing.c
   intern/endian_switch.c
   intern/expr_pylike_eval.c
+  intern/fftw.cc
   intern/fileops.cc
   intern/fileops_c.cc
   intern/filereader_file.c
@@ -225,6 +226,7 @@ set(SRC
   BLI_endian_switch_inline.h
   BLI_enumerable_thread_specific.hh
   BLI_expr_pylike_eval.h
+  BLI_fftw.hh
   BLI_fileops.h
   BLI_fileops.hh
   BLI_fileops_types.h
@@ -448,6 +450,19 @@ if(WITH_GMP)
   )
 endif()
 
+if(WITH_FFTW3)
+  list(APPEND INC_SYS
+    ${FFTW3_INCLUDE_DIRS}
+  )
+  list(APPEND LIB
+    ${FFTW3_LIBRARIES}
+  )
+  add_definitions(-DWITH_FFTW3)
+  if (WITH_FFTW3_THREADS_F_SUPPORT)
+    add_definitions(-DWITH_FFTW3_THREADS_F_SUPPORT)
+  endif()
+endif()
+
 if(WIN32)
   if(WITH_BLENDER_THUMBNAILER)
     # Needed for querying the `thumbnailer .dll` in `winstuff.c`.
diff --git a/source/blender/blenlib/intern/fftw.cc b/source/blender/blenlib/intern/fftw.cc
new file mode 100644
index 00000000000..cdae65a7f2f
--- /dev/null
+++ b/source/blender/blenlib/intern/fftw.cc
@@ -0,0 +1,93 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#if defined(WITH_FFTW3_THREADS_F_SUPPORT)
+#  include <fftw3.h>
+#endif
+
+#include "BLI_index_range.hh"
+#include "BLI_math_vector_types.hh"
+#include "BLI_task.hh"
+#include "BLI_threads.h"
+
+namespace blender::fftw {
+
+/* Identifies if the given number is a 7-smooth number. */
+static bool is_humble_number(int n)
+{
+  if (n <= 1) {
+    return true;
+  }
+  if (n % 2 == 0) {
+    return is_humble_number(n / 2);
+  }
+  if (n % 3 == 0) {
+    return is_humble_number(n / 3);
+  }
+  if (n % 5 == 0) {
+    return is_humble_number(n / 5);
+  }
+  if (n % 7 == 0) {
+    return is_humble_number(n / 7);
+  }
+  return false;
+}
+
+/* Finds the even humble number larger than or equal the given number. */
+static int find_next_even_humble_number(int n)
+{
+  if (n % 2 == 1) {
+    n++;
+  }
+
+  while (true) {
+    if (is_humble_number(n)) {
+      return n;
+    }
+    n += 2;
+  }
+
+  return n;
+}
+
+/* FFTW is best at handling sizes of the form 2^a * 3^b * 5^c * 7^d * 11^e * 13^f, where e + f is
+ * either 0 or 1, and the other exponents are arbitrary. And it is beneficial for the size to be
+ * evene for real transforms. To simplify computation, we ignore the 11 and 13 factors and find the
+ * even humble number that is more then or equal the given size. See Section 4.3.3 Real-data DFTs
+ * in the FFTW manual for more information. */
+int optimal_size_for_real_transform(int size)
+{
+  return find_next_even_humble_number(size);
+}
+
+int2 optimal_size_for_real_transform(int2 size)
+{
+  return int2(optimal_size_for_real_transform(size.x), optimal_size_for_real_transform(size.y));
+}
+
+/* See Section 5.2 Usage of Multi-threaded FFTW in the FFTW manual for more information. */
+[[maybe_unused]] static void tbb_parallel_loop_for_fftw(void *(*work)(char *),
+                                                        char *job_data,
+                                                        size_t element_size,
+                                                        int number_of_jobs,
+                                                        void * /* data */)
+{
+  threading::parallel_for(IndexRange(number_of_jobs), 1, [&](const IndexRange sub_range) {
+    for (const int64_t i : sub_range) {
+      work(job_data + element_size * i);
+    }
+  });
+}
+
+void initialize_float()
+{
+#if defined(WITH_FFTW3_THREADS_F_SUPPORT)
+  fftwf_init_threads();
+  fftwf_make_planner_thread_safe();
+  fftwf_plan_with_nthreads(BLI_system_thread_count());
+  fftwf_threads_set_callback(tbb_parallel_loop_for_fftw, nullptr);
+#endif
+}
+
+}  // namespace blender::fftw
diff --git a/source/blender/compositor/CMakeLists.txt b/source/blender/compositor/CMakeLists.txt
index d6ab78678b4..c58371cd3ea 100644
--- a/source/blender/compositor/CMakeLists.txt
+++ b/source/blender/compositor/CMakeLists.txt
@@ -588,6 +588,16 @@ if(WITH_COMPOSITOR_CPU)
     )
   endif()
 
+  if(WITH_FFTW3)
+    list(APPEND INC_SYS
+      ${FFTW3_INCLUDE_DIRS}
+    )
+    list(APPEND LIB
+      ${FFTW3_LIBRARIES}
+    )
+    add_definitions(-DWITH_FFTW3)
+  endif()
+
   blender_add_lib(bf_compositor "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
   blender_set_target_unity_build(bf_compositor 10)
diff --git a/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc b/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc
index 283c3e09d5c..bf3684de2b3 100644
--- a/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc
+++ b/source/blender/compositor/operations/COM_GlareFogGlowOperation.cc
@@ -2,423 +2,221 @@
  *
  * SPDX-License-Identifier: GPL-2.0-or-later */
 
+#include <complex>
+#include <cstring>
+#include <memory>
+#include <numeric>
+
+#if defined(WITH_FFTW3)
+#  include <fftw3.h>
+#endif
+
+#include "BLI_enumerable_thread_specific.hh"
+#include "BLI_fftw.hh"
+#include "BLI_index_range.hh"
 #include "BLI_math_base.hh"
-#include "BLI_math_vector.h"
+#include "BLI_math_vector_types.hh"
+#include "BLI_task.hh"
 
 #include "COM_GlareFogGlowOperation.h"
 
 namespace blender::compositor {
 
-/*
- *  2D Fast Hartley Transform, used for convolution
- */
-
-using fREAL = float;
-
-/* Returns next highest power of 2 of x, as well its log2 in L2. */
-static uint next_pow2(uint x, uint *L2)
+/* Given the x and y location in the range from 0 to kernel_size - 1, where kernel_size is odd,
+ * compute the fog glow kernel value. The equations are arbitrary and were chosen using visual
+ * judgement. The kernel is not normalized and need normalization. */
+[[maybe_unused]] static float compute_fog_glow_kernel_value(int x, int y, int kernel_size)
 {
-  uint pw, x_notpow2 = x & (x - 1);
-  *L2 = 0;
-  while (x >>= 1) {
-    ++(*L2);
-  }
-  pw = 1 << (*L2);
-  if (x_notpow2) {
-    (*L2)++;
-    pw <<= 1;
-  }
-  return pw;
+  const int half_kernel_size = kernel_size / 2;
+  const float scale = 0.25f * math::sqrt(math::square(kernel_size));
+  const float v = ((y - half_kernel_size) / float(half_kernel_size));
+  const float u = ((x - half_kernel_size) / float(half_kernel_size));
+  const float r = (math::square(u) + math::square(v)) * scale;
+  const float d = -math::sqrt(math::sqrt(math::sqrt(r))) * 9.0f;
+  const float kernel_value = math::exp(d);
+
+  const float window = (0.5f + 0.5f * math::cos(u * math::numbers::pi)) *
+                       (0.5f + 0.5f * math::cos(v * math::numbers::pi));
+  const float windowed_kernel_value = window * kernel_value;
+
+  return windowed_kernel_value;
 }
 
-//------------------------------------------------------------------------------
-
-/* From FXT library by Joerg Arndt, faster in order bit-reversal
- * use: `r = revbin_upd(r, h)` where `h = N>>1`. */
-static uint revbin_upd(uint r, uint h)
-{
-  while (!((r ^= h) & h)) {
-    h >>= 1;
-  }
-  return r;
-}
-//------------------------------------------------------------------------------
-static void FHT(fREAL *data, uint M, uint inverse)
-{
-  double tt, fc, dc, fs, ds, a = M_PI;
-  fREAL t1, t2;
-  int n2, bd, bl, istep, k, len = 1 << M, n = 1;
-
-  int i, j = 0;
-  uint Nh = len >> 1;
-  for (i = 1; i < (len - 1); i++) {
-    j = revbin_upd(j, Nh);
-    if (j > i) {
-      t1 = data[i];
-      data[i] = data[j];
-      data[j] = t1;
-    }
-  }
-
-  do {
-    fREAL *data_n = &data[n];
-
-    istep = n << 1;
-    for (k = 0; k < len; k += istep) {
-      t1 = data_n[k];
-      data_n[k] = data[k] - t1;
-      data[k] += t1;
-    }
-
-    n2 = n >> 1;
-    if (n > 2) {
-      fc = dc = cos(a);
-      fs = ds = sqrt(1.0 - fc * fc);  // sin(a);
-      bd = n - 2;
-      for (bl = 1; bl < n2; bl++) {
-        fREAL *data_nbd = &data_n[bd];
-        fREAL *data_bd = &data[bd];
-        for (k = bl; k < len; k += istep) {
-          t1 = fc * double(data_n[k]) + fs * double(data_nbd[k]);
-          t2 = fs * double(data_n[k]) - fc * double(data_nbd[k]);
-          data_n[k] = data[k] - t1;
-          data_nbd[k] = data_bd[k] - t2;
-          data[k] += t1;
-          data_bd[k] += t2;
-        }
-        tt = fc * dc - fs * ds;
-        fs = fs * dc + fc * ds;
-        fc = tt;
-        bd -= 2;
-      }
-    }
-
-    if (n > 1) {
-      for (k = n2; k < len; k += istep) {
-        t1 = data_n[k];
-        data_n[k] = data[k] - t1;
-        data[k] += t1;
-      }
-    }
-
-    n = istep;
-    a *= 0.5;
-  } while (n < len);
-
-  if (inverse) {
-    fREAL sc = (fREAL)1 / (fREAL)len;
-    for (k = 0; k < len; k++) {
-      data[k] *= sc;
-    }
-  }
-}
-//------------------------------------------------------------------------------
-/* 2D Fast Hartley Transform, Mx/My -> log2 of width/height,
- * nzp -> the row where zero pad data starts,
- * inverse -> see above. */
-static void FHT2D(fREAL *data, uint Mx, uint My, uint nzp, uint inverse)
-{
-  uint i, j, Nx, Ny, maxy;
-
-  Nx = 1 << Mx;
-  Ny = 1 << My;
-
-  /* Rows (forward transform skips 0 pad data). */
-  maxy = inverse ? Ny : nzp;
-  for (j = 0; j < maxy; j++) {
-    FHT(&data[Nx * j], Mx, inverse);
-  }
-
-  /* Transpose data. */
-  if (Nx == Ny) { /* Square. */
-    for (j = 0; j < Ny; j++) {
-      for (i = j + 1; i < Nx; i++) {
-        uint op = i + (j << Mx), np = j + (i << My);
-        std::swap(data[op], data[np]);
-      }
-    }
-  }
-  else { /* Rectangular. */
-    uint k, Nym = Ny - 1, stm = 1 << (Mx + My);
-    for (i = 0; stm > 0; i++) {
-#define PRED(k) (((k & Nym) << Mx) + (k >> My))
-      for (j = PRED(i); j > i; j = PRED(j)) {
-        /* Pass. */
-      }
-      if (j < i) {
-        continue;
-      }
-      for (k = i, j = PRED(i); j != i; k = j, j = PRED(j), stm--) {
-        std::swap(data[j], data[k]);
-      }
-#undef PRED
-      stm--;
-    }
-  }
-
-  std::swap(Nx, Ny);
-  std::swap(Mx, My);
-
-  /* Now columns == transposed rows. */
-  for (j = 0; j < Ny; j++) {
-    FHT(&data[Nx * j], Mx, inverse);
-  }
-
-  /* Finalize. */
-  for (j = 0; j <= (Ny >> 1); j++) {
-    uint jm = (Ny - j) & (Ny - 1);
-    uint ji = j << Mx;
-    uint jmi = jm << Mx;
-    for (i = 0; i <= (Nx >> 1); i++) {
-      uint im = (Nx - i) & (Nx - 1);
-      fREAL A = data[ji + i];
-      fREAL B = data[jmi + i];
-      fREAL C = data[ji + im];
-      fREAL D = data[jmi + im];
-      fREAL E = (fREAL)0.5 * ((A + D) - (B + C));
-      data[ji + i] = A - E;
-      data[jmi + i] = B + E;
-      data[ji + im] = C + E;
-      data[jmi + im] = D - E;
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-
-/* 2D convolution calc, d1 *= d2, M/N - > log2 of width/height. */
-static void fht_convolve(fREAL *d1, const fREAL *d2, uint M, uint N)
-{
-  fREAL a, b;
-  uint i, j, k, L, mj, mL;
-  uint m = 1 << M, n = 1 << N;
-  uint m2 = 1 << (M - 1), n2 = 1 << (N - 1);
-  uint mn2 = m << (N - 1);
-
-  d1[0] *= d2[0];
-  d1[mn2] *= d2[mn2];
-  d1[m2] *= d2[m2];
-  d1[m2 + mn2] *= d2[m2 + mn2];
-  for (i = 1; i < m2; i++) {
-    k = m - i;
-    a = d1[i] * d2[i] - d1[k] * d2[k];
-    b = d1[k] * d2[i] + d1[i] * d2[k];
-    d1[i] = (b + a) * (fREAL)0.5;
-    d1[k] = (b - a) * (fREAL)0.5;
-    a = d1[i + mn2] * d2[i + mn2] - d1[k + mn2] * d2[k + mn2];
-    b = d1[k + mn2] * d2[i + mn2] + d1[i + mn2] * d2[k + mn2];
-    d1[i + mn2] = (b + a) * (fREAL)0.5;
-    d1[k + mn2] = (b - a) * (fREAL)0.5;
-  }
-  for (j = 1; j < n2; j++) {
-    L = n - j;
-    mj = j << M;
-    mL = L << M;
-    a = d1[mj] * d2[mj] - d1[mL] * d2[mL];
-    b = d1[mL] * d2[mj] + d1[mj] * d2[mL];
-    d1[mj] = (b + a) * (fREAL)0.5;
-    d1[mL] = (b - a) * (fREAL)0.5;
-    a = d1[m2 + mj] * d2[m2 + mj] - d1[m2 + mL] * d2[m2 + mL];
-    b = d1[m2 + mL] * d2[m2 + mj] + d1[m2 + mj] * d2[m2 + mL];
-    d1[m2 + mj] = (b + a) * (fREAL)0.5;
-    d1[m2 + mL] = (b - a) * (fREAL)0.5;
-  }
-  for (i = 1; i < m2; i++) {
-    k = m - i;
-    for (j = 1; j < n2; j++) {
-      L = n - j;
-      mj = j << M;
-      mL = L << M;
-      a = d1[i + mj] * d2[i + mj] - d1[k + mL] * d2[k + mL];
-      b = d1[k + mL] * d2[i + mj] + d1[i + mj] * d2[k + mL];
-      d1[i + mj] = (b + a) * (fREAL)0.5;
-      d1[k + mL] = (b - a) * (fREAL)0.5;
-      a = d1[i + mL] * d2[i + mL] - d1[k + mj] * d2[k + mj];
-      b = d1[k + mj] * d2[i + mL] + d1[i + mL] * d2[k + mj];
-      d1[i + mL] = (b + a) * (fREAL)0.5;
-      d1[k + mj] = (b - a) * (fREAL)0.5;
-    }
-  }
-}
-//------------------------------------------------------------------------------
-
-static void convolve(float *dst, MemoryBuffer *in1, MemoryBuffer *in2)
-{
-  fREAL *data1, *data2, *fp;
-  uint w2, h2, hw, hh, log2_w, log2_h;
-  fRGB wt, *colp;
-  int x, y, ch;
-  int xbl, ybl, nxb, nyb, xbsz, ybsz;
-  bool in2done = false;
-  const uint kernel_width = in2->get_width();
-  const uint kernel_height = in2->get_height();
-  const uint image_width = in1->get_width();
-  const uint image_height = in1->get_height();
-  float *kernel_buffer = in2->get_buffer();
-  float *image_buffer = in1->get_buffer();
-
-  MemoryBuffer *rdst = new MemoryBuffer(DataType::Color, in1->get_rect());
-  memset(rdst->get_buffer(),
-         0,
-         rdst->get_width() * rdst->get_height() * COM_DATA_TYPE_COLOR_CHANNELS * sizeof(float));
-
-  /* Convolution result width & height. */
-  w2 = 2 * kernel_width - 1;
-  h2 = 2 * kernel_height - 1;
-  /* FFT pow2 required size & log2. */
-  w2 = next_pow2(w2, &log2_w);
-  h2 = next_pow2(h2, &log2_h);
-
-  /* Allocate space. */
-  data1 = (fREAL *)MEM_callocN(3 * w2 * h2 * sizeof(fREAL), "convolve_fast FHT data1");
-  data2 = (fREAL *)MEM_callocN(w2 * h2 * sizeof(fREAL), "convolve_fast FHT data2");
-
-  /* Normalize convolution. */
-  wt[0] = wt[1] = wt[2] = 0.0f;
-  for (y = 0; y < kernel_height; y++) {
-    colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS];
-    for (x = 0; x < kernel_width; x++) {
-      add_v3_v3(wt, colp[x]);
-    }
-  }
-  if (wt[0] != 0.0f) {
-    wt[0] = 1.0f / wt[0];
-  }
-  if (wt[1] != 0.0f) {
-    wt[1] = 1.0f / wt[1];
-  }
-  if (wt[2] != 0.0f) {
-    wt[2] = 1.0f / wt[2];
-  }
-  for (y = 0; y < kernel_height; y++) {
-    colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS];
-    for (x = 0; x < kernel_width; x++) {
-      mul_v3_v3(colp[x], wt);
-    }
-  }
-
-  /* Copy image data, unpacking interleaved RGBA into separate channels
-   * only need to calc data1 once. */
-
-  /* Block add-overlap. */
-  hw = kernel_width >> 1;
-  hh = kernel_height >> 1;
-  xbsz = (w2 + 1) - kernel_width;
-  ybsz = (h2 + 1) - kernel_height;
-  nxb = image_width / xbsz;
-  if (image_width % xbsz) {
-    nxb++;
-  }
-  nyb = image_height / ybsz;
-  if (image_height % ybsz) {
-    nyb++;
-  }
-  for (ybl = 0; ybl < nyb; ybl++) {
-    for (xbl = 0; xbl < nxb; xbl++) {
-
-      /* Each channel one by one. */
-      for (ch = 0; ch < 3; ch++) {
-        fREAL *data1ch = &data1[ch * w2 * h2];
-
-        /* Only need to calc fht data from in2 once, can re-use for every block. */
-        if (!in2done) {
-          /* in2, channel ch -> data1 */
-          for (y = 0; y < kernel_height; y++) {
-            fp = &data1ch[y * w2];
-            colp = (fRGB *)&kernel_buffer[y * kernel_width * COM_DATA_TYPE_COLOR_CHANNELS];
-            for (x = 0; x < kernel_width; x++) {
-              fp[x] = colp[x][ch];
-            }
-          }
-        }
-
-        /* in1, channel ch -> data2 */
-        memset(data2, 0, w2 * h2 * sizeof(fREAL));
-        for (y = 0; y < ybsz; y++) {
-          int yy = ybl * ybsz + y;
-          if (yy >= image_height) {
-            continue;
-          }
-          fp = &data2[y * w2];
-          colp = (fRGB *)&image_buffer[yy * image_width * COM_DATA_TYPE_COLOR_CHANNELS];
-          for (x = 0; x < xbsz; x++) {
-            int xx = xbl * xbsz + x;
-            if (xx >= image_width) {
-              continue;
-            }
-            fp[x] = colp[xx][ch];
-          }
-        }
-
-        /* Forward FHT
-         * zero pad data start is different for each == height+1. */
-        if (!in2done) {
-          FHT2D(data1ch, log2_w, log2_h, kernel_height + 1, 0);
-        }
-        FHT2D(data2, log2_w, log2_h, kernel_height + 1, 0);
-
-        /* FHT2D transposed data, row/col now swapped
-         * convolve & inverse FHT. */
-        fht_convolve(data2, data1ch, log2_h, log2_w);
-        FHT2D(data2, log2_h, log2_w, 0, 1);
-        /* Data again transposed, so in order again. */
-
-        /* Overlap-add result. */
-        for (y = 0; y < int(h2); y++) {
-          const int yy = ybl * ybsz + y - hh;
-          if ((yy < 0) || (yy >= image_height)) {
-            continue;
-          }
-          fp = &data2[y * w2];
-          colp = (fRGB *)&rdst->get_buffer()[yy * image_width * COM_DATA_TYPE_COLOR_CHANNELS];
-          for (x = 0; x < int(w2); x++) {
-            const int xx = xbl * xbsz + x - hw;
-            if ((xx < 0) || (xx >= image_width)) {
-              continue;
-            }
-            colp[xx][ch] += fp[x];
-          }
-        }
-      }
-      in2done = true;
-    }
-  }
-
-  MEM_freeN(data2);
-  MEM_freeN(data1);
-  memcpy(dst,
-         rdst->get_buffer(),
-         sizeof(float) * image_width * image_height * COM_DATA_TYPE_COLOR_CHANNELS);
-  delete (rdst);
-}
-
-void GlareFogGlowOperation::generate_glare(float *data,
-                                           MemoryBuffer *input_image,
+void GlareFogGlowOperation::generate_glare(float *output,
+                                           MemoryBuffer *image,
                                            const NodeGlare *settings)
 {
-  const int kernel_size = 1 << settings->size;
-  MemoryBuffer kernel = MemoryBuffer(DataType::Color, kernel_size, kernel_size);
+#if defined(WITH_FFTW3)
+  fftw::initialize_float();
 
-  const float scale = 0.25f * math::sqrt(math::square(kernel_size));
+  /* We use an odd sized kernel since an even one will typically introduce a tiny offset as it has
+   * no exact center value. */
+  const int kernel_size = (1 << settings->size) + 1;
 
-  for (int y = 0; y < kernel_size; y++) {
-    const float v = 2.0f * (y / float(kernel_size)) - 1.0f;
-    for (int x = 0; x < kernel_size; x++) {
-      const float u = 2.0f * (x / float(kernel_size)) - 1.0f;
-      const float r = (math::square(u) + math::square(v)) * scale;
-      const float d = -math::sqrt(math::sqrt(math::sqrt(r))) * 9.0f;
-      const float kernel_value = math::exp(d);
+  /* Since we will be doing a circular convolution, we need to zero pad our input image by half the
+   * kernel size to avoid the kernel affecting the pixels at the other side of image. Therefore,
+   * zero boundary is assumed. */
+  const int needed_padding_amount = kernel_size / 2;
+  const int2 image_size = int2(image->get_width(), image->get_height());
+  const int2 needed_spatial_size = image_size + needed_padding_amount;
+  const int2 spatial_size = fftw::optimal_size_for_real_transform(needed_spatial_size);
 
-      const float window = (0.5f + 0.5f * math::cos(u * math::numbers::pi)) *
-                           (0.5f + 0.5f * math::cos(v * math::numbers::pi));
-      const float windowed_kernel_value = window * kernel_value;
+  /* The FFTW real to complex transforms utilizes the hermitian symmetry of real transforms and
+   * stores only half the output since the other half is redundant, so we only allocate half of the
+   * first dimension. See Section 4.3.4 Real-data DFT Array Format in the FFTW manual for more
+   * information. */
+  const int2 frequency_size = int2(spatial_size.x / 2 + 1, spatial_size.y);
 
-      copy_v3_fl(kernel.get_elem(x, y), windowed_kernel_value);
-      kernel.get_elem(x, y)[3] = 1.0f;
+  float *kernel_spatial_domain = fftwf_alloc_real(spatial_size.x * spatial_size.y);
+  std::complex<float> *kernel_frequency_domain = reinterpret_cast<std::complex<float> *>(
+      fftwf_alloc_complex(frequency_size.x * frequency_size.y));
+
+  /* Create a real to complex plan to transform the kernel to the frequency domain, but the same
+   * plan will be used for the image since they both have the same dimensions. */
+  fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d(
+      spatial_size.y,
+      spatial_size.x,
+      kernel_spatial_domain,
+      reinterpret_cast<fftwf_complex *>(kernel_frequency_domain),
+      FFTW_ESTIMATE);
+
+  /* Use a double to sum the kernel since floats are not stable with threaded summation. */
+  threading::EnumerableThreadSpecific<double> sum_by_thread([]() { return 0.0; });
+
+  /* Compute the kernel while zero padding to match the padded image size. */
+  threading::parallel_for(IndexRange(spatial_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t y : sub_y_range) {
+      for (const int64_t x : IndexRange(spatial_size.x)) {
+        /* We offset the computed kernel with wrap around such that it is centered at the zero
+         * point, which is the expected format for doing circular convolutions in the frequency
+         * domain. */
+        const int half_kernel_size = kernel_size / 2;
+        int64_t output_x = mod_i(x - half_kernel_size, spatial_size.x);
+        int64_t output_y = mod_i(y - half_kernel_size, spatial_size.y);
+
+        const bool is_inside_kernel = x < kernel_size && y < kernel_size;
+        if (is_inside_kernel) {
+          const float kernel_value = compute_fog_glow_kernel_value(x, y, kernel_size);
+          kernel_spatial_domain[output_x + output_y * spatial_size.x] = kernel_value;
+          sum_by_thread.local() += kernel_value;
+        }
+        else {
+          kernel_spatial_domain[output_x + output_y * spatial_size.x] = 0.0f;
+        }
+      }
     }
-  }
+  });
 
-  convolve(data, input_image, &kernel);
+  /* Instead of normalizing the kernel now, we normalize it in the frequency domain since we will
+   * be doing sample normalization anyways. This is okay since the Fourier transform is linear. */
+  const float kernel_normalization_factor = float(
+      std::accumulate(sum_by_thread.begin(), sum_by_thread.end(), 0.0));
+
+  fftwf_execute_dft_r2c(forward_plan,
+                        kernel_spatial_domain,
+                        reinterpret_cast<fftwf_complex *>(kernel_frequency_domain));
+  fftwf_free(kernel_spatial_domain);
+
+  /* We only process the color channels, the alpha channel is written to the output as is. */
+  const int channels_count = 3;
+  const int64_t spatial_pixels_per_channel = int64_t(spatial_size.x) * spatial_size.y;
+  const int64_t frequency_pixels_per_channel = int64_t(spatial_size.x) * spatial_size.y;
+  const int64_t spatial_pixels_count = spatial_pixels_per_channel * channels_count;
+  const int64_t frequency_pixels_count = frequency_pixels_per_channel * channels_count;
+
+  float *image_spatial_domain = fftwf_alloc_real(spatial_pixels_count);
+  std::complex<float> *image_frequency_domain = reinterpret_cast<std::complex<float> *>(
+      fftwf_alloc_complex(frequency_pixels_count));
+
+  /* Zero pad the image to the required spatial domain size, storing each channel in planar
+   * format for better cache locality, that is, RRRR...GGGG...BBBB. */
+  threading::parallel_for(IndexRange(spatial_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t y : sub_y_range) {
+      for (const int64_t x : IndexRange(spatial_size.x)) {
+        const bool is_inside_image = x < image_size.x && y < image_size.y;
+        for (const int64_t channel : IndexRange(channels_count)) {
+          const int64_t base_index = x + y * spatial_size.x;
+          const int64_t output_index = base_index + spatial_pixels_per_channel * channel;
+          if (is_inside_image) {
+            image_spatial_domain[output_index] = image->get_elem(x, y)[channel];
+          }
+          else {
+            image_spatial_domain[output_index] = 0.0f;
+          }
+        }
+      }
+    }
+  });
+
+  threading::parallel_for(IndexRange(channels_count), 1, [&](const IndexRange sub_range) {
+    for (const int64_t channel : sub_range) {
+      fftwf_execute_dft_r2c(forward_plan,
+                            image_spatial_domain + spatial_pixels_per_channel * channel,
+                            reinterpret_cast<fftwf_complex *>(image_frequency_domain) +
+                                frequency_pixels_per_channel * channel);
+    }
+  });
+
+  /* Multiply the kernel and the image in the frequency domain to perform the convolution. The
+   * FFT is not normalized, meaning the result of the FFT followed by an inverse FFT will result
+   * in an image that is scaled by a factor of the product of the width and height, so we take
+   * that into account by dividing by that scale. See Section 4.8.6 Multi-dimensional Transforms of
+   * the FFTW manual for more information. */
+  const float normalization_scale = float(spatial_size.x) * spatial_size.y *
+                                    kernel_normalization_factor;
+  threading::parallel_for(IndexRange(frequency_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t channel : IndexRange(channels_count)) {
+      for (const int64_t y : sub_y_range) {
+        for (const int64_t x : IndexRange(frequency_size.x)) {
+          const int64_t base_index = x + y * frequency_size.x;
+          const int64_t output_index = base_index + frequency_pixels_per_channel * channel;
+          const std::complex<float> kernel_value = kernel_frequency_domain[base_index];
+          image_frequency_domain[output_index] *= kernel_value / normalization_scale;
+        }
+      }
+    }
+  });
+
+  /* Create a complex to real plan to transform the image to the real domain. */
+  fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d(
+      spatial_size.y,
+      spatial_size.x,
+      reinterpret_cast<fftwf_complex *>(image_frequency_domain),
+      image_spatial_domain,
+      FFTW_ESTIMATE);
+
+  threading::parallel_for(IndexRange(channels_count), 1, [&](const IndexRange sub_range) {
+    for (const int64_t channel : sub_range) {
+      fftwf_execute_dft_c2r(backward_plan,
+                            reinterpret_cast<fftwf_complex *>(image_frequency_domain) +
+                                frequency_pixels_per_channel * channel,
+                            image_spatial_domain + spatial_pixels_per_channel * channel);
+    }
+  });
+
+  /* Copy the result to the output. */
+  threading::parallel_for(IndexRange(image_size.y), 1, [&](const IndexRange sub_y_range) {
+    for (const int64_t y : sub_y_range) {
+      for (const int64_t x : IndexRange(image_size.x)) {
+        for (const int64_t channel : IndexRange(channels_count)) {
+          const int64_t output_index = (x + y * image_size.x) * image->get_num_channels();
+          const int64_t base_index = x + y * spatial_size.x;
+          const int64_t input_index = base_index + spatial_pixels_per_channel * channel;
+          output[output_index + channel] = image_spatial_domain[input_index];
+          output[output_index + 3] = image->get_buffer()[output_index + 3];
+        }
+      }
+    }
+  });
+
+  fftwf_destroy_plan(forward_plan);
+  fftwf_destroy_plan(backward_plan);
+  fftwf_free(image_spatial_domain);
+  fftwf_free(image_frequency_domain);
+  fftwf_free(kernel_frequency_domain);
+#else
+  UNUSED_VARS(output, image, settings);
+#endif
 }
 
 }  // namespace blender::compositor
diff --git a/source/blender/nodes/composite/CMakeLists.txt b/source/blender/nodes/composite/CMakeLists.txt
index 90acb4b0d40..9ed62e073a7 100644
--- a/source/blender/nodes/composite/CMakeLists.txt
+++ b/source/blender/nodes/composite/CMakeLists.txt
@@ -157,6 +157,11 @@ if(WITH_OPENIMAGEDENOISE)
   )
 endif()
 
+
+if(WITH_FFTW3)
+  add_definitions(-DWITH_FFTW3)
+endif()
+
 blender_add_lib(bf_nodes_composite "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
 blender_set_target_unity_build(bf_nodes_composite 10)
diff --git a/source/blender/nodes/composite/nodes/node_composite_glare.cc b/source/blender/nodes/composite/nodes/node_composite_glare.cc
index 3b74960eb50..2f3a7d34b6f 100644
--- a/source/blender/nodes/composite/nodes/node_composite_glare.cc
+++ b/source/blender/nodes/composite/nodes/node_composite_glare.cc
@@ -68,10 +68,16 @@ static void node_composit_init_glare(bNodeTree * /*ntree*/, bNode *node)
 
 static void node_composit_buts_glare(uiLayout *layout, bContext * /*C*/, PointerRNA *ptr)
 {
+  const int glare_type = RNA_enum_get(ptr, "glare_type");
+#ifndef WITH_FFTW3
+  if (glare_type == CMP_NODE_GLARE_FOG_GLOW) {
+    uiItemL(layout, RPT_("Disabled, built without FFTW"), ICON_ERROR);
+  }
+#endif
+
   uiItemR(layout, ptr, "glare_type", UI_ITEM_R_SPLIT_EMPTY_NAME, "", ICON_NONE);
   uiItemR(layout, ptr, "quality", UI_ITEM_R_SPLIT_EMPTY_NAME, "", ICON_NONE);
 
-  const int glare_type = RNA_enum_get(ptr, "glare_type");
   if (ELEM(glare_type, CMP_NODE_GLARE_SIMPLE_STAR, CMP_NODE_GLARE_GHOST, CMP_NODE_GLARE_STREAKS)) {
     uiItemR(layout, ptr, "iterations", UI_ITEM_R_SPLIT_EMPTY_NAME, nullptr, ICON_NONE);
   }