Cycles: Increase minimum target on x86 to SSE4.2

* Compile regular host code with SSE4.2
* Remove the SSE2 kernel, only the SSE4.2 and AVX2 kernel remain

Pull Request: https://projects.blender.org/blender/blender/pulls/118471
Author: Thomas Dinges, 2024-02-26 14:49:19 +01:00 (committed by Thomas Dinges)
parent f9e8f2d857
commit 2b095c97fa
21 changed files with 46 additions and 283 deletions

@ -26,10 +26,10 @@ endif()
# Build Flags
# todo: this code could be refactored a bit to avoid duplication
# note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm)
# note: CXX_HAS_SSE42 is needed in case passing SSE flags fails altogether (gcc-arm)
if(WITH_CYCLES_NATIVE_ONLY)
set(CXX_HAS_SSE FALSE)
set(CXX_HAS_SSE42 FALSE)
set(CXX_HAS_AVX FALSE)
set(CXX_HAS_AVX2 FALSE)
add_definitions(
@ -65,11 +65,11 @@ if(WITH_CYCLES_NATIVE_ONLY)
set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}")
endif()
elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND))
set(CXX_HAS_SSE FALSE)
set(CXX_HAS_SSE42 FALSE)
set(CXX_HAS_AVX FALSE)
set(CXX_HAS_AVX2 FALSE)
elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CXX_HAS_SSE TRUE)
set(CXX_HAS_SSE42 TRUE)
set(CXX_HAS_AVX TRUE)
set(CXX_HAS_AVX2 TRUE)
@ -95,11 +95,9 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE42_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
endif()
@ -109,7 +107,7 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Ox")
string(APPEND CMAKE_CXX_FLAGS_MINSIZEREL " /Ox")
elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
check_cxx_compiler_flag(-msse CXX_HAS_SSE)
check_cxx_compiler_flag(-msse4.2 CXX_HAS_SSE42)
check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
@ -127,26 +125,27 @@ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
string(APPEND CYCLES_KERNEL_FLAGS " -fno-rounding-math")
endif()
if(CXX_HAS_SSE)
if(CXX_HAS_SSE42)
if(CMAKE_COMPILER_IS_GNUCC)
string(APPEND CYCLES_KERNEL_FLAGS " -mfpmath=sse")
endif()
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1 -msse4.2")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2")
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
endif()
string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_SSE42_KERNEL_FLAGS}")
else()
string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}")
endif()
string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}")
elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
check_cxx_compiler_flag(/QxSSE2 CXX_HAS_SSE)
check_cxx_compiler_flag(/QxSSE4.2 CXX_HAS_SSE42)
check_cxx_compiler_flag(/arch:AVX CXX_HAS_AVX)
check_cxx_compiler_flag(/QxCORE-AVX2 CXX_HAS_AVX2)
if(CXX_HAS_SSE)
set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2")
if(CXX_HAS_SSE42)
set(CYCLES_SSE42_KERNEL_FLAGS "/QxSSE4.2")
if(CXX_HAS_AVX2)
@ -154,24 +153,11 @@ elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
endif()
endif()
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
if(APPLE)
# ICC does not support SSE2 flag on MacOSX
check_cxx_compiler_flag(-xssse3 CXX_HAS_SSE)
else()
check_cxx_compiler_flag(-xsse2 CXX_HAS_SSE)
endif()
check_cxx_compiler_flag(-xsse4.2 CXX_HAS_SSE42)
check_cxx_compiler_flag(-xavx CXX_HAS_AVX)
check_cxx_compiler_flag(-xcore-avx2 CXX_HAS_AVX2)
if(CXX_HAS_SSE)
if(APPLE)
# ICC does not support SSE2 flag on MacOSX
set(CYCLES_SSE2_KERNEL_FLAGS "-xssse3")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2")
endif()
if(CXX_HAS_SSE42)
set(CYCLES_SSE42_KERNEL_FLAGS "-xsse4.2")
if(CXX_HAS_AVX2)
@ -180,9 +166,8 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
endif()
endif()
if(CXX_HAS_SSE)
if(CXX_HAS_SSE42)
add_definitions(
-DWITH_KERNEL_SSE2
-DWITH_KERNEL_SSE42
)
endif()

@ -970,7 +970,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
debug_use_cpu_avx2: BoolProperty(name="AVX2", default=True)
debug_use_cpu_sse42: BoolProperty(name="SSE42", default=True)
debug_use_cpu_sse2: BoolProperty(name="SSE2", default=True)
debug_bvh_layout: EnumProperty(
name="BVH Layout",
items=enum_bvh_layouts,

@ -2234,7 +2234,6 @@ class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
col = layout.column(heading="CPU")
row = col.row(align=True)
row.prop(cscene, "debug_use_cpu_sse2", toggle=True)
row.prop(cscene, "debug_use_cpu_sse42", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_bvh_layout", text="BVH")

@ -66,7 +66,6 @@ static void debug_flags_sync_from_scene(BL::Scene b_scene)
/* Synchronize CPU flags. */
flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2");
flags.cpu.sse42 = get_boolean(cscene, "debug_use_cpu_sse42");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");

@ -46,7 +46,6 @@ void device_cpu_info(vector<DeviceInfo> &devices)
string device_cpu_capabilities()
{
string capabilities = "";
capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
capabilities += system_cpu_support_sse42() ? "SSE42 " : "";
capabilities += system_cpu_support_avx2() ? "AVX2" : "";
if (capabilities[capabilities.size() - 1] == ' ') {

@ -9,8 +9,7 @@
CCL_NAMESPACE_BEGIN
#define KERNEL_FUNCTIONS(name) \
KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name)
KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
#define REGISTER_KERNEL_FILM_CONVERT(name) \

@ -17,11 +17,10 @@ CCL_NAMESPACE_BEGIN
template<typename FunctionType> class CPUKernelFunction {
public:
CPUKernelFunction(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse42,
FunctionType kernel_avx2)
{
kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse42, kernel_avx2);
kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse42, kernel_avx2);
}
template<typename... Args> inline auto operator()(Args... args) const
@ -55,12 +54,10 @@ template<typename FunctionType> class CPUKernelFunction {
};
KernelInfo get_best_kernel_info(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse42,
FunctionType kernel_avx2)
{
/* Silence warnings about unused variables when compiling without some architectures. */
(void)kernel_sse2;
(void)kernel_sse42;
(void)kernel_avx2;
@ -76,12 +73,6 @@ template<typename FunctionType> class CPUKernelFunction {
}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
return KernelInfo("SSE2", kernel_sse2);
}
#endif
return KernelInfo("default", kernel_default);
}

@ -14,7 +14,6 @@ set(INC_SYS
set(SRC_KERNEL_DEVICE_CPU
device/cpu/kernel.cpp
device/cpu/kernel_sse2.cpp
device/cpu/kernel_sse42.cpp
device/cpu/kernel_avx2.cpp
)
@ -1163,8 +1162,7 @@ endif()
set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
if(CXX_HAS_SSE42)
set_source_files_properties(device/cpu/kernel_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
endif()

@ -4,31 +4,25 @@
/* CPU kernel entry points */
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
* one with SSE2 intrinsics.
/* On x86-64, our minimum is SSE4.2, so avoid the extra kernel and compile this
* one with SSE4.2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE42__
#endif
/* When building kernel for native machine detect kernel features from the flags
* set by compiler.
*/
#ifdef WITH_KERNEL_NATIVE
# ifdef __SSE2__
# ifndef __KERNEL_SSE2__
# define __KERNEL_SSE2__
# endif
# endif
# ifdef __SSE3__
# define __KERNEL_SSE3__
# endif
# ifdef __SSSE3__
# define __KERNEL_SSSE3__
# endif
# ifdef __SSE4_2__
# define __KERNEL_SSE42__
# ifndef __KERNEL_SSE42__
# define __KERNEL_SSE42__
# endif
# endif
# ifdef __AVX__
# ifndef __KERNEL_SSE__

@ -33,9 +33,6 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem
#define KERNEL_ARCH cpu
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse42
#include "kernel/device/cpu/kernel_arch.h"

@ -1,22 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/device/cpu/kernel_arch_impl.h"

@ -2,7 +2,7 @@
*
* SPDX-License-Identifier: Apache-2.0 */
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
/* Optimized CPU kernel entry points. This file is compiled with SSE42
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
@ -13,6 +13,7 @@
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__

@ -16,8 +16,8 @@ static bool validate_cpu_capabilities()
return system_cpu_support_avx2();
#elif defined(__KERNEL_AVX__)
return system_cpu_support_avx();
#elif defined(__KERNEL_SSE2__)
return system_cpu_support_sse2();
#elif defined(__KERNEL_SSE42__)
return system_cpu_support_sse42();
#else
return false;
#endif

@ -20,7 +20,6 @@ set(SRC
path.cpp
profiling.cpp
string.cpp
simd.cpp
system.cpp
task.cpp
thread.cpp
@ -136,7 +135,7 @@ set(SRC_HEADERS
xml.h
)
if(CXX_HAS_SSE)
if(CXX_HAS_SSE42)
set_source_files_properties(transform_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)

@ -31,7 +31,6 @@ void DebugFlags::CPU::reset()
CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2");
CHECK_CPU_FLAGS(sse42, "CYCLES_CPU_NO_SSE42");
CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2");
#undef STRINGIFY
#undef CHECK_CPU_FLAGS

@ -27,7 +27,6 @@ class DebugFlags {
/* Flags describing which instructions sets are allowed for use. */
bool avx2 = true;
bool sse42 = true;
bool sse2 = true;
/* Check functions to see whether instructions up to the given one
* are allowed for use.
@ -38,11 +37,7 @@ class DebugFlags {
}
bool has_sse42()
{
return has_sse2() && sse42;
}
bool has_sse2()
{
return sse2;
return sse42;
}
/* Requested BVH layout.

@ -9,28 +9,25 @@
/* x86
*
* Compile a regular, SSE2 and SSE3 kernel. */
* Compile a regular and SSE42 kernel. */
# if defined(i386) || defined(_M_IX86)
/* We require minimum SSE2 support on x86, so auto enable. */
# define __KERNEL_SSE2__
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
/* We require minimum SSE4.2 support on x86, so auto enable. */
# define __KERNEL_SSE42__
# ifdef WITH_KERNEL_SSE42
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
# endif
/* x86-64
*
* Compile a regular (includes SSE2), SSE 4.2 and AVX2 kernel. */
* Compile a regular (includes SSE4.2) and AVX2 kernel. */
# elif defined(__x86_64__) || defined(_M_X64)
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE42
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
# endif
/* SSE4.2 is our minimum requirement for x86-64 CPUs, so auto enable */
# define __KERNEL_SSE42__
/* no SSE4.2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# endif

@ -1,32 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
* SPDX-FileCopyrightText: 2014-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#if (defined(WITH_KERNEL_SSE2)) || (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__))
# define __KERNEL_SSE2__
# include "util/simd.h"
CCL_NAMESPACE_BEGIN
const __m128 _mm_lookupmask_ps[16] = {_mm_castsi128_ps(_mm_set_epi32(0, 0, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, 0, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, -1, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1))};
CCL_NAMESPACE_END
#endif // WITH_KERNEL_SSE2

@ -457,132 +457,12 @@ __forceinline uint64_t bitscan(uint64_t value)
#endif /* Intrinsics */
/* SSE compatibility.
*
* Various utilities to smooth over differences between SSE versions and
* implementations. */
#ifdef __KERNEL_SSE2__
/* Test __KERNEL_SSE42__ for MSVC which does not define __SSE4_2__, and test
* __SSE4_1__ and __SSE4_2__ to avoid OpenImageIO conflicts with our emulation macros on other
* platforms when compiling code outside the kernel. */
# if !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__))
/* Emulation of SSE4 functions with SSE2 */
# define _MM_FROUND_TO_NEAREST_INT 0x00
# define _MM_FROUND_TO_NEG_INF 0x01
# define _MM_FROUND_TO_POS_INF 0x02
# define _MM_FROUND_TO_ZERO 0x03
# define _MM_FROUND_CUR_DIRECTION 0x04
# undef _mm_blendv_ps
# define _mm_blendv_ps _mm_blendv_ps_emu
__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
__m128i isignmask = _mm_set1_epi32(0x80000000);
__m128 signmask = _mm_castsi128_ps(isignmask);
__m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
__m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
__m128 cmpmask = _mm_castsi128_ps(icmpmask);
return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}
# undef _mm_blend_ps
# define _mm_blend_ps _mm_blend_ps_emu
__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
{
assert(mask < 0x10);
return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}
# undef _mm_blendv_epi8
# define _mm_blendv_epi8 _mm_blendv_epi8_emu
__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
{
return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}
# undef _mm_min_epi32
# define _mm_min_epi32 _mm_min_epi32_emu
__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
{
return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}
# undef _mm_max_epi32
# define _mm_max_epi32 _mm_max_epi32_emu
__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
{
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
# ifndef __KERNEL_NEON__
# undef _mm_extract_epi32
# define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
switch (index) {
case 0:
return _mm_cvtsi128_si32(input);
case 1:
return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
case 2:
return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
case 3:
return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
default:
assert(false);
return 0;
}
}
# endif
# undef _mm_insert_epi32
# define _mm_insert_epi32 _mm_insert_epi32_emu
__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
{
assert(index >= 0 && index < 4);
((int *)&value)[index] = input;
return value;
}
# undef _mm_insert_ps
# define _mm_insert_ps _mm_insert_ps_emu
__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
{
assert(index < 0x100);
((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
}
# undef _mm_round_ps
# define _mm_round_ps _mm_round_ps_emu
__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
{
switch (flags) {
case _MM_FROUND_TO_NEAREST_INT:
return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
case _MM_FROUND_TO_NEG_INF:
return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
case _MM_FROUND_TO_POS_INF:
return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
case _MM_FROUND_TO_ZERO:
return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
}
return value;
}
# endif /* !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
* _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# undef _mm256_cvtss_f32
# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
# endif
#endif /* __KERNEL_SSE2__ */
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# undef _mm256_cvtss_f32
# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
#endif
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \

@ -129,7 +129,6 @@ int system_cpu_bits()
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
struct CPUCapabilities {
bool sse2;
bool sse42;
bool avx2;
};
@ -160,7 +159,6 @@ static CPUCapabilities &system_cpu_capabilities()
const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
/* Simplify to combined capabilities for which we specialize kernels. */
caps.sse2 = sse && sse2;
caps.sse42 = sse && sse2 && sse3 && ssse3 && sse41 && sse42;
if (os_uses_xsave_xrestore && cpu_avx_support) {
@ -195,12 +193,6 @@ static CPUCapabilities &system_cpu_capabilities()
return caps;
}
bool system_cpu_support_sse2()
{
CPUCapabilities &caps = system_cpu_capabilities();
return caps.sse2;
}
bool system_cpu_support_sse42()
{
CPUCapabilities &caps = system_cpu_capabilities();
@ -214,11 +206,6 @@ bool system_cpu_support_avx2()
}
#else
bool system_cpu_support_sse2()
{
return false;
}
bool system_cpu_support_sse42()
{
return false;

@ -17,7 +17,6 @@ int system_console_width();
std::string system_cpu_brand_string();
int system_cpu_bits();
bool system_cpu_support_sse2();
bool system_cpu_support_sse42();
bool system_cpu_support_avx2();