From 2b095c97fad129683e60bb1ec0d3a6fdefdd38af Mon Sep 17 00:00:00 2001 From: Thomas Dinges Date: Mon, 26 Feb 2024 14:49:19 +0100 Subject: [PATCH] Cycles: Increase minimum target on x86 to SSE4.2 * Compile regular host code with SSE4.2 * Remove the SSE2 kernel, only the SSE4.2 and AVX2 kernel remain Pull Request: https://projects.blender.org/blender/blender/pulls/118471 --- intern/cycles/CMakeLists.txt | 47 +++---- intern/cycles/blender/addon/properties.py | 1 - intern/cycles/blender/addon/ui.py | 1 - intern/cycles/blender/python.cpp | 1 - intern/cycles/device/cpu/device.cpp | 1 - intern/cycles/device/cpu/kernel.cpp | 3 +- intern/cycles/device/cpu/kernel_function.h | 11 +- intern/cycles/kernel/CMakeLists.txt | 4 +- intern/cycles/kernel/device/cpu/kernel.cpp | 22 ++- intern/cycles/kernel/device/cpu/kernel.h | 3 - .../cycles/kernel/device/cpu/kernel_sse2.cpp | 22 --- .../cycles/kernel/device/cpu/kernel_sse42.cpp | 3 +- intern/cycles/test/util_float8_test.h | 4 +- intern/cycles/util/CMakeLists.txt | 3 +- intern/cycles/util/debug.cpp | 1 - intern/cycles/util/debug.h | 7 +- intern/cycles/util/optimization.h | 21 ++- intern/cycles/util/simd.cpp | 32 ----- intern/cycles/util/simd.h | 128 +----------------- intern/cycles/util/system.cpp | 13 -- intern/cycles/util/system.h | 1 - 21 files changed, 46 insertions(+), 283 deletions(-) delete mode 100644 intern/cycles/kernel/device/cpu/kernel_sse2.cpp delete mode 100644 intern/cycles/util/simd.cpp diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index bee0fe16637..11d45a9f4b6 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -26,10 +26,10 @@ endif() # Build Flags # todo: this code could be refactored a bit to avoid duplication -# note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm) +# note: CXX_HAS_SSE42 is needed in case passing SSE flags fails altogether (gcc-arm) if(WITH_CYCLES_NATIVE_ONLY) - set(CXX_HAS_SSE FALSE) + set(CXX_HAS_SSE42 FALSE) set(CXX_HAS_AVX FALSE) set(CXX_HAS_AVX2 FALSE) add_definitions( @@ -65,11 +65,11 @@ if(WITH_CYCLES_NATIVE_ONLY) set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}") endif() elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND)) - set(CXX_HAS_SSE FALSE) + set(CXX_HAS_SSE42 FALSE) set(CXX_HAS_AVX FALSE) set(CXX_HAS_AVX2 FALSE) elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(CXX_HAS_SSE TRUE) + set(CXX_HAS_SSE42 TRUE) set(CXX_HAS_AVX TRUE) set(CXX_HAS_AVX2 TRUE) @@ -95,11 +95,9 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") # there is no /arch:SSE3, but intrinsics are available anyway if(CMAKE_CL_64) - set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") else() - set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") set(CYCLES_SSE42_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") endif() @@ -109,7 +107,7 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Ox") string(APPEND CMAKE_CXX_FLAGS_MINSIZEREL " /Ox") elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) - check_cxx_compiler_flag(-msse CXX_HAS_SSE) + check_cxx_compiler_flag(-msse4.2 CXX_HAS_SSE42) check_cxx_compiler_flag(-mavx CXX_HAS_AVX) check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) @@ -127,26 +125,27 @@ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) string(APPEND CYCLES_KERNEL_FLAGS " -fno-rounding-math") endif() - if(CXX_HAS_SSE) + if(CXX_HAS_SSE42) if(CMAKE_COMPILER_IS_GNUCC) string(APPEND CYCLES_KERNEL_FLAGS " -mfpmath=sse") endif() - set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2") - set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1 -msse4.2") + set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2") if(CXX_HAS_AVX2) set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") endif() + + string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_SSE42_KERNEL_FLAGS}") + else() + string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}") endif() - string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}") elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel") - check_cxx_compiler_flag(/QxSSE2 CXX_HAS_SSE) + check_cxx_compiler_flag(/QxSSE4.2 CXX_HAS_SSE42) check_cxx_compiler_flag(/arch:AVX CXX_HAS_AVX) check_cxx_compiler_flag(/QxCORE-AVX2 CXX_HAS_AVX2) - if(CXX_HAS_SSE) - set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2") + if(CXX_HAS_SSE42) set(CYCLES_SSE42_KERNEL_FLAGS "/QxSSE4.2") if(CXX_HAS_AVX2) @@ -154,24 +153,11 @@ elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel") endif() endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") - if(APPLE) - # ICC does not support SSE2 flag on MacOSX - check_cxx_compiler_flag(-xssse3 CXX_HAS_SSE) - else() - check_cxx_compiler_flag(-xsse2 CXX_HAS_SSE) - endif() - + check_cxx_compiler_flag(-xsse4.2 CXX_HAS_SSE42) check_cxx_compiler_flag(-xavx CXX_HAS_AVX) check_cxx_compiler_flag(-xcore-avx2 CXX_HAS_AVX2) - if(CXX_HAS_SSE) - if(APPLE) - # ICC does not support SSE2 flag on MacOSX - set(CYCLES_SSE2_KERNEL_FLAGS "-xssse3") - else() - set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2") - endif() - + if(CXX_HAS_SSE42) set(CYCLES_SSE42_KERNEL_FLAGS "-xsse4.2") if(CXX_HAS_AVX2) @@ -180,9 +166,8 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") endif() endif() -if(CXX_HAS_SSE) +if(CXX_HAS_SSE42) add_definitions( - -DWITH_KERNEL_SSE2 -DWITH_KERNEL_SSE42 ) endif() diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 086f30dcf24..78c4a831816 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -970,7 +970,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): debug_use_cpu_avx2: BoolProperty(name="AVX2", default=True) debug_use_cpu_sse42: BoolProperty(name="SSE42", default=True) - debug_use_cpu_sse2: BoolProperty(name="SSE2", default=True) debug_bvh_layout: EnumProperty( name="BVH Layout", items=enum_bvh_layouts, diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 258a463e889..4c80fff37d5 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -2234,7 +2234,6 @@ class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel): col = layout.column(heading="CPU") row = col.row(align=True) - row.prop(cscene, "debug_use_cpu_sse2", toggle=True) row.prop(cscene, "debug_use_cpu_sse42", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_bvh_layout", text="BVH") diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp index ea41b12bd42..b65a5fe3f83 100644 --- a/intern/cycles/blender/python.cpp +++ b/intern/cycles/blender/python.cpp @@ -66,7 +66,6 @@ static void debug_flags_sync_from_scene(BL::Scene b_scene) /* Synchronize CPU flags. */ flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2"); flags.cpu.sse42 = get_boolean(cscene, "debug_use_cpu_sse42"); - flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp index 77d7f6cc2a2..02db1cf3601 100644 --- a/intern/cycles/device/cpu/device.cpp +++ b/intern/cycles/device/cpu/device.cpp @@ -46,7 +46,6 @@ void device_cpu_info(vector &devices) string device_cpu_capabilities() { string capabilities = ""; - capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; capabilities += system_cpu_support_sse42() ? "SSE42 " : ""; capabilities += system_cpu_support_avx2() ? "AVX2" : ""; if (capabilities[capabilities.size() - 1] == ' ') { diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp index 1a597ee3cef..af8e27560da 100644 --- a/intern/cycles/device/cpu/kernel.cpp +++ b/intern/cycles/device/cpu/kernel.cpp @@ -9,8 +9,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTIONS(name) \ - KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name) + KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name) #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) #define REGISTER_KERNEL_FILM_CONVERT(name) \ diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h index 4ba7cfda928..edd920e40b8 100644 --- a/intern/cycles/device/cpu/kernel_function.h +++ b/intern/cycles/device/cpu/kernel_function.h @@ -17,11 +17,10 @@ CCL_NAMESPACE_BEGIN template class CPUKernelFunction { public: CPUKernelFunction(FunctionType kernel_default, - FunctionType kernel_sse2, FunctionType kernel_sse42, FunctionType kernel_avx2) { - kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse42, kernel_avx2); + kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse42, kernel_avx2); } template inline auto operator()(Args... args) const @@ -55,12 +54,10 @@ template class CPUKernelFunction { }; KernelInfo get_best_kernel_info(FunctionType kernel_default, - FunctionType kernel_sse2, FunctionType kernel_sse42, FunctionType kernel_avx2) { /* Silence warnings about unused variables when compiling without some architectures. */ - (void)kernel_sse2; (void)kernel_sse42; (void)kernel_avx2; @@ -76,12 +73,6 @@ template class CPUKernelFunction { } #endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - return KernelInfo("SSE2", kernel_sse2); - } -#endif - return KernelInfo("default", kernel_default); } diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 3b53e3b9b0e..0e880f672cd 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -14,7 +14,6 @@ set(INC_SYS set(SRC_KERNEL_DEVICE_CPU device/cpu/kernel.cpp - device/cpu/kernel_sse2.cpp device/cpu/kernel_sse42.cpp device/cpu/kernel_avx2.cpp ) @@ -1163,8 +1162,7 @@ endif() set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") -if(CXX_HAS_SSE) - set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") +if(CXX_HAS_SSE42) set_source_files_properties(device/cpu/kernel_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}") endif() diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp index cf42b8d6cda..22d4507327c 100644 --- a/intern/cycles/kernel/device/cpu/kernel.cpp +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -4,31 +4,25 @@ /* CPU kernel entry points */ -/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this - * one with SSE2 intrinsics. +/* On x86-64, our minimum is SSE4.2, so avoid the extra kernel and compile this + * one with SSE4.2 intrinsics. */ #if defined(__x86_64__) || defined(_M_X64) # define __KERNEL_SSE__ # define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE42__ #endif /* When building kernel for native machine detect kernel features from the flags * set by compiler. */ #ifdef WITH_KERNEL_NATIVE -# ifdef __SSE2__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# endif -# ifdef __SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifdef __SSSE3__ -# define __KERNEL_SSSE3__ -# endif # ifdef __SSE4_2__ -# define __KERNEL_SSE42__ +# ifndef __KERNEL_SSE42__ +# define __KERNEL_SSE42__ +# endif # endif # ifdef __AVX__ # ifndef __KERNEL_SSE__ diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index 902e2d341a9..873a9b48739 100644 --- a/intern/cycles/kernel/device/cpu/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -33,9 +33,6 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem #define KERNEL_ARCH cpu #include "kernel/device/cpu/kernel_arch.h" -#define KERNEL_ARCH cpu_sse2 -#include "kernel/device/cpu/kernel_arch.h" - #define KERNEL_ARCH cpu_sse42 #include "kernel/device/cpu/kernel_arch.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp deleted file mode 100644 index 35c4185177e..00000000000 --- a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation - * - * SPDX-License-Identifier: Apache-2.0 */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug #36316. */ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#include "kernel/device/cpu/kernel.h" -#define KERNEL_ARCH cpu_sse2 -#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_sse42.cpp b/intern/cycles/kernel/device/cpu/kernel_sse42.cpp index 6215df32f12..d9a12e8a224 100644 --- a/intern/cycles/kernel/device/cpu/kernel_sse42.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse42.cpp @@ -2,7 +2,7 @@ * * SPDX-License-Identifier: Apache-2.0 */ -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 +/* Optimized CPU kernel entry points. This file is compiled with SSE42 * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ @@ -13,6 +13,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316. */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/test/util_float8_test.h b/intern/cycles/test/util_float8_test.h index 7f5843b646a..fab2950d429 100644 --- a/intern/cycles/test/util_float8_test.h +++ b/intern/cycles/test/util_float8_test.h @@ -16,8 +16,8 @@ static bool validate_cpu_capabilities() return system_cpu_support_avx2(); #elif defined(__KERNEL_AVX__) return system_cpu_support_avx(); -#elif defined(__KERNEL_SSE2__) - return system_cpu_support_sse2(); +#elif defined(__KERNEL_SSE42__) + return system_cpu_support_sse42(); #else return false; #endif diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index c8c8373436b..2f854e3d69a 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -20,7 +20,6 @@ set(SRC path.cpp profiling.cpp string.cpp - simd.cpp system.cpp task.cpp thread.cpp @@ -136,7 +135,7 @@ set(SRC_HEADERS xml.h ) -if(CXX_HAS_SSE) +if(CXX_HAS_SSE42) set_source_files_properties(transform_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp index 903a1d4cf23..ee6ae3a678d 100644 --- a/intern/cycles/util/debug.cpp +++ b/intern/cycles/util/debug.cpp @@ -31,7 +31,6 @@ void DebugFlags::CPU::reset() CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2"); CHECK_CPU_FLAGS(sse42, "CYCLES_CPU_NO_SSE42"); - CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2"); #undef STRINGIFY #undef CHECK_CPU_FLAGS diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h index 6759256c1a4..beadbe938f9 100644 --- a/intern/cycles/util/debug.h +++ b/intern/cycles/util/debug.h @@ -27,7 +27,6 @@ class DebugFlags { /* Flags describing which instructions sets are allowed for use. */ bool avx2 = true; bool sse42 = true; - bool sse2 = true; /* Check functions to see whether instructions up to the given one * are allowed for use. @@ -38,11 +37,7 @@ class DebugFlags { } bool has_sse42() { - return has_sse2() && sse42; - } - bool has_sse2() - { - return sse2; + return sse42; } /* Requested BVH layout. diff --git a/intern/cycles/util/optimization.h b/intern/cycles/util/optimization.h index dfa19f11352..e9a4ad8e6e6 100644 --- a/intern/cycles/util/optimization.h +++ b/intern/cycles/util/optimization.h @@ -9,28 +9,25 @@ /* x86 * - * Compile a regular, SSE2 and SSE3 kernel. */ + * Compile a regular and SSE42 kernel. */ # if defined(i386) || defined(_M_IX86) -/* We require minimum SSE2 support on x86, so auto enable. */ -# define __KERNEL_SSE2__ -# ifdef WITH_KERNEL_SSE2 -# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +/* We require minimum SSE4.2 support on x86, so auto enable. */ +# define __KERNEL_SSE42__ +# ifdef WITH_KERNEL_SSE42 +# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42 # endif /* x86-64 * - * Compile a regular (includes SSE2), SSE 4.2 and AVX2 kernel. */ + * Compile a regular (includes SSE4.2) and AVX2 kernel. */ # elif defined(__x86_64__) || defined(_M_X64) -/* SSE2 is always available on x86-64 CPUs, so auto enable */ -# define __KERNEL_SSE2__ -/* no SSE2 kernel on x86-64, part of regular kernel */ -# ifdef WITH_KERNEL_SSE42 -# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42 -# endif +/* SSE4.2 is our minimum requirement for x86-64 CPUs, so auto enable */ +# define __KERNEL_SSE42__ +/* no SSE4.2 kernel on x86-64, part of regular kernel */ # ifdef WITH_KERNEL_AVX2 # define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 # endif diff --git a/intern/cycles/util/simd.cpp b/intern/cycles/util/simd.cpp deleted file mode 100644 index 12c5aa5b4b3..00000000000 --- a/intern/cycles/util/simd.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation - * SPDX-FileCopyrightText: 2014-2022 Blender Foundation - * - * SPDX-License-Identifier: Apache-2.0 */ - -#if (defined(WITH_KERNEL_SSE2)) || (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__)) - -# define __KERNEL_SSE2__ -# include "util/simd.h" - -CCL_NAMESPACE_BEGIN - -const __m128 _mm_lookupmask_ps[16] = {_mm_castsi128_ps(_mm_set_epi32(0, 0, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(0, 0, 0, -1)), - _mm_castsi128_ps(_mm_set_epi32(0, 0, -1, 0)), - _mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)), - _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)), - _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, 0)), - _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, -1)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, -1, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, 0, -1, -1)), - _mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, -1)), - _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, 0)), - _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1))}; - -CCL_NAMESPACE_END - -#endif // WITH_KERNEL_SSE2 diff --git a/intern/cycles/util/simd.h b/intern/cycles/util/simd.h index 8c1c46b89dc..a832811718c 100644 --- a/intern/cycles/util/simd.h +++ b/intern/cycles/util/simd.h @@ -457,132 +457,12 @@ __forceinline uint64_t bitscan(uint64_t value) #endif /* Intrinsics */ -/* SSE compatibility. - * - * Various utilities to smooth over differences between SSE versions and - * implementations. */ -#ifdef __KERNEL_SSE2__ - -/* Test __KERNEL_SSE42__ for MSVC which does not define __SSE4_2__, and test - * __SSE4_1__ and __SSE4_2__ to avoid OpenImageIO conflicts with our emulation macros on other - * platforms when compiling code outside the kernel. */ -# if !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__)) - -/* Emulation of SSE4 functions with SSE2 */ - -# define _MM_FROUND_TO_NEAREST_INT 0x00 -# define _MM_FROUND_TO_NEG_INF 0x01 -# define _MM_FROUND_TO_POS_INF 0x02 -# define _MM_FROUND_TO_ZERO 0x03 -# define _MM_FROUND_CUR_DIRECTION 0x04 - -# undef _mm_blendv_ps -# define _mm_blendv_ps _mm_blendv_ps_emu -__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask) -{ - __m128i isignmask = _mm_set1_epi32(0x80000000); - __m128 signmask = _mm_castsi128_ps(isignmask); - __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask)); - __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask); - __m128 cmpmask = _mm_castsi128_ps(icmpmask); - return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value)); -} - -# undef _mm_blend_ps -# define _mm_blend_ps _mm_blend_ps_emu -__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask) -{ - assert(mask < 0x10); - return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); -} - -# undef _mm_blendv_epi8 -# define _mm_blendv_epi8 _mm_blendv_epi8_emu -__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask) -{ - return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); -} - -# undef _mm_min_epi32 -# define _mm_min_epi32 _mm_min_epi32_emu -__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input) -{ - return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); -} - -# undef _mm_max_epi32 -# define _mm_max_epi32 _mm_max_epi32_emu -__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input) -{ - return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); -} - -# ifndef __KERNEL_NEON__ -# undef _mm_extract_epi32 -# define _mm_extract_epi32 _mm_extract_epi32_emu -__forceinline int _mm_extract_epi32_emu(__m128i input, const int index) -{ - switch (index) { - case 0: - return _mm_cvtsi128_si32(input); - case 1: - return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1))); - case 2: - return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2))); - case 3: - return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3))); - default: - assert(false); - return 0; - } -} -# endif - -# undef _mm_insert_epi32 -# define _mm_insert_epi32 _mm_insert_epi32_emu -__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index) -{ - assert(index >= 0 && index < 4); - ((int *)&value)[index] = input; - return value; -} - -# undef _mm_insert_ps -# define _mm_insert_ps _mm_insert_ps_emu -__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index) -{ - assert(index < 0x100); - ((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6]; - return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value); -} - -# undef _mm_round_ps -# define _mm_round_ps _mm_round_ps_emu -__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags) -{ - switch (flags) { - case _MM_FROUND_TO_NEAREST_INT: - return _mm_cvtepi32_ps(_mm_cvtps_epi32(value)); - case _MM_FROUND_TO_NEG_INF: - return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f)))); - case _MM_FROUND_TO_POS_INF: - return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f)))); - case _MM_FROUND_TO_ZERO: - return _mm_cvtepi32_ps(_mm_cvttps_epi32(value)); - } - return value; -} - -# endif /* !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ - /* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves. * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */ -# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) -# undef _mm256_cvtss_f32 -# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a))) -# endif - -#endif /* __KERNEL_SSE2__ */ +#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +# undef _mm256_cvtss_f32 +# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a))) +#endif /* quiet unused define warnings */ #if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \ diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp index afd3bef1961..e6e33ca1645 100644 --- a/intern/cycles/util/system.cpp +++ b/intern/cycles/util/system.cpp @@ -129,7 +129,6 @@ int system_cpu_bits() #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) struct CPUCapabilities { - bool sse2; bool sse42; bool avx2; }; @@ -160,7 +159,6 @@ static CPUCapabilities &system_cpu_capabilities() const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0; /* Simplify to combined capabilities for which we specialize kernels. */ - caps.sse2 = sse && sse2; caps.sse42 = sse && sse2 && sse3 && ssse3 && sse41 && sse42; if (os_uses_xsave_xrestore && cpu_avx_support) { @@ -195,12 +193,6 @@ static CPUCapabilities &system_cpu_capabilities() return caps; } -bool system_cpu_support_sse2() -{ - CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse2; -} - bool system_cpu_support_sse42() { CPUCapabilities &caps = system_cpu_capabilities(); @@ -214,11 +206,6 @@ bool system_cpu_support_avx2() } #else -bool system_cpu_support_sse2() -{ - return false; -} - bool system_cpu_support_sse42() { return false; diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h index f14c0551056..6dbef3c9df8 100644 --- a/intern/cycles/util/system.h +++ b/intern/cycles/util/system.h @@ -17,7 +17,6 @@ int system_console_width(); std::string system_cpu_brand_string(); int system_cpu_bits(); -bool system_cpu_support_sse2(); bool system_cpu_support_sse42(); bool system_cpu_support_avx2();