Cycles: Rename SSE4.1 kernel to SSE4.2

This commit updates all defines and compiler flags, and cleans up some code for unused CPU capabilities.

There should be no functional change, unless it's run on a CPU that supports SSE4.1 but not SSE4.2. It will fall back to the SSE2 kernel in this case.

In preparation for the new SSE4.2 minimum in Blender 4.2.

Pull Request: https://projects.blender.org/blender/blender/pulls/118043
This commit is contained in:
Thomas Dinges 2024-02-09 17:25:58 +01:00 committed by Thomas Dinges
parent 6f7ac2be53
commit 30a22b92ca
30 changed files with 75 additions and 79 deletions

@ -96,11 +96,11 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE42_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
endif()
@ -133,9 +133,9 @@ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
endif()
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1 -msse4.2")
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
endif()
endif()
@ -147,7 +147,7 @@ elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
if(CXX_HAS_SSE)
set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2")
set(CYCLES_SSE41_KERNEL_FLAGS "/QxSSE4.1")
set(CYCLES_SSE42_KERNEL_FLAGS "/QxSSE4.2")
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "/QxCORE-AVX2")
@ -172,7 +172,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2")
endif()
set(CYCLES_SSE41_KERNEL_FLAGS "-xsse4.1")
set(CYCLES_SSE42_KERNEL_FLAGS "-xsse4.2")
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "-xcore-avx2")
@ -183,7 +183,7 @@ endif()
if(CXX_HAS_SSE)
add_definitions(
-DWITH_KERNEL_SSE2
-DWITH_KERNEL_SSE41
-DWITH_KERNEL_SSE42
)
endif()

@ -969,7 +969,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
return _cycles.debug_flags_update(scene)
debug_use_cpu_avx2: BoolProperty(name="AVX2", default=True)
debug_use_cpu_sse41: BoolProperty(name="SSE41", default=True)
debug_use_cpu_sse42: BoolProperty(name="SSE42", default=True)
debug_use_cpu_sse2: BoolProperty(name="SSE2", default=True)
debug_bvh_layout: EnumProperty(
name="BVH Layout",

@ -2235,7 +2235,7 @@ class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
row = col.row(align=True)
row.prop(cscene, "debug_use_cpu_sse2", toggle=True)
row.prop(cscene, "debug_use_cpu_sse41", toggle=True)
row.prop(cscene, "debug_use_cpu_sse42", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_bvh_layout", text="BVH")

@ -65,7 +65,7 @@ static void debug_flags_sync_from_scene(BL::Scene b_scene)
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
/* Synchronize CPU flags. */
flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2");
flags.cpu.sse41 = get_boolean(cscene, "debug_use_cpu_sse41");
flags.cpu.sse42 = get_boolean(cscene, "debug_use_cpu_sse42");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
/* Synchronize CUDA flags. */

@ -47,7 +47,7 @@ string device_cpu_capabilities()
{
string capabilities = "";
capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
capabilities += system_cpu_support_sse42() ? "SSE42 " : "";
capabilities += system_cpu_support_avx2() ? "AVX2" : "";
if (capabilities[capabilities.size() - 1] == ' ') {
capabilities.resize(capabilities.size() - 1);

@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN
#define KERNEL_FUNCTIONS(name) \
KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
KERNEL_NAME_EVAL(cpu_sse41, name), KERNEL_NAME_EVAL(cpu_avx2, name)
KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
#define REGISTER_KERNEL_FILM_CONVERT(name) \

@ -13,15 +13,15 @@ CCL_NAMESPACE_BEGIN
*
* Provides a function-call-like API which gets routed to the most suitable implementation.
*
* For example, on a computer which only has SSE4.1 the kernel_sse41 will be used. */
* For example, on a computer which only has SSE4.2 the kernel_sse42 will be used. */
template<typename FunctionType> class CPUKernelFunction {
public:
CPUKernelFunction(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse41,
FunctionType kernel_sse42,
FunctionType kernel_avx2)
{
kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse41, kernel_avx2);
kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse42, kernel_avx2);
}
template<typename... Args> inline auto operator()(Args... args) const
@ -56,12 +56,12 @@ template<typename FunctionType> class CPUKernelFunction {
KernelInfo get_best_kernel_info(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse41,
FunctionType kernel_sse42,
FunctionType kernel_avx2)
{
/* Silence warnings about unused variables when compiling without some architectures. */
(void)kernel_sse2;
(void)kernel_sse41;
(void)kernel_sse42;
(void)kernel_avx2;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
@ -70,9 +70,9 @@ template<typename FunctionType> class CPUKernelFunction {
}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
return KernelInfo("SSE4.1", kernel_sse41);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
if (DebugFlags().cpu.has_sse42() && system_cpu_support_sse42()) {
return KernelInfo("SSE4.2", kernel_sse42);
}
#endif

@ -15,7 +15,7 @@ set(INC_SYS
set(SRC_KERNEL_DEVICE_CPU
device/cpu/kernel.cpp
device/cpu/kernel_sse2.cpp
device/cpu/kernel_sse41.cpp
device/cpu/kernel_sse42.cpp
device/cpu/kernel_avx2.cpp
)
@ -1165,7 +1165,7 @@ set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CY
if(CXX_HAS_SSE)
set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(device/cpu/kernel_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)

@ -27,8 +27,8 @@
# ifdef __SSSE3__
# define __KERNEL_SSSE3__
# endif
# ifdef __SSE4_1__
# define __KERNEL_SSE41__
# ifdef __SSE4_2__
# define __KERNEL_SSE42__
# endif
# ifdef __AVX__
# ifndef __KERNEL_SSE__

@ -36,7 +36,7 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem
#define KERNEL_ARCH cpu_sse2
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse41
#define KERNEL_ARCH cpu_sse42
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx2

@ -17,7 +17,7 @@
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_SSE42__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif

@ -8,7 +8,7 @@
#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
@ -16,10 +16,10 @@
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_SSE42__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE42 */
#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse41
#define KERNEL_ARCH cpu_sse42
#include "kernel/device/cpu/kernel_arch_impl.h"

@ -27,7 +27,7 @@ set(SRC
time.cpp
transform.cpp
transform_avx2.cpp
transform_sse41.cpp
transform_sse42.cpp
windows.cpp
)
@ -137,7 +137,7 @@ set(SRC_HEADERS
)
if(CXX_HAS_SSE)
set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(transform_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")

@ -30,7 +30,7 @@ void DebugFlags::CPU::reset()
} while (0)
CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2");
CHECK_CPU_FLAGS(sse41, "CYCLES_CPU_NO_SSE41");
CHECK_CPU_FLAGS(sse42, "CYCLES_CPU_NO_SSE42");
CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2");
#undef STRINGIFY

@ -26,7 +26,7 @@ class DebugFlags {
/* Flags describing which instructions sets are allowed for use. */
bool avx2 = true;
bool sse41 = true;
bool sse42 = true;
bool sse2 = true;
/* Check functions to see whether instructions up to the given one
@ -34,11 +34,11 @@ class DebugFlags {
*/
bool has_avx2()
{
return has_sse41() && avx2;
return has_sse42() && avx2;
}
bool has_sse41()
bool has_sse42()
{
return has_sse2() && sse41;
return has_sse2() && sse42;
}
bool has_sse2()
{

@ -22,7 +22,7 @@ static int guiding_device_type()
if (system_cpu_support_avx2()) {
return 8;
}
if (system_cpu_support_sse41()) {
if (system_cpu_support_sse42()) {
return 4;
}
return 0;

@ -62,7 +62,7 @@ ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
ccl_device_inline int fast_rint(float x)
{
/* used by sin/cos/tan range reduction. */
#ifdef __KERNEL_SSE41__
#ifdef __KERNEL_SSE42__
/* Single `roundps` instruction on SSE4.1+ for gcc/clang but not MSVC 19.35:
* float_to_int(rintf(x)); so we use the equivalent intrinsics. */
__m128 vec = _mm_set_ss(x);

@ -200,7 +200,7 @@ ccl_device_inline bool operator!=(const float3 a, const float3 b)
ccl_device_inline float dot(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
# else
return a.x * b.x + a.y * b.y + a.z * b.z;
@ -211,7 +211,7 @@ ccl_device_inline float dot(const float3 a, const float3 b)
ccl_device_inline float dot_xy(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
#else
return a.x * b.x + a.y * b.y;
@ -220,7 +220,7 @@ ccl_device_inline float dot_xy(const float3 a, const float3 b)
ccl_device_inline float len(const float3 a)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
#else
return sqrtf(dot(a, a));
@ -264,7 +264,7 @@ ccl_device_inline float3 cross(const float3 a, const float3 b)
ccl_device_inline float3 normalize(const float3 a)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
return float3(_mm_div_ps(a.m128, norm));
# else

@ -364,7 +364,7 @@ ccl_device_inline float reduce_max(const float4 a)
#if !defined(__KERNEL_METAL__)
ccl_device_inline float dot(const float4 a, const float4 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
@ -534,7 +534,7 @@ ccl_device_inline bool isequal(const float4 a, const float4 b)
ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_SSE41__
# ifdef __KERNEL_SSE42__
return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
# else
return float4(

@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN
#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
return int3(_mm_min_epi32(a.m128, b.m128));
# else
return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
@ -23,7 +23,7 @@ ccl_device_inline int3 min(int3 a, int3 b)
ccl_device_inline int3 max(int3 a, int3 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
return int3(_mm_max_epi32(a.m128, b.m128));
# else
return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));

@ -203,7 +203,7 @@ ccl_device_forceinline const int4 srl(const int4 a, const int32_t b)
ccl_device_inline int4 min(int4 a, int4 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
return int4(_mm_min_epi32(a.m128, b.m128));
# else
return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
@ -212,7 +212,7 @@ ccl_device_inline int4 min(int4 a, int4 b)
ccl_device_inline int4 max(int4 a, int4 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
return int4(_mm_max_epi32(a.m128, b.m128));
# else
return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));

@ -136,7 +136,7 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
return madd(make_float4(a.x),
make_float4(b.x),
madd(make_float4(a.y), make_float4(b.y), make_float4(a.z) * make_float4(b.z)))[0];
@ -147,7 +147,7 @@ ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
return make_float3(
msub(make_float4(a.y), make_float4(b.z), make_float4(a.z) * make_float4(b.y))[0],
msub(make_float4(a.z), make_float4(b.x), make_float4(a.x) * make_float4(b.z))[0],

@ -20,7 +20,7 @@ static inline bool openimagedenoise_supported()
/* Always supported through Accelerate framework BNNS. */
return true;
# else
return system_cpu_support_sse41();
return system_cpu_support_sse42();
# endif
#else
return false;

@ -21,15 +21,15 @@
/* x86-64
*
* Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
* Compile a regular (includes SSE2), SSE 4.2 and AVX2 kernel. */
# elif defined(__x86_64__) || defined(_M_X64)
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE41
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# ifdef WITH_KERNEL_SSE42
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
# endif
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
@ -47,7 +47,7 @@
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSE41__
# define __KERNEL_SSE42__
# endif

@ -463,10 +463,10 @@ __forceinline uint64_t bitscan(uint64_t value)
* implementations. */
#ifdef __KERNEL_SSE2__
/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
* __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
/* Test __KERNEL_SSE42__ for MSVC which does not define __SSE4_2__, and test
* __SSE4_1__ and __SSE4_2__ to avoid OpenImageIO conflicts with our emulation macros on other
* platforms when compiling code outside the kernel. */
# if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
# if !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__))
/* Emulation of SSE4 functions with SSE2 */
@ -573,7 +573,7 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
return value;
}
# endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
# endif /* !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
* _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
@ -586,7 +586,7 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
#endif

@ -130,9 +130,7 @@ int system_cpu_bits()
struct CPUCapabilities {
bool sse2;
bool sse3;
bool sse41;
bool avx;
bool sse42;
bool avx2;
};
@ -155,7 +153,7 @@ static CPUCapabilities &system_cpu_capabilities()
const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
// const bool sse42 = (result[2] & ((int)1 << 20)) != 0;
const bool sse42 = (result[2] & ((int)1 << 20)) != 0;
const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
@ -163,8 +161,7 @@ static CPUCapabilities &system_cpu_capabilities()
/* Simplify to combined capabilities for which we specialize kernels. */
caps.sse2 = sse && sse2;
caps.sse3 = sse && sse2 && sse3 && ssse3;
caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
caps.sse42 = sse && sse2 && sse3 && ssse3 && sse41 && sse42;
if (os_uses_xsave_xrestore && cpu_avx_support) {
// Check if the OS will save the YMM registers
@ -187,9 +184,8 @@ static CPUCapabilities &system_cpu_capabilities()
bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
bool avx2 = (result[1] & ((int)1 << 5)) != 0;
caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
bmi2;
caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && sse42 && avx && f16c && avx2 &&
fma3 && bmi1 && bmi2;
}
}
@ -205,10 +201,10 @@ bool system_cpu_support_sse2()
return caps.sse2;
}
bool system_cpu_support_sse41()
bool system_cpu_support_sse42()
{
CPUCapabilities &caps = system_cpu_capabilities();
return caps.sse41;
return caps.sse42;
}
bool system_cpu_support_avx2()
@ -223,7 +219,7 @@ bool system_cpu_support_sse2()
return false;
}
bool system_cpu_support_sse41()
bool system_cpu_support_sse42()
{
return false;
}

@ -18,7 +18,7 @@ int system_console_width();
std::string system_cpu_brand_string();
int system_cpu_bits();
bool system_cpu_support_sse2();
bool system_cpu_support_sse41();
bool system_cpu_support_sse42();
bool system_cpu_support_avx2();
size_t system_physical_ram();

@ -405,7 +405,7 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
}
#ifndef __KERNEL_GPU__
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
void transform_inverse_cpu_sse42(const Transform &tfm, Transform &itfm);
void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
#endif
@ -418,9 +418,9 @@ ccl_device_inline Transform transform_inverse(const Transform tfm)
transform_inverse_cpu_avx2(tfm, itfm);
return itfm;
}
else if (system_cpu_support_sse41()) {
else if (system_cpu_support_sse42()) {
Transform itfm;
transform_inverse_cpu_sse41(tfm, itfm);
transform_inverse_cpu_sse42(tfm, itfm);
return itfm;
}
#endif

@ -30,7 +30,7 @@ ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const flo
ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
const __m128 a = (const __m128 &)a_;
const __m128 b = (const __m128 &)b_;
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));

@ -6,7 +6,7 @@
CCL_NAMESPACE_BEGIN
void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
void transform_inverse_cpu_sse42(const Transform &tfm, Transform &itfm)
{
itfm = transform_inverse_impl(tfm);
}