Cycles: Increase minimum target on x86 to SSE4.2

* Compile regular host code with SSE4.2
* Remove the SSE2 kernel, only the SSE4.2 and AVX2 kernel remain

Pull Request: https://projects.blender.org/blender/blender/pulls/118471
Author: Thomas Dinges, 2024-02-26 14:49:19 +01:00 (committed by Thomas Dinges)
parent f9e8f2d857
commit 2b095c97fa
21 changed files with 46 additions and 283 deletions

@ -26,10 +26,10 @@ endif()
# Build Flags
# todo: this code could be refactored a bit to avoid duplication
# note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm)
# note: CXX_HAS_SSE42 is needed in case passing SSE flags fails altogether (gcc-arm)
if(WITH_CYCLES_NATIVE_ONLY)
set(CXX_HAS_SSE FALSE)
set(CXX_HAS_SSE42 FALSE)
set(CXX_HAS_AVX FALSE)
set(CXX_HAS_AVX2 FALSE)
add_definitions(
@ -65,11 +65,11 @@ if(WITH_CYCLES_NATIVE_ONLY)
set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}")
endif()
elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND))
set(CXX_HAS_SSE FALSE)
set(CXX_HAS_SSE42 FALSE)
set(CXX_HAS_AVX FALSE)
set(CXX_HAS_AVX2 FALSE)
elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CXX_HAS_SSE TRUE)
set(CXX_HAS_SSE42 TRUE)
set(CXX_HAS_AVX TRUE)
set(CXX_HAS_AVX2 TRUE)
@ -95,11 +95,9 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_SSE42_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
endif()
@ -109,7 +107,7 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Ox")
string(APPEND CMAKE_CXX_FLAGS_MINSIZEREL " /Ox")
elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
check_cxx_compiler_flag(-msse CXX_HAS_SSE)
check_cxx_compiler_flag(-msse4.2 CXX_HAS_SSE42)
check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
@ -127,26 +125,27 @@ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
string(APPEND CYCLES_KERNEL_FLAGS " -fno-rounding-math")
endif()
if(CXX_HAS_SSE)
if(CXX_HAS_SSE42)
if(CMAKE_COMPILER_IS_GNUCC)
string(APPEND CYCLES_KERNEL_FLAGS " -mfpmath=sse")
endif()
set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1 -msse4.2")
set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2")
if(CXX_HAS_AVX2)
set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
endif()
string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_SSE42_KERNEL_FLAGS}")
else()
string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}")
endif()
string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}")
elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
check_cxx_compiler_flag(/QxSSE2 CXX_HAS_SSE)
check_cxx_compiler_flag(/QxSSE4.2 CXX_HAS_SSE42)
check_cxx_compiler_flag(/arch:AVX CXX_HAS_AVX)
check_cxx_compiler_flag(/QxCORE-AVX2 CXX_HAS_AVX2)
if(CXX_HAS_SSE)
set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2")
if(CXX_HAS_SSE42)
set(CYCLES_SSE42_KERNEL_FLAGS "/QxSSE4.2")
if(CXX_HAS_AVX2)
@ -154,24 +153,11 @@ elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
endif()
endif()
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
if(APPLE)
# ICC does not support SSE2 flag on MacOSX
check_cxx_compiler_flag(-xssse3 CXX_HAS_SSE)
else()
check_cxx_compiler_flag(-xsse2 CXX_HAS_SSE)
endif()
check_cxx_compiler_flag(-xsse4.2 CXX_HAS_SSE42)
check_cxx_compiler_flag(-xavx CXX_HAS_AVX)
check_cxx_compiler_flag(-xcore-avx2 CXX_HAS_AVX2)
if(CXX_HAS_SSE)
if(APPLE)
# ICC does not support SSE2 flag on MacOSX
set(CYCLES_SSE2_KERNEL_FLAGS "-xssse3")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2")
endif()
if(CXX_HAS_SSE42)
set(CYCLES_SSE42_KERNEL_FLAGS "-xsse4.2")
if(CXX_HAS_AVX2)
@ -180,9 +166,8 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
endif()
endif()
if(CXX_HAS_SSE)
if(CXX_HAS_SSE42)
add_definitions(
-DWITH_KERNEL_SSE2
-DWITH_KERNEL_SSE42
)
endif()

@ -970,7 +970,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
debug_use_cpu_avx2: BoolProperty(name="AVX2", default=True)
debug_use_cpu_sse42: BoolProperty(name="SSE42", default=True)
debug_use_cpu_sse2: BoolProperty(name="SSE2", default=True)
debug_bvh_layout: EnumProperty(
name="BVH Layout",
items=enum_bvh_layouts,

@ -2234,7 +2234,6 @@ class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
col = layout.column(heading="CPU")
row = col.row(align=True)
row.prop(cscene, "debug_use_cpu_sse2", toggle=True)
row.prop(cscene, "debug_use_cpu_sse42", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_bvh_layout", text="BVH")

@ -66,7 +66,6 @@ static void debug_flags_sync_from_scene(BL::Scene b_scene)
/* Synchronize CPU flags. */
flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2");
flags.cpu.sse42 = get_boolean(cscene, "debug_use_cpu_sse42");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");

@ -46,7 +46,6 @@ void device_cpu_info(vector<DeviceInfo> &devices)
string device_cpu_capabilities()
{
string capabilities = "";
capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
capabilities += system_cpu_support_sse42() ? "SSE42 " : "";
capabilities += system_cpu_support_avx2() ? "AVX2" : "";
if (capabilities[capabilities.size() - 1] == ' ') {

@ -9,8 +9,7 @@
CCL_NAMESPACE_BEGIN
#define KERNEL_FUNCTIONS(name) \
KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name)
KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
#define REGISTER_KERNEL_FILM_CONVERT(name) \

@ -17,11 +17,10 @@ CCL_NAMESPACE_BEGIN
template<typename FunctionType> class CPUKernelFunction {
public:
CPUKernelFunction(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse42,
FunctionType kernel_avx2)
{
kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse42, kernel_avx2);
kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse42, kernel_avx2);
}
template<typename... Args> inline auto operator()(Args... args) const
@ -55,12 +54,10 @@ template<typename FunctionType> class CPUKernelFunction {
};
KernelInfo get_best_kernel_info(FunctionType kernel_default,
FunctionType kernel_sse2,
FunctionType kernel_sse42,
FunctionType kernel_avx2)
{
/* Silence warnings about unused variables when compiling without some architectures. */
(void)kernel_sse2;
(void)kernel_sse42;
(void)kernel_avx2;
@ -76,12 +73,6 @@ template<typename FunctionType> class CPUKernelFunction {
}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
return KernelInfo("SSE2", kernel_sse2);
}
#endif
return KernelInfo("default", kernel_default);
}

@ -14,7 +14,6 @@ set(INC_SYS
set(SRC_KERNEL_DEVICE_CPU
device/cpu/kernel.cpp
device/cpu/kernel_sse2.cpp
device/cpu/kernel_sse42.cpp
device/cpu/kernel_avx2.cpp
)
@ -1163,8 +1162,7 @@ endif()
set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
if(CXX_HAS_SSE42)
set_source_files_properties(device/cpu/kernel_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
endif()

@ -4,31 +4,25 @@
/* CPU kernel entry points */
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
* one with SSE2 intrinsics.
/* On x86-64, our minimum is SSE4.2, so avoid the extra kernel and compile this
* one with SSE4.2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE42__
#endif
/* When building kernel for native machine detect kernel features from the flags
* set by compiler.
*/
#ifdef WITH_KERNEL_NATIVE
# ifdef __SSE2__
# ifndef __KERNEL_SSE2__
# define __KERNEL_SSE2__
# endif
# endif
# ifdef __SSE3__
# define __KERNEL_SSE3__
# endif
# ifdef __SSSE3__
# define __KERNEL_SSSE3__
# endif
# ifdef __SSE4_2__
# define __KERNEL_SSE42__
# ifndef __KERNEL_SSE42__
# define __KERNEL_SSE42__
# endif
# endif
# ifdef __AVX__
# ifndef __KERNEL_SSE__

@ -33,9 +33,6 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem
#define KERNEL_ARCH cpu
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse42
#include "kernel/device/cpu/kernel_arch.h"

@ -1,22 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/device/cpu/kernel_arch_impl.h"

@ -2,7 +2,7 @@
*
* SPDX-License-Identifier: Apache-2.0 */
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
/* Optimized CPU kernel entry points. This file is compiled with SSE42
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
@ -13,6 +13,7 @@
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__

@ -16,8 +16,8 @@ static bool validate_cpu_capabilities()
return system_cpu_support_avx2();
#elif defined(__KERNEL_AVX__)
return system_cpu_support_avx();
#elif defined(__KERNEL_SSE2__)
return system_cpu_support_sse2();
#elif defined(__KERNEL_SSE42__)
return system_cpu_support_sse42();
#else
return false;
#endif

@ -20,7 +20,6 @@ set(SRC
path.cpp
profiling.cpp
string.cpp
simd.cpp
system.cpp
task.cpp
thread.cpp
@ -136,7 +135,7 @@ set(SRC_HEADERS
xml.h
)
if(CXX_HAS_SSE)
if(CXX_HAS_SSE42)
set_source_files_properties(transform_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)

@ -31,7 +31,6 @@ void DebugFlags::CPU::reset()
CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2");
CHECK_CPU_FLAGS(sse42, "CYCLES_CPU_NO_SSE42");
CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2");
#undef STRINGIFY
#undef CHECK_CPU_FLAGS

@ -27,7 +27,6 @@ class DebugFlags {
/* Flags describing which instructions sets are allowed for use. */
bool avx2 = true;
bool sse42 = true;
bool sse2 = true;
/* Check functions to see whether instructions up to the given one
* are allowed for use.
@ -38,11 +37,7 @@ class DebugFlags {
}
bool has_sse42()
{
return has_sse2() && sse42;
}
bool has_sse2()
{
return sse2;
return sse42;
}
/* Requested BVH layout.

@ -9,28 +9,25 @@
/* x86
*
* Compile a regular, SSE2 and SSE3 kernel. */
* Compile a regular and SSE42 kernel. */
# if defined(i386) || defined(_M_IX86)
/* We require minimum SSE2 support on x86, so auto enable. */
# define __KERNEL_SSE2__
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
/* We require minimum SSE4.2 support on x86, so auto enable. */
# define __KERNEL_SSE42__
# ifdef WITH_KERNEL_SSE42
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
# endif
/* x86-64
*
* Compile a regular (includes SSE2), SSE 4.2 and AVX2 kernel. */
* Compile a regular (includes SSE4.2) and AVX2 kernel. */
# elif defined(__x86_64__) || defined(_M_X64)
/* SSE2 is always available on x86-64 CPUs, so auto enable */
# define __KERNEL_SSE2__
/* no SSE2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_SSE42
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
# endif
/* SSE4.2 is our minimum requirement for x86-64 CPUs, so auto enable */
# define __KERNEL_SSE42__
/* no SSE4.2 kernel on x86-64, part of regular kernel */
# ifdef WITH_KERNEL_AVX2
# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# endif

@ -1,32 +0,0 @@
/* SPDX-FileCopyrightText: 2011-2013 Intel Corporation
* SPDX-FileCopyrightText: 2014-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#if (defined(WITH_KERNEL_SSE2)) || (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__))
# define __KERNEL_SSE2__
# include "util/simd.h"
CCL_NAMESPACE_BEGIN
const __m128 _mm_lookupmask_ps[16] = {_mm_castsi128_ps(_mm_set_epi32(0, 0, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, 0, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, 0, -1, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, -1)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, 0)),
_mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1))};
CCL_NAMESPACE_END
#endif // WITH_KERNEL_SSE2

@ -457,132 +457,12 @@ __forceinline uint64_t bitscan(uint64_t value)
#endif /* Intrinsics */
/* SSE compatibility.
*
* Various utilities to smooth over differences between SSE versions and
* implementations. */
#ifdef __KERNEL_SSE2__
/* Test __KERNEL_SSE42__ for MSVC which does not define __SSE4_2__, and test
* __SSE4_1__ and __SSE4_2__ to avoid OpenImageIO conflicts with our emulation macros on other
* platforms when compiling code outside the kernel. */
# if !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__))
/* Emulation of SSE4 functions with SSE2 */
# define _MM_FROUND_TO_NEAREST_INT 0x00
# define _MM_FROUND_TO_NEG_INF 0x01
# define _MM_FROUND_TO_POS_INF 0x02
# define _MM_FROUND_TO_ZERO 0x03
# define _MM_FROUND_CUR_DIRECTION 0x04
# undef _mm_blendv_ps
# define _mm_blendv_ps _mm_blendv_ps_emu
__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
__m128i isignmask = _mm_set1_epi32(0x80000000);
__m128 signmask = _mm_castsi128_ps(isignmask);
__m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
__m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
__m128 cmpmask = _mm_castsi128_ps(icmpmask);
return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}
# undef _mm_blend_ps
# define _mm_blend_ps _mm_blend_ps_emu
__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
{
assert(mask < 0x10);
return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}
# undef _mm_blendv_epi8
# define _mm_blendv_epi8 _mm_blendv_epi8_emu
__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
{
return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}
# undef _mm_min_epi32
# define _mm_min_epi32 _mm_min_epi32_emu
__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
{
return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}
# undef _mm_max_epi32
# define _mm_max_epi32 _mm_max_epi32_emu
__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
{
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
# ifndef __KERNEL_NEON__
# undef _mm_extract_epi32
# define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
switch (index) {
case 0:
return _mm_cvtsi128_si32(input);
case 1:
return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
case 2:
return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
case 3:
return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
default:
assert(false);
return 0;
}
}
# endif
# undef _mm_insert_epi32
# define _mm_insert_epi32 _mm_insert_epi32_emu
__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
{
assert(index >= 0 && index < 4);
((int *)&value)[index] = input;
return value;
}
# undef _mm_insert_ps
# define _mm_insert_ps _mm_insert_ps_emu
__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
{
assert(index < 0x100);
((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
}
# undef _mm_round_ps
# define _mm_round_ps _mm_round_ps_emu
__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
{
switch (flags) {
case _MM_FROUND_TO_NEAREST_INT:
return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
case _MM_FROUND_TO_NEG_INF:
return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
case _MM_FROUND_TO_POS_INF:
return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
case _MM_FROUND_TO_ZERO:
return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
}
return value;
}
# endif /* !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
* _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# undef _mm256_cvtss_f32
# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
# endif
#endif /* __KERNEL_SSE2__ */
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# undef _mm256_cvtss_f32
# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
#endif
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \

@ -129,7 +129,6 @@ int system_cpu_bits()
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
struct CPUCapabilities {
bool sse2;
bool sse42;
bool avx2;
};
@ -160,7 +159,6 @@ static CPUCapabilities &system_cpu_capabilities()
const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
/* Simplify to combined capabilities for which we specialize kernels. */
caps.sse2 = sse && sse2;
caps.sse42 = sse && sse2 && sse3 && ssse3 && sse41 && sse42;
if (os_uses_xsave_xrestore && cpu_avx_support) {
@ -195,12 +193,6 @@ static CPUCapabilities &system_cpu_capabilities()
return caps;
}
bool system_cpu_support_sse2()
{
CPUCapabilities &caps = system_cpu_capabilities();
return caps.sse2;
}
bool system_cpu_support_sse42()
{
CPUCapabilities &caps = system_cpu_capabilities();
@ -214,11 +206,6 @@ bool system_cpu_support_avx2()
}
#else
bool system_cpu_support_sse2()
{
return false;
}
bool system_cpu_support_sse42()
{
return false;

@ -17,7 +17,6 @@ int system_console_width();
std::string system_cpu_brand_string();
int system_cpu_bits();
bool system_cpu_support_sse2();
bool system_cpu_support_sse42();
bool system_cpu_support_avx2();