BLI: use sse2neon to emulate SSE instructions with Arm Neon

* WITH_CPU_SSE was renamed to WITH_CPU_SIMD, and now covers both SSE and Neon.
* For macOS sse2neon.h is included as part of the precompiled libraries.
* For Linux, it is enabled if the sse2neon.h header file is detected. However,
  this library does not have official releases and is not shipped with any Linux
  distribution, so manual installation and configuration are required to get this
  working.

Ref D8237, T78710
This commit is contained in:
Brecht Van Lommel 2021-02-14 04:16:39 +01:00
parent 859118d8f6
commit db28411fd9
6 changed files with 75 additions and 39 deletions

@ -370,8 +370,8 @@ if(WITH_PYTHON_INSTALL)
endif()
endif()
option(WITH_CPU_SSE "Enable SIMD instruction if they're detected on the host machine" ON)
mark_as_advanced(WITH_CPU_SSE)
# Master switch for CPU SIMD support, ON by default.
# Per the commit description this is the renamed WITH_CPU_SSE option and it
# now covers both SSE (x86) and Neon (Arm).
option(WITH_CPU_SIMD "Enable SIMD instruction if they're detected on the host machine" ON)
# Marked advanced so it stays out of the default option listing in the CMake GUIs.
mark_as_advanced(WITH_CPU_SIMD)
# Cycles
option(WITH_CYCLES "Enable Cycles Render Engine" ON)
@ -775,14 +775,6 @@ if(WITH_GHOST_SDL OR WITH_HEADLESS)
set(WITH_XR_OPENXR OFF)
endif()
if(WITH_CPU_SSE)
TEST_SSE_SUPPORT(COMPILER_SSE_FLAG COMPILER_SSE2_FLAG)
else()
message(STATUS "SSE and SSE2 optimizations are DISABLED!")
set(COMPILER_SSE_FLAG)
set(COMPILER_SSE2_FLAG)
endif()
if(WITH_BUILDINFO)
find_package(Git)
if(NOT GIT_FOUND)
@ -962,22 +954,55 @@ if(WITH_INTERNATIONAL)
endif()
endif()
# See TEST_SSE_SUPPORT() for how this is defined.
# See TEST_SSE_SUPPORT() and TEST_NEON_SUPPORT() for how these are defined.
#
# This is done globally, so that all modules can use it if available, and
# because these are used in headers used by many modules.
if(WITH_CPU_SIMD)
set(COMPILER_SSE_FLAG)
set(COMPILER_SSE2_FLAG)
# Do it globally, SSE2 is required for quite some time now.
# Doing it now allows to use SSE/SSE2 in inline headers.
if(SUPPORT_SSE_BUILD)
string(PREPEND PLATFORM_CFLAGS "${COMPILER_SSE_FLAG} ")
add_definitions(-D__SSE__ -D__MMX__)
endif()
if(SUPPORT_SSE2_BUILD)
string(APPEND PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG}")
add_definitions(-D__SSE2__)
if(NOT SUPPORT_SSE_BUILD) # don't double up
add_definitions(-D__MMX__)
# Pick the SIMD backend: Arm Neon (through sse2neon) or native SSE/SSE2.
#
# Test Neon first since macOS Arm can compile and run x86-64 SSE binaries.
# TEST_NEON_SUPPORT() stores its compile-check result in SUPPORT_NEON_BUILD.
TEST_NEON_SUPPORT()
if(SUPPORT_NEON_BUILD)
# Neon
# Neon is only used through the sse2neon header; without it nothing is
# added here and no SIMD define is set.
# NOTE(review): SSE2NEON_FOUND / SSE2NEON_INCLUDE_DIRS are presumably set by
# the platform files' sse2neon package lookup - confirm.
if(SSE2NEON_FOUND)
blender_include_dirs_sys("${SSE2NEON_INCLUDE_DIRS}")
# WITH_SSE2NEON is the define the SIMD header tests (together with
# __ARM_NEON) before including <sse2neon.h>.
add_definitions(-DWITH_SSE2NEON)
endif()
else()
# SSE
# Runs the SSE/SSE2 compile checks, which set SUPPORT_SSE_BUILD and
# SUPPORT_SSE2_BUILD; the two arguments name the variables that receive the
# compiler flags (presumably filled in by the macro - confirm).
TEST_SSE_SUPPORT(COMPILER_SSE_FLAG COMPILER_SSE2_FLAG)
if(SUPPORT_SSE_BUILD)
# The SSE flag goes to the front of PLATFORM_CFLAGS.
string(PREPEND PLATFORM_CFLAGS "${COMPILER_SSE_FLAG} ")
add_definitions(-D__SSE__ -D__MMX__)
endif()
if(SUPPORT_SSE2_BUILD)
# The SSE2 flag goes to the end of PLATFORM_CFLAGS.
string(APPEND PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG}")
add_definitions(-D__SSE2__)
# __MMX__ was already defined above when SSE is supported.
if(NOT SUPPORT_SSE_BUILD) # don't double up
add_definitions(-D__MMX__)
endif()
endif()
endif()
endif()
# Report which SIMD instruction set this configuration ended up with.
if(NOT SUPPORT_NEON_BUILD)
  # No Neon: report the highest SSE level that was detected, if any.
  if(SUPPORT_SSE2_BUILD)
    message(STATUS "SSE2 SIMD instructions enabled")
  elseif(SUPPORT_SSE_BUILD)
    message(STATUS "SSE SIMD instructions enabled")
  else()
    message(STATUS "No SIMD instructions detected")
  endif()
elseif(SSE2NEON_FOUND)
  message(STATUS "Neon SIMD instructions enabled")
else()
  # Neon compiles, but without the sse2neon header nothing makes use of it.
  message(STATUS "Neon SIMD instructions detected but unused, requires sse2neon")
endif()
else()
message(STATUS "SIMD instructions disabled")
endif()
# set the endian define
if(MSVC)

@ -668,12 +668,6 @@ macro(TEST_SSE_SUPPORT
#include <xmmintrin.h>
int main(void) { __m128 v = _mm_setzero_ps(); return 0; }"
SUPPORT_SSE_BUILD)
if(SUPPORT_SSE_BUILD)
message(STATUS "SSE Support: detected.")
else()
message(STATUS "SSE Support: missing.")
endif()
endif()
if(NOT DEFINED SUPPORT_SSE2_BUILD)
@ -682,17 +676,19 @@ macro(TEST_SSE_SUPPORT
#include <emmintrin.h>
int main(void) { __m128d v = _mm_setzero_pd(); return 0; }"
SUPPORT_SSE2_BUILD)
if(SUPPORT_SSE2_BUILD)
message(STATUS "SSE2 Support: detected.")
else()
message(STATUS "SSE2 Support: missing.")
endif()
endif()
unset(CMAKE_REQUIRED_FLAGS)
endmacro()
# Detect whether the C++ compiler can build Arm Neon intrinsics.
#
# Takes no arguments. Sets SUPPORT_NEON_BUILD (cached by
# check_cxx_source_compiles) to true when a small program that includes
# <arm_neon.h> and calls vaddvq_s32()/vdupq_n_s32() compiles.
# NOTE(review): vaddvq_s32 appears to be an AArch64-only intrinsic, so 32-bit
# Arm targets would presumably report no Neon support - confirm if relevant.
macro(TEST_NEON_SUPPORT)
  # Skip the check once the result is known, mirroring the
  # `if(NOT DEFINED ...)` guards used by TEST_SSE_SUPPORT().
  if(NOT DEFINED SUPPORT_NEON_BUILD)
    include(CheckCXXSourceCompiles)
    check_cxx_source_compiles(
      "#include <arm_neon.h>
int main() {return vaddvq_s32(vdupq_n_s32(1));}"
      SUPPORT_NEON_BUILD)
  endif()
endmacro()
# Only print message if running CMake first time
macro(message_first_run)
if(FIRST_RUN)

@ -321,8 +321,11 @@ if(WITH_OPENVDB)
endif()
if(WITH_NANOVDB)
set(NANOVDB ${LIBDIR}/nanovdb)
set(NANOVDB_INCLUDE_DIR ${NANOVDB}/include)
find_package(NanoVDB)
endif()
# Look up the sse2neon header, only when SIMD support is requested.
# There is no REQUIRED keyword, so a missing package does not abort
# configuration; the top-level SIMD setup tests SSE2NEON_FOUND before using it.
if(WITH_CPU_SIMD)
find_package(sse2neon)
endif()
if(WITH_LLVM)

@ -284,6 +284,10 @@ if(WITH_NANOVDB)
endif()
endif()
# Look up the sse2neon header, only when SIMD support is requested.
# The commit description notes that no Linux distribution ships sse2neon, so
# this usually needs a manual install to succeed.
# NOTE(review): find_package_wrapper is presumably a project wrapper around
# find_package() - confirm how it reports a missing package.
if(WITH_CPU_SIMD)
find_package_wrapper(sse2neon)
endif()
if(WITH_ALEMBIC)
find_package_wrapper(Alembic)

@ -64,7 +64,7 @@ if(WITH_CYCLES_NATIVE_ONLY)
endif()
set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}")
endif()
elseif(NOT WITH_CPU_SSE)
elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND))
set(CXX_HAS_SSE FALSE)
set(CXX_HAS_AVX FALSE)
set(CXX_HAS_AVX2 FALSE)

@ -22,7 +22,15 @@
* SIMD instruction support.
*/
#if defined(__SSE2__)
#if defined(__ARM_NEON) && defined(WITH_SSE2NEON)
/* SSE/SSE2 emulation on ARM Neon. Match SSE precision. */
# define SSE2NEON_PRECISE_MINMAX 1
# define SSE2NEON_PRECISE_DIV 1
# define SSE2NEON_PRECISE_SQRT 1
# include <sse2neon.h>
# define BLI_HAVE_SSE2
#elif defined(__SSE2__)
/* Native SSE2 on Intel/AMD. */
# include <emmintrin.h>
# define BLI_HAVE_SSE2
#endif