diff --git a/CMake/VTKmCompilerOptimizations.cmake b/CMake/VTKmCompilerOptimizations.cmake index c6d4879d0..fb3e44375 100644 --- a/CMake/VTKmCompilerOptimizations.cmake +++ b/CMake/VTKmCompilerOptimizations.cmake @@ -18,9 +18,6 @@ ## this software. ##============================================================================ -set(VTKm_Vectorization "none" CACHE STRING "Level of compiler vectorization support") -set_property(CACHE VTKm_Vectorization PROPERTY STRINGS none) - #Currently all we are going to build is a set of options that are possible #based on the compiler. For now we are going on the presumption #that x86 architecture is the only target for vectorization and therefore @@ -32,16 +29,21 @@ set_property(CACHE VTKm_Vectorization PROPERTY STRINGS none) # Do not explicitly enable vectorization, but at the same don't explicitly disable # vectorization. # -# 2. avx +# 2. native: +# Allow the compiler to use auto-detection based on the system's CPU to determine +# the highest level of vectorization support that is allowed. This means that +# libraries and executables built with this setting are non-portable. +# +# 3. avx # Compile with just AVX enabled, no AVX2 or AVX512 vectorization will be used. # This means that Sandy Bridge, Ivy Bridge, Haswell, and Skylake are supported, # but Haswell and newer will not use any AVX2 instructions # -# 3. avx2 +# 4. avx2 # Compile with  AVX2/AVX enabled, no AVX512 vectorization will be used. # This means that Sandy Bridge, and Ivy Bridge can not run the code. # -# 4. avx512 +# 5. avx512 # Compile with AVX512/AVX2/AVX options enabled. # This means that Sandy Bridge, Ivy Bridge, Haswell and can not run the code. # Only XeonPhi Knights Landing and Skylake processors can run the code. 
@@ -58,66 +60,83 @@ set_property(CACHE VTKm_Vectorization PROPERTY STRINGS none)
 #     include(clang.cmake)
 #
 # This way we could also do compile warning flag detection at the same time
-# We need to enable -Wno-pass-failed when using clang atleast to kill the
-# amount of warnings we get
+#
+#
+# Note: 'native' is the default level; pick another level for portable binaries
+#
+#
+set(vec_levels none native)
 
 
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   #for now we presume gcc > 4.6
-  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx)
+  list(APPEND vec_levels avx)
 
   #common flags for the avx instructions for the gcc compiler
+  set_property(GLOBAL PROPERTY VTKm_NATIVE_FLAGS -march=native)
   set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
   set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
 
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.7 OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.7)
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
     #if GNU is less than 4.9 you get avx, avx2
-    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2)
+    list(APPEND vec_levels avx2)
   elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1)
     #if GNU is less than 5.1 you get avx, avx2, and some avx512
-    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512)
+    list(APPEND vec_levels avx2 avx512)
     set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd)
   else()
     #if GNU is 5.1+ you get avx, avx2, and more avx512
-    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512)
+    list(APPEND vec_levels avx2 avx512)
     set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi)
   endif()
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512)
+  list(APPEND vec_levels avx avx2 avx512)
+  set_property(GLOBAL PROPERTY VTKm_NATIVE_FLAGS -march=native)
   set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
   set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
   set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512)
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
   #While Clang support AVX512, no version of AppleClang has that support yet
-  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2)
+  list(APPEND vec_levels avx avx2)
+  set_property(GLOBAL PROPERTY VTKm_NATIVE_FLAGS -march=native)
   set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
   set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
   #I can't find documentation to explicitly state the level of vectorization
   #support I want from the PGI compiler
+  #so for now we are going to do nothing
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
   #Intel 15.X is the first version with avx512
   #Intel 16.X has way better vector generation compared to 15.X though
+  set_property(GLOBAL PROPERTY VTKm_NATIVE_FLAGS -xHost)
   set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -xAVX)
   set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX2)
 
   if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0)
-    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2)
+    list(APPEND vec_levels avx avx2)
   else()
-    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512)
-    set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX512)
+    list(APPEND vec_levels avx avx2 avx512)
+    set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -xCORE-AVX512)
   endif()
 endif()
 
+#
+# Now that we know which levels the compiler supports, set up the CMake option.
+# We use a combo box style property, so that ccmake and cmake-gui have a
+# nice interface
+#
+set(VTKm_Vectorization "native" CACHE STRING "Level of compiler vectorization support")
+set_property(CACHE VTKm_Vectorization PROPERTY STRINGS ${vec_levels})
 
 #
 # Now that we have set up the options, lets setup the compile flags that
 # we are going to require.
 #
-#
 
 set(flags)
-if(VTKm_Vectorization STREQUAL "avx")
+if(VTKm_Vectorization STREQUAL "native")
+  get_property(flags GLOBAL PROPERTY VTKm_NATIVE_FLAGS)
+elseif(VTKm_Vectorization STREQUAL "avx")
   get_property(flags GLOBAL PROPERTY VTKm_AVX_FLAGS)
 elseif(VTKm_Vectorization STREQUAL "avx2")
   get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS)
@@ -134,14 +153,3 @@ endif()
 foreach(flag ${flags})
   add_compile_options( ${flag} )
 endforeach()
-
-#
-# Lastly we need to setup flags that can be configured into a vtk-m header
-# file. so that the code understands that we have enabled vectorization
-#
-#
-
-
-
-
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9d7ae605..a8b9b84ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -232,11 +232,6 @@ set(VTKM_USE_64BIT_IDS ${VTKm_USE_64BIT_IDS})
 set(VTKM_ENABLE_CUDA ${VTKm_ENABLE_CUDA})
 set(VTKM_ENABLE_TBB ${VTKm_ENABLE_TBB})
 
-set(VTKM_ENABLE_VECTORIZATION ON)
-if(VTKm_Vectorization STREQUAL "none")
-  set(VTKM_ENABLE_VECTORIZATION OFF)
-endif()
-
 set(VTKM_ENABLE_OPENGL_INTEROP ${VTKm_ENABLE_OPENGL_INTEROP})
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/vtkm/internal/Configure.h.in
@@ -247,8 +242,6 @@ vtkm_install_headers(
 
 unset(VTKM_ENABLE_OPENGL_INTEROP)
 
-unset(VTKM_ENABLE_VECTORIZATION)
-
 unset(VTKM_ENABLE_TBB)
 
 unset(VTKM_ENABLE_CUDA)
diff --git a/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h b/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h
index 8ca8f8d95..1b7a99fe4 100644
--- a/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h
+++ b/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h
@@ -226,16 +226,10 @@ public:
     //into a standard copy, causing the above issue.
T lastValue = inputPortal.Get(numberOfValues - 1); -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for(vtkm::Id i=(numberOfValues-1); i >= 1; --i) { +VTKM_VECTORIZATION_IN_LOOP //nothing for gcc as input & output could be the same outputPortal.Set(i, inputPortal.Get(i-1)); } @@ -295,23 +289,10 @@ public: const vtkm::Id size = numInstances; -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for(vtkm::Id i=0; i < size; ++i) { -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_GCC) - #pragma Loop_Optimize (Ivdep, Vector) -#elif defined(VTKM_ICC) - #pragma forceinline recursive -#endif -#endif +VTKM_VECTORIZATION_IN_LOOP kernel(i); } @@ -343,23 +324,10 @@ public: for(vtkm::Id j=0; j < rangeMax[1]; ++j) { index[1] = j; -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for(vtkm::Id i=0; i < rangeMax[0]; ++i) { -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_GCC) - #pragma Loop_Optimize (Ivdep, Vector) -#elif defined(VTKM_ICC) - #pragma forceinline recursive -#endif -#endif +VTKM_VECTORIZATION_IN_LOOP index[0] = i; kernel( index ); } @@ -394,16 +362,10 @@ private: PortalI indexPortal = index.PrepareForInput(Device()); PortalVout valuesOutPortal = values_out.PrepareForOutput(n, Device()); -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for (vtkm::Id i=0; iFunctor(index); } } @@ -406,23 +392,10 @@ public: 
index[1] = j; const vtkm::Id start =range.cols().begin(); const vtkm::Id end = range.cols().end(); -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for( vtkm::Id i=start; i != end; ++i) { -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_GCC) - #pragma Loop_Optimize (Ivdep, Vector) -#elif defined(VTKM_ICC) - #pragma forceinline recursive -#endif -#endif +VTKM_VECTORIZATION_IN_LOOP index[0] = i; this->Functor( index ); } diff --git a/vtkm/internal/Configure.h.in b/vtkm/internal/Configure.h.in index 5375da4a9..c93f18b42 100644 --- a/vtkm/internal/Configure.h.in +++ b/vtkm/internal/Configure.h.in @@ -148,9 +148,30 @@ #define VTKM_THIRDPARTY_POST_INCLUDE #endif -//Mark if we are building with vectorization enabled -#ifndef VTKM_ENABLE_VECTORIZATION -#cmakedefine VTKM_ENABLE_VECTORIZATION +// Define a pair of macros, VTKM_VECTORIZATION_PRE_LOOP and VTKM_VECTORIZATION_IN_LOOP, +// that should be wrapped around any "for"/"while" that you want vectorized. +// This is used to set per compiler pragmas for vectorization, and to disable +// any warnings that about vectorization failures. +#if defined(VTKM_CLANG) +//clang only needs pre loop +#define VTKM_VECTORIZATION_PRE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") +#define VTKM_VECTORIZATION_IN_LOOP + +#elif defined(VTKM_ICC) +//icc needs pre and in loop +#define VTKM_VECTORIZATION_PRE_LOOP \ + _Pragma("simd") +#define VTKM_VECTORIZATION_IN_LOOP \ + _Pragma("forceinline recursive") +#elif defined(VTKM_GCC) +//gcc only needs in loop +#define VTKM_VECTORIZATION_PRE_LOOP + _Pragma("ivdep") +#define VTKM_VECTORIZATION_IN_LOOP +#else +#define VTKM_VECTORIZATION_PRE_LOOP +#define VTKM_VECTORIZATION_IN_LOOP #endif //Mark if we are building with CUDA enabled