From 4ea567aee9a0c7b3c97cc306679e8f4a9808da32 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 30 Nov 2015 11:08:21 -0500 Subject: [PATCH 1/2] Remove VTKm_ENABLE_VECTORIZATION, as VTKm_Vectorization handles all use cases. --- CMake/UseVTKmSerial.cmake | 4 +-- CMake/UseVTKmTBB.cmake | 4 +-- CMake/VTKmCompilerOptimizations.cmake | 37 ++++++++++++--------------- CMake/VTKmConfig.cmake.in | 1 - CMakeLists.txt | 3 +-- 5 files changed, 20 insertions(+), 29 deletions(-) diff --git a/CMake/UseVTKmSerial.cmake b/CMake/UseVTKmSerial.cmake index d92a902e1..da48849f1 100644 --- a/CMake/UseVTKmSerial.cmake +++ b/CMake/UseVTKmSerial.cmake @@ -34,9 +34,7 @@ endif () #----------------------------------------------------------------------------- # Set up the compiler flag optimizations #----------------------------------------------------------------------------- -if(VTKm_ENABLE_VECTORIZATION) - include(VTKmCompilerOptimizations) -endif() +include(VTKmCompilerOptimizations) if (VTKm_Serial_FOUND) set(VTKm_Serial_initialize_complete TRUE) diff --git a/CMake/UseVTKmTBB.cmake b/CMake/UseVTKmTBB.cmake index b85371f31..70adb0672 100644 --- a/CMake/UseVTKmTBB.cmake +++ b/CMake/UseVTKmTBB.cmake @@ -47,9 +47,7 @@ endif () #----------------------------------------------------------------------------- # Set up the compiler flag optimizations #----------------------------------------------------------------------------- -if(VTKm_ENABLE_VECTORIZATION) - include(VTKmCompilerOptimizations) -endif() +include(VTKmCompilerOptimizations) #----------------------------------------------------------------------------- # Set up all these dependent packages (if they were all found). diff --git a/CMake/VTKmCompilerOptimizations.cmake b/CMake/VTKmCompilerOptimizations.cmake index cbb955743..c6d4879d0 100644 --- a/CMake/VTKmCompilerOptimizations.cmake +++ b/CMake/VTKmCompilerOptimizations.cmake @@ -116,28 +116,25 @@ endif() # we are going to require. 
# # -if(VTKm_ENABLE_VECTORIZATION) - set(flags) - if(VTKm_Vectorization STREQUAL "avx") - get_property(flags GLOBAL PROPERTY VTKm_AVX_FLAGS) - elseif(VTKm_Vectorization STREQUAL "avx2") - get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) - get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS) - set(flags ${avx} ${avx2}) - elseif(VTKm_Vectorization STREQUAL "avx512") - get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) - get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS) - get_property(avx512 GLOBAL PROPERTY VTKm_AVX512_FLAGS) - set(flags ${avx} ${avx2} ${avx512}) - endif() - - #have to specify each compile option separately, can't do them in bulk - foreach(flag ${flags}) - - add_compile_options( ${flag} ) - endforeach() +set(flags) +if(VTKm_Vectorization STREQUAL "avx") + get_property(flags GLOBAL PROPERTY VTKm_AVX_FLAGS) +elseif(VTKm_Vectorization STREQUAL "avx2") + get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) + get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS) + set(flags ${avx} ${avx2}) +elseif(VTKm_Vectorization STREQUAL "avx512") + get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) + get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS) + get_property(avx512 GLOBAL PROPERTY VTKm_AVX512_FLAGS) + set(flags ${avx} ${avx2} ${avx512}) endif() +#have to specify each compile option separately, can't do them in bulk +foreach(flag ${flags}) + add_compile_options( ${flag} ) +endforeach() + # # Lastly we need to setup flags that can be configured into a vtk-m header # file. 
so that the code understands that we have enabled vectorization diff --git a/CMake/VTKmConfig.cmake.in b/CMake/VTKmConfig.cmake.in index fcec91c38..1e871e5ea 100644 --- a/CMake/VTKmConfig.cmake.in +++ b/CMake/VTKmConfig.cmake.in @@ -43,7 +43,6 @@ set(VTKm_CMAKE_MODULE_PATH "@VTKm_CMAKE_MODULE_PATH_CONFIG@") set(VTKm_ENABLE_CUDA "@VTKm_ENABLE_CUDA@") set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@") -set(VTKm_ENABLE_VECTORIZATION "@VTKm_ENABLE_VECTORIZATION@") # VTKm requires some CMake Find modules not included with CMake, so # include the CMake modules distributed with VTKm. diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a3f89064..b9d7ae605 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,7 +92,6 @@ include(CMake/VTKmCompilerExtras.cmake) # Configurable Options option(VTKm_ENABLE_CUDA "Enable Cuda support" OFF) option(VTKm_ENABLE_TBB "Enable TBB support" OFF) -option(VTKm_ENABLE_VECTORIZATION "Enable compiler vectorization support" ON) option(VTKm_ENABLE_TESTING "Enable VTKm Testing" ON) option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF) @@ -233,7 +232,7 @@ set(VTKM_USE_64BIT_IDS ${VTKm_USE_64BIT_IDS}) set(VTKM_ENABLE_CUDA ${VTKm_ENABLE_CUDA}) set(VTKM_ENABLE_TBB ${VTKm_ENABLE_TBB}) -set(VTKM_ENABLE_VECTORIZATION ${VTKm_ENABLE_VECTORIZATION}) +set(VTKM_ENABLE_VECTORIZATION ON) if(VTKm_Vectorization STREQUAL "none") set(VTKM_ENABLE_VECTORIZATION OFF) endif() From bfb6c26a985b94fb35a06c809f91ebb7d649a014 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 1 Dec 2015 10:09:46 -0500 Subject: [PATCH 2/2] Simplify the design of vectorization support. Remove the configured file variables, as that causes problems when using an installed version of VTK-m. 
--- CMake/VTKmCompilerOptimizations.cmake | 66 +++++++++++-------- CMakeLists.txt | 7 -- .../internal/DeviceAdapterAlgorithmSerial.h | 54 +++------------ vtkm/cont/tbb/internal/FunctorsTBB.h | 35 ++-------- vtkm/internal/Configure.h.in | 27 +++++++- 5 files changed, 73 insertions(+), 116 deletions(-) diff --git a/CMake/VTKmCompilerOptimizations.cmake b/CMake/VTKmCompilerOptimizations.cmake index c6d4879d0..fb3e44375 100644 --- a/CMake/VTKmCompilerOptimizations.cmake +++ b/CMake/VTKmCompilerOptimizations.cmake @@ -18,9 +18,6 @@ ## this software. ##============================================================================ -set(VTKm_Vectorization "none" CACHE STRING "Level of compiler vectorization support") -set_property(CACHE VTKm_Vectorization PROPERTY STRINGS none) - #Currently all we are going to build is a set of options that are possible #based on the compiler. For now we are going on the presumption #that x86 architecture is the only target for vectorization and therefore @@ -32,16 +29,21 @@ set_property(CACHE VTKm_Vectorization PROPERTY STRINGS none) # Do not explicitly enable vectorization, but at the same don't explicitly disable # vectorization. # -# 2. avx +# 2. native: +# Allow the compiler to use auto-detection based on the systems CPU to determine +# the highest level of vectorization support that is allowed. This means that +# libraries and executables built with this setting are non-portable. +# +# 3. avx # Compile with just AVX enabled, no AVX2 or AVX512 vectorization will be used. # This means that Sandy Bridge, Ivy Bridge, Haswell, and Skylake are supported, # but Haswell and newer will not use any AVX2 instructions # -# 3. avx2 +# 4. avx2 # Compile with  AVX2/AVX enabled, no AVX512 vectorization will be used. # This means that Sandy Bridge, and Ivy Bridge can not run the code. # -# 4. avx512 +# 5. avx512 # Compile with AVX512/AVX2/AVX options enabled. # This means that Sandy Bridge, Ivy Bridge, Haswell and can not run the code. 
# Only XeonPhi Knights Landing and Skylake processors can run the code. @@ -58,66 +60,83 @@ set_property(CACHE VTKm_Vectorization PROPERTY STRINGS none) # include(clang.cmake) # # This way we could also do compile warning flag detection at the same time -# We need to enable -Wno-pass-failed when using clang atleast to kill the -# amount of warnings we get +# +# +# Note: By default we use 'native' as the default option +# +# +set(vec_levels none native) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") #for now we presume gcc > 4.6 - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx) + list(APPEND vec_levels avx) #common flags for the avx instructions for the gcc compiler + set_property(GLOBAL PROPERTY VTKm_NATIVE_FLAGS -march=native) set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx) set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2) if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.7 OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.7) #if GNU is less than 4.9 you get avx, avx2 - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2) + list(APPEND vec_levels avx2) elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) #if GNU is less than 5.1 you get avx, avx2, and some avx512 - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512) + list(APPEND vec_levels avx2 avx512) set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd) else() #if GNU is 5.1+ you get avx, avx2, and more avx512 - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512) + list(APPEND vec_levels avx2 avx512) set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512) + list(APPEND vec_levels avx avx2 avx512) + set_property(GLOBAL PROPERTY 
VTKm_NATIVE_FLAGS -march=native) set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx) set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2) set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") #While Clang support AVX512, no version of AppleClang has that support yet - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2) + list(APPEND vec_levels avx avx2) + set_property(GLOBAL PROPERTY VTKm_NATIVE_FLAGS -march=native) set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx) set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") #I can't find documentation to explicitly state the level of vectorization #support I want from the PGI compiler + #so for now we are going to do nothing elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #Intel 15.X is the first version with avx512 #Intel 16.X has way better vector generation compared to 15.X though + set_property(GLOBAL PROPERTY VTKm_NATIVE_FLAGS -xHost) set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -xAVX) set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX2) if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0) - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2) + list(APPEND vec_levels avx avx2) else() - set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512) + list(APPEND vec_levels avx avx2 avx512) set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX512) endif() endif() +# +# Now that we know which levels the compiler supports, let's set up the CMake option. +# We use a combo box style property, so that ccmake and cmake-gui have a +# nice interface +# +set(VTKm_Vectorization "native" CACHE STRING "Level of compiler vectorization support") +set_property(CACHE VTKm_Vectorization PROPERTY STRINGS ${vec_levels}) # # Now that we have set up the options, lets setup the compile flags that # we are going to require. 
# -# set(flags) -if(VTKm_Vectorization STREQUAL "avx") +if(VTKm_Vectorization STREQUAL "native") + get_property(flags GLOBAL PROPERTY VTKm_NATIVE_FLAGS) +elseif(VTKm_Vectorization STREQUAL "avx") get_property(flags GLOBAL PROPERTY VTKm_AVX_FLAGS) elseif(VTKm_Vectorization STREQUAL "avx2") get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) @@ -134,14 +153,3 @@ endif() foreach(flag ${flags}) add_compile_options( ${flag} ) endforeach() - -# -# Lastly we need to setup flags that can be configured into a vtk-m header -# file. so that the code understands that we have enabled vectorization -# -# - - - - - diff --git a/CMakeLists.txt b/CMakeLists.txt index b9d7ae605..a8b9b84ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,11 +232,6 @@ set(VTKM_USE_64BIT_IDS ${VTKm_USE_64BIT_IDS}) set(VTKM_ENABLE_CUDA ${VTKm_ENABLE_CUDA}) set(VTKM_ENABLE_TBB ${VTKm_ENABLE_TBB}) -set(VTKM_ENABLE_VECTORIZATION ON) -if(VTKm_Vectorization STREQUAL "none") - set(VTKM_ENABLE_VECTORIZATION OFF) -endif() - set(VTKM_ENABLE_OPENGL_INTEROP ${VTKm_ENABLE_OPENGL_INTEROP}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/vtkm/internal/Configure.h.in @@ -247,8 +242,6 @@ vtkm_install_headers( unset(VTKM_ENABLE_OPENGL_INTEROP) -unset(VTKM_ENABLE_VECTORIZATION) - unset(VTKM_ENABLE_TBB) unset(VTKM_ENABLE_CUDA) diff --git a/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h b/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h index 8ca8f8d95..1b7a99fe4 100644 --- a/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h +++ b/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h @@ -226,16 +226,10 @@ public: //into a standard copy, causing the above issue. 
T lastValue = inputPortal.Get(numberOfValues - 1); -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for(vtkm::Id i=(numberOfValues-1); i >= 1; --i) { +VTKM_VECTORIZATION_IN_LOOP //nothing for gcc as input & output could be the same outputPortal.Set(i, inputPortal.Get(i-1)); } @@ -295,23 +289,10 @@ public: const vtkm::Id size = numInstances; -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for(vtkm::Id i=0; i < size; ++i) { -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_GCC) - #pragma Loop_Optimize (Ivdep, Vector) -#elif defined(VTKM_ICC) - #pragma forceinline recursive -#endif -#endif +VTKM_VECTORIZATION_IN_LOOP kernel(i); } @@ -343,23 +324,10 @@ public: for(vtkm::Id j=0; j < rangeMax[1]; ++j) { index[1] = j; -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for(vtkm::Id i=0; i < rangeMax[0]; ++i) { -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_GCC) - #pragma Loop_Optimize (Ivdep, Vector) -#elif defined(VTKM_ICC) - #pragma forceinline recursive -#endif -#endif +VTKM_VECTORIZATION_IN_LOOP index[0] = i; kernel( index ); } @@ -394,16 +362,10 @@ private: PortalI indexPortal = index.PrepareForInput(Device()); PortalVout valuesOutPortal = values_out.PrepareForOutput(n, Device()); -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for (vtkm::Id i=0; iFunctor(index); } } @@ -406,23 +392,10 @@ public: 
index[1] = j; const vtkm::Id start =range.cols().begin(); const vtkm::Id end = range.cols().end(); -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_CLANG) - #pragma ivdep - #pragma clang loop vectorize(enable) interleave(enable) -#elif defined(VTKM_ICC) - #pragma simd -#endif -#endif +VTKM_VECTORIZATION_PRE_LOOP for( vtkm::Id i=start; i != end; ++i) { -#ifdef VTKM_ENABLE_VECTORIZATION -#if defined(VTKM_GCC) - #pragma Loop_Optimize (Ivdep, Vector) -#elif defined(VTKM_ICC) - #pragma forceinline recursive -#endif -#endif +VTKM_VECTORIZATION_IN_LOOP index[0] = i; this->Functor( index ); } diff --git a/vtkm/internal/Configure.h.in b/vtkm/internal/Configure.h.in index 5375da4a9..c93f18b42 100644 --- a/vtkm/internal/Configure.h.in +++ b/vtkm/internal/Configure.h.in @@ -148,9 +148,30 @@ #define VTKM_THIRDPARTY_POST_INCLUDE #endif -//Mark if we are building with vectorization enabled -#ifndef VTKM_ENABLE_VECTORIZATION -#cmakedefine VTKM_ENABLE_VECTORIZATION +// Define a pair of macros, VTKM_VECTORIZATION_PRE_LOOP and VTKM_VECTORIZATION_IN_LOOP, +// that should be wrapped around any "for"/"while" that you want vectorized. +// This is used to set per compiler pragmas for vectorization, and to disable +// any warnings about vectorization failures. +#if defined(VTKM_CLANG) +//clang only needs pre loop +#define VTKM_VECTORIZATION_PRE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") +#define VTKM_VECTORIZATION_IN_LOOP + +#elif defined(VTKM_ICC) +//icc needs pre and in loop +#define VTKM_VECTORIZATION_PRE_LOOP \ + _Pragma("simd") +#define VTKM_VECTORIZATION_IN_LOOP \ + _Pragma("forceinline recursive") +#elif defined(VTKM_GCC) +//gcc only needs pre loop +#define VTKM_VECTORIZATION_PRE_LOOP \ + _Pragma("ivdep") +#define VTKM_VECTORIZATION_IN_LOOP +#else +#define VTKM_VECTORIZATION_PRE_LOOP +#define VTKM_VECTORIZATION_IN_LOOP #endif //Mark if we are building with CUDA enabled