From 4ceb111a68a38a4dbef0a2d2e54630f9680030c2 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 23 Nov 2015 12:44:26 -0500 Subject: [PATCH] Enable vectorization inside the Serial and TBB backends. --- CMake/VTKmCompilerExtras.cmake | 8 ++- CMake/VTKmCompilerOptimizations.cmake | 40 ++++++++------- .../internal/DeviceAdapterAlgorithmSerial.h | 49 +++++++++++++++++++ vtkm/cont/tbb/internal/FunctorsTBB.h | 47 ++++++++++++++++-- 4 files changed, 122 insertions(+), 22 deletions(-) diff --git a/CMake/VTKmCompilerExtras.cmake b/CMake/VTKmCompilerExtras.cmake index 86b3ce065..37ca83d2e 100644 --- a/CMake/VTKmCompilerExtras.cmake +++ b/CMake/VTKmCompilerExtras.cmake @@ -51,8 +51,12 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) set(CMAKE_CXX_FLAGS_WARN_EXTRA "-ansi ${CMAKE_CXX_FLAGS_WARN_EXTRA}") endif() - # Additional warnings just for Clang - if(CMAKE_COMPILER_IS_CLANGXX) + # Additional warnings just for Clang 3.5+, and AppleClang 7+ + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.4) + set(CMAKE_CXX_FLAGS_WARN_EXTRA "-Wno-pass-failed ${CMAKE_CXX_FLAGS_WARN_EXTRA}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.99) set(CMAKE_CXX_FLAGS_WARN_EXTRA "-Wno-pass-failed ${CMAKE_CXX_FLAGS_WARN_EXTRA}") endif() diff --git a/CMake/VTKmCompilerOptimizations.cmake b/CMake/VTKmCompilerOptimizations.cmake index 85dbd4991..cbb955743 100644 --- a/CMake/VTKmCompilerOptimizations.cmake +++ b/CMake/VTKmCompilerOptimizations.cmake @@ -66,32 +66,32 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx) #common flags for the avx instructions for the gcc compiler - set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-mavx") - set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2") + set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx) + set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2) - if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.7 + if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.7 OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.7) #if GNU is less than 4.9 you get avx, avx2 set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2) elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) #if GNU is less than 5.1 you get avx, avx2, and some avx512 set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512) - set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS "-mavx512f -mavx512pf -mavx512er -mavx512cd") + set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd) else() #if GNU is 5.1+ you get avx, avx2, and more avx512 set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512) - set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS "-mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi") + set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi) endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512) - set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-mavx") - set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2") - set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS "-mavx512") + set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx) + set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2) + set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") #While Clang support AVX512, no version of AppleClang has that support yet set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2) - set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-mavx") - set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2") + set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx) + set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") #I can't find documentation to explicitly state the level of vectorization #support I want from the PGI compiler @@ -99,14 +99,14 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #Intel 15.X is the first version with avx512 #Intel 16.X has way better vector generation compared to 15.X though - set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-xAVX") - set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-xCORE-AVX2") + set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -xAVX) + set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX2) if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0) set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2) else() set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512) - set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-xCORE-AVX512") + set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX512) endif() endif() @@ -117,19 +117,25 @@ endif() # # if(VTKm_ENABLE_VECTORIZATION) + set(flags) if(VTKm_Vectorization STREQUAL "avx") - get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) - add_compile_options( "${avx}" ) + get_property(flags GLOBAL PROPERTY VTKm_AVX_FLAGS) elseif(VTKm_Vectorization STREQUAL "avx2") get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS) - add_compile_options( "${avx}" "${avx2}" ) + set(flags ${avx} ${avx2}) elseif(VTKm_Vectorization STREQUAL "avx512") get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS) get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS) get_property(avx512 GLOBAL PROPERTY VTKm_AVX512_FLAGS) - add_compile_options( "${avx}" "${avx2}" "${avx512}" ) + set(flags ${avx} ${avx2} ${avx512}) endif() + + #have to specify each compile option separately, can't do them in bulk + foreach(flag ${flags}) + + add_compile_options( ${flag} ) + endforeach() endif() # diff --git a/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h b/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h index 1fac8fc7e..8ca8f8d95 100644 --- a/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h +++ b/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h @@ -225,8 +225,18 @@ public: //The ICC compiler has been found to improperly optimize the copy_backwards //into a standard copy, causing the above issue. T lastValue = inputPortal.Get(numberOfValues - 1); + +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_CLANG) + #pragma ivdep + #pragma clang loop vectorize(enable) interleave(enable) +#elif defined(VTKM_ICC) + #pragma simd +#endif +#endif for(vtkm::Id i=(numberOfValues-1); i >= 1; --i) { + //nothing for gcc as input & output could be the same outputPortal.Set(i, inputPortal.Get(i-1)); } outputPortal.Set(0, initialValue); @@ -284,8 +294,24 @@ public: DeviceAdapterAlgorithm::ScheduleKernel kernel(functor); const vtkm::Id size = numInstances; + +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_CLANG) + #pragma ivdep + #pragma clang loop vectorize(enable) interleave(enable) +#elif defined(VTKM_ICC) + #pragma simd +#endif +#endif for(vtkm::Id i=0; i < size; ++i) { +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_GCC) + #pragma Loop_Optimize (Ivdep, Vector) +#elif defined(VTKM_ICC) + #pragma forceinline recursive +#endif +#endif kernel(i); } @@ -317,8 +343,23 @@ public: for(vtkm::Id j=0; j < rangeMax[1]; ++j) { index[1] = j; +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_CLANG) + #pragma ivdep + #pragma clang loop vectorize(enable) interleave(enable) +#elif defined(VTKM_ICC) + #pragma simd +#endif +#endif for(vtkm::Id i=0; i < rangeMax[0]; ++i) { +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_GCC) + #pragma Loop_Optimize (Ivdep, Vector) +#elif defined(VTKM_ICC) + #pragma forceinline recursive +#endif +#endif index[0] = i; kernel( index ); } @@ -353,6 +394,14 @@ private: PortalI indexPortal = index.PrepareForInput(Device()); PortalVout valuesOutPortal = values_out.PrepareForOutput(n, Device()); +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_CLANG) + #pragma ivdep + #pragma clang loop vectorize(enable) interleave(enable) +#elif defined(VTKM_ICC) + #pragma simd +#endif +#endif for (vtkm::Id i=0; iFunctor(index); } } @@ -386,8 +404,25 @@ public: for( vtkm::Id j=range.rows().begin(); j!=range.rows().end(); ++j) { index[1] = j; - for( vtkm::Id i=range.cols().begin(); i!=range.cols().end(); ++i) + const vtkm::Id start =range.cols().begin(); + const vtkm::Id end = range.cols().end(); +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_CLANG) + #pragma ivdep + #pragma clang loop vectorize(enable) interleave(enable) +#elif defined(VTKM_ICC) + #pragma simd +#endif +#endif + for( vtkm::Id i=start; i != end; ++i) { +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_GCC) + #pragma Loop_Optimize (Ivdep, Vector) +#elif defined(VTKM_ICC) + #pragma forceinline recursive +#endif +#endif index[0] = i; this->Functor( index ); } @@ -435,6 +470,13 @@ public: // error and setting the message buffer as expected. try { +#ifdef VTKM_ENABLE_VECTORIZATION +#if defined(VTKM_CLANG) + #pragma clang loop vectorize(enable) +#elif defined(VTKM_ICC) + #pragma simd +#endif +#endif for (vtkm::Id i = range.begin(); i < range.end(); i++) { OutputPortal.Set( i, ValuesPortal.Get(IndexPortal.Get(i)) ); @@ -481,4 +523,3 @@ VTKM_CONT_EXPORT static void ScatterPortal(InputPortalType inputPortal, } } #endif //vtk_m_cont_tbb_internal_FunctorsTBB_h -