Enable vectorization inside the Serial and TBB backends.

This commit is contained in:
Robert Maynard 2015-11-23 12:44:26 -05:00
parent 514ea09afc
commit 4ceb111a68
4 changed files with 122 additions and 22 deletions

@ -51,8 +51,12 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX)
set(CMAKE_CXX_FLAGS_WARN_EXTRA "-ansi ${CMAKE_CXX_FLAGS_WARN_EXTRA}")
endif()
# Additional warnings just for Clang
if(CMAKE_COMPILER_IS_CLANGXX)
# Additional warnings just for Clang 3.5+, and AppleClang 7+
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.4)
set(CMAKE_CXX_FLAGS_WARN_EXTRA "-Wno-pass-failed ${CMAKE_CXX_FLAGS_WARN_EXTRA}")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" AND
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.99)
set(CMAKE_CXX_FLAGS_WARN_EXTRA "-Wno-pass-failed ${CMAKE_CXX_FLAGS_WARN_EXTRA}")
endif()

@ -66,32 +66,32 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx)
#common flags for the avx instructions for the gcc compiler
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-mavx")
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2")
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
#Pick the selectable vectorization levels from the GCC version. The oldest
#compilers are tested first so the newer-compiler branches stay reachable.
#The AVX512 flags are stored as a list with one compiler option per element.
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
  #if GNU is less than 4.9 you get avx, avx2
  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2)
elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1)
  #if GNU is less than 5.1 you get avx, avx2, and some avx512
  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512)
  set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd)
else()
  #if GNU is 5.1+ you get avx, avx2, and more avx512
  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512)
  set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi)
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512)
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-mavx")
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2")
set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS "-mavx512")
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
#While Clang supports AVX512, no version of AppleClang has that support yet
set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2)
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-mavx")
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2")
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
#I can't find documentation to explicitly state the level of vectorization
#support I want from the PGI compiler
@ -99,14 +99,14 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
#Intel 15.X is the first version with avx512
#Intel 16.X has way better vector generation compared to 15.X though
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS "-xAVX")
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS "-xCORE-AVX2")
set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -xAVX)
set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX2)
#Offer avx512 only for Intel 15.0 and newer; older versions get avx and avx2.
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0)
  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2)
else()
  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512)
  #The AVX512 flag goes in its own property so that the AVX2 flags are not
  #overwritten and the avx512 level can look its flags up.
  set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -xCORE-AVX512)
endif()
endif()
@ -117,19 +117,25 @@ endif()
#
#
if(VTKm_ENABLE_VECTORIZATION)
  #Gather the compiler flags for the selected vectorization level. Each level
  #also pulls in the flags of the levels below it. The flag properties are
  #expected to be lists holding one compiler option per element.
  set(flags)
  if(VTKm_Vectorization STREQUAL "avx")
    get_property(flags GLOBAL PROPERTY VTKm_AVX_FLAGS)
  elseif(VTKm_Vectorization STREQUAL "avx2")
    get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS)
    get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS)
    set(flags ${avx} ${avx2})
  elseif(VTKm_Vectorization STREQUAL "avx512")
    get_property(avx GLOBAL PROPERTY VTKm_AVX_FLAGS)
    get_property(avx2 GLOBAL PROPERTY VTKm_AVX2_FLAGS)
    get_property(avx512 GLOBAL PROPERTY VTKm_AVX512_FLAGS)
    set(flags ${avx} ${avx2} ${avx512})
  endif()
  #have to specify each compile option separately, can't do them in bulk
  #Any other value of VTKm_Vectorization leaves flags empty, so nothing is added.
  foreach(flag ${flags})
    add_compile_options( ${flag} )
  endforeach()
endif()
#

@ -225,8 +225,18 @@ public:
//The ICC compiler has been found to improperly optimize the copy_backwards
//into a standard copy, causing the above issue.
T lastValue = inputPortal.Get(numberOfValues - 1);
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_CLANG)
#pragma ivdep
#pragma clang loop vectorize(enable) interleave(enable)
#elif defined(VTKM_ICC)
#pragma simd
#endif
#endif
for(vtkm::Id i=(numberOfValues-1); i >= 1; --i)
{
//no vectorization pragma for gcc, as input & output could be the same
outputPortal.Set(i, inputPortal.Get(i-1));
}
outputPortal.Set(0, initialValue);
@ -284,8 +294,24 @@ public:
DeviceAdapterAlgorithm<Device>::ScheduleKernel<Functor> kernel(functor);
const vtkm::Id size = numInstances;
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_CLANG)
#pragma ivdep
#pragma clang loop vectorize(enable) interleave(enable)
#elif defined(VTKM_ICC)
#pragma simd
#endif
#endif
for(vtkm::Id i=0; i < size; ++i)
{
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_GCC)
#pragma Loop_Optimize (Ivdep, Vector)
#elif defined(VTKM_ICC)
#pragma forceinline recursive
#endif
#endif
kernel(i);
}
@ -317,8 +343,23 @@ public:
for(vtkm::Id j=0; j < rangeMax[1]; ++j)
{
index[1] = j;
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_CLANG)
#pragma ivdep
#pragma clang loop vectorize(enable) interleave(enable)
#elif defined(VTKM_ICC)
#pragma simd
#endif
#endif
for(vtkm::Id i=0; i < rangeMax[0]; ++i)
{
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_GCC)
#pragma Loop_Optimize (Ivdep, Vector)
#elif defined(VTKM_ICC)
#pragma forceinline recursive
#endif
#endif
index[0] = i;
kernel( index );
}
@ -353,6 +394,14 @@ private:
PortalI indexPortal = index.PrepareForInput(Device());
PortalVout valuesOutPortal = values_out.PrepareForOutput(n, Device());
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_CLANG)
#pragma ivdep
#pragma clang loop vectorize(enable) interleave(enable)
#elif defined(VTKM_ICC)
#pragma simd
#endif
#endif
for (vtkm::Id i=0; i<n; i++)
{
valuesOutPortal.Set( i, valuesPortal.Get(indexPortal.Get(i)) );

@ -337,8 +337,26 @@ public:
// error and setting the message buffer as expected.
try
{
for (vtkm::Id index = range.begin(); index < range.end(); index++)
const vtkm::Id start = range.begin();
const vtkm::Id end = range.end();
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_CLANG)
#pragma ivdep
#pragma clang loop vectorize(enable) interleave(enable)
#elif defined(VTKM_ICC)
#pragma simd
#endif
#endif
for (vtkm::Id index = start; index != end; index++)
{
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_GCC)
#pragma Loop_Optimize (Ivdep, Vector)
#elif defined(VTKM_ICC)
#pragma forceinline recursive
#endif
#endif
this->Functor(index);
}
}
@ -386,8 +404,25 @@ public:
for( vtkm::Id j=range.rows().begin(); j!=range.rows().end(); ++j)
{
index[1] = j;
for( vtkm::Id i=range.cols().begin(); i!=range.cols().end(); ++i)
const vtkm::Id start =range.cols().begin();
const vtkm::Id end = range.cols().end();
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_CLANG)
#pragma ivdep
#pragma clang loop vectorize(enable) interleave(enable)
#elif defined(VTKM_ICC)
#pragma simd
#endif
#endif
for( vtkm::Id i=start; i != end; ++i)
{
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_GCC)
#pragma Loop_Optimize (Ivdep, Vector)
#elif defined(VTKM_ICC)
#pragma forceinline recursive
#endif
#endif
index[0] = i;
this->Functor( index );
}
@ -435,6 +470,13 @@ public:
// error and setting the message buffer as expected.
try
{
#ifdef VTKM_ENABLE_VECTORIZATION
#if defined(VTKM_CLANG)
#pragma clang loop vectorize(enable)
#elif defined(VTKM_ICC)
#pragma simd
#endif
#endif
for (vtkm::Id i = range.begin(); i < range.end(); i++)
{
OutputPortal.Set( i, ValuesPortal.Get(IndexPortal.Get(i)) );
@ -481,4 +523,3 @@ VTKM_CONT_EXPORT static void ScatterPortal(InputPortalType inputPortal,
}
}
#endif //vtk_m_cont_tbb_internal_FunctorsTBB_h