Merge topic 'enable_vectorization'

4ceb111a Enable vectorization inside the Serial and TBB backends. 514ea09a Teach VTK-m how to enable vectorization for gcc, clang, and icc. Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Kenneth Moreland <kmorel@sandia.gov> Merge-request: !275
2024-09-08 21:33:55 +00:00 · 2015-11-27 09:36:15 -05:00 · 2015-11-27 09:36:15 -05:00 · c06c54b1fb
commit c06c54b1fb
parent 9c31290619 4ceb111a68
9 changed files with 289 additions and 4 deletions
--- a/CMake/UseVTKmSerial.cmake
+++ b/CMake/UseVTKmSerial.cmake
@ -31,6 +31,13 @@ else () # !VTKm_Base_FOUND
  set(VTKm_Serial_FOUND)
 endif ()

+#-----------------------------------------------------------------------------
+# Set up the compiler flag optimizations
+#-----------------------------------------------------------------------------
+if(VTKm_ENABLE_VECTORIZATION)
+  include(VTKmCompilerOptimizations)
+endif()
+
 if (VTKm_Serial_FOUND)
  set(VTKm_Serial_initialize_complete TRUE)
 endif ()
--- a/CMake/UseVTKmTBB.cmake
+++ b/CMake/UseVTKmTBB.cmake
@ -44,6 +44,13 @@ if (VTKm_Base_FOUND)

 endif ()

+#-----------------------------------------------------------------------------
+# Set up the compiler flag optimizations
+#-----------------------------------------------------------------------------
+if(VTKm_ENABLE_VECTORIZATION)
+  include(VTKmCompilerOptimizations)
+endif()
+
 #-----------------------------------------------------------------------------
 # Set up all these dependent packages (if they were all found).
 #-----------------------------------------------------------------------------
--- a/CMake/VTKmCompilerExtras.cmake
+++ b/CMake/VTKmCompilerExtras.cmake
@ -45,12 +45,21 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX)
  set(CMAKE_CXX_FLAGS_DEBUG
    "${CMAKE_CXX_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_WARN}")

-  # Addtional warnings for GCC
+  # Additional warnings for GCC/Clang
  set(CMAKE_CXX_FLAGS_WARN_EXTRA "-Wno-long-long -Wcast-align -Wconversion -Wchar-subscripts -Wextra -Wpointer-arith -Wformat -Wformat-security -Wshadow -Wunused-parameter -fno-common")
  if (VTKm_FORCE_ANSI)
    set(CMAKE_CXX_FLAGS_WARN_EXTRA "-ansi ${CMAKE_CXX_FLAGS_WARN_EXTRA}")
  endif()

+  # Additional warnings just for Clang 3.5+, and AppleClang 7+
+  # -Wno-pass-failed silences the diagnostics emitted when a requested
+  # optimization (e.g. a loop vectorization pragma) cannot be applied.
+  # The lower bounds are written as "NOT VERSION_LESS" so that patch releases
+  # such as Clang 3.4.1 / 3.4.2 are not mistaken for 3.5+.
+  if ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+       NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5) OR
+      (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" AND
+       NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
+    set(CMAKE_CXX_FLAGS_WARN_EXTRA "-Wno-pass-failed ${CMAKE_CXX_FLAGS_WARN_EXTRA}")
+  endif()
+
  # Set up the debug CXX_FLAGS for extra warnings
  option(VTKm_EXTRA_COMPILER_WARNINGS "Add compiler flags to do stricter checking when building debug." ON)
  # We used to add the compiler flags globally, but this caused problems with
--- a/CMake/VTKmCompilerOptimizations.cmake
+++ b/CMake/VTKmCompilerOptimizations.cmake
@ -0,0 +1,150 @@
+##============================================================================
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##
+##  Copyright 2014 Sandia Corporation.
+##  Copyright 2014 UT-Battelle, LLC.
+##  Copyright 2014 Los Alamos National Security.
+##
+##  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+##  the U.S. Government retains certain rights in this software.
+##
+##  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+##  Laboratory (LANL), the U.S. Government retains certain rights in
+##  this software.
+##============================================================================
+
+# User-selectable vectorization level; defaults to "none".
+set(VTKm_Vectorization "none" CACHE STRING "Level of compiler vectorization support")
+# Reset the list of allowed values to just "none" every time this file is
+# processed; the compiler-specific section further down appends the levels
+# (avx, avx2, avx512) that the detected compiler can provide.
+set_property(CACHE VTKm_Vectorization PROPERTY STRINGS none)
+
+#Currently all we are going to build is a set of options that are possible
+#based on the compiler. For now we are going on the presumption
+#that the x86 architecture is the only target for vectorization, and therefore
+#we don't need any system detection.
+#
+#Here is the breakdown of what each flag type means:
+#
+#  1. none:
+#  Do not explicitly enable vectorization, but at the same time don't
+#  explicitly disable vectorization.
+#
+#  2. avx
+#  Compile with just AVX enabled, no AVX2 or AVX512 vectorization will be used.
+#  This means that Sandy Bridge, Ivy Bridge, Haswell, and Skylake are supported,
+#  but Haswell and newer will not use any AVX2 instructions
+#
+#  3. avx2
+#  Compile with  AVX2/AVX enabled, no AVX512 vectorization will be used.
+#  This means that Sandy Bridge, and Ivy Bridge can not run the code.
+#
+#  4. avx512
+#  Compile with AVX512/AVX2/AVX options enabled.
+#  This means that Sandy Bridge, Ivy Bridge, and Haswell can not run the code.
+#  Only XeonPhi Knights Landing and Skylake processors can run the code.
+#
+#  AVX512 is designed to mix with avx/avx2 without any performance penalties,
+#  so we enable AVX2 so that we get AVX2 support for < 32bit value types which
+#  AVX512 has less support for
+#
+#
+# I wonder if we should go towards a per platform cmake include that stores
+# all this knowledge
+#   include(gcc.cmake)
+#   include(icc.cmake)
+#   include(clang.cmake)
+#
+# This way we could also do compile warning flag detection at the same time.
+# We need to enable -Wno-pass-failed, at least when using clang, to cut down
+# the number of warnings we get.
+
+# For the detected C++ compiler:
+#   * append the vectorization levels it can provide to the list of allowed
+#     values (STRINGS property) of the VTKm_Vectorization cache entry, and
+#   * record the flags each level needs in the global properties
+#     VTKm_AVX_FLAGS, VTKm_AVX2_FLAGS and VTKm_AVX512_FLAGS.
+# Compilers that are not listed keep "none" as their only choice.
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  #for now we presume gcc > 4.6
+  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx)
+
+  #common flags for the avx instructions for the gcc compiler
+  set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
+  set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
+
+  #The branches are ordered from oldest to newest compiler; each test is an
+  #upper bound, so the first one that matches wins.
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
+    #if GNU is less than 4.9 you get avx, avx2
+    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2)
+  elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1)
+    #if GNU is less than 5.1 you get avx, avx2, and some avx512
+    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512)
+    set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd)
+  else()
+    #if GNU is 5.1+ you get avx, avx2, and more avx512
+    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx2 avx512)
+    set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi)
+  endif()
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512)
+  set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
+  set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
+  #Clang has no umbrella "-mavx512" option; the AVX-512 foundation
+  #instructions are enabled with -mavx512f.
+  set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -mavx512f)
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+  #While Clang supports AVX512, no version of AppleClang has that support yet
+  set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2)
+  set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS -mavx)
+  set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -mf16c -mavx2 -mfma -mlzcnt -mbmi -mbmi2)
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
+  #I can't find documentation to explicitly state the level of vectorization
+  #support I want from the PGI compiler
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+  #Intel 15.X is the first version with avx512
+  #Intel 16.X has way better vector generation compared to 15.X though
+
+  set_property(GLOBAL PROPERTY VTKm_AVX_FLAGS  -xAVX)
+  set_property(GLOBAL PROPERTY VTKm_AVX2_FLAGS -xCORE-AVX2)
+
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0)
+    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2)
+  else()
+    set_property(CACHE VTKm_Vectorization APPEND PROPERTY STRINGS avx avx2 avx512)
+    #Keep the AVX2 flags intact; the AVX512 level gets its own property so
+    #that selecting "avx2" never produces code that requires AVX512 hardware.
+    set_property(GLOBAL PROPERTY VTKm_AVX512_FLAGS -xCORE-AVX512)
+  endif()
+endif()
+
+
+#
+# Now that we have set up the options, lets setup the compile flags that
+# we are going to require.
+#
+#
+# Translate the selected VTKm_Vectorization level into compiler options for
+# the current directory scope. Each level also pulls in the flags of the levels
+# below it (avx2 => avx + avx2, avx512 => avx + avx2 + avx512). "none", or any
+# value that is not recognised, adds no options.
+#
+# The scratch variables carry a vtkm_vec_ prefix on purpose: this file can be
+# included more than once in the same scope, and variables named avx / avx2 /
+# avx512 would be dereferenced in place of the quoted level names in the
+# STREQUAL tests below when policy CMP0054 is not set to NEW.
+if(VTKm_ENABLE_VECTORIZATION)
+  set(vtkm_vec_flags)
+  if(VTKm_Vectorization STREQUAL "avx")
+    get_property(vtkm_vec_flags GLOBAL PROPERTY VTKm_AVX_FLAGS)
+  elseif(VTKm_Vectorization STREQUAL "avx2")
+    get_property(vtkm_vec_avx_flags GLOBAL PROPERTY VTKm_AVX_FLAGS)
+    get_property(vtkm_vec_avx2_flags GLOBAL PROPERTY VTKm_AVX2_FLAGS)
+    set(vtkm_vec_flags ${vtkm_vec_avx_flags} ${vtkm_vec_avx2_flags})
+  elseif(VTKm_Vectorization STREQUAL "avx512")
+    get_property(vtkm_vec_avx_flags GLOBAL PROPERTY VTKm_AVX_FLAGS)
+    get_property(vtkm_vec_avx2_flags GLOBAL PROPERTY VTKm_AVX2_FLAGS)
+    get_property(vtkm_vec_avx512_flags GLOBAL PROPERTY VTKm_AVX512_FLAGS)
+    set(vtkm_vec_flags
+      ${vtkm_vec_avx_flags} ${vtkm_vec_avx2_flags} ${vtkm_vec_avx512_flags})
+  endif()
+
+  #add the options one at a time
+  foreach(vtkm_vec_flag ${vtkm_vec_flags})
+    add_compile_options(${vtkm_vec_flag})
+  endforeach()
+
+  #do not leak the scratch variables into the including scope
+  unset(vtkm_vec_avx_flags)
+  unset(vtkm_vec_avx2_flags)
+  unset(vtkm_vec_avx512_flags)
+  unset(vtkm_vec_flags)
+endif()
+
+#
+# Lastly we need to setup flags that can be configured into a vtk-m header
+# file. so that the code understands that we have enabled vectorization
+#
+#
+
+
+
+
+
--- a/CMake/VTKmConfig.cmake.in
+++ b/CMake/VTKmConfig.cmake.in
@ -43,6 +43,7 @@ set(VTKm_CMAKE_MODULE_PATH "@VTKm_CMAKE_MODULE_PATH_CONFIG@")

 set(VTKm_ENABLE_CUDA "@VTKm_ENABLE_CUDA@")
 set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@")
+set(VTKm_ENABLE_VECTORIZATION "@VTKm_ENABLE_VECTORIZATION@")

 # VTKm requires some CMake Find modules not included with CMake, so
 # include the CMake modules distributed with VTKm.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -92,6 +92,7 @@ include(CMake/VTKmCompilerExtras.cmake)
 # Configurable Options
 option(VTKm_ENABLE_CUDA "Enable Cuda support" OFF)
 option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
+option(VTKm_ENABLE_VECTORIZATION "Enable compiler vectorization support" ON)
 option(VTKm_ENABLE_TESTING "Enable VTKm Testing" ON)
 option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF)

@ -232,6 +233,11 @@ set(VTKM_USE_64BIT_IDS ${VTKm_USE_64BIT_IDS})
 set(VTKM_ENABLE_CUDA ${VTKm_ENABLE_CUDA})
 set(VTKM_ENABLE_TBB ${VTKm_ENABLE_TBB})

+set(VTKM_ENABLE_VECTORIZATION ${VTKm_ENABLE_VECTORIZATION})
+if(VTKm_Vectorization STREQUAL "none")
+  set(VTKM_ENABLE_VECTORIZATION OFF)
+endif()
+
 set(VTKM_ENABLE_OPENGL_INTEROP ${VTKm_ENABLE_OPENGL_INTEROP})

 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/vtkm/internal/Configure.h.in
@ -242,6 +248,8 @@ vtkm_install_headers(

 unset(VTKM_ENABLE_OPENGL_INTEROP)

+unset(VTKM_ENABLE_VECTORIZATION)
+
 unset(VTKM_ENABLE_TBB)
 unset(VTKM_ENABLE_CUDA)

@ -310,6 +318,14 @@ install(
  DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
  )

+# Install support files.
+install(
+  FILES
+    ${VTKm_SOURCE_DIR}/CMake/VTKmCompilerOptimizations.cmake
+  DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
+  )
+
+

 # Enable CPack packaging
 set(CPACK_PACKAGE_DESCRIPTION_FILE ${VTKm_SOURCE_DIR}/README.md)
--- a/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h
+++ b/vtkm/cont/internal/DeviceAdapterAlgorithmSerial.h
@ -225,8 +225,18 @@ public:
    //The ICC compiler has been found to improperly optimize the copy_backwards
    //into a standard copy, causing the above issue.
    T lastValue = inputPortal.Get(numberOfValues - 1);
+
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_CLANG)
+    #pragma ivdep
+    #pragma clang loop vectorize(enable) interleave(enable)
+#elif defined(VTKM_ICC)
+    #pragma simd
+#endif
+#endif
    for(vtkm::Id i=(numberOfValues-1); i >= 1; --i)
      {
+      //nothing for gcc as input & output could be the same
      outputPortal.Set(i, inputPortal.Get(i-1));
      }
    outputPortal.Set(0, initialValue);
@ -284,8 +294,24 @@ public:
    DeviceAdapterAlgorithm<Device>::ScheduleKernel<Functor> kernel(functor);

    const vtkm::Id size = numInstances;
+
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_CLANG)
+    #pragma ivdep
+    #pragma clang loop vectorize(enable) interleave(enable)
+#elif defined(VTKM_ICC)
+    #pragma simd
+#endif
+#endif
    for(vtkm::Id i=0; i < size; ++i)
      {
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_GCC)
+    #pragma Loop_Optimize (Ivdep, Vector)
+#elif defined(VTKM_ICC)
+    #pragma forceinline recursive
+#endif
+#endif
      kernel(i);
      }

@ -317,8 +343,23 @@ public:
      for(vtkm::Id j=0; j < rangeMax[1]; ++j)
        {
        index[1] = j;
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_CLANG)
+    #pragma ivdep
+    #pragma clang loop vectorize(enable) interleave(enable)
+#elif defined(VTKM_ICC)
+    #pragma simd
+#endif
+#endif
        for(vtkm::Id i=0; i < rangeMax[0]; ++i)
          {
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_GCC)
+    #pragma Loop_Optimize (Ivdep, Vector)
+#elif defined(VTKM_ICC)
+    #pragma forceinline recursive
+#endif
+#endif
          index[0] = i;
          kernel( index );
          }
@ -353,6 +394,14 @@ private:
    PortalI indexPortal = index.PrepareForInput(Device());
    PortalVout valuesOutPortal = values_out.PrepareForOutput(n, Device());

+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_CLANG)
+    #pragma ivdep
+    #pragma clang loop vectorize(enable) interleave(enable)
+#elif defined(VTKM_ICC)
+    #pragma simd
+#endif
+#endif
    for (vtkm::Id i=0; i<n; i++)
    {
       valuesOutPortal.Set( i, valuesPortal.Get(indexPortal.Get(i)) );
--- a/vtkm/cont/tbb/internal/FunctorsTBB.h
+++ b/vtkm/cont/tbb/internal/FunctorsTBB.h
@ -337,8 +337,26 @@ public:
    // error and setting the message buffer as expected.
    try
      {
-      for (vtkm::Id index = range.begin(); index < range.end(); index++)
+      const vtkm::Id start = range.begin();
+      const vtkm::Id end = range.end();
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_CLANG)
+    #pragma ivdep
+    #pragma clang loop vectorize(enable) interleave(enable)
+#elif defined(VTKM_ICC)
+    #pragma simd
+#endif
+#endif
+      for (vtkm::Id index = start; index != end; index++)
        {
+
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_GCC)
+    #pragma Loop_Optimize (Ivdep, Vector)
+#elif defined(VTKM_ICC)
+    #pragma forceinline recursive
+#endif
+#endif
        this->Functor(index);
        }
      }
@ -386,8 +404,25 @@ public:
        for( vtkm::Id j=range.rows().begin(); j!=range.rows().end(); ++j)
          {
          index[1] = j;
-          for( vtkm::Id i=range.cols().begin(); i!=range.cols().end(); ++i)
+          const vtkm::Id start =range.cols().begin();
+          const vtkm::Id end = range.cols().end();
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_CLANG)
+    #pragma ivdep
+    #pragma clang loop vectorize(enable) interleave(enable)
+#elif defined(VTKM_ICC)
+    #pragma simd
+#endif
+#endif
+          for( vtkm::Id i=start; i != end; ++i)
            {
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_GCC)
+    #pragma Loop_Optimize (Ivdep, Vector)
+#elif defined(VTKM_ICC)
+    #pragma forceinline recursive
+#endif
+#endif
            index[0] = i;
            this->Functor( index );
            }
@ -435,6 +470,13 @@ public:
    // error and setting the message buffer as expected.
    try
      {
+#ifdef VTKM_ENABLE_VECTORIZATION
+#if defined(VTKM_CLANG)
+    #pragma clang loop vectorize(enable)
+#elif defined(VTKM_ICC)
+    #pragma simd
+#endif
+#endif
      for (vtkm::Id i = range.begin(); i < range.end(); i++)
        {
        OutputPortal.Set( i, ValuesPortal.Get(IndexPortal.Get(i)) );
@ -481,4 +523,3 @@ VTKM_CONT_EXPORT static void ScatterPortal(InputPortalType  inputPortal,
 }
 }
 #endif //vtk_m_cont_tbb_internal_FunctorsTBB_h
-
--- a/vtkm/internal/Configure.h.in
+++ b/vtkm/internal/Configure.h.in
@ -148,6 +148,11 @@
 #define VTKM_THIRDPARTY_POST_INCLUDE
 #endif

+//Mark if we are building with vectorization enabled
+#ifndef VTKM_ENABLE_VECTORIZATION
+#cmakedefine VTKM_ENABLE_VECTORIZATION
+#endif
+
 //Mark if we are building with CUDA enabled
 #ifndef VTKM_ENABLE_CUDA
 #cmakedefine VTKM_ENABLE_CUDA