From 204804af50b2899c80e72eb4a9ecafb7497be2fa Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Thu, 3 Dec 2015 10:16:19 -0500
Subject: [PATCH] Teach VTK-m how to specify the CUDA GPU architecture to build for.

Like the ability to specify the vectorization level, users of CMake can now
specify what GPU architectures they want to build for. Most users should just
use the default 'native'.
---
 CMake/UseVTKmCUDA.cmake         | 74 +++++++++++++++++++++++++++
 CMake/VTKmDetectCUDAVersion.cxx | 88 +++++++++++++++++++++++++++++++++
 CMakeLists.txt                  |  1 +
 3 files changed, 163 insertions(+)
 create mode 100644 CMake/VTKmDetectCUDAVersion.cxx

diff --git a/CMake/UseVTKmCUDA.cmake b/CMake/UseVTKmCUDA.cmake
index 219886150..715df3128 100644
--- a/CMake/UseVTKmCUDA.cmake
+++ b/CMake/UseVTKmCUDA.cmake
@@ -51,6 +51,80 @@ if (VTKm_Base_FOUND)
   endif ()
 endif ()
 
+if(VTKm_CUDA_FOUND)
+  #---------------------------------------------------------------------------
+  # Setup build flags for CUDA
+  #---------------------------------------------------------------------------
+  # Populates CUDA_NVCC_FLAGS with the best set of flags to compile for a
+  # given GPU architecture. The majority of developers should leave the
+  # option at the default of 'native', which uses system introspection to
+  # determine the smallest number of virtual and real architectures it
+  # should target.
+  #
+  # The option of 'all' is provided for people generating libraries that
+  # will be deployed to any number of machines; it compiles all CUDA code
+  # for all major virtual architectures, guaranteeing that the code will run
+  # anywhere.
+  #
+  #
+  # 1 - native
+  #   - Uses system introspection to determine compile flags
+  # 2 - fermi
+  #   - Uses: --generate-code arch=compute_20,code=compute_20
+  # 3 - kepler
+  #   - Uses: --generate-code arch=compute_30,code=compute_30
+  #   - Uses: --generate-code arch=compute_35,code=compute_35
+  # 4 - maxwell
+  #   - Uses: --generate-code arch=compute_50,code=compute_50
+  #   - Uses: --generate-code arch=compute_52,code=compute_52
+  # 5 - all
+  #   - Uses: --generate-code arch=compute_20,code=compute_20
+  #   - Uses: --generate-code arch=compute_30,code=compute_30
+  #   - Uses: --generate-code arch=compute_35,code=compute_35
+  #   - Uses: --generate-code arch=compute_50,code=compute_50
+  #
+
+  #specify the property
+  set(VTKm_CUDA_Architecture "native" CACHE STRING "Which GPU Architecture(s) to compile for")
+  set_property(CACHE VTKm_CUDA_Architecture PROPERTY STRINGS native fermi kepler maxwell all)
+
+  #detect what the property is set to
+  if(VTKm_CUDA_Architecture STREQUAL "native")
+    #run execute_process to do auto-detection
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${CMAKE_CURRENT_LIST_DIR}/VTKmDetectCUDAVersion.cxx"
+                    RESULT_VARIABLE ran_properly
+                    OUTPUT_VARIABLE run_output)
+
+    if(ran_properly EQUAL 0)
+      #find the position of the "--generate-code" output. With some compilers such as
+      #msvc we get compile output plus run output, so we need to strip out just the
+      #run output
+      string(FIND "${run_output}" "--generate-code" position)
+      string(SUBSTRING "${run_output}" ${position} -1 run_output)
+      set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${run_output}")
+    else()
+      message(STATUS "Unable to run \"${CUDA_NVCC_EXECUTABLE}\" to autodetect GPU architecture."
+ "Falling back to fermi, please manually specify if you want something else.") + set(VTKm_CUDA_Architecture "fermi") + endif() + + elseif(VTKm_CUDA_Architecture STREQUAL "fermi") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_20,code=compute_20") + elseif(VTKm_CUDA_Architecture STREQUAL "kepler") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_30,code=compute_30") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_35,code=compute_35") + elseif(VTKm_CUDA_Architecture STREQUAL "maxwell") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_50,code=compute_50") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_52,code=compute_52") + elseif(VTKm_CUDA_Architecture STREQUAL "all") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_20,code=compute_20") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_30,code=compute_30") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_35,code=compute_35") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_50,code=compute_50") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_52,code=compute_52") + endif() + endif() + #--------------------------------------------------------------------------- # Find Thrust library. #--------------------------------------------------------------------------- diff --git a/CMake/VTKmDetectCUDAVersion.cxx b/CMake/VTKmDetectCUDAVersion.cxx new file mode 100644 index 000000000..0c95a5d69 --- /dev/null +++ b/CMake/VTKmDetectCUDAVersion.cxx @@ -0,0 +1,88 @@ +//============================================================================= +// +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2015 Sandia Corporation. +// Copyright 2015 UT-Battelle, LLC. +// Copyright 2015 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +// +//============================================================================= +#include +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + std::map< int, std::string > arch_to_compute; + arch_to_compute[11] = "compute_11"; + arch_to_compute[12] = "compute_12"; + arch_to_compute[13] = "compute_13"; + arch_to_compute[20] = "compute_20"; + arch_to_compute[21] = "compute_20"; + arch_to_compute[30] = "compute_30"; + arch_to_compute[32] = "compute_32"; + arch_to_compute[35] = "compute_35"; + arch_to_compute[37] = "compute_37"; + arch_to_compute[50] = "compute_50"; + arch_to_compute[52] = "compute_52"; + arch_to_compute[53] = "compute_53"; + + int nDevices; + cudaGetDeviceCount(&nDevices); + if(nDevices == 0) + { //return failure if no cuda devices found + return 1; + } + + //iterate over the devices outputting a string that would be the compile + //flags needed to target all gpu's on this machine. 
+  int prev_arch = 0;
+  for (int i = 0; i < nDevices; i++)
+  {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, i);
+
+    //convert 2.1 to 21, 3.5 to 35, etc.
+    int arch = (prop.major * 10) + prop.minor;
+
+    //if we have multiple GPUs, make sure they have different archs
+    //instead of adding the same compile options multiple times
+    if(prev_arch == arch)
+    {
+      continue;
+    }
+    prev_arch = arch;
+
+    //look up the virtual architecture that matches the arch we are
+    //building for
+    if(arch_to_compute.find(arch) != arch_to_compute.end() )
+    {
+      std::string compute_level = arch_to_compute[arch];
+      std::cout << "--generate-code arch=" << compute_level << ",code=sm_" << arch << " ";
+    }
+    else
+    {
+      //if not found, default to the highest known arch and compile to a
+      //virtual arch instead of a known sm
+      std::map< int, std::string >::const_iterator it = arch_to_compute.end();
+      --it;
+      std::string compute_level = it->second;
+      std::cout << "--generate-code arch=" << compute_level << ",code=" << compute_level << " ";
+    }
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8b9b84ae..ab88f453e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ install(
 install(
   FILES
     ${VTKm_SOURCE_DIR}/CMake/VTKmCompilerOptimizations.cmake
+    ${VTKm_SOURCE_DIR}/CMake/VTKmDetectCUDAVersion.cxx
   DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
   )
 
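
Usage note: a minimal sketch of how the new VTKm_CUDA_Architecture cache
variable might be selected at configure time. The values map to the nvcc
flags documented in UseVTKmCUDA.cmake above; the initial-cache file name and
the choice of 'kepler' below are only illustrative assumptions, and omitting
the variable entirely keeps the default of 'native', which runs
VTKmDetectCUDAVersion.cxx to introspect the local GPUs.

  # gpu-arch.cmake -- a hypothetical initial-cache file, passed to CMake as:
  #   cmake -C gpu-arch.cmake <path to VTK-m source>
  # (equivalently: cmake -DVTKm_CUDA_Architecture=kepler <path to VTK-m source>)
  # Selecting 'kepler' makes UseVTKmCUDA.cmake add the compute_30 and
  # compute_35 --generate-code flags instead of running autodetection.
  set(VTKm_CUDA_Architecture "kepler" CACHE STRING "Which GPU Architecture(s) to compile for")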