Teach VTK-m how to specify the CUDA GPU architecture to build for.

As with the ability to specify the vectorization level, CMake users can
now specify which GPU architectures they want to build for. Most users
should just use the default of 'native'.
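For example, a user building for Kepler-class GPUs could set the new cache
variable at configure time (a hypothetical invocation; the source path is a
placeholder):

    cmake -DVTKm_CUDA_Architecture=kepler /path/to/vtk-m

With the default of 'native', configuration instead runs the bundled
VTKmDetectCUDAVersion program and targets only the GPUs found on the
configuring machine.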
Robert Maynard 2015-12-03 10:16:19 -05:00
parent 03661259b8
commit 204804af50
3 changed files with 163 additions and 0 deletions

@@ -51,6 +51,80 @@ if (VTKm_Base_FOUND)
endif ()
endif ()
if(VTKm_CUDA_FOUND)
#---------------------------------------------------------------------------
# Setup build flags for CUDA
#---------------------------------------------------------------------------
# Populates CUDA_NVCC_FLAGS with the best set of flags to compile for a
# given GPU architecture. The majority of developers should leave the
# option at the default of 'native', which uses system introspection to
# determine the smallest number of virtual and real architectures it
# should target.
#
# The option of 'all' is provided for people generating libraries that
# will be deployed to any number of machines; it will compile all CUDA code
# for all major virtual architectures, guaranteeing that the code will run
# anywhere.
#
#
# 1 - native
# - Uses system introspection to determine compile flags
# 2 - fermi
# - Uses: --generate-code arch=compute_20,code=compute_20
# 3 - kepler
# - Uses: --generate-code arch=compute_30,code=compute_30
# - Uses: --generate-code arch=compute_35,code=compute_35
# 4 - maxwell
# - Uses: --generate-code arch=compute_50,code=compute_50
# - Uses: --generate-code arch=compute_52,code=compute_52
# 5 - all
# - Uses: --generate-code arch=compute_20,code=compute_20
# - Uses: --generate-code arch=compute_30,code=compute_30
# - Uses: --generate-code arch=compute_35,code=compute_35
# - Uses: --generate-code arch=compute_50,code=compute_50
# - Uses: --generate-code arch=compute_52,code=compute_52
#
#specify the property
set(VTKm_CUDA_Architecture "native" CACHE STRING "Which GPU Architecture(s) to compile for")
set_property(CACHE VTKm_CUDA_Architecture PROPERTY STRINGS native fermi kepler maxwell all)
#detect what the property is set to
if(VTKm_CUDA_Architecture STREQUAL "native")
#run execute_process to do auto-detection
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${CMAKE_CURRENT_LIST_DIR}/VTKmDetectCUDAVersion.cxx"
RESULT_VARIABLE ran_properly
OUTPUT_VARIABLE run_output)
if(ran_properly EQUAL 0)
#find the position of the "--generate-code" output. With some compilers such as
#MSVC we get compile output plus run output, so we need to strip out just the
#run output
string(FIND "${run_output}" "--generate-code" position)
string(SUBSTRING "${run_output}" ${position} -1 run_output)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${run_output}")
else()
message(STATUS "Unable to run \"${CUDA_NVCC_EXECUTABLE}\" to autodetect GPU architecture."
" Falling back to fermi; please manually specify if you want something else.")
set(VTKm_CUDA_Architecture "fermi")
endif()
elseif(VTKm_CUDA_Architecture STREQUAL "fermi")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_20,code=compute_20")
elseif(VTKm_CUDA_Architecture STREQUAL "kepler")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_30,code=compute_30")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_35,code=compute_35")
elseif(VTKm_CUDA_Architecture STREQUAL "maxwell")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_50,code=compute_50")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_52,code=compute_52")
elseif(VTKm_CUDA_Architecture STREQUAL "all")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_20,code=compute_20")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_30,code=compute_30")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_35,code=compute_35")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_50,code=compute_50")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-code arch=compute_52,code=compute_52")
endif()
endif()
#---------------------------------------------------------------------------
# Find Thrust library.
#---------------------------------------------------------------------------

@@ -0,0 +1,88 @@
//=============================================================================
//
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2015 Sandia Corporation.
// Copyright 2015 UT-Battelle, LLC.
// Copyright 2015 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//
//=============================================================================
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <string>
#include <map>
int main(int argc, char **argv)
{
std::map< int, std::string > arch_to_compute;
arch_to_compute[11] = "compute_11";
arch_to_compute[12] = "compute_12";
arch_to_compute[13] = "compute_13";
arch_to_compute[20] = "compute_20";
arch_to_compute[21] = "compute_20";
arch_to_compute[30] = "compute_30";
arch_to_compute[32] = "compute_32";
arch_to_compute[35] = "compute_35";
arch_to_compute[37] = "compute_37";
arch_to_compute[50] = "compute_50";
arch_to_compute[52] = "compute_52";
arch_to_compute[53] = "compute_53";
int nDevices;
cudaGetDeviceCount(&nDevices);
if(nDevices == 0)
{ //return failure if no cuda devices found
return 1;
}
//iterate over the devices, outputting a string with the compile
//flags needed to target all GPUs on this machine.
int prev_arch = 0;
for (int i = 0; i < nDevices; i++)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
//convert 2.1 to 21, 3.5 to 35, etc
int arch = (prop.major * 10) + prop.minor;
//if we have multiple GPUs, make sure they have different archs
//instead of adding the same compile options multiple times
if(prev_arch == arch)
{
continue;
}
prev_arch = arch;
//look up the virtual architecture for this device; if the arch is not
//in the table, fall back to the highest known one below
if(arch_to_compute.find(arch) != arch_to_compute.end() )
{
std::string compute_level = arch_to_compute[arch];
std::cout << "--generate-code arch=" << compute_level << ",code=sm_"<< arch << " ";
}
else
{
//if not found, default to the highest known arch and compile to a virtual arch
//instead of a known sm.
std::map< int, std::string >::const_iterator highest = arch_to_compute.end();
--highest;
std::string compute_level = highest->second;
std::cout << "--generate-code arch=" << compute_level << ",code=" << compute_level << " ";
}
}
return 0;
}

@@ -314,6 +314,7 @@ install(
install(
FILES
${VTKm_SOURCE_DIR}/CMake/VTKmCompilerOptimizations.cmake
${VTKm_SOURCE_DIR}/CMake/VTKmDetectCUDAVersion.cxx
DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
)