Cycles: Use pre-compiled PTX kernel for an older generation when no matching one is found

This patch changes the discovery of pre-compiled kernels to look for any PTX, even if
it does not match the current architecture version exactly. This works because the driver can
JIT-compile PTX generated for architectures less than or equal to the current one.
For example, this makes it possible to render on a new GPU architecture even if no pre-compiled
binary kernel was distributed for it as part of the Blender installation.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D8332
This commit is contained in:
Patrick Mours 2020-07-17 15:06:55 +02:00
parent a5ded0720c
commit a9644c812f
4 changed files with 22 additions and 9 deletions

@ -367,7 +367,7 @@ option(WITH_CYCLES_CUDA_BINARIES "Build Cycles CUDA binaries" OFF)
option(WITH_CYCLES_CUBIN_COMPILER "Build cubins with nvrtc based compiler instead of nvcc" OFF) option(WITH_CYCLES_CUBIN_COMPILER "Build cubins with nvrtc based compiler instead of nvcc" OFF)
option(WITH_CYCLES_CUDA_BUILD_SERIAL "Build cubins one after another (useful on machines with limited RAM)" OFF) option(WITH_CYCLES_CUDA_BUILD_SERIAL "Build cubins one after another (useful on machines with limited RAM)" OFF)
mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL) mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
set(CYCLES_CUDA_BINARIES_ARCH sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 sm_70 sm_75 CACHE STRING "CUDA architectures to build binaries for") set(CYCLES_CUDA_BINARIES_ARCH sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 sm_70 sm_75 compute_75 CACHE STRING "CUDA architectures to build binaries for")
mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH) mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
unset(PLATFORM_DEFAULT) unset(PLATFORM_DEFAULT)
option(WITH_CYCLES_LOGGING "Build Cycles with logging support" ON) option(WITH_CYCLES_LOGGING "Build Cycles with logging support" ON)

@ -53,7 +53,7 @@ set(WITH_USD ON CACHE BOOL "" FORCE)
set(WITH_MEM_JEMALLOC ON CACHE BOOL "" FORCE) set(WITH_MEM_JEMALLOC ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE) set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE) set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
set(CYCLES_CUDA_BINARIES_ARCH sm_30;sm_35;sm_37;sm_50;sm_52;sm_60;sm_61;sm_70;sm_75 CACHE STRING "" FORCE) set(CYCLES_CUDA_BINARIES_ARCH sm_30;sm_35;sm_37;sm_50;sm_52;sm_60;sm_61;sm_70;sm_75;compute_75 CACHE STRING "" FORCE)
set(WITH_CYCLES_DEVICE_OPTIX ON CACHE BOOL "" FORCE) set(WITH_CYCLES_DEVICE_OPTIX ON CACHE BOOL "" FORCE)
# platform dependent options # platform dependent options

@ -383,12 +383,25 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu
} }
} }
const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor)); /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
int ptx_major = major, ptx_minor = minor;
while (ptx_major >= 3) {
const string ptx = path_get(
string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
if (path_exists(ptx)) { if (path_exists(ptx)) {
VLOG(1) << "Using precompiled kernel."; VLOG(1) << "Using precompiled kernel.";
return ptx; return ptx;
} }
if (ptx_minor > 0) {
ptx_minor--;
}
else {
ptx_major--;
ptx_minor = 9;
}
}
} }
/* Try to use locally compiled kernel. */ /* Try to use locally compiled kernel. */

@ -539,7 +539,7 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
${SRC_UTIL_HEADERS} ${SRC_UTIL_HEADERS}
COMMAND ${CUBIN_CC_ENV} COMMAND ${CUBIN_CC_ENV}
"$<TARGET_FILE:cycles_cubin_cc>" "$<TARGET_FILE:cycles_cubin_cc>"
-target 30 -target 52
-ptx -ptx
-i ${CMAKE_CURRENT_SOURCE_DIR}/${input} -i ${CMAKE_CURRENT_SOURCE_DIR}/${input}
${cuda_flags} ${cuda_flags}
@ -563,7 +563,7 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
COMMAND COMMAND
${CUDA_NVCC_EXECUTABLE} ${CUDA_NVCC_EXECUTABLE}
--ptx --ptx
-arch=sm_30 -arch=sm_52
${cuda_flags} ${cuda_flags}
${input} ${input}
WORKING_DIRECTORY WORKING_DIRECTORY