Cycles: prepare to make CUDA 5.0 the official version we use

* Add CUDA compiler version detection to cmake/scons/runtime
* Remove noinline in kernel_shader.h and reenable --use_fast_math if CUDA 5.x
  is used, these were workarounds for CUDA 4.2 bugs
* Change max number of registers to 32 for sm 2.x (based on performance tests
  from Martijn Berger and confirmed here), and also for NVidia OpenCL.

Overall it seems that with these changes and the latest CUDA 5.0 download,
performance is as good as or better than the 2.67b release with the scenes and
graphics cards I tested.
This commit is contained in:
Brecht Van Lommel 2013-06-19 17:54:23 +00:00
parent a7416641e6
commit 16204bd647
8 changed files with 209 additions and 22 deletions

@ -271,11 +271,53 @@ public:
return "";
}
int cuda_version = cuCompilerVersion();
if(cuda_version == 0) {
cuda_error_message("CUDA nvcc compiler version could not be parsed.");
return "";
}
if(cuda_version != 50)
printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);
/* compile */
string kernel = path_join(kernel_path, "kernel.cu");
string include = kernel_path;
const int machine = system_cpu_bits();
const int maxreg = 24;
string arch_flags;
/* build flags depending on CUDA version and arch */
if(cuda_version < 50) {
/* CUDA 4.x */
if(major == 1) {
/* sm_1x */
arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0";
}
else if(major == 2) {
/* sm_2x */
arch_flags = "--maxrregcount=24";
}
else {
/* sm_3x */
arch_flags = "--maxrregcount=32";
}
}
else {
/* CUDA 5.x */
if(major == 1) {
/* sm_1x */
arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math";
}
else if(major == 2) {
/* sm_2x */
arch_flags = "--maxrregcount=32 --use_fast_math";
}
else {
/* sm_3x */
arch_flags = "--maxrregcount=32 --use_fast_math";
}
}
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@ -283,8 +325,10 @@ public:
path_create_directories(cubin);
string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
printf("%s\n", command.c_str());
if(system(command.c_str()) == -1) {
cuda_error_message("Failed to execute compilation command, see console for details.");

@ -85,7 +85,7 @@ static string opencl_kernel_build_options(const string& platform, const string *
string build_options = " -cl-fast-relaxed-math ";
if(platform == "NVIDIA CUDA")
build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=32 -cl-nv-verbose ";
else if(platform == "Apple")
build_options += "-D__KERNEL_OPENCL_APPLE__ -Wno-missing-prototypes ";

@ -117,32 +117,68 @@ set(SRC_UTIL_HEADERS
# CUDA module
if(WITH_CYCLES_CUDA_BINARIES)
# 32 bit or 64 bit
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(CUDA_BITS 64)
else()
set(CUDA_BITS 32)
endif()
# CUDA version
execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
# build for each arch
set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
set(cuda_cubins)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
set(cuda_cubin kernel_${arch}.cubin)
if(${arch} MATCHES "sm_1[0-9]")
# sm_1x
set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
elseif(${arch} MATCHES "sm_2[0-9]")
# sm_2x
set(cuda_arch_flags "--maxrregcount=24")
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
# warn for other versions
if(CUDA_VERSION MATCHES "50")
else()
# sm_3x
set(cuda_arch_flags "--maxrregcount=32")
message(STATUS "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, build may succeed but only CUDA 5.0 is officially supported")
endif()
# build flags depending on CUDA version and arch
if(CUDA_VERSION LESS 50)
# CUDA 4.x
if(${arch} MATCHES "sm_1[0-9]")
# sm_1x
set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
elseif(${arch} MATCHES "sm_2[0-9]")
# sm_2x
set(cuda_arch_flags "--maxrregcount=24")
else()
# sm_3x
set(cuda_arch_flags "--maxrregcount=32")
endif()
set(cuda_math_flags "")
else()
# CUDA 5.x
if(${arch} MATCHES "sm_1[0-9]")
# sm_1x
set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
elseif(${arch} MATCHES "sm_2[0-9]")
# sm_2x
set(cuda_arch_flags "--maxrregcount=32")
else()
# sm_3x
set(cuda_arch_flags "--maxrregcount=32")
endif()
set(cuda_math_flags "--use_fast_math")
endif()
add_custom_command(
OUTPUT ${cuda_cubin}
COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
DEPENDS ${cuda_sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)

@ -25,6 +25,8 @@
#
# ***** END GPL LICENSE BLOCK *****
import re
import subprocess
import sys
import os
import Blender as B
@ -60,10 +62,19 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
svm_dir = os.path.join(source_dir, "../svm")
closure_dir = os.path.join(source_dir, "../closure")
# get CUDA version
nvcc_pipe = subprocess.Popen([nvcc, "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
output, erroroutput = nvcc_pipe.communicate()
cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
if cuda_version != 50:
print("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported." % (cuda_version/10, cuda_version%10))
# nvcc flags
nvcc_flags = "-m%s" % (bits)
nvcc_flags += " --cubin --ptxas-options=\"-v\" --maxrregcount=24"
nvcc_flags += " --opencc-options -OPT:Olimit=0"
nvcc_flags += " --cubin --ptxas-options=\"-v\""
nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, closure_dir)
@ -75,7 +86,31 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
for arch in cuda_archs:
cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
# build flags depending on CUDA version and arch
if cuda_version < 50:
# CUDA 4.x
if arch.startswith("sm_1"):
# sm_1x
cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0"
elif arch.startswith("sm_2"):
# sm_2x
cuda_arch_flags = "--maxrregcount=24"
else:
# sm_3x
cuda_arch_flags = "--maxrregcount=32"
else:
# CUDA 5.x
if arch.startswith("sm_1"):
# sm_1x
cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math"
elif arch.startswith("sm_2"):
# sm_2x
cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
else:
# sm_3x
cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
command = "\"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
kernel.Command(cubin_file, 'kernel.cu', command)
kernel.Depends(cubin_file, dependencies)

@ -137,7 +137,7 @@ __device_inline float cmj_randfloat(uint i, uint p)
}
#ifdef __CMJ__
__device_noinline float cmj_sample_1D(int s, int N, int p)
__device float cmj_sample_1D(int s, int N, int p)
{
uint x = cmj_permute(s, N, p * 0x68bc21eb);
float jx = cmj_randfloat(s, p * 0x967a889b);
@ -146,7 +146,7 @@ __device_noinline float cmj_sample_1D(int s, int N, int p)
return (x + jx)*invN;
}
__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
__device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
{
int m = float_to_int(sqrtf(N));
int n = (N + m - 1)/m;

@ -38,7 +38,12 @@ CCL_NAMESPACE_BEGIN
/* ShaderData setup from incoming ray */
#ifdef __OBJECT_MOTION__
__device_noinline void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
__device_noinline
#else
__device
#endif
void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
{
/* note that this is a separate non-inlined function to work around crash
* on CUDA sm 2.0, otherwise kernel execution crashes (compiler bug?) */
@ -53,7 +58,12 @@ __device_noinline void shader_setup_object_transforms(KernelGlobals *kg, ShaderD
}
#endif
__device_noinline void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
__device_noinline
#else
__device
#endif
void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
const Intersection *isect, const Ray *ray)
{
#ifdef __INSTANCING__
@ -260,7 +270,12 @@ __device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData
/* ShaderData setup from position sampled on mesh */
__device_noinline void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
__device_noinline
#else
__device
#endif
void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
const float3 P, const float3 Ng, const float3 I,
int shader, int object, int prim, float u, float v, float t, float time, int segment)
{

@ -16,6 +16,8 @@
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
@ -25,6 +27,11 @@
#include "util_path.h"
#include "util_string.h"
#ifdef _WIN32
#define popen _popen
#define pclose _pclose
#endif
/* function definitions */
tcuInit *cuInit;
@ -399,7 +406,15 @@ string cuCompilerPath()
const char *defaultpaths[] = {"C:/CUDA/bin", NULL};
const char *executable = "nvcc.exe";
#else
const char *defaultpaths[] = {"/Developer/NVIDIA/CUDA-4.2/bin", "/usr/local/cuda-4.2/bin", "/usr/local/cuda/bin", NULL};
const char *defaultpaths[] = {
"/Developer/NVIDIA/CUDA-5.0/bin",
"/usr/local/cuda-5.0/bin",
"/usr/local/cuda/bin",
"/Developer/NVIDIA/CUDA-4.2/bin",
"/usr/local/cuda-4.2/bin",
"/Developer/NVIDIA/CUDA-5.5/bin",
"/usr/local/cuda-5.5/bin",
NULL};
const char *executable = "nvcc";
#endif
@ -437,5 +452,46 @@ string cuCompilerPath()
return "";
}
int cuCompilerVersion()
{
string path = cuCompilerPath();
if(path == "")
return 0;
/* get --version output */
FILE *pipe = popen((path + " --version").c_str(), "r");
if(!pipe) {
fprintf(stderr, "CUDA: failed to run compiler to retrieve version");
return 0;
}
char buf[128];
string output = "";
while(!feof(pipe))
if(fgets(buf, 128, pipe) != NULL)
output += buf;
pclose(pipe);
/* parse version number */
string marker = "Cuda compilation tools, release ";
size_t offset = output.find(marker);
if(offset == string::npos) {
fprintf(stderr, "CUDA: failed to find version number in:\n\n%s\n", output.c_str());
return 0;
}
string versionstr = output.substr(offset + marker.size(), string::npos);
int major, minor;
if(sscanf(versionstr.c_str(), "%d.%d", &major, &minor) < 2) {
fprintf(stderr, "CUDA: failed to parse version number from:\n\n%s\n", output.c_str());
return 0;
}
return 10*major + minor;
}
CCL_NAMESPACE_END

@ -32,6 +32,7 @@ CCL_NAMESPACE_BEGIN
bool cuLibraryInit();
bool cuHavePrecompiledKernels();
string cuCompilerPath();
int cuCompilerVersion();
CCL_NAMESPACE_END