Cycles: Add support for shader raytracing in OptiX

Support for the AO and bevel shader nodes requires calling "optixTrace" from within the shading
VM, which is only allowed from inlined functions to the raygen program or callables. This patch
therefore converts the shading VM to use direct callables to make it work. To prevent performance
regressions a separate kernel module is compiled and used for this purpose.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D9733
This commit is contained in:
Patrick Mours 2020-12-03 12:19:36 +01:00
parent 7f2d356a67
commit c10546f5e9
6 changed files with 168 additions and 74 deletions

@ -141,7 +141,8 @@ class OptiXDevice : public CUDADevice {
PG_BAKE, // kernel_bake_evaluate PG_BAKE, // kernel_bake_evaluate
PG_DISP, // kernel_displace_evaluate PG_DISP, // kernel_displace_evaluate
PG_BACK, // kernel_background_evaluate PG_BACK, // kernel_background_evaluate
NUM_PROGRAM_GROUPS PG_CALL,
NUM_PROGRAM_GROUPS = PG_CALL + 3
}; };
// List of OptiX pipelines // List of OptiX pipelines
@ -334,11 +335,6 @@ class OptiXDevice : public CUDADevice {
set_error("OptiX backend does not support baking yet"); set_error("OptiX backend does not support baking yet");
return false; return false;
} }
// Disable shader raytracing support for now, since continuation callables are slow
if (requested_features.use_shader_raytrace) {
set_error("OptiX backend does not support 'Ambient Occlusion' and 'Bevel' shader nodes yet");
return false;
}
const CUDAContextScope scope(cuContext); const CUDAContextScope scope(cuContext);
@ -410,7 +406,9 @@ class OptiXDevice : public CUDADevice {
} }
{ // Load and compile PTX module with OptiX kernels { // Load and compile PTX module with OptiX kernels
string ptx_data, ptx_filename = path_get("lib/kernel_optix.ptx"); string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
"lib/kernel_optix_shader_raytrace.ptx" :
"lib/kernel_optix.ptx");
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
if (!getenv("OPTIX_ROOT_DIR")) { if (!getenv("OPTIX_ROOT_DIR")) {
set_error( set_error(
@ -525,6 +523,21 @@ class OptiXDevice : public CUDADevice {
group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
} }
// Shader raytracing replaces some functions with direct callables
if (requested_features.use_shader_raytrace) {
group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
"__direct_callable__kernel_volume_shadow";
group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
"__direct_callable__subsurface_scatter_multi_setup";
}
check_result_optix_ret(optixProgramGroupCreate( check_result_optix_ret(optixProgramGroupCreate(
context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
@ -564,33 +577,51 @@ class OptiXDevice : public CUDADevice {
# endif # endif
{ // Create path tracing pipeline { // Create path tracing pipeline
OptixProgramGroup pipeline_groups[] = { vector<OptixProgramGroup> pipeline_groups;
groups[PG_RGEN], pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
groups[PG_MISS], pipeline_groups.push_back(groups[PG_RGEN]);
groups[PG_HITD], pipeline_groups.push_back(groups[PG_MISS]);
groups[PG_HITS], pipeline_groups.push_back(groups[PG_HITD]);
groups[PG_HITL], pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
# if OPTIX_ABI_VERSION >= 36 # if OPTIX_ABI_VERSION >= 36
groups[PG_HITD_MOTION], if (motion_blur) {
groups[PG_HITS_MOTION], pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
# endif # endif
}; if (requested_features.use_shader_raytrace) {
check_result_optix_ret( pipeline_groups.push_back(groups[PG_CALL + 0]);
optixPipelineCreate(context, pipeline_groups.push_back(groups[PG_CALL + 1]);
&pipeline_options, pipeline_groups.push_back(groups[PG_CALL + 2]);
&link_options, }
pipeline_groups,
(sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), check_result_optix_ret(optixPipelineCreate(context,
nullptr, &pipeline_options,
0, &link_options,
&pipelines[PIP_PATH_TRACE])); pipeline_groups.data(),
pipeline_groups.size(),
nullptr,
0,
&pipelines[PIP_PATH_TRACE]));
// Combine ray generation and trace continuation stack size // Combine ray generation and trace continuation stack size
const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
// Max direct callable depth is one of the following, so combine accordingly
// - __raygen__ -> svm_eval_nodes
// - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
// - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
std::max(stack_size[PG_CALL + 1].dssDC,
stack_size[PG_CALL + 2].dssDC);
// Set stack size depending on pipeline options // Set stack size depending on pipeline options
check_result_optix_ret( check_result_optix_ret(
optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2))); optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
0,
requested_features.use_shader_raytrace ? dss : 0,
css,
motion_blur ? 3 : 2));
} }
// Only need to create shader evaluation pipeline if one of these features is used: // Only need to create shader evaluation pipeline if one of these features is used:
@ -599,37 +630,51 @@ class OptiXDevice : public CUDADevice {
requested_features.use_true_displacement; requested_features.use_true_displacement;
if (use_shader_eval_pipeline) { // Create shader evaluation pipeline if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
OptixProgramGroup pipeline_groups[] = { vector<OptixProgramGroup> pipeline_groups;
groups[PG_BAKE], pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
groups[PG_DISP], pipeline_groups.push_back(groups[PG_BAKE]);
groups[PG_BACK], pipeline_groups.push_back(groups[PG_DISP]);
groups[PG_MISS], pipeline_groups.push_back(groups[PG_BACK]);
groups[PG_HITD], pipeline_groups.push_back(groups[PG_MISS]);
groups[PG_HITS], pipeline_groups.push_back(groups[PG_HITD]);
groups[PG_HITL], pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
# if OPTIX_ABI_VERSION >= 36 # if OPTIX_ABI_VERSION >= 36
groups[PG_HITD_MOTION], if (motion_blur) {
groups[PG_HITS_MOTION], pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
# endif # endif
}; if (requested_features.use_shader_raytrace) {
check_result_optix_ret( pipeline_groups.push_back(groups[PG_CALL + 0]);
optixPipelineCreate(context, pipeline_groups.push_back(groups[PG_CALL + 1]);
&pipeline_options, pipeline_groups.push_back(groups[PG_CALL + 2]);
&link_options, }
pipeline_groups,
(sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), check_result_optix_ret(optixPipelineCreate(context,
nullptr, &pipeline_options,
0, &link_options,
&pipelines[PIP_SHADER_EVAL])); pipeline_groups.data(),
pipeline_groups.size(),
nullptr,
0,
&pipelines[PIP_SHADER_EVAL]));
// Calculate continuation stack size based on the maximum of all ray generation stack sizes // Calculate continuation stack size based on the maximum of all ray generation stack sizes
const unsigned int css = std::max(stack_size[PG_BAKE].cssRG, const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
std::max(stack_size[PG_DISP].cssRG, std::max(stack_size[PG_DISP].cssRG,
stack_size[PG_BACK].cssRG)) + stack_size[PG_BACK].cssRG)) +
link_options.maxTraceDepth * trace_css; link_options.maxTraceDepth * trace_css;
const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
std::max(stack_size[PG_CALL + 1].dssDC,
stack_size[PG_CALL + 2].dssDC);
check_result_optix_ret(optixPipelineSetStackSize( check_result_optix_ret(
pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2))); optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
0,
requested_features.use_shader_raytrace ? dss : 0,
css,
motion_blur ? 3 : 2));
} }
// Clean up program group objects // Clean up program group objects
@ -734,6 +779,9 @@ class OptiXDevice : public CUDADevice {
# else # else
sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
# endif # endif
sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
sbt_params.callablesRecordCount = 3;
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
// Launch the ray generation program // Launch the ray generation program
check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
@ -1061,6 +1109,9 @@ class OptiXDevice : public CUDADevice {
# else # else
sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
# endif # endif
sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
sbt_params.callablesRecordCount = 3;
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
cuda_stream[thread_index], cuda_stream[thread_index],

@ -423,7 +423,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_kernel_src "/kernels/cuda/${name}.cu") set(cuda_kernel_src "/kernels/cuda/${name}.cu")
set(cuda_flags set(cuda_flags ${flags}
-D CCL_NAMESPACE_BEGIN= -D CCL_NAMESPACE_BEGIN=
-D CCL_NAMESPACE_END= -D CCL_NAMESPACE_END=
-D NVCC -D NVCC
@ -545,11 +545,11 @@ endif()
# OptiX PTX modules # OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
foreach(input ${SRC_OPTIX_KERNELS}) macro(CYCLES_OPTIX_KERNEL_ADD name flags)
get_filename_component(input_we ${input} NAME_WE) set(input "kernels/optix/kernel_optix.cu")
set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
set(output "${CMAKE_CURRENT_BINARY_DIR}/${input_we}.ptx") set(cuda_flags ${flags}
set(cuda_flags
-I "${OPTIX_INCLUDE_DIR}" -I "${OPTIX_INCLUDE_DIR}"
-I "${CMAKE_CURRENT_SOURCE_DIR}/.." -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
-I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda" -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
@ -625,7 +625,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
list(APPEND optix_ptx ${output}) list(APPEND optix_ptx ${output})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
endforeach() endmacro()
CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__")
CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions")
add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx}) add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
cycles_set_solution_folder(cycles_kernel_optix) cycles_set_solution_folder(cycles_kernel_optix)

@ -281,13 +281,28 @@ ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg,
return num_eval_hits; return num_eval_hits;
} }
ccl_device_noinline void subsurface_scatter_multi_setup(KernelGlobals *kg, #if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
LocalIntersection *ss_isect, ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg,
int hit, LocalIntersection *ss_isect,
ShaderData *sd, int hit,
ccl_addr_space PathState *state, ShaderData *sd,
ClosureType type, ccl_addr_space PathState *state,
float roughness) ClosureType type,
float roughness)
{
optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness);
}
extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup(
#else
ccl_device_noinline void subsurface_scatter_multi_setup(
#endif
KernelGlobals *kg,
LocalIntersection *ss_isect,
int hit,
ShaderData *sd,
ccl_addr_space PathState *state,
ClosureType type,
float roughness)
{ {
#ifdef __SPLIT_KERNEL__ #ifdef __SPLIT_KERNEL__
Ray ray_object = ss_isect->ray; Ray ray_object = ss_isect->ray;

@ -139,8 +139,6 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_OPTIX__ #ifdef __KERNEL_OPTIX__
# undef __BAKING__ # undef __BAKING__
# undef __BRANCHED_PATH__ # undef __BRANCHED_PATH__
/* TODO(pmours): Cannot use optixTrace in non-inlined functions */
# undef __SHADER_RAYTRACE__
#endif /* __KERNEL_OPTIX__ */ #endif /* __KERNEL_OPTIX__ */
#ifdef __KERNEL_OPENCL__ #ifdef __KERNEL_OPENCL__

@ -274,11 +274,24 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
/* get the volume attenuation over line segment defined by ray, with the /* get the volume attenuation over line segment defined by ray, with the
* assumption that there are no surfaces blocking light between the endpoints */ * assumption that there are no surfaces blocking light between the endpoints */
ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, # if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
ShaderData *shadow_sd, ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg,
ccl_addr_space PathState *state, ShaderData *shadow_sd,
Ray *ray, ccl_addr_space PathState *state,
float3 *throughput) Ray *ray,
float3 *throughput)
{
optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput);
}
extern "C" __device__ void __direct_callable__kernel_volume_shadow(
# else
ccl_device_noinline void kernel_volume_shadow(
# endif
KernelGlobals *kg,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
Ray *ray,
float3 *throughput)
{ {
shader_setup_from_volume(kg, shadow_sd, ray); shader_setup_from_volume(kg, shadow_sd, ray);

@ -217,12 +217,26 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
/* Main Interpreter Loop */ /* Main Interpreter Loop */
ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, #if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
ShaderData *sd, ccl_device_inline void svm_eval_nodes(KernelGlobals *kg,
ccl_addr_space PathState *state, ShaderData *sd,
ccl_global float *buffer, ccl_addr_space PathState *state,
ShaderType type, ccl_global float *buffer,
int path_flag) ShaderType type,
int path_flag)
{
optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag);
}
extern "C" __device__ void __direct_callable__svm_eval_nodes(
#else
ccl_device_noinline void svm_eval_nodes(
#endif
KernelGlobals *kg,
ShaderData *sd,
ccl_addr_space PathState *state,
ccl_global float *buffer,
ShaderType type,
int path_flag)
{ {
float stack[SVM_STACK_SIZE]; float stack[SVM_STACK_SIZE];
int offset = sd->shader & SHADER_MASK; int offset = sd->shader & SHADER_MASK;