diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index 95234845f98..682540a51fd 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -141,7 +141,8 @@ class OptiXDevice : public CUDADevice { PG_BAKE, // kernel_bake_evaluate PG_DISP, // kernel_displace_evaluate PG_BACK, // kernel_background_evaluate - NUM_PROGRAM_GROUPS + PG_CALL, + NUM_PROGRAM_GROUPS = PG_CALL + 3 }; // List of OptiX pipelines @@ -334,11 +335,6 @@ class OptiXDevice : public CUDADevice { set_error("OptiX backend does not support baking yet"); return false; } - // Disable shader raytracing support for now, since continuation callables are slow - if (requested_features.use_shader_raytrace) { - set_error("OptiX backend does not support 'Ambient Occlusion' and 'Bevel' shader nodes yet"); - return false; - } const CUDAContextScope scope(cuContext); @@ -410,7 +406,9 @@ class OptiXDevice : public CUDADevice { } { // Load and compile PTX module with OptiX kernels - string ptx_data, ptx_filename = path_get("lib/kernel_optix.ptx"); + string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ? + "lib/kernel_optix_shader_raytrace.ptx" : + "lib/kernel_optix.ptx"); if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { if (!getenv("OPTIX_ROOT_DIR")) { set_error( @@ -525,6 +523,21 @@ class OptiXDevice : public CUDADevice { group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; } + // Shader raytracing replaces some functions with direct callables + if (requested_features.use_shader_raytrace) { + group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL + 0].callables.moduleDC = optix_module; + group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes"; + group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL + 1].callables.moduleDC = optix_module; + group_descs[PG_CALL + 1].callables.entryFunctionNameDC = + "__direct_callable__kernel_volume_shadow"; + group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL + 2].callables.moduleDC = optix_module; + group_descs[PG_CALL + 2].callables.entryFunctionNameDC = + "__direct_callable__subsurface_scatter_multi_setup"; + } + check_result_optix_ret(optixProgramGroupCreate( context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); @@ -564,33 +577,51 @@ class OptiXDevice : public CUDADevice { # endif { // Create path tracing pipeline - OptixProgramGroup pipeline_groups[] = { - groups[PG_RGEN], - groups[PG_MISS], - groups[PG_HITD], - groups[PG_HITS], - groups[PG_HITL], + vector pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); # if OPTIX_ABI_VERSION >= 36 - groups[PG_HITD_MOTION], - groups[PG_HITS_MOTION], + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } # endif - }; - check_result_optix_ret( - optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups, - (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), - nullptr, - 0, - &pipelines[PIP_PATH_TRACE])); + if (requested_features.use_shader_raytrace) { + pipeline_groups.push_back(groups[PG_CALL + 0]); + pipeline_groups.push_back(groups[PG_CALL + 1]); + pipeline_groups.push_back(groups[PG_CALL + 2]); + } + + check_result_optix_ret(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_PATH_TRACE])); // Combine ray generation and trace continuation stack size const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; + // Max direct callable depth is one of the following, so combine accordingly + // - __raygen__ -> svm_eval_nodes + // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes + // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes + const unsigned int dss = stack_size[PG_CALL + 0].dssDC + + std::max(stack_size[PG_CALL + 1].dssDC, + stack_size[PG_CALL + 2].dssDC); // Set stack size depending on pipeline options check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2))); + optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], + 0, + requested_features.use_shader_raytrace ? dss : 0, + css, + motion_blur ? 3 : 2)); } // Only need to create shader evaluation pipeline if one of these features is used: @@ -599,37 +630,51 @@ class OptiXDevice : public CUDADevice { requested_features.use_true_displacement; if (use_shader_eval_pipeline) { // Create shader evaluation pipeline - OptixProgramGroup pipeline_groups[] = { - groups[PG_BAKE], - groups[PG_DISP], - groups[PG_BACK], - groups[PG_MISS], - groups[PG_HITD], - groups[PG_HITS], - groups[PG_HITL], + vector pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_BAKE]); + pipeline_groups.push_back(groups[PG_DISP]); + pipeline_groups.push_back(groups[PG_BACK]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); # if OPTIX_ABI_VERSION >= 36 - groups[PG_HITD_MOTION], - groups[PG_HITS_MOTION], + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } # endif - }; - check_result_optix_ret( - optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups, - (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), - nullptr, - 0, - &pipelines[PIP_SHADER_EVAL])); + if (requested_features.use_shader_raytrace) { + pipeline_groups.push_back(groups[PG_CALL + 0]); + pipeline_groups.push_back(groups[PG_CALL + 1]); + pipeline_groups.push_back(groups[PG_CALL + 2]); + } + + check_result_optix_ret(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_SHADER_EVAL])); // Calculate continuation stack size based on the maximum of all ray generation stack sizes const unsigned int css = std::max(stack_size[PG_BAKE].cssRG, std::max(stack_size[PG_DISP].cssRG, stack_size[PG_BACK].cssRG)) + link_options.maxTraceDepth * trace_css; + const unsigned int dss = stack_size[PG_CALL + 0].dssDC + + std::max(stack_size[PG_CALL + 1].dssDC, + stack_size[PG_CALL + 2].dssDC); - check_result_optix_ret(optixPipelineSetStackSize( - pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2))); + check_result_optix_ret( + optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL], + 0, + requested_features.use_shader_raytrace ? dss : 0, + css, + motion_blur ? 3 : 2)); } // Clean up program group objects @@ -734,6 +779,9 @@ class OptiXDevice : public CUDADevice { # else sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL # endif + sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); + sbt_params.callablesRecordCount = 3; + sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); // Launch the ray generation program check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], @@ -1061,6 +1109,9 @@ class OptiXDevice : public CUDADevice { # else sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL # endif + sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); + sbt_params.callablesRecordCount = 3; + sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], cuda_stream[thread_index], diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index c39c67afb5a..f6b4b963a7a 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -423,7 +423,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_kernel_src "/kernels/cuda/${name}.cu") - set(cuda_flags + set(cuda_flags ${flags} -D CCL_NAMESPACE_BEGIN= -D CCL_NAMESPACE_END= -D NVCC @@ -545,11 +545,11 @@ endif() # OptiX PTX modules if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) - foreach(input ${SRC_OPTIX_KERNELS}) - get_filename_component(input_we ${input} NAME_WE) + macro(CYCLES_OPTIX_KERNEL_ADD name flags) + set(input "kernels/optix/kernel_optix.cu") + set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx") - set(output "${CMAKE_CURRENT_BINARY_DIR}/${input_we}.ptx") - set(cuda_flags + set(cuda_flags ${flags} -I "${OPTIX_INCLUDE_DIR}" -I "${CMAKE_CURRENT_SOURCE_DIR}/.." -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda" @@ -625,7 +625,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) list(APPEND optix_ptx ${output}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib) - endforeach() + endmacro() + + CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__") + CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions") add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx}) cycles_set_solution_folder(cycles_kernel_optix) diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index ed8572467ea..917f35d37dc 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -281,13 +281,28 @@ ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg, return num_eval_hits; } -ccl_device_noinline void subsurface_scatter_multi_setup(KernelGlobals *kg, - LocalIntersection *ss_isect, - int hit, - ShaderData *sd, - ccl_addr_space PathState *state, - ClosureType type, - float roughness) +#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) +ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg, + LocalIntersection *ss_isect, + int hit, + ShaderData *sd, + ccl_addr_space PathState *state, + ClosureType type, + float roughness) +{ + optixDirectCall(2, kg, ss_isect, hit, sd, state, type, roughness); +} +extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup( +#else +ccl_device_noinline void subsurface_scatter_multi_setup( +#endif + KernelGlobals *kg, + LocalIntersection *ss_isect, + int hit, + ShaderData *sd, + ccl_addr_space PathState *state, + ClosureType type, + float roughness) { #ifdef __SPLIT_KERNEL__ Ray ray_object = ss_isect->ray; diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 8e2b0e46a66..6beabebb92f 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -139,8 +139,6 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_OPTIX__ # undef __BAKING__ # undef __BRANCHED_PATH__ -/* TODO(pmours): Cannot use optixTrace in non-inlined functions */ -# undef __SHADER_RAYTRACE__ #endif /* __KERNEL_OPTIX__ */ #ifdef __KERNEL_OPENCL__ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index f5d10c0ca8a..fdf712293e7 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -274,11 +274,24 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, /* get the volume attenuation over line segment defined by ray, with the * assumption that there are no surfaces blocking light between the endpoints */ -ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *throughput) +# if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) +ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray, + float3 *throughput) +{ + optixDirectCall(1, kg, shadow_sd, state, ray, throughput); +} +extern "C" __device__ void __direct_callable__kernel_volume_shadow( +# else +ccl_device_noinline void kernel_volume_shadow( +# endif + KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray, + float3 *throughput) { shader_setup_from_volume(kg, shadow_sd, ray); diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 6c849f5b2fc..000da1fa615 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -217,12 +217,26 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN /* Main Interpreter Loop */ -ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, - ShaderType type, - int path_flag) +#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) +ccl_device_inline void svm_eval_nodes(KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + ccl_global float *buffer, + ShaderType type, + int path_flag) +{ + optixDirectCall(0, kg, sd, state, buffer, type, path_flag); +} +extern "C" __device__ void __direct_callable__svm_eval_nodes( +#else +ccl_device_noinline void svm_eval_nodes( +#endif + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + ccl_global float *buffer, + ShaderType type, + int path_flag) { float stack[SVM_STACK_SIZE]; int offset = sd->shader & SHADER_MASK;