blender/intern/cycles/device/device_optix.cpp
Brecht Van Lommel 073bf8bf52 Cycles: remove WITH_CYCLES_DEBUG, add WITH_CYCLES_DEBUG_NAN
WITH_CYCLES_DEBUG was used for rendering BVH debugging passes. But since we
mainly use Embree and OptiX now, this information is no longer important.

WITH_CYCLES_DEBUG_NAN will enable additional checks for NaNs and invalid values
in the kernel, for Cycles developers. Previously these asserts were enabled in
all debug builds, but this is too likely to crash Blender in scenes that render
fine regardless of the NaNs. So this is behind a CMake option now.

Fixes T90240
2021-07-28 19:27:57 +02:00
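The checks are enabled at configure time; as a minimal sketch (assuming an out-of-source CMake build of Blender, paths illustrative):

cmake ../blender -DWITH_CYCLES_DEBUG_NAN=ON
cmake --build .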


/*
* Copyright 2019, NVIDIA Corporation.
* Copyright 2019, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_OPTIX
# include "bvh/bvh.h"
# include "bvh/bvh_optix.h"
# include "device/cuda/device_cuda.h"
# include "device/device_denoising.h"
# include "device/device_intern.h"
# include "render/buffers.h"
# include "render/hair.h"
# include "render/mesh.h"
# include "render/object.h"
# include "render/scene.h"
# include "util/util_debug.h"
# include "util/util_logging.h"
# include "util/util_md5.h"
# include "util/util_path.h"
# include "util/util_progress.h"
# include "util/util_time.h"
# ifdef WITH_CUDA_DYNLOAD
# include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
# define OPTIX_DONT_INCLUDE_CUDA
# endif
# include <optix_function_table_definition.h>
# include <optix_stubs.h>
// TODO(pmours): Disable this once drivers have native support
# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
CCL_NAMESPACE_BEGIN
/* Make sure this stays in sync with kernel_globals.h */
struct ShaderParams {
uint4 *input;
float4 *output;
int type;
int filter;
int sx;
int offset;
int sample;
};
struct KernelParams {
WorkTile tile;
KernelData data;
ShaderParams shader;
# define KERNEL_TEX(type, name) const type *name;
# include "kernel/kernel_textures.h"
# undef KERNEL_TEX
};
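// Note: the KERNEL_TEX include above is an X-macro expansion. Every
// KERNEL_TEX(type, name) entry in kernel/kernel_textures.h becomes a
// "const type *name;" member of this struct, i.e. one device pointer per kernel
// texture. For illustration, a hypothetical entry KERNEL_TEX(float4, __example_tex)
// would expand to "const float4 *__example_tex;".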
# define check_result_cuda(stmt) \
{ \
CUresult res = stmt; \
if (res != CUDA_SUCCESS) { \
const char *name; \
cuGetErrorName(res, &name); \
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
return; \
} \
} \
(void)0
# define check_result_cuda_ret(stmt) \
{ \
CUresult res = stmt; \
if (res != CUDA_SUCCESS) { \
const char *name; \
cuGetErrorName(res, &name); \
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
return false; \
} \
} \
(void)0
# define check_result_optix(stmt) \
{ \
enum OptixResult res = stmt; \
if (res != OPTIX_SUCCESS) { \
const char *name = optixGetErrorName(res); \
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
return; \
} \
} \
(void)0
# define check_result_optix_ret(stmt) \
{ \
enum OptixResult res = stmt; \
if (res != OPTIX_SUCCESS) { \
const char *name = optixGetErrorName(res); \
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
return false; \
} \
} \
(void)0
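// The check_result_* macros above wrap CUDA and OptiX calls: on failure they
// report the failing statement and source line via set_error() and return from the
// enclosing function (the *_ret variants return false, for use in bool functions).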
# define launch_filter_kernel(func_name, w, h, args) \
{ \
CUfunction func; \
check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
int threads; \
check_result_cuda_ret( \
cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
threads = (int)sqrt((float)threads); \
int xblocks = ((w) + threads - 1) / threads; \
int yblocks = ((h) + threads - 1) / threads; \
check_result_cuda_ret( \
cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
} \
(void)0
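// launch_filter_kernel looks up a filter kernel in cuFilterModule, derives a
// near-square 2D block size from the kernel's maximum threads per block and
// launches enough blocks to cover a (w, h) image on the default stream. It is used
// below for the denoiser conversion kernels, e.g.
// launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);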
class OptiXDevice : public CUDADevice {
// List of OptiX program groups
enum {
PG_RGEN,
PG_MISS,
PG_HITD, // Default hit group
PG_HITS, // __SHADOW_RECORD_ALL__ hit group
PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
# if OPTIX_ABI_VERSION >= 36
PG_HITD_MOTION,
PG_HITS_MOTION,
# endif
PG_BAKE, // kernel_bake_evaluate
PG_DISP, // kernel_displace_evaluate
PG_BACK, // kernel_background_evaluate
PG_CALL,
NUM_PROGRAM_GROUPS = PG_CALL + 3
};
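// The three slots following PG_CALL hold the direct callable programs created in
// load_kernels() for shader raytracing (svm_eval_nodes, kernel_volume_shadow and
// subsurface_scatter_multi_setup).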
// List of OptiX pipelines
enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
// A single shader binding table entry
struct SbtRecord {
char header[OPTIX_SBT_RECORD_HEADER_SIZE];
};
// Information stored about CUDA memory allocations
struct CUDAMem {
bool free_map_host = false;
CUarray array = NULL;
CUtexObject texobject = 0;
bool use_mapped_host = false;
};
// Helper class to manage current CUDA context
struct CUDAContextScope {
CUDAContextScope(CUcontext ctx)
{
cuCtxPushCurrent(ctx);
}
~CUDAContextScope()
{
cuCtxPopCurrent(NULL);
}
};
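// Typical usage: "const CUDAContextScope scope(cuContext);" makes the device's
// CUDA context current for the duration of the scope and pops it automatically on
// destruction.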
// Use a pool with multiple threads to support launches with multiple CUDA streams
TaskPool task_pool;
vector<CUstream> cuda_stream;
OptixDeviceContext context = NULL;
OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
OptixModule builtin_modules[2] = {};
OptixPipeline pipelines[NUM_PIPELINES] = {};
bool motion_blur = false;
device_vector<SbtRecord> sbt_data;
device_only_memory<KernelParams> launch_params;
OptixTraversableHandle tlas_handle = 0;
OptixDenoiser denoiser = NULL;
device_only_memory<unsigned char> denoiser_state;
int denoiser_input_passes = 0;
vector<device_only_memory<char>> delayed_free_bvh_memory;
thread_mutex delayed_free_bvh_mutex;
public:
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
: CUDADevice(info_, stats_, profiler_, background_),
sbt_data(this, "__sbt", MEM_READ_ONLY),
launch_params(this, "__params", false),
denoiser_state(this, "__denoiser_state", true)
{
// Store number of CUDA streams in device info
info.cpu_threads = DebugFlags().optix.cuda_streams;
// Make the CUDA context current
if (!cuContext) {
return; // Do not initialize if CUDA context creation failed already
}
const CUDAContextScope scope(cuContext);
// Create OptiX context for this device
OptixDeviceContextOptions options = {};
# ifdef WITH_CYCLES_LOGGING
options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
options.logCallbackFunction =
[](unsigned int level, const char *, const char *message, void *) {
switch (level) {
case 1:
LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
break;
case 2:
LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
break;
case 3:
LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
break;
case 4:
LOG_IF(INFO, VLOG_IS_ON(1)) << message;
break;
}
};
# endif
check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
# ifdef WITH_CYCLES_LOGGING
check_result_optix(optixDeviceContextSetLogCallback(
context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
# endif
// Create launch streams
cuda_stream.resize(info.cpu_threads);
for (int i = 0; i < info.cpu_threads; ++i)
check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
// Fix weird compiler bug that assigns wrong size
launch_params.data_elements = sizeof(KernelParams);
// Allocate launch parameter buffer memory on device
launch_params.alloc_to_device(info.cpu_threads);
}
~OptiXDevice()
{
// Stop processing any more tasks
task_pool.cancel();
// Make CUDA context current
const CUDAContextScope scope(cuContext);
free_bvh_memory_delayed();
sbt_data.free();
texture_info.free();
launch_params.free();
denoiser_state.free();
// Unload modules
if (optix_module != NULL)
optixModuleDestroy(optix_module);
for (unsigned int i = 0; i < 2; ++i)
if (builtin_modules[i] != NULL)
optixModuleDestroy(builtin_modules[i]);
for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
if (pipelines[i] != NULL)
optixPipelineDestroy(pipelines[i]);
// Destroy launch streams
for (CUstream stream : cuda_stream)
cuStreamDestroy(stream);
if (denoiser != NULL)
optixDenoiserDestroy(denoiser);
optixDeviceContextDestroy(context);
}
private:
bool show_samples() const override
{
// Only show samples if not rendering multiple tiles in parallel
return info.cpu_threads == 1;
}
BVHLayoutMask get_bvh_layout_mask() const override
{
// CUDA kernels are used when doing baking, so we need to build a BVH they can understand too!
if (optix_module == NULL)
return CUDADevice::get_bvh_layout_mask();
// OptiX has its own internal acceleration structure format
return BVH_LAYOUT_OPTIX;
}
string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
bool filter,
bool /*split*/) override
{
// Split kernel is not supported in OptiX
string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
requested_features, filter, false);
// Add OptiX SDK include directory to include paths
const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
if (optix_sdk_path) {
common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
}
// Specialization for shader raytracing
if (requested_features.use_shader_raytrace) {
common_cflags += " --keep-device-functions";
}
else {
common_cflags += " -D __NO_SHADER_RAYTRACE__";
}
return common_cflags;
}
bool load_kernels(const DeviceRequestedFeatures &requested_features) override
{
if (have_error()) {
// Abort early if context creation failed already
return false;
}
// Load CUDA modules because we need some of the utility kernels
if (!CUDADevice::load_kernels(requested_features)) {
return false;
}
// Baking is currently performed using CUDA, so no need to load OptiX kernels
if (requested_features.use_baking) {
return true;
}
const CUDAContextScope scope(cuContext);
// Unload existing OptiX module and pipelines first
if (optix_module != NULL) {
optixModuleDestroy(optix_module);
optix_module = NULL;
}
for (unsigned int i = 0; i < 2; ++i) {
if (builtin_modules[i] != NULL) {
optixModuleDestroy(builtin_modules[i]);
builtin_modules[i] = NULL;
}
}
for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
if (pipelines[i] != NULL) {
optixPipelineDestroy(pipelines[i]);
pipelines[i] = NULL;
}
}
OptixModuleCompileOptions module_options = {};
module_options.maxRegisterCount = 0; // Do not set an explicit register limit
module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
# if OPTIX_ABI_VERSION >= 41
module_options.boundValues = nullptr;
module_options.numBoundValues = 0;
# endif
OptixPipelineCompileOptions pipeline_options = {};
// Default to no motion blur and two-level graph, since it is the fastest option
pipeline_options.usesMotionBlur = false;
pipeline_options.traversableGraphFlags =
OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
pipeline_options.numPayloadValues = 6;
pipeline_options.numAttributeValues = 2; // u, v
pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
# if OPTIX_ABI_VERSION >= 36
pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
if (requested_features.use_hair) {
if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
}
else {
pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
}
}
# endif
// Keep track of whether motion blur is enabled, so that motion can be enabled/disabled in BVH builds
// This is necessary since objects may be reported to have motion if the Vector pass is
// active, but may still need to be rendered without motion blur if that isn't active as well
motion_blur = requested_features.use_object_motion;
if (motion_blur) {
pipeline_options.usesMotionBlur = true;
// Motion blur can insert motion transforms into the traversal graph
// It is no longer a two-level graph then, so need to set flags to allow any configuration
pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
}
{ // Load and compile PTX module with OptiX kernels
string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
"lib/kernel_optix_shader_raytrace.ptx" :
"lib/kernel_optix.ptx");
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
if (!getenv("OPTIX_ROOT_DIR")) {
set_error(
"Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
"the Optix SDK to be able to compile Optix kernels on demand).");
return false;
}
ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
}
if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
return false;
}
check_result_optix_ret(optixModuleCreateFromPTX(context,
&module_options,
&pipeline_options,
ptx_data.data(),
ptx_data.size(),
nullptr,
0,
&optix_module));
}
// Create program groups
OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupOptions group_options = {}; // There are no options currently
group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN].raygen.module = optix_module;
// Ignore branched integrator for now (see "requested_features.use_integrator_branched")
group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
group_descs[PG_MISS].miss.module = optix_module;
group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
if (requested_features.use_hair) {
group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
// Add curve intersection programs
if (requested_features.use_hair_thick) {
// Slower programs for thick hair since that also slows down ribbons.
// Ideally this should not be needed.
group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
}
else {
group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
}
# if OPTIX_ABI_VERSION >= 36
if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
OptixBuiltinISOptions builtin_options = {};
builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
builtin_options.usesMotionBlur = false;
check_result_optix_ret(optixBuiltinISModuleGet(
context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
if (motion_blur) {
builtin_options.usesMotionBlur = true;
check_result_optix_ret(optixBuiltinISModuleGet(
context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
}
}
# endif
}
if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
// Add hit group for local intersections
group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
}
if (requested_features.use_baking) {
group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_BAKE].raygen.module = optix_module;
group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
}
if (requested_features.use_true_displacement) {
group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_DISP].raygen.module = optix_module;
group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
}
if (requested_features.use_background_light) {
group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_BACK].raygen.module = optix_module;
group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
}
// Shader raytracing replaces some functions with direct callables
if (requested_features.use_shader_raytrace) {
group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
"__direct_callable__kernel_volume_shadow";
group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
"__direct_callable__subsurface_scatter_multi_setup";
}
check_result_optix_ret(optixProgramGroupCreate(
context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
// Get program stack sizes
OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
// Set up SBT, which in this case is used only to select between different programs
sbt_data.alloc(NUM_PROGRAM_GROUPS);
memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
sbt_data.copy_to_device(); // Upload SBT to device
// Calculate maximum trace continuation stack size
unsigned int trace_css = stack_size[PG_HITD].cssCH;
// This is based on the maximum of closest-hit and any-hit/intersection programs
trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
# if OPTIX_ABI_VERSION >= 36
trace_css = std::max(trace_css,
stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
trace_css = std::max(trace_css,
stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
# endif
OptixPipelineLinkOptions link_options = {};
link_options.maxTraceDepth = 1;
link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
# if OPTIX_ABI_VERSION < 24
link_options.overrideUsesMotionBlur = motion_blur;
# endif
{ // Create path tracing pipeline
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
pipeline_groups.push_back(groups[PG_RGEN]);
pipeline_groups.push_back(groups[PG_MISS]);
pipeline_groups.push_back(groups[PG_HITD]);
pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
# if OPTIX_ABI_VERSION >= 36
if (motion_blur) {
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
# endif
if (requested_features.use_shader_raytrace) {
pipeline_groups.push_back(groups[PG_CALL + 0]);
pipeline_groups.push_back(groups[PG_CALL + 1]);
pipeline_groups.push_back(groups[PG_CALL + 2]);
}
check_result_optix_ret(optixPipelineCreate(context,
&pipeline_options,
&link_options,
pipeline_groups.data(),
pipeline_groups.size(),
nullptr,
0,
&pipelines[PIP_PATH_TRACE]));
// Combine ray generation and trace continuation stack size
const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
// Max direct callable depth is one of the following, so combine accordingly
// - __raygen__ -> svm_eval_nodes
// - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
// - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
std::max(stack_size[PG_CALL + 1].dssDC,
stack_size[PG_CALL + 2].dssDC);
// Set stack size depending on pipeline options
check_result_optix_ret(
optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
0,
requested_features.use_shader_raytrace ? dss : 0,
css,
motion_blur ? 3 : 2));
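// The last argument above is the maximum traversable graph depth: two levels
// (instance AS -> geometry AS) normally, or three when motion blur inserts motion
// transforms between them.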
}
// Only need to create shader evaluation pipeline if one of these features is used:
const bool use_shader_eval_pipeline = requested_features.use_baking ||
requested_features.use_background_light ||
requested_features.use_true_displacement;
if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
pipeline_groups.push_back(groups[PG_BAKE]);
pipeline_groups.push_back(groups[PG_DISP]);
pipeline_groups.push_back(groups[PG_BACK]);
pipeline_groups.push_back(groups[PG_MISS]);
pipeline_groups.push_back(groups[PG_HITD]);
pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
# if OPTIX_ABI_VERSION >= 36
if (motion_blur) {
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
# endif
if (requested_features.use_shader_raytrace) {
pipeline_groups.push_back(groups[PG_CALL + 0]);
pipeline_groups.push_back(groups[PG_CALL + 1]);
pipeline_groups.push_back(groups[PG_CALL + 2]);
}
check_result_optix_ret(optixPipelineCreate(context,
&pipeline_options,
&link_options,
pipeline_groups.data(),
pipeline_groups.size(),
nullptr,
0,
&pipelines[PIP_SHADER_EVAL]));
// Calculate continuation stack size based on the maximum of all ray generation stack sizes
const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
std::max(stack_size[PG_DISP].cssRG,
stack_size[PG_BACK].cssRG)) +
link_options.maxTraceDepth * trace_css;
const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
std::max(stack_size[PG_CALL + 1].dssDC,
stack_size[PG_CALL + 2].dssDC);
check_result_optix_ret(
optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
0,
requested_features.use_shader_raytrace ? dss : 0,
css,
motion_blur ? 3 : 2));
}
// Clean up program group objects
for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optixProgramGroupDestroy(groups[i]);
}
return true;
}
void thread_run(DeviceTask &task, int thread_index) // Main task entry point
{
if (have_error())
return; // Abort early if there was an error previously
if (task.type == DeviceTask::RENDER) {
if (thread_index != 0) {
// Only execute denoising in a single thread (see also 'task_add')
task.tile_types &= ~RenderTile::DENOISE;
}
RenderTile tile;
while (task.acquire_tile(this, tile, task.tile_types)) {
if (tile.task == RenderTile::PATH_TRACE)
launch_render(task, tile, thread_index);
else if (tile.task == RenderTile::BAKE) {
// Perform baking using CUDA, since it is not currently implemented in OptiX
device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
CUDADevice::render(task, tile, work_tiles);
}
else if (tile.task == RenderTile::DENOISE)
launch_denoise(task, tile);
task.release_tile(tile);
if (task.get_cancel() && !task.need_finish_queue)
break; // User requested cancellation
else if (have_error())
break; // Abort rendering when encountering an error
}
}
else if (task.type == DeviceTask::SHADER) {
// CUDA kernels are used when doing baking
if (optix_module == NULL)
CUDADevice::shader(task);
else
launch_shader_eval(task, thread_index);
}
else if (task.type == DeviceTask::DENOISE_BUFFER) {
// Set up a single tile that covers the whole task and denoise it
RenderTile tile;
tile.x = task.x;
tile.y = task.y;
tile.w = task.w;
tile.h = task.h;
tile.buffer = task.buffer;
tile.num_samples = task.num_samples;
tile.start_sample = task.sample;
tile.offset = task.offset;
tile.stride = task.stride;
tile.buffers = task.buffers;
launch_denoise(task, tile);
}
}
void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
{
assert(thread_index < launch_params.data_size);
// Keep track of total render time of this tile
const scoped_timer timer(&rtile.buffers->render_time);
WorkTile wtile;
wtile.x = rtile.x;
wtile.y = rtile.y;
wtile.w = rtile.w;
wtile.h = rtile.h;
wtile.offset = rtile.offset;
wtile.stride = rtile.stride;
wtile.buffer = (float *)rtile.buffer;
const int end_sample = rtile.start_sample + rtile.num_samples;
// Keep this number reasonable to avoid running into TDRs
int step_samples = (info.display_device ? 8 : 32);
// Offset into launch params buffer so that streams use separate data
device_ptr launch_params_ptr = launch_params.device_pointer +
thread_index * launch_params.data_elements;
const CUDAContextScope scope(cuContext);
for (int sample = rtile.start_sample; sample < end_sample;) {
// Copy work tile information to device
wtile.start_sample = sample;
wtile.num_samples = step_samples;
if (task.adaptive_sampling.use) {
wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
}
wtile.num_samples = min(wtile.num_samples, end_sample - sample);
device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
check_result_cuda(
cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
OptixShaderBindingTable sbt_params = {};
sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
sbt_params.missRecordCount = 1;
sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
# if OPTIX_ABI_VERSION >= 36
sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
# else
sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
# endif
sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
sbt_params.callablesRecordCount = 3;
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
// Launch the ray generation program
check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
cuda_stream[thread_index],
launch_params_ptr,
launch_params.data_elements,
&sbt_params,
// Launch with samples close to each other for better locality
wtile.w * wtile.num_samples,
wtile.h,
1));
// Run the adaptive sampling kernels at selected samples aligned to step samples.
uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
}
// Wait for launch to finish
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
// Update current sample, so it is displayed correctly
sample += wtile.num_samples;
rtile.sample = sample;
// Update task progress after the kernel completed rendering
task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
if (task.get_cancel() && !task.need_finish_queue)
return; // Cancel rendering
}
// Finalize adaptive sampling
if (task.adaptive_sampling.use) {
device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
}
}
bool launch_denoise(DeviceTask &task, RenderTile &rtile)
{
// Update current sample (for display and NLM denoising task)
rtile.sample = rtile.start_sample + rtile.num_samples;
// Make CUDA context current now, since it is used for both denoising tasks
const CUDAContextScope scope(cuContext);
// Choose between OptiX and NLM denoising
if (task.denoising.type == DENOISER_OPTIX) {
// Map neighboring tiles onto this device. The indices are laid out as follows,
// where index 4 is the center tile and index 9 is the target for the result:
// 0 1 2
// 3 4 5
// 6 7 8 9
RenderTileNeighbors neighbors(rtile);
task.map_neighbor_tiles(neighbors, this);
RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
RenderTile &target_tile = neighbors.target;
rtile = center_tile; // Tile may have been modified by mapping code
// Calculate size of the tile to denoise (including overlap)
int4 rect = center_tile.bounds();
// Overlap between tiles has to be at least 64 pixels
// TODO(pmours): Query this value from OptiX
rect = rect_expand(rect, 64);
int4 clip_rect = neighbors.bounds();
rect = rect_clip(rect, clip_rect);
int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
// Calculate byte offsets and strides
int pixel_stride = task.pass_stride * (int)sizeof(float);
int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
const int pass_offset[3] = {
(task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
(task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
(task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
// Start with the current tile pointer offset
int input_stride = pixel_stride;
device_ptr input_ptr = rtile.buffer + pixel_offset;
// Copy tile data into a common buffer if necessary
device_only_memory<float> input(this, "denoiser input", true);
device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
bool contiguous_memory = true;
for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
contiguous_memory = false;
}
}
if (contiguous_memory) {
// Tiles are in contiguous memory, so can just subtract overlap offset
input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
// Stride covers the whole width of the image and not just a single tile
input_stride *= rtile.stride;
}
else {
// Adjacent tiles are in separate memory regions, so need to copy them into a single one
input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
// Start with the new input buffer
input_ptr = input.device_pointer;
// Stride covers the width of the new input buffer, which includes tile width and overlap
input_stride *= rect_size.x;
TileInfo *tile_info = tile_info_mem.alloc(1);
for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
tile_info->offsets[i] = neighbors.tiles[i].offset;
tile_info->strides[i] = neighbors.tiles[i].stride;
tile_info->buffers[i] = neighbors.tiles[i].buffer;
}
tile_info->x[0] = neighbors.tiles[3].x;
tile_info->x[1] = neighbors.tiles[4].x;
tile_info->x[2] = neighbors.tiles[5].x;
tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
tile_info->y[0] = neighbors.tiles[1].y;
tile_info->y[1] = neighbors.tiles[4].y;
tile_info->y[2] = neighbors.tiles[7].y;
tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
tile_info_mem.copy_to_device();
void *args[] = {
&input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
}
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
void *input_args[] = {&input_rgb.device_pointer,
&input_ptr,
&rect_size.x,
&rect_size.y,
&input_stride,
&task.pass_stride,
const_cast<int *>(pass_offset),
&task.denoising.input_passes,
&rtile.sample};
launch_filter_kernel(
"kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
input_ptr = input_rgb.device_pointer;
pixel_stride = 3 * sizeof(float);
input_stride = rect_size.x * pixel_stride;
# endif
const bool recreate_denoiser = (denoiser == NULL) ||
(task.denoising.input_passes != denoiser_input_passes);
if (recreate_denoiser) {
// Destroy existing handle before creating new one
if (denoiser != NULL) {
optixDenoiserDestroy(denoiser);
}
// Create OptiX denoiser handle on demand when it is first used
OptixDenoiserOptions denoiser_options = {};
assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
# if OPTIX_ABI_VERSION >= 47
denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
denoiser_options.guideNormal = task.denoising.input_passes >= 3;
check_result_optix_ret(optixDenoiserCreate(
context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
# else
denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
# if OPTIX_ABI_VERSION < 28
denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
# endif
check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
check_result_optix_ret(
optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
# endif
// OptiX denoiser handle was created with the requested number of input passes
denoiser_input_passes = task.denoising.input_passes;
}
OptixDenoiserSizes sizes = {};
check_result_optix_ret(
optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
# if OPTIX_ABI_VERSION < 28
const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
# else
const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
# endif
const size_t scratch_offset = sizes.stateSizeInBytes;
// Allocate denoiser state if tile size has changed since last setup
if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
denoiser_state.data_height != rect_size.y)) {
denoiser_state.alloc_to_device(scratch_offset + scratch_size);
// Initialize denoiser state for the current tile size
check_result_optix_ret(optixDenoiserSetup(denoiser,
0,
rect_size.x,
rect_size.y,
denoiser_state.device_pointer,
scratch_offset,
denoiser_state.device_pointer + scratch_offset,
scratch_size));
denoiser_state.data_width = rect_size.x;
denoiser_state.data_height = rect_size.y;
}
// Set up input and output layer information
OptixImage2D input_layers[3] = {};
OptixImage2D output_layers[1] = {};
for (int i = 0; i < 3; ++i) {
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
# else
input_layers[i].data = input_ptr + pass_offset[i];
# endif
input_layers[i].width = rect_size.x;
input_layers[i].height = rect_size.y;
input_layers[i].rowStrideInBytes = input_stride;
input_layers[i].pixelStrideInBytes = pixel_stride;
input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
}
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
output_layers[0].data = input_ptr;
output_layers[0].width = rect_size.x;
output_layers[0].height = rect_size.y;
output_layers[0].rowStrideInBytes = input_stride;
output_layers[0].pixelStrideInBytes = pixel_stride;
int2 output_offset = overlap_offset;
overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
# else
output_layers[0].data = target_tile.buffer + pixel_offset;
output_layers[0].width = target_tile.w;
output_layers[0].height = target_tile.h;
output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
output_layers[0].pixelStrideInBytes = pixel_stride;
# endif
output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
# if OPTIX_ABI_VERSION >= 47
OptixDenoiserLayer image_layers = {};
image_layers.input = input_layers[0];
image_layers.output = output_layers[0];
OptixDenoiserGuideLayer guide_layers = {};
guide_layers.albedo = input_layers[1];
guide_layers.normal = input_layers[2];
# endif
// Finally run denoising
OptixDenoiserParams params = {}; // All parameters are disabled/zero
# if OPTIX_ABI_VERSION >= 47
check_result_optix_ret(optixDenoiserInvoke(denoiser,
NULL,
&params,
denoiser_state.device_pointer,
scratch_offset,
&guide_layers,
&image_layers,
1,
overlap_offset.x,
overlap_offset.y,
denoiser_state.device_pointer + scratch_offset,
scratch_size));
# else
check_result_optix_ret(optixDenoiserInvoke(denoiser,
NULL,
&params,
denoiser_state.device_pointer,
scratch_offset,
input_layers,
task.denoising.input_passes,
overlap_offset.x,
overlap_offset.y,
output_layers,
denoiser_state.device_pointer + scratch_offset,
scratch_size));
# endif
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
void *output_args[] = {&input_ptr,
&target_tile.buffer,
&output_offset.x,
&output_offset.y,
&rect_size.x,
&rect_size.y,
&target_tile.x,
&target_tile.y,
&target_tile.w,
&target_tile.h,
&target_tile.offset,
&target_tile.stride,
&task.pass_stride,
&rtile.sample};
launch_filter_kernel(
"kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
# endif
check_result_cuda_ret(cuStreamSynchronize(0));
task.unmap_neighbor_tiles(neighbors, this);
}
else {
// Run CUDA denoising kernels
DenoisingTask denoising(this, task);
CUDADevice::denoise(rtile, denoising);
}
// Update task progress after the denoiser completed processing
task.update_progress(&rtile, rtile.w * rtile.h);
return true;
}
void launch_shader_eval(DeviceTask &task, int thread_index)
{
unsigned int rgen_index = PG_BACK;
if (task.shader_eval_type >= SHADER_EVAL_BAKE)
rgen_index = PG_BAKE;
if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
rgen_index = PG_DISP;
const CUDAContextScope scope(cuContext);
device_ptr launch_params_ptr = launch_params.device_pointer +
thread_index * launch_params.data_elements;
for (int sample = 0; sample < task.num_samples; ++sample) {
ShaderParams params;
params.input = (uint4 *)task.shader_input;
params.output = (float4 *)task.shader_output;
params.type = task.shader_eval_type;
params.filter = task.shader_filter;
params.sx = task.shader_x;
params.offset = task.offset;
params.sample = sample;
check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
&params,
sizeof(params),
cuda_stream[thread_index]));
OptixShaderBindingTable sbt_params = {};
sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
sbt_params.missRecordCount = 1;
sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
# if OPTIX_ABI_VERSION >= 36
sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
# else
sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
# endif
sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
sbt_params.callablesRecordCount = 3;
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
cuda_stream[thread_index],
launch_params_ptr,
launch_params.data_elements,
&sbt_params,
task.shader_w,
1,
1));
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
task.update_progress(NULL);
}
}
bool build_optix_bvh(BVHOptiX *bvh,
OptixBuildOperation operation,
const OptixBuildInput &build_input,
uint16_t num_motion_steps)
{
/* Allocate and build acceleration structures only one at a time, to prevent parallel builds
* from running out of memory (since both original and compacted acceleration structure memory
* may be allocated at the same time for the duration of this function). The builds would
* otherwise happen on the same CUDA stream anyway. */
static thread_mutex mutex;
thread_scoped_lock lock(mutex);
const CUDAContextScope scope(cuContext);
const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
// Compute memory usage
OptixAccelBufferSizes sizes = {};
OptixAccelBuildOptions options = {};
options.operation = operation;
if (use_fast_trace_bvh) {
VLOG(2) << "Using fast to trace OptiX BVH";
options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
}
else {
VLOG(2) << "Using fast to update OptiX BVH";
options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
}
options.motionOptions.numKeys = num_motion_steps;
options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
options.motionOptions.timeBegin = 0.0f;
options.motionOptions.timeEnd = 1.0f;
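// The vanish flags above make the geometry invisible for ray times outside the
// [timeBegin, timeEnd] motion interval.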
check_result_optix_ret(
optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
// Allocate required output buffers
device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
if (!temp_mem.device_pointer)
return false; // Make sure temporary memory allocation succeeded
// Acceleration structure memory has to be allocated on the device (not allowed to be on host)
device_only_memory<char> &out_data = bvh->as_data;
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
assert(out_data.device == this);
out_data.alloc_to_device(sizes.outputSizeInBytes);
if (!out_data.device_pointer)
return false;
}
else {
assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
}
// Finally build the acceleration structure
OptixAccelEmitDesc compacted_size_prop = {};
compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
// A tiny space was allocated for this property at the end of the temporary buffer above
// Make sure this pointer is 8-byte aligned
compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
OptixTraversableHandle out_handle = 0;
check_result_optix_ret(optixAccelBuild(context,
NULL,
&options,
&build_input,
1,
temp_mem.device_pointer,
sizes.tempSizeInBytes,
out_data.device_pointer,
sizes.outputSizeInBytes,
&out_handle,
use_fast_trace_bvh ? &compacted_size_prop : NULL,
use_fast_trace_bvh ? 1 : 0));
bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for all operations to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
// Compact acceleration structure to save memory (only if using fast trace as the
// OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case).
if (use_fast_trace_bvh) {
uint64_t compacted_size = sizes.outputSizeInBytes;
check_result_cuda_ret(
cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
// Temporary memory is no longer needed, so free it now to make space
temp_mem.free();
// There is no point compacting if the size does not change
if (compacted_size < sizes.outputSizeInBytes) {
device_only_memory<char> compacted_data(this, "optix compacted as", false);
compacted_data.alloc_to_device(compacted_size);
if (!compacted_data.device_pointer)
// Do not compact if memory allocation for compacted acceleration structure fails
// Can just use the uncompacted one then, so succeed here regardless
return true;
check_result_optix_ret(optixAccelCompact(context,
NULL,
out_handle,
compacted_data.device_pointer,
compacted_size,
&out_handle));
bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for compaction to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
std::swap(out_data.device_size, compacted_data.device_size);
std::swap(out_data.device_pointer, compacted_data.device_pointer);
// Original acceleration structure memory is freed when 'compacted_data' goes out of scope
}
}
return true;
}
void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
/* For baking CUDA is used, build appropriate BVH for that. */
Device::build_bvh(bvh, progress, refit);
return;
}
const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
free_bvh_memory_delayed();
BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
progress.set_substatus("Building OptiX acceleration structure");
if (!bvh->params.top_level) {
assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
/* Refit is only possible when using fast to trace BVH (because AS is built with
* OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
if (refit && !use_fast_trace_bvh) {
assert(bvh_optix->traversable_handle != 0);
operation = OPTIX_BUILD_OPERATION_UPDATE;
}
else {
bvh_optix->as_data.free();
bvh_optix->traversable_handle = 0;
}
// Build bottom level acceleration structures (BLAS)
Geometry *const geom = bvh->geometry[0];
if (geom->geometry_type == Geometry::HAIR) {
// Build BLAS for curve primitives
Hair *const hair = static_cast<Hair *const>(geom);
if (hair->num_curves() == 0) {
return;
}
const size_t num_segments = hair->num_segments();
size_t num_motion_steps = 1;
Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
num_motion_steps = hair->get_motion_steps();
}
device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
# if OPTIX_ABI_VERSION >= 36
device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
// Four control points for each curve segment
const size_t num_vertices = num_segments * 4;
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
index_data.alloc(num_segments);
vertex_data.alloc(num_vertices * num_motion_steps);
}
else
# endif
aabb_data.alloc(num_segments * num_motion_steps);
// Get AABBs for each motion step
for (size_t step = 0; step < num_motion_steps; ++step) {
// The center step for motion vertices is not stored in the attribute
const float3 *keys = hair->get_curve_keys().data();
size_t center_step = (num_motion_steps - 1) / 2;
if (step != center_step) {
size_t attr_offset = (step > center_step) ? step - 1 : step;
// Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
}
for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
const Hair::Curve curve = hair->get_curve(j);
# if OPTIX_ABI_VERSION >= 36
const array<float> &curve_radius = hair->get_curve_radius();
# endif
for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
# if OPTIX_ABI_VERSION >= 36
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
int k0 = curve.first_key + segment;
int k1 = k0 + 1;
int ka = max(k0 - 1, curve.first_key);
int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
const float4 pw = make_float4(
curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
// Convert Catmull-Rom data to B-spline
static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
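// The four rows above form the Catmull-Rom to cubic B-spline basis conversion
// matrix; the dot products below apply it per component (x, y, z and radius in w).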
index_data[i] = i * 4;
float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
v[0] = make_float4(
dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
v[1] = make_float4(
dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
v[2] = make_float4(
dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
v[3] = make_float4(
dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
}
else
# endif
{
BoundBox bounds = BoundBox::empty;
curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
const size_t index = step * num_segments + i;
aabb_data[index].minX = bounds.min.x;
aabb_data[index].minY = bounds.min.y;
aabb_data[index].minZ = bounds.min.z;
aabb_data[index].maxX = bounds.max.x;
aabb_data[index].maxY = bounds.max.y;
aabb_data[index].maxZ = bounds.max.z;
}
}
}
}
// Upload AABB data to GPU
aabb_data.copy_to_device();
# if OPTIX_ABI_VERSION >= 36
index_data.copy_to_device();
vertex_data.copy_to_device();
# endif
vector<device_ptr> aabb_ptrs;
aabb_ptrs.reserve(num_motion_steps);
# if OPTIX_ABI_VERSION >= 36
vector<device_ptr> width_ptrs;
vector<device_ptr> vertex_ptrs;
width_ptrs.reserve(num_motion_steps);
vertex_ptrs.reserve(num_motion_steps);
# endif
for (size_t step = 0; step < num_motion_steps; ++step) {
aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
# if OPTIX_ABI_VERSION >= 36
const device_ptr base_ptr = vertex_data.device_pointer +
step * num_vertices * sizeof(float4);
width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
vertex_ptrs.push_back(base_ptr);
# endif
}
// Force a single any-hit call, so shadow record-all behavior works correctly
unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
OptixBuildInput build_input = {};
# if OPTIX_ABI_VERSION >= 36
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
build_input.curveArray.numPrimitives = num_segments;
build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
build_input.curveArray.numVertices = num_vertices;
build_input.curveArray.vertexStrideInBytes = sizeof(float4);
build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
build_input.curveArray.widthStrideInBytes = sizeof(float4);
build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
build_input.curveArray.indexStrideInBytes = sizeof(int);
build_input.curveArray.flag = build_flags;
build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
}
else
# endif
{
// Disable visibility test any-hit program, since it is already checked during
// intersection. Those trace calls that require anyhit can force it with a ray flag.
build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
# if OPTIX_ABI_VERSION < 23
build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
build_input.aabbArray.numPrimitives = num_segments;
build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
build_input.aabbArray.flags = &build_flags;
build_input.aabbArray.numSbtRecords = 1;
build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
# else
build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
build_input.customPrimitiveArray.numPrimitives = num_segments;
build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
build_input.customPrimitiveArray.flags = &build_flags;
build_input.customPrimitiveArray.numSbtRecords = 1;
build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
# endif
}
if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
progress.set_error("Failed to build OptiX acceleration structure");
}
}
else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
// Build BLAS for triangle primitives
Mesh *const mesh = static_cast<Mesh *const>(geom);
if (mesh->num_triangles() == 0) {
return;
}
const size_t num_verts = mesh->get_verts().size();
size_t num_motion_steps = 1;
Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
num_motion_steps = mesh->get_motion_steps();
}
device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
index_data.alloc(mesh->get_triangles().size());
memcpy(index_data.data(),
mesh->get_triangles().data(),
mesh->get_triangles().size() * sizeof(int));
device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
vertex_data.alloc(num_verts * num_motion_steps);
for (size_t step = 0; step < num_motion_steps; ++step) {
const float3 *verts = mesh->get_verts().data();
size_t center_step = (num_motion_steps - 1) / 2;
// The center step for motion vertices is not stored in the attribute
if (step != center_step) {
verts = motion_keys->data_float3() +
(step > center_step ? step - 1 : step) * num_verts;
}
memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
}
// Upload triangle data to GPU
index_data.copy_to_device();
vertex_data.copy_to_device();
vector<device_ptr> vertex_ptrs;
vertex_ptrs.reserve(num_motion_steps);
for (size_t step = 0; step < num_motion_steps; ++step) {
vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
}
// Force a single any-hit call, so shadow record-all behavior works correctly
unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
OptixBuildInput build_input = {};
build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
build_input.triangleArray.numVertices = num_verts;
build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
build_input.triangleArray.indexBuffer = index_data.device_pointer;
build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
build_input.triangleArray.flags = &build_flags;
// The SBT does not store per primitive data since Cycles already allocates separate
// buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
// one and rely on that having the same meaning in this case.
build_input.triangleArray.numSbtRecords = 1;
build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
progress.set_error("Failed to build OptiX acceleration structure");
}
}
}
else {
unsigned int num_instances = 0;
unsigned int max_num_instances = 0xFFFFFFFF;
bvh_optix->as_data.free();
bvh_optix->traversable_handle = 0;
bvh_optix->motion_transform_data.free();
optixDeviceContextGetProperty(context,
OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
&max_num_instances,
sizeof(max_num_instances));
// Do not count first bit, which is used to distinguish instanced and non-instanced objects
max_num_instances >>= 1;
if (bvh->objects.size() > max_num_instances) {
progress.set_error(
"Failed to build OptiX acceleration structure because there are too many instances");
return;
}
// Fill instance descriptions
# if OPTIX_ABI_VERSION < 41
device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
aabbs.alloc(bvh->objects.size());
# endif
device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
instances.alloc(bvh->objects.size());
// Calculate total motion transform size and allocate memory for them
size_t motion_transform_offset = 0;
if (motion_blur) {
size_t total_motion_transform_size = 0;
for (Object *const ob : bvh->objects) {
if (ob->is_traceable() && ob->use_motion()) {
total_motion_transform_size = align_up(total_motion_transform_size,
OPTIX_TRANSFORM_BYTE_ALIGNMENT);
const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
total_motion_transform_size = total_motion_transform_size +
sizeof(OptixSRTMotionTransform) +
motion_keys * sizeof(OptixSRTData);
}
}
assert(bvh_optix->motion_transform_data.device == this);
bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
}
for (Object *ob : bvh->objects) {
// Skip non-traceable objects
if (!ob->is_traceable())
continue;
BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
OptixTraversableHandle handle = blas->traversable_handle;
# if OPTIX_ABI_VERSION < 41
OptixAabb &aabb = aabbs[num_instances];
aabb.minX = ob->bounds.min.x;
aabb.minY = ob->bounds.min.y;
aabb.minZ = ob->bounds.min.z;
aabb.maxX = ob->bounds.max.x;
aabb.maxY = ob->bounds.max.y;
aabb.maxZ = ob->bounds.max.z;
# endif
OptixInstance &instance = instances[num_instances++];
memset(&instance, 0, sizeof(instance));
// Clear transform to identity matrix
instance.transform[0] = 1.0f;
instance.transform[5] = 1.0f;
instance.transform[10] = 1.0f;
// Set user instance ID to object index (but leave low bit blank)
instance.instanceId = ob->get_device_index() << 1;
// Have to have at least one bit in the mask, or else instance would always be culled
instance.visibilityMask = 1;
if (ob->get_geometry()->has_volume) {
// Volumes have a special bit set in the visibility mask so a trace can mask only volumes
instance.visibilityMask |= 2;
}
if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
// The same applies to curves (so they can be skipped in local trace calls)
instance.visibilityMask |= 4;
# if OPTIX_ABI_VERSION >= 36
if (motion_blur && ob->get_geometry()->has_motion_blur() &&
DebugFlags().optix.curves_api &&
static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
// Select between motion blur and non-motion blur built-in intersection module
instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
}
# endif
}
// Insert motion traversable if object has motion
if (motion_blur && ob->use_motion()) {
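// The first two motion keys are stored inline in OptixSRTMotionTransform, so this size
// matches the per-object size computed during allocation above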
size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
motion_keys * sizeof(OptixSRTData);
const CUDAContextScope scope(cuContext);
motion_transform_offset = align_up(motion_transform_offset,
OPTIX_TRANSFORM_BYTE_ALIGNMENT);
CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
motion_transform_offset;
motion_transform_offset += motion_transform_size;
// Allocate host-side memory for the motion transform and fill it with transform data
OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
new uint8_t[motion_transform_size]);
motion_transform.child = handle;
motion_transform.motionOptions.numKeys = ob->get_motion().size();
motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
motion_transform.motionOptions.timeBegin = 0.0f;
motion_transform.motionOptions.timeEnd = 1.0f;
OptixSRTData *const srt_data = motion_transform.srtData;
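// Decompose the motion transforms into scale/shear/rotation/translation components,
// which is the representation the SRT motion keys expect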
array<DecomposedTransform> decomp(ob->get_motion().size());
transform_motion_decompose(
decomp.data(), ob->get_motion().data(), ob->get_motion().size());
for (size_t i = 0; i < ob->get_motion().size(); ++i) {
// Scale
srt_data[i].sx = decomp[i].y.w; // scale.x.x
srt_data[i].sy = decomp[i].z.w; // scale.y.y
srt_data[i].sz = decomp[i].w.w; // scale.z.z
// Shear
srt_data[i].a = decomp[i].z.x; // scale.x.y
srt_data[i].b = decomp[i].z.y; // scale.x.z
srt_data[i].c = decomp[i].w.x; // scale.y.z
assert(decomp[i].z.z == 0.0f); // scale.y.x
assert(decomp[i].w.y == 0.0f); // scale.z.x
assert(decomp[i].w.z == 0.0f); // scale.z.y
// Pivot point
srt_data[i].pvx = 0.0f;
srt_data[i].pvy = 0.0f;
srt_data[i].pvz = 0.0f;
// Rotation
srt_data[i].qx = decomp[i].x.x;
srt_data[i].qy = decomp[i].x.y;
srt_data[i].qz = decomp[i].x.z;
srt_data[i].qw = decomp[i].x.w;
// Translation
srt_data[i].tx = decomp[i].y.x;
srt_data[i].ty = decomp[i].y.y;
srt_data[i].tz = decomp[i].y.z;
}
// Upload motion transform to GPU
cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
delete[] reinterpret_cast<uint8_t *>(&motion_transform);
// Disable instance transform if object uses motion transform already
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
// Get traversable handle to motion transform
optixConvertPointerToTraversableHandle(context,
motion_transform_gpu,
OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
&instance.traversableHandle);
}
else {
instance.traversableHandle = handle;
if (ob->get_geometry()->is_instanced()) {
// Set transform matrix
memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
}
else {
// Disable instance transform if geometry already has it applied to vertex data
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
// Non-instanced objects read ID from 'prim_object', so distinguish
// them from instanced objects with the low bit set
instance.instanceId |= 1;
}
}
}
// Upload instance descriptions
# if OPTIX_ABI_VERSION < 41
aabbs.resize(num_instances);
aabbs.copy_to_device();
# endif
instances.resize(num_instances);
instances.copy_to_device();
// Build top-level acceleration structure (TLAS)
OptixBuildInput build_input = {};
build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
build_input.instanceArray.aabbs = aabbs.device_pointer;
build_input.instanceArray.numAabbs = num_instances;
# endif
build_input.instanceArray.instances = instances.device_pointer;
build_input.instanceArray.numInstances = num_instances;
if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
progress.set_error("Failed to build OptiX acceleration structure");
}
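// Remember the TLAS handle so it can be patched into the kernel data in 'const_copy_to'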
tlas_handle = bvh_optix->traversable_handle;
}
}
void release_optix_bvh(BVH *bvh) override
{
thread_scoped_lock lock(delayed_free_bvh_mutex);
/* Do a delayed free of the BVH memory, since the geometry holding the BVH might be deleted
 * while the GPU is still rendering. */
BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
bvh_optix->traversable_handle = 0;
}
void free_bvh_memory_delayed()
{
thread_scoped_lock lock(delayed_free_bvh_mutex);
delayed_free_bvh_memory.free_memory();
}
void const_copy_to(const char *name, void *host, size_t size) override
{
// Set constant memory for CUDA module
// TODO(pmours): This is only used for tonemapping (see 'film_convert').
// Could be removed by moving those functions to filter CUDA module.
CUDADevice::const_copy_to(name, host, size);
if (strcmp(name, "__data") == 0) {
assert(size <= sizeof(KernelData));
// Update the traversable handle (it differs per device when rendering with multiple devices)
KernelData *const data = (KernelData *)host;
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
update_launch_params(offsetof(KernelParams, data), host, size);
return;
}
// Update data storage pointers in launch parameters
# define KERNEL_TEX(data_type, tex_name) \
if (strcmp(name, #tex_name) == 0) { \
update_launch_params(offsetof(KernelParams, tex_name), host, size); \
return; \
}
# include "kernel/kernel_textures.h"
# undef KERNEL_TEX
}
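// Write 'data' at the given offset into each copy of the launch parameters (one per thread)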
void update_launch_params(size_t offset, void *data, size_t data_size)
{
const CUDAContextScope scope(cuContext);
for (int i = 0; i < info.cpu_threads; ++i)
check_result_cuda(
cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
data,
data_size));
}
void task_add(DeviceTask &task) override
{
// Upload texture information to device if it has changed since last launch
load_texture_info();
if (task.type == DeviceTask::FILM_CONVERT) {
// Execute in main thread because of OpenGL access
film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
return;
}
if (task.type == DeviceTask::DENOISE_BUFFER) {
// Execute denoising in a single thread (e.g. to avoid race conditions during creation)
task_pool.push([=] {
DeviceTask task_copy = task;
thread_run(task_copy, 0);
});
return;
}
// Split task into smaller ones
list<DeviceTask> tasks;
task.split(tasks, info.cpu_threads);
// Queue tasks in internal task pool
int task_index = 0;
for (DeviceTask &task : tasks) {
task_pool.push([=] {
// Use the task index instead of the thread index, since the number of CUDA streams may
// differ from the number of threads
DeviceTask task_copy = task;
thread_run(task_copy, task_index);
});
task_index++;
}
}
void task_wait() override
{
// Wait for all queued tasks to finish
task_pool.wait_work();
}
void task_cancel() override
{
// Cancel any remaining tasks in the internal pool
task_pool.cancel();
}
};
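// Load the OptiX function table once for all devices (requires CUDA to be initialized first)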
bool device_optix_init()
{
if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
return true; // Already initialized function table
// Need to initialize CUDA as well
if (!device_cuda_init())
return false;
const OptixResult result = optixInit();
if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
"Please update to the latest driver first!";
return false;
}
else if (result != OPTIX_SUCCESS) {
VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
return false;
}
// Loaded OptiX successfully!
return true;
}
void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
{
devices.reserve(cuda_devices.size());
// Simply add all supported CUDA devices as OptiX devices again
for (DeviceInfo info : cuda_devices) {
assert(info.type == DEVICE_CUDA);
int major;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
if (major < 5) {
continue; // Only Maxwell and up are supported by OptiX
}
info.type = DEVICE_OPTIX;
info.id += "_OptiX";
info.denoisers |= DENOISER_OPTIX;
info.has_branched_path = false;
devices.push_back(info);
}
}
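// Create a new OptiX device instance for the given device info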
Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
return new OptiXDevice(info, stats, profiler, background);
}
CCL_NAMESPACE_END
#endif