073bf8bf52
WITH_CYCLES_DEBUG was used for rendering BVH debugging passes. But since we mainly use Embree an OptiX now, this information is no longer important. WITH_CYCLES_DEBUG_NAN will enable additional checks for NaNs and invalid values in the kernel, for Cycles developers. Previously these asserts where enabled in all debug builds, but this is too likely to crash Blender in scenes that render fine regardless of the NaNs. So this is behind a CMake option now. Fixes T90240
1937 lines
79 KiB
C++
1937 lines
79 KiB
C++
/*
|
|
* Copyright 2019, NVIDIA Corporation.
|
|
* Copyright 2019, Blender Foundation.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifdef WITH_OPTIX
|
|
|
|
# include "bvh/bvh.h"
|
|
# include "bvh/bvh_optix.h"
|
|
# include "device/cuda/device_cuda.h"
|
|
# include "device/device_denoising.h"
|
|
# include "device/device_intern.h"
|
|
# include "render/buffers.h"
|
|
# include "render/hair.h"
|
|
# include "render/mesh.h"
|
|
# include "render/object.h"
|
|
# include "render/scene.h"
|
|
# include "util/util_debug.h"
|
|
# include "util/util_logging.h"
|
|
# include "util/util_md5.h"
|
|
# include "util/util_path.h"
|
|
# include "util/util_progress.h"
|
|
# include "util/util_time.h"
|
|
|
|
# ifdef WITH_CUDA_DYNLOAD
|
|
# include <cuew.h>
|
|
// Do not use CUDA SDK headers when using CUEW
|
|
# define OPTIX_DONT_INCLUDE_CUDA
|
|
# endif
|
|
# include <optix_function_table_definition.h>
|
|
# include <optix_stubs.h>
|
|
|
|
// TODO(pmours): Disable this once drivers have native support
|
|
# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Make sure this stays in sync with kernel_globals.h */
|
|
struct ShaderParams {
|
|
uint4 *input;
|
|
float4 *output;
|
|
int type;
|
|
int filter;
|
|
int sx;
|
|
int offset;
|
|
int sample;
|
|
};
|
|
struct KernelParams {
|
|
WorkTile tile;
|
|
KernelData data;
|
|
ShaderParams shader;
|
|
# define KERNEL_TEX(type, name) const type *name;
|
|
# include "kernel/kernel_textures.h"
|
|
# undef KERNEL_TEX
|
|
};
|
|
|
|
# define check_result_cuda(stmt) \
|
|
{ \
|
|
CUresult res = stmt; \
|
|
if (res != CUDA_SUCCESS) { \
|
|
const char *name; \
|
|
cuGetErrorName(res, &name); \
|
|
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
|
|
return; \
|
|
} \
|
|
} \
|
|
(void)0
|
|
# define check_result_cuda_ret(stmt) \
|
|
{ \
|
|
CUresult res = stmt; \
|
|
if (res != CUDA_SUCCESS) { \
|
|
const char *name; \
|
|
cuGetErrorName(res, &name); \
|
|
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
|
|
return false; \
|
|
} \
|
|
} \
|
|
(void)0
|
|
|
|
# define check_result_optix(stmt) \
|
|
{ \
|
|
enum OptixResult res = stmt; \
|
|
if (res != OPTIX_SUCCESS) { \
|
|
const char *name = optixGetErrorName(res); \
|
|
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
|
|
return; \
|
|
} \
|
|
} \
|
|
(void)0
|
|
# define check_result_optix_ret(stmt) \
|
|
{ \
|
|
enum OptixResult res = stmt; \
|
|
if (res != OPTIX_SUCCESS) { \
|
|
const char *name = optixGetErrorName(res); \
|
|
set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
|
|
return false; \
|
|
} \
|
|
} \
|
|
(void)0
|
|
|
|
# define launch_filter_kernel(func_name, w, h, args) \
|
|
{ \
|
|
CUfunction func; \
|
|
check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
|
|
check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
|
|
int threads; \
|
|
check_result_cuda_ret( \
|
|
cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
|
|
threads = (int)sqrt((float)threads); \
|
|
int xblocks = ((w) + threads - 1) / threads; \
|
|
int yblocks = ((h) + threads - 1) / threads; \
|
|
check_result_cuda_ret( \
|
|
cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
|
|
} \
|
|
(void)0
|
|
|
|
class OptiXDevice : public CUDADevice {
|
|
|
|
// List of OptiX program groups
|
|
enum {
|
|
PG_RGEN,
|
|
PG_MISS,
|
|
PG_HITD, // Default hit group
|
|
PG_HITS, // __SHADOW_RECORD_ALL__ hit group
|
|
PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
PG_HITD_MOTION,
|
|
PG_HITS_MOTION,
|
|
# endif
|
|
PG_BAKE, // kernel_bake_evaluate
|
|
PG_DISP, // kernel_displace_evaluate
|
|
PG_BACK, // kernel_background_evaluate
|
|
PG_CALL,
|
|
NUM_PROGRAM_GROUPS = PG_CALL + 3
|
|
};
|
|
|
|
// List of OptiX pipelines
|
|
enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
|
|
|
|
// A single shader binding table entry
|
|
struct SbtRecord {
|
|
char header[OPTIX_SBT_RECORD_HEADER_SIZE];
|
|
};
|
|
|
|
// Information stored about CUDA memory allocations
|
|
struct CUDAMem {
|
|
bool free_map_host = false;
|
|
CUarray array = NULL;
|
|
CUtexObject texobject = 0;
|
|
bool use_mapped_host = false;
|
|
};
|
|
|
|
// Helper class to manage current CUDA context
|
|
struct CUDAContextScope {
|
|
CUDAContextScope(CUcontext ctx)
|
|
{
|
|
cuCtxPushCurrent(ctx);
|
|
}
|
|
~CUDAContextScope()
|
|
{
|
|
cuCtxPopCurrent(NULL);
|
|
}
|
|
};
|
|
|
|
// Use a pool with multiple threads to support launches with multiple CUDA streams
|
|
TaskPool task_pool;
|
|
|
|
vector<CUstream> cuda_stream;
|
|
OptixDeviceContext context = NULL;
|
|
|
|
OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
|
|
OptixModule builtin_modules[2] = {};
|
|
OptixPipeline pipelines[NUM_PIPELINES] = {};
|
|
|
|
bool motion_blur = false;
|
|
device_vector<SbtRecord> sbt_data;
|
|
device_only_memory<KernelParams> launch_params;
|
|
OptixTraversableHandle tlas_handle = 0;
|
|
|
|
OptixDenoiser denoiser = NULL;
|
|
device_only_memory<unsigned char> denoiser_state;
|
|
int denoiser_input_passes = 0;
|
|
|
|
vector<device_only_memory<char>> delayed_free_bvh_memory;
|
|
thread_mutex delayed_free_bvh_mutex;
|
|
|
|
public:
|
|
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
|
|
: CUDADevice(info_, stats_, profiler_, background_),
|
|
sbt_data(this, "__sbt", MEM_READ_ONLY),
|
|
launch_params(this, "__params", false),
|
|
denoiser_state(this, "__denoiser_state", true)
|
|
{
|
|
// Store number of CUDA streams in device info
|
|
info.cpu_threads = DebugFlags().optix.cuda_streams;
|
|
|
|
// Make the CUDA context current
|
|
if (!cuContext) {
|
|
return; // Do not initialize if CUDA context creation failed already
|
|
}
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
// Create OptiX context for this device
|
|
OptixDeviceContextOptions options = {};
|
|
# ifdef WITH_CYCLES_LOGGING
|
|
options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
|
|
options.logCallbackFunction =
|
|
[](unsigned int level, const char *, const char *message, void *) {
|
|
switch (level) {
|
|
case 1:
|
|
LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
|
|
break;
|
|
case 2:
|
|
LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
|
|
break;
|
|
case 3:
|
|
LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
|
|
break;
|
|
case 4:
|
|
LOG_IF(INFO, VLOG_IS_ON(1)) << message;
|
|
break;
|
|
}
|
|
};
|
|
# endif
|
|
check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
|
|
# ifdef WITH_CYCLES_LOGGING
|
|
check_result_optix(optixDeviceContextSetLogCallback(
|
|
context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
|
|
# endif
|
|
|
|
// Create launch streams
|
|
cuda_stream.resize(info.cpu_threads);
|
|
for (int i = 0; i < info.cpu_threads; ++i)
|
|
check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
|
|
|
|
// Fix weird compiler bug that assigns wrong size
|
|
launch_params.data_elements = sizeof(KernelParams);
|
|
// Allocate launch parameter buffer memory on device
|
|
launch_params.alloc_to_device(info.cpu_threads);
|
|
}
|
|
~OptiXDevice()
|
|
{
|
|
// Stop processing any more tasks
|
|
task_pool.cancel();
|
|
|
|
// Make CUDA context current
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
free_bvh_memory_delayed();
|
|
|
|
sbt_data.free();
|
|
texture_info.free();
|
|
launch_params.free();
|
|
denoiser_state.free();
|
|
|
|
// Unload modules
|
|
if (optix_module != NULL)
|
|
optixModuleDestroy(optix_module);
|
|
for (unsigned int i = 0; i < 2; ++i)
|
|
if (builtin_modules[i] != NULL)
|
|
optixModuleDestroy(builtin_modules[i]);
|
|
for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
|
|
if (pipelines[i] != NULL)
|
|
optixPipelineDestroy(pipelines[i]);
|
|
|
|
// Destroy launch streams
|
|
for (CUstream stream : cuda_stream)
|
|
cuStreamDestroy(stream);
|
|
|
|
if (denoiser != NULL)
|
|
optixDenoiserDestroy(denoiser);
|
|
|
|
optixDeviceContextDestroy(context);
|
|
}
|
|
|
|
private:
|
|
bool show_samples() const override
|
|
{
|
|
// Only show samples if not rendering multiple tiles in parallel
|
|
return info.cpu_threads == 1;
|
|
}
|
|
|
|
BVHLayoutMask get_bvh_layout_mask() const override
|
|
{
|
|
// CUDA kernels are used when doing baking, so need to build a BVH those can understand too!
|
|
if (optix_module == NULL)
|
|
return CUDADevice::get_bvh_layout_mask();
|
|
|
|
// OptiX has its own internal acceleration structure format
|
|
return BVH_LAYOUT_OPTIX;
|
|
}
|
|
|
|
string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
|
|
bool filter,
|
|
bool /*split*/) override
|
|
{
|
|
// Split kernel is not supported in OptiX
|
|
string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
|
|
requested_features, filter, false);
|
|
|
|
// Add OptiX SDK include directory to include paths
|
|
const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
|
|
if (optix_sdk_path) {
|
|
common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
|
|
}
|
|
|
|
// Specialization for shader raytracing
|
|
if (requested_features.use_shader_raytrace) {
|
|
common_cflags += " --keep-device-functions";
|
|
}
|
|
else {
|
|
common_cflags += " -D __NO_SHADER_RAYTRACE__";
|
|
}
|
|
|
|
return common_cflags;
|
|
}
|
|
|
|
bool load_kernels(const DeviceRequestedFeatures &requested_features) override
|
|
{
|
|
if (have_error()) {
|
|
// Abort early if context creation failed already
|
|
return false;
|
|
}
|
|
|
|
// Load CUDA modules because we need some of the utility kernels
|
|
if (!CUDADevice::load_kernels(requested_features)) {
|
|
return false;
|
|
}
|
|
|
|
// Baking is currently performed using CUDA, so no need to load OptiX kernels
|
|
if (requested_features.use_baking) {
|
|
return true;
|
|
}
|
|
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
// Unload existing OptiX module and pipelines first
|
|
if (optix_module != NULL) {
|
|
optixModuleDestroy(optix_module);
|
|
optix_module = NULL;
|
|
}
|
|
for (unsigned int i = 0; i < 2; ++i) {
|
|
if (builtin_modules[i] != NULL) {
|
|
optixModuleDestroy(builtin_modules[i]);
|
|
builtin_modules[i] = NULL;
|
|
}
|
|
}
|
|
for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
|
|
if (pipelines[i] != NULL) {
|
|
optixPipelineDestroy(pipelines[i]);
|
|
pipelines[i] = NULL;
|
|
}
|
|
}
|
|
|
|
OptixModuleCompileOptions module_options = {};
|
|
module_options.maxRegisterCount = 0; // Do not set an explicit register limit
|
|
module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
|
|
module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
|
|
|
|
# if OPTIX_ABI_VERSION >= 41
|
|
module_options.boundValues = nullptr;
|
|
module_options.numBoundValues = 0;
|
|
# endif
|
|
|
|
OptixPipelineCompileOptions pipeline_options = {};
|
|
// Default to no motion blur and two-level graph, since it is the fastest option
|
|
pipeline_options.usesMotionBlur = false;
|
|
pipeline_options.traversableGraphFlags =
|
|
OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
|
|
pipeline_options.numPayloadValues = 6;
|
|
pipeline_options.numAttributeValues = 2; // u, v
|
|
pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
|
|
pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
|
|
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
|
|
if (requested_features.use_hair) {
|
|
if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
|
|
pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
|
|
}
|
|
else {
|
|
pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
|
|
}
|
|
}
|
|
# endif
|
|
|
|
// Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
|
|
// This is necessary since objects may be reported to have motion if the Vector pass is
|
|
// active, but may still need to be rendered without motion blur if that isn't active as well
|
|
motion_blur = requested_features.use_object_motion;
|
|
|
|
if (motion_blur) {
|
|
pipeline_options.usesMotionBlur = true;
|
|
// Motion blur can insert motion transforms into the traversal graph
|
|
// It is no longer a two-level graph then, so need to set flags to allow any configuration
|
|
pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
|
|
}
|
|
|
|
{ // Load and compile PTX module with OptiX kernels
|
|
string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
|
|
"lib/kernel_optix_shader_raytrace.ptx" :
|
|
"lib/kernel_optix.ptx");
|
|
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
|
|
if (!getenv("OPTIX_ROOT_DIR")) {
|
|
set_error(
|
|
"Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
|
|
"the Optix SDK to be able to compile Optix kernels on demand).");
|
|
return false;
|
|
}
|
|
ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
|
|
}
|
|
if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
|
|
set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
|
|
return false;
|
|
}
|
|
|
|
check_result_optix_ret(optixModuleCreateFromPTX(context,
|
|
&module_options,
|
|
&pipeline_options,
|
|
ptx_data.data(),
|
|
ptx_data.size(),
|
|
nullptr,
|
|
0,
|
|
&optix_module));
|
|
}
|
|
|
|
// Create program groups
|
|
OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
|
|
OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
|
|
OptixProgramGroupOptions group_options = {}; // There are no options currently
|
|
group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
|
|
group_descs[PG_RGEN].raygen.module = optix_module;
|
|
// Ignore branched integrator for now (see "requested_features.use_integrator_branched")
|
|
group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
|
|
group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
|
|
group_descs[PG_MISS].miss.module = optix_module;
|
|
group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
|
|
group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
|
|
group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
|
|
group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
|
|
group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
|
|
group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
|
|
group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
|
|
group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
|
|
group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
|
|
|
|
if (requested_features.use_hair) {
|
|
group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
|
|
group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
|
|
|
|
// Add curve intersection programs
|
|
if (requested_features.use_hair_thick) {
|
|
// Slower programs for thick hair since that also slows down ribbons.
|
|
// Ideally this should not be needed.
|
|
group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
|
|
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
|
|
}
|
|
else {
|
|
group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
|
|
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
|
|
}
|
|
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
|
|
OptixBuiltinISOptions builtin_options = {};
|
|
builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
|
|
builtin_options.usesMotionBlur = false;
|
|
|
|
check_result_optix_ret(optixBuiltinISModuleGet(
|
|
context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
|
|
|
|
group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
|
|
group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
|
|
group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
|
|
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
|
|
|
|
if (motion_blur) {
|
|
builtin_options.usesMotionBlur = true;
|
|
|
|
check_result_optix_ret(optixBuiltinISModuleGet(
|
|
context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
|
|
|
|
group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
|
|
group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
|
|
group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
|
|
group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
|
|
}
|
|
}
|
|
# endif
|
|
}
|
|
|
|
if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
|
|
// Add hit group for local intersections
|
|
group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
|
|
group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
|
|
group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
|
|
}
|
|
|
|
if (requested_features.use_baking) {
|
|
group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
|
|
group_descs[PG_BAKE].raygen.module = optix_module;
|
|
group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
|
|
}
|
|
|
|
if (requested_features.use_true_displacement) {
|
|
group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
|
|
group_descs[PG_DISP].raygen.module = optix_module;
|
|
group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
|
|
}
|
|
|
|
if (requested_features.use_background_light) {
|
|
group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
|
|
group_descs[PG_BACK].raygen.module = optix_module;
|
|
group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
|
|
}
|
|
|
|
// Shader raytracing replaces some functions with direct callables
|
|
if (requested_features.use_shader_raytrace) {
|
|
group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
|
|
group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
|
|
group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
|
|
group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
|
|
group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
|
|
group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
|
|
"__direct_callable__kernel_volume_shadow";
|
|
group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
|
|
group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
|
|
group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
|
|
"__direct_callable__subsurface_scatter_multi_setup";
|
|
}
|
|
|
|
check_result_optix_ret(optixProgramGroupCreate(
|
|
context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
|
|
|
|
// Get program stack sizes
|
|
OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
|
|
// Set up SBT, which in this case is used only to select between different programs
|
|
sbt_data.alloc(NUM_PROGRAM_GROUPS);
|
|
memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
|
|
for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
|
|
check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
|
|
check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
|
|
}
|
|
sbt_data.copy_to_device(); // Upload SBT to device
|
|
|
|
// Calculate maximum trace continuation stack size
|
|
unsigned int trace_css = stack_size[PG_HITD].cssCH;
|
|
// This is based on the maximum of closest-hit and any-hit/intersection programs
|
|
trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
|
|
trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
|
|
trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
trace_css = std::max(trace_css,
|
|
stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
|
|
trace_css = std::max(trace_css,
|
|
stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
|
|
# endif
|
|
|
|
OptixPipelineLinkOptions link_options = {};
|
|
link_options.maxTraceDepth = 1;
|
|
link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
|
|
# if OPTIX_ABI_VERSION < 24
|
|
link_options.overrideUsesMotionBlur = motion_blur;
|
|
# endif
|
|
|
|
{ // Create path tracing pipeline
|
|
vector<OptixProgramGroup> pipeline_groups;
|
|
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
|
|
pipeline_groups.push_back(groups[PG_RGEN]);
|
|
pipeline_groups.push_back(groups[PG_MISS]);
|
|
pipeline_groups.push_back(groups[PG_HITD]);
|
|
pipeline_groups.push_back(groups[PG_HITS]);
|
|
pipeline_groups.push_back(groups[PG_HITL]);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
if (motion_blur) {
|
|
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
|
|
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
|
|
}
|
|
# endif
|
|
if (requested_features.use_shader_raytrace) {
|
|
pipeline_groups.push_back(groups[PG_CALL + 0]);
|
|
pipeline_groups.push_back(groups[PG_CALL + 1]);
|
|
pipeline_groups.push_back(groups[PG_CALL + 2]);
|
|
}
|
|
|
|
check_result_optix_ret(optixPipelineCreate(context,
|
|
&pipeline_options,
|
|
&link_options,
|
|
pipeline_groups.data(),
|
|
pipeline_groups.size(),
|
|
nullptr,
|
|
0,
|
|
&pipelines[PIP_PATH_TRACE]));
|
|
|
|
// Combine ray generation and trace continuation stack size
|
|
const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
|
|
// Max direct callable depth is one of the following, so combine accordingly
|
|
// - __raygen__ -> svm_eval_nodes
|
|
// - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
|
|
// - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
|
|
const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
|
|
std::max(stack_size[PG_CALL + 1].dssDC,
|
|
stack_size[PG_CALL + 2].dssDC);
|
|
|
|
// Set stack size depending on pipeline options
|
|
check_result_optix_ret(
|
|
optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
|
|
0,
|
|
requested_features.use_shader_raytrace ? dss : 0,
|
|
css,
|
|
motion_blur ? 3 : 2));
|
|
}
|
|
|
|
// Only need to create shader evaluation pipeline if one of these features is used:
|
|
const bool use_shader_eval_pipeline = requested_features.use_baking ||
|
|
requested_features.use_background_light ||
|
|
requested_features.use_true_displacement;
|
|
|
|
if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
|
|
vector<OptixProgramGroup> pipeline_groups;
|
|
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
|
|
pipeline_groups.push_back(groups[PG_BAKE]);
|
|
pipeline_groups.push_back(groups[PG_DISP]);
|
|
pipeline_groups.push_back(groups[PG_BACK]);
|
|
pipeline_groups.push_back(groups[PG_MISS]);
|
|
pipeline_groups.push_back(groups[PG_HITD]);
|
|
pipeline_groups.push_back(groups[PG_HITS]);
|
|
pipeline_groups.push_back(groups[PG_HITL]);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
if (motion_blur) {
|
|
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
|
|
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
|
|
}
|
|
# endif
|
|
if (requested_features.use_shader_raytrace) {
|
|
pipeline_groups.push_back(groups[PG_CALL + 0]);
|
|
pipeline_groups.push_back(groups[PG_CALL + 1]);
|
|
pipeline_groups.push_back(groups[PG_CALL + 2]);
|
|
}
|
|
|
|
check_result_optix_ret(optixPipelineCreate(context,
|
|
&pipeline_options,
|
|
&link_options,
|
|
pipeline_groups.data(),
|
|
pipeline_groups.size(),
|
|
nullptr,
|
|
0,
|
|
&pipelines[PIP_SHADER_EVAL]));
|
|
|
|
// Calculate continuation stack size based on the maximum of all ray generation stack sizes
|
|
const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
|
|
std::max(stack_size[PG_DISP].cssRG,
|
|
stack_size[PG_BACK].cssRG)) +
|
|
link_options.maxTraceDepth * trace_css;
|
|
const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
|
|
std::max(stack_size[PG_CALL + 1].dssDC,
|
|
stack_size[PG_CALL + 2].dssDC);
|
|
|
|
check_result_optix_ret(
|
|
optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
|
|
0,
|
|
requested_features.use_shader_raytrace ? dss : 0,
|
|
css,
|
|
motion_blur ? 3 : 2));
|
|
}
|
|
|
|
// Clean up program group objects
|
|
for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
|
|
optixProgramGroupDestroy(groups[i]);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void thread_run(DeviceTask &task, int thread_index) // Main task entry point
|
|
{
|
|
if (have_error())
|
|
return; // Abort early if there was an error previously
|
|
|
|
if (task.type == DeviceTask::RENDER) {
|
|
if (thread_index != 0) {
|
|
// Only execute denoising in a single thread (see also 'task_add')
|
|
task.tile_types &= ~RenderTile::DENOISE;
|
|
}
|
|
|
|
RenderTile tile;
|
|
while (task.acquire_tile(this, tile, task.tile_types)) {
|
|
if (tile.task == RenderTile::PATH_TRACE)
|
|
launch_render(task, tile, thread_index);
|
|
else if (tile.task == RenderTile::BAKE) {
|
|
// Perform baking using CUDA, since it is not currently implemented in OptiX
|
|
device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
|
|
CUDADevice::render(task, tile, work_tiles);
|
|
}
|
|
else if (tile.task == RenderTile::DENOISE)
|
|
launch_denoise(task, tile);
|
|
task.release_tile(tile);
|
|
if (task.get_cancel() && !task.need_finish_queue)
|
|
break; // User requested cancellation
|
|
else if (have_error())
|
|
break; // Abort rendering when encountering an error
|
|
}
|
|
}
|
|
else if (task.type == DeviceTask::SHADER) {
|
|
// CUDA kernels are used when doing baking
|
|
if (optix_module == NULL)
|
|
CUDADevice::shader(task);
|
|
else
|
|
launch_shader_eval(task, thread_index);
|
|
}
|
|
else if (task.type == DeviceTask::DENOISE_BUFFER) {
|
|
// Set up a single tile that covers the whole task and denoise it
|
|
RenderTile tile;
|
|
tile.x = task.x;
|
|
tile.y = task.y;
|
|
tile.w = task.w;
|
|
tile.h = task.h;
|
|
tile.buffer = task.buffer;
|
|
tile.num_samples = task.num_samples;
|
|
tile.start_sample = task.sample;
|
|
tile.offset = task.offset;
|
|
tile.stride = task.stride;
|
|
tile.buffers = task.buffers;
|
|
|
|
launch_denoise(task, tile);
|
|
}
|
|
}
|
|
|
|
void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
|
|
{
|
|
assert(thread_index < launch_params.data_size);
|
|
|
|
// Keep track of total render time of this tile
|
|
const scoped_timer timer(&rtile.buffers->render_time);
|
|
|
|
WorkTile wtile;
|
|
wtile.x = rtile.x;
|
|
wtile.y = rtile.y;
|
|
wtile.w = rtile.w;
|
|
wtile.h = rtile.h;
|
|
wtile.offset = rtile.offset;
|
|
wtile.stride = rtile.stride;
|
|
wtile.buffer = (float *)rtile.buffer;
|
|
|
|
const int end_sample = rtile.start_sample + rtile.num_samples;
|
|
// Keep this number reasonable to avoid running into TDRs
|
|
int step_samples = (info.display_device ? 8 : 32);
|
|
|
|
// Offset into launch params buffer so that streams use separate data
|
|
device_ptr launch_params_ptr = launch_params.device_pointer +
|
|
thread_index * launch_params.data_elements;
|
|
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
for (int sample = rtile.start_sample; sample < end_sample;) {
|
|
// Copy work tile information to device
|
|
wtile.start_sample = sample;
|
|
wtile.num_samples = step_samples;
|
|
if (task.adaptive_sampling.use) {
|
|
wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
|
|
}
|
|
wtile.num_samples = min(wtile.num_samples, end_sample - sample);
|
|
device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
|
|
check_result_cuda(
|
|
cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
|
|
|
|
OptixShaderBindingTable sbt_params = {};
|
|
sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
|
|
sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
|
|
sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
|
|
sbt_params.missRecordCount = 1;
|
|
sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
|
|
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
|
|
# else
|
|
sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
|
|
# endif
|
|
sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
|
|
sbt_params.callablesRecordCount = 3;
|
|
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
|
|
|
|
// Launch the ray generation program
|
|
check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
|
|
cuda_stream[thread_index],
|
|
launch_params_ptr,
|
|
launch_params.data_elements,
|
|
&sbt_params,
|
|
// Launch with samples close to each other for better locality
|
|
wtile.w * wtile.num_samples,
|
|
wtile.h,
|
|
1));
|
|
|
|
// Run the adaptive sampling kernels at selected samples aligned to step samples.
|
|
uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
|
|
if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
|
|
adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
|
|
}
|
|
|
|
// Wait for launch to finish
|
|
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
|
|
|
|
// Update current sample, so it is displayed correctly
|
|
sample += wtile.num_samples;
|
|
rtile.sample = sample;
|
|
// Update task progress after the kernel completed rendering
|
|
task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
|
|
|
|
if (task.get_cancel() && !task.need_finish_queue)
|
|
return; // Cancel rendering
|
|
}
|
|
|
|
// Finalize adaptive sampling
|
|
if (task.adaptive_sampling.use) {
|
|
device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
|
|
adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
|
|
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
|
|
task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
|
|
}
|
|
}
|
|
|
|
bool launch_denoise(DeviceTask &task, RenderTile &rtile)
|
|
{
|
|
// Update current sample (for display and NLM denoising task)
|
|
rtile.sample = rtile.start_sample + rtile.num_samples;
|
|
|
|
// Make CUDA context current now, since it is used for both denoising tasks
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
// Choose between OptiX and NLM denoising
|
|
if (task.denoising.type == DENOISER_OPTIX) {
|
|
// Map neighboring tiles onto this device, indices are as following:
|
|
// Where index 4 is the center tile and index 9 is the target for the result.
|
|
// 0 1 2
|
|
// 3 4 5
|
|
// 6 7 8 9
|
|
RenderTileNeighbors neighbors(rtile);
|
|
task.map_neighbor_tiles(neighbors, this);
|
|
RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
|
|
RenderTile &target_tile = neighbors.target;
|
|
rtile = center_tile; // Tile may have been modified by mapping code
|
|
|
|
// Calculate size of the tile to denoise (including overlap)
|
|
int4 rect = center_tile.bounds();
|
|
// Overlap between tiles has to be at least 64 pixels
|
|
// TODO(pmours): Query this value from OptiX
|
|
rect = rect_expand(rect, 64);
|
|
int4 clip_rect = neighbors.bounds();
|
|
rect = rect_clip(rect, clip_rect);
|
|
int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
|
|
int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
|
|
|
|
// Calculate byte offsets and strides
|
|
int pixel_stride = task.pass_stride * (int)sizeof(float);
|
|
int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
|
|
const int pass_offset[3] = {
|
|
(task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
|
|
(task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
|
|
(task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
|
|
|
|
// Start with the current tile pointer offset
|
|
int input_stride = pixel_stride;
|
|
device_ptr input_ptr = rtile.buffer + pixel_offset;
|
|
|
|
// Copy tile data into a common buffer if necessary
|
|
device_only_memory<float> input(this, "denoiser input", true);
|
|
device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
|
|
|
|
bool contiguous_memory = true;
|
|
for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
|
|
if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
|
|
contiguous_memory = false;
|
|
}
|
|
}
|
|
|
|
if (contiguous_memory) {
|
|
// Tiles are in continous memory, so can just subtract overlap offset
|
|
input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
|
|
// Stride covers the whole width of the image and not just a single tile
|
|
input_stride *= rtile.stride;
|
|
}
|
|
else {
|
|
// Adjacent tiles are in separate memory regions, so need to copy them into a single one
|
|
input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
|
|
// Start with the new input buffer
|
|
input_ptr = input.device_pointer;
|
|
// Stride covers the width of the new input buffer, which includes tile width and overlap
|
|
input_stride *= rect_size.x;
|
|
|
|
TileInfo *tile_info = tile_info_mem.alloc(1);
|
|
for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
|
|
tile_info->offsets[i] = neighbors.tiles[i].offset;
|
|
tile_info->strides[i] = neighbors.tiles[i].stride;
|
|
tile_info->buffers[i] = neighbors.tiles[i].buffer;
|
|
}
|
|
tile_info->x[0] = neighbors.tiles[3].x;
|
|
tile_info->x[1] = neighbors.tiles[4].x;
|
|
tile_info->x[2] = neighbors.tiles[5].x;
|
|
tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
|
|
tile_info->y[0] = neighbors.tiles[1].y;
|
|
tile_info->y[1] = neighbors.tiles[4].y;
|
|
tile_info->y[2] = neighbors.tiles[7].y;
|
|
tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
|
|
tile_info_mem.copy_to_device();
|
|
|
|
void *args[] = {
|
|
&input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
|
|
launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
|
|
}
|
|
|
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
|
device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
|
|
input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
|
|
|
|
void *input_args[] = {&input_rgb.device_pointer,
|
|
&input_ptr,
|
|
&rect_size.x,
|
|
&rect_size.y,
|
|
&input_stride,
|
|
&task.pass_stride,
|
|
const_cast<int *>(pass_offset),
|
|
&task.denoising.input_passes,
|
|
&rtile.sample};
|
|
launch_filter_kernel(
|
|
"kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
|
|
|
|
input_ptr = input_rgb.device_pointer;
|
|
pixel_stride = 3 * sizeof(float);
|
|
input_stride = rect_size.x * pixel_stride;
|
|
# endif
|
|
|
|
const bool recreate_denoiser = (denoiser == NULL) ||
|
|
(task.denoising.input_passes != denoiser_input_passes);
|
|
if (recreate_denoiser) {
|
|
// Destroy existing handle before creating new one
|
|
if (denoiser != NULL) {
|
|
optixDenoiserDestroy(denoiser);
|
|
}
|
|
|
|
// Create OptiX denoiser handle on demand when it is first used
|
|
OptixDenoiserOptions denoiser_options = {};
|
|
assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
|
|
# if OPTIX_ABI_VERSION >= 47
|
|
denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
|
|
denoiser_options.guideNormal = task.denoising.input_passes >= 3;
|
|
check_result_optix_ret(optixDenoiserCreate(
|
|
context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
|
|
# else
|
|
denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
|
|
OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
|
|
# if OPTIX_ABI_VERSION < 28
|
|
denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
|
|
# endif
|
|
check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
|
|
check_result_optix_ret(
|
|
optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
|
|
# endif
|
|
|
|
// OptiX denoiser handle was created with the requested number of input passes
|
|
denoiser_input_passes = task.denoising.input_passes;
|
|
}
|
|
|
|
OptixDenoiserSizes sizes = {};
|
|
check_result_optix_ret(
|
|
optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
|
|
|
|
# if OPTIX_ABI_VERSION < 28
|
|
const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
|
|
# else
|
|
const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
|
|
# endif
|
|
const size_t scratch_offset = sizes.stateSizeInBytes;
|
|
|
|
// Allocate denoiser state if tile size has changed since last setup
|
|
if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
|
|
denoiser_state.data_height != rect_size.y)) {
|
|
denoiser_state.alloc_to_device(scratch_offset + scratch_size);
|
|
|
|
// Initialize denoiser state for the current tile size
|
|
check_result_optix_ret(optixDenoiserSetup(denoiser,
|
|
0,
|
|
rect_size.x,
|
|
rect_size.y,
|
|
denoiser_state.device_pointer,
|
|
scratch_offset,
|
|
denoiser_state.device_pointer + scratch_offset,
|
|
scratch_size));
|
|
|
|
denoiser_state.data_width = rect_size.x;
|
|
denoiser_state.data_height = rect_size.y;
|
|
}
|
|
|
|
// Set up input and output layer information
|
|
OptixImage2D input_layers[3] = {};
|
|
OptixImage2D output_layers[1] = {};
|
|
|
|
for (int i = 0; i < 3; ++i) {
|
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
|
input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
|
|
# else
|
|
input_layers[i].data = input_ptr + pass_offset[i];
|
|
# endif
|
|
input_layers[i].width = rect_size.x;
|
|
input_layers[i].height = rect_size.y;
|
|
input_layers[i].rowStrideInBytes = input_stride;
|
|
input_layers[i].pixelStrideInBytes = pixel_stride;
|
|
input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
|
|
}
|
|
|
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
|
output_layers[0].data = input_ptr;
|
|
output_layers[0].width = rect_size.x;
|
|
output_layers[0].height = rect_size.y;
|
|
output_layers[0].rowStrideInBytes = input_stride;
|
|
output_layers[0].pixelStrideInBytes = pixel_stride;
|
|
int2 output_offset = overlap_offset;
|
|
overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
|
|
# else
|
|
output_layers[0].data = target_tile.buffer + pixel_offset;
|
|
output_layers[0].width = target_tile.w;
|
|
output_layers[0].height = target_tile.h;
|
|
output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
|
|
output_layers[0].pixelStrideInBytes = pixel_stride;
|
|
# endif
|
|
output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
|
|
|
|
# if OPTIX_ABI_VERSION >= 47
|
|
OptixDenoiserLayer image_layers = {};
|
|
image_layers.input = input_layers[0];
|
|
image_layers.output = output_layers[0];
|
|
|
|
OptixDenoiserGuideLayer guide_layers = {};
|
|
guide_layers.albedo = input_layers[1];
|
|
guide_layers.normal = input_layers[2];
|
|
# endif
|
|
|
|
// Finally run denonising
|
|
OptixDenoiserParams params = {}; // All parameters are disabled/zero
|
|
# if OPTIX_ABI_VERSION >= 47
|
|
check_result_optix_ret(optixDenoiserInvoke(denoiser,
|
|
NULL,
|
|
¶ms,
|
|
denoiser_state.device_pointer,
|
|
scratch_offset,
|
|
&guide_layers,
|
|
&image_layers,
|
|
1,
|
|
overlap_offset.x,
|
|
overlap_offset.y,
|
|
denoiser_state.device_pointer + scratch_offset,
|
|
scratch_size));
|
|
# else
|
|
check_result_optix_ret(optixDenoiserInvoke(denoiser,
|
|
NULL,
|
|
¶ms,
|
|
denoiser_state.device_pointer,
|
|
scratch_offset,
|
|
input_layers,
|
|
task.denoising.input_passes,
|
|
overlap_offset.x,
|
|
overlap_offset.y,
|
|
output_layers,
|
|
denoiser_state.device_pointer + scratch_offset,
|
|
scratch_size));
|
|
# endif
|
|
|
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
|
void *output_args[] = {&input_ptr,
|
|
&target_tile.buffer,
|
|
&output_offset.x,
|
|
&output_offset.y,
|
|
&rect_size.x,
|
|
&rect_size.y,
|
|
&target_tile.x,
|
|
&target_tile.y,
|
|
&target_tile.w,
|
|
&target_tile.h,
|
|
&target_tile.offset,
|
|
&target_tile.stride,
|
|
&task.pass_stride,
|
|
&rtile.sample};
|
|
launch_filter_kernel(
|
|
"kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
|
|
# endif
|
|
|
|
check_result_cuda_ret(cuStreamSynchronize(0));
|
|
|
|
task.unmap_neighbor_tiles(neighbors, this);
|
|
}
|
|
else {
|
|
// Run CUDA denoising kernels
|
|
DenoisingTask denoising(this, task);
|
|
CUDADevice::denoise(rtile, denoising);
|
|
}
|
|
|
|
// Update task progress after the denoiser completed processing
|
|
task.update_progress(&rtile, rtile.w * rtile.h);
|
|
|
|
return true;
|
|
}
|
|
|
|
void launch_shader_eval(DeviceTask &task, int thread_index)
|
|
{
|
|
unsigned int rgen_index = PG_BACK;
|
|
if (task.shader_eval_type >= SHADER_EVAL_BAKE)
|
|
rgen_index = PG_BAKE;
|
|
if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
|
|
rgen_index = PG_DISP;
|
|
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
device_ptr launch_params_ptr = launch_params.device_pointer +
|
|
thread_index * launch_params.data_elements;
|
|
|
|
for (int sample = 0; sample < task.num_samples; ++sample) {
|
|
ShaderParams params;
|
|
params.input = (uint4 *)task.shader_input;
|
|
params.output = (float4 *)task.shader_output;
|
|
params.type = task.shader_eval_type;
|
|
params.filter = task.shader_filter;
|
|
params.sx = task.shader_x;
|
|
params.offset = task.offset;
|
|
params.sample = sample;
|
|
|
|
check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
|
|
¶ms,
|
|
sizeof(params),
|
|
cuda_stream[thread_index]));
|
|
|
|
OptixShaderBindingTable sbt_params = {};
|
|
sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
|
|
sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
|
|
sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
|
|
sbt_params.missRecordCount = 1;
|
|
sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
|
|
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
|
|
# else
|
|
sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
|
|
# endif
|
|
sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
|
|
sbt_params.callablesRecordCount = 3;
|
|
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
|
|
|
|
check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
|
|
cuda_stream[thread_index],
|
|
launch_params_ptr,
|
|
launch_params.data_elements,
|
|
&sbt_params,
|
|
task.shader_w,
|
|
1,
|
|
1));
|
|
|
|
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
|
|
|
|
task.update_progress(NULL);
|
|
}
|
|
}
|
|
|
|
bool build_optix_bvh(BVHOptiX *bvh,
|
|
OptixBuildOperation operation,
|
|
const OptixBuildInput &build_input,
|
|
uint16_t num_motion_steps)
|
|
{
|
|
/* Allocate and build acceleration structures only one at a time, to prevent parallel builds
|
|
* from running out of memory (since both original and compacted acceleration structure memory
|
|
* may be allocated at the same time for the duration of this function). The builds would
|
|
* otherwise happen on the same CUDA stream anyway. */
|
|
static thread_mutex mutex;
|
|
thread_scoped_lock lock(mutex);
|
|
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
|
|
|
|
// Compute memory usage
|
|
OptixAccelBufferSizes sizes = {};
|
|
OptixAccelBuildOptions options = {};
|
|
options.operation = operation;
|
|
if (use_fast_trace_bvh) {
|
|
VLOG(2) << "Using fast to trace OptiX BVH";
|
|
options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
|
|
}
|
|
else {
|
|
VLOG(2) << "Using fast to update OptiX BVH";
|
|
options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
|
|
}
|
|
|
|
options.motionOptions.numKeys = num_motion_steps;
|
|
options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
|
|
options.motionOptions.timeBegin = 0.0f;
|
|
options.motionOptions.timeEnd = 1.0f;
|
|
|
|
check_result_optix_ret(
|
|
optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
|
|
|
|
// Allocate required output buffers
|
|
device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
|
|
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
|
|
if (!temp_mem.device_pointer)
|
|
return false; // Make sure temporary memory allocation succeeded
|
|
|
|
// Acceleration structure memory has to be allocated on the device (not allowed to be on host)
|
|
device_only_memory<char> &out_data = bvh->as_data;
|
|
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
|
|
assert(out_data.device == this);
|
|
out_data.alloc_to_device(sizes.outputSizeInBytes);
|
|
if (!out_data.device_pointer)
|
|
return false;
|
|
}
|
|
else {
|
|
assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
|
|
}
|
|
|
|
// Finally build the acceleration structure
|
|
OptixAccelEmitDesc compacted_size_prop = {};
|
|
compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
|
|
// A tiny space was allocated for this property at the end of the temporary buffer above
|
|
// Make sure this pointer is 8-byte aligned
|
|
compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
|
|
|
|
OptixTraversableHandle out_handle = 0;
|
|
check_result_optix_ret(optixAccelBuild(context,
|
|
NULL,
|
|
&options,
|
|
&build_input,
|
|
1,
|
|
temp_mem.device_pointer,
|
|
sizes.tempSizeInBytes,
|
|
out_data.device_pointer,
|
|
sizes.outputSizeInBytes,
|
|
&out_handle,
|
|
use_fast_trace_bvh ? &compacted_size_prop : NULL,
|
|
use_fast_trace_bvh ? 1 : 0));
|
|
bvh->traversable_handle = static_cast<uint64_t>(out_handle);
|
|
|
|
// Wait for all operations to finish
|
|
check_result_cuda_ret(cuStreamSynchronize(NULL));
|
|
|
|
// Compact acceleration structure to save memory (only if using fast trace as the
|
|
// OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case).
|
|
if (use_fast_trace_bvh) {
|
|
uint64_t compacted_size = sizes.outputSizeInBytes;
|
|
check_result_cuda_ret(
|
|
cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
|
|
|
|
// Temporary memory is no longer needed, so free it now to make space
|
|
temp_mem.free();
|
|
|
|
// There is no point compacting if the size does not change
|
|
if (compacted_size < sizes.outputSizeInBytes) {
|
|
device_only_memory<char> compacted_data(this, "optix compacted as", false);
|
|
compacted_data.alloc_to_device(compacted_size);
|
|
if (!compacted_data.device_pointer)
|
|
// Do not compact if memory allocation for compacted acceleration structure fails
|
|
// Can just use the uncompacted one then, so succeed here regardless
|
|
return true;
|
|
|
|
check_result_optix_ret(optixAccelCompact(context,
|
|
NULL,
|
|
out_handle,
|
|
compacted_data.device_pointer,
|
|
compacted_size,
|
|
&out_handle));
|
|
bvh->traversable_handle = static_cast<uint64_t>(out_handle);
|
|
|
|
// Wait for compaction to finish
|
|
check_result_cuda_ret(cuStreamSynchronize(NULL));
|
|
|
|
std::swap(out_data.device_size, compacted_data.device_size);
|
|
std::swap(out_data.device_pointer, compacted_data.device_pointer);
|
|
// Original acceleration structure memory is freed when 'compacted_data' goes out of scope
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void build_bvh(BVH *bvh, Progress &progress, bool refit) override
|
|
{
|
|
if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
|
|
/* For baking CUDA is used, build appropriate BVH for that. */
|
|
Device::build_bvh(bvh, progress, refit);
|
|
return;
|
|
}
|
|
|
|
const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
|
|
|
|
free_bvh_memory_delayed();
|
|
|
|
BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
|
|
|
|
progress.set_substatus("Building OptiX acceleration structure");
|
|
|
|
if (!bvh->params.top_level) {
|
|
assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
|
|
|
|
OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
|
|
/* Refit is only possible when using fast to trace BVH (because AS is built with
|
|
* OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
|
|
if (refit && !use_fast_trace_bvh) {
|
|
assert(bvh_optix->traversable_handle != 0);
|
|
operation = OPTIX_BUILD_OPERATION_UPDATE;
|
|
}
|
|
else {
|
|
bvh_optix->as_data.free();
|
|
bvh_optix->traversable_handle = 0;
|
|
}
|
|
|
|
// Build bottom level acceleration structures (BLAS)
|
|
Geometry *const geom = bvh->geometry[0];
|
|
if (geom->geometry_type == Geometry::HAIR) {
|
|
// Build BLAS for curve primitives
|
|
Hair *const hair = static_cast<Hair *const>(geom);
|
|
if (hair->num_curves() == 0) {
|
|
return;
|
|
}
|
|
|
|
const size_t num_segments = hair->num_segments();
|
|
|
|
size_t num_motion_steps = 1;
|
|
Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
|
|
if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
|
|
num_motion_steps = hair->get_motion_steps();
|
|
}
|
|
|
|
device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
|
|
device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
|
|
// Four control points for each curve segment
|
|
const size_t num_vertices = num_segments * 4;
|
|
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
|
|
index_data.alloc(num_segments);
|
|
vertex_data.alloc(num_vertices * num_motion_steps);
|
|
}
|
|
else
|
|
# endif
|
|
aabb_data.alloc(num_segments * num_motion_steps);
|
|
|
|
// Get AABBs for each motion step
|
|
for (size_t step = 0; step < num_motion_steps; ++step) {
|
|
// The center step for motion vertices is not stored in the attribute
|
|
const float3 *keys = hair->get_curve_keys().data();
|
|
size_t center_step = (num_motion_steps - 1) / 2;
|
|
if (step != center_step) {
|
|
size_t attr_offset = (step > center_step) ? step - 1 : step;
|
|
// Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
|
|
keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
|
|
}
|
|
|
|
for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
|
|
const Hair::Curve curve = hair->get_curve(j);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
const array<float> &curve_radius = hair->get_curve_radius();
|
|
# endif
|
|
|
|
for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
|
|
int k0 = curve.first_key + segment;
|
|
int k1 = k0 + 1;
|
|
int ka = max(k0 - 1, curve.first_key);
|
|
int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
|
|
|
|
const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
|
|
const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
|
|
const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
|
|
const float4 pw = make_float4(
|
|
curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
|
|
|
|
// Convert Catmull-Rom data to Bezier spline
|
|
static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
|
|
static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
|
|
static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
|
|
static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
|
|
|
|
index_data[i] = i * 4;
|
|
float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
|
|
v[0] = make_float4(
|
|
dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
|
|
v[1] = make_float4(
|
|
dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
|
|
v[2] = make_float4(
|
|
dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
|
|
v[3] = make_float4(
|
|
dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
|
|
}
|
|
else
|
|
# endif
|
|
{
|
|
BoundBox bounds = BoundBox::empty;
|
|
curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
|
|
|
|
const size_t index = step * num_segments + i;
|
|
aabb_data[index].minX = bounds.min.x;
|
|
aabb_data[index].minY = bounds.min.y;
|
|
aabb_data[index].minZ = bounds.min.z;
|
|
aabb_data[index].maxX = bounds.max.x;
|
|
aabb_data[index].maxY = bounds.max.y;
|
|
aabb_data[index].maxZ = bounds.max.z;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Upload AABB data to GPU
|
|
aabb_data.copy_to_device();
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
index_data.copy_to_device();
|
|
vertex_data.copy_to_device();
|
|
# endif
|
|
|
|
vector<device_ptr> aabb_ptrs;
|
|
aabb_ptrs.reserve(num_motion_steps);
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
vector<device_ptr> width_ptrs;
|
|
vector<device_ptr> vertex_ptrs;
|
|
width_ptrs.reserve(num_motion_steps);
|
|
vertex_ptrs.reserve(num_motion_steps);
|
|
# endif
|
|
for (size_t step = 0; step < num_motion_steps; ++step) {
|
|
aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
const device_ptr base_ptr = vertex_data.device_pointer +
|
|
step * num_vertices * sizeof(float4);
|
|
width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
|
|
vertex_ptrs.push_back(base_ptr);
|
|
# endif
|
|
}
|
|
|
|
// Force a single any-hit call, so shadow record-all behavior works correctly
|
|
unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
|
|
OptixBuildInput build_input = {};
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
|
|
build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
|
|
build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
|
|
build_input.curveArray.numPrimitives = num_segments;
|
|
build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
|
|
build_input.curveArray.numVertices = num_vertices;
|
|
build_input.curveArray.vertexStrideInBytes = sizeof(float4);
|
|
build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
|
|
build_input.curveArray.widthStrideInBytes = sizeof(float4);
|
|
build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
|
|
build_input.curveArray.indexStrideInBytes = sizeof(int);
|
|
build_input.curveArray.flag = build_flags;
|
|
build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
|
|
}
|
|
else
|
|
# endif
|
|
{
|
|
// Disable visibility test any-hit program, since it is already checked during
|
|
// intersection. Those trace calls that require anyhit can force it with a ray flag.
|
|
build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
|
|
|
|
build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
|
|
# if OPTIX_ABI_VERSION < 23
|
|
build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
|
|
build_input.aabbArray.numPrimitives = num_segments;
|
|
build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
|
|
build_input.aabbArray.flags = &build_flags;
|
|
build_input.aabbArray.numSbtRecords = 1;
|
|
build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
|
|
# else
|
|
build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
|
|
build_input.customPrimitiveArray.numPrimitives = num_segments;
|
|
build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
|
|
build_input.customPrimitiveArray.flags = &build_flags;
|
|
build_input.customPrimitiveArray.numSbtRecords = 1;
|
|
build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
|
|
# endif
|
|
}
|
|
|
|
if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
|
|
progress.set_error("Failed to build OptiX acceleration structure");
|
|
}
|
|
}
|
|
else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
|
|
// Build BLAS for triangle primitives
|
|
Mesh *const mesh = static_cast<Mesh *const>(geom);
|
|
if (mesh->num_triangles() == 0) {
|
|
return;
|
|
}
|
|
|
|
const size_t num_verts = mesh->get_verts().size();
|
|
|
|
size_t num_motion_steps = 1;
|
|
Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
|
|
if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
|
|
num_motion_steps = mesh->get_motion_steps();
|
|
}
|
|
|
|
device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
|
|
index_data.alloc(mesh->get_triangles().size());
|
|
memcpy(index_data.data(),
|
|
mesh->get_triangles().data(),
|
|
mesh->get_triangles().size() * sizeof(int));
|
|
device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
|
|
vertex_data.alloc(num_verts * num_motion_steps);
|
|
|
|
for (size_t step = 0; step < num_motion_steps; ++step) {
|
|
const float3 *verts = mesh->get_verts().data();
|
|
|
|
size_t center_step = (num_motion_steps - 1) / 2;
|
|
// The center step for motion vertices is not stored in the attribute
|
|
if (step != center_step) {
|
|
verts = motion_keys->data_float3() +
|
|
(step > center_step ? step - 1 : step) * num_verts;
|
|
}
|
|
|
|
memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
|
|
}
|
|
|
|
// Upload triangle data to GPU
|
|
index_data.copy_to_device();
|
|
vertex_data.copy_to_device();
|
|
|
|
vector<device_ptr> vertex_ptrs;
|
|
vertex_ptrs.reserve(num_motion_steps);
|
|
for (size_t step = 0; step < num_motion_steps; ++step) {
|
|
vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
|
|
}
|
|
|
|
// Force a single any-hit call, so shadow record-all behavior works correctly
|
|
unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
|
|
OptixBuildInput build_input = {};
|
|
build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
|
|
build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
|
|
build_input.triangleArray.numVertices = num_verts;
|
|
build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
|
|
build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
|
|
build_input.triangleArray.indexBuffer = index_data.device_pointer;
|
|
build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
|
|
build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
|
|
build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
|
|
build_input.triangleArray.flags = &build_flags;
|
|
// The SBT does not store per primitive data since Cycles already allocates separate
|
|
// buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
|
|
// one and rely on that having the same meaning in this case.
|
|
build_input.triangleArray.numSbtRecords = 1;
|
|
build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
|
|
|
|
if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
|
|
progress.set_error("Failed to build OptiX acceleration structure");
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
unsigned int num_instances = 0;
|
|
unsigned int max_num_instances = 0xFFFFFFFF;
|
|
|
|
bvh_optix->as_data.free();
|
|
bvh_optix->traversable_handle = 0;
|
|
bvh_optix->motion_transform_data.free();
|
|
|
|
optixDeviceContextGetProperty(context,
|
|
OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
|
|
&max_num_instances,
|
|
sizeof(max_num_instances));
|
|
// Do not count first bit, which is used to distinguish instanced and non-instanced objects
|
|
max_num_instances >>= 1;
|
|
if (bvh->objects.size() > max_num_instances) {
|
|
progress.set_error(
|
|
"Failed to build OptiX acceleration structure because there are too many instances");
|
|
return;
|
|
}
|
|
|
|
// Fill instance descriptions
|
|
# if OPTIX_ABI_VERSION < 41
|
|
device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
|
|
aabbs.alloc(bvh->objects.size());
|
|
# endif
|
|
device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
|
|
instances.alloc(bvh->objects.size());
|
|
|
|
// Calculate total motion transform size and allocate memory for them
|
|
size_t motion_transform_offset = 0;
|
|
if (motion_blur) {
|
|
size_t total_motion_transform_size = 0;
|
|
for (Object *const ob : bvh->objects) {
|
|
if (ob->is_traceable() && ob->use_motion()) {
|
|
total_motion_transform_size = align_up(total_motion_transform_size,
|
|
OPTIX_TRANSFORM_BYTE_ALIGNMENT);
|
|
const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
|
|
total_motion_transform_size = total_motion_transform_size +
|
|
sizeof(OptixSRTMotionTransform) +
|
|
motion_keys * sizeof(OptixSRTData);
|
|
}
|
|
}
|
|
|
|
assert(bvh_optix->motion_transform_data.device == this);
|
|
bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
|
|
}
|
|
|
|
for (Object *ob : bvh->objects) {
|
|
// Skip non-traceable objects
|
|
if (!ob->is_traceable())
|
|
continue;
|
|
|
|
BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
|
|
OptixTraversableHandle handle = blas->traversable_handle;
|
|
|
|
# if OPTIX_ABI_VERSION < 41
|
|
OptixAabb &aabb = aabbs[num_instances];
|
|
aabb.minX = ob->bounds.min.x;
|
|
aabb.minY = ob->bounds.min.y;
|
|
aabb.minZ = ob->bounds.min.z;
|
|
aabb.maxX = ob->bounds.max.x;
|
|
aabb.maxY = ob->bounds.max.y;
|
|
aabb.maxZ = ob->bounds.max.z;
|
|
# endif
|
|
|
|
OptixInstance &instance = instances[num_instances++];
|
|
memset(&instance, 0, sizeof(instance));
|
|
|
|
// Clear transform to identity matrix
|
|
instance.transform[0] = 1.0f;
|
|
instance.transform[5] = 1.0f;
|
|
instance.transform[10] = 1.0f;
|
|
|
|
// Set user instance ID to object index (but leave low bit blank)
|
|
instance.instanceId = ob->get_device_index() << 1;
|
|
|
|
// Have to have at least one bit in the mask, or else instance would always be culled
|
|
instance.visibilityMask = 1;
|
|
|
|
if (ob->get_geometry()->has_volume) {
|
|
// Volumes have a special bit set in the visibility mask so a trace can mask only volumes
|
|
instance.visibilityMask |= 2;
|
|
}
|
|
|
|
if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
|
|
// Same applies to curves (so they can be skipped in local trace calls)
|
|
instance.visibilityMask |= 4;
|
|
|
|
# if OPTIX_ABI_VERSION >= 36
|
|
if (motion_blur && ob->get_geometry()->has_motion_blur() &&
|
|
DebugFlags().optix.curves_api &&
|
|
static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
|
|
// Select between motion blur and non-motion blur built-in intersection module
|
|
instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
|
|
}
|
|
# endif
|
|
}
|
|
|
|
// Insert motion traversable if object has motion
|
|
if (motion_blur && ob->use_motion()) {
|
|
size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
|
|
size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
|
|
motion_keys * sizeof(OptixSRTData);
|
|
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
motion_transform_offset = align_up(motion_transform_offset,
|
|
OPTIX_TRANSFORM_BYTE_ALIGNMENT);
|
|
CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
|
|
motion_transform_offset;
|
|
motion_transform_offset += motion_transform_size;
|
|
|
|
// Allocate host side memory for motion transform and fill it with transform data
|
|
OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
|
|
new uint8_t[motion_transform_size]);
|
|
motion_transform.child = handle;
|
|
motion_transform.motionOptions.numKeys = ob->get_motion().size();
|
|
motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
|
|
motion_transform.motionOptions.timeBegin = 0.0f;
|
|
motion_transform.motionOptions.timeEnd = 1.0f;
|
|
|
|
OptixSRTData *const srt_data = motion_transform.srtData;
|
|
array<DecomposedTransform> decomp(ob->get_motion().size());
|
|
transform_motion_decompose(
|
|
decomp.data(), ob->get_motion().data(), ob->get_motion().size());
|
|
|
|
for (size_t i = 0; i < ob->get_motion().size(); ++i) {
|
|
// Scale
|
|
srt_data[i].sx = decomp[i].y.w; // scale.x.x
|
|
srt_data[i].sy = decomp[i].z.w; // scale.y.y
|
|
srt_data[i].sz = decomp[i].w.w; // scale.z.z
|
|
|
|
// Shear
|
|
srt_data[i].a = decomp[i].z.x; // scale.x.y
|
|
srt_data[i].b = decomp[i].z.y; // scale.x.z
|
|
srt_data[i].c = decomp[i].w.x; // scale.y.z
|
|
assert(decomp[i].z.z == 0.0f); // scale.y.x
|
|
assert(decomp[i].w.y == 0.0f); // scale.z.x
|
|
assert(decomp[i].w.z == 0.0f); // scale.z.y
|
|
|
|
// Pivot point
|
|
srt_data[i].pvx = 0.0f;
|
|
srt_data[i].pvy = 0.0f;
|
|
srt_data[i].pvz = 0.0f;
|
|
|
|
// Rotation
|
|
srt_data[i].qx = decomp[i].x.x;
|
|
srt_data[i].qy = decomp[i].x.y;
|
|
srt_data[i].qz = decomp[i].x.z;
|
|
srt_data[i].qw = decomp[i].x.w;
|
|
|
|
// Translation
|
|
srt_data[i].tx = decomp[i].y.x;
|
|
srt_data[i].ty = decomp[i].y.y;
|
|
srt_data[i].tz = decomp[i].y.z;
|
|
}
|
|
|
|
// Upload motion transform to GPU
|
|
cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
|
|
delete[] reinterpret_cast<uint8_t *>(&motion_transform);
|
|
|
|
// Disable instance transform if object uses motion transform already
|
|
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
|
|
|
|
// Get traversable handle to motion transform
|
|
optixConvertPointerToTraversableHandle(context,
|
|
motion_transform_gpu,
|
|
OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
|
|
&instance.traversableHandle);
|
|
}
|
|
else {
|
|
instance.traversableHandle = handle;
|
|
|
|
if (ob->get_geometry()->is_instanced()) {
|
|
// Set transform matrix
|
|
memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
|
|
}
|
|
else {
|
|
// Disable instance transform if geometry already has it applied to vertex data
|
|
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
|
|
// Non-instanced objects read ID from 'prim_object', so distinguish
|
|
// them from instanced objects with the low bit set
|
|
instance.instanceId |= 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Upload instance descriptions
|
|
# if OPTIX_ABI_VERSION < 41
|
|
aabbs.resize(num_instances);
|
|
aabbs.copy_to_device();
|
|
# endif
|
|
instances.resize(num_instances);
|
|
instances.copy_to_device();
|
|
|
|
// Build top-level acceleration structure (TLAS)
|
|
OptixBuildInput build_input = {};
|
|
build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
|
|
# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
|
|
build_input.instanceArray.aabbs = aabbs.device_pointer;
|
|
build_input.instanceArray.numAabbs = num_instances;
|
|
# endif
|
|
build_input.instanceArray.instances = instances.device_pointer;
|
|
build_input.instanceArray.numInstances = num_instances;
|
|
|
|
if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
|
|
progress.set_error("Failed to build OptiX acceleration structure");
|
|
}
|
|
tlas_handle = bvh_optix->traversable_handle;
|
|
}
|
|
}
|
|
|
|
void release_optix_bvh(BVH *bvh) override
|
|
{
|
|
thread_scoped_lock lock(delayed_free_bvh_mutex);
|
|
/* Do delayed free of BVH memory, since geometry holding BVH might be deleted
|
|
* while GPU is still rendering. */
|
|
BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
|
|
|
|
delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
|
|
delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
|
|
bvh_optix->traversable_handle = 0;
|
|
}
|
|
|
|
void free_bvh_memory_delayed()
|
|
{
|
|
thread_scoped_lock lock(delayed_free_bvh_mutex);
|
|
delayed_free_bvh_memory.free_memory();
|
|
}
|
|
|
|
void const_copy_to(const char *name, void *host, size_t size) override
|
|
{
|
|
// Set constant memory for CUDA module
|
|
// TODO(pmours): This is only used for tonemapping (see 'film_convert').
|
|
// Could be removed by moving those functions to filter CUDA module.
|
|
CUDADevice::const_copy_to(name, host, size);
|
|
|
|
if (strcmp(name, "__data") == 0) {
|
|
assert(size <= sizeof(KernelData));
|
|
|
|
// Update traversable handle (since it is different for each device on multi devices)
|
|
KernelData *const data = (KernelData *)host;
|
|
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
|
|
|
|
update_launch_params(offsetof(KernelParams, data), host, size);
|
|
return;
|
|
}
|
|
|
|
// Update data storage pointers in launch parameters
|
|
# define KERNEL_TEX(data_type, tex_name) \
|
|
if (strcmp(name, #tex_name) == 0) { \
|
|
update_launch_params(offsetof(KernelParams, tex_name), host, size); \
|
|
return; \
|
|
}
|
|
# include "kernel/kernel_textures.h"
|
|
# undef KERNEL_TEX
|
|
}
|
|
|
|
void update_launch_params(size_t offset, void *data, size_t data_size)
|
|
{
|
|
const CUDAContextScope scope(cuContext);
|
|
|
|
for (int i = 0; i < info.cpu_threads; ++i)
|
|
check_result_cuda(
|
|
cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
|
|
data,
|
|
data_size));
|
|
}
|
|
|
|
void task_add(DeviceTask &task) override
|
|
{
|
|
// Upload texture information to device if it has changed since last launch
|
|
load_texture_info();
|
|
|
|
if (task.type == DeviceTask::FILM_CONVERT) {
|
|
// Execute in main thread because of OpenGL access
|
|
film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
|
|
return;
|
|
}
|
|
|
|
if (task.type == DeviceTask::DENOISE_BUFFER) {
|
|
// Execute denoising in a single thread (e.g. to avoid race conditions during creation)
|
|
task_pool.push([=] {
|
|
DeviceTask task_copy = task;
|
|
thread_run(task_copy, 0);
|
|
});
|
|
return;
|
|
}
|
|
|
|
// Split task into smaller ones
|
|
list<DeviceTask> tasks;
|
|
task.split(tasks, info.cpu_threads);
|
|
|
|
// Queue tasks in internal task pool
|
|
int task_index = 0;
|
|
for (DeviceTask &task : tasks) {
|
|
task_pool.push([=] {
|
|
// Using task index parameter instead of thread index, since number of CUDA streams may
|
|
// differ from number of threads
|
|
DeviceTask task_copy = task;
|
|
thread_run(task_copy, task_index);
|
|
});
|
|
task_index++;
|
|
}
|
|
}
|
|
|
|
void task_wait() override
|
|
{
|
|
// Wait for all queued tasks to finish
|
|
task_pool.wait_work();
|
|
}
|
|
|
|
void task_cancel() override
|
|
{
|
|
// Cancel any remaining tasks in the internal pool
|
|
task_pool.cancel();
|
|
}
|
|
};
|
|
|
|
bool device_optix_init()
|
|
{
|
|
if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
|
|
return true; // Already initialized function table
|
|
|
|
// Need to initialize CUDA as well
|
|
if (!device_cuda_init())
|
|
return false;
|
|
|
|
const OptixResult result = optixInit();
|
|
|
|
if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
|
|
VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
|
|
"Please update to the latest driver first!";
|
|
return false;
|
|
}
|
|
else if (result != OPTIX_SUCCESS) {
|
|
VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
|
|
return false;
|
|
}
|
|
|
|
// Loaded OptiX successfully!
|
|
return true;
|
|
}
|
|
|
|
void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
|
|
{
|
|
devices.reserve(cuda_devices.size());
|
|
|
|
// Simply add all supported CUDA devices as OptiX devices again
|
|
for (DeviceInfo info : cuda_devices) {
|
|
assert(info.type == DEVICE_CUDA);
|
|
|
|
int major;
|
|
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
|
|
if (major < 5) {
|
|
continue; // Only Maxwell and up are supported by OptiX
|
|
}
|
|
|
|
info.type = DEVICE_OPTIX;
|
|
info.id += "_OptiX";
|
|
info.denoisers |= DENOISER_OPTIX;
|
|
info.has_branched_path = false;
|
|
|
|
devices.push_back(info);
|
|
}
|
|
}
|
|
|
|
Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
|
|
{
|
|
return new OptiXDevice(info, stats, profiler, background);
|
|
}
|
|
|
|
CCL_NAMESPACE_END
|
|
|
|
#endif
|