/*
 * Copyright 2019, NVIDIA Corporation.
 * Copyright 2019, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_OPTIX

#  include "device/device.h"
#  include "device/device_intern.h"
#  include "device/device_denoising.h"

#  include "bvh/bvh.h"

#  include "render/scene.h"
#  include "render/mesh.h"
#  include "render/object.h"
#  include "render/buffers.h"

#  include "util/util_md5.h"
#  include "util/util_path.h"
#  include "util/util_time.h"
#  include "util/util_debug.h"
#  include "util/util_logging.h"

#  ifdef WITH_CUDA_DYNLOAD
#    include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
#    define OPTIX_DONT_INCLUDE_CUDA
#  endif
#  include <optix_stubs.h>
#  include <optix_function_table_definition.h>

// TODO(pmours): Disable this once drivers have native support
#  define OPTIX_DENOISER_NO_PIXEL_STRIDE 1

CCL_NAMESPACE_BEGIN

/* Make sure this stays in sync with kernel_globals.h */
struct ShaderParams {
  uint4 *input;
  float4 *output;
  int type;
  int filter;
  int sx;
  int offset;
  int sample;
};

struct KernelParams {
  WorkTile tile;
  KernelData data;
  ShaderParams shader;
#  define KERNEL_TEX(type, name) const type *name;
#  include "kernel/kernel_textures.h"
#  undef KERNEL_TEX
};
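
// Note: KERNEL_TEX is an X-macro. Each entry in kernel_textures.h, for
// example KERNEL_TEX(float4, __tri_verts), expands above into a device
// pointer member "const float4 *__tri_verts;", so KernelParams mirrors the
// kernel's global texture list without repeating it by hand.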

#  define check_result_cuda(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_cuda_ret(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0

#  define check_result_optix(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_optix_ret(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0

#  define CUDA_GET_BLOCKSIZE(func, w, h) \
    int threads; \
    check_result_cuda_ret( \
        cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
    threads = (int)sqrt((float)threads); \
    int xblocks = ((w) + threads - 1) / threads; \
    int yblocks = ((h) + threads - 1) / threads;

#  define CUDA_LAUNCH_KERNEL(func, args) \
    check_result_cuda_ret(cuLaunchKernel( \
        func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0));

/* Similar to the above, but for 1-dimensional blocks. */
#  define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
    int threads; \
    check_result_cuda_ret( \
        cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
    int xblocks = ((w) + threads - 1) / threads; \
    int yblocks = h;

#  define CUDA_LAUNCH_KERNEL_1D(func, args) \
    check_result_cuda_ret(cuLaunchKernel( \
        func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0));
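
// For example, a device reporting 1024 max threads per block yields
// threads = 32 in the 2D variant, so a 1920x1080 launch uses 60x34 blocks of
// 32x32 threads, while the 1D variant keeps full-width 1024-thread blocks
// with one block row per scanline.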

class OptiXDevice : public Device {

  // List of OptiX program groups
  enum {
    PG_RGEN,
    PG_MISS,
    PG_HITD,  // Default hit group
    PG_HITL,  // __BVH_LOCAL__ hit group
    PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
#  ifdef WITH_CYCLES_DEBUG
    PG_EXCP,
#  endif
    PG_BAKE,  // kernel_bake_evaluate
    PG_DISP,  // kernel_displace_evaluate
    PG_BACK,  // kernel_background_evaluate
    NUM_PROGRAM_GROUPS
  };

  // List of OptiX pipelines
  enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };

  // A single shader binding table entry
  struct SbtRecord {
    char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  };

  // Information stored about CUDA memory allocations
  struct CUDAMem {
    bool free_map_host = false;
    CUarray array = NULL;
    CUtexObject texobject = 0;
    bool use_mapped_host = false;
  };

  // Helper class to manage current CUDA context
  struct CUDAContextScope {
    CUDAContextScope(CUcontext ctx)
    {
      cuCtxPushCurrent(ctx);
    }
    ~CUDAContextScope()
    {
      cuCtxPopCurrent(NULL);
    }
  };

  // Use a pool with multiple threads to support launches with multiple CUDA streams
  TaskPool task_pool;

  // CUDA/OptiX context handles
  CUdevice cuda_device = 0;
  CUcontext cuda_context = NULL;
  vector<CUstream> cuda_stream;
  OptixDeviceContext context = NULL;

  // Need CUDA kernel module for some utility functions
  CUmodule cuda_module = NULL;
  CUmodule cuda_filter_module = NULL;
  // All necessary OptiX kernels are in one module
  OptixModule optix_module = NULL;
  OptixPipeline pipelines[NUM_PIPELINES] = {};

  bool motion_blur = false;
  bool need_texture_info = false;
  device_vector<SbtRecord> sbt_data;
  device_vector<TextureInfo> texture_info;
  device_only_memory<KernelParams> launch_params;
  vector<CUdeviceptr> as_mem;
  OptixTraversableHandle tlas_handle = 0;

  // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
  int can_map_host = 0;
  size_t map_host_used = 0;
  size_t map_host_limit = 0;
  size_t device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
  size_t device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
  map<device_memory *, CUDAMem> cuda_mem_map;
  bool move_texture_to_host = false;

  OptixDenoiser denoiser = NULL;
  vector<pair<int2, CUdeviceptr>> denoiser_state;

 public:
  OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
      : Device(info_, stats_, profiler_, background_),
        sbt_data(this, "__sbt", MEM_READ_ONLY),
        texture_info(this, "__texture_info", MEM_TEXTURE),
        launch_params(this, "__params")
  {
    // Store number of CUDA streams in device info
    info.cpu_threads = DebugFlags().optix.cuda_streams;

    // Initialize CUDA driver API
    check_result_cuda(cuInit(0));

    // Retrieve the primary CUDA context for this device
    check_result_cuda(cuDeviceGet(&cuda_device, info.num));
    check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device));

    // Make that CUDA context current
    const CUDAContextScope scope(cuda_context);

    // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp)
    size_t default_limit = 4 * 1024 * 1024 * 1024LL;
    size_t system_ram = system_physical_ram();
    if (system_ram > 0) {
      if (system_ram / 2 > default_limit) {
        map_host_limit = system_ram - default_limit;
      }
      else {
        map_host_limit = system_ram / 2;
      }
    }
    else {
      VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
    }
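    // Example: with 16 GiB of system RAM the limit becomes 12 GiB, while with
    // only 6 GiB it falls back to half of the system RAM (3 GiB).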

    // Check device support for pinned host memory
    check_result_cuda(
        cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device));

    // Create OptiX context for this device
    OptixDeviceContextOptions options = {};
#  ifdef WITH_CYCLES_LOGGING
    options.logCallbackLevel = 4;  // Fatal = 1, Error = 2, Warning = 3, Print = 4
    options.logCallbackFunction =
        [](unsigned int level, const char *, const char *message, void *) {
          switch (level) {
            case 1:
              LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
              break;
            case 2:
              LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
              break;
            case 3:
              LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
              break;
            case 4:
              LOG_IF(INFO, VLOG_IS_ON(1)) << message;
              break;
          }
        };
#  endif
    check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context));
#  ifdef WITH_CYCLES_LOGGING
    check_result_optix(optixDeviceContextSetLogCallback(
        context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
#  endif

    // Create launch streams
    cuda_stream.resize(info.cpu_threads);
    for (int i = 0; i < info.cpu_threads; ++i)
      check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));

    // Fix weird compiler bug that assigns wrong size
    launch_params.data_elements = sizeof(KernelParams);
    // Allocate launch parameter buffer memory on device
    launch_params.alloc_to_device(info.cpu_threads);
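    // Each CUDA stream gets its own KernelParams slot in this buffer, so
    // concurrent launches can update tile and shader data independently.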

    // Create denoiser state entries for all threads (but do not allocate yet)
    denoiser_state.resize(info.cpu_threads);
  }

  ~OptiXDevice()
  {
    // Stop processing any more tasks
    task_pool.stop();

    // Free all acceleration structures
    for (CUdeviceptr mem : as_mem) {
      cuMemFree(mem);
    }

    // Free denoiser state for all threads
    for (const pair<int2, CUdeviceptr> &state : denoiser_state) {
      cuMemFree(state.second);
    }

    sbt_data.free();
    texture_info.free();
    launch_params.free();

    // Make CUDA context current
    const CUDAContextScope scope(cuda_context);

    // Unload modules
    if (cuda_module != NULL)
      cuModuleUnload(cuda_module);
    if (cuda_filter_module != NULL)
      cuModuleUnload(cuda_filter_module);
    if (optix_module != NULL)
      optixModuleDestroy(optix_module);
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
      if (pipelines[i] != NULL)
        optixPipelineDestroy(pipelines[i]);

    // Destroy launch streams
    for (CUstream stream : cuda_stream)
      cuStreamDestroy(stream);

    if (denoiser != NULL)
      optixDenoiserDestroy(denoiser);

    // Destroy OptiX and CUDA context
    optixDeviceContextDestroy(context);
    cuDevicePrimaryCtxRelease(cuda_device);
  }

 private:
  bool show_samples() const override
  {
    // Only show samples if not rendering multiple tiles in parallel
    return info.cpu_threads == 1;
  }

  BVHLayoutMask get_bvh_layout_mask() const override
  {
    // OptiX has its own internal acceleration structure format
    return BVH_LAYOUT_OPTIX;
  }

  bool load_kernels(const DeviceRequestedFeatures &requested_features) override
  {
    if (have_error())
      return false;  // Abort early if context creation failed already

    // Disable baking for now, since its kernel is not well-suited for inlining and is very slow
    if (requested_features.use_baking) {
      set_error("OptiX implementation does not support baking yet");
      return false;
    }
    // Disable shader raytracing support for now, since continuation callables are slow
    if (requested_features.use_shader_raytrace) {
      set_error("OptiX implementation does not support shader raytracing yet");
      return false;
    }

    const CUDAContextScope scope(cuda_context);

    // Unload existing OptiX module and pipelines first
    if (optix_module != NULL) {
      optixModuleDestroy(optix_module);
      optix_module = NULL;
    }
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
      if (pipelines[i] != NULL) {
        optixPipelineDestroy(pipelines[i]);
        pipelines[i] = NULL;
      }
    }

    OptixModuleCompileOptions module_options;
    module_options.maxRegisterCount = 0;  // Do not set an explicit register limit
#  ifdef WITH_CYCLES_DEBUG
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif
    OptixPipelineCompileOptions pipeline_options;
    // Default to no motion blur and two-level graph, since it is the fastest option
    pipeline_options.usesMotionBlur = false;
    pipeline_options.traversableGraphFlags =
        OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
    pipeline_options.numPayloadValues = 6;
    pipeline_options.numAttributeValues = 2;  // u, v
#  ifdef WITH_CYCLES_DEBUG
    pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW |
                                      OPTIX_EXCEPTION_FLAG_TRACE_DEPTH;
#  else
    pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
#  endif
    pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h

    // Keep track of whether motion blur is enabled, so we can enable/disable motion in BVH builds
    // This is necessary since objects may be reported to have motion if the Vector pass is
    // active, but may still need to be rendered without motion blur if that isn't active as well
    motion_blur = requested_features.use_object_motion;

    if (motion_blur) {
      pipeline_options.usesMotionBlur = true;
      // Motion blur can insert motion transforms into the traversal graph
      // It is no longer a two-level graph then, so need to set flags to allow any configuration
      pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
    }

    {  // Load and compile PTX module with OptiX kernels
      string ptx_data;
      const string ptx_filename = "lib/kernel_optix.ptx";
      if (!path_read_text(path_get(ptx_filename), ptx_data)) {
        set_error("Failed loading OptiX kernel " + ptx_filename + ".");
        return false;
      }

      check_result_optix_ret(optixModuleCreateFromPTX(context,
                                                      &module_options,
                                                      &pipeline_options,
                                                      ptx_data.data(),
                                                      ptx_data.size(),
                                                      nullptr,
                                                      0,
                                                      &optix_module));
    }

    {  // Load CUDA modules because we need some of the utility kernels
      int major, minor;
      cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
      cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num);

      if (cuda_module == NULL) {  // Avoid reloading module if it was already loaded
        string cubin_data;
        const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
        if (!path_read_text(path_get(cubin_filename), cubin_data)) {
          set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + ".");
          return false;
        }

        check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data()));
      }

      if (requested_features.use_denoising && cuda_filter_module == NULL) {
        string filter_data;
        const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor);
        if (!path_read_text(path_get(filter_filename), filter_data)) {
          set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + ".");
          return false;
        }

        check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data()));
      }
    }

    // Create program groups
    OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupOptions group_options = {};  // There are no options currently
    group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN].raygen.module = optix_module;
    // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
    group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
    group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
    group_descs[PG_MISS].miss.module = optix_module;
    group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
    group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
    group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
    group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";

    if (requested_features.use_hair) {
      // Add curve intersection programs
      group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve";
      group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve";
    }

    if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
      // Add hit group for local intersections
      group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
      group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
      group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
    }

#  ifdef WITH_CYCLES_DEBUG
    group_descs[PG_EXCP].kind = OPTIX_PROGRAM_GROUP_KIND_EXCEPTION;
    group_descs[PG_EXCP].exception.module = optix_module;
    group_descs[PG_EXCP].exception.entryFunctionName = "__exception__kernel_optix_exception";
#  endif

    if (requested_features.use_baking) {
      group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BAKE].raygen.module = optix_module;
      group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
    }

    if (requested_features.use_true_displacement) {
      group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_DISP].raygen.module = optix_module;
      group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
    }

    if (requested_features.use_background_light) {
      group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BACK].raygen.module = optix_module;
      group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
    }

    check_result_optix_ret(optixProgramGroupCreate(
        context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));

    // Get program stack sizes
    OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
    // Set up SBT, which in this case is used only to select between different programs
    sbt_data.alloc(NUM_PROGRAM_GROUPS);
    memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
      check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
    }
    sbt_data.copy_to_device();  // Upload SBT to device

    // Calculate maximum trace continuation stack size
    unsigned int trace_css = stack_size[PG_HITD].cssCH;
    // This is based on the maximum of closest-hit and any-hit/intersection programs
    trace_css = max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
    trace_css = max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
    trace_css = max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
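
    // E.g. with the maxTraceDepth of 1 used below, each pipeline needs a
    // continuation stack of its ray generation stack size plus one trace_css,
    // since at most one optixTrace call is outstanding per launch index.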

    OptixPipelineLinkOptions link_options;
    link_options.maxTraceDepth = 1;
#  ifdef WITH_CYCLES_DEBUG
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif
    link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur;

    {  // Create path tracing pipeline
      OptixProgramGroup pipeline_groups[] = {
          groups[PG_RGEN],
          groups[PG_MISS],
          groups[PG_HITD],
          groups[PG_HITS],
          groups[PG_HITL],
#  ifdef WITH_CYCLES_DEBUG
          groups[PG_EXCP],
#  endif
      };
      check_result_optix_ret(
          optixPipelineCreate(context,
                              &pipeline_options,
                              &link_options,
                              pipeline_groups,
                              (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
                              nullptr,
                              0,
                              &pipelines[PIP_PATH_TRACE]));

      // Combine ray generation and trace continuation stack size
      const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;

      // Set stack size depending on pipeline options
      check_result_optix_ret(optixPipelineSetStackSize(
          pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
    }

    // Only need to create shader evaluation pipeline if one of these features is used:
    const bool use_shader_eval_pipeline = requested_features.use_baking ||
                                          requested_features.use_background_light ||
                                          requested_features.use_true_displacement;

    if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
      OptixProgramGroup pipeline_groups[] = {
          groups[PG_BAKE],
          groups[PG_DISP],
          groups[PG_BACK],
          groups[PG_MISS],
          groups[PG_HITD],
          groups[PG_HITS],
          groups[PG_HITL],
#  ifdef WITH_CYCLES_DEBUG
          groups[PG_EXCP],
#  endif
      };
      check_result_optix_ret(
          optixPipelineCreate(context,
                              &pipeline_options,
                              &link_options,
                              pipeline_groups,
                              (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
                              nullptr,
                              0,
                              &pipelines[PIP_SHADER_EVAL]));

      // Calculate continuation stack size based on the maximum of all ray generation stack sizes
      const unsigned int css = max(stack_size[PG_BAKE].cssRG,
                                   max(stack_size[PG_DISP].cssRG, stack_size[PG_BACK].cssRG)) +
                               link_options.maxTraceDepth * trace_css;

      check_result_optix_ret(optixPipelineSetStackSize(
          pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
    }

    // Clean up program group objects
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      optixProgramGroupDestroy(groups[i]);
    }

    return true;
  }

  void thread_run(DeviceTask &task, int thread_index)  // Main task entry point
  {
    if (have_error())
      return;  // Abort early if there was an error previously

    if (task.type == DeviceTask::RENDER) {
      RenderTile tile;
      while (task.acquire_tile(this, tile)) {
        if (tile.task == RenderTile::PATH_TRACE)
          launch_render(task, tile, thread_index);
        else if (tile.task == RenderTile::DENOISE)
          launch_denoise(task, tile, thread_index);
        task.release_tile(tile);
        if (task.get_cancel() && !task.need_finish_queue)
          break;  // User requested cancellation
        else if (have_error())
          break;  // Abort rendering when encountering an error
      }
    }
    else if (task.type == DeviceTask::SHADER) {
      launch_shader_eval(task, thread_index);
    }
    else if (task.type == DeviceTask::FILM_CONVERT) {
      launch_film_convert(task, thread_index);
    }
  }

  void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
  {
    assert(thread_index < launch_params.data_size);

    // Keep track of total render time of this tile
    const scoped_timer timer(&rtile.buffers->render_time);

    WorkTile wtile;
    wtile.x = rtile.x;
    wtile.y = rtile.y;
    wtile.w = rtile.w;
    wtile.h = rtile.h;
    wtile.offset = rtile.offset;
    wtile.stride = rtile.stride;
    wtile.buffer = (float *)rtile.buffer;

    const int end_sample = rtile.start_sample + rtile.num_samples;
    // Keep this number reasonable to avoid running into TDRs
    const int step_samples = (info.display_device ? 8 : 32);
    // Offset into launch params buffer so that streams use separate data
    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    const CUDAContextScope scope(cuda_context);

    for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
      // Copy work tile information to device
      wtile.num_samples = min(step_samples, end_sample - sample);
      wtile.start_sample = sample;
      check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile),
                                          &wtile,
                                          sizeof(wtile),
                                          cuda_stream[thread_index]));

      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
#  ifdef WITH_CYCLES_DEBUG
      sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
#  endif
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS

      // Launch the ray generation program
      check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     // Launch with samples close to each other for better locality
                                     wtile.w * wtile.num_samples,
                                     wtile.h,
                                     1));
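
      // The launch is wtile.w * num_samples entries wide, so the ray
      // generation program is expected to decode both the pixel x coordinate
      // and the sample number from the 1D launch index.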

      // Wait for launch to finish
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      // Update current sample, so it is displayed correctly
      rtile.sample = wtile.start_sample + wtile.num_samples;
      // Update task progress after the kernel completed rendering
      task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);

      if (task.get_cancel() && !task.need_finish_queue)
        return;  // Cancel rendering
    }
  }

  bool launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index)
  {
    int total_samples = rtile.start_sample + rtile.num_samples;

    const CUDAContextScope scope(cuda_context);

    // Choose between OptiX and NLM denoising
    if (task.denoising_use_optix) {
      // Map neighboring tiles onto this device, indices are as follows:
      // Where index 4 is the center tile and index 9 is the target for the result.
      //  0 1 2
      //  3 4 5
      //  6 7 8  9
      RenderTile rtiles[10];
      rtiles[4] = rtile;
      task.map_neighbor_tiles(rtiles, this);

      // Calculate size of the tile to denoise (including overlap)
      int4 rect = make_int4(
          rtiles[4].x, rtiles[4].y, rtiles[4].x + rtiles[4].w, rtiles[4].y + rtiles[4].h);
      // Overlap between tiles has to be at least 64 pixels
      // TODO(pmours): Query this value from OptiX
      rect = rect_expand(rect, 64);
      int4 clip_rect = make_int4(
          rtiles[3].x, rtiles[1].y, rtiles[5].x + rtiles[5].w, rtiles[7].y + rtiles[7].h);
      rect = rect_clip(rect, clip_rect);
      int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
      int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
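
      // E.g. a 256x256 center tile at (512, 512) expands to the rect
      // (448, 448, 832, 832) before clipping, borrowing 64 pixels of overlap
      // from each neighboring tile where one exists.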

      // Calculate byte offsets and strides
      int pixel_stride = task.pass_stride * (int)sizeof(float);
      int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
      const int pass_offset[3] = {
          (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
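
      // Each pixel stores task.pass_stride floats back to back, so these are
      // the byte offsets of the color, albedo and normal passes within a pixel.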

      // Start with the current tile pointer offset
      int input_stride = pixel_stride;
      device_ptr input_ptr = rtile.buffer + pixel_offset;

      // Copy tile data into a common buffer if necessary
      device_only_memory<float> input(this, "denoiser input");
      device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE);

      if ((!rtiles[0].buffer || rtiles[0].buffer == rtile.buffer) &&
          (!rtiles[1].buffer || rtiles[1].buffer == rtile.buffer) &&
          (!rtiles[2].buffer || rtiles[2].buffer == rtile.buffer) &&
          (!rtiles[3].buffer || rtiles[3].buffer == rtile.buffer) &&
          (!rtiles[5].buffer || rtiles[5].buffer == rtile.buffer) &&
          (!rtiles[6].buffer || rtiles[6].buffer == rtile.buffer) &&
          (!rtiles[7].buffer || rtiles[7].buffer == rtile.buffer) &&
          (!rtiles[8].buffer || rtiles[8].buffer == rtile.buffer)) {
        // Tiles are in contiguous memory, so can just subtract overlap offset
        input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
        // Stride covers the whole width of the image and not just a single tile
        input_stride *= rtile.stride;
      }
      else {
        // Adjacent tiles are in separate memory regions, so need to copy them into a single one
        input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
        // Start with the new input buffer
        input_ptr = input.device_pointer;
        // Stride covers the width of the new input buffer, which includes tile width and overlap
        input_stride *= rect_size.x;

        TileInfo *tile_info = tile_info_mem.alloc(1);
        for (int i = 0; i < 9; i++) {
          tile_info->offsets[i] = rtiles[i].offset;
          tile_info->strides[i] = rtiles[i].stride;
          tile_info->buffers[i] = rtiles[i].buffer;
        }
        tile_info->x[0] = rtiles[3].x;
        tile_info->x[1] = rtiles[4].x;
        tile_info->x[2] = rtiles[5].x;
        tile_info->x[3] = rtiles[5].x + rtiles[5].w;
        tile_info->y[0] = rtiles[1].y;
        tile_info->y[1] = rtiles[4].y;
        tile_info->y[2] = rtiles[7].y;
        tile_info->y[3] = rtiles[7].y + rtiles[7].h;
        tile_info_mem.copy_to_device();

        CUfunction filter_copy_func;
        check_result_cuda_ret(cuModuleGetFunction(
            &filter_copy_func, cuda_filter_module, "kernel_cuda_filter_copy_input"));
        check_result_cuda_ret(cuFuncSetCacheConfig(filter_copy_func, CU_FUNC_CACHE_PREFER_L1));

        void *args[] = {
            &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
        CUDA_GET_BLOCKSIZE(filter_copy_func, rect_size.x, rect_size.y);
        CUDA_LAUNCH_KERNEL(filter_copy_func, args);
      }

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      device_only_memory<float> input_rgb(this, "denoiser input rgb");
      {
        input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 *
                                  task.denoising.optix_input_passes);

        CUfunction convert_to_rgb_func;
        check_result_cuda_ret(cuModuleGetFunction(
            &convert_to_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_to_rgb"));
        check_result_cuda_ret(cuFuncSetCacheConfig(convert_to_rgb_func, CU_FUNC_CACHE_PREFER_L1));

        void *args[] = {&input_rgb.device_pointer,
                        &input_ptr,
                        &rect_size.x,
                        &rect_size.y,
                        &input_stride,
                        &task.pass_stride,
                        const_cast<int *>(pass_offset),
                        &task.denoising.optix_input_passes,
                        &total_samples};
        CUDA_GET_BLOCKSIZE(convert_to_rgb_func, rect_size.x, rect_size.y);
        CUDA_LAUNCH_KERNEL(convert_to_rgb_func, args);

        input_ptr = input_rgb.device_pointer;
        pixel_stride = 3 * sizeof(float);
        input_stride = rect_size.x * pixel_stride;
      }
#  endif

      if (denoiser == NULL) {
        // Create OptiX denoiser handle on demand when it is first used
        OptixDenoiserOptions denoiser_options;
        assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3);
        denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
            OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1));
        denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
        check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
        check_result_optix_ret(
            optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
      }
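
      // The cast above relies on OPTIX_DENOISER_INPUT_RGB, _RGB_ALBEDO and
      // _RGB_ALBEDO_NORMAL being consecutive enum values, so 1-3 input passes
      // map directly onto the three denoiser input kinds.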

      OptixDenoiserSizes sizes = {};
      check_result_optix_ret(
          optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));

      auto &state = denoiser_state[thread_index].second;
      auto &state_size = denoiser_state[thread_index].first;
      const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
      const size_t scratch_offset = sizes.stateSizeInBytes;

      // Allocate denoiser state if tile size has changed since last setup
      if (state_size.x != rect_size.x || state_size.y != rect_size.y) {
        if (state) {
          cuMemFree(state);
          state = 0;
        }
        check_result_cuda_ret(cuMemAlloc(&state, scratch_offset + scratch_size));

        check_result_optix_ret(optixDenoiserSetup(denoiser,
                                                  cuda_stream[thread_index],
                                                  rect_size.x,
                                                  rect_size.y,
                                                  state,
                                                  scratch_offset,
                                                  state + scratch_offset,
                                                  scratch_size));

        state_size = rect_size;
      }

      // Set up input and output layer information
      OptixImage2D input_layers[3] = {};
      OptixImage2D output_layers[1] = {};

      for (int i = 0; i < 3; ++i) {
#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
        input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
#  else
        input_layers[i].data = input_ptr + pass_offset[i];
#  endif
        input_layers[i].width = rect_size.x;
        input_layers[i].height = rect_size.y;
        input_layers[i].rowStrideInBytes = input_stride;
        input_layers[i].pixelStrideInBytes = pixel_stride;
        input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
      }

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      output_layers[0].data = input_ptr;
      output_layers[0].width = rect_size.x;
      output_layers[0].height = rect_size.y;
      output_layers[0].rowStrideInBytes = input_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
      int2 output_offset = overlap_offset;
      overlap_offset = make_int2(0, 0);  // Not supported by denoiser API, so apply manually
#  else
      output_layers[0].data = rtiles[9].buffer + pixel_offset;
      output_layers[0].width = rtiles[9].w;
      output_layers[0].height = rtiles[9].h;
      output_layers[0].rowStrideInBytes = rtiles[9].stride * pixel_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
#  endif
      output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;

      // Finally run denoising
      OptixDenoiserParams params = {};  // All parameters are disabled/zero
      check_result_optix_ret(optixDenoiserInvoke(denoiser,
                                                 cuda_stream[thread_index],
                                                 &params,
                                                 state,
                                                 scratch_offset,
                                                 input_layers,
                                                 task.denoising.optix_input_passes,
                                                 overlap_offset.x,
                                                 overlap_offset.y,
                                                 output_layers,
                                                 state + scratch_offset,
                                                 scratch_size));

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      {
        CUfunction convert_from_rgb_func;
        check_result_cuda_ret(cuModuleGetFunction(
            &convert_from_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_from_rgb"));
        check_result_cuda_ret(
            cuFuncSetCacheConfig(convert_from_rgb_func, CU_FUNC_CACHE_PREFER_L1));

        void *args[] = {&input_ptr,
                        &rtiles[9].buffer,
                        &output_offset.x,
                        &output_offset.y,
                        &rect_size.x,
                        &rect_size.y,
                        &rtiles[9].x,
                        &rtiles[9].y,
                        &rtiles[9].w,
                        &rtiles[9].h,
                        &rtiles[9].offset,
                        &rtiles[9].stride,
                        &task.pass_stride};
        CUDA_GET_BLOCKSIZE(convert_from_rgb_func, rtiles[9].w, rtiles[9].h);
        CUDA_LAUNCH_KERNEL(convert_from_rgb_func, args);
      }
#  endif

      check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

      task.unmap_neighbor_tiles(rtiles, this);
    }
    else {
      // Run CUDA denoising kernels
      DenoisingTask denoising(this, task);
      denoising.functions.construct_transform = function_bind(
          &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index);
      denoising.functions.accumulate = function_bind(
          &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index);
      denoising.functions.solve = function_bind(
          &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index);
      denoising.functions.divide_shadow = function_bind(&OptiXDevice::denoising_divide_shadow,
                                                        this,
                                                        _1,
                                                        _2,
                                                        _3,
                                                        _4,
                                                        _5,
                                                        &denoising,
                                                        thread_index);
      denoising.functions.non_local_means = function_bind(
          &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index);
      denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves,
                                                         this,
                                                         _1,
                                                         _2,
                                                         _3,
                                                         _4,
                                                         _5,
                                                         _6,
                                                         &denoising,
                                                         thread_index);
      denoising.functions.get_feature = function_bind(
          &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index);
      denoising.functions.write_feature = function_bind(
          &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index);
      denoising.functions.detect_outliers = function_bind(
          &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index);

      denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
      denoising.render_buffer.samples = total_samples;
      denoising.buffer.gpu_temporary_mem = true;

      denoising.run_denoising(&rtile);
    }

    // Update current sample, so it is displayed correctly
    rtile.sample = total_samples;
    // Update task progress after the denoiser completed processing
    task.update_progress(&rtile, rtile.w * rtile.h);

    return true;
  }

  void launch_shader_eval(DeviceTask &task, int thread_index)
  {
    unsigned int rgen_index = PG_BACK;
    if (task.shader_eval_type >= SHADER_EVAL_BAKE)
      rgen_index = PG_BAKE;
    if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
      rgen_index = PG_DISP;

    const CUDAContextScope scope(cuda_context);

    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    for (int sample = 0; sample < task.num_samples; ++sample) {
      ShaderParams params;
      params.input = (uint4 *)task.shader_input;
      params.output = (float4 *)task.shader_output;
      params.type = task.shader_eval_type;
      params.filter = task.shader_filter;
      params.sx = task.shader_x;
      params.offset = task.offset;
      params.sample = sample;

      check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
                                          &params,
                                          sizeof(params),
                                          cuda_stream[thread_index]));

      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
#  ifdef WITH_CYCLES_DEBUG
      sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
#  endif
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS

      check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     task.shader_w,
                                     1,
                                     1));

      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      task.update_progress(NULL);
    }
  }

  void launch_film_convert(DeviceTask &task, int thread_index)
  {
    const CUDAContextScope scope(cuda_context);

    CUfunction film_convert_func;
    check_result_cuda(cuModuleGetFunction(&film_convert_func,
                                          cuda_module,
                                          task.rgba_byte ? "kernel_cuda_convert_to_byte" :
                                                           "kernel_cuda_convert_to_half_float"));

    float sample_scale = 1.0f / (task.sample + 1);
    CUdeviceptr rgba = (task.rgba_byte ? task.rgba_byte : task.rgba_half);

    void *args[] = {&rgba,
                    &task.buffer,
                    &sample_scale,
                    &task.x,
                    &task.y,
                    &task.w,
                    &task.h,
                    &task.offset,
                    &task.stride};

    int threads_per_block;
    check_result_cuda(cuFuncGetAttribute(
        &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func));

    const int num_threads_x = (int)sqrt(threads_per_block);
    const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x;
    const int num_threads_y = (int)sqrt(threads_per_block);
    const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y;
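
    // E.g. 1024 max threads per block gives 32x32 thread blocks here, the
    // same shape the 2D CUDA_GET_BLOCKSIZE helper above would compute.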

    check_result_cuda(cuLaunchKernel(film_convert_func,
                                     num_blocks_x,
                                     num_blocks_y,
                                     1, /* blocks */
                                     num_threads_x,
                                     num_threads_y,
                                     1, /* threads */
                                     0,
                                     cuda_stream[thread_index],
                                     args,
                                     0));

    check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

    task.update_progress(NULL);
  }

  bool build_optix_bvh(const OptixBuildInput &build_input,
                       uint16_t num_motion_steps,
                       OptixTraversableHandle &out_handle)
  {
    out_handle = 0;

    const CUDAContextScope scope(cuda_context);

    // Compute memory usage
    OptixAccelBufferSizes sizes = {};
    OptixAccelBuildOptions options;
    options.operation = OPTIX_BUILD_OPERATION_BUILD;
    if (background) {
      // Prefer best performance and lowest memory consumption in background
      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
    }
    else {
      // Prefer fast updates in viewport
      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
    }

    options.motionOptions.numKeys = num_motion_steps;
    options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
    options.motionOptions.timeBegin = 0.0f;
    options.motionOptions.timeEnd = 1.0f;

    check_result_optix_ret(
        optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));

    // Allocate required output buffers
    device_only_memory<char> temp_mem(this, "temp_build_mem");
    temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
    if (!temp_mem.device_pointer)
      return false;  // Make sure temporary memory allocation succeeded

    // Move textures to host memory if there is not enough room
    size_t size = 0, free = 0;
    cuMemGetInfo(&free, &size);
    size = sizes.outputSizeInBytes + device_working_headroom;
    if (size >= free && can_map_host) {
      move_textures_to_host(size - free, false);
    }

    CUdeviceptr out_data = 0;
    check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
    as_mem.push_back(out_data);

    // Finally build the acceleration structure
    OptixAccelEmitDesc compacted_size_prop;
    compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
    // A tiny space was allocated for this property at the end of the temporary buffer above
    // Make sure this pointer is 8-byte aligned
    compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);

    check_result_optix_ret(optixAccelBuild(context,
                                           NULL,
                                           &options,
                                           &build_input,
                                           1,
                                           temp_mem.device_pointer,
                                           temp_mem.device_size,
                                           out_data,
                                           sizes.outputSizeInBytes,
                                           &out_handle,
                                           background ? &compacted_size_prop : NULL,
                                           background ? 1 : 0));

    // Wait for all operations to finish
    check_result_cuda_ret(cuStreamSynchronize(NULL));

    // Compact acceleration structure to save memory (do not do this in viewport for faster builds)
    if (background) {
      uint64_t compacted_size = sizes.outputSizeInBytes;
      check_result_cuda_ret(
          cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));

      // Temporary memory is no longer needed, so free it now to make space
      temp_mem.free();

      // There is no point compacting if the size does not decrease
      if (compacted_size < sizes.outputSizeInBytes) {
        CUdeviceptr compacted_data = 0;
        if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
          // Do not compact if memory allocation for compacted acceleration structure fails
          // Can just use the uncompacted one then, so succeed here regardless
          return true;
        as_mem.push_back(compacted_data);

        check_result_optix_ret(optixAccelCompact(
            context, NULL, out_handle, compacted_data, compacted_size, &out_handle));

        // Wait for compaction to finish
        check_result_cuda_ret(cuStreamSynchronize(NULL));

        // Free uncompacted acceleration structure
        cuMemFree(out_data);
        as_mem.erase(as_mem.end() - 2);  // Remove 'out_data' from 'as_mem' array
      }
    }

    return true;
  }

  bool build_optix_bvh(BVH *bvh) override
  {
    assert(bvh->params.top_level);

    unsigned int num_instances = 0;
    unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
    meshes.reserve(bvh->meshes.size());

    // Free all previous acceleration structures
    for (CUdeviceptr mem : as_mem) {
      cuMemFree(mem);
    }
    as_mem.clear();

    // Build bottom level acceleration structures (BLAS)
    // Note: Always keep this logic in sync with bvh_optix.cpp!
    for (Object *ob : bvh->objects) {
      // Skip meshes for which acceleration structure already exists
      if (meshes.find(ob->mesh) != meshes.end())
        continue;

      Mesh *const mesh = ob->mesh;
      vector<OptixTraversableHandle> handles;
      handles.reserve(2);

      // Build BLAS for curve primitives
      if (bvh->params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
        const size_t num_curves = mesh->num_curves();
        const size_t num_segments = mesh->num_segments();

        size_t num_motion_steps = 1;
        Attribute *motion_keys = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
        if (motion_blur && mesh->use_motion_blur && motion_keys) {
          num_motion_steps = mesh->motion_steps;
        }

        device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
        aabb_data.alloc(num_segments * num_motion_steps);

        // Get AABBs for each motion step
        for (size_t step = 0; step < num_motion_steps; ++step) {
          // The center step for motion vertices is not stored in the attribute
          const float3 *keys = mesh->curve_keys.data();
          size_t center_step = (num_motion_steps - 1) / 2;
          if (step != center_step) {
            size_t attr_offset = (step > center_step) ? step - 1 : step;
            // Technically this is a float4 array, but sizeof(float3) is the same as sizeof(float4)
            keys = motion_keys->data_float3() + attr_offset * mesh->curve_keys.size();
          }
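
          // E.g. with 3 motion steps the center step is 1, so attribute slot 0
          // holds step 0 and slot 1 holds step 2.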
|
|
|
|
|
2019-10-01 17:39:11 +00:00
|
|
|
size_t i = step * num_segments;
|
|
|
|
for (size_t j = 0; j < num_curves; ++j) {
|
2019-09-12 12:50:06 +00:00
|
|
|
const Mesh::Curve c = mesh->get_curve(j);
|
2019-10-01 17:39:11 +00:00
|
|
|
|
2019-09-12 12:50:06 +00:00
|
|
|
for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
|
|
|
|
BoundBox bounds = BoundBox::empty;
|
|
|
|
c.bounds_grow(k, keys, mesh->curve_radius.data(), bounds);
|
2019-10-01 17:39:11 +00:00
|
|
|
|
2019-09-12 12:50:06 +00:00
|
|
|
aabb_data[i].minX = bounds.min.x;
|
|
|
|
aabb_data[i].minY = bounds.min.y;
|
|
|
|
aabb_data[i].minZ = bounds.min.z;
|
|
|
|
aabb_data[i].maxX = bounds.max.x;
|
|
|
|
aabb_data[i].maxY = bounds.max.y;
|
|
|
|
aabb_data[i].maxZ = bounds.max.z;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Upload AABB data to GPU
|
|
|
|
aabb_data.copy_to_device();
|
|
|
|
|
|
|
|
vector<device_ptr> aabb_ptrs;
|
|
|
|
aabb_ptrs.reserve(num_motion_steps);
|
|
|
|
for (size_t step = 0; step < num_motion_steps; ++step) {
|
|
|
|
aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Disable visibility test anyhit program, since it is already checked during intersection
|
|
|
|
// Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT
|
|
|
|
unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
|
|
|
|
OptixBuildInput build_input = {};
|
|
|
|
build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
|
|
|
|
build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
|
|
|
|
build_input.aabbArray.numPrimitives = num_segments;
|
|
|
|
build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
|
|
|
|
build_input.aabbArray.flags = &build_flags;
|
|
|
|
build_input.aabbArray.numSbtRecords = 1;
|
|
|
|
build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
|
|
|
|
|
|
|
|
// Allocate memory for new BLAS and build it
|
|
|
|
handles.emplace_back();
|
2019-12-05 18:17:01 +00:00
|
|
|
if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
|
2019-09-12 12:50:06 +00:00
|
|
|
return false;
|
|
|
|
}

      // Build BLAS for triangle primitives
      if (bvh->params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
        const size_t num_verts = mesh->verts.size();

        size_t num_motion_steps = 1;
        Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
        if (motion_blur && mesh->use_motion_blur && motion_keys) {
          num_motion_steps = mesh->motion_steps;
        }

        device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
        index_data.alloc(mesh->triangles.size());
        memcpy(index_data.data(), mesh->triangles.data(), mesh->triangles.size() * sizeof(int));
        device_vector<float3> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
        vertex_data.alloc(num_verts * num_motion_steps);

        for (size_t step = 0; step < num_motion_steps; ++step) {
          const float3 *verts = mesh->verts.data();

          size_t center_step = (num_motion_steps - 1) / 2;
          // The center step for motion vertices is not stored in the attribute
          if (step != center_step) {
            verts = motion_keys->data_float3() +
                    (step > center_step ? step - 1 : step) * num_verts;
          }

          memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
        }

        // Upload triangle data to GPU
        index_data.copy_to_device();
        vertex_data.copy_to_device();

        vector<device_ptr> vertex_ptrs;
        vertex_ptrs.reserve(num_motion_steps);
        for (size_t step = 0; step < num_motion_steps; ++step) {
          vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
        }

        // No special build flags for triangle primitives
        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE;
        OptixBuildInput build_input = {};
        build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
        build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
        build_input.triangleArray.numVertices = num_verts;
        build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
        build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
        build_input.triangleArray.indexBuffer = index_data.device_pointer;
        build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
        build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
        build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
        build_input.triangleArray.flags = &build_flags;
        // The SBT does not store per-primitive data, since Cycles already allocates separate
        // buffers for that purpose. OptiX does not allow a record count of zero though, so
        // just pass in one and rely on it having the same meaning in this case.
        build_input.triangleArray.numSbtRecords = 1;
        // Triangle primitives are packed right after the curve primitives of this mesh
        build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();

        // Allocate memory for new BLAS and build it
        handles.emplace_back();
        if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
          return false;
      }

      meshes.insert({mesh, handles});
    }
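
    // Each mesh BLAS built above is referenced below through one instance per
    // traversable handle, and all instances then go into a single top-level
    // acceleration structure (TLAS), so geometry shared between objects is only
    // built once and transforms are applied per instance.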

    // Fill instance descriptions
    device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
    aabbs.alloc(bvh->objects.size() * 2);
    device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
    instances.alloc(bvh->objects.size() * 2);

    for (Object *ob : bvh->objects) {
      // Skip non-traceable objects
      if (!ob->is_traceable())
        continue;

      // Create separate instance for triangle/curve meshes of an object
      for (OptixTraversableHandle handle : meshes[ob->mesh]) {
        OptixAabb &aabb = aabbs[num_instances];
        aabb.minX = ob->bounds.min.x;
        aabb.minY = ob->bounds.min.y;
        aabb.minZ = ob->bounds.min.z;
        aabb.maxX = ob->bounds.max.x;
        aabb.maxY = ob->bounds.max.y;
        aabb.maxZ = ob->bounds.max.z;

        OptixInstance &instance = instances[num_instances++];
        memset(&instance, 0, sizeof(instance));

        // Clear transform to identity matrix
        instance.transform[0] = 1.0f;
        instance.transform[5] = 1.0f;
        instance.transform[10] = 1.0f;

        // Set user instance ID to object index
        instance.instanceId = ob->get_device_index();

        // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
        // See 'scene_intersect_volume' in bvh.h
        instance.visibilityMask = (ob->mesh->has_volume ? 3 : 1);
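        // In other words, bit 0 marks regular geometry and volumes additionally set
        // bit 1 (mask 3 covers both), so a trace with visibility mask 2 intersects
        // volumes only.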

        // Insert motion traversable if object has motion
        if (motion_blur && ob->use_motion()) {
          size_t motion_keys = max(ob->motion.size(), 2) - 2;
          size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
                                         motion_keys * sizeof(OptixSRTData);

          const CUDAContextScope scope(cuda_context);

          CUdeviceptr motion_transform_gpu = 0;
          check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
          as_mem.push_back(motion_transform_gpu);

          // Allocate host side memory for motion transform and fill it with transform data
          OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
              new uint8_t[motion_transform_size]);
          motion_transform.child = handle;
          motion_transform.motionOptions.numKeys = ob->motion.size();
          motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
          motion_transform.motionOptions.timeBegin = 0.0f;
          motion_transform.motionOptions.timeEnd = 1.0f;

          OptixSRTData *const srt_data = motion_transform.srtData;
          array<DecomposedTransform> decomp(ob->motion.size());
          transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
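
          // 'sizeof(OptixSRTMotionTransform)' already accounts for two SRT keys, which
          // is why only 'max(ob->motion.size(), 2) - 2' additional keys were allocated
          // above. 'transform_motion_decompose' packs the rotation quaternion into
          // 'x', the translation into 'y.xyz' and the scale/shear matrix across the
          // remaining components, as the per-field comments below spell out; OptiX
          // then interpolates these SRT keys over the motion time range.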

          for (size_t i = 0; i < ob->motion.size(); ++i) {
            // Scale
            srt_data[i].sx = decomp[i].y.w;  // scale.x.x
            srt_data[i].sy = decomp[i].z.w;  // scale.y.y
            srt_data[i].sz = decomp[i].w.w;  // scale.z.z

            // Shear
            srt_data[i].a = decomp[i].z.x;  // scale.x.y
            srt_data[i].b = decomp[i].z.y;  // scale.x.z
            srt_data[i].c = decomp[i].w.x;  // scale.y.z
            assert(decomp[i].z.z == 0.0f);  // scale.y.x
            assert(decomp[i].w.y == 0.0f);  // scale.z.x
            assert(decomp[i].w.z == 0.0f);  // scale.z.y

            // Pivot point
            srt_data[i].pvx = 0.0f;
            srt_data[i].pvy = 0.0f;
            srt_data[i].pvz = 0.0f;

            // Rotation
            srt_data[i].qx = decomp[i].x.x;
            srt_data[i].qy = decomp[i].x.y;
            srt_data[i].qz = decomp[i].x.z;
            srt_data[i].qw = decomp[i].x.w;

            // Translation
            srt_data[i].tx = decomp[i].y.x;
            srt_data[i].ty = decomp[i].y.y;
            srt_data[i].tz = decomp[i].y.z;
          }

          // Upload motion transform to GPU
          cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
          delete[] reinterpret_cast<uint8_t *>(&motion_transform);

          // Disable instance transform if object uses motion transform already
          instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;

          // Get traversable handle to motion transform
          optixConvertPointerToTraversableHandle(context,
                                                 motion_transform_gpu,
                                                 OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
                                                 &instance.traversableHandle);
        }
        else {
          instance.traversableHandle = handle;

          if (ob->mesh->is_instanced()) {
            // Set transform matrix
            memcpy(instance.transform, &ob->tfm, sizeof(instance.transform));
          }
          else {
            // Disable instance transform if mesh already has it applied to vertex data
            instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
            // Non-instanced objects read their ID from prim_object, so set a high bit
            // to distinguish them from instanced objects
            instance.instanceId |= 0x800000;
          }
        }
      }
    }
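
    // Note: the instance array build input in this OptiX version also expects
    // world-space bounds next to the instance descriptors, which is why 'aabbs'
    // was filled with the object bounds above and is passed along below.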

    // Upload instance descriptions
    aabbs.resize(num_instances);
    aabbs.copy_to_device();
    instances.resize(num_instances);
    instances.copy_to_device();

    // Build top-level acceleration structure (TLAS)
    OptixBuildInput build_input = {};
    build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
    build_input.instanceArray.instances = instances.device_pointer;
    build_input.instanceArray.numInstances = num_instances;
    build_input.instanceArray.aabbs = aabbs.device_pointer;
    build_input.instanceArray.numAabbs = num_instances;

    return build_optix_bvh(build_input, 0, tlas_handle);
  }

  void update_texture_info()
  {
    if (need_texture_info) {
      texture_info.copy_to_device();
      need_texture_info = false;
    }
  }

  void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
  {
    const CUDAContextScope scope(cuda_context);

    for (int i = 0; i < info.cpu_threads; ++i)
      check_result_cuda(
          cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
                       data,
                       data_size));
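
    // Note: 'launch_params' holds one copy of the launch parameters per CPU worker
    // thread, spaced 'launch_params.data_elements' bytes apart, so the loop above
    // patches the same field in every copy.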

    // Set constant memory for CUDA module
    // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert').
    //               Could be removed by moving those functions to filter CUDA module.
    size_t bytes = 0;
    CUdeviceptr mem = 0;
    check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name));
    assert(mem != 0 && bytes == data_size);
    check_result_cuda(cuMemcpyHtoD(mem, data, data_size));
  }

  void mem_alloc(device_memory &mem) override
  {
    if (mem.type == MEM_PIXELS && !background) {
      // Always fall back to no interop for now
      // TODO(pmours): Support OpenGL interop when moving CUDA memory management to common code
      background = true;
    }
    else if (mem.type == MEM_TEXTURE) {
      assert(!"mem_alloc not supported for textures.");
      return;
    }

    generic_alloc(mem);
  }

  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
  {
    CUDAContextScope scope(cuda_context);

    CUdeviceptr device_pointer = 0;
    size_t size = mem.memory_size() + pitch_padding;

    CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
    const char *status = "";

    /* First try allocating in device memory, respecting headroom. We make
     * an exception for texture info. It is small and frequently accessed,
     * so treat it as working memory.
     *
     * If there is not enough room for working memory, we will try to move
     * textures to host memory, assuming the performance impact would have
     * been worse for working memory. */
    bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
    bool is_image = is_texture && (mem.data_height > 1);

    size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;

    size_t total = 0, free = 0;
    cuMemGetInfo(&free, &total);

    /* Move textures to host memory if needed. */
    if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
      move_textures_to_host(size + headroom - free, is_texture);
      cuMemGetInfo(&free, &total);
    }

    /* Allocate in device memory. */
    if (!move_texture_to_host && (size + headroom) < free) {
      mem_alloc_result = cuMemAlloc(&device_pointer, size);
      if (mem_alloc_result == CUDA_SUCCESS) {
        status = " in device memory";
      }
    }

    /* Fall back to mapped host memory if needed and possible. */
    void *shared_pointer = 0;

    if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
      if (mem.shared_pointer) {
        /* Another device already allocated host memory. */
        mem_alloc_result = CUDA_SUCCESS;
        shared_pointer = mem.shared_pointer;
      }
      else if (map_host_used + size < map_host_limit) {
        /* Allocate host memory ourselves. */
        mem_alloc_result = cuMemHostAlloc(
            &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);

        assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
               (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
      }
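
      /* Note: CU_MEMHOSTALLOC_DEVICEMAP makes the allocation directly addressable
       * from the GPU, while CU_MEMHOSTALLOC_WRITECOMBINED favors sequential host
       * writes over host reads, which suits memory the CPU fills once and the
       * device then only reads. */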

      if (mem_alloc_result == CUDA_SUCCESS) {
        cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0);
        map_host_used += size;
        status = " in host memory";
      }
      else {
        status = " failed, out of host memory";
      }
    }
    else if (mem_alloc_result != CUDA_SUCCESS) {
      status = " failed, out of device and host memory";
    }

    if (mem.name) {
      VLOG(1) << "Buffer allocate: " << mem.name << ", "
              << string_human_readable_number(mem.memory_size()) << " bytes. ("
              << string_human_readable_size(mem.memory_size()) << ")" << status;
    }

    if (mem_alloc_result != CUDA_SUCCESS) {
      set_error(string_printf("Buffer allocate %s", status));
      return NULL;
    }

    mem.device_pointer = (device_ptr)device_pointer;
    mem.device_size = size;
    stats.mem_alloc(size);

    if (!mem.device_pointer) {
      return NULL;
    }

    /* Insert into map of allocations. */
    CUDAMem *cmem = &cuda_mem_map[&mem];
    if (shared_pointer != 0) {
      /* Replace host pointer with our host allocation. Only works if
       * CUDA memory layout is the same and has no pitch padding. Also
       * does not work if we move textures to host during a render,
       * since other devices might be using the memory. */

      if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
          mem.host_pointer != shared_pointer) {
        memcpy(shared_pointer, mem.host_pointer, size);

        /* A call to device_memory::host_free() should be preceded by
         * a call to device_memory::device_free() for host memory
         * allocated by a device to be handled properly. Two exceptions
         * are here and a call in CUDADevice::generic_alloc(), where
         * the current host memory can be assumed to be allocated by
         * device_memory::host_alloc(), not by a device. */

        mem.host_free();
        mem.host_pointer = shared_pointer;
      }
      mem.shared_pointer = shared_pointer;
      mem.shared_counter++;
      cmem->use_mapped_host = true;
    }
    else {
      cmem->use_mapped_host = false;
    }

    return cmem;
  }

  void tex_alloc(device_memory &mem)
  {
    CUDAContextScope scope(cuda_context);

    /* General variables for both architectures */
    string bind_name = mem.name;
    size_t dsize = datatype_size(mem.data_type);
    size_t size = mem.memory_size();

    CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
    switch (mem.extension) {
      case EXTENSION_REPEAT:
        address_mode = CU_TR_ADDRESS_MODE_WRAP;
        break;
      case EXTENSION_EXTEND:
        address_mode = CU_TR_ADDRESS_MODE_CLAMP;
        break;
      case EXTENSION_CLIP:
        address_mode = CU_TR_ADDRESS_MODE_BORDER;
        break;
      default:
        assert(0);
        break;
    }

    CUfilter_mode filter_mode;
    if (mem.interpolation == INTERPOLATION_CLOSEST) {
      filter_mode = CU_TR_FILTER_MODE_POINT;
    }
    else {
      filter_mode = CU_TR_FILTER_MODE_LINEAR;
    }

    /* Data Storage */
    if (mem.interpolation == INTERPOLATION_NONE) {
      generic_alloc(mem);
      generic_copy_to(mem);

      // Update data storage pointers in launch parameters
# define KERNEL_TEX(data_type, tex_name) \
    if (strcmp(mem.name, #tex_name) == 0) \
      update_launch_params( \
          mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr));
# include "kernel/kernel_textures.h"
# undef KERNEL_TEX
      return;
    }

    /* Image Texture Storage */
    CUarray_format_enum format;
    switch (mem.data_type) {
      case TYPE_UCHAR:
        format = CU_AD_FORMAT_UNSIGNED_INT8;
        break;
      case TYPE_UINT16:
        format = CU_AD_FORMAT_UNSIGNED_INT16;
        break;
      case TYPE_UINT:
        format = CU_AD_FORMAT_UNSIGNED_INT32;
        break;
      case TYPE_INT:
        format = CU_AD_FORMAT_SIGNED_INT32;
        break;
      case TYPE_FLOAT:
        format = CU_AD_FORMAT_FLOAT;
        break;
      case TYPE_HALF:
        format = CU_AD_FORMAT_HALF;
        break;
      default:
        assert(0);
        return;
    }

    CUDAMem *cmem = NULL;
    CUarray array_3d = NULL;
    size_t src_pitch = mem.data_width * dsize * mem.data_elements;
    size_t dst_pitch = src_pitch;

    if (mem.data_depth > 1) {
      /* 3D texture using array, there is no API for linear memory. */
      CUDA_ARRAY3D_DESCRIPTOR desc;

      desc.Width = mem.data_width;
      desc.Height = mem.data_height;
      desc.Depth = mem.data_depth;
      desc.Format = format;
      desc.NumChannels = mem.data_elements;
      desc.Flags = 0;

      VLOG(1) << "Array 3D allocate: " << mem.name << ", "
              << string_human_readable_number(mem.memory_size()) << " bytes. ("
              << string_human_readable_size(mem.memory_size()) << ")";

      check_result_cuda(cuArray3DCreate(&array_3d, &desc));

      if (!array_3d) {
        return;
      }

      CUDA_MEMCPY3D param;
      memset(&param, 0, sizeof(param));
      param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
      param.dstArray = array_3d;
      param.srcMemoryType = CU_MEMORYTYPE_HOST;
      param.srcHost = mem.host_pointer;
      param.srcPitch = src_pitch;
      param.WidthInBytes = param.srcPitch;
      param.Height = mem.data_height;
      param.Depth = mem.data_depth;

      check_result_cuda(cuMemcpy3D(&param));

      mem.device_pointer = (device_ptr)array_3d;
      mem.device_size = size;
      stats.mem_alloc(size);

      cmem = &cuda_mem_map[&mem];
      cmem->texobject = 0;
      cmem->array = array_3d;
    }
    else if (mem.data_height > 0) {
      /* 2D texture, using pitch aligned linear memory. */
      int alignment = 0;
      check_result_cuda(cuDeviceGetAttribute(
          &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device));
      dst_pitch = align_up(src_pitch, alignment);
      size_t dst_size = dst_pitch * mem.data_height;

      cmem = generic_alloc(mem, dst_size - mem.memory_size());
      if (!cmem) {
        return;
      }

      CUDA_MEMCPY2D param;
      memset(&param, 0, sizeof(param));
      param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      param.dstDevice = mem.device_pointer;
      param.dstPitch = dst_pitch;
      param.srcMemoryType = CU_MEMORYTYPE_HOST;
      param.srcHost = mem.host_pointer;
      param.srcPitch = src_pitch;
      param.WidthInBytes = param.srcPitch;
      param.Height = mem.data_height;

      check_result_cuda(cuMemcpy2DUnaligned(&param));
    }
    else {
      /* 1D texture, using linear memory. */
      cmem = generic_alloc(mem);
      if (!cmem) {
        return;
      }

      check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
    }

    /* Kepler+, bindless textures. */
    int flat_slot = 0;
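    /* Image texture names follow a '__tex_image_..._<slot>' pattern, so the flat
     * slot index into the bindless texture info array is parsed from the numeric
     * suffix of the name below. */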
    if (string_startswith(mem.name, "__tex_image")) {
      int pos = string(mem.name).rfind("_");
      flat_slot = atoi(mem.name + pos + 1);
    }
    else {
      assert(0);
    }

    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

    if (array_3d) {
      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
      resDesc.res.array.hArray = array_3d;
      resDesc.flags = 0;
    }
    else if (mem.data_height > 0) {
      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
      resDesc.res.pitch2D.devPtr = mem.device_pointer;
      resDesc.res.pitch2D.format = format;
      resDesc.res.pitch2D.numChannels = mem.data_elements;
      resDesc.res.pitch2D.height = mem.data_height;
      resDesc.res.pitch2D.width = mem.data_width;
      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
    }
    else {
      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
      resDesc.res.linear.devPtr = mem.device_pointer;
      resDesc.res.linear.format = format;
      resDesc.res.linear.numChannels = mem.data_elements;
      resDesc.res.linear.sizeInBytes = mem.device_size;
    }

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = address_mode;
    texDesc.addressMode[1] = address_mode;
    texDesc.addressMode[2] = address_mode;
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

    check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

    /* Resize once */
    if (flat_slot >= texture_info.size()) {
      /* Allocate some slots in advance, to reduce the number of re-allocations. */
      texture_info.resize(flat_slot + 128);
    }

    /* Set mapping and tag that we need to (re-)upload to device */
    TextureInfo &info = texture_info[flat_slot];
    info.data = (uint64_t)cmem->texobject;
    info.cl_buffer = 0;
    info.interpolation = mem.interpolation;
    info.extension = mem.extension;
    info.width = mem.data_width;
    info.height = mem.data_height;
    info.depth = mem.data_depth;
    need_texture_info = true;
  }

  void mem_copy_to(device_memory &mem) override
  {
    if (mem.type == MEM_PIXELS) {
      assert(!"mem_copy_to not supported for pixels.");
    }
    else if (mem.type == MEM_TEXTURE) {
      tex_free(mem);
      tex_alloc(mem);
    }
    else {
      if (!mem.device_pointer) {
        generic_alloc(mem);
      }

      generic_copy_to(mem);
    }
  }

  void generic_copy_to(device_memory &mem)
  {
    if (mem.host_pointer && mem.device_pointer) {
      CUDAContextScope scope(cuda_context);

      /* If use_mapped_host of mem is false, the current device only
       * uses device memory allocated by cuMemAlloc regardless of
       * mem.host_pointer and mem.shared_pointer, and should copy
       * data from mem.host_pointer. */

      if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
        check_result_cuda(
            cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
      }
    }
  }

  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
  {
    if (mem.type == MEM_PIXELS && !background) {
      assert(!"mem_copy_from not supported for pixels.");
    }
    else if (mem.type == MEM_TEXTURE) {
      assert(!"mem_copy_from not supported for textures.");
    }
    else {
      // Calculate linear memory offset and size
      const size_t size = elem * w * h;
      const size_t offset = elem * y * w;

      if (mem.host_pointer && mem.device_pointer) {
        const CUDAContextScope scope(cuda_context);
        check_result_cuda(cuMemcpyDtoH(
            (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
      }
      else if (mem.host_pointer) {
        memset((char *)mem.host_pointer + offset, 0, size);
      }
    }
  }

  void mem_zero(device_memory &mem) override
  {
    if (mem.host_pointer)
      memset(mem.host_pointer, 0, mem.memory_size());

    if (!mem.device_pointer)
      mem_alloc(mem);  // Need to allocate memory first if it does not exist yet

    /* If use_mapped_host of mem is false, mem.device_pointer currently
     * refers to device memory regardless of mem.host_pointer and
     * mem.shared_pointer. */

    if (mem.device_pointer &&
        (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer)) {
      const CUDAContextScope scope(cuda_context);
      check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
    }
  }

  void mem_free(device_memory &mem) override
  {
    if (mem.type == MEM_PIXELS && !background) {
      assert(!"mem_free not supported for pixels.");
    }
    else if (mem.type == MEM_TEXTURE) {
      tex_free(mem);
    }
    else {
      generic_free(mem);
    }
  }

  void generic_free(device_memory &mem)
  {
    if (mem.device_pointer) {
      CUDAContextScope scope(cuda_context);
      const CUDAMem &cmem = cuda_mem_map[&mem];

      /* If cmem.use_mapped_host is true, reference counting is used
       * to safely free a mapped host memory. */

      if (cmem.use_mapped_host) {
        assert(mem.shared_pointer);
        if (mem.shared_pointer) {
          assert(mem.shared_counter > 0);
          if (--mem.shared_counter == 0) {
            if (mem.host_pointer == mem.shared_pointer) {
              mem.host_pointer = 0;
            }
            cuMemFreeHost(mem.shared_pointer);
            mem.shared_pointer = 0;
          }
        }
        map_host_used -= mem.device_size;
      }
      else {
        /* Free device memory. */
        cuMemFree(mem.device_pointer);
      }

      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

      cuda_mem_map.erase(cuda_mem_map.find(&mem));
    }
  }

  void tex_free(device_memory &mem)
  {
    if (mem.device_pointer) {
      CUDAContextScope scope(cuda_context);
      const CUDAMem &cmem = cuda_mem_map[&mem];

      if (cmem.texobject) {
        /* Free bindless texture. */
        cuTexObjectDestroy(cmem.texobject);
      }

      if (cmem.array) {
        /* Free array. */
        cuArrayDestroy(cmem.array);
        stats.mem_free(mem.device_size);
        mem.device_pointer = 0;
        mem.device_size = 0;

        cuda_mem_map.erase(cuda_mem_map.find(&mem));
      }
      else {
        generic_free(mem);
      }
    }
  }

  void move_textures_to_host(size_t size, bool for_texture)
  {
    /* Signal to reallocate textures in host memory only. */
    move_texture_to_host = true;

    while (size > 0) {
      /* Find suitable memory allocation to move. */
      device_memory *max_mem = NULL;
      size_t max_size = 0;
      bool max_is_image = false;

      foreach (auto &pair, cuda_mem_map) {
        device_memory &mem = *pair.first;
        CUDAMem *cmem = &pair.second;

        bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
        bool is_image = is_texture && (mem.data_height > 1);

        /* Can't move this type of memory. */
        if (!is_texture || cmem->array) {
          continue;
        }

        /* Already in host memory. */
        if (cmem->use_mapped_host) {
          continue;
        }

        /* For other textures, only move image textures. */
        if (for_texture && !is_image) {
          continue;
        }

        /* Try to move largest allocation, prefer moving images. */
        if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
          max_is_image = is_image;
          max_size = mem.device_size;
          max_mem = &mem;
        }
      }
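
      /* Note: re-allocating below while 'move_texture_to_host' is set makes
       * generic_alloc() skip the device-memory path, so tex_alloc() places the
       * data in mapped host memory instead. */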
      /* Move to host memory. This part is mutex protected since
       * multiple CUDA devices could be moving the memory. The
       * first one will do it, and the rest will adopt the pointer. */
      if (max_mem) {
        VLOG(1) << "Move memory from device to host: " << max_mem->name;

        static thread_mutex move_mutex;
        thread_scoped_lock lock(move_mutex);

        /* Preserve the original device pointer, in case of multi device
         * we can't change it because the pointer mapping would break. */
        device_ptr prev_pointer = max_mem->device_pointer;
        size_t prev_size = max_mem->device_size;

        tex_free(*max_mem);
        tex_alloc(*max_mem);
        size = (max_size >= size) ? 0 : size - max_size;

        max_mem->device_pointer = prev_pointer;
        max_mem->device_size = prev_size;
      }
      else {
        break;
      }
    }

    /* Update texture info array with new pointers. */
    update_texture_info();

    move_texture_to_host = false;
  }

  void const_copy_to(const char *name, void *host, size_t size) override
  {
    if (strcmp(name, "__data") == 0) {
      assert(size <= sizeof(KernelData));

      // Fix traversable handle on multi devices
      KernelData *const data = (KernelData *)host;
      *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
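      // Each OptiX device builds its own TLAS, so the scene handle in the KernelData
      // copied in from the host is replaced here with this device's handle.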

      update_launch_params(name, offsetof(KernelParams, data), host, size);
    }
  }

  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
  {
    return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
  }

  void task_add(DeviceTask &task) override
  {
    // Upload texture information to device if it has changed since last launch
    update_texture_info();

    // Split task into smaller ones
    list<DeviceTask> tasks;
    task.split(tasks, info.cpu_threads);

    // Queue tasks in internal task pool
    struct OptiXDeviceTask : public DeviceTask {
      OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task)
      {
        // Using the task index parameter instead of the thread index, since the number of
        // CUDA streams may differ from the number of threads
        run = function_bind(&OptiXDevice::thread_run, device, *this, task_index);
      }
    };

    int task_index = 0;
    for (DeviceTask &task : tasks)
      task_pool.push(new OptiXDeviceTask(this, task, task_index++));
  }

  void task_wait() override
  {
    // Wait for all queued tasks to finish
    task_pool.wait_work();
  }

  void task_cancel() override
  {
    // Cancel any remaining tasks in the internal pool
    task_pool.cancel();
  }

  bool denoising_non_local_means(device_ptr image_ptr,
                                 device_ptr guide_ptr,
                                 device_ptr variance_ptr,
                                 device_ptr out_ptr,
                                 DenoisingTask *task,
                                 int thread_index)
  {
    if (have_error())
      return false;

    int stride = task->buffer.stride;
    int w = task->buffer.width;
    int h = task->buffer.h;
    int r = task->nlm_state.r;
    int f = task->nlm_state.f;
    float a = task->nlm_state.a;
    float k_2 = task->nlm_state.k_2;

    int pass_stride = task->buffer.pass_stride;
    int num_shifts = (2 * r + 1) * (2 * r + 1);
    int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
    int frame_offset = 0;

    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
    CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
    CUdeviceptr scale_ptr = 0;
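
    // The non-local means filter below runs as a fixed kernel sequence: per-shift
    // differences are computed, blurred into patch distances, converted to weights,
    // blurred again, accumulated into the output, and finally normalized by the
    // accumulated weights.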

    check_result_cuda_ret(
        cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
    check_result_cuda_ret(
        cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));

    {
      CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
      check_result_cuda_ret(
          cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output"));

      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));

      CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);

      void *calc_difference_args[] = {&guide_ptr,
                                      &variance_ptr,
                                      &scale_ptr,
                                      &difference,
                                      &w,
                                      &h,
                                      &stride,
                                      &pass_stride,
                                      &r,
                                      &channel_offset,
                                      &frame_offset,
                                      &a,
                                      &k_2};
      void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
      void *calc_weight_args[] = {
          &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
      void *update_output_args[] = {&blurDifference,
                                    &image_ptr,
                                    &out_ptr,
                                    &weightAccum,
                                    &w,
                                    &h,
                                    &stride,
                                    &pass_stride,
                                    &channel_offset,
                                    &r,
                                    &f};

      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
    }

    {
      CUfunction cuNLMNormalize;
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize"));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
      void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
      CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
      CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
      check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
    }

    return !have_error();
  }

  bool denoising_construct_transform(DenoisingTask *task, int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterConstructTransform;
    check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform,
                                              cuda_filter_module,
                                              "kernel_cuda_filter_construct_transform"));
    check_result_cuda_ret(
        cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
    CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);

    void *args[] = {&task->buffer.mem.device_pointer,
                    &task->tile_info_mem.device_pointer,
                    &task->storage.transform.device_pointer,
                    &task->storage.rank.device_pointer,
                    &task->filter_area,
                    &task->rect,
                    &task->radius,
                    &task->pca_threshold,
                    &task->buffer.pass_stride,
                    &task->buffer.frame_stride,
                    &task->buffer.use_time};
    CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
    check_result_cuda_ret(cuCtxSynchronize());

    return !have_error();
  }

  bool denoising_accumulate(device_ptr color_ptr,
                            device_ptr color_variance_ptr,
                            device_ptr scale_ptr,
                            int frame,
                            DenoisingTask *task,
                            int thread_index)
  {
    if (have_error())
      return false;

    int r = task->radius;
    int f = 4;
    float a = 1.0f;
    float k_2 = task->nlm_k_2;

    int w = task->reconstruction_state.source_w;
    int h = task->reconstruction_state.source_h;
    int stride = task->buffer.stride;
    int frame_offset = frame * task->buffer.frame_stride;
    int t = task->tile_info->frames[frame];

    int pass_stride = task->buffer.pass_stride;
    int num_shifts = (2 * r + 1) * (2 * r + 1);

    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
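
    // Unlike denoising_non_local_means() above, the NLM weights computed here are not
    // applied to the image directly; the construct_gramian kernel folds them into the
    // normal equations (storage.XtWX and storage.XtWY) that denoising_solve() later
    // solves per pixel.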

    CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
    check_result_cuda_ret(
        cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
    check_result_cuda_ret(cuModuleGetFunction(
        &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
    check_result_cuda_ret(cuModuleGetFunction(
        &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian"));

    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
    check_result_cuda_ret(
        cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));

    CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
                          task->reconstruction_state.source_w *
                              task->reconstruction_state.source_h,
                          num_shifts);

    void *calc_difference_args[] = {&color_ptr,
                                    &color_variance_ptr,
                                    &scale_ptr,
                                    &difference,
                                    &w,
                                    &h,
                                    &stride,
                                    &pass_stride,
                                    &r,
                                    &pass_stride,
                                    &frame_offset,
                                    &a,
                                    &k_2};
    void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
    void *calc_weight_args[] = {
        &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
    void *construct_gramian_args[] = {&t,
                                      &blurDifference,
                                      &task->buffer.mem.device_pointer,
                                      &task->storage.transform.device_pointer,
                                      &task->storage.rank.device_pointer,
                                      &task->storage.XtWX.device_pointer,
                                      &task->storage.XtWY.device_pointer,
                                      &task->reconstruction_state.filter_window,
                                      &w,
                                      &h,
                                      &stride,
                                      &pass_stride,
                                      &r,
                                      &f,
                                      &frame_offset,
                                      &task->buffer.use_time};

    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
    check_result_cuda_ret(cuCtxSynchronize());

    return !have_error();
  }

  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFinalize;
    check_result_cuda_ret(
        cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
    void *finalize_args[] = {&output_ptr,
                             &task->storage.rank.device_pointer,
                             &task->storage.XtWX.device_pointer,
                             &task->storage.XtWY.device_pointer,
                             &task->filter_area,
                             &task->reconstruction_state.buffer_params.x,
                             &task->render_buffer.samples};
    CUDA_GET_BLOCKSIZE(
        cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
    CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_combine_halves(device_ptr a_ptr,
                                device_ptr b_ptr,
                                device_ptr mean_ptr,
                                device_ptr variance_ptr,
                                int r,
                                int4 rect,
                                DenoisingTask *task,
                                int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterCombineHalves;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
    CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_divide_shadow(device_ptr a_ptr,
                               device_ptr b_ptr,
                               device_ptr sample_variance_ptr,
                               device_ptr sv_variance_ptr,
                               device_ptr buffer_variance_ptr,
                               DenoisingTask *task,
                               int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterDivideShadow;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&task->render_buffer.samples,
                    &task->tile_info_mem.device_pointer,
                    &a_ptr,
                    &b_ptr,
                    &sample_variance_ptr,
                    &sv_variance_ptr,
                    &buffer_variance_ptr,
                    &task->rect,
                    &task->render_buffer.pass_stride,
                    &task->render_buffer.offset};
    CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_get_feature(int mean_offset,
                             int variance_offset,
                             device_ptr mean_ptr,
                             device_ptr variance_ptr,
                             float scale,
                             DenoisingTask *task,
                             int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterGetFeature;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&task->render_buffer.samples,
                    &task->tile_info_mem.device_pointer,
                    &mean_offset,
                    &variance_offset,
                    &mean_ptr,
                    &variance_ptr,
                    &scale,
                    &task->rect,
                    &task->render_buffer.pass_stride,
                    &task->render_buffer.offset};
    CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_write_feature(int out_offset,
                               device_ptr from_ptr,
                               device_ptr buffer_ptr,
                               DenoisingTask *task,
                               int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterWriteFeature;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);

    void *args[] = {&task->render_buffer.samples,
                    &task->reconstruction_state.buffer_params,
                    &task->filter_area,
                    &from_ptr,
                    &buffer_ptr,
                    &out_offset,
                    &task->rect};
    CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_detect_outliers(device_ptr image_ptr,
                                 device_ptr variance_ptr,
                                 device_ptr depth_ptr,
                                 device_ptr output_ptr,
                                 DenoisingTask *task,
                                 int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterDetectOutliers;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&image_ptr,
                    &variance_ptr,
                    &depth_ptr,
                    &output_ptr,
                    &task->rect,
                    &task->buffer.pass_stride};

    CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }
};

bool device_optix_init()
{
  if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
    return true;  // Already initialized function table

  // Need to initialize CUDA as well
  if (!device_cuda_init())
    return false;

# ifdef WITH_CUDA_DYNLOAD
  // Load NVRTC function pointers for adaptive kernel compilation
  if (DebugFlags().cuda.adaptive_compile && cuewInit(CUEW_INIT_NVRTC) != CUEW_SUCCESS) {
    VLOG(1)
        << "CUEW initialization failed for NVRTC. Adaptive kernel compilation won't be available.";
  }
# endif
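
  // 'optixInit' (from 'optix_stubs.h') fills 'g_optixFunctionTable' with entry points
  // exported by the installed NVIDIA driver, so no separate OptiX runtime library
  // needs to ship with Cycles.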

  const OptixResult result = optixInit();

  if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
    VLOG(1) << "OptiX initialization failed because driver does not support ABI version "
            << OPTIX_ABI_VERSION;
    return false;
  }
  else if (result != OPTIX_SUCCESS) {
    VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
    return false;
  }

  // Loaded OptiX successfully!
  return true;
}

void device_optix_info(vector<DeviceInfo> &devices)
{
  // Simply add all supported CUDA devices as OptiX devices again
  vector<DeviceInfo> cuda_devices;
  device_cuda_info(cuda_devices);

  for (auto it = cuda_devices.begin(); it != cuda_devices.end();) {
    DeviceInfo &info = *it;
    assert(info.type == DEVICE_CUDA);
    info.type = DEVICE_OPTIX;
    info.id += "_OptiX";

    // Figure out RTX support
    CUdevice cuda_device = 0;
    CUcontext cuda_context = NULL;
    unsigned int rtcore_version = 0;
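    // OPTIX_DEVICE_PROPERTY_RTCORE_VERSION reports zero for GPUs without RT cores,
    // which the check further down uses to filter out non-RTX boards.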
    if (cuDeviceGet(&cuda_device, info.num) == CUDA_SUCCESS &&
        cuDevicePrimaryCtxRetain(&cuda_context, cuda_device) == CUDA_SUCCESS) {
      OptixDeviceContext optix_context = NULL;
      if (optixDeviceContextCreate(cuda_context, nullptr, &optix_context) == OPTIX_SUCCESS) {
        optixDeviceContextGetProperty(optix_context,
                                      OPTIX_DEVICE_PROPERTY_RTCORE_VERSION,
                                      &rtcore_version,
                                      sizeof(rtcore_version));
        optixDeviceContextDestroy(optix_context);
      }
      cuDevicePrimaryCtxRelease(cuda_device);
    }

    // Only add devices with RTX support
    if (rtcore_version == 0)
      it = cuda_devices.erase(it);
    else
      ++it;
  }

  devices.insert(devices.end(), cuda_devices.begin(), cuda_devices.end());
}

Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
  return new OptiXDevice(info, stats, profiler, background);
}

CCL_NAMESPACE_END

#endif