Cycles: Add OptiX acceleration structure compaction

This adds compaction support for OptiX acceleration structures, which reduces the device memory footprint in a post-processing step after building. Depending on the scene this can reduce the amount of device memory used quite a bit and can even improve performance (a smaller acceleration structure improves cache usage). It is only enabled for background renders, to keep acceleration structure builds fast in the viewport.
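
(Illustration only, not part of this patch: the general OptiX compaction flow looks roughly like the sketch below. The helper name 'build_compacted_accel', its parameters and the bare CUDA driver calls are made up for this example; error checking is omitted and an already created OptixDeviceContext plus a filled-in OptixBuildInput are assumed.)

  #include <cuda.h>
  #include <optix.h>
  #include <cstdint>

  // Sketch: build an acceleration structure with compaction allowed and replace it
  // with a compacted copy when that actually saves memory.
  static OptixTraversableHandle build_compacted_accel(OptixDeviceContext context,
                                                      CUstream stream,
                                                      const OptixBuildInput &build_input,
                                                      CUdeviceptr &out_buffer)
  {
    OptixAccelBuildOptions options = {};
    options.operation = OPTIX_BUILD_OPERATION_BUILD;
    options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;

    OptixAccelBufferSizes sizes = {};
    optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes);

    CUdeviceptr temp = 0, compacted_size_ptr = 0;
    cuMemAlloc(&temp, sizes.tempSizeInBytes);
    cuMemAlloc(&out_buffer, sizes.outputSizeInBytes);
    cuMemAlloc(&compacted_size_ptr, sizeof(uint64_t));

    // Ask the build to emit the compacted size into device memory.
    OptixAccelEmitDesc emitted = {};
    emitted.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
    emitted.result = compacted_size_ptr;

    OptixTraversableHandle handle = 0;
    optixAccelBuild(context, stream, &options, &build_input, 1,
                    temp, sizes.tempSizeInBytes,
                    out_buffer, sizes.outputSizeInBytes,
                    &handle, &emitted, 1);
    cuStreamSynchronize(stream);

    uint64_t compacted_size = 0;
    cuMemcpyDtoH(&compacted_size, compacted_size_ptr, sizeof(compacted_size));
    cuMemFree(temp);
    cuMemFree(compacted_size_ptr);

    // Only compact when the result is actually smaller than the original buffer.
    if (compacted_size < sizes.outputSizeInBytes) {
      CUdeviceptr compacted = 0;
      cuMemAlloc(&compacted, compacted_size);
      optixAccelCompact(context, stream, handle, compacted, compacted_size, &handle);
      cuStreamSynchronize(stream);
      cuMemFree(out_buffer);  // The uncompacted buffer is no longer referenced after compaction.
      out_buffer = compacted;
    }
    return handle;
  }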

Also fixes a bug in the memory management for OptiX acceleration structures: these were held in a dynamic vector of 'device_memory' instances and allocated through the mem_alloc/mem_free functions. However, those functions track memory instances in the 'cuda_mem_map' via pointers to 'device_memory', which works fine everywhere else since those objects are never copied or moved. The vector, however, may reallocate its storage at some point, which invalidates those pointers and results in accesses to invalid memory. So it is not actually safe to move a 'device_memory' object, and this therefore removes the move constructor overloads again.
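
(Illustration only, not part of this patch: a minimal standalone sketch of the reallocation hazard described above. 'FakeDeviceMemory' and the remembered raw pointer are stand-ins for 'device_memory' and the corresponding 'cuda_mem_map' entry.)

  #include <cstdio>
  #include <vector>

  struct FakeDeviceMemory {  // Stand-in for 'device_memory'
    int id;
  };

  int main()
  {
    std::vector<FakeDeviceMemory> as_mem;
    as_mem.push_back({0});

    // Something like 'cuda_mem_map' remembers the element by its address.
    const FakeDeviceMemory *tracked = &as_mem.back();

    // Growing the vector may reallocate its storage and move every element.
    for (int i = 1; i < 1000; i++)
      as_mem.push_back({i});

    // The remembered address now likely points into freed storage; dereferencing it
    // would be undefined behavior. Storing plain CUdeviceptr values instead of the
    // objects themselves avoids this.
    std::printf("tracked %p vs current first element %p\n",
                static_cast<const void *>(tracked),
                static_cast<const void *>(&as_mem.front()));
    return 0;
  }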

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D6369
Patrick Mours 2019-12-05 19:17:01 +01:00
parent c9dc57be3a
commit baeb11826b
4 changed files with 101 additions and 77 deletions

@@ -47,31 +47,6 @@ device_memory::~device_memory()
assert(shared_counter == 0);
}
-device_memory::device_memory(device_memory &&other)
-: data_type(other.data_type),
-data_elements(other.data_elements),
-data_size(other.data_size),
-device_size(other.device_size),
-data_width(other.data_width),
-data_height(other.data_height),
-data_depth(other.data_depth),
-type(other.type),
-name(other.name),
-interpolation(other.interpolation),
-extension(other.extension),
-device(other.device),
-device_pointer(other.device_pointer),
-host_pointer(other.host_pointer),
-shared_pointer(other.shared_pointer),
-shared_counter(other.shared_counter)
-{
-other.device_size = 0;
-other.device_pointer = 0;
-other.host_pointer = 0;
-other.shared_pointer = 0;
-other.shared_counter = 0;
-}
void *device_memory::host_alloc(size_t size)
{
if (!size) {

@@ -235,9 +235,6 @@ class device_memory {
device_memory(const device_memory &) = delete;
device_memory &operator=(const device_memory &) = delete;
-/* But moving is possible. */
-device_memory(device_memory &&);
/* Host allocation on the device. All host_pointer memory should be
* allocated with these functions, for devices that support using
* the same pointer for host and device. */
@@ -275,11 +272,6 @@ template<typename T> class device_only_memory : public device_memory {
free();
}
-device_only_memory(device_only_memory &&other)
-: device_memory(static_cast<device_memory &&>(other))
-{
-}
void alloc_to_device(size_t num, bool shrink_to_fit = true)
{
size_t new_size = num;
@@ -338,10 +330,6 @@ template<typename T> class device_vector : public device_memory {
free();
}
-device_vector(device_vector &&other) : device_memory(static_cast<device_memory &&>(other))
-{
-}
/* Host memory allocation. */
T *alloc(size_t width, size_t height = 0, size_t depth = 0)
{

@@ -174,7 +174,7 @@ class OptiXDevice : public Device {
device_vector<SbtRecord> sbt_data;
device_vector<TextureInfo> texture_info;
device_only_memory<KernelParams> launch_params;
-vector<device_only_memory<uint8_t>> as_mem;
+vector<CUdeviceptr> as_mem;
OptixTraversableHandle tlas_handle = 0;
// TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
@@ -269,6 +269,9 @@ class OptiXDevice : public Device {
task_pool.stop();
// Free all acceleration structures
+for (CUdeviceptr mem : as_mem) {
+cuMemFree(mem);
+}
as_mem.clear();
sbt_data.free();
@@ -831,7 +834,6 @@ class OptiXDevice : public Device {
bool build_optix_bvh(const OptixBuildInput &build_input,
uint16_t num_motion_steps,
-device_memory &out_data,
OptixTraversableHandle &out_handle)
{
out_handle = 0;
@@ -842,7 +844,15 @@
OptixAccelBufferSizes sizes = {};
OptixAccelBuildOptions options;
options.operation = OPTIX_BUILD_OPERATION_BUILD;
-options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+if (background) {
+// Prefer best performance and lowest memory consumption in background
+options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+}
+else {
+// Prefer fast updates in viewport
+options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
+}
options.motionOptions.numKeys = num_motion_steps;
options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
options.motionOptions.timeBegin = 0.0f;
@@ -853,31 +863,75 @@ class OptiXDevice : public Device {
// Allocate required output buffers
device_only_memory<char> temp_mem(this, "temp_build_mem");
-temp_mem.alloc_to_device(sizes.tempSizeInBytes);
+temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
if (!temp_mem.device_pointer)
return false; // Make sure temporary memory allocation succeeded
-out_data.type = MEM_DEVICE_ONLY;
-out_data.data_type = TYPE_UNKNOWN;
-out_data.data_elements = 1;
-out_data.data_size = sizes.outputSizeInBytes;
-mem_alloc(out_data);
+// Move textures to host memory if there is not enough room
+size_t size = 0, free = 0;
+cuMemGetInfo(&free, &size);
+size = sizes.outputSizeInBytes + device_working_headroom;
+if (size >= free && can_map_host) {
+move_textures_to_host(size - free, false);
+}
+CUdeviceptr out_data = 0;
+check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+as_mem.push_back(out_data);
// Finally build the acceleration structure
+OptixAccelEmitDesc compacted_size_prop;
+compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+// A tiny space was allocated for this property at the end of the temporary buffer above
+// Make sure this pointer is 8-byte aligned
+compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
check_result_optix_ret(optixAccelBuild(context,
NULL,
&options,
&build_input,
1,
temp_mem.device_pointer,
-sizes.tempSizeInBytes,
-out_data.device_pointer,
+temp_mem.device_size,
+out_data,
sizes.outputSizeInBytes,
&out_handle,
-NULL,
-0));
+&compacted_size_prop,
+1));
// Wait for all operations to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
+// Compact acceleration structure to save memory (do not do this in viewport for faster builds)
+if (background) {
+uint64_t compacted_size = sizes.outputSizeInBytes;
+check_result_cuda_ret(
+cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+// Temporary memory is no longer needed, so free it now to make space
+temp_mem.free();
+// There is no point compacting if the size does not change
+if (compacted_size < sizes.outputSizeInBytes) {
+CUdeviceptr compacted_data = 0;
+if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+// Do not compact if memory allocation for compacted acceleration structure fails
+// Can just use the uncompacted one then, so succeed here regardless
+return true;
+as_mem.push_back(compacted_data);
+check_result_optix_ret(optixAccelCompact(
+context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+// Wait for compaction to finish
+check_result_cuda_ret(cuStreamSynchronize(NULL));
+// Free uncompacted acceleration structure
+cuMemFree(out_data);
+as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array
+}
+}
return true;
}
@@ -889,7 +943,10 @@ class OptiXDevice : public Device {
unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
meshes.reserve(bvh->meshes.size());
-// Free all previous acceleration structure
+// Free all previous acceleration structures
+for (CUdeviceptr mem : as_mem) {
+cuMemFree(mem);
+}
as_mem.clear();
// Build bottom level acceleration structures (BLAS)
@@ -968,9 +1025,8 @@ class OptiXDevice : public Device {
build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
// Allocate memory for new BLAS and build it
as_mem.emplace_back(this, "blas");
handles.emplace_back();
if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
return false;
}
@@ -1034,9 +1090,8 @@ class OptiXDevice : public Device {
build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();
// Allocate memory for new BLAS and build it
as_mem.emplace_back(this, "blas");
handles.emplace_back();
if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
return false;
}
@@ -1081,15 +1136,17 @@ class OptiXDevice : public Device {
// Insert motion traversable if object has motion
if (motion_blur && ob->use_motion()) {
as_mem.emplace_back(this, "motion_transform");
device_only_memory<uint8_t> &motion_transform_gpu = as_mem.back();
motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) +
(max(ob->motion.size(), 2) - 2) *
sizeof(OptixSRTData));
size_t motion_keys = max(ob->motion.size(), 2) - 2;
size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
motion_keys * sizeof(OptixSRTData);
CUdeviceptr motion_transform_gpu = 0;
check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
as_mem.push_back(motion_transform_gpu);
// Allocate host side memory for motion transform and fill it with transform data
OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
-motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]);
+new uint8_t[motion_transform_size]);
motion_transform.child = handle;
motion_transform.motionOptions.numKeys = ob->motion.size();
motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
@@ -1101,38 +1158,43 @@ class OptiXDevice : public Device {
transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
for (size_t i = 0; i < ob->motion.size(); ++i) {
-// scaling
-srt_data[i].a = decomp[i].z.x; // scale.x.y
-srt_data[i].b = decomp[i].z.y; // scale.x.z
-srt_data[i].c = decomp[i].w.x; // scale.y.z
+// Scale
srt_data[i].sx = decomp[i].y.w; // scale.x.x
srt_data[i].sy = decomp[i].z.w; // scale.y.y
srt_data[i].sz = decomp[i].w.w; // scale.z.z
-srt_data[i].pvx = 0;
-srt_data[i].pvy = 0;
-srt_data[i].pvz = 0;
-// rotation
+// Shear
+srt_data[i].a = decomp[i].z.x; // scale.x.y
+srt_data[i].b = decomp[i].z.y; // scale.x.z
+srt_data[i].c = decomp[i].w.x; // scale.y.z
+// Pivot point
+srt_data[i].pvx = 0.0f;
+srt_data[i].pvy = 0.0f;
+srt_data[i].pvz = 0.0f;
+// Rotation
srt_data[i].qx = decomp[i].x.x;
srt_data[i].qy = decomp[i].x.y;
srt_data[i].qz = decomp[i].x.z;
srt_data[i].qw = decomp[i].x.w;
-// transform
+// Translation
srt_data[i].tx = decomp[i].y.x;
srt_data[i].ty = decomp[i].y.y;
srt_data[i].tz = decomp[i].y.z;
}
// Upload motion transform to GPU
-mem_copy_to(motion_transform_gpu);
-delete[] reinterpret_cast<uint8_t *>(motion_transform_gpu.host_pointer);
-motion_transform_gpu.host_pointer = 0;
+cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+delete[] reinterpret_cast<uint8_t *>(&motion_transform);
// Disable instance transform if object uses motion transform already
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
// Get traversable handle to motion transform
optixConvertPointerToTraversableHandle(context,
-motion_transform_gpu.device_pointer,
+motion_transform_gpu,
OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
&instance.traversableHandle);
}
@@ -1168,8 +1230,7 @@ class OptiXDevice : public Device {
build_input.instanceArray.aabbs = aabbs.device_pointer;
build_input.instanceArray.numAabbs = num_instances;
as_mem.emplace_back(this, "tlas");
return build_optix_bvh(build_input, 0, as_mem.back(), tlas_handle);
return build_optix_bvh(build_input, 0, tlas_handle);
}
void update_texture_info()

@@ -53,7 +53,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
for (int sample = 0; sample < num_samples; sample++) {
-float disk_u = 0.0f, disk_v = 0.0f;
+float disk_u, disk_v;
path_branched_rng_2D(
kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);