forked from bartvdbraak/blender
Cycles: Change device-only memory to actually only allocate on the device
This patch changes the `MEM_DEVICE_ONLY` type so that it allocates only on the device and fails with an out-of-memory error when that is not possible (since OptiX acceleration structures may not be allocated in host memory). It also fixes high peak memory usage during OptiX acceleration structure building. Reviewed By: brecht Maniphest Tasks: T85985 Differential Revision: https://developer.blender.org/D10535
This commit is contained in:
parent
ba996ddb3a
commit
f4f8b6dde3
@ -27,8 +27,8 @@ BVHOptiX::BVHOptiX(const BVHParams &params_,
|
|||||||
Device *device)
|
Device *device)
|
||||||
: BVH(params_, geometry_, objects_),
|
: BVH(params_, geometry_, objects_),
|
||||||
traversable_handle(0),
|
traversable_handle(0),
|
||||||
as_data(device, params_.top_level ? "optix tlas" : "optix blas"),
|
as_data(device, params_.top_level ? "optix tlas" : "optix blas", false),
|
||||||
motion_transform_data(device, "optix motion transform")
|
motion_transform_data(device, "optix motion transform", false)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -854,7 +854,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
|
|||||||
|
|
||||||
void *shared_pointer = 0;
|
void *shared_pointer = 0;
|
||||||
|
|
||||||
if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
|
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
|
||||||
if (mem.shared_pointer) {
|
if (mem.shared_pointer) {
|
||||||
/* Another device already allocated host memory. */
|
/* Another device already allocated host memory. */
|
||||||
mem_alloc_result = CUDA_SUCCESS;
|
mem_alloc_result = CUDA_SUCCESS;
|
||||||
@ -877,8 +877,14 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (mem_alloc_result != CUDA_SUCCESS) {
|
if (mem_alloc_result != CUDA_SUCCESS) {
|
||||||
status = " failed, out of device and host memory";
|
if (mem.type == MEM_DEVICE_ONLY) {
|
||||||
set_error("System is out of GPU and shared host memory");
|
status = " failed, out of device memory";
|
||||||
|
set_error("System is out of GPU memory");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
status = " failed, out of device and host memory";
|
||||||
|
set_error("System is out of GPU and shared host memory");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mem.name) {
|
if (mem.name) {
|
||||||
|
@ -396,8 +396,7 @@ class CPUDevice : public Device {
|
|||||||
<< string_human_readable_size(mem.memory_size()) << ")";
|
<< string_human_readable_size(mem.memory_size()) << ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mem.type == MEM_DEVICE_ONLY) {
|
if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
|
||||||
assert(!mem.host_pointer);
|
|
||||||
size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
|
size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
|
||||||
void *data = util_aligned_malloc(mem.memory_size(), alignment);
|
void *data = util_aligned_malloc(mem.memory_size(), alignment);
|
||||||
mem.device_pointer = (device_ptr)data;
|
mem.device_pointer = (device_ptr)data;
|
||||||
@ -459,7 +458,7 @@ class CPUDevice : public Device {
|
|||||||
tex_free((device_texture &)mem);
|
tex_free((device_texture &)mem);
|
||||||
}
|
}
|
||||||
else if (mem.device_pointer) {
|
else if (mem.device_pointer) {
|
||||||
if (mem.type == MEM_DEVICE_ONLY) {
|
if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
|
||||||
util_aligned_free((void *)mem.device_pointer);
|
util_aligned_free((void *)mem.device_pointer);
|
||||||
}
|
}
|
||||||
mem.device_pointer = 0;
|
mem.device_pointer = 0;
|
||||||
|
@ -171,7 +171,8 @@ class DenoisingTask {
|
|||||||
bool gpu_temporary_mem;
|
bool gpu_temporary_mem;
|
||||||
|
|
||||||
DenoiseBuffers(Device *device)
|
DenoiseBuffers(Device *device)
|
||||||
: mem(device, "denoising pixel buffer"), temporary_mem(device, "denoising temporary mem")
|
: mem(device, "denoising pixel buffer"),
|
||||||
|
temporary_mem(device, "denoising temporary mem", true)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
} buffer;
|
} buffer;
|
||||||
|
@ -270,8 +270,8 @@ class device_memory {
|
|||||||
|
|
||||||
template<typename T> class device_only_memory : public device_memory {
|
template<typename T> class device_only_memory : public device_memory {
|
||||||
public:
|
public:
|
||||||
device_only_memory(Device *device, const char *name)
|
device_only_memory(Device *device, const char *name, bool allow_host_memory_fallback = false)
|
||||||
: device_memory(device, name, MEM_DEVICE_ONLY)
|
: device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
|
||||||
{
|
{
|
||||||
data_type = device_type_traits<T>::data_type;
|
data_type = device_type_traits<T>::data_type;
|
||||||
data_elements = max(device_type_traits<T>::num_elements, 1);
|
data_elements = max(device_type_traits<T>::num_elements, 1);
|
||||||
|
@ -197,8 +197,8 @@ class OptiXDevice : public CUDADevice {
|
|||||||
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
|
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
|
||||||
: CUDADevice(info_, stats_, profiler_, background_),
|
: CUDADevice(info_, stats_, profiler_, background_),
|
||||||
sbt_data(this, "__sbt", MEM_READ_ONLY),
|
sbt_data(this, "__sbt", MEM_READ_ONLY),
|
||||||
launch_params(this, "__params"),
|
launch_params(this, "__params", false),
|
||||||
denoiser_state(this, "__denoiser_state")
|
denoiser_state(this, "__denoiser_state", true)
|
||||||
{
|
{
|
||||||
// Store number of CUDA streams in device info
|
// Store number of CUDA streams in device info
|
||||||
info.cpu_threads = DebugFlags().optix.cuda_streams;
|
info.cpu_threads = DebugFlags().optix.cuda_streams;
|
||||||
@ -878,8 +878,8 @@ class OptiXDevice : public CUDADevice {
|
|||||||
device_ptr input_ptr = rtile.buffer + pixel_offset;
|
device_ptr input_ptr = rtile.buffer + pixel_offset;
|
||||||
|
|
||||||
// Copy tile data into a common buffer if necessary
|
// Copy tile data into a common buffer if necessary
|
||||||
device_only_memory<float> input(this, "denoiser input");
|
device_only_memory<float> input(this, "denoiser input", true);
|
||||||
device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE);
|
device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
|
||||||
|
|
||||||
bool contiguous_memory = true;
|
bool contiguous_memory = true;
|
||||||
for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
|
for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
|
||||||
@ -924,7 +924,7 @@ class OptiXDevice : public CUDADevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
||||||
device_only_memory<float> input_rgb(this, "denoiser input rgb");
|
device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
|
||||||
input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
|
input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
|
||||||
|
|
||||||
void *input_args[] = {&input_rgb.device_pointer,
|
void *input_args[] = {&input_rgb.device_pointer,
|
||||||
@ -1146,6 +1146,13 @@ class OptiXDevice : public CUDADevice {
|
|||||||
const OptixBuildInput &build_input,
|
const OptixBuildInput &build_input,
|
||||||
uint16_t num_motion_steps)
|
uint16_t num_motion_steps)
|
||||||
{
|
{
|
||||||
|
/* Allocate and build acceleration structures only one at a time, to prevent parallel builds
|
||||||
|
* from running out of memory (since both original and compacted acceleration structure memory
|
||||||
|
* may be allocated at the same time for the duration of this function). The builds would
|
||||||
|
* otherwise happen on the same CUDA stream anyway. */
|
||||||
|
static thread_mutex mutex;
|
||||||
|
thread_scoped_lock lock(mutex);
|
||||||
|
|
||||||
const CUDAContextScope scope(cuContext);
|
const CUDAContextScope scope(cuContext);
|
||||||
|
|
||||||
// Compute memory usage
|
// Compute memory usage
|
||||||
@ -1170,11 +1177,12 @@ class OptiXDevice : public CUDADevice {
|
|||||||
optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
|
optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
|
||||||
|
|
||||||
// Allocate required output buffers
|
// Allocate required output buffers
|
||||||
device_only_memory<char> temp_mem(this, "optix temp as build mem");
|
device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
|
||||||
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
|
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
|
||||||
if (!temp_mem.device_pointer)
|
if (!temp_mem.device_pointer)
|
||||||
return false; // Make sure temporary memory allocation succeeded
|
return false; // Make sure temporary memory allocation succeeded
|
||||||
|
|
||||||
|
// Acceleration structure memory has to be allocated on the device (not allowed to be on host)
|
||||||
device_only_memory<char> &out_data = bvh->as_data;
|
device_only_memory<char> &out_data = bvh->as_data;
|
||||||
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
|
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
|
||||||
assert(out_data.device == this);
|
assert(out_data.device == this);
|
||||||
@ -1222,7 +1230,7 @@ class OptiXDevice : public CUDADevice {
|
|||||||
|
|
||||||
// There is no point compacting if the size does not change
|
// There is no point compacting if the size does not change
|
||||||
if (compacted_size < sizes.outputSizeInBytes) {
|
if (compacted_size < sizes.outputSizeInBytes) {
|
||||||
device_only_memory<char> compacted_data(this, "optix compacted as");
|
device_only_memory<char> compacted_data(this, "optix compacted as", false);
|
||||||
compacted_data.alloc_to_device(compacted_size);
|
compacted_data.alloc_to_device(compacted_size);
|
||||||
if (!compacted_data.device_pointer)
|
if (!compacted_data.device_pointer)
|
||||||
// Do not compact if memory allocation for compacted acceleration structure fails
|
// Do not compact if memory allocation for compacted acceleration structure fails
|
||||||
@ -1242,6 +1250,7 @@ class OptiXDevice : public CUDADevice {
|
|||||||
|
|
||||||
std::swap(out_data.device_size, compacted_data.device_size);
|
std::swap(out_data.device_size, compacted_data.device_size);
|
||||||
std::swap(out_data.device_pointer, compacted_data.device_pointer);
|
std::swap(out_data.device_pointer, compacted_data.device_pointer);
|
||||||
|
// Original acceleration structure memory is freed when 'compacted_data' goes out of scope
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user