blender/intern/cycles/integrator/path_trace_work_gpu.cpp

/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "integrator/path_trace_work_gpu.h"
#include "integrator/path_trace_display.h"
#include "device/device.h"
#include "integrator/pass_accessor_gpu.h"
#include "render/buffers.h"
#include "render/scene.h"
#include "util/util_logging.h"
#include "util/util_string.h"
#include "util/util_tbb.h"
#include "util/util_time.h"
#include "kernel/kernel_types.h"
CCL_NAMESPACE_BEGIN
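/* Estimate the size of a single integrator path state by expanding the SoA state template below.
 * Each KERNEL_STRUCT_MEMBER / KERNEL_STRUCT_ARRAY_MEMBER occurrence adds sizeof(type) to the
 * running total, so the result is the combined byte size of all struct members for one path.
 * For example (purely illustrative, the real members come from integrator_state_template.h),
 * a member declared as
 *
 *   KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 *
 * contributes sizeof(uint32_t) = 4 bytes to the estimate. */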
static size_t estimate_single_state_size(DeviceScene *device_scene)
{
size_t state_size = 0;
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
#define KERNEL_STRUCT_END(name) \
break; \
}
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
if (array_index == gpu_array_size - 1) { \
break; \
} \
}
#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene->data.volume_stack_size)
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
return state_size;
}
PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
Film *film,
DeviceScene *device_scene,
bool *cancel_requested_flag)
: PathTraceWork(device, film, device_scene, cancel_requested_flag),
queue_(device->gpu_queue_create()),
integrator_state_soa_kernel_features_(0),
integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
integrator_shader_raytrace_sort_counter_(
device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
integrator_next_shadow_catcher_path_index_(
device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
queued_paths_(device, "queued_paths", MEM_READ_WRITE),
num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
work_tiles_(device, "work_tiles", MEM_READ_WRITE),
display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size(device_scene))),
min_num_active_paths_(queue_->num_concurrent_busy_states()),
max_active_path_index_(0)
{
memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
/* Limit the number of active paths to half of the overall state. This is required by the logic
 * in the path compaction, which relies on the fact that path regeneration does not happen before
 * at least half of the states are available again. */
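/* Illustrative example (hypothetical numbers): with max_num_paths_ = 1M and a device reporting
 * num_concurrent_busy_states() = 700K, min_num_active_paths_ is clamped down to 500K. New tiles
 * are then only scheduled once the number of active paths drops below 500K, i.e. once at least
 * half of the state array is free again. */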
min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
}
void PathTraceWorkGPU::alloc_integrator_soa()
{
/* IntegrateState allocated as structure of arrays. */
/* Check if we already allocated memory for the required features. */
const uint kernel_features = device_scene_->data.kernel_features;
if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
return;
}
integrator_state_soa_kernel_features_ = kernel_features;
/* Allocate a device-only memory buffer for each struct member, and then
* write the pointers into a struct that resides in constant memory.
*
* TODO: store float3 in separate XYZ arrays. */
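/* For reference, a member such as KERNEL_STRUCT_MEMBER(path, uint32_t, flag, feature) expands
 * roughly to (illustrative, member names are examples only):
 *
 *   if ((kernel_features & feature) && (integrator_state_gpu_.path.flag == nullptr)) {
 *     device_only_memory<uint32_t> *array = new device_only_memory<uint32_t>(
 *         device_, "integrator_state_flag");
 *     array->alloc_to_device(max_num_paths_);
 *     integrator_state_soa_.emplace_back(array);
 *     integrator_state_gpu_.path.flag = (uint32_t *)array->device_pointer;
 *   }
 *
 * so each member becomes one flat device array with max_num_paths_ entries. */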
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
device_only_memory<type> *array = new device_only_memory<type>(device_, \
"integrator_state_" #name); \
array->alloc_to_device(max_num_paths_); \
integrator_state_soa_.emplace_back(array); \
integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
}
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
if ((kernel_features & feature) && \
(integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
device_only_memory<type> *array = new device_only_memory<type>(device_, \
"integrator_state_" #name); \
array->alloc_to_device(max_num_paths_); \
integrator_state_soa_.emplace_back(array); \
integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
}
#define KERNEL_STRUCT_END(name) \
break; \
}
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
if (array_index == gpu_array_size - 1) { \
break; \
} \
}
#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene_->data.volume_stack_size)
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
if (VLOG_IS_ON(3)) {
size_t total_soa_size = 0;
for (auto &&soa_memory : integrator_state_soa_) {
total_soa_size += soa_memory->memory_size();
}
VLOG(3) << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
}
}
void PathTraceWorkGPU::alloc_integrator_queue()
{
if (integrator_queue_counter_.size() == 0) {
integrator_queue_counter_.alloc(1);
integrator_queue_counter_.zero_to_device();
integrator_queue_counter_.copy_from_device();
integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
integrator_queue_counter_.device_pointer;
}
/* Allocate data for active path index arrays. */
if (num_queued_paths_.size() == 0) {
num_queued_paths_.alloc(1);
num_queued_paths_.zero_to_device();
}
if (queued_paths_.size() == 0) {
queued_paths_.alloc(max_num_paths_);
/* TODO: this could be skipped if we had a function to just allocate on device. */
queued_paths_.zero_to_device();
}
}
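/* The sort counters below hold, for each shader index, the number of paths queued for the
 * corresponding shading kernel. compute_sorted_queued_paths() turns these counts into a prefix
 * sum so that paths can be grouped by shader before the shade kernels run, which is intended to
 * improve execution coherence on the GPU. */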
void PathTraceWorkGPU::alloc_integrator_sorting()
{
/* Allocate arrays for shader sorting. */
const int max_shaders = device_scene_->data.max_shaders;
if (integrator_shader_sort_counter_.size() < max_shaders) {
integrator_shader_sort_counter_.alloc(max_shaders);
integrator_shader_sort_counter_.zero_to_device();
integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
void PathTraceWorkGPU::alloc_integrator_path_split()
{
if (integrator_next_shadow_catcher_path_index_.size() != 0) {
return;
}
integrator_next_shadow_catcher_path_index_.alloc(1);
/* TODO(sergey): Use queue? */
integrator_next_shadow_catcher_path_index_.zero_to_device();
integrator_state_gpu_.next_shadow_catcher_path_index =
(int *)integrator_next_shadow_catcher_path_index_.device_pointer;
}
void PathTraceWorkGPU::alloc_work_memory()
{
alloc_integrator_soa();
alloc_integrator_queue();
alloc_integrator_sorting();
alloc_integrator_path_split();
}
void PathTraceWorkGPU::init_execution()
{
queue_->init_execution();
/* Copy to device side struct in constant memory. */
device_->const_copy_to(
"__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
}
void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
int start_sample,
int samples_num)
{
/* Limit the number of states per tile and rely on greedy scheduling of tiles. This allows adding
 * more work (because tiles are smaller, there is a higher chance that more paths will become
 * busy after adding new tiles). This is especially important for the shadow catcher, which
 * schedules work in halves of the available number of paths. */
work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
enqueue_reset();
int num_iterations = 0;
uint64_t num_busy_accum = 0;
/* TODO: set a hard limit in case of undetected kernel failures? */
while (true) {
/* Enqueue work from the scheduler, on start or when there are not enough
* paths to keep the device occupied. */
/* Initialized so that an early return from enqueue_work_tiles() cannot leave it unset. */
bool finished = false;
if (enqueue_work_tiles(finished)) {
/* Copy stats from the device. */
queue_->copy_from_device(integrator_queue_counter_);
if (!queue_->synchronize()) {
break; /* Stop on error. */
}
}
if (is_cancel_requested()) {
break;
}
/* Stop if no more work remaining. */
if (finished) {
break;
}
/* Enqueue one of the path iteration kernels. */
if (enqueue_path_iteration()) {
/* Copy stats from the device. */
queue_->copy_from_device(integrator_queue_counter_);
if (!queue_->synchronize()) {
break; /* Stop on error. */
}
}
if (is_cancel_requested()) {
break;
}
num_busy_accum += get_num_active_paths();
++num_iterations;
}
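/* Occupancy is the average number of busy path states per iteration, expressed as a fraction of
 * the total state count. For example (hypothetical numbers), 100 iterations with an accumulated
 * 50M busy paths and max_num_paths_ = 1M gives an occupancy of 0.5. */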
statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
}
DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
{
const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
int max_num_queued = 0;
DeviceKernel kernel = DEVICE_KERNEL_NUM;
for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
if (queue_counter->num_queued[i] > max_num_queued) {
kernel = (DeviceKernel)i;
max_num_queued = queue_counter->num_queued[i];
}
}
return kernel;
}
void PathTraceWorkGPU::enqueue_reset()
{
void *args[] = {&max_num_paths_};
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
queue_->zero_to_device(integrator_queue_counter_);
queue_->zero_to_device(integrator_shader_sort_counter_);
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
/* Tile enqueueing needs to know the number of active paths, which is based on this counter. Zero
 * the counter on the host side as well, because `zero_to_device()` does not do it. */
if (integrator_queue_counter_.host_pointer) {
memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
}
}
bool PathTraceWorkGPU::enqueue_path_iteration()
{
/* Find kernel to execute, with max number of queued paths. */
const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
int num_active_paths = 0;
for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
num_active_paths += queue_counter->num_queued[i];
}
if (num_active_paths == 0) {
return false;
}
/* Find kernel to execute, with max number of queued paths. */
const DeviceKernel kernel = get_most_queued_kernel();
if (kernel == DEVICE_KERNEL_NUM) {
return false;
}
/* Finish shadows before potentially adding more shadow rays. We can only
* store one shadow ray in the integrator state. */
if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) {
if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
return true;
}
else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
return true;
}
}
/* Schedule kernel with maximum number of queued items. */
enqueue_path_iteration(kernel);
return true;
}
void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
{
void *d_path_index = nullptr;
/* Create array of path indices for which this kernel is queued to be executed. */
int work_size = max_active_path_index_;
IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
int num_queued = queue_counter->num_queued[kernel];
if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
/* Compute array of active paths, sorted by shader. */
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
}
else if (num_queued < work_size) {
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
/* Compute array of active shadow paths for specific kernel. */
compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
}
else {
/* Compute array of active paths for specific kernel. */
compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
}
}
DCHECK_LE(work_size, max_num_paths_);
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
/* Ray intersection kernels with integrator state. */
void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
queue_->enqueue(kernel, work_size, args);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
/* Shading kernels with integrator state and render buffer. */
void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
queue_->enqueue(kernel, work_size, args);
break;
}
default:
LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
<< " used for path iteration, should never happen.";
break;
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
{
int d_queued_kernel = queued_kernel;
void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
assert(d_counter != nullptr);
/* Compute prefix sum of number of active paths with each shader. */
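/* Illustrative example (hypothetical counts): with 3 shaders and per-shader queue counts
 * [4, 1, 3], the prefix sum turns the counter array into starting offsets [0, 4, 5], so paths
 * using shader 0 are written to indices 0..3 of the sorted array, shader 1 to index 4, and
 * shader 2 to indices 5..7. */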
{
const int work_size = 1;
int max_shaders = device_scene_->data.max_shaders;
void *args[] = {&d_counter, &max_shaders};
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
queue_->zero_to_device(num_queued_paths_);
/* Launch kernel to fill the active paths arrays. */
{
/* TODO: this could be smaller for terminated paths, based on the amount of work we want
 * to schedule. */
const int work_size = max_active_path_index_;
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {const_cast<int *>(&work_size),
&d_queued_paths,
&d_num_queued_paths,
&d_counter,
&d_queued_kernel};
queue_->enqueue(kernel, work_size, args);
}
if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
queue_->zero_to_device(integrator_shader_sort_counter_);
}
else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
}
else {
assert(0);
}
}
void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
{
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
const int work_size = max_active_path_index_;
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {
const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
queue_->zero_to_device(num_queued_paths_);
queue_->enqueue(kernel, work_size, args);
}
void PathTraceWorkGPU::compact_states(const int num_active_paths)
{
if (num_active_paths == 0) {
max_active_path_index_ = 0;
}
/* Compact fragmented path states into the start of the array, moving any paths
* with index higher than the number of active paths into the gaps. */
if (max_active_path_index_ == num_active_paths) {
return;
}
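/* Worked example (hypothetical indices): with num_active_paths = 4 and max_active_path_index_ = 8,
 * suppose the active states live at indices {0, 2, 5, 7}. The first kernel collects the
 * terminated slots below 4, i.e. {1, 3}; the second kernel collects the active states at or
 * above 4, i.e. {5, 7}; the compaction kernel then moves 5 -> 1 and 7 -> 3, after which indices
 * 0..3 are all active and max_active_path_index_ can be lowered to 4. */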
void *d_compact_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
/* Create array with terminated paths that we can write to. */
{
/* TODO: can the work size be reduced here? */
int offset = num_active_paths;
int work_size = num_active_paths;
void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
queue_->zero_to_device(num_queued_paths_);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
}
/* Create array of paths that we need to compact, where the path index is bigger
* than the number of active paths. */
{
int work_size = max_active_path_index_;
void *args[] = {
&work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
queue_->zero_to_device(num_queued_paths_);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
}
queue_->copy_from_device(num_queued_paths_);
queue_->synchronize();
int num_compact_paths = num_queued_paths_.data()[0];
/* Move paths into gaps. */
if (num_compact_paths > 0) {
int work_size = num_compact_paths;
int active_states_offset = 0;
int terminated_states_offset = num_active_paths;
void *args[] = {
&d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
}
queue_->synchronize();
/* Adjust the max active path index now that we know which part of the array is actually used. */
max_active_path_index_ = num_active_paths;
}
bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
{
/* If there are existing paths, wait for them to reach the intersect closest kernel, which will
 * align the wavefront of the existing and newly added paths. */
/* TODO: Check whether counting new intersection kernels here will have a positive effect on
 * performance. */
const DeviceKernel kernel = get_most_queued_kernel();
if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
return false;
}
int num_active_paths = get_num_active_paths();
/* Don't schedule more work if canceling. */
if (is_cancel_requested()) {
if (num_active_paths == 0) {
finished = true;
}
return false;
}
finished = false;
vector<KernelWorkTile> work_tiles;
int max_num_camera_paths = max_num_paths_;
int num_predicted_splits = 0;
if (has_shadow_catcher()) {
/* When there are shadow catchers in the scene, a bounce from them will split the state. So we
 * make sure there is enough space in the path states array to fit the split states.
 *
 * Basically, when adding N new paths we ensure that there are 2*N available path states, so
 * that all of the new paths can be split.
 *
 * Note that it is possible that some of the current states can still split, so we need to make
 * sure there is enough space for them as well. */
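/* Illustrative example (hypothetical numbers): with max_num_paths_ = 1M, num_active_paths = 200K
 * and 50K in-flight states that may still split, num_available_paths = 800K and
 * num_new_paths = 400K, so max_num_camera_paths = 200K + 400K - 50K = 550K and
 * num_predicted_splits = 50K + 400K = 450K. This keeps roughly half of the free states in
 * reserve for shadow catcher splits. */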
/* Number of currently in-flight states which can still split. */
const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
const int num_available_paths = max_num_paths_ - num_active_paths;
const int num_new_paths = num_available_paths / 2;
max_num_camera_paths = max(num_active_paths,
num_active_paths + num_new_paths - num_scheduled_possible_split);
num_predicted_splits += num_scheduled_possible_split + num_new_paths;
}
/* Schedule when we're out of paths or there are too few paths to keep the
* device occupied. */
int num_paths = num_active_paths;
if (num_paths == 0 || num_paths < min_num_active_paths_) {
/* Get work tiles until the maximum number of paths is reached. */
while (num_paths < max_num_camera_paths) {
KernelWorkTile work_tile;
if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
work_tiles.push_back(work_tile);
num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
}
else {
break;
}
}
/* If we couldn't get any more tiles, we're done. */
if (work_tiles.size() == 0 && num_paths == 0) {
finished = true;
return false;
}
}
/* Initialize paths from work tiles. */
if (work_tiles.size() == 0) {
return false;
}
/* Compact state array when number of paths becomes small relative to the
* known maximum path index, which makes computing active index arrays slow. */
compact_states(num_active_paths);
if (has_shadow_catcher()) {
integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
}
enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
work_tiles.data(),
work_tiles.size(),
num_active_paths,
num_predicted_splits);
return true;
}
void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
const KernelWorkTile work_tiles[],
const int num_work_tiles,
const int num_active_paths,
const int num_predicted_splits)
{
/* Copy work tiles to device. */
if (work_tiles_.size() < num_work_tiles) {
work_tiles_.alloc(num_work_tiles);
}
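/* Pack the new path states contiguously after the currently active ones: each tile gets a
 * path_index_offset so that tile i initializes the states in
 * [path_index_offset, path_index_offset + w * h * num_samples) without overlapping other tiles
 * or the existing paths. */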
int path_index_offset = num_active_paths;
int max_tile_work_size = 0;
for (int i = 0; i < num_work_tiles; i++) {
KernelWorkTile &work_tile = work_tiles_.data()[i];
work_tile = work_tiles[i];
const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
work_tile.path_index_offset = path_index_offset;
work_tile.work_size = tile_work_size;
path_index_offset += tile_work_size;
max_tile_work_size = max(max_tile_work_size, tile_work_size);
}
queue_->copy_to_device(work_tiles_);
void *d_work_tiles = (void *)work_tiles_.device_pointer;
void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
/* Launch kernel. */
void *args[] = {&d_work_tiles,
const_cast<int *>(&num_work_tiles),
&d_render_buffer,
const_cast<int *>(&max_tile_work_size)};
queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
max_active_path_index_ = path_index_offset + num_predicted_splits;
}
int PathTraceWorkGPU::get_num_active_paths()
{
/* TODO: this is wrong, does not account for duplicates with shadow! */
IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
int num_paths = 0;
for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
DCHECK_GE(queue_counter->num_queued[i], 0)
<< "Invalid number of queued states for kernel "
<< device_kernel_as_string(static_cast<DeviceKernel>(i));
num_paths += queue_counter->num_queued[i];
}
return num_paths;
}
bool PathTraceWorkGPU::should_use_graphics_interop()
{
/* There are a few complications with graphics interop when using multiple devices, caused by the
 * fact that the PathTraceDisplay has a single texture:
 *
 * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
 * attempting to register an OpenGL PBO which has already been mapped. This makes sense, because
 * otherwise one would run into a conflict over where the source of truth is. */
if (has_multiple_works()) {
return false;
}
if (!interop_use_checked_) {
Device *device = queue_->device;
interop_use_ = device->should_use_graphics_interop();
if (interop_use_) {
VLOG(2) << "Will be using graphics interop GPU display update.";
}
else {
VLOG(2) << "Will be using naive GPU display update.";
}
interop_use_checked_ = true;
}
return interop_use_;
}
void PathTraceWorkGPU::copy_to_display(PathTraceDisplay *display,
PassMode pass_mode,
int num_samples)
{
if (device_->have_error()) {
/* Don't attempt to update the GPU display if the device has errors: the error state would lead
 * to wrong decisions about interop, causing more chained bugs. */
return;
}
if (!buffers_->buffer.device_pointer) {
LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
return;
}
if (should_use_graphics_interop()) {
if (copy_to_display_interop(display, pass_mode, num_samples)) {
return;
}
/* If an error happens when trying to use graphics interop, fall back to the native
 * implementation and don't attempt to use interop for further updates. */
interop_use_ = false;
}
copy_to_display_naive(display, pass_mode, num_samples);
}
void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
PassMode pass_mode,
int num_samples)
{
const int full_x = effective_buffer_params_.full_x;
const int full_y = effective_buffer_params_.full_y;
const int width = effective_buffer_params_.window_width;
const int height = effective_buffer_params_.window_height;
const int final_width = buffers_->params.window_width;
const int final_height = buffers_->params.window_height;
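/* The texture offset below places this work's window inside the full-frame display texture: it
 * is the position of this work's buffer relative to the full frame plus the window offset within
 * the buffer, so each PathTraceWork writes its pixels into its own region of the shared
 * texture. */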
const int texture_x = full_x - effective_full_params_.full_x + effective_buffer_params_.window_x;
const int texture_y = full_y - effective_full_params_.full_y + effective_buffer_params_.window_y;
/* Re-allocate display memory if needed, and make sure the device pointer is allocated.
*
* NOTE: allocation happens at the final resolution so that no re-allocation happens on every
* change of the resolution divider. However, if the display becomes smaller, shrink the
* allocated memory as well. */
if (display_rgba_half_.data_width != final_width ||
display_rgba_half_.data_height != final_height) {
display_rgba_half_.alloc(final_width, final_height);
/* TODO(sergey): There should be a way to make sure device-side memory is allocated without
* transferring zeroes to the device. */
queue_->zero_to_device(display_rgba_half_);
}
PassAccessor::Destination destination(film_->get_display_pass());
destination.d_pixels_half_rgba = display_rgba_half_.device_pointer;
get_render_tile_film_pixels(destination, pass_mode, num_samples);
queue_->copy_from_device(display_rgba_half_);
queue_->synchronize();
display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
}
bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,
PassMode pass_mode,
int num_samples)
{
if (!device_graphics_interop_) {
device_graphics_interop_ = queue_->graphics_interop_create();
}
const DisplayDriver::GraphicsInterop graphics_interop_dst = display->graphics_interop_get();
device_graphics_interop_->set_display_interop(graphics_interop_dst);
const device_ptr d_rgba_half = device_graphics_interop_->map();
if (!d_rgba_half) {
return false;
}
PassAccessor::Destination destination = get_display_destination_template(display);
destination.d_pixels_half_rgba = d_rgba_half;
get_render_tile_film_pixels(destination, pass_mode, num_samples);
device_graphics_interop_->unmap();
return true;
}
void PathTraceWorkGPU::destroy_gpu_resources(PathTraceDisplay *display)
{
if (!device_graphics_interop_) {
return;
}
display->graphics_interop_activate();
device_graphics_interop_ = nullptr;
display->graphics_interop_deactivate();
}
void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
PassMode pass_mode,
int num_samples)
{
const KernelFilm &kfilm = device_scene_->data.film;
const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
}
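/* Adaptive sampling: first count how many pixels are still considered noisy, then, if any
 * remain, run the X and Y filter passes. The filter passes are expected to spread the
 * "not yet converged" state to neighboring pixels along rows and columns, so that isolated
 * converged pixels inside noisy regions keep receiving samples. */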
int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
{
const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
if (num_active_pixels) {
enqueue_adaptive_sampling_filter_x();
enqueue_adaptive_sampling_filter_y();
queue_->synchronize();
}
return num_active_pixels;
}
int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
{
device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
num_active_pixels.alloc(1);
queue_->zero_to_device(num_active_pixels);
const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
void *args[] = {&buffers_->buffer.device_pointer,
const_cast<int *>(&effective_buffer_params_.full_x),
const_cast<int *>(&effective_buffer_params_.full_y),
const_cast<int *>(&effective_buffer_params_.width),
const_cast<int *>(&effective_buffer_params_.height),
&threshold,
&reset,
&effective_buffer_params_.offset,
&effective_buffer_params_.stride,
&num_active_pixels.device_pointer};
queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
queue_->copy_from_device(num_active_pixels);
queue_->synchronize();
return num_active_pixels.data()[0];
}
void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
{
const int work_size = effective_buffer_params_.height;
void *args[] = {&buffers_->buffer.device_pointer,
&effective_buffer_params_.full_x,
&effective_buffer_params_.full_y,
&effective_buffer_params_.width,
&effective_buffer_params_.height,
&effective_buffer_params_.offset,
&effective_buffer_params_.stride};
queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
}
void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
{
const int work_size = effective_buffer_params_.width;
void *args[] = {&buffers_->buffer.device_pointer,
&effective_buffer_params_.full_x,
&effective_buffer_params_.full_y,
&effective_buffer_params_.width,
&effective_buffer_params_.height,
&effective_buffer_params_.offset,
&effective_buffer_params_.stride};
queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
}
void PathTraceWorkGPU::cryptomatte_postproces()
{
const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
void *args[] = {&buffers_->buffer.device_pointer,
const_cast<int *>(&work_size),
&effective_buffer_params_.offset,
&effective_buffer_params_.stride};
queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
}
bool PathTraceWorkGPU::copy_render_buffers_from_device()
{
queue_->copy_from_device(buffers_->buffer);
/* Synchronize so that the CPU-side buffer is available at the exit of this function. */
return queue_->synchronize();
}
bool PathTraceWorkGPU::copy_render_buffers_to_device()
{
queue_->copy_to_device(buffers_->buffer);
/* NOTE: The direct device access to the buffers only happens within this path trace work. The
* rest of communication happens via API calls which involves `copy_render_buffers_from_device()`
* which will perform synchronization as needed. */
return true;
}
bool PathTraceWorkGPU::zero_render_buffers()
{
queue_->zero_to_device(buffers_->buffer);
return true;
}
bool PathTraceWorkGPU::has_shadow_catcher() const
{
return device_scene_->data.integrator.has_shadow_catcher;
}
int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
{
if (max_active_path_index_ == 0) {
return 0;
}
if (!has_shadow_catcher()) {
return 0;
}
queue_->zero_to_device(num_queued_paths_);
const int work_size = max_active_path_index_;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
queue_->copy_from_device(num_queued_paths_);
queue_->synchronize();
return num_queued_paths_.data()[0];
}
CCL_NAMESPACE_END