Fix T82351: Cycles: Tile stealing glitches with adaptive sampling

In my testing this works, but it requires removing the min(start_sample...) part in the
adaptive sampling kernel, and I assume there was a reason why it was there?

Reviewed By: brecht

Maniphest Tasks: T82351

Differential Revision: https://developer.blender.org/D9445
Lukas Stockner authored 2021-01-11 20:42:47 +01:00 (committed by Lukas Stockner)
parent b70f4a265a
commit 688e5c6d38
8 changed files with 35 additions and 55 deletions

@@ -1929,18 +1929,19 @@ void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkT
   }
 
   uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-  if (task.adaptive_sampling.use) {
-    step_samples = task.adaptive_sampling.align_static_samples(step_samples);
-  }
 
   /* Render all samples. */
   int start_sample = rtile.start_sample;
   int end_sample = rtile.start_sample + rtile.num_samples;
 
-  for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+  for (int sample = start_sample; sample < end_sample;) {
     /* Setup and copy work tile to device. */
     wtile->start_sample = sample;
-    wtile->num_samples = min(step_samples, end_sample - sample);
+    wtile->num_samples = step_samples;
+    if (task.adaptive_sampling.use) {
+      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+    }
+    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
 
     work_tiles.copy_to_device();
     CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
@@ -1962,7 +1963,8 @@ void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkT
     cuda_assert(cuCtxSynchronize());
 
     /* Update progress. */
-    rtile.sample = sample + wtile->num_samples;
+    sample += wtile->num_samples;
+    rtile.sample = sample;
     task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
 
     if (task.get_cancel()) {
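
The effect of the loop change is easier to see in isolation. The following is a
minimal host-side sketch, not the Cycles code: adaptive_step = 4, the tile bounds,
and step_samples are invented for illustration, and align_samples mirrors the
function this commit introduces.

#include <algorithm>
#include <cstdio>

static const int adaptive_step = 4; /* assumed power of two, value invented */

/* Mirrors AdaptiveSampling::align_samples() as introduced by this commit. */
static int align_samples(int sample, int num_samples)
{
  int end_sample = sample + num_samples;
  /* Round down end sample to the nearest sample that needs filtering. */
  end_sample &= ~(adaptive_step - 1);
  if (end_sample <= sample) {
    return num_samples; /* No boundary reachable this step, skip filtering. */
  }
  return end_sample - sample;
}

int main()
{
  /* A stolen tile can resume at an unaligned sample, e.g. 6 of 16. */
  const int start_sample = 6, end_sample = 16, step_samples = 4;
  for (int sample = start_sample; sample < end_sample;) {
    int num_samples = std::min(align_samples(sample, step_samples),
                               end_sample - sample);
    printf("render samples [%d, %d)\n", sample, sample + num_samples);
    sample += num_samples; /* Advance by what was actually rendered. */
  }
  return 0;
}

This prints [6, 8), [8, 12), [12, 16): the first, shortened step re-aligns the
tile, and every later step ends on a multiple of adaptive_step, so adaptive
filtering runs at exactly the samples it expects.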

@@ -920,8 +920,7 @@ class CPUDevice : public Device {
       ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
       if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
         buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
-        float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
-                                                    buffer[kernel_data.film.pass_sample_count]);
+        float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
         if (sample_multiplier != 1.0f) {
           kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
         }
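
To see why the clamp misbehaved with tile stealing, here is a worked example
with invented numbers: a pixel that converged after 20 samples, in a tile being
finished at sample 64 by a device whose tile starts at sample 32. This is an
illustration, not code from the tree.

#include <algorithm>
#include <cstdio>

int main()
{
  const float count = 20.0f;        /* samples actually accumulated in the pixel */
  const float tile_sample = 64.0f;  /* sample the tile is finished at */
  const float start_sample = 32.0f; /* start sample of the finishing device's tile */

  /* Before this commit: divisor clamped to at least start_sample + 1. */
  float old_multiplier = tile_sample / std::max(start_sample + 1.0f, count);
  /* After this commit: divide by the per-pixel sample count alone. */
  float new_multiplier = tile_sample / count;

  printf("old: %.2f, new: %.2f\n", old_multiplier, new_multiplier); /* 1.94 vs 3.20 */
  return 0;
}

The pixel holds 20 samples' worth of light, so scaling by 64 / 20 is what makes
it consistent with its unconverged neighbours; the clamped 64 / 33 would leave it
with the wrong brightness, plausibly the tile-shaped glitches reported in T82351.
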
@@ -997,7 +996,7 @@ class CPUDevice : public Device {
       coverage.finalize();
     }
 
-    if (task.adaptive_sampling.use) {
+    if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
       adaptive_sampling_post(tile, kg);
     }
   }

@ -760,9 +760,6 @@ class OptiXDevice : public CUDADevice {
const int end_sample = rtile.start_sample + rtile.num_samples;
// Keep this number reasonable to avoid running into TDRs
int step_samples = (info.display_device ? 8 : 32);
if (task.adaptive_sampling.use) {
step_samples = task.adaptive_sampling.align_static_samples(step_samples);
}
// Offset into launch params buffer so that streams use separate data
device_ptr launch_params_ptr = launch_params.device_pointer +
@@ -770,10 +767,14 @@ class OptiXDevice : public CUDADevice {
     const CUDAContextScope scope(cuContext);
 
-    for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
+    for (int sample = rtile.start_sample; sample < end_sample;) {
       // Copy work tile information to device
-      wtile.num_samples = min(step_samples, end_sample - sample);
       wtile.start_sample = sample;
+      wtile.num_samples = step_samples;
+      if (task.adaptive_sampling.use) {
+        wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+      }
+      wtile.num_samples = min(wtile.num_samples, end_sample - sample);
 
       device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
       check_result_cuda(
           cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
@@ -815,7 +816,8 @@ class OptiXDevice : public CUDADevice {
       check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
 
       // Update current sample, so it is displayed correctly
-      rtile.sample = wtile.start_sample + wtile.num_samples;
+      sample += wtile.num_samples;
+      rtile.sample = sample;
       // Update task progress after the kernel completed rendering
       task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
 

@@ -223,8 +223,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask &task,
       subtile.num_samples = samples_per_second;
 
       if (task.adaptive_sampling.use) {
-        subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample,
-                                                                           subtile.num_samples);
+        subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
+                                                                   subtile.num_samples);
       }
 
       /* Don't go beyond requested number of samples. */

@@ -144,41 +144,20 @@ AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(
 }
 
 /* Render samples in steps that align with the adaptive filtering. */
-int AdaptiveSampling::align_static_samples(int samples) const
+int AdaptiveSampling::align_samples(int sample, int num_samples) const
 {
-  if (samples > adaptive_step) {
-    /* Make multiple of adaptive_step. */
-    while (samples % adaptive_step != 0) {
-      samples--;
-    }
-  }
-  else if (samples < adaptive_step) {
-    /* Make divisor of adaptive_step. */
-    while (adaptive_step % samples != 0) {
-      samples--;
-    }
-  }
-  return max(samples, 1);
-}
-
-/* Render samples in steps that align with the adaptive filtering, with the
- * suggested number of samples dynamically changing. */
-int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const
-{
-  /* Round so that we end up on multiples of adaptive_samples. */
-  samples += offset;
-  if (samples > adaptive_step) {
-    /* Make multiple of adaptive_step. */
-    while (samples % adaptive_step != 0) {
-      samples--;
-    }
-  }
-  samples -= offset;
-  return max(samples, 1);
+  int end_sample = sample + num_samples;
+  /* Round down end sample to the nearest sample that needs filtering. */
+  end_sample &= ~(adaptive_step - 1);
+
+  if (end_sample <= sample) {
+    /* In order to reach the next sample that needs filtering, we'd need
+     * to increase num_samples. We don't do that in this function, so
+     * just keep it as is and don't filter this time around. */
+    return num_samples;
+  }
+
+  return end_sample - sample;
 }
 
 bool AdaptiveSampling::need_filter(int sample) const
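
Two details of the new function are worth spelling out: the bitwise round-down
only works when adaptive_step is a power of two, and the early return postpones
filtering rather than growing the step. A short standalone check, with
adaptive_step = 4 as an assumed illustrative value:

#include <cassert>

int main()
{
  const int adaptive_step = 4; /* the mask below requires a power of two */

  /* ~(adaptive_step - 1) clears the low bits, rounding down to a multiple
   * of adaptive_step: with step 4 the mask is ...11111100 in binary. */
  assert((7 & ~(adaptive_step - 1)) == 4);
  assert((8 & ~(adaptive_step - 1)) == 8);

  /* align_samples(5, 2): end_sample = 7 rounds down to 4 <= 5, so the
   * requested 2 samples are returned unchanged and no filter boundary is
   * hit this round. */
  /* align_samples(5, 4): end_sample = 9 rounds down to 8 > 5, so only 3
   * samples are rendered and the loop lands exactly on a boundary. */
  return 0;
}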

@@ -117,8 +117,7 @@ class AdaptiveSampling {
  public:
   AdaptiveSampling();
 
-  int align_static_samples(int samples) const;
-  int align_dynamic_samples(int offset, int samples) const;
+  int align_samples(int sample, int num_samples) const;
   bool need_filter(int sample) const;
 
   bool use;

@@ -139,7 +139,7 @@ kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample,
     ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
     if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
       buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
-      float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]);
+      float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
      if(sample_multiplier != 1.0f) {
        kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
      }

@@ -29,8 +29,7 @@ ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
     int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
     if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
       buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
-      float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
-                                             buffer[kernel_data.film.pass_sample_count]);
+      float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
       if (sample_multiplier != 1.0f) {
         kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
       }