forked from bartvdbraak/blender
Fix T82351: Cycles: Tile stealing glitches with adaptive sampling
In my testing this works, but it requires me to remove the min(start_sample...) part in the adaptive sampling kernel, and I assume there's a reason why it was there?

Reviewed By: brecht

Maniphest Tasks: T82351

Differential Revision: https://developer.blender.org/D9445
This commit is contained in:
parent
b70f4a265a
commit
688e5c6d38
@ -1929,18 +1929,19 @@ void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkT
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
|
uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
|
||||||
if (task.adaptive_sampling.use) {
|
|
||||||
step_samples = task.adaptive_sampling.align_static_samples(step_samples);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Render all samples. */
|
/* Render all samples. */
|
||||||
int start_sample = rtile.start_sample;
|
int start_sample = rtile.start_sample;
|
||||||
int end_sample = rtile.start_sample + rtile.num_samples;
|
int end_sample = rtile.start_sample + rtile.num_samples;
|
||||||
|
|
||||||
for (int sample = start_sample; sample < end_sample; sample += step_samples) {
|
for (int sample = start_sample; sample < end_sample;) {
|
||||||
/* Setup and copy work tile to device. */
|
/* Setup and copy work tile to device. */
|
||||||
wtile->start_sample = sample;
|
wtile->start_sample = sample;
|
||||||
wtile->num_samples = min(step_samples, end_sample - sample);
|
wtile->num_samples = step_samples;
|
||||||
|
if (task.adaptive_sampling.use) {
|
||||||
|
wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
|
||||||
|
}
|
||||||
|
wtile->num_samples = min(wtile->num_samples, end_sample - sample);
|
||||||
work_tiles.copy_to_device();
|
work_tiles.copy_to_device();
|
||||||
|
|
||||||
CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
|
CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
|
||||||
@ -1962,7 +1963,8 @@ void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkT
|
|||||||
cuda_assert(cuCtxSynchronize());
|
cuda_assert(cuCtxSynchronize());
|
||||||
|
|
||||||
/* Update progress. */
|
/* Update progress. */
|
||||||
rtile.sample = sample + wtile->num_samples;
|
sample += wtile->num_samples;
|
||||||
|
rtile.sample = sample;
|
||||||
task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
|
task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
|
||||||
|
|
||||||
if (task.get_cancel()) {
|
if (task.get_cancel()) {
|
||||||
|
@ -920,8 +920,7 @@ class CPUDevice : public Device {
|
|||||||
ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
|
ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
|
||||||
if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
|
if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
|
||||||
buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
|
buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
|
||||||
float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
|
float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
|
||||||
buffer[kernel_data.film.pass_sample_count]);
|
|
||||||
if (sample_multiplier != 1.0f) {
|
if (sample_multiplier != 1.0f) {
|
||||||
kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
|
kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
|
||||||
}
|
}
|
||||||
@ -997,7 +996,7 @@ class CPUDevice : public Device {
|
|||||||
coverage.finalize();
|
coverage.finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (task.adaptive_sampling.use) {
|
if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
|
||||||
adaptive_sampling_post(tile, kg);
|
adaptive_sampling_post(tile, kg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -760,9 +760,6 @@ class OptiXDevice : public CUDADevice {
|
|||||||
const int end_sample = rtile.start_sample + rtile.num_samples;
|
const int end_sample = rtile.start_sample + rtile.num_samples;
|
||||||
// Keep this number reasonable to avoid running into TDRs
|
// Keep this number reasonable to avoid running into TDRs
|
||||||
int step_samples = (info.display_device ? 8 : 32);
|
int step_samples = (info.display_device ? 8 : 32);
|
||||||
if (task.adaptive_sampling.use) {
|
|
||||||
step_samples = task.adaptive_sampling.align_static_samples(step_samples);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Offset into launch params buffer so that streams use separate data
|
// Offset into launch params buffer so that streams use separate data
|
||||||
device_ptr launch_params_ptr = launch_params.device_pointer +
|
device_ptr launch_params_ptr = launch_params.device_pointer +
|
||||||
@ -770,10 +767,14 @@ class OptiXDevice : public CUDADevice {
|
|||||||
|
|
||||||
const CUDAContextScope scope(cuContext);
|
const CUDAContextScope scope(cuContext);
|
||||||
|
|
||||||
for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
|
for (int sample = rtile.start_sample; sample < end_sample;) {
|
||||||
// Copy work tile information to device
|
// Copy work tile information to device
|
||||||
wtile.num_samples = min(step_samples, end_sample - sample);
|
|
||||||
wtile.start_sample = sample;
|
wtile.start_sample = sample;
|
||||||
|
wtile.num_samples = step_samples;
|
||||||
|
if (task.adaptive_sampling.use) {
|
||||||
|
wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
|
||||||
|
}
|
||||||
|
wtile.num_samples = min(wtile.num_samples, end_sample - sample);
|
||||||
device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
|
device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
|
||||||
check_result_cuda(
|
check_result_cuda(
|
||||||
cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
|
cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
|
||||||
@ -815,7 +816,8 @@ class OptiXDevice : public CUDADevice {
|
|||||||
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
|
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
|
||||||
|
|
||||||
// Update current sample, so it is displayed correctly
|
// Update current sample, so it is displayed correctly
|
||||||
rtile.sample = wtile.start_sample + wtile.num_samples;
|
sample += wtile.num_samples;
|
||||||
|
rtile.sample = sample;
|
||||||
// Update task progress after the kernel completed rendering
|
// Update task progress after the kernel completed rendering
|
||||||
task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
|
task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
|
||||||
|
|
||||||
|
@ -223,8 +223,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask &task,
|
|||||||
subtile.num_samples = samples_per_second;
|
subtile.num_samples = samples_per_second;
|
||||||
|
|
||||||
if (task.adaptive_sampling.use) {
|
if (task.adaptive_sampling.use) {
|
||||||
subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample,
|
subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
|
||||||
subtile.num_samples);
|
subtile.num_samples);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Don't go beyond requested number of samples. */
|
/* Don't go beyond requested number of samples. */
|
||||||
|
@ -144,41 +144,20 @@ AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Render samples in steps that align with the adaptive filtering. */
|
/* Render samples in steps that align with the adaptive filtering. */
|
||||||
int AdaptiveSampling::align_static_samples(int samples) const
|
int AdaptiveSampling::align_samples(int sample, int num_samples) const
|
||||||
{
|
{
|
||||||
if (samples > adaptive_step) {
|
int end_sample = sample + num_samples;
|
||||||
/* Make multiple of adaptive_step. */
|
|
||||||
while (samples % adaptive_step != 0) {
|
/* Round down end sample to the nearest sample that needs filtering. */
|
||||||
samples--;
|
end_sample &= ~(adaptive_step - 1);
|
||||||
}
|
|
||||||
|
if (end_sample <= sample) {
|
||||||
|
/* In order to reach the next sample that needs filtering, we'd need
|
||||||
|
* to increase num_samples. We don't do that in this function, so
|
||||||
|
* just keep it as is and don't filter this time around. */
|
||||||
|
return num_samples;
|
||||||
}
|
}
|
||||||
else if (samples < adaptive_step) {
|
return end_sample - sample;
|
||||||
/* Make divisor of adaptive_step. */
|
|
||||||
while (adaptive_step % samples != 0) {
|
|
||||||
samples--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return max(samples, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Render samples in steps that align with the adaptive filtering, with the
|
|
||||||
* suggested number of samples dynamically changing. */
|
|
||||||
int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const
|
|
||||||
{
|
|
||||||
/* Round so that we end up on multiples of adaptive_samples. */
|
|
||||||
samples += offset;
|
|
||||||
|
|
||||||
if (samples > adaptive_step) {
|
|
||||||
/* Make multiple of adaptive_step. */
|
|
||||||
while (samples % adaptive_step != 0) {
|
|
||||||
samples--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
samples -= offset;
|
|
||||||
|
|
||||||
return max(samples, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AdaptiveSampling::need_filter(int sample) const
|
bool AdaptiveSampling::need_filter(int sample) const
|
||||||
|
@ -117,8 +117,7 @@ class AdaptiveSampling {
|
|||||||
public:
|
public:
|
||||||
AdaptiveSampling();
|
AdaptiveSampling();
|
||||||
|
|
||||||
int align_static_samples(int samples) const;
|
int align_samples(int sample, int num_samples) const;
|
||||||
int align_dynamic_samples(int offset, int samples) const;
|
|
||||||
bool need_filter(int sample) const;
|
bool need_filter(int sample) const;
|
||||||
|
|
||||||
bool use;
|
bool use;
|
||||||
|
@ -139,7 +139,7 @@ kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample,
|
|||||||
ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
|
ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
|
||||||
if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
|
if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
|
||||||
buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
|
buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
|
||||||
float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]);
|
float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
|
||||||
if(sample_multiplier != 1.0f) {
|
if(sample_multiplier != 1.0f) {
|
||||||
kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
|
kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
|
||||||
}
|
}
|
||||||
|
@ -29,8 +29,7 @@ ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
|
|||||||
int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
|
int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
|
||||||
if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
|
if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
|
||||||
buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
|
buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
|
||||||
float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
|
float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
|
||||||
buffer[kernel_data.film.pass_sample_count]);
|
|
||||||
if (sample_multiplier != 1.0f) {
|
if (sample_multiplier != 1.0f) {
|
||||||
kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
|
kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user