Cycles Denoising: Add more robust outlier heuristic to avoid artifacts

Extremely bright pixels in the rendered image cause the denoising algorithm
to produce extremely noticable artifacts. Therefore, a heuristic is needed
to exclude these pixels from the filtering process.

The new approach calculates the 75% percentile of the 5x5 neighborhood of
each pixel and flags the pixel if it is more than twice as bright.

During the reconstruction process, flagged pixels are skipped. Therefore,
they don't cause any problems for neighboring pixels, and the outlier pixels
themselves are replaced by a prediction of their actual value based on their
feature pass values and the neighboring pixels.

Therefore, the denoiser now also works as a smarter despeckling filter that
uses a more accurate prediction of the pixel instead of a simple average.
This can be used even if denoising isn't wanted by setting the denoising
radius to 1.
This commit is contained in:
Lukas Stockner 2017-05-18 03:03:18 +02:00
parent b3a3459e1a
commit 740cd28748
14 changed files with 224 additions and 8 deletions

@ -176,6 +176,7 @@ public:
KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
@ -210,6 +211,7 @@ public:
REGISTER_KERNEL(shader),
REGISTER_KERNEL(filter_divide_shadow),
REGISTER_KERNEL(filter_get_feature),
REGISTER_KERNEL(filter_detect_outliers),
REGISTER_KERNEL(filter_combine_halves),
REGISTER_KERNEL(filter_nlm_calc_difference),
REGISTER_KERNEL(filter_nlm_blur),
@ -594,6 +596,26 @@ public:
return true;
}
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
for(int y = task->rect.y; y < task->rect.w; y++) {
for(int x = task->rect.x; x < task->rect.z; x++) {
filter_detect_outliers_kernel()(x, y,
(float*) image_ptr,
(float*) variance_ptr,
(float*) depth_ptr,
(float*) output_ptr,
&task->rect.x,
task->buffer.pass_stride);
}
}
return true;
}
void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
{
float *render_buffer = (float*)tile.buffer;
@ -632,6 +654,7 @@ public:
denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);

@ -1248,6 +1248,38 @@ public:
return !have_error();
}
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterDetectOutliers;
cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
void *args[] = {&image_ptr,
&variance_ptr,
&depth_ptr,
&output_ptr,
&task->rect,
&task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
void denoise(RenderTile &rtile, const DeviceTask &task)
{
DenoisingTask denoising(this);
@ -1258,6 +1290,7 @@ public:
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);

@ -159,11 +159,25 @@ bool DenoisingTask::run_denoising()
int mean_to[] = { 8, 9, 10};
int variance_to[] = {11, 12, 13};
int num_color_passes = 3;
device_only_memory<float> temp_color;
temp_color.resize(3*buffer.pass_stride);
device->mem_alloc("Denoising temporary color", temp_color, MEM_READ_WRITE);
for(int pass = 0; pass < num_color_passes; pass++) {
device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr color_pass(device, temp_color, pass*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
}
{
device_sub_ptr depth_pass (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr color_var_pass(device, buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr output_pass (device, buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
functions.detect_outliers(temp_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
}
device->mem_free(temp_color);
}
storage.w = filter_area.z;

@ -82,6 +82,11 @@ public:
device_ptr mean_ptr,
device_ptr variance_ptr
)> get_feature;
function<bool(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr
)> detect_outliers;
function<bool(device_ptr*)> set_tiles;
} functions;

@ -411,6 +411,11 @@ protected:
device_ptr mean_ptr,
device_ptr variance_ptr,
DenoisingTask *task);
bool denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr,
DenoisingTask *task);
bool denoising_set_tiles(device_ptr *buffers,
DenoisingTask *task);

@ -216,6 +216,7 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
denoising_program.add_kernel(ustring("filter_divide_shadow"));
denoising_program.add_kernel(ustring("filter_get_feature"));
denoising_program.add_kernel(ustring("filter_detect_outliers"));
denoising_program.add_kernel(ustring("filter_combine_halves"));
denoising_program.add_kernel(ustring("filter_construct_transform"));
denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
@ -910,6 +911,33 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
return true;
}
bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
device_ptr variance_ptr,
device_ptr depth_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
cl_mem image_mem = CL_MEM_PTR(image_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
cl_mem output_mem = CL_MEM_PTR(output_ptr);
cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
kernel_set_args(ckFilterDetectOutliers, 0,
image_mem,
variance_mem,
depth_mem,
output_mem,
task->rect,
task->buffer.pass_stride);
enqueue_kernel(ckFilterDetectOutliers,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
return true;
}
bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
DenoisingTask *task)
{
@ -942,6 +970,7 @@ void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
denoising.render_buffer.samples = rtile.sample;

@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
* pixel_buffer always points to the current pixel in the first pass. */
@ -32,7 +32,7 @@ ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_rest
{
features[0] = pixel.x;
features[1] = pixel.y;
features[2] = ccl_get_feature(buffer, 0);
features[2] = fabsf(ccl_get_feature(buffer, 0));
features[3] = ccl_get_feature(buffer, 1);
features[4] = ccl_get_feature(buffer, 2);
features[5] = ccl_get_feature(buffer, 3);
@ -50,7 +50,7 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float cc
{
scales[0] = fabsf(pixel.x - mean[0]);
scales[1] = fabsf(pixel.y - mean[1]);
scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
ccl_get_feature(buffer, 2) - mean[4],
ccl_get_feature(buffer, 3) - mean[5]));
@ -107,7 +107,7 @@ ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
math_vector_zero(design_row+1, rank);
design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));

@ -37,7 +37,7 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active
{
features[0] = x;
features[1] = y;
features[2] = ccl_get_feature_sse(0);
features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
features[3] = ccl_get_feature_sse(1);
features[4] = ccl_get_feature_sse(2);
features[5] = ccl_get_feature_sse(3);
@ -58,7 +58,7 @@ ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128
scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
__m128 diff, scale;
diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);

@ -104,6 +104,57 @@ ccl_device void kernel_filter_get_feature(int sample,
}
}
ccl_device void kernel_filter_detect_outliers(int x, int y,
ccl_global float *image,
ccl_global float *variance,
ccl_global float *depth,
ccl_global float *out,
int4 rect,
int pass_stride)
{
int buffer_w = align_up(rect.z - rect.x, 4);
int n = 0;
float values[25];
for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
/* Find the position of L. */
int i;
for(i = 0; i < n; i++) {
if(values[i] > L) break;
}
/* Make space for L by shifting all following values to the right. */
for(int j = n; j > i; j--) {
values[j] = values[j-1];
}
/* Insert L. */
values[i] = L;
n++;
}
}
int idx = (y-rect.y)*buffer_w + (x-rect.x);
float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
float ref = 2.0f*values[(int)(n*0.75f)];
float fac = 1.0f;
if(L > ref) {
/* If the pixel is an outlier, negate the depth value to mark it as one.
* Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
depth[idx] = -depth[idx];
fac = ref/L;
variance[idx ] *= fac*fac;
variance[idx + pass_stride] *= fac*fac;
variance[idx+2*pass_stride] *= fac*fac;
}
out[idx ] = fac*image[idx];
out[idx + pass_stride] = fac*image[idx + pass_stride];
out[idx+2*pass_stride] = fac*image[idx+2*pass_stride];
}
/* Combine A/B buffers.
* Calculates the combined mean and the buffer variance. */
ccl_device void kernel_filter_combine_halves(int x, int y,

@ -54,7 +54,10 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
/* If the pixel was flagged as an outlier during prefiltering, skip it.
* Otherwise, perform the regular confidence interval test. */
if(ccl_get_feature(buffer + q_offset, 0) < 0.0f ||
average(fabs(p_color - q_color)) > 2.0f*(p_std_dev + q_std_dev + 1e-3f)) {
return;
}

@ -43,6 +43,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
int buffer_denoising_offset,
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image,
ccl_global float *variance,
ccl_global float *depth,
ccl_global float *output,
int *rect,
int pass_stride);
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,

@ -91,6 +91,21 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image,
ccl_global float *variance,
ccl_global float *depth,
ccl_global float *output,
int *rect,
int pass_stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
#else
kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,

@ -86,6 +86,22 @@ kernel_cuda_filter_get_feature(int sample,
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_detect_outliers(float *image,
float *variance,
float *depth,
float *output,
int4 prefilter_rect,
int pass_stride)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)

@ -78,6 +78,20 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
}
}
__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
ccl_global float *variance,
ccl_global float *depth,
ccl_global float *output,
int4 prefilter_rect,
int pass_stride)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
}
}
__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,