From 12f453820514e9478afdda0acf4c4fb1eac11e1c Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 27 Sep 2017 01:03:50 +0200 Subject: [PATCH] Code refactor: use split variance calculation for mega kernels too. There is no significant difference in denoised benchmark scenes and denoising ctests, so might as well make it all consistent. --- intern/cycles/device/device_cpu.cpp | 14 ++++---- intern/cycles/device/device_cuda.cpp | 8 ++--- intern/cycles/device/opencl/opencl_base.cpp | 8 ++--- .../cycles/kernel/filter/filter_prefilter.h | 25 ++++++-------- intern/cycles/kernel/kernel_passes.h | 34 ------------------- intern/cycles/kernel/kernels/cpu/filter_cpu.h | 6 ++-- .../kernel/kernels/cpu/filter_cpu_impl.h | 12 +++---- intern/cycles/kernel/kernels/cuda/filter.cu | 12 +++---- intern/cycles/kernel/kernels/opencl/filter.cl | 12 +++---- 9 files changed, 35 insertions(+), 96 deletions(-) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 72330b02a28..ff34f4f9ce4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -176,10 +176,10 @@ public: KernelFunctions convert_to_byte_kernel; KernelFunctions shader_kernel; - KernelFunctions filter_divide_shadow_kernel; - KernelFunctions filter_get_feature_kernel; - KernelFunctions filter_detect_outliers_kernel; - KernelFunctions filter_combine_halves_kernel; + KernelFunctions filter_divide_shadow_kernel; + KernelFunctions filter_get_feature_kernel; + KernelFunctions filter_detect_outliers_kernel; + KernelFunctions filter_combine_halves_kernel; KernelFunctions filter_nlm_calc_difference_kernel; KernelFunctions filter_nlm_blur_kernel; @@ -563,8 +563,7 @@ public: (float*) buffer_variance_ptr, &task->rect.x, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset, - use_split_kernel); + task->render_buffer.denoising_data_offset); } } return true; @@ -587,8 +586,7 @@ public: (float*) variance_ptr, &task->rect.x, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset, - use_split_kernel); + task->render_buffer.denoising_data_offset); } } return true; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index e5464dcf34e..54e012191ae 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1173,7 +1173,6 @@ public: task->rect.z-task->rect.x, task->rect.w-task->rect.y); - bool use_split_variance = use_split_kernel(); void *args[] = {&task->render_buffer.samples, &task->tiles_mem.device_pointer, &a_ptr, @@ -1183,8 +1182,7 @@ public: &buffer_variance_ptr, &task->rect, &task->render_buffer.pass_stride, - &task->render_buffer.denoising_data_offset, - &use_split_variance}; + &task->render_buffer.denoising_data_offset}; CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); cuda_assert(cuCtxSynchronize()); @@ -1209,7 +1207,6 @@ public: task->rect.z-task->rect.x, task->rect.w-task->rect.y); - bool use_split_variance = use_split_kernel(); void *args[] = {&task->render_buffer.samples, &task->tiles_mem.device_pointer, &mean_offset, @@ -1218,8 +1215,7 @@ public: &variance_ptr, &task->rect, &task->render_buffer.pass_stride, - &task->render_buffer.denoising_data_offset, - &use_split_variance}; + &task->render_buffer.denoising_data_offset}; CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); cuda_assert(cuCtxSynchronize()); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 7bdf81462b8..8095611f099 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -982,7 +982,6 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - char split_kernel = is_split_kernel()? 1 : 0; kernel_set_args(ckFilterDivideShadow, 0, task->render_buffer.samples, tiles_mem, @@ -993,8 +992,7 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, buffer_variance_mem, task->rect, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset, - split_kernel); + task->render_buffer.denoising_data_offset); enqueue_kernel(ckFilterDivideShadow, task->rect.z-task->rect.x, task->rect.w-task->rect.y); @@ -1015,7 +1013,6 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - char split_kernel = is_split_kernel()? 1 : 0; kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tiles_mem, @@ -1025,8 +1022,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, variance_mem, task->rect, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset, - split_kernel); + task->render_buffer.denoising_data_offset); enqueue_kernel(ckFilterGetFeature, task->rect.z-task->rect.x, task->rect.w-task->rect.y); diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 2aeb54a62be..eefcbfea230 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -35,8 +35,7 @@ ccl_device void kernel_filter_divide_shadow(int sample, ccl_global float *bufferVariance, int4 rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance) + int buffer_denoising_offset) { int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); @@ -57,10 +56,12 @@ ccl_device void kernel_filter_divide_shadow(int sample, float varB = center_buffer[5]; int odd_sample = (sample+1)/2; int even_sample = sample/2; - if(use_split_variance) { - varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); - varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); - } + + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); + varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + varA /= max(odd_sample - 1, 1); varB /= max(even_sample - 1, 1); @@ -84,8 +85,7 @@ ccl_device void kernel_filter_get_feature(int sample, ccl_global float *mean, ccl_global float *variance, int4 rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance) + int buffer_denoising_offset) { int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); @@ -97,12 +97,9 @@ ccl_device void kernel_filter_get_feature(int sample, mean[idx] = center_buffer[m_offset] / sample; if(sample > 1) { - if(use_split_variance) { - variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); - } - else { - variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); - } + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); } else { /* Can't compute variance with single sample, just set it very high. */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index fff7f4cfdb7..bd756185e78 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -67,18 +67,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer /* The online one-pass variance update that's used for the megakernel can't easily be implemented * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */ -# ifdef __SPLIT_KERNEL__ kernel_write_pass_float(buffer+1, sample, value*value); -# else - if(sample == 0) { - kernel_write_pass_float(buffer+1, sample, 0.0f); - } - else { - float new_mean = buffer[0] * (1.0f / (sample + 1)); - float old_mean = (buffer[0] - value) * (1.0f / sample); - kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean)); - } -# endif } # if defined(__SPLIT_KERNEL__) @@ -95,19 +84,7 @@ ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buff ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value) { kernel_write_pass_float3_unaligned(buffer, sample, value); -# ifdef __SPLIT_KERNEL__ kernel_write_pass_float3_unaligned(buffer+3, sample, value*value); -# else - if(sample == 0) { - kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f)); - } - else { - float3 sum = make_float3(buffer[0], buffer[1], buffer[2]); - float3 new_mean = sum * (1.0f / (sample + 1)); - float3 old_mean = (sum - value) * (1.0f / sample); - kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean)); - } -# endif } ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, @@ -125,18 +102,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob kernel_write_pass_float(buffer+1, sample/2, path_total_shaded); float value = path_total_shaded / max(path_total, 1e-7f); -# ifdef __SPLIT_KERNEL__ kernel_write_pass_float(buffer+2, sample/2, value*value); -# else - if(sample < 2) { - kernel_write_pass_float(buffer+2, sample/2, 0.0f); - } - else { - float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f); - float new_value = buffer[1] / max(buffer[0], 1e-7f); - kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value)); - } -# endif } #endif /* __DENOISING_FEATURES__ */ diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index 2ed713299fd..bf13ba62806 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -27,8 +27,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, float *bufferV, int* prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance); + int buffer_denoising_offset); void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, TilesInfo *tiles, @@ -40,8 +39,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, float *variance, int* prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance); + int buffer_denoising_offset); void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, ccl_global float *image, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 8dc1a8d583c..2fbb0ea2bdb 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -45,8 +45,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, float *bufferVariance, int* prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance) + int buffer_denoising_offset) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); @@ -60,8 +59,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, bufferVariance, load_int4(prefilter_rect), buffer_pass_stride, - buffer_denoising_offset, - use_split_variance); + buffer_denoising_offset); #endif } @@ -74,8 +72,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, float *mean, float *variance, int* prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance) + int buffer_denoising_offset) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_get_feature); @@ -86,8 +83,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, mean, variance, load_int4(prefilter_rect), buffer_pass_stride, - buffer_denoising_offset, - use_split_variance); + buffer_denoising_offset); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 009c3fde9d5..c8172355a7f 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -37,8 +37,7 @@ kernel_cuda_filter_divide_shadow(int sample, float *bufferVariance, int4 prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance) + int buffer_denoising_offset) { int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; @@ -53,8 +52,7 @@ kernel_cuda_filter_divide_shadow(int sample, bufferVariance, prefilter_rect, buffer_pass_stride, - buffer_denoising_offset, - use_split_variance); + buffer_denoising_offset); } } @@ -68,8 +66,7 @@ kernel_cuda_filter_get_feature(int sample, float *variance, int4 prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - bool use_split_variance) + int buffer_denoising_offset) { int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; @@ -81,8 +78,7 @@ kernel_cuda_filter_get_feature(int sample, mean, variance, prefilter_rect, buffer_pass_stride, - buffer_denoising_offset, - use_split_variance); + buffer_denoising_offset); } } diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index f015ac47d8a..7a7b596a350 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -31,8 +31,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample, ccl_global float *bufferVariance, int4 prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - char use_split_variance) + int buffer_denoising_offset) { int x = prefilter_rect.x + get_global_id(0); int y = prefilter_rect.y + get_global_id(1); @@ -47,8 +46,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample, bufferVariance, prefilter_rect, buffer_pass_stride, - buffer_denoising_offset, - use_split_variance); + buffer_denoising_offset); } } @@ -60,8 +58,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample, ccl_global float *variance, int4 prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, - char use_split_variance) + int buffer_denoising_offset) { int x = prefilter_rect.x + get_global_id(0); int y = prefilter_rect.y + get_global_id(1); @@ -73,8 +70,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample, mean, variance, prefilter_rect, buffer_pass_stride, - buffer_denoising_offset, - use_split_variance); + buffer_denoising_offset); } }