diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 7c72ab1a009..4986bb809fb 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -184,11 +184,11 @@ public:
     KernelFunctions filter_detect_outliers_kernel;
     KernelFunctions filter_combine_halves_kernel;
 
-    KernelFunctions filter_nlm_calc_difference_kernel;
-    KernelFunctions filter_nlm_blur_kernel;
-    KernelFunctions filter_nlm_calc_weight_kernel;
-    KernelFunctions filter_nlm_update_output_kernel;
-    KernelFunctions filter_nlm_normalize_kernel;
+    KernelFunctions filter_nlm_calc_difference_kernel;
+    KernelFunctions filter_nlm_blur_kernel;
+    KernelFunctions filter_nlm_calc_weight_kernel;
+    KernelFunctions filter_nlm_update_output_kernel;
+    KernelFunctions filter_nlm_normalize_kernel;
 
     KernelFunctions filter_construct_transform_kernel;
     KernelFunctions filter_nlm_construct_gramian_kernel;
@@ -475,6 +475,7 @@ public:
         float *blurDifference = temporary_mem;
         float *difference = temporary_mem + task->buffer.pass_stride;
         float *weightAccum = temporary_mem + 2*task->buffer.pass_stride;
+        float *temp_image = temporary_mem + 3*task->buffer.pass_stride;
 
         memset(weightAccum, 0, sizeof(float)*w*h);
         memset((float*) out_ptr, 0, sizeof(float)*w*h);
@@ -499,6 +500,7 @@ public:
             filter_nlm_update_output_kernel()(dx, dy,
                                               blurDifference,
                                               (float*) image_ptr,
+                                              temp_image,
                                               (float*) out_ptr,
                                               weightAccum,
                                               local_rect,
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 23c18fa15b2..3384e1d81fd 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer()
     buffer.mem.alloc_to_device(mem_size, false);
 
     /* CPUs process shifts sequentially while GPUs process them in parallel. */
-    int num_shifts = 1;
+    int num_layers;
     if(buffer.gpu_temporary_mem) {
         /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
         int max_radius = max(radius, 6);
-        num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+        int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+        num_layers = 2*num_shifts + 1;
+    }
+    else {
+        num_layers = 4;
     }
     /* Allocate two layers per shift as well as one for the weight accumulation. */
-    buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride);
+    buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
 }
 
 void DenoisingTask::prefilter_shadowing()
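On the CPU path the temporary buffer now holds four pass-sized layers: the three slices device_cpu.cpp already carved out, plus the new temp_image scratch layer for the horizontal blur. A minimal sketch of the resulting layout (hypothetical helper for illustration, not code from this patch):

    /* Layout of temporary_mem on the CPU path, one pass_stride-sized
     * layer per pointer, matching the offsets used in device_cpu.cpp. */
    struct NLMTempLayout {
        float *blur_difference;  /* layer 0 */
        float *difference;       /* layer 1 */
        float *weight_accum;     /* layer 2 */
        float *temp_image;       /* layer 3: scratch for nlm_blur_horizontal */
    };

    static NLMTempLayout nlm_temp_layout(float *temporary_mem, int pass_stride)
    {
        NLMTempLayout l;
        l.blur_difference = temporary_mem;
        l.difference      = temporary_mem + pass_stride;
        l.weight_accum    = temporary_mem + 2*pass_stride;
        l.temp_image      = temporary_mem + 3*pass_stride;
        return l;
    }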
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index e2da0fd872b..af73c0dadf2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,6 +16,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
+#define load4_u(buf, ofs) load_float4((buf)+(ofs))
+
 ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          const float *ccl_restrict weight_image,
                                                          const float *ccl_restrict variance_image,
@@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          float a,
                                                          float k_2)
 {
+    /* Strides need to be aligned to 16 bytes. */
+    kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+    int aligned_lowx = rect.x & (~3);
+    const int numChannels = (channel_offset > 0)? 3 : 1;
+    const float4 channel_fac = make_float4(1.0f / numChannels);
+
     for(int y = rect.y; y < rect.w; y++) {
-        for(int x = rect.x; x < rect.z; x++) {
-            float diff = 0.0f;
-            int numChannels = channel_offset? 3 : 1;
-            for(int c = 0; c < numChannels; c++) {
-                float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-                float pvar = variance_image[c*channel_offset + y*stride + x];
-                float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-                diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+        int idx_p = y*stride + aligned_lowx;
+        int idx_q = (y+dy)*stride + aligned_lowx + dx;
+        for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+            float4 diff = make_float4(0.0f);
+            for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+                /* idx_p is guaranteed to be aligned, but idx_q isn't. */
+                float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+                float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+                float4 cdiff = color_p - color_q;
+                float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+                float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+                diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
             }
-            if(numChannels > 1) {
-                diff *= 1.0f/numChannels;
-            }
-            difference_image[y*stride + x] = diff;
+            load4_a(difference_image, idx_p) = diff*channel_fac;
         }
     }
 }
@@ -50,23 +61,61 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int stride,
                                               int f)
 {
-    int aligned_lowx = rect.x / 4;
-    int aligned_highx = (rect.z + 3) / 4;
+    int aligned_lowx = round_down(rect.x, 4);
     for(int y = rect.y; y < rect.w; y++) {
         const int low = max(rect.y, y-f);
         const int high = min(rect.w, y+f+1);
-        for(int x = rect.x; x < rect.z; x++) {
-            out_image[y*stride + x] = 0.0f;
+        for(int x = aligned_lowx; x < rect.z; x += 4) {
+            load4_a(out_image, y*stride + x) = make_float4(0.0f);
         }
         for(int y1 = low; y1 < high; y1++) {
-            float4* out_image4 = (float4*)(out_image + y*stride);
-            float4* difference_image4 = (float4*)(difference_image + y1*stride);
-            for(int x = aligned_lowx; x < aligned_highx; x++) {
-                out_image4[x] += difference_image4[x];
+            for(int x = aligned_lowx; x < rect.z; x += 4) {
+                load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
             }
         }
-        for(int x = rect.x; x < rect.z; x++) {
-            out_image[y*stride + x] *= 1.0f/(high - low);
+        float fac = 1.0f/(high - low);
+        for(int x = aligned_lowx; x < rect.z; x += 4) {
+            load4_a(out_image, y*stride + x) *= fac;
+        }
+    }
+}
+
+ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
+                                           float *out_image,
+                                           int4 rect,
+                                           int stride,
+                                           int f)
+{
+    int aligned_lowx = round_down(rect.x, 4);
+    for(int y = rect.y; y < rect.w; y++) {
+        for(int x = aligned_lowx; x < rect.z; x += 4) {
+            load4_a(out_image, y*stride + x) = make_float4(0.0f);
+        }
+    }
+
+    for(int dx = -f; dx <= f; dx++) {
+        aligned_lowx = round_down(rect.x - min(0, dx), 4);
+        int highx = rect.z - max(0, dx);
+        int4 lowx4 = make_int4(rect.x - min(0, dx));
+        int4 highx4 = make_int4(rect.z - max(0, dx));
+        for(int y = rect.y; y < rect.w; y++) {
+            for(int x = aligned_lowx; x < highx; x += 4) {
+                int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+                int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+                float4 diff = load4_u(difference_image, y*stride + x + dx);
+                load4_a(out_image, y*stride + x) += mask(active, diff);
+            }
+        }
+    }
+
+    aligned_lowx = round_down(rect.x, 4);
+    for(int y = rect.y; y < rect.w; y++) {
+        for(int x = aligned_lowx; x < rect.z; x += 4) {
+            float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+            float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+            float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
+            load4_a(out_image, y*stride + x) *= rcp(high - low);
         }
     }
 }
@@ -77,25 +126,12 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
                                                      int stride,
                                                      int f)
 {
+    nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
+
+    int aligned_lowx = round_down(rect.x, 4);
     for(int y = rect.y; y < rect.w; y++) {
-        for(int x = rect.x; x < rect.z; x++) {
-            out_image[y*stride + x] = 0.0f;
-        }
-    }
-    for(int dx = -f; dx <= f; dx++) {
-        int pos_dx = max(0, dx);
-        int neg_dx = min(0, dx);
-        for(int y = rect.y; y < rect.w; y++) {
-            for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
-                out_image[y*stride + x] += difference_image[y*stride + x+dx];
-            }
-        }
-    }
-    for(int y = rect.y; y < rect.w; y++) {
-        for(int x = rect.x; x < rect.z; x++) {
-            const int low = max(rect.x, x-f);
-            const int high = min(rect.z, x+f+1);
-            out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
+        for(int x = aligned_lowx; x < rect.z; x += 4) {
+            load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
         }
     }
 }
@@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
 ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        const float *ccl_restrict difference_image,
                                                        const float *ccl_restrict image,
+                                                       float *temp_image,
                                                        float *out_image,
                                                        float *accum_image,
                                                        int4 rect,
                                                        int stride,
                                                        int f)
 {
+    nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
+
+    int aligned_lowx = round_down(rect.x, 4);
     for(int y = rect.y; y < rect.w; y++) {
-        for(int x = rect.x; x < rect.z; x++) {
-            const int low = max(rect.x, x-f);
-            const int high = min(rect.z, x+f+1);
-            float sum = 0.0f;
-            for(int x1 = low; x1 < high; x1++) {
-                sum += difference_image[y*stride + x1];
-            }
-            float weight = sum * (1.0f/(high - low));
-            accum_image[y*stride + x] += weight;
-            out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
+        for(int x = aligned_lowx; x < rect.z; x += 4) {
+            int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+            int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
+
+            int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+
+            float4 weight = load4_a(temp_image, idx_p);
+            load4_a(accum_image, idx_p) += mask(active, weight);
+
+            float4 val = load4_u(image, idx_q);
+
+            load4_a(out_image, idx_p) += mask(active, weight*val);
         }
     }
 }
@@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
     }
 }
 
+#undef load4_a
+#undef load4_u
+
 CCL_NAMESPACE_END
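The key trick in these kernels is the border handling in nlm_blur_horizontal: rather than trimming the loop bounds per pixel, every iteration processes a full float4 group and zeroes the lanes that fall outside [lowx, highx) with a comparison mask. A standalone sketch of the same pattern in raw SSE2 intrinsics (illustration only, not Cycles code; assumes 16-byte-aligned buffers padded to a multiple of four floats, as the kernel_assert above suggests):

    #include <emmintrin.h>  /* SSE2 */

    /* Accumulate in[x + dx] into out[x] for x in [lowx, highx), four
     * lanes at a time, masking out lanes outside the valid range. */
    static void masked_shift_accum(const float *in, float *out,
                                   int lowx, int highx, int dx)
    {
        const __m128i lowx4 = _mm_set1_epi32(lowx);
        const __m128i highx4 = _mm_set1_epi32(highx);
        for(int x = lowx & ~3; x < highx; x += 4) {
            __m128i x4 = _mm_add_epi32(_mm_set1_epi32(x), _mm_setr_epi32(0, 1, 2, 3));
            /* active = (x4 >= lowx4) & (x4 < highx4), all-ones per valid lane. */
            __m128i ge = _mm_xor_si128(_mm_set1_epi32(0xffffffff),
                                       _mm_cmplt_epi32(x4, lowx4));
            __m128i active = _mm_and_si128(ge, _mm_cmplt_epi32(x4, highx4));
            /* The shifted read is unaligned (load4_u); the in-place
             * update stays aligned (load4_a). */
            __m128 val = _mm_loadu_ps(in + x + dx);
            val = _mm_and_ps(_mm_castsi128_ps(active), val);
            _mm_store_ps(out + x, _mm_add_ps(_mm_load_ps(out + x), val));
        }
    }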
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index b62aa9663ec..e036b53b810 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 26777fdabb2..4c758711481 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 #ifdef KERNEL_STUB
     STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-    kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
+    kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
 #endif
 }
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 52aeb8d8599..eafae5f31c0 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -220,6 +220,30 @@ ccl_device_inline float __uint_as_float(uint i)
     u.i = i;
     return u.f;
 }
+
+ccl_device_inline int4 __float4_as_int4(float4 f)
+{
+#ifdef __KERNEL_SSE__
+    return int4(_mm_castps_si128(f.m128));
+#else
+    return make_int4(__float_as_int(f.x),
+                     __float_as_int(f.y),
+                     __float_as_int(f.z),
+                     __float_as_int(f.w));
+#endif
+}
+
+ccl_device_inline float4 __int4_as_float4(int4 i)
+{
+#ifdef __KERNEL_SSE__
+    return float4(_mm_castsi128_ps(i.m128));
+#else
+    return make_float4(__int_as_float(i.x),
+                       __int_as_float(i.y),
+                       __int_as_float(i.z),
+                       __int_as_float(i.w));
+#endif
+}
 #endif  /* __KERNEL_OPENCL__ */
 
 /* Versions of functions which are safe for fast math. */
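The two bit-cast helpers exist so that fast_exp2f4 (added below in util_math_fast.h) can scale its polynomial result by 2^m directly in the IEEE-754 exponent field. A scalar illustration of the trick (valid while the input is positive and normal and the exponent stays in range; not code from the patch):

    /* Multiply f by 2^m by adding m to the exponent bits; this is what
     * __int4_as_float4(__float4_as_int4(r) + (m << 23)) does per lane. */
    static float scale_by_pow2(float f, int m)
    {
        union { float f; int i; } u;
        u.f = f;
        u.i += m << 23;  /* the exponent field starts at bit 23 */
        return u.f;      /* e.g. scale_by_pow2(1.5f, 3) == 12.0f */
    }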
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index d3960deb3b4..323d40058e5 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c)
     return a * b + c;
 }
 
+ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
+{
+    return a * b + c;
+}
+
 /*
  * FAST & APPROXIMATE MATH
  *
@@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x)
     return fast_exp2f(x / M_LN2_F);
 }
 
+#ifndef __KERNEL_GPU__
+ccl_device float4 fast_exp2f4(float4 x)
+{
+    const float4 one = make_float4(1.0f);
+    const float4 limit = make_float4(126.0f);
+    x = clamp(x, -limit, limit);
+    int4 m = make_int4(x);
+    x = one - (one - (x - make_float4(m)));
+    float4 r = make_float4(1.33336498402e-3f);
+    r = madd4(x, r, make_float4(9.810352697968e-3f));
+    r = madd4(x, r, make_float4(5.551834031939e-2f));
+    r = madd4(x, r, make_float4(0.2401793301105f));
+    r = madd4(x, r, make_float4(0.693144857883f));
+    r = madd4(x, r, make_float4(1.0f));
+    return __int4_as_float4(__float4_as_int4(r) + (m << 23));
+}
+
+ccl_device_inline float4 fast_expf4(float4 x)
+{
+    return fast_exp2f4(x / M_LN2_F);
+}
+#endif
+
 ccl_device_inline float fast_exp10(float x)
 {
     /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
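One way to sanity-check the vectorized approximation is to compare a scalar re-implementation of the same polynomial against libm over the clamp range; a minimal standalone harness, assuming the default round-to-nearest mode (which is what _mm_cvtps_epi32 uses in make_int4(float4) below) — names are made up, this is not Cycles code:

    #include <cmath>
    #include <cstdio>

    /* Scalar version of the fast_exp2f4 polynomial from the patch. */
    static float ref_fast_exp2(float x)
    {
        x = fminf(fmaxf(x, -126.0f), 126.0f);
        int m = (int)nearbyintf(x);  /* mirrors the SSE float->int conversion */
        float t = x - (float)m;
        float r = 1.33336498402e-3f;
        r = t * r + 9.810352697968e-3f;
        r = t * r + 5.551834031939e-2f;
        r = t * r + 0.2401793301105f;
        r = t * r + 0.693144857883f;
        r = t * r + 1.0f;
        union { float f; int i; } u = { r };
        u.i += m << 23;  /* scale by 2^m via the exponent field */
        return u.f;
    }

    int main()
    {
        float max_rel = 0.0f;
        for(float x = -80.0f; x <= 80.0f; x += 1e-3f) {
            float rel = fabsf(ref_fast_exp2(x) - exp2f(x)) / exp2f(x);
            max_rel = fmaxf(max_rel, rel);
        }
        printf("max relative error: %g\n", max_rel);
        return 0;
    }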
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index aa7e56fefe9..105547098b5 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b);
 ccl_device_inline float4 operator-(const float4& a, const float4& b);
 ccl_device_inline float4 operator+=(float4& a, const float4& b);
 ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, float f);
 ccl_device_inline float4 operator/=(float4& a, float f);
 
 ccl_device_inline int4 operator<(const float4& a, const float4& b);
@@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx);
 ccl_device_inline float4 fabs(const float4& a);
 #endif  /* !__KERNEL_OPENCL__*/
 
@@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b)
     return a = a * b;
 }
 
+ccl_device_inline float4 operator*=(float4& a, float f)
+{
+    return a = a * f;
+}
+
 ccl_device_inline float4 operator/=(float4& a, float f)
 {
     return a = a / f;
@@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 #endif
 }
 
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx)
+{
+    return min(max(a, mn), mx);
+}
+
 ccl_device_inline float4 fabs(const float4& a)
 {
 #ifdef __KERNEL_SSE__
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
index 79a8c0841e7..cde366b8c27 100644
--- a/intern/cycles/util/util_math_int4.h
+++ b/intern/cycles/util/util_math_int4.h
@@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline int4 operator+(const int4& a, const int4& b);
 ccl_device_inline int4 operator+=(int4& a, const int4& b);
 ccl_device_inline int4 operator>>(const int4& a, int i);
+ccl_device_inline int4 operator<<(const int4& a, int i);
+ccl_device_inline int4 operator<(const int4& a, const int4& b);
+ccl_device_inline int4 operator>=(const int4& a, const int4& b);
+ccl_device_inline int4 operator&(const int4& a, const int4& b);
 ccl_device_inline int4 min(int4 a, int4 b);
 ccl_device_inline int4 max(int4 a, int4 b);
 ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
@@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i)
 #endif
 }
 
+ccl_device_inline int4 operator<<(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+    return int4(_mm_slli_epi32(a.m128, i));
+#else
+    return make_int4(a.x << i, a.y << i, a.z << i, a.w << i);
+#endif
+}
+
+ccl_device_inline int4 operator<(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+    return int4(_mm_cmplt_epi32(a.m128, b.m128));
+#else
+    return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+ccl_device_inline int4 operator>=(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+    return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
+#else
+    return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+ccl_device_inline int4 operator&(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+    return int4(_mm_and_si128(a.m128, b.m128));
+#else
+    return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+#endif
+}
+
 ccl_device_inline int4 min(int4 a, int4 b)
 {
 #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
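These comparison operators return a per-lane mask (all ones under SSE, 0/1 in the scalar fallback), which mask() then uses to zero inactive lanes; operator>= is expressed as NOT(a < b) via the XOR with all ones because SSE2 has no _mm_cmpge_epi32. A small usage sketch of the pattern the NLM kernels rely on (hypothetical function, assumes the Cycles util headers):

    ccl_device_inline float4 border_mask_example()
    {
        /* Lanes for pixels 8..11, of which only [9, 11) lie inside the rect. */
        int4 x4 = make_int4(8) + make_int4(0, 1, 2, 3);
        int4 active = (x4 >= make_int4(9)) & (x4 < make_int4(11));
        /* mask() zeroes the inactive lanes: returns {0, 1, 1, 0}. */
        return mask(active, make_float4(1.0f));
    }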
diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h
index cdd0ecbdae5..4ef162f1ac6 100644
--- a/intern/cycles/util/util_types_int4.h
+++ b/intern/cycles/util/util_types_int4.h
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_GPU__
 struct float3;
+struct float4;
 
 struct ccl_try_align(16) int4 {
 #ifdef __KERNEL_SSE__
@@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 {
 ccl_device_inline int4 make_int4(int i);
 ccl_device_inline int4 make_int4(int x, int y, int z, int w);
 ccl_device_inline int4 make_int4(const float3& f);
+ccl_device_inline int4 make_int4(const float4& f);
 ccl_device_inline void print_int4(const char *label, const int4& a);
 #endif  /* __KERNEL_GPU__ */
 
diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h
index 07cdc88f2dc..a62561709de 100644
--- a/intern/cycles/util/util_types_int4_impl.h
+++ b/intern/cycles/util/util_types_int4_impl.h
@@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f)
     return a;
 }
 
+ccl_device_inline int4 make_int4(const float4& f)
+{
+#ifdef __KERNEL_SSE__
+    int4 a(_mm_cvtps_epi32(f.m128));
+#else
+    int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+    return a;
+}
+
 ccl_device_inline void print_int4(const char *label, const int4& a)
 {
     printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
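One behavioural detail of make_int4(const float4&): the SSE path uses _mm_cvtps_epi32, which obeys the current rounding mode (round-to-nearest by default), while the scalar fallback truncates toward zero, e.g.:

    make_int4(make_float4(-1.7f));  /* SSE: {-2,-2,-2,-2}, scalar: {-1,-1,-1,-1} */

For fast_exp2f4 either is fine, since 2^x == 2^m * 2^(x-m) holds for any integer m and only the residual range fed to the polynomial changes; code that needs exact truncation semantics should not rely on the SSE path.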