From 2e50add1643d1f37dd9bd412348135477f1c3504 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 15 Oct 2017 17:40:01 +0200 Subject: [PATCH] Fix OpenCL performance regression after cubic interpolation. Reorganize code to reduce register pressure. --- .../kernel/kernels/cuda/kernel_cuda_image.h | 2 +- .../kernels/opencl/kernel_opencl_image.h | 349 +++++++----------- 2 files changed, 128 insertions(+), 223 deletions(-) diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h index b7be4fe4409..5ca07eaeb05 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h @@ -87,7 +87,7 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObjec g1x * tex2D(tex, x1, y1)); } -/* Fast tricubic texture lookup using 8 bilinear lookups. */ +/* Fast tricubic texture lookup using 8 trilinear lookups. */ template ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z) { diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h index d908af78c7a..faa9dd66d0e 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h +++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h @@ -27,9 +27,21 @@ ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uin #define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)] -ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) +ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) +{ + x %= width; + if(x < 0) + x += width; + return x; +} + +ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) +{ + return clamp(x, 0, width-1); +} + +ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, 
const ccl_global TextureInfo *info, int id, int offset) { - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); const int texture_type = kernel_tex_type(id); /* Float4 */ @@ -55,19 +67,45 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int o } } -ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) +ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y) { - x %= width; - if(x < 0) - x += width; - return x; + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + /* Wrap */ + if(info->extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info->width); + y = svm_image_texture_wrap_periodic(y, info->height); + } + else { + x = svm_image_texture_wrap_clamp(x, info->width); + y = svm_image_texture_wrap_clamp(y, info->height); + } + + int offset = x + info->width * y; + return svm_image_texture_read(kg, info, id, offset); } -ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) +ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z) { - return clamp(x, 0, width-1); + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + /* Wrap */ + if(info->extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info->width); + y = svm_image_texture_wrap_periodic(y, info->height); + z = svm_image_texture_wrap_periodic(z, info->depth); + } + else { + x = svm_image_texture_wrap_clamp(x, info->width); + y = svm_image_texture_wrap_clamp(y, info->height); + z = svm_image_texture_wrap_clamp(z, info->depth); + } + + int offset = x + info->width * y + info->width * info->height * z; + return svm_image_texture_read(kg, info, id, offset); } + ccl_device_inline float svm_image_texture_frac(float x, int *ix) { int i = float_to_int(x) - ((x < 0.0f)? 
1: 0); @@ -87,107 +125,52 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl { const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - uint width = info->width; - uint height = info->height; - uint interpolation = info->interpolation; - uint extension = info->extension; + if(info->extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } - /* Actual sampling. */ - if(interpolation == INTERPOLATION_CLOSEST) { + if(info->interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. */ int ix, iy; - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); + svm_image_texture_frac(x*info->width, &ix); + svm_image_texture_frac(y*info->height, &iy); - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - } + return svm_image_texture_read_2d(kg, id, ix, iy); + } + else if(info->interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); - return svm_image_texture_read(kg, id, ix + iy*width); + float4 r; + r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy); + r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy); + r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1); + r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1); + return r; } else { - /* Bilinear or bicubic interpolation. 
*/ - int ix, iy, nix, niy; - float tx = svm_image_texture_frac(x*width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*height - 0.5f, &iy); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - } - - if(interpolation == INTERPOLATION_LINEAR) { - /* Bilinear interpolation. */ - float4 r; - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width); - r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width); - r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width); - r += ty*tx*svm_image_texture_read(kg, id, nix + niy*width); - return r; - } - /* Bicubic interpolation. 
*/ - int pix, piy, nnix, nniy; - if(extension == EXTENSION_REPEAT) { - pix = svm_image_texture_wrap_periodic(ix-1, width); - piy = svm_image_texture_wrap_periodic(iy-1, height); - nnix = svm_image_texture_wrap_periodic(ix+2, width); - nniy = svm_image_texture_wrap_periodic(iy+2, height); - } - else { - pix = svm_image_texture_wrap_clamp(ix-1, width); - piy = svm_image_texture_wrap_clamp(iy-1, height); - nnix = svm_image_texture_wrap_clamp(ix+2, width); - nniy = svm_image_texture_wrap_clamp(iy+2, height); - } + int ix, iy; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; float u[4], v[4]; - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y) (svm_image_texture_read(kg, id, xc[x] + yc[y])) -#define TERM(col) \ - (v[col] * (u[0] * DATA(0, col) + \ - u[1] * DATA(1, col) + \ - u[2] * DATA(2, col) + \ - u[3] * DATA(3, col))) - SET_CUBIC_SPLINE_WEIGHTS(u, tx); SET_CUBIC_SPLINE_WEIGHTS(v, ty); - /* Actual interpolation. 
*/ - return TERM(0) + TERM(1) + TERM(2) + TERM(3); -#undef TERM -#undef DATA + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for(int y = 0; y < 4; y++) { + for(int x = 0; x < 4; x++) { + float weight = u[x]*v[y]; + r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1); + } + } + return r; } } @@ -196,145 +179,67 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, { const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - uint width = info->width; - uint height = info->height; - uint depth = info->depth; + if(info->extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp; - uint extension = info->extension; - /* Actual sampling. */ if(interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. */ int ix, iy, iz; - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); - svm_image_texture_frac(z*depth, &iz); + svm_image_texture_frac(x*info->width, &ix); + svm_image_texture_frac(y*info->height, &iy); + svm_image_texture_frac(z*info->depth, &iz); - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. 
*/ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - return svm_image_texture_read(kg, id, ix + iy*width + iz*width*height); + return svm_image_texture_read_3d(kg, id, ix, iy, iz); + } + else if(interpolation == INTERPOLATION_LINEAR) { + /* Trilinear interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); + float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz); + + float4 r; + r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz); + r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz); + r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz); + r += (1.0f - tz)*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz); + + r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz+1); + r += tz*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz+1); + r += tz*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz+1); + r += tz*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz+1); + return r; } else { - /* Bilinear or bicubic interpolation. 
*/ - int ix, iy, iz, nix, niy, niz; - float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy); - float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - niz = svm_image_texture_wrap_periodic(iz+1, depth); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - niz = svm_image_texture_wrap_clamp(iz+1, depth); - - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - - if(interpolation == INTERPOLATION_LINEAR) { - /* Bilinear interpolation. 
*/ - float4 r; - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + iz*width*height); - r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + iz*width*height); - r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + iz*width*height); - r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + iz*width*height); - - r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + niz*width*height); - r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + niz*width*height); - r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + niz*width*height); - r += tz*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + niz*width*height); - return r; - } - /* Bicubic interpolation. */ - int pix, piy, piz, nnix, nniy, nniz; - if(extension == EXTENSION_REPEAT) { - pix = svm_image_texture_wrap_periodic(ix-1, width); - piy = svm_image_texture_wrap_periodic(iy-1, height); - piz = svm_image_texture_wrap_periodic(iz-1, depth); - nnix = svm_image_texture_wrap_periodic(ix+2, width); - nniy = svm_image_texture_wrap_periodic(iy+2, height); - nniz = svm_image_texture_wrap_periodic(iz+2, depth); - } - else { - pix = svm_image_texture_wrap_clamp(ix-1, width); - piy = svm_image_texture_wrap_clamp(iy-1, height); - piz = svm_image_texture_wrap_clamp(iz-1, depth); - nnix = svm_image_texture_wrap_clamp(ix+2, width); - nniy = svm_image_texture_wrap_clamp(iy+2, height); - nniz = svm_image_texture_wrap_clamp(iz+2, depth); - } + int ix, iy, iz; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); + float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz); - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - const int zc[4] = {width * height * piz, - width * height * iz, - width * height * niz, - width * 
height * nniz}; float u[4], v[4], w[4]; - - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y, z) (svm_image_texture_read(kg, id, xc[x] + yc[y] + zc[z])) -#define COL_TERM(col, row) \ - (v[col] * (u[0] * DATA(0, col, row) + \ - u[1] * DATA(1, col, row) + \ - u[2] * DATA(2, col, row) + \ - u[3] * DATA(3, col, row))) -#define ROW_TERM(row) \ - (w[row] * (COL_TERM(0, row) + \ - COL_TERM(1, row) + \ - COL_TERM(2, row) + \ - COL_TERM(3, row))) - SET_CUBIC_SPLINE_WEIGHTS(u, tx); SET_CUBIC_SPLINE_WEIGHTS(v, ty); SET_CUBIC_SPLINE_WEIGHTS(w, tz); - /* Actual interpolation. */ - return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); -#undef COL_TERM -#undef ROW_TERM -#undef DATA + for(int z = 0; z < 4; z++) { + for(int y = 0; y < 4; y++) { + for(int x = 0; x < 4; x++) { + float weight = u[x]*v[y]*w[z]; + r += weight*svm_image_texture_read_3d(kg, id, ix+x-1, iy+y-1, iz+z-1); + } + } + } + return r; } }