From 45dcd20ca9e1f60c51e7752560b0042128740d69 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sat, 5 Aug 2017 04:06:39 +0200 Subject: [PATCH] Cycles: CUDA split performance tweaks, still far from megakernel. On Pabellon, 25.8s mega, 35.4s split before, 32.7s split after. --- intern/cycles/device/device_cuda.cpp | 10 +++------- intern/cycles/kernel/kernels/cuda/kernel_config.h | 9 +++++++-- intern/cycles/kernel/kernels/cuda/kernel_split.cu | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 3a29538aa13..dbf636e1405 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1898,17 +1898,13 @@ public: int threads_per_block; cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - - int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads; - int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); cuda_assert(cuLaunchKernel(func, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ + xblocks, 1, 1, /* blocks */ + threads_per_block, 1, 1, /* threads */ 0, 0, args, 0)); device->cuda_pop_context(); diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h index 9fa39dc9ebb..7ae205b7e14 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -81,8 +81,13 @@ # error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. */ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ __launch_bounds__( \ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 628891b1458..e97e87285a5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ kernel_##name(NULL); \ @@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ ccl_local type locals; \