Cycles: CUDA split performance tweaks, still far from megakernel.

On Pabellon, 25.8s mega, 35.4s split before, 32.7s split after.
This commit is contained in:
Brecht Van Lommel 2017-08-05 04:06:39 +02:00
parent fa05718f27
commit 45dcd20ca9
3 changed files with 12 additions and 11 deletions

@ -1898,17 +1898,13 @@ public:
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
int xthreads = (int)sqrt(threads_per_block);
int ythreads = (int)sqrt(threads_per_block);
int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuLaunchKernel(func,
xblocks , yblocks, 1, /* blocks */
xthreads, ythreads, 1, /* threads */
xblocks, 1, 1, /* blocks */
threads_per_block, 1, 1, /* threads */
0, 0, args, 0));
device->cuda_pop_context();

@ -81,8 +81,13 @@
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif
/* compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread */
/* For split kernel using all registers seems fastest for now, but this
* is unlikely to be optimal once we resolve other bottlenecks. */
#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
__launch_bounds__( \

@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init(
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
extern "C" __global__ void \
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
kernel_cuda_##name() \
{ \
kernel_##name(NULL); \
@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init(
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
extern "C" __global__ void \
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
kernel_cuda_##name() \
{ \
ccl_local type locals; \