forked from bartvdbraak/blender
Cycles: CUDA split performance tweaks, still far from megakernel.
On Pabellon, 25.8s mega, 35.4s split before, 32.7s split after.
This commit is contained in:
parent
fa05718f27
commit
45dcd20ca9
@ -1898,17 +1898,13 @@ public:
|
||||
int threads_per_block;
|
||||
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
|
||||
|
||||
int xthreads = (int)sqrt(threads_per_block);
|
||||
int ythreads = (int)sqrt(threads_per_block);
|
||||
|
||||
int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
|
||||
int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
|
||||
int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
cuda_assert(cuLaunchKernel(func,
|
||||
xblocks , yblocks, 1, /* blocks */
|
||||
xthreads, ythreads, 1, /* threads */
|
||||
xblocks, 1, 1, /* blocks */
|
||||
threads_per_block, 1, 1, /* threads */
|
||||
0, 0, args, 0));
|
||||
|
||||
device->cuda_pop_context();
|
||||
|
@ -81,8 +81,13 @@
|
||||
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
|
||||
#endif
|
||||
|
||||
/* compute number of threads per block and minimum blocks per multiprocessor
|
||||
* given the maximum number of registers per thread */
|
||||
/* For split kernel using all registers seems fastest for now, but this
|
||||
* is unlikely to be optimal once we resolve other bottlenecks. */
|
||||
|
||||
#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
|
||||
|
||||
/* Compute number of threads per block and minimum blocks per multiprocessor
|
||||
* given the maximum number of registers per thread. */
|
||||
|
||||
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
|
||||
__launch_bounds__( \
|
||||
|
@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init(
|
||||
|
||||
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||
extern "C" __global__ void \
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
|
||||
kernel_cuda_##name() \
|
||||
{ \
|
||||
kernel_##name(NULL); \
|
||||
@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init(
|
||||
|
||||
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
|
||||
extern "C" __global__ void \
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
|
||||
kernel_cuda_##name() \
|
||||
{ \
|
||||
ccl_local type locals; \
|
||||
|
Loading…
Reference in New Issue
Block a user