Cycles: CUDA split performance tweaks, still far from megakernel.

On Pabellon, 25.8s mega, 35.4s split before, 32.7s split after.
2017-08-05 04:06:39 +02:00 · 2017-08-05 04:06:39 +02:00 · 45dcd20ca9
commit 45dcd20ca9
parent fa05718f27
3 changed files with 12 additions and 11 deletions
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@ -1898,17 +1898,13 @@ public:
 		int threads_per_block;
 		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));

-		int xthreads = (int)sqrt(threads_per_block);
-		int ythreads = (int)sqrt(threads_per_block);
-
-		int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
-		int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+		int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;

 		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));

 		cuda_assert(cuLaunchKernel(func,
-		                           xblocks , yblocks, 1, /* blocks */
-		                           xthreads, ythreads, 1, /* threads */
+		                           xblocks, 1, 1, /* blocks */
+		                           threads_per_block, 1, 1, /* threads */
 		                           0, 0, args, 0));

 		device->cuda_pop_context();
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@ -81,8 +81,13 @@
 #  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
 #endif

-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
+/* For split kernel using all registers seems fastest for now, but this
+ * is unlikely to be optimal once we resolve other bottlenecks. */
+
+#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */

 #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
 	__launch_bounds__( \
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init(

 #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
 	extern "C" __global__ void \
-	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
 	kernel_cuda_##name() \
 	{ \
 		kernel_##name(NULL); \
@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init(

 #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
 	extern "C" __global__ void \
-	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
 	kernel_cuda_##name() \
 	{ \
 		ccl_local type locals; \