Cycles: CUDA implementation of split kernel

2017-02-14 05:50:29 -05:00 · 2017-02-14 05:50:29 -05:00 · 817873cc83
commit 817873cc83
parent 0892352bfe
13 changed files with 599 additions and 134 deletions
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@ -668,6 +668,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)

        cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+        cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)

        cls.debug_opencl_kernel_type = EnumProperty(
            name="OpenCL Kernel Type",
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@ -1523,6 +1523,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
        col = layout.column()
        col.label('CUDA Flags:')
        col.prop(cscene, "debug_use_cuda_adaptive_compile")
+        col.prop(cscene, "debug_use_cuda_split_kernel")

        col = layout.column()
        col.label('OpenCL Flags:')
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@ -70,6 +70,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
 	/* Synchronize CUDA flags. */
 	flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+	flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
 	/* Synchronize OpenCL kernel type. */
 	switch(get_enum(cscene, "debug_opencl_kernel_type")) {
 		case 0:
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@ -22,6 +22,7 @@

 #include "device.h"
 #include "device_intern.h"
+#include "device_split_kernel.h"

 #include "buffers.h"

@ -43,6 +44,8 @@
 #include "util_types.h"
 #include "util_time.h"

+#include "split/kernel_split_data.h"
+
 CCL_NAMESPACE_BEGIN

 #ifndef WITH_CUDA_DYNLOAD
@ -79,6 +82,29 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */

+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+	CUDADevice *device;
+public:
+	explicit CUDASplitKernel(CUDADevice *device);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(DeviceTask *task);
+};
+
 class CUDADevice : public Device
 {
 public:
@ -259,11 +285,16 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
 	}

+	bool use_split_kernel()
+	{
+		return DebugFlags().cuda.split_kernel;
+	}
+
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features)
+	        const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
@ -288,6 +319,11 @@ public:
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
 #endif
+
+		if(split) {
+			cflags += " -D__SPLIT__";
+		}
+
 		return cflags;
 	}

@ -321,7 +357,7 @@ public:
 		return true;
 	}

-	string compile_kernel(const DeviceRequestedFeatures& requested_features)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		/* Compute cubin name. */
 		int major, minor;
@ -330,7 +366,8 @@ public:

 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+			const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
+			                                                  : "lib/kernel_sm_%d%d.cubin",
 			                                            major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
@ -340,7 +377,7 @@ public:
 		}

 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features);
+		        compile_kernel_get_common_cflags(requested_features, split);

 		/* Try to use locally compiled kernel. */
 		const string kernel_path = path_get("kernel");
@ -351,7 +388,8 @@ public:
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);

-		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
+		const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
+		                                              : "cycles_kernel_sm%d%d_%s.cubin",
 		                                        major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
@ -386,7 +424,7 @@ public:
 		const char *nvcc = cuewCompilerPath();
 		const string kernel = path_join(kernel_path,
 		                          path_join("kernels",
-		                                    path_join("cuda", "kernel.cu")));
+		                                    path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");

@ -434,7 +472,7 @@ public:
 			return false;

 		/* get kernel */
-		string cubin = compile_kernel(requested_features);
+		string cubin = compile_kernel(requested_features, use_split_kernel());

 		if(cubin == "")
 			return false;
@ -1261,6 +1299,7 @@ public:
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();

+			if(!use_split_kernel()) {
 				/* keep rendering tiles until done */
 				while(task->acquire_tile(this, tile)) {
 					int start_sample = tile.start_sample;
@ -1282,6 +1321,28 @@ public:
 					task->release_tile(tile);
 				}
 			}
+			else {
+				DeviceRequestedFeatures requested_features;
+				if(!use_adaptive_compilation()) {
+					requested_features.max_closure = 64;
+				}
+
+				CUDASplitKernel split_kernel(this);
+				split_kernel.load_kernels(requested_features);
+
+				while(task->acquire_tile(this, tile)) {
+					device_memory void_buffer;
+					split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+
+					task->release_tile(tile);
+
+					if(task->get_cancel()) {
+						if(task->need_finish_queue == false)
+							break;
+					}
+				}
+			}
+		}
 		else if(task->type == DeviceTask::SHADER) {
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
@ -1332,8 +1393,186 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+	friend class CUDASplitKernelFunction;
+	friend class CUDASplitKernel;
 };

+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+	{ \
+		CUresult result = stmt; \
+		\
+		if(result != CUDA_SUCCESS) { \
+			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			if(device->error_msg == "") \
+				device->error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+			/*cuda_abort();*/ \
+			device->cuda_error_documentation(); \
+		} \
+	} (void)0
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+	CUDADevice* device;
+	CUfunction func;
+public:
+	CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+	/* enqueue the kernel, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+	{
+		return enqueue(dim, NULL);
+	}
+
+	/* enqueue the kernel, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, void *args[])
+	{
+		device->cuda_push_context();
+
+		if(device->have_error())
+			return false;
+
+		/* we ignore dim.local_size for now, as this is faster */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
+
+		int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+		int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+
+		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+		cuda_assert(cuLaunchKernel(func,
+			                       xblocks , yblocks, 1, /* blocks */
+			                       xthreads, ythreads, 1, /* threads */
+			                       0, 0, args, 0));
+
+		device->cuda_pop_context();
+
+		return !device->have_error();
+	}
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                    RenderTile& rtile,
+                                    int num_global_elements,
+                                    device_memory& /*kernel_globals*/,
+                                    device_memory& /*kernel_data*/,
+                                    device_memory& split_data,
+                                    device_memory& ray_state,
+                                    device_memory& queue_index,
+                                    device_memory& use_queues_flag,
+                                    device_memory& work_pool_wgs)
+{
+	device->cuda_push_context();
+
+	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+	CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
+	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+	int end_sample = rtile.start_sample + rtile.num_samples;
+	int queue_size = dim.global_size[0] * dim.global_size[1];
+
+	struct args_t {
+		CUdeviceptr* split_data_buffer;
+		int* num_elements;
+		CUdeviceptr* ray_state;
+		CUdeviceptr* rng_state;
+		int* start_sample;
+		int* end_sample;
+		int* sx;
+		int* sy;
+		int* sw;
+		int* sh;
+		int* offset;
+		int* stride;
+		CUdeviceptr* queue_index;
+		int* queuesize;
+		CUdeviceptr* use_queues_flag;
+		CUdeviceptr* work_pool_wgs;
+		int* num_samples;
+		CUdeviceptr* buffer;
+	};
+
+	args_t args = {
+		&d_split_data,
+		&num_global_elements,
+		&d_ray_state,
+		&d_rng_state,
+		&rtile.start_sample,
+		&end_sample,
+		&rtile.x,
+		&rtile.y,
+		&rtile.w,
+		&rtile.h,
+		&rtile.offset,
+		&rtile.stride,
+		&d_queue_index,
+		&queue_size,
+		&d_use_queues_flag,
+		&d_work_pool_wgs,
+		&rtile.num_samples,
+		&d_buffer
+	};
+
+	CUfunction data_init;
+	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+	if(device->have_error()) {
+		return false;
+	}
+
+	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+
+	device->cuda_pop_context();
+
+	return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+	CUfunction func;
+
+	device->cuda_push_context();
+
+	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+	if(device->have_error()) {
+		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+		return NULL;
+	}
+
+	device->cuda_pop_context();
+
+	return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+	return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
+{
+	/* TODO(mai): implement something here to detect ideal work size */
+	return make_int2(256, 256);
+}
+
 bool device_cuda_init(void)
 {
 #ifdef WITH_CUDA_DYNLOAD
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -27,6 +27,7 @@ set(SRC
 	kernels/opencl/kernel_next_iteration_setup.cl
 	kernels/opencl/kernel_sum_all_radiance.cl
 	kernels/cuda/kernel.cu
+	kernels/cuda/kernel_split.cu
 )

 set(SRC_BVH_HEADERS
@ -89,6 +90,10 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu_image.h
 )

+set(SRC_KERNELS_CUDA_HEADERS
+	kernels/cuda/kernel_config.h
+)
+
 set(SRC_CLOSURE_HEADERS
 	closure/alloc.h
 	closure/bsdf.h
@ -230,8 +235,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()

 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu
+	set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
 		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
 		${SRC_BVH_HEADERS}
 		${SRC_SVM_HEADERS}
 		${SRC_GEOM_HEADERS}
@ -240,15 +246,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	)
 	set(cuda_cubins)

-	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
-		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
-			set(cuda_cubin kernel_experimental_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
+		if(${split})
+			set(cuda_extra_flags "-D__SPLIT__")
+			set(cuda_cubin kernel_split)
 		else()
 			set(cuda_extra_flags "")
-			set(cuda_cubin kernel_${arch}.cubin)
+			set(cuda_cubin kernel)
 		endif()

+		if(${experimental})
+			set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
+			set(cuda_cubin ${cuda_cubin}_experimental)
+		endif()
+
+		set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+
 		if(WITH_CYCLES_DEBUG)
 			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
 		else()
@ -261,13 +274,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")

+		if(split)
+			set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
+		else()
+			set(cuda_kernel_src "/kernels/cuda/kernel.cu")
+		endif()
+
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
 			COMMAND ${cuda_nvcc_command}
 					-arch=${arch}
 					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
@ -294,7 +313,12 @@ if(WITH_CYCLES_CUDA_BINARIES)

 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+
+		if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+			# Compile split kernel
+			CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+		endif()
 	endforeach()

 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@ -352,6 +376,7 @@ add_library(cycles_kernel
 	${SRC}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
+	${SRC_KERNELS_CUDA_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
 	${SRC_SVM_HEADERS}
@ -386,7 +411,9 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@ -46,11 +46,58 @@
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
 #define ccl_constant
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
 #define ccl_align(n) __align__(n)

+ccl_device_inline uint ccl_local_id(uint d)
+{
+	switch(d) {
+		case 0: return threadIdx.x;
+		case 1: return threadIdx.y;
+		case 2: return threadIdx.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+	switch(d) {
+		case 0: return blockDim.x;
+		case 1: return blockDim.y;
+		case 2: return blockDim.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+	switch(d) {
+		case 0: return blockIdx.x;
+		case 1: return blockIdx.y;
+		case 2: return blockIdx.z;
+		default: return 0;
+	}
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+	switch(d) {
+		case 0: return gridDim.x;
+		case 1: return gridDim.y;
+		case 2: return gridDim.z;
+		default: return 0;
+	}
+}
+
 /* No assert supported for CUDA */

 #define kernel_assert(cond)
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -92,12 +92,14 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_CUDA__
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
+#  ifndef __SPLIT_KERNEL__
 #    define __BRANCHED_PATH__
 #    define __VOLUME__
 #    define __VOLUME_SCATTER__
 #    define __SUBSURFACE__
 #    define __CMJ__
 #    define __SHADOW_RECORD_ALL__
+#  endif
 #endif  /* __KERNEL_CUDA__ */

 #ifdef __KERNEL_OPENCL__
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@ -16,7 +16,10 @@

 /* CUDA kernel entry points */

+#ifdef __CUDA_ARCH__
+
 #include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
 #include "../../kernel_math.h"
 #include "../../kernel_types.h"
 #include "../../kernel_globals.h"
@ -25,104 +28,7 @@
 #include "../../kernel_path_branched.h"
 #include "../../kernel_bake.h"

-/* device data taken from CUDA occupancy calculator */
-
-#ifdef __CUDA_ARCH__
-
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 32
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 48
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
-	__launch_bounds__( \
-		threads_block_width*threads_block_width, \
-		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
-		)
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-#  error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-#  error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
-
 /* kernels */
-
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 32
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0, 5.2, 5.3, 6.0, 6.1 */
+#elif __CUDA_ARCH__ >= 500
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 48
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		threads_block_width*threads_block_width, \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "../../split/kernel_split_common.h"
+#include "../../split/kernel_data_init.h"
+#include "../../split/kernel_scene_intersect.h"
+#include "../../split/kernel_lamp_emission.h"
+#include "../../split/kernel_queue_enqueue.h"
+#include "../../split/kernel_background_buffer_update.h"
+#include "../../split/kernel_shader_eval.h"
+#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "../../split/kernel_direct_lighting.h"
+#include "../../split/kernel_shadow_blocked.h"
+#include "../../split/kernel_next_iteration_setup.h"
+#include "../../split/kernel_sum_all_radiance.h"
+
+#include "../../kernel_film.h"
+
+/* kernels */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        ccl_global uint *rng_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer)
+{
+	kernel_data_init(NULL,
+	                 NULL,
+	                 split_data_buffer,
+	                 num_elements,
+	                 ray_state,
+	                 rng_state,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_pool_wgs,
+	                 num_samples,
+	                 buffer);
+}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	extern "C" __global__ void \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	kernel_cuda_##name() \
+	{ \
+		kernel_##name(NULL); \
+	}
+
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
+DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+#endif
+
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@ -142,8 +142,15 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
 	split_data->ray_state = ray_state;
 }

+#ifndef __KERNEL_CUDA__
 #  define kernel_split_state (kg->split_data)
 #  define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+#  define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+#  define kernel_split_params (__split_param_data)
+#endif  /* __KERNEL_CUDA__ */

 CCL_NAMESPACE_END

--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@ -60,7 +60,8 @@ void DebugFlags::CPU::reset()
 }

 DebugFlags::CUDA::CUDA()
-  : adaptive_compile(false)
+  : adaptive_compile(false),
+    split_kernel(false)
 {
 	reset();
 }
@ -69,6 +70,8 @@ void DebugFlags::CUDA::reset()
 {
 	if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
 		adaptive_compile = true;
+
+	split_kernel = false;
 }

 DebugFlags::OpenCL::OpenCL()
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@ -61,6 +61,9 @@ public:
 		/* Whether adaptive feature based runtime compile is enabled or not.
 		 * Requires the CUDA Toolkit and only works on Linux atm. */
 		bool adaptive_compile;
+
+		/* Whether split kernel is used */
+		bool split_kernel;
 	};

 	/* Descriptor of OpenCL feature-set to be used. */