diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 1f0b712c93e..ca109734314 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -668,6 +668,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False) + cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_opencl_kernel_type = EnumProperty( name="OpenCL Kernel Type", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 8d3fe877597..7c1e3e270fb 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -1523,6 +1523,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): col = layout.column() col.label('CUDA Flags:') col.prop(cscene, "debug_use_cuda_adaptive_compile") + col.prop(cscene, "debug_use_cuda_split_kernel") col = layout.column() col.label('OpenCL Flags:') diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index ed410e15e7b..75118c43747 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -70,6 +70,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); + flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); /* Synchronize OpenCL kernel type. */ switch(get_enum(cscene, "debug_opencl_kernel_type")) { case 0: diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 1e5ce7875b1..74f36022b30 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -22,6 +22,7 @@ #include "device.h" #include "device_intern.h" +#include "device_split_kernel.h" #include "buffers.h" @@ -43,6 +44,8 @@ #include "util_types.h" #include "util_time.h" +#include "split/kernel_split_data.h" + CCL_NAMESPACE_BEGIN #ifndef WITH_CUDA_DYNLOAD @@ -79,6 +82,29 @@ int cuewCompilerVersion(void) } /* namespace */ #endif /* WITH_CUDA_DYNLOAD */ +class CUDADevice; + +class CUDASplitKernel : public DeviceSplitKernel { + CUDADevice *device; +public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(DeviceTask *task); +}; + class CUDADevice : public Device { public: @@ -259,11 +285,16 @@ public: return DebugFlags().cuda.adaptive_compile; } + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + /* Common NVCC flags which stays the same regardless of shading model, * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features) + const DeviceRequestedFeatures& requested_features, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -288,6 +319,11 @@ public: #ifdef WITH_CYCLES_DEBUG cflags += " -D__KERNEL_DEBUG__"; #endif + + if(split) { + cflags += " -D__SPLIT__"; + } + return cflags; } @@ -321,7 +357,7 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features) + string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) { /* Compute cubin name. */ int major, minor; @@ -330,7 +366,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", + const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin" + : "lib/kernel_sm_%d%d.cubin", major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { @@ -340,7 +377,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features); + compile_kernel_get_common_cflags(requested_features, split); /* Try to use locally compiled kernel. */ const string kernel_path = path_get("kernel"); @@ -351,7 +388,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin", + const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin" + : "cycles_kernel_sm%d%d_%s.cubin", major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); @@ -386,7 +424,7 @@ public: const char *nvcc = cuewCompilerPath(); const string kernel = path_join(kernel_path, path_join("kernels", - path_join("cuda", "kernel.cu"))); + path_join("cuda", split ? "kernel_split.cu" : "kernel.cu"))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -434,7 +472,7 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features); + string cubin = compile_kernel(requested_features, use_split_kernel()); if(cubin == "") return false; @@ -1261,25 +1299,48 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(!use_split_kernel()) { + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + path_trace(tile, sample, branched); + + tile.sample = sample + 1; + + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + } + } + else { + DeviceRequestedFeatures requested_features; + if(!use_adaptive_compilation()) { + requested_features.max_closure = 64; + } + + CUDASplitKernel split_kernel(this); + split_kernel.load_kernels(requested_features); + + while(task->acquire_tile(this, tile)) { + device_memory void_buffer; + split_kernel.path_trace(task, tile, void_buffer, void_buffer); + + task->release_tile(tile); - for(int sample = start_sample; sample < end_sample; sample++) { if(task->get_cancel()) { if(task->need_finish_queue == false) break; } - - path_trace(tile, sample, branched); - - tile.sample = sample + 1; - - task->update_progress(&tile, tile.w*tile.h); } - - task->release_tile(tile); } } else if(task->type == DeviceTask::SHADER) { @@ -1332,8 +1393,186 @@ public: { task_pool.cancel(); } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; }; +/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class + * now that the definition of that class is complete + */ +#undef cuda_assert +#define cuda_assert(stmt) \ + { \ + CUresult result = stmt; \ + \ + if(result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if(device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } (void)0 + +/* split kernel */ + +class CUDASplitKernelFunction : public SplitKernelFunction{ + CUDADevice* device; + CUfunction func; +public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + device->cuda_push_context(); + + if(device->have_error()) + return false; + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); + + int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads; + int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks , yblocks, 1, /* blocks */ + xthreads, ythreads, 1, /* threads */ + 0, 0, args, 0)); + + device->cuda_pop_context(); + + return !device->have_error(); + } +}; + +CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& /*kernel_globals*/, + device_memory& /*kernel_data*/, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) +{ + device->cuda_push_context(); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state); + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr* split_data_buffer; + int* num_elements; + CUdeviceptr* ray_state; + CUdeviceptr* rng_state; + int* start_sample; + int* end_sample; + int* sx; + int* sy; + int* sw; + int* sh; + int* offset; + int* stride; + CUdeviceptr* queue_index; + int* queuesize; + CUdeviceptr* use_queues_flag; + CUdeviceptr* work_pool_wgs; + int* num_samples; + CUdeviceptr* buffer; + }; + + args_t args = { + &d_split_data, + &num_global_elements, + &d_ray_state, + &d_rng_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer + }; + + CUfunction data_init; + cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if(device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); + + device->cuda_pop_context(); + + return !device->have_error(); +} + +SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CUfunction func; + + device->cuda_push_context(); + + cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if(device->have_error()) { + device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + device->cuda_pop_context(); + + return new CUDASplitKernelFunction(device, func); +} + +int2 CUDASplitKernel::split_kernel_local_size() +{ + return make_int2(32, 1); +} + +int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/) +{ + /* TODO(mai): implement something here to detect ideal work size */ + return make_int2(256, 256); +} + bool device_cuda_init(void) { #ifdef WITH_CUDA_DYNLOAD diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index d844ba007aa..685955170b5 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -27,6 +27,7 @@ set(SRC kernels/opencl/kernel_next_iteration_setup.cl kernels/opencl/kernel_sum_all_radiance.cl kernels/cuda/kernel.cu + kernels/cuda/kernel_split.cu ) set(SRC_BVH_HEADERS @@ -89,6 +90,10 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu_image.h ) +set(SRC_KERNELS_CUDA_HEADERS + kernels/cuda/kernel_config.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -230,8 +235,9 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu + set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} @@ -240,15 +246,22 @@ if(WITH_CYCLES_CUDA_BINARIES) ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch experimental) - if(${experimental}) - set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__") - set(cuda_cubin kernel_experimental_${arch}.cubin) + macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) + if(${split}) + set(cuda_extra_flags "-D__SPLIT__") + set(cuda_cubin kernel_split) else() set(cuda_extra_flags "") - set(cuda_cubin kernel_${arch}.cubin) + set(cuda_cubin kernel) endif() + if(${experimental}) + set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) + set(cuda_cubin ${cuda_cubin}_experimental) + endif() + + set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") else() @@ -261,13 +274,19 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") + if(split) + set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") + else() + set(cuda_kernel_src "/kernels/cuda/kernel.cu") + endif() + add_custom_command( OUTPUT ${cuda_cubin} COMMAND ${cuda_nvcc_command} -arch=${arch} ${CUDA_NVCC_FLAGS} -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu + --cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} @@ -294,7 +313,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + + if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) + # Compile split kernel + CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + endif() endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) @@ -352,6 +376,7 @@ add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} @@ -386,7 +411,9 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..8fffe2a13c9 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -46,11 +46,58 @@ #define ccl_device_noinline __device__ __noinline__ #define ccl_global #define ccl_constant +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} + /* No assert supported for CUDA */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index a016e5293ca..a7faaef89ca 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -92,12 +92,14 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_CUDA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ -# define __VOLUME__ -# define __VOLUME_SCATTER__ -# define __SUBSURFACE__ -# define __CMJ__ -# define __SHADOW_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SUBSURFACE__ +# define __CMJ__ +# define __SHADOW_RECORD_ALL__ +# endif #endif /* __KERNEL_CUDA__ */ #ifdef __KERNEL_OPENCL__ diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 090ab2c50c2..52e541321e3 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,7 +16,10 @@ /* CUDA kernel entry points */ +#ifdef __CUDA_ARCH__ + #include "../../kernel_compat_cuda.h" +#include "kernel_config.h" #include "../../kernel_math.h" #include "../../kernel_types.h" #include "../../kernel_globals.h" @@ -25,104 +28,7 @@ #include "../../kernel_path_branched.h" #include "../../kernel_bake.h" -/* device data taken from CUDA occupancy calculator */ - -#ifdef __CUDA_ARCH__ - -/* 2.0 and 2.1 */ -#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 32 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 - -/* 3.0 and 3.5 */ -#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.0, 5.2, 5.3, 6.0, 6.1 */ -#elif __CUDA_ARCH__ >= 500 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 48 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - /* kernels */ - extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h new file mode 100644 index 00000000000..9fa39dc9ebb --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* device data taken from CUDA occupancy calculator */ + +/* 2.0 and 2.1 */ +#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 32 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* 3.0 and 3.5 */ +#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0, 5.2, 5.3, 6.0, 6.1 */ +#elif __CUDA_ARCH__ >= 500 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 48 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu new file mode 100644 index 00000000000..441cd96fafa --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -0,0 +1,118 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CUDA split kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#define __SPLIT_KERNEL__ + +#include "../../kernel_compat_cuda.h" +#include "kernel_config.h" + +#include "../../split/kernel_split_common.h" +#include "../../split/kernel_data_init.h" +#include "../../split/kernel_scene_intersect.h" +#include "../../split/kernel_lamp_emission.h" +#include "../../split/kernel_queue_enqueue.h" +#include "../../split/kernel_background_buffer_update.h" +#include "../../split/kernel_shader_eval.h" +#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "../../split/kernel_direct_lighting.h" +#include "../../split/kernel_shadow_blocked.h" +#include "../../split/kernel_next_iteration_setup.h" +#include "../../split/kernel_sum_all_radiance.h" + +#include "../../kernel_film.h" + +/* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace_data_init( + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer) +{ + kernel_data_init(NULL, + NULL, + split_data_buffer, + num_elements, + ray_state, + rng_state, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, + Queue_index, + queuesize, + use_queues_flag, + work_pool_wgs, + num_samples, + buffer); +} + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + kernel_##name(NULL); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) +DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance) + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +#endif + diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h index 18f062ef682..7e88b6f5168 100644 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -142,8 +142,15 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data, split_data->ray_state = ray_state; } -#define kernel_split_state (kg->split_data) -#define kernel_split_params (kg->split_param_data) +#ifndef __KERNEL_CUDA__ +# define kernel_split_state (kg->split_data) +# define kernel_split_params (kg->split_param_data) +#else +__device__ SplitData __split_data; +# define kernel_split_state (__split_data) +__device__ SplitParams __split_param_data; +# define kernel_split_params (__split_param_data) +#endif /* __KERNEL_CUDA__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 318248998aa..f12c5e28c80 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -60,7 +60,8 @@ void DebugFlags::CPU::reset() } DebugFlags::CUDA::CUDA() - : adaptive_compile(false) + : adaptive_compile(false), + split_kernel(false) { reset(); } @@ -69,6 +70,8 @@ void DebugFlags::CUDA::reset() { if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; + + split_kernel = false; } DebugFlags::OpenCL::OpenCL() diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 171e43ec32a..911c95de4ab 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -61,6 +61,9 @@ public: /* Whether adaptive feature based runtime compile is enabled or not. * Requires the CUDA Toolkit and only works on Linux atm. */ bool adaptive_compile; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of OpenCL feature-set to be used. */