Cycles: CUDA implementation of split kernel

Mai Lavelle 2017-02-14 05:50:29 -05:00
parent 0892352bfe
commit 817873cc83
13 changed files with 599 additions and 134 deletions

@ -668,6 +668,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_opencl_kernel_type = EnumProperty(
name="OpenCL Kernel Type",

@ -1523,6 +1523,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
col = layout.column()
col.label('CUDA Flags:')
col.prop(cscene, "debug_use_cuda_adaptive_compile")
col.prop(cscene, "debug_use_cuda_split_kernel")
col = layout.column()
col.label('OpenCL Flags:')

@ -70,6 +70,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
/* Synchronize OpenCL kernel type. */
switch(get_enum(cscene, "debug_opencl_kernel_type")) {
case 0:
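
For reference, the flag synced here is consumed on the device side through the same DebugFlags singleton; a minimal sketch of that read-back (the real accessor added to CUDADevice appears further down in this commit):

/* Sketch only: how a device-side caller would query the flag synced above. */
static bool cuda_split_kernel_requested()
{
	return DebugFlags().cuda.split_kernel;
}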

@ -22,6 +22,7 @@
#include "device.h"
#include "device_intern.h"
#include "device_split_kernel.h"
#include "buffers.h"
@ -43,6 +44,8 @@
#include "util_types.h"
#include "util_time.h"
#include "split/kernel_split_data.h"
CCL_NAMESPACE_BEGIN
#ifndef WITH_CUDA_DYNLOAD
@ -79,6 +82,29 @@ int cuewCompilerVersion(void)
} /* namespace */
#endif /* WITH_CUDA_DYNLOAD */
class CUDADevice;
class CUDASplitKernel : public DeviceSplitKernel {
CUDADevice *device;
public:
explicit CUDASplitKernel(CUDADevice *device);
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
device_memory& kernel_globals,
device_memory& kernel_data_,
device_memory& split_data,
device_memory& ray_state,
device_memory& queue_index,
device_memory& use_queues_flag,
device_memory& work_pool_wgs);
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
virtual int2 split_kernel_global_size(DeviceTask *task);
};
class CUDADevice : public Device
{
public:
@ -259,11 +285,16 @@ public:
return DebugFlags().cuda.adaptive_compile;
}
bool use_split_kernel()
{
return DebugFlags().cuda.split_kernel;
}
/* Common NVCC flags which stays the same regardless of shading model,
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
const DeviceRequestedFeatures& requested_features)
const DeviceRequestedFeatures& requested_features, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@ -288,6 +319,11 @@ public:
#ifdef WITH_CYCLES_DEBUG
cflags += " -D__KERNEL_DEBUG__";
#endif
if(split) {
cflags += " -D__SPLIT__";
}
return cflags;
}
@ -321,7 +357,7 @@ public:
return true;
}
string compile_kernel(const DeviceRequestedFeatures& requested_features)
string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
{
/* Compute cubin name. */
int major, minor;
@ -330,7 +366,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
: "lib/kernel_sm_%d%d.cubin",
major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
@ -340,7 +377,7 @@ public:
}
const string common_cflags =
compile_kernel_get_common_cflags(requested_features);
compile_kernel_get_common_cflags(requested_features, split);
/* Try to use locally compiled kernel. */
const string kernel_path = path_get("kernel");
@ -351,7 +388,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
: "cycles_kernel_sm%d%d_%s.cubin",
major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
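
To make the naming concrete, a small illustration; sm_52 is only an example architecture, and <md5> stands for the hash of the kernel sources combined with the compile flags:

const bool split = true;
const string prebuilt = string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
                                            : "lib/kernel_sm_%d%d.cubin", 5, 2);
/* -> "lib/kernel_split_sm_52.cubin" for the kernel shipped with Blender, while a
 * locally compiled split kernel is cached as "kernels/cycles_kernel_split_sm52_<md5>.cubin". */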
@ -386,7 +424,7 @@ public:
const char *nvcc = cuewCompilerPath();
const string kernel = path_join(kernel_path,
path_join("kernels",
path_join("cuda", "kernel.cu")));
path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@ -434,7 +472,7 @@ public:
return false;
/* get kernel */
string cubin = compile_kernel(requested_features);
string cubin = compile_kernel(requested_features, use_split_kernel());
if(cubin == "")
return false;
@ -1261,25 +1299,48 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
if(!use_split_kernel()) {
/* keep rendering tiles until done */
while(task->acquire_tile(this, tile)) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
path_trace(tile, sample, branched);
tile.sample = sample + 1;
task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
}
}
else {
DeviceRequestedFeatures requested_features;
if(!use_adaptive_compilation()) {
requested_features.max_closure = 64;
}
CUDASplitKernel split_kernel(this);
split_kernel.load_kernels(requested_features);
while(task->acquire_tile(this, tile)) {
device_memory void_buffer;
split_kernel.path_trace(task, tile, void_buffer, void_buffer);
task->release_tile(tile);
}
else if(task->type == DeviceTask::SHADER) {
@ -1332,8 +1393,186 @@ public:
{
task_pool.cancel();
}
friend class CUDASplitKernelFunction;
friend class CUDASplitKernel;
};
/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
* now that the definition of that class is complete
*/
#undef cuda_assert
#define cuda_assert(stmt) \
{ \
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
if(device->error_msg == "") \
device->error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
/*cuda_abort();*/ \
device->cuda_error_documentation(); \
} \
} (void)0
/* split kernel */
class CUDASplitKernelFunction : public SplitKernelFunction{
CUDADevice* device;
CUfunction func;
public:
CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
/* enqueue the kernel, returns false if there is an error */
bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
{
return enqueue(dim, NULL);
}
/* enqueue the kernel, returns false if there is an error */
bool enqueue(const KernelDimensions &dim, void *args[])
{
device->cuda_push_context();
if(device->have_error())
return false;
/* we ignore dim.local_size for now, as this is faster */
int threads_per_block;
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
int xthreads = (int)sqrt(threads_per_block);
int ythreads = (int)sqrt(threads_per_block);
int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuLaunchKernel(func,
xblocks , yblocks, 1, /* blocks */
xthreads, ythreads, 1, /* threads */
0, 0, args, 0));
device->cuda_pop_context();
return !device->have_error();
}
};
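
A worked example of the launch geometry computed in enqueue(), using an assumed (but typical) value for the queried attribute:

/* Illustrative numbers only. */
int threads_per_block = 1024;                 /* as reported by cuFuncGetAttribute() */
int xthreads = (int)sqrt(threads_per_block);  /* 32 */
int ythreads = (int)sqrt(threads_per_block);  /* 32 */
int xblocks = (256 + xthreads - 1)/xthreads;  /* ceil(256/32) = 8 */
int yblocks = (256 + ythreads - 1)/ythreads;  /* 8 */
/* i.e. a 256x256 global size launches an 8x8 grid of 32x32 thread blocks */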
CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
{
}
bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
device_memory& /*kernel_globals*/,
device_memory& /*kernel_data*/,
device_memory& split_data,
device_memory& ray_state,
device_memory& queue_index,
device_memory& use_queues_flag,
device_memory& work_pool_wgs)
{
device->cuda_push_context();
CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
int end_sample = rtile.start_sample + rtile.num_samples;
int queue_size = dim.global_size[0] * dim.global_size[1];
struct args_t {
CUdeviceptr* split_data_buffer;
int* num_elements;
CUdeviceptr* ray_state;
CUdeviceptr* rng_state;
int* start_sample;
int* end_sample;
int* sx;
int* sy;
int* sw;
int* sh;
int* offset;
int* stride;
CUdeviceptr* queue_index;
int* queuesize;
CUdeviceptr* use_queues_flag;
CUdeviceptr* work_pool_wgs;
int* num_samples;
CUdeviceptr* buffer;
};
args_t args = {
&d_split_data,
&num_global_elements,
&d_ray_state,
&d_rng_state,
&rtile.start_sample,
&end_sample,
&rtile.x,
&rtile.y,
&rtile.w,
&rtile.h,
&rtile.offset,
&rtile.stride,
&d_queue_index,
&queue_size,
&d_use_queues_flag,
&d_work_pool_wgs,
&rtile.num_samples,
&d_buffer
};
CUfunction data_init;
cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
if(device->have_error()) {
return false;
}
CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
device->cuda_pop_context();
return !device->have_error();
}
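
The args_t struct above leans on the CUDA driver API argument convention; a simplified sketch of that convention (not part of the patch):

/* cuLaunchKernel() takes an array of void*, one entry per kernel parameter, each
 * pointing at the host-side value of that argument. Since every member of args_t
 * is itself a pointer, the struct lays out like such an array, which is why
 * &args can be passed as (void**). Minimal equivalent for a two-argument kernel: */
CUdeviceptr d_buffer = 0;  /* device pointer argument */
int sample = 0;            /* plain int argument */
void *launch_args[] = { &d_buffer, &sample };
/* cuLaunchKernel(func, gx, gy, 1, bx, by, 1, 0, 0, launch_args, 0); */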
SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
CUfunction func;
device->cuda_push_context();
cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
if(device->have_error()) {
device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
return NULL;
}
device->cuda_pop_context();
return new CUDASplitKernelFunction(device, func);
}
int2 CUDASplitKernel::split_kernel_local_size()
{
return make_int2(32, 1);
}
int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
{
/* TODO(mai): implement something here to detect ideal work size */
return make_int2(256, 256);
}
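
These two sizes feed the generic DeviceSplitKernel driver, which (roughly speaking) allocates one path state per global work item and keeps re-enqueuing the stage kernels until every state has terminated. A quick arithmetic sketch of what the fixed 256x256 choice implies:

int2 global_size = make_int2(256, 256);  /* split_kernel_global_size() */
int2 local_size  = make_int2(32, 1);     /* split_kernel_local_size() */

int num_states = global_size.x * global_size.y;  /* 65536 resident path states */
int xgroups = global_size.x / local_size.x;      /* 8 work groups in x */
int ygroups = global_size.y / local_size.y;      /* 256 work groups in y */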
bool device_cuda_init(void)
{
#ifdef WITH_CUDA_DYNLOAD

@ -27,6 +27,7 @@ set(SRC
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_sum_all_radiance.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
)
set(SRC_BVH_HEADERS
@ -89,6 +90,10 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu_image.h
)
set(SRC_KERNELS_CUDA_HEADERS
kernels/cuda/kernel_config.h
)
set(SRC_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@ -230,8 +235,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
# build for each arch
set(cuda_sources kernels/cuda/kernel.cu
set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
${SRC_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
@ -240,15 +246,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
)
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
if(${experimental})
set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
set(cuda_cubin kernel_experimental_${arch}.cubin)
macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
if(${split})
set(cuda_extra_flags "-D__SPLIT__")
set(cuda_cubin kernel_split)
else()
set(cuda_extra_flags "")
set(cuda_cubin kernel_${arch}.cubin)
set(cuda_cubin kernel)
endif()
if(${experimental})
set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
set(cuda_cubin ${cuda_cubin}_experimental)
endif()
set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
else()
@ -261,13 +274,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
if(split)
set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
else()
set(cuda_kernel_src "/kernels/cuda/kernel.cu")
endif()
add_custom_command(
OUTPUT ${cuda_cubin}
COMMAND ${cuda_nvcc_command}
-arch=${arch}
${CUDA_NVCC_FLAGS}
-m${CUDA_BITS}
--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
--ptxas-options="-v"
${cuda_arch_flags}
@ -294,7 +313,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
endif()
endforeach()
add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@ -352,6 +376,7 @@ add_library(cycles_kernel
${SRC}
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_SVM_HEADERS}
@ -386,7 +411,9 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)

@ -46,11 +46,58 @@
#define ccl_device_noinline __device__ __noinline__
#define ccl_global
#define ccl_constant
#define ccl_local __shared__
#define ccl_local_param
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
ccl_device_inline uint ccl_local_id(uint d)
{
switch(d) {
case 0: return threadIdx.x;
case 1: return threadIdx.y;
case 2: return threadIdx.z;
default: return 0;
}
}
#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
ccl_device_inline uint ccl_local_size(uint d)
{
switch(d) {
case 0: return blockDim.x;
case 1: return blockDim.y;
case 2: return blockDim.z;
default: return 0;
}
}
#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
ccl_device_inline uint ccl_group_id(uint d)
{
switch(d) {
case 0: return blockIdx.x;
case 1: return blockIdx.y;
case 2: return blockIdx.z;
default: return 0;
}
}
ccl_device_inline uint ccl_num_groups(uint d)
{
switch(d) {
case 0: return gridDim.x;
case 1: return gridDim.y;
case 2: return gridDim.z;
default: return 0;
}
}
/* No assert supported for CUDA */
#define kernel_assert(cond)
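
The ccl_local_id/ccl_local_size/ccl_group_id/ccl_num_groups helpers added above map the OpenCL-style work-item queries that the split kernels are written against onto CUDA's built-in indices; ccl_global_id(0), for instance, expands to blockIdx.x * blockDim.x + threadIdx.x. A small sketch of the typical 2D-to-1D flattening the split kernels perform with them (illustrative, not from this patch):

ccl_device_inline uint example_ray_index()
{
	/* row-major flattening of the 2D launch into a single path-state index */
	return ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
}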

@ -92,12 +92,14 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_CUDA__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
# define __BRANCHED_PATH__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SUBSURFACE__
# define __CMJ__
# define __SHADOW_RECORD_ALL__
# ifndef __SPLIT_KERNEL__
# define __BRANCHED_PATH__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SUBSURFACE__
# define __CMJ__
# define __SHADOW_RECORD_ALL__
# endif
#endif /* __KERNEL_CUDA__ */
#ifdef __KERNEL_OPENCL__

@ -16,7 +16,10 @@
/* CUDA kernel entry points */
#ifdef __CUDA_ARCH__
#include "../../kernel_compat_cuda.h"
#include "kernel_config.h"
#include "../../kernel_math.h"
#include "../../kernel_types.h"
#include "../../kernel_globals.h"
@ -25,104 +28,7 @@
#include "../../kernel_path_branched.h"
#include "../../kernel_bake.h"
/* device data taken from CUDA occupancy calculator */
#ifdef __CUDA_ARCH__
/* 2.0 and 2.1 */
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 32
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
/* 3.0 and 3.5 */
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.2 */
#elif __CUDA_ARCH__ == 320
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.7 */
#elif __CUDA_ARCH__ == 370
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 5.0, 5.2, 5.3, 6.0, 6.1 */
#elif __CUDA_ARCH__ >= 500
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 48
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* unknown architecture */
#else
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif
/* compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread */
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
__launch_bounds__( \
threads_block_width*threads_block_width, \
CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
)
/* sanity checks */
#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
# error "Maximum number of threads per block exceeded"
#endif
#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
# error "Maximum number of blocks per multiprocessor exceeded"
#endif
#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif
#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)

@ -0,0 +1,110 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* device data taken from CUDA occupancy calculator */
/* 2.0 and 2.1 */
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 32
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
/* 3.0 and 3.5 */
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.2 */
#elif __CUDA_ARCH__ == 320
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.7 */
#elif __CUDA_ARCH__ == 370
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 5.0, 5.2, 5.3, 6.0, 6.1 */
#elif __CUDA_ARCH__ >= 500
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 48
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* unknown architecture */
#else
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif
/* compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread */
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
__launch_bounds__( \
threads_block_width*threads_block_width, \
CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
)
/* sanity checks */
#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
# error "Maximum number of threads per block exceeded"
#endif
#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
# error "Maximum number of blocks per multiprocessor exceeded"
#endif
#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif
#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif
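
As a worked expansion: for __CUDA_ARCH__ >= 500 the block width is 16 and the kernel register target is 48, so CUDA_LAUNCH_BOUNDS(16, 48) becomes __launch_bounds__(256, 65536/(256*48)), i.e. __launch_bounds__(256, 5): 256 threads per block with at least 5 resident blocks per multiprocessor. A hypothetical kernel using the macro would look like:

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_example(float *buffer)  /* placeholder entry point, illustration only */
{
	/* body elided */
}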

@ -0,0 +1,118 @@
/*
* Copyright 2011-2016 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CUDA split kernel entry points */
#ifdef __CUDA_ARCH__
#define __SPLIT_KERNEL__
#include "../../kernel_compat_cuda.h"
#include "kernel_config.h"
#include "../../split/kernel_split_common.h"
#include "../../split/kernel_data_init.h"
#include "../../split/kernel_scene_intersect.h"
#include "../../split/kernel_lamp_emission.h"
#include "../../split/kernel_queue_enqueue.h"
#include "../../split/kernel_background_buffer_update.h"
#include "../../split/kernel_shader_eval.h"
#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h"
#include "../../split/kernel_direct_lighting.h"
#include "../../split/kernel_shadow_blocked.h"
#include "../../split/kernel_next_iteration_setup.h"
#include "../../split/kernel_sum_all_radiance.h"
#include "../../kernel_film.h"
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace_data_init(
ccl_global void *split_data_buffer,
int num_elements,
ccl_global char *ray_state,
ccl_global uint *rng_state,
int start_sample,
int end_sample,
int sx, int sy, int sw, int sh, int offset, int stride,
ccl_global int *Queue_index,
int queuesize,
ccl_global char *use_queues_flag,
ccl_global unsigned int *work_pool_wgs,
unsigned int num_samples,
ccl_global float *buffer)
{
kernel_data_init(NULL,
NULL,
split_data_buffer,
num_elements,
ray_state,
rng_state,
start_sample,
end_sample,
sx, sy, sw, sh, offset, stride,
Queue_index,
queuesize,
use_queues_flag,
work_pool_wgs,
num_samples,
buffer);
}
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
extern "C" __global__ void \
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
kernel_cuda_##name() \
{ \
kernel_##name(NULL); \
}
DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
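
Leaving the launch-bounds attribute aside, DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) expands to a thin wrapper of this shape:

extern "C" __global__ void kernel_cuda_scene_intersect()
{
	/* the KernelGlobals pointer is NULL on CUDA, where globals live in fixed device memory */
	kernel_scene_intersect(NULL);
}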
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
if(x < sx + sw && y < sy + sh)
kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
if(x < sx + sw && y < sy + sh)
kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}
#endif
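
The two film-convert entry points are repeated here so that a module built from kernel_split.cu still exposes them for display. A simplified host-side sketch of how such an entry point is fetched (mirroring, not quoting, the existing display path in device_cuda.cpp):

CUfunction film_convert;
cuda_assert(cuModuleGetFunction(&film_convert, cuModule, "kernel_cuda_convert_to_byte"));
/* ...then launched over the tile with cuLaunchKernel() */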

@ -142,8 +142,15 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
split_data->ray_state = ray_state;
}
#define kernel_split_state (kg->split_data)
#define kernel_split_params (kg->split_param_data)
#ifndef __KERNEL_CUDA__
# define kernel_split_state (kg->split_data)
# define kernel_split_params (kg->split_param_data)
#else
__device__ SplitData __split_data;
# define kernel_split_state (__split_data)
__device__ SplitParams __split_param_data;
# define kernel_split_params (__split_param_data)
#endif /* __KERNEL_CUDA__ */
CCL_NAMESPACE_END
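
On CUDA the split state cannot hang off the KernelGlobals pointer (the kernels are launched with kg == NULL), so a single __device__ SplitData/SplitParams pair lives in device global memory, and kernel_data_init() points its members into the split_data_buffer allocation passed in from the host. A hypothetical stage-kernel access, just to show the macro in use:

ccl_device void example_stage(int ray_index)  /* illustrative only */
{
	/* per-path state is addressed through the device-global struct via the macro above */
	if(kernel_split_state.ray_state[ray_index] == RAY_ACTIVE) {
		/* ... */
	}
}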

@ -60,7 +60,8 @@ void DebugFlags::CPU::reset()
}
DebugFlags::CUDA::CUDA()
: adaptive_compile(false)
: adaptive_compile(false),
split_kernel(false)
{
reset();
}
@ -69,6 +70,8 @@ void DebugFlags::CUDA::reset()
{
if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
split_kernel = false;
}
DebugFlags::OpenCL::OpenCL()

@ -61,6 +61,9 @@ public:
/* Whether adaptive feature based runtime compile is enabled or not.
* Requires the CUDA Toolkit and only works on Linux atm. */
bool adaptive_compile;
/* Whether split kernel is used */
bool split_kernel;
};
/* Descriptor of OpenCL feature-set to be used. */