forked from bartvdbraak/blender
Cycles: CUDA implementation of split kernel
This commit is contained in:
parent
0892352bfe
commit
817873cc83
@ -668,6 +668,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
|
||||
cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
|
||||
|
||||
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
|
||||
cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
|
||||
|
||||
cls.debug_opencl_kernel_type = EnumProperty(
|
||||
name="OpenCL Kernel Type",
|
||||
|
@ -1523,6 +1523,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
|
||||
col = layout.column()
|
||||
col.label('CUDA Flags:')
|
||||
col.prop(cscene, "debug_use_cuda_adaptive_compile")
|
||||
col.prop(cscene, "debug_use_cuda_split_kernel")
|
||||
|
||||
col = layout.column()
|
||||
col.label('OpenCL Flags:')
|
||||
|
@ -70,6 +70,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
|
||||
flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
|
||||
/* Synchronize CUDA flags. */
|
||||
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
|
||||
flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
|
||||
/* Synchronize OpenCL kernel type. */
|
||||
switch(get_enum(cscene, "debug_opencl_kernel_type")) {
|
||||
case 0:
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include "device.h"
|
||||
#include "device_intern.h"
|
||||
#include "device_split_kernel.h"
|
||||
|
||||
#include "buffers.h"
|
||||
|
||||
@ -43,6 +44,8 @@
|
||||
#include "util_types.h"
|
||||
#include "util_time.h"
|
||||
|
||||
#include "split/kernel_split_data.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef WITH_CUDA_DYNLOAD
|
||||
@ -79,6 +82,29 @@ int cuewCompilerVersion(void)
|
||||
} /* namespace */
|
||||
#endif /* WITH_CUDA_DYNLOAD */
|
||||
|
||||
class CUDADevice;
|
||||
|
||||
class CUDASplitKernel : public DeviceSplitKernel {
|
||||
CUDADevice *device;
|
||||
public:
|
||||
explicit CUDASplitKernel(CUDADevice *device);
|
||||
|
||||
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
|
||||
RenderTile& rtile,
|
||||
int num_global_elements,
|
||||
device_memory& kernel_globals,
|
||||
device_memory& kernel_data_,
|
||||
device_memory& split_data,
|
||||
device_memory& ray_state,
|
||||
device_memory& queue_index,
|
||||
device_memory& use_queues_flag,
|
||||
device_memory& work_pool_wgs);
|
||||
|
||||
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
|
||||
virtual int2 split_kernel_local_size();
|
||||
virtual int2 split_kernel_global_size(DeviceTask *task);
|
||||
};
|
||||
|
||||
class CUDADevice : public Device
|
||||
{
|
||||
public:
|
||||
@ -259,11 +285,16 @@ public:
|
||||
return DebugFlags().cuda.adaptive_compile;
|
||||
}
|
||||
|
||||
bool use_split_kernel()
|
||||
{
|
||||
return DebugFlags().cuda.split_kernel;
|
||||
}
|
||||
|
||||
/* Common NVCC flags which stays the same regardless of shading model,
|
||||
* kernel sources md5 and only depends on compiler or compilation settings.
|
||||
*/
|
||||
string compile_kernel_get_common_cflags(
|
||||
const DeviceRequestedFeatures& requested_features)
|
||||
const DeviceRequestedFeatures& requested_features, bool split=false)
|
||||
{
|
||||
const int cuda_version = cuewCompilerVersion();
|
||||
const int machine = system_cpu_bits();
|
||||
@ -288,6 +319,11 @@ public:
|
||||
#ifdef WITH_CYCLES_DEBUG
|
||||
cflags += " -D__KERNEL_DEBUG__";
|
||||
#endif
|
||||
|
||||
if(split) {
|
||||
cflags += " -D__SPLIT__";
|
||||
}
|
||||
|
||||
return cflags;
|
||||
}
|
||||
|
||||
@ -321,7 +357,7 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
string compile_kernel(const DeviceRequestedFeatures& requested_features)
|
||||
string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
|
||||
{
|
||||
/* Compute cubin name. */
|
||||
int major, minor;
|
||||
@ -330,7 +366,8 @@ public:
|
||||
|
||||
/* Attempt to use kernel provided with Blender. */
|
||||
if(!use_adaptive_compilation()) {
|
||||
const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
|
||||
const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
|
||||
: "lib/kernel_sm_%d%d.cubin",
|
||||
major, minor));
|
||||
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
|
||||
if(path_exists(cubin)) {
|
||||
@ -340,7 +377,7 @@ public:
|
||||
}
|
||||
|
||||
const string common_cflags =
|
||||
compile_kernel_get_common_cflags(requested_features);
|
||||
compile_kernel_get_common_cflags(requested_features, split);
|
||||
|
||||
/* Try to use locally compiled kernel. */
|
||||
const string kernel_path = path_get("kernel");
|
||||
@ -351,7 +388,8 @@ public:
|
||||
*/
|
||||
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
|
||||
|
||||
const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
|
||||
const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
|
||||
: "cycles_kernel_sm%d%d_%s.cubin",
|
||||
major, minor,
|
||||
cubin_md5.c_str());
|
||||
const string cubin = path_cache_get(path_join("kernels", cubin_file));
|
||||
@ -386,7 +424,7 @@ public:
|
||||
const char *nvcc = cuewCompilerPath();
|
||||
const string kernel = path_join(kernel_path,
|
||||
path_join("kernels",
|
||||
path_join("cuda", "kernel.cu")));
|
||||
path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
|
||||
double starttime = time_dt();
|
||||
printf("Compiling CUDA kernel ...\n");
|
||||
|
||||
@ -434,7 +472,7 @@ public:
|
||||
return false;
|
||||
|
||||
/* get kernel */
|
||||
string cubin = compile_kernel(requested_features);
|
||||
string cubin = compile_kernel(requested_features, use_split_kernel());
|
||||
|
||||
if(cubin == "")
|
||||
return false;
|
||||
@ -1261,6 +1299,7 @@ public:
|
||||
/* Upload Bindless Mapping */
|
||||
load_bindless_mapping();
|
||||
|
||||
if(!use_split_kernel()) {
|
||||
/* keep rendering tiles until done */
|
||||
while(task->acquire_tile(this, tile)) {
|
||||
int start_sample = tile.start_sample;
|
||||
@ -1282,6 +1321,28 @@ public:
|
||||
task->release_tile(tile);
|
||||
}
|
||||
}
|
||||
else {
|
||||
DeviceRequestedFeatures requested_features;
|
||||
if(!use_adaptive_compilation()) {
|
||||
requested_features.max_closure = 64;
|
||||
}
|
||||
|
||||
CUDASplitKernel split_kernel(this);
|
||||
split_kernel.load_kernels(requested_features);
|
||||
|
||||
while(task->acquire_tile(this, tile)) {
|
||||
device_memory void_buffer;
|
||||
split_kernel.path_trace(task, tile, void_buffer, void_buffer);
|
||||
|
||||
task->release_tile(tile);
|
||||
|
||||
if(task->get_cancel()) {
|
||||
if(task->need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(task->type == DeviceTask::SHADER) {
|
||||
/* Upload Bindless Mapping */
|
||||
load_bindless_mapping();
|
||||
@ -1332,8 +1393,186 @@ public:
|
||||
{
|
||||
task_pool.cancel();
|
||||
}
|
||||
|
||||
friend class CUDASplitKernelFunction;
|
||||
friend class CUDASplitKernel;
|
||||
};
|
||||
|
||||
/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
|
||||
* now that the definition of that class is complete
|
||||
*/
|
||||
#undef cuda_assert
|
||||
#define cuda_assert(stmt) \
|
||||
{ \
|
||||
CUresult result = stmt; \
|
||||
\
|
||||
if(result != CUDA_SUCCESS) { \
|
||||
string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
|
||||
if(device->error_msg == "") \
|
||||
device->error_msg = message; \
|
||||
fprintf(stderr, "%s\n", message.c_str()); \
|
||||
/*cuda_abort();*/ \
|
||||
device->cuda_error_documentation(); \
|
||||
} \
|
||||
} (void)0
|
||||
|
||||
/* split kernel */
|
||||
|
||||
class CUDASplitKernelFunction : public SplitKernelFunction{
|
||||
CUDADevice* device;
|
||||
CUfunction func;
|
||||
public:
|
||||
CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
|
||||
|
||||
/* enqueue the kernel, returns false if there is an error */
|
||||
bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
|
||||
{
|
||||
return enqueue(dim, NULL);
|
||||
}
|
||||
|
||||
/* enqueue the kernel, returns false if there is an error */
|
||||
bool enqueue(const KernelDimensions &dim, void *args[])
|
||||
{
|
||||
device->cuda_push_context();
|
||||
|
||||
if(device->have_error())
|
||||
return false;
|
||||
|
||||
/* we ignore dim.local_size for now, as this is faster */
|
||||
int threads_per_block;
|
||||
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
|
||||
|
||||
int xthreads = (int)sqrt(threads_per_block);
|
||||
int ythreads = (int)sqrt(threads_per_block);
|
||||
|
||||
int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
|
||||
int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
cuda_assert(cuLaunchKernel(func,
|
||||
xblocks , yblocks, 1, /* blocks */
|
||||
xthreads, ythreads, 1, /* threads */
|
||||
0, 0, args, 0));
|
||||
|
||||
device->cuda_pop_context();
|
||||
|
||||
return !device->have_error();
|
||||
}
|
||||
};
|
||||
|
||||
CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
|
||||
{
|
||||
}
|
||||
|
||||
bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
|
||||
RenderTile& rtile,
|
||||
int num_global_elements,
|
||||
device_memory& /*kernel_globals*/,
|
||||
device_memory& /*kernel_data*/,
|
||||
device_memory& split_data,
|
||||
device_memory& ray_state,
|
||||
device_memory& queue_index,
|
||||
device_memory& use_queues_flag,
|
||||
device_memory& work_pool_wgs)
|
||||
{
|
||||
device->cuda_push_context();
|
||||
|
||||
CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
|
||||
CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
|
||||
CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
|
||||
CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
|
||||
CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
|
||||
|
||||
CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
|
||||
CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
|
||||
|
||||
int end_sample = rtile.start_sample + rtile.num_samples;
|
||||
int queue_size = dim.global_size[0] * dim.global_size[1];
|
||||
|
||||
struct args_t {
|
||||
CUdeviceptr* split_data_buffer;
|
||||
int* num_elements;
|
||||
CUdeviceptr* ray_state;
|
||||
CUdeviceptr* rng_state;
|
||||
int* start_sample;
|
||||
int* end_sample;
|
||||
int* sx;
|
||||
int* sy;
|
||||
int* sw;
|
||||
int* sh;
|
||||
int* offset;
|
||||
int* stride;
|
||||
CUdeviceptr* queue_index;
|
||||
int* queuesize;
|
||||
CUdeviceptr* use_queues_flag;
|
||||
CUdeviceptr* work_pool_wgs;
|
||||
int* num_samples;
|
||||
CUdeviceptr* buffer;
|
||||
};
|
||||
|
||||
args_t args = {
|
||||
&d_split_data,
|
||||
&num_global_elements,
|
||||
&d_ray_state,
|
||||
&d_rng_state,
|
||||
&rtile.start_sample,
|
||||
&end_sample,
|
||||
&rtile.x,
|
||||
&rtile.y,
|
||||
&rtile.w,
|
||||
&rtile.h,
|
||||
&rtile.offset,
|
||||
&rtile.stride,
|
||||
&d_queue_index,
|
||||
&queue_size,
|
||||
&d_use_queues_flag,
|
||||
&d_work_pool_wgs,
|
||||
&rtile.num_samples,
|
||||
&d_buffer
|
||||
};
|
||||
|
||||
CUfunction data_init;
|
||||
cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
|
||||
if(device->have_error()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
|
||||
|
||||
device->cuda_pop_context();
|
||||
|
||||
return !device->have_error();
|
||||
}
|
||||
|
||||
SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
|
||||
{
|
||||
CUfunction func;
|
||||
|
||||
device->cuda_push_context();
|
||||
|
||||
cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
|
||||
if(device->have_error()) {
|
||||
device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
device->cuda_pop_context();
|
||||
|
||||
return new CUDASplitKernelFunction(device, func);
|
||||
}
|
||||
|
||||
int2 CUDASplitKernel::split_kernel_local_size()
|
||||
{
|
||||
return make_int2(32, 1);
|
||||
}
|
||||
|
||||
int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
|
||||
{
|
||||
/* TODO(mai): implement something here to detect ideal work size */
|
||||
return make_int2(256, 256);
|
||||
}
|
||||
|
||||
bool device_cuda_init(void)
|
||||
{
|
||||
#ifdef WITH_CUDA_DYNLOAD
|
||||
|
@ -27,6 +27,7 @@ set(SRC
|
||||
kernels/opencl/kernel_next_iteration_setup.cl
|
||||
kernels/opencl/kernel_sum_all_radiance.cl
|
||||
kernels/cuda/kernel.cu
|
||||
kernels/cuda/kernel_split.cu
|
||||
)
|
||||
|
||||
set(SRC_BVH_HEADERS
|
||||
@ -89,6 +90,10 @@ set(SRC_KERNELS_CPU_HEADERS
|
||||
kernels/cpu/kernel_cpu_image.h
|
||||
)
|
||||
|
||||
set(SRC_KERNELS_CUDA_HEADERS
|
||||
kernels/cuda/kernel_config.h
|
||||
)
|
||||
|
||||
set(SRC_CLOSURE_HEADERS
|
||||
closure/alloc.h
|
||||
closure/bsdf.h
|
||||
@ -230,8 +235,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
endif()
|
||||
|
||||
# build for each arch
|
||||
set(cuda_sources kernels/cuda/kernel.cu
|
||||
set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
|
||||
${SRC_HEADERS}
|
||||
${SRC_KERNELS_CUDA_HEADERS}
|
||||
${SRC_BVH_HEADERS}
|
||||
${SRC_SVM_HEADERS}
|
||||
${SRC_GEOM_HEADERS}
|
||||
@ -240,15 +246,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
)
|
||||
set(cuda_cubins)
|
||||
|
||||
macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
|
||||
if(${experimental})
|
||||
set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
|
||||
set(cuda_cubin kernel_experimental_${arch}.cubin)
|
||||
macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
|
||||
if(${split})
|
||||
set(cuda_extra_flags "-D__SPLIT__")
|
||||
set(cuda_cubin kernel_split)
|
||||
else()
|
||||
set(cuda_extra_flags "")
|
||||
set(cuda_cubin kernel_${arch}.cubin)
|
||||
set(cuda_cubin kernel)
|
||||
endif()
|
||||
|
||||
if(${experimental})
|
||||
set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
|
||||
set(cuda_cubin ${cuda_cubin}_experimental)
|
||||
endif()
|
||||
|
||||
set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
|
||||
|
||||
if(WITH_CYCLES_DEBUG)
|
||||
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
|
||||
else()
|
||||
@ -261,13 +274,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
|
||||
set(cuda_math_flags "--use_fast_math")
|
||||
|
||||
if(split)
|
||||
set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
|
||||
else()
|
||||
set(cuda_kernel_src "/kernels/cuda/kernel.cu")
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${cuda_cubin}
|
||||
COMMAND ${cuda_nvcc_command}
|
||||
-arch=${arch}
|
||||
${CUDA_NVCC_FLAGS}
|
||||
-m${CUDA_BITS}
|
||||
--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
|
||||
--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
|
||||
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
|
||||
--ptxas-options="-v"
|
||||
${cuda_arch_flags}
|
||||
@ -294,7 +313,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
|
||||
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
|
||||
# Compile regular kernel
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
|
||||
|
||||
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
|
||||
# Compile split kernel
|
||||
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
|
||||
@ -352,6 +376,7 @@ add_library(cycles_kernel
|
||||
${SRC}
|
||||
${SRC_HEADERS}
|
||||
${SRC_KERNELS_CPU_HEADERS}
|
||||
${SRC_KERNELS_CUDA_HEADERS}
|
||||
${SRC_BVH_HEADERS}
|
||||
${SRC_CLOSURE_HEADERS}
|
||||
${SRC_SVM_HEADERS}
|
||||
@ -386,7 +411,9 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
|
||||
|
@ -46,11 +46,58 @@
|
||||
#define ccl_device_noinline __device__ __noinline__
|
||||
#define ccl_global
|
||||
#define ccl_constant
|
||||
#define ccl_local __shared__
|
||||
#define ccl_local_param
|
||||
#define ccl_private
|
||||
#define ccl_may_alias
|
||||
#define ccl_addr_space
|
||||
#define ccl_restrict __restrict__
|
||||
#define ccl_align(n) __align__(n)
|
||||
|
||||
ccl_device_inline uint ccl_local_id(uint d)
|
||||
{
|
||||
switch(d) {
|
||||
case 0: return threadIdx.x;
|
||||
case 1: return threadIdx.y;
|
||||
case 2: return threadIdx.z;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
|
||||
|
||||
ccl_device_inline uint ccl_local_size(uint d)
|
||||
{
|
||||
switch(d) {
|
||||
case 0: return blockDim.x;
|
||||
case 1: return blockDim.y;
|
||||
case 2: return blockDim.z;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
|
||||
|
||||
ccl_device_inline uint ccl_group_id(uint d)
|
||||
{
|
||||
switch(d) {
|
||||
case 0: return blockIdx.x;
|
||||
case 1: return blockIdx.y;
|
||||
case 2: return blockIdx.z;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline uint ccl_num_groups(uint d)
|
||||
{
|
||||
switch(d) {
|
||||
case 0: return gridDim.x;
|
||||
case 1: return gridDim.y;
|
||||
case 2: return gridDim.z;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* No assert supported for CUDA */
|
||||
|
||||
#define kernel_assert(cond)
|
||||
|
@ -92,12 +92,14 @@ CCL_NAMESPACE_BEGIN
|
||||
#ifdef __KERNEL_CUDA__
|
||||
# define __KERNEL_SHADING__
|
||||
# define __KERNEL_ADV_SHADING__
|
||||
# ifndef __SPLIT_KERNEL__
|
||||
# define __BRANCHED_PATH__
|
||||
# define __VOLUME__
|
||||
# define __VOLUME_SCATTER__
|
||||
# define __SUBSURFACE__
|
||||
# define __CMJ__
|
||||
# define __SHADOW_RECORD_ALL__
|
||||
# endif
|
||||
#endif /* __KERNEL_CUDA__ */
|
||||
|
||||
#ifdef __KERNEL_OPENCL__
|
||||
|
@ -16,7 +16,10 @@
|
||||
|
||||
/* CUDA kernel entry points */
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
#include "../../kernel_compat_cuda.h"
|
||||
#include "kernel_config.h"
|
||||
#include "../../kernel_math.h"
|
||||
#include "../../kernel_types.h"
|
||||
#include "../../kernel_globals.h"
|
||||
@ -25,104 +28,7 @@
|
||||
#include "../../kernel_path_branched.h"
|
||||
#include "../../kernel_bake.h"
|
||||
|
||||
/* device data taken from CUDA occupancy calculator */
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
/* 2.0 and 2.1 */
|
||||
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 63
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 32
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
|
||||
|
||||
/* 3.0 and 3.5 */
|
||||
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 63
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 63
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* 3.2 */
|
||||
#elif __CUDA_ARCH__ == 320
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 63
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 63
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* 3.7 */
|
||||
#elif __CUDA_ARCH__ == 370
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 255
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 63
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* 5.0, 5.2, 5.3, 6.0, 6.1 */
|
||||
#elif __CUDA_ARCH__ >= 500
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 255
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 48
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* unknown architecture */
|
||||
#else
|
||||
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
|
||||
#endif
|
||||
|
||||
/* compute number of threads per block and minimum blocks per multiprocessor
|
||||
* given the maximum number of registers per thread */
|
||||
|
||||
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
|
||||
__launch_bounds__( \
|
||||
threads_block_width*threads_block_width, \
|
||||
CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
|
||||
)
|
||||
|
||||
/* sanity checks */
|
||||
|
||||
#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
|
||||
# error "Maximum number of threads per block exceeded"
|
||||
#endif
|
||||
|
||||
#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
|
||||
# error "Maximum number of blocks per multiprocessor exceeded"
|
||||
#endif
|
||||
|
||||
#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
|
||||
# error "Maximum number of registers per thread exceeded"
|
||||
#endif
|
||||
|
||||
#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
|
||||
# error "Maximum number of registers per thread exceeded"
|
||||
#endif
|
||||
|
||||
/* kernels */
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
|
||||
|
110
intern/cycles/kernel/kernels/cuda/kernel_config.h
Normal file
110
intern/cycles/kernel/kernels/cuda/kernel_config.h
Normal file
@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* device data taken from CUDA occupancy calculator */
|
||||
|
||||
/* 2.0 and 2.1 */
|
||||
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 63
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 32
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
|
||||
|
||||
/* 3.0 and 3.5 */
|
||||
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 63
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 63
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* 3.2 */
|
||||
#elif __CUDA_ARCH__ == 320
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 63
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 63
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* 3.7 */
|
||||
#elif __CUDA_ARCH__ == 370
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 255
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 63
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* 5.0, 5.2, 5.3, 6.0, 6.1 */
|
||||
#elif __CUDA_ARCH__ >= 500
|
||||
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
|
||||
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
|
||||
# define CUDA_BLOCK_MAX_THREADS 1024
|
||||
# define CUDA_THREAD_MAX_REGISTERS 255
|
||||
|
||||
/* tunable parameters */
|
||||
# define CUDA_THREADS_BLOCK_WIDTH 16
|
||||
# define CUDA_KERNEL_MAX_REGISTERS 48
|
||||
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
|
||||
|
||||
/* unknown architecture */
|
||||
#else
|
||||
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
|
||||
#endif
|
||||
|
||||
/* compute number of threads per block and minimum blocks per multiprocessor
|
||||
* given the maximum number of registers per thread */
|
||||
|
||||
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
|
||||
__launch_bounds__( \
|
||||
threads_block_width*threads_block_width, \
|
||||
CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
|
||||
)
|
||||
|
||||
/* sanity checks */
|
||||
|
||||
#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
|
||||
# error "Maximum number of threads per block exceeded"
|
||||
#endif
|
||||
|
||||
#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
|
||||
# error "Maximum number of blocks per multiprocessor exceeded"
|
||||
#endif
|
||||
|
||||
#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
|
||||
# error "Maximum number of registers per thread exceeded"
|
||||
#endif
|
||||
|
||||
#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
|
||||
# error "Maximum number of registers per thread exceeded"
|
||||
#endif
|
||||
|
118
intern/cycles/kernel/kernels/cuda/kernel_split.cu
Normal file
118
intern/cycles/kernel/kernels/cuda/kernel_split.cu
Normal file
@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Copyright 2011-2016 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* CUDA split kernel entry points */
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "../../kernel_compat_cuda.h"
|
||||
#include "kernel_config.h"
|
||||
|
||||
#include "../../split/kernel_split_common.h"
|
||||
#include "../../split/kernel_data_init.h"
|
||||
#include "../../split/kernel_scene_intersect.h"
|
||||
#include "../../split/kernel_lamp_emission.h"
|
||||
#include "../../split/kernel_queue_enqueue.h"
|
||||
#include "../../split/kernel_background_buffer_update.h"
|
||||
#include "../../split/kernel_shader_eval.h"
|
||||
#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h"
|
||||
#include "../../split/kernel_direct_lighting.h"
|
||||
#include "../../split/kernel_shadow_blocked.h"
|
||||
#include "../../split/kernel_next_iteration_setup.h"
|
||||
#include "../../split/kernel_sum_all_radiance.h"
|
||||
|
||||
#include "../../kernel_film.h"
|
||||
|
||||
/* kernels */
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_path_trace_data_init(
|
||||
ccl_global void *split_data_buffer,
|
||||
int num_elements,
|
||||
ccl_global char *ray_state,
|
||||
ccl_global uint *rng_state,
|
||||
int start_sample,
|
||||
int end_sample,
|
||||
int sx, int sy, int sw, int sh, int offset, int stride,
|
||||
ccl_global int *Queue_index,
|
||||
int queuesize,
|
||||
ccl_global char *use_queues_flag,
|
||||
ccl_global unsigned int *work_pool_wgs,
|
||||
unsigned int num_samples,
|
||||
ccl_global float *buffer)
|
||||
{
|
||||
kernel_data_init(NULL,
|
||||
NULL,
|
||||
split_data_buffer,
|
||||
num_elements,
|
||||
ray_state,
|
||||
rng_state,
|
||||
start_sample,
|
||||
end_sample,
|
||||
sx, sy, sw, sh, offset, stride,
|
||||
Queue_index,
|
||||
queuesize,
|
||||
use_queues_flag,
|
||||
work_pool_wgs,
|
||||
num_samples,
|
||||
buffer);
|
||||
}
|
||||
|
||||
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||
extern "C" __global__ void \
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
|
||||
kernel_cuda_##name() \
|
||||
{ \
|
||||
kernel_##name(NULL); \
|
||||
}
|
||||
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
|
||||
{
|
||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||
|
||||
if(x < sx + sw && y < sy + sh)
|
||||
kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
|
||||
{
|
||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||
|
||||
if(x < sx + sw && y < sy + sh)
|
||||
kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -142,8 +142,15 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
|
||||
split_data->ray_state = ray_state;
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_CUDA__
|
||||
# define kernel_split_state (kg->split_data)
|
||||
# define kernel_split_params (kg->split_param_data)
|
||||
#else
|
||||
__device__ SplitData __split_data;
|
||||
# define kernel_split_state (__split_data)
|
||||
__device__ SplitParams __split_param_data;
|
||||
# define kernel_split_params (__split_param_data)
|
||||
#endif /* __KERNEL_CUDA__ */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
|
@ -60,7 +60,8 @@ void DebugFlags::CPU::reset()
|
||||
}
|
||||
|
||||
DebugFlags::CUDA::CUDA()
|
||||
: adaptive_compile(false)
|
||||
: adaptive_compile(false),
|
||||
split_kernel(false)
|
||||
{
|
||||
reset();
|
||||
}
|
||||
@ -69,6 +70,8 @@ void DebugFlags::CUDA::reset()
|
||||
{
|
||||
if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
|
||||
adaptive_compile = true;
|
||||
|
||||
split_kernel = false;
|
||||
}
|
||||
|
||||
DebugFlags::OpenCL::OpenCL()
|
||||
|
@ -61,6 +61,9 @@ public:
|
||||
/* Whether adaptive feature based runtime compile is enabled or not.
|
||||
* Requires the CUDA Toolkit and only works on Linux atm. */
|
||||
bool adaptive_compile;
|
||||
|
||||
/* Whether split kernel is used */
|
||||
bool split_kernel;
|
||||
};
|
||||
|
||||
/* Descriptor of OpenCL feature-set to be used. */
|
||||
|
Loading…
Reference in New Issue
Block a user