Cycles: CPU implementation of split kernel

Mai Lavelle 2017-02-14 06:20:48 -05:00
parent 352ee7c3ef
commit 0892352bfe
25 changed files with 776 additions and 21 deletions

@ -665,6 +665,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)

@ -1518,6 +1518,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_use_qbvh")
col.prop(cscene, "debug_use_cpu_split_kernel")
col = layout.column()
col.label('CUDA Flags:')

@ -67,6 +67,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
/* Synchronize OpenCL kernel type. */

@ -26,10 +26,12 @@
#include "device.h"
#include "device_intern.h"
#include "device_split_kernel.h"
#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
#include "split/kernel_split_data.h"
#include "kernel_globals.h"
#include "osl_shader.h"
@ -41,6 +43,7 @@
#include "util_foreach.h"
#include "util_function.h"
#include "util_logging.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
@ -48,8 +51,92 @@
CCL_NAMESPACE_BEGIN
class CPUDevice;
class CPUSplitKernel : public DeviceSplitKernel {
CPUDevice *device;
public:
explicit CPUSplitKernel(CPUDevice *device);
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
device_memory& kernel_globals,
device_memory& kernel_data_,
device_memory& split_data,
device_memory& ray_state,
device_memory& queue_index,
device_memory& use_queues_flag,
device_memory& work_pool_wgs);
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
virtual int2 split_kernel_global_size(DeviceTask *task);
};
class CPUDevice : public Device
{
static unordered_map<string, void*> kernel_functions;
static void register_kernel_function(const char* name, void* func)
{
kernel_functions[name] = func;
}
static const char* get_arch_name()
{
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
return "cpu_avx2";
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
return "cpu_avx";
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
return "cpu_sse41";
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
return "cpu_sse3";
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
return "cpu_sse2";
}
else
#endif
{
return "cpu";
}
}
template<typename F>
static F get_kernel_function(string name)
{
name = string("kernel_") + get_arch_name() + "_" + name;
unordered_map<string, void*>::iterator it = kernel_functions.find(name);
if(it == kernel_functions.end()) {
assert(!"kernel function not found");
return NULL;
}
return (F)it->second;
}
friend class CPUSplitKernel;
public:
TaskPool task_pool;
KernelGlobals kernel_globals;
@ -57,10 +144,15 @@ public:
#ifdef WITH_OSL
OSLGlobals osl_globals;
#endif
bool use_split_kernel;
DeviceRequestedFeatures requested_features;
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
: Device(info, stats, background)
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
#endif
@ -105,6 +197,28 @@ public:
{
VLOG(1) << "Will be using regular kernels.";
}
use_split_kernel = DebugFlags().cpu.split_kernel;
if(use_split_kernel) {
VLOG(1) << "Will be using split kernel.";
}
kernel_cpu_register_functions(register_kernel_function);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
kernel_cpu_sse2_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
kernel_cpu_sse3_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
kernel_cpu_sse41_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
kernel_cpu_avx_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
kernel_cpu_avx2_register_functions(register_kernel_function);
#endif
}
~CPUDevice()
@ -205,8 +319,14 @@ public:
void thread_run(DeviceTask *task)
{
if(task->type == DeviceTask::PATH_TRACE)
thread_path_trace(*task);
if(task->type == DeviceTask::PATH_TRACE) {
if(!use_split_kernel) {
thread_path_trace(*task);
}
else {
thread_path_trace_split(*task);
}
}
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
else if(task->type == DeviceTask::SHADER)
@ -267,7 +387,7 @@ public:
{
path_trace_kernel = kernel_cpu_path_trace;
}
while(task.acquire_tile(this, tile)) {
float *render_buffer = (float*)tile.buffer;
uint *rng_state = (uint*)tile.rng_state;
@ -303,6 +423,49 @@ public:
thread_kernel_globals_free(&kg);
}
void thread_path_trace_split(DeviceTask& task)
{
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
return;
}
RenderTile tile;
CPUSplitKernel split_kernel(this);
/* allocate buffer for kernel globals */
device_memory kgbuffer;
kgbuffer.resize(sizeof(KernelGlobals));
mem_alloc(kgbuffer, MEM_READ_WRITE);
KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
*kg = thread_kernel_globals_init();
requested_features.max_closure = MAX_CLOSURE;
if(!split_kernel.load_kernels(requested_features)) {
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
mem_free(kgbuffer);
return;
}
while(task.acquire_tile(this, tile)) {
device_memory data;
split_kernel.path_trace(&task, tile, kgbuffer, data);
task.release_tile(tile);
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
break;
}
}
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
mem_free(kgbuffer);
}
void thread_film_convert(DeviceTask& task)
{
float sample_scale = 1.0f/(task.sample + 1);
@ -510,6 +673,10 @@ protected:
inline void thread_kernel_globals_free(KernelGlobals *kg)
{
if(kg == NULL) {
return;
}
if(kg->transparent_shadow_intersections != NULL) {
free(kg->transparent_shadow_intersections);
}
@ -524,8 +691,170 @@ protected:
OSLShader::thread_free(kg);
#endif
}
virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
requested_features = requested_features_;
return true;
}
};
/* split kernel */
class CPUSplitKernelFunction : public SplitKernelFunction {
public:
CPUDevice* device;
void (*func)(KernelGlobals *kg, KernelData *data);
CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
~CPUSplitKernelFunction() {}
virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
{
if(!func) {
return false;
}
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
for(int y = 0; y < dim.global_size[1]; y++) {
for(int x = 0; x < dim.global_size[0]; x++) {
kg->global_id = make_int2(x, y);
func(kg, (KernelData*)data.device_pointer);
}
}
return true;
}
};
CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
{
}
bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
device_memory& kernel_globals,
device_memory& data,
device_memory& split_data,
device_memory& ray_state,
device_memory& queue_index,
device_memory& use_queues_flags,
device_memory& work_pool_wgs)
{
typedef void(*data_init_t)(KernelGlobals *kg,
ccl_constant KernelData *data,
ccl_global void *split_data_buffer,
int num_elements,
ccl_global char *ray_state,
ccl_global uint *rng_state,
int start_sample,
int end_sample,
int sx, int sy, int sw, int sh, int offset, int stride,
ccl_global int *Queue_index,
int queuesize,
ccl_global char *use_queues_flag,
ccl_global unsigned int *work_pool_wgs,
unsigned int num_samples,
ccl_global float *buffer);
data_init_t data_init;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
data_init = kernel_cpu_avx2_data_init;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
data_init = kernel_cpu_avx_data_init;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
data_init = kernel_cpu_sse41_data_init;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
data_init = kernel_cpu_sse3_data_init;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
data_init = kernel_cpu_sse2_data_init;
}
else
#endif
{
data_init = kernel_cpu_data_init;
}
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
for(int y = 0; y < dim.global_size[1]; y++) {
for(int x = 0; x < dim.global_size[0]; x++) {
kg->global_id = make_int2(x, y);
data_init((KernelGlobals*)kernel_globals.device_pointer,
(KernelData*)data.device_pointer,
(void*)split_data.device_pointer,
num_global_elements,
(char*)ray_state.device_pointer,
(uint*)rtile.rng_state,
rtile.start_sample,
rtile.start_sample + rtile.num_samples,
rtile.x,
rtile.y,
rtile.w,
rtile.h,
rtile.offset,
rtile.stride,
(int*)queue_index.device_pointer,
dim.global_size[0] * dim.global_size[1],
(char*)use_queues_flags.device_pointer,
(uint*)work_pool_wgs.device_pointer,
rtile.num_samples,
(float*)rtile.buffer);
}
}
return true;
}
SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
if(!kernel->func) {
delete kernel;
return NULL;
}
return kernel;
}
int2 CPUSplitKernel::split_kernel_local_size()
{
return make_int2(1, 1);
}
int2 CPUSplitKernel::split_kernel_global_size(DeviceTask *task) {
/* TODO(mai): this needs investigation, but the CPU gives an incorrect render if the global size doesn't match the tile size. */
return task->requested_tile_size;
}
unordered_map<string, void*> CPUDevice::kernel_functions;
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
return new CPUDevice(info, stats, background);
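Note on the device code above: the CPU has no real work-group dispatch, so CPUSplitKernelFunction::enqueue() and enqueue_split_kernel_data_init() emulate one by looping serially over the global work size and storing the current (x, y) pair in KernelGlobals, which the split kernels read back through the ccl_global_id()/ccl_global_size() macros added to kernel_compat_cpu.h further down. A minimal standalone sketch of that pattern, using made-up stand-in types rather than the actual Cycles ones:

#include <cstdio>

struct int2 { int x, y; };
static int2 make_int2(int x, int y) { int2 v = {x, y}; return v; }

/* Stand-in for KernelGlobals: only the fields the emulation needs. */
struct FakeGlobals {
    int2 global_size;
    int2 global_id;
};

/* Stand-in for a split kernel body; the real kernels read their work-item id
 * through ccl_global_id(d), which expands to kg->global_id[d] on the CPU. */
static void demo_kernel(FakeGlobals *kg)
{
    printf("work item (%d, %d) of (%d, %d)\n",
           kg->global_id.x, kg->global_id.y,
           kg->global_size.x, kg->global_size.y);
}

/* Serial emulation of a GPU enqueue: one loop iteration per "work item". */
static void emulate_enqueue(FakeGlobals *kg, int2 global_size)
{
    kg->global_size = global_size;
    for(int y = 0; y < global_size.y; y++) {
        for(int x = 0; x < global_size.x; x++) {
            kg->global_id = make_int2(x, y);
            demo_kernel(kg);
        }
    }
}

int main()
{
    FakeGlobals kg;
    emulate_enqueue(&kg, make_int2(4, 2));
    return 0;
}

Because this loop is serial, split_kernel_local_size() returns (1, 1) and the global size is simply the requested tile size, as noted in the TODO above.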

@ -13,6 +13,7 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
kernels/cpu/kernel_split.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_data_init.cl
kernels/opencl/kernel_queue_enqueue.cl
@ -316,25 +317,35 @@ if(CXX_HAS_SSE)
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
kernels/cpu/kernel_split_sse2.cpp
kernels/cpu/kernel_split_sse3.cpp
kernels/cpu/kernel_split_sse41.cpp
)
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
kernels/cpu/kernel_avx.cpp
kernels/cpu/kernel_split_avx.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split_avx2.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel

@ -20,6 +20,7 @@
/* CPU Kernel Interface */
#include "util_types.h"
#include "kernel_types.h"
CCL_NAMESPACE_BEGIN
@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
struct KernelGlobals;
struct KernelData;
KernelGlobals *kernel_globals_create();
void kernel_globals_free(KernelGlobals *kg);

@ -44,6 +44,15 @@
#define ccl_addr_space
#define ccl_local_id(d) 0
#define ccl_global_id(d) (kg->global_id[d])
#define ccl_local_size(d) 1
#define ccl_global_size(d) (kg->global_size[d])
#define ccl_group_id(d) ccl_global_id(d)
#define ccl_num_groups(d) ccl_global_size(d)
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
*/

@ -64,6 +64,13 @@ typedef struct KernelGlobals {
/* Storage for decoupled volume steps. */
VolumeStep *decoupled_volume_steps[2];
int decoupled_volume_steps_index;
/* split kernel */
SplitData split_data;
SplitParams split_param_data;
int2 global_size;
int2 global_id;
} KernelGlobals;
#endif /* __KERNEL_CPU__ */

@ -32,6 +32,11 @@
# define ccl_addr_space
#endif
#if defined(__SPLIT_KERNEL__) && !defined(__COMPUTE_DEVICE_GPU__)
/* TODO(mai): need to investigate how this affects the kernel, as the CPU kernel crashes without it right now. */
#define __COMPUTE_DEVICE_GPU__
#endif
CCL_NAMESPACE_BEGIN
/* constants */
@ -65,17 +70,23 @@ CCL_NAMESPACE_BEGIN
# endif
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
# define __BRANCHED_PATH__
# ifndef __SPLIT_KERNEL__
# define __BRANCHED_PATH__
# endif
# ifdef WITH_OSL
# define __OSL__
# endif
# define __SUBSURFACE__
# ifndef __SPLIT_KERNEL__
# define __SUBSURFACE__
# endif
# define __CMJ__
# define __VOLUME__
# define __VOLUME_DECOUPLED__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
# define __VOLUME_RECORD_ALL__
# ifndef __SPLIT_KERNEL__
# define __VOLUME__
# define __VOLUME_DECOUPLED__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
# define __VOLUME_RECORD_ALL__
# endif
#endif /* __KERNEL_CPU__ */
#ifdef __KERNEL_CUDA__

@ -49,4 +49,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample);
/* Split kernels */
void KERNEL_FUNCTION_FULL_NAME(data_init)(
KernelGlobals *kg,
ccl_constant KernelData *data,
ccl_global void *split_data_buffer,
int num_elements,
ccl_global char *ray_state,
ccl_global uint *rng_state,
int start_sample,
int end_sample,
int sx, int sy, int sw, int sh, int offset, int stride,
ccl_global int *Queue_index,
int queuesize,
ccl_global char *use_queues_flag,
ccl_global unsigned int *work_pool_wgs,
unsigned int num_samples,
ccl_global float *buffer);
#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data);
DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
DECLARE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
#undef KERNEL_ARCH

@ -21,17 +21,39 @@
*/
#include "kernel_compat_cpu.h"
#include "kernel_math.h"
#include "kernel_types.h"
#include "kernel_globals.h"
#include "kernel_cpu_image.h"
#include "kernel_film.h"
#include "kernel_path.h"
#include "kernel_path_branched.h"
#include "kernel_bake.h"
#ifndef __SPLIT_KERNEL__
# include "kernel_math.h"
# include "kernel_types.h"
# include "split/kernel_split_data.h"
# include "kernel_globals.h"
# include "kernel_cpu_image.h"
# include "kernel_film.h"
# include "kernel_path.h"
# include "kernel_path_branched.h"
# include "kernel_bake.h"
#else
# include "split/kernel_split_common.h"
# include "split/kernel_data_init.h"
# include "split/kernel_scene_intersect.h"
# include "split/kernel_lamp_emission.h"
# include "split/kernel_queue_enqueue.h"
# include "split/kernel_background_buffer_update.h"
# include "split/kernel_shader_eval.h"
# include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
# include "split/kernel_direct_lighting.h"
# include "split/kernel_shadow_blocked.h"
# include "split/kernel_next_iteration_setup.h"
# include "split/kernel_sum_all_radiance.h"
#endif
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
/* Path Tracing */
void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
@ -131,4 +153,55 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
}
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
{
#define REGISTER_NAME_STRING(name) #name
#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
REGISTER(path_trace);
REGISTER(convert_to_byte);
REGISTER(convert_to_half_float);
REGISTER(shader);
REGISTER(data_init);
REGISTER(scene_intersect);
REGISTER(lamp_emission);
REGISTER(queue_enqueue);
REGISTER(background_buffer_update);
REGISTER(shader_eval);
REGISTER(holdout_emission_blurring_pathtermination_ao);
REGISTER(direct_lighting);
REGISTER(shadow_blocked);
REGISTER(next_iteration_setup);
REGISTER(sum_all_radiance);
#undef REGISTER
#undef REGISTER_EVAL_NAME
#undef REGISTER_NAME_STRING
}
#endif /* __SPLIT_KERNEL__ */
CCL_NAMESPACE_END
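The REGISTER block above pairs with CPUDevice::register_kernel_function() and get_kernel_function() from device_cpu.cpp: each per-arch translation unit registers its entry points under the full mangled name, and the device later rebuilds that name from the runtime-detected architecture to fetch a plain function pointer. A rough self-contained sketch of that round trip (simplified names and a single hard-coded arch, not the actual Cycles code):

#include <cassert>
#include <cstddef>
#include <string>
#include <unordered_map>

static std::unordered_map<std::string, void*> kernel_functions;

static void register_kernel_function(const char *name, void *func)
{
    kernel_functions[name] = func;
}

/* What a generated kernel entry point looks like after name mangling. */
static void kernel_cpu_demo_kernel(int * /*kg*/, int * /*data*/) {}

typedef void (*split_fn)(int *, int *);

/* Lookup rebuilds the full name from the detected arch; "cpu" here stands in
 * for get_arch_name(), which may return "cpu_sse2" ... "cpu_avx2". */
static split_fn get_kernel_function(const std::string &name)
{
    std::string full = std::string("kernel_") + "cpu" + "_" + name;
    std::unordered_map<std::string, void*>::iterator it = kernel_functions.find(full);
    return (it == kernel_functions.end()) ? NULL : (split_fn)it->second;
}

int main()
{
    /* Registration normally happens via REGISTER(...) in register_functions(). */
    register_kernel_function("kernel_cpu_demo_kernel", (void*)kernel_cpu_demo_kernel);
    assert(get_kernel_function("demo_kernel") != NULL);
    return 0;
}

Storing the pointers as void* keeps the map independent of each kernel's signature; the caller casts back to the expected type, as CPUSplitKernel::get_split_kernel_function() does in device_cpu.cpp.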

@ -0,0 +1,63 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CPU kernel entry points */
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
* one with SSE2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
# define __KERNEL_SSE2__
#endif
#define __SPLIT_KERNEL__
/* When building the kernel for the native machine, detect kernel features from
* the flags set by the compiler.
*/
#ifdef WITH_KERNEL_NATIVE
# ifdef __SSE2__
# ifndef __KERNEL_SSE2__
# define __KERNEL_SSE2__
# endif
# endif
# ifdef __SSE3__
# define __KERNEL_SSE3__
# endif
# ifdef __SSSE3__
# define __KERNEL_SSSE3__
# endif
# ifdef __SSE4_1__
# define __KERNEL_SSE41__
# endif
# ifdef __AVX__
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
# define __KERNEL_SSE__
# define __KERNEL_AVX2__
# endif
#endif
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__)
/* do nothing */
#endif
#include "kernel.h"
#define KERNEL_ARCH cpu
#include "kernel_cpu_impl.h"

@ -0,0 +1,38 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
#endif
#define __SPLIT_KERNEL__
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# include "kernel.h"
# define KERNEL_ARCH cpu_avx
# include "kernel_cpu_impl.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */

@ -0,0 +1,40 @@
/*
* Copyright 2011-2014 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
#endif
#define __SPLIT_KERNEL__
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# include "kernel.h"
# define KERNEL_ARCH cpu_avx2
# include "kernel_cpu_impl.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */

@ -0,0 +1,34 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
#endif
#define __SPLIT_KERNEL__
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# include "kernel.h"
# define KERNEL_ARCH cpu_sse2
# include "kernel_cpu_impl.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */

@ -0,0 +1,36 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
#endif
#define __SPLIT_KERNEL__
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# include "kernel.h"
# define KERNEL_ARCH cpu_sse3
# include "kernel_cpu_impl.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */

@ -0,0 +1,37 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
#endif
#define __SPLIT_KERNEL__
#include "util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# include "kernel.h"
# define KERNEL_ARCH cpu_sse41
# include "kernel_cpu_impl.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */

@ -42,6 +42,7 @@
#include "kernel_types.h"
#include "kernel_compat_cpu.h"
#include "split/kernel_split_data.h"
#include "kernel_globals.h"
#include "kernel_montecarlo.h"
#include "kernel_random.h"

@ -39,6 +39,7 @@
#include "util_string.h"
#include "kernel_compat_cpu.h"
#include "split/kernel_split_data.h"
#include "kernel_globals.h"
#include "kernel_random.h"
#include "kernel_projection.h"

@ -19,6 +19,7 @@
#include "kernel_compat_cpu.h"
#include "kernel_montecarlo.h"
#include "kernel_types.h"
#include "split/kernel_split_data.h"
#include "kernel_globals.h"
#include "geom/geom_object.h"

@ -51,7 +51,11 @@ CCL_NAMESPACE_BEGIN
* The number of elements in the queues is initialized to 0;
*/
#ifndef __KERNEL_CPU__
ccl_device void kernel_data_init(
#else
void KERNEL_FUNCTION_FULL_NAME(data_init)(
#endif
KernelGlobals *kg,
ccl_constant KernelData *data,
ccl_global void *split_data_buffer,

@ -23,7 +23,17 @@
#include "kernel_split_data.h"
#include "kernel_globals.h"
#include "kernel_image_opencl.h"
#ifdef __OSL__
# include "osl_shader.h"
#endif
#ifdef __KERNEL_OPENCL__
# include "kernel_image_opencl.h"
#endif
#ifdef __KERNEL_CPU__
# include "../kernels/cpu/kernel_cpu_image.h"
#endif
#include "util_atomic.h"

@ -29,7 +29,8 @@ DebugFlags::CPU::CPU()
sse41(true),
sse3(true),
sse2(true),
qbvh(true)
qbvh(true),
split_kernel(false)
{
reset();
}
@ -55,6 +56,7 @@ void DebugFlags::CPU::reset()
#undef CHECK_CPU_FLAGS
qbvh = true;
split_kernel = false;
}
DebugFlags::CUDA::CUDA()
@ -133,7 +135,9 @@ std::ostream& operator <<(std::ostream &os,
<< " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n"
<< " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
<< " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n";
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
<< " QBVH : " << string_from_bool(debug_flags.cpu.qbvh) << "\n"
<< " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
os << "CUDA flags:\n"
<< " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";

@ -46,6 +46,9 @@ public:
/* Whether QBVH usage is allowed or not. */
bool qbvh;
/* Whether the split kernel is used. */
bool split_kernel;
};
/* Descriptor of CUDA feature-set to be used. */

@ -37,6 +37,9 @@
#define ccl_device_noinline static
#define ccl_global
#define ccl_constant
#define ccl_local
#define ccl_local_param
#define ccl_private
#define ccl_restrict __restrict
#define __KERNEL_WITH_SSE_ALIGN__