forked from bartvdbraak/blender
Cycles: CPU implementation of split kernel
This commit is contained in:
parent
352ee7c3ef
commit
0892352bfe
@ -665,6 +665,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
|
||||
cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
|
||||
cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
|
||||
cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
|
||||
cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
|
||||
|
||||
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
|
||||
|
||||
|
@ -1518,6 +1518,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
|
||||
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
|
||||
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
|
||||
col.prop(cscene, "debug_use_qbvh")
|
||||
col.prop(cscene, "debug_use_cpu_split_kernel")
|
||||
|
||||
col = layout.column()
|
||||
col.label('CUDA Flags:')
|
||||
|
@ -67,6 +67,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
|
||||
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
|
||||
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
|
||||
flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
|
||||
flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
|
||||
/* Synchronize CUDA flags. */
|
||||
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
|
||||
/* Synchronize OpenCL kernel type. */
|
||||
|
@ -26,10 +26,12 @@
|
||||
|
||||
#include "device.h"
|
||||
#include "device_intern.h"
|
||||
#include "device_split_kernel.h"
|
||||
|
||||
#include "kernel.h"
|
||||
#include "kernel_compat_cpu.h"
|
||||
#include "kernel_types.h"
|
||||
#include "split/kernel_split_data.h"
|
||||
#include "kernel_globals.h"
|
||||
|
||||
#include "osl_shader.h"
|
||||
@ -41,6 +43,7 @@
|
||||
#include "util_foreach.h"
|
||||
#include "util_function.h"
|
||||
#include "util_logging.h"
|
||||
#include "util_map.h"
|
||||
#include "util_opengl.h"
|
||||
#include "util_progress.h"
|
||||
#include "util_system.h"
|
||||
@ -48,8 +51,92 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class CPUDevice;
|
||||
|
||||
class CPUSplitKernel : public DeviceSplitKernel {
|
||||
CPUDevice *device;
|
||||
public:
|
||||
explicit CPUSplitKernel(CPUDevice *device);
|
||||
|
||||
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
|
||||
RenderTile& rtile,
|
||||
int num_global_elements,
|
||||
device_memory& kernel_globals,
|
||||
device_memory& kernel_data_,
|
||||
device_memory& split_data,
|
||||
device_memory& ray_state,
|
||||
device_memory& queue_index,
|
||||
device_memory& use_queues_flag,
|
||||
device_memory& work_pool_wgs);
|
||||
|
||||
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
|
||||
virtual int2 split_kernel_local_size();
|
||||
virtual int2 split_kernel_global_size(DeviceTask *task);
|
||||
};
|
||||
|
||||
class CPUDevice : public Device
|
||||
{
|
||||
static unordered_map<string, void*> kernel_functions;
|
||||
|
||||
static void register_kernel_function(const char* name, void* func)
|
||||
{
|
||||
kernel_functions[name] = func;
|
||||
}
|
||||
|
||||
/* Returns the architecture part of the kernel function names to use.
 *
 * Candidates are checked from AVX2 down to SSE2; only flavours enabled at
 * build time are considered, and the first one the CPU reports support for
 * wins. Falls back to the plain "cpu" kernels. */
static const char* get_arch_name()
{
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
	if(system_cpu_support_avx2()) {
		return "cpu_avx2";
	}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
	if(system_cpu_support_avx()) {
		return "cpu_avx";
	}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
	if(system_cpu_support_sse41()) {
		return "cpu_sse41";
	}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
	if(system_cpu_support_sse3()) {
		return "cpu_sse3";
	}
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
	if(system_cpu_support_sse2()) {
		return "cpu_sse2";
	}
#endif
	return "cpu";
}
|
||||
|
||||
template<typename F>
|
||||
static F get_kernel_function(string name)
|
||||
{
|
||||
name = string("kernel_") + get_arch_name() + "_" + name;
|
||||
|
||||
unordered_map<string, void*>::iterator it = kernel_functions.find(name);
|
||||
|
||||
if(it == kernel_functions.end()) {
|
||||
assert(!"kernel function not found");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return (F)it->second;
|
||||
}
|
||||
|
||||
friend class CPUSplitKernel;
|
||||
|
||||
public:
|
||||
TaskPool task_pool;
|
||||
KernelGlobals kernel_globals;
|
||||
@ -57,10 +144,15 @@ public:
|
||||
#ifdef WITH_OSL
|
||||
OSLGlobals osl_globals;
|
||||
#endif
|
||||
|
||||
bool use_split_kernel;
|
||||
|
||||
DeviceRequestedFeatures requested_features;
|
||||
|
||||
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
|
||||
: Device(info, stats, background)
|
||||
{
|
||||
|
||||
#ifdef WITH_OSL
|
||||
kernel_globals.osl = &osl_globals;
|
||||
#endif
|
||||
@ -105,6 +197,28 @@ public:
|
||||
{
|
||||
VLOG(1) << "Will be using regular kernels.";
|
||||
}
|
||||
|
||||
use_split_kernel = DebugFlags().cpu.split_kernel;
|
||||
if(use_split_kernel) {
|
||||
VLOG(1) << "Will be using split kernel.";
|
||||
}
|
||||
|
||||
kernel_cpu_register_functions(register_kernel_function);
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
kernel_cpu_sse2_register_functions(register_kernel_function);
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
kernel_cpu_sse3_register_functions(register_kernel_function);
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
kernel_cpu_sse41_register_functions(register_kernel_function);
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
kernel_cpu_avx_register_functions(register_kernel_function);
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
kernel_cpu_avx2_register_functions(register_kernel_function);
|
||||
#endif
|
||||
}
|
||||
|
||||
~CPUDevice()
|
||||
@ -205,8 +319,14 @@ public:
|
||||
|
||||
void thread_run(DeviceTask *task)
|
||||
{
|
||||
if(task->type == DeviceTask::PATH_TRACE)
|
||||
thread_path_trace(*task);
|
||||
if(task->type == DeviceTask::PATH_TRACE) {
|
||||
if(!use_split_kernel) {
|
||||
thread_path_trace(*task);
|
||||
}
|
||||
else {
|
||||
thread_path_trace_split(*task);
|
||||
}
|
||||
}
|
||||
else if(task->type == DeviceTask::FILM_CONVERT)
|
||||
thread_film_convert(*task);
|
||||
else if(task->type == DeviceTask::SHADER)
|
||||
@ -267,7 +387,7 @@ public:
|
||||
{
|
||||
path_trace_kernel = kernel_cpu_path_trace;
|
||||
}
|
||||
|
||||
|
||||
while(task.acquire_tile(this, tile)) {
|
||||
float *render_buffer = (float*)tile.buffer;
|
||||
uint *rng_state = (uint*)tile.rng_state;
|
||||
@ -303,6 +423,49 @@ public:
|
||||
thread_kernel_globals_free(&kg);
|
||||
}
|
||||
|
||||
/* Path traces every tile the task hands out, using the split kernel. */
void thread_path_trace_split(DeviceTask& task)
{
	/* A cancelled task is only processed when its queue still has to be finished. */
	if(task_pool.canceled() && !task.need_finish_queue) {
		return;
	}

	RenderTile tile;

	CPUSplitKernel split_kernel(this);

	/* The kernel globals are kept in a device_memory buffer because the
	 * split kernel API takes them in that form. */
	device_memory kgbuffer;
	kgbuffer.resize(sizeof(KernelGlobals));
	mem_alloc(kgbuffer, MEM_READ_WRITE);

	KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
	*kg = thread_kernel_globals_init();

	requested_features.max_closure = MAX_CLOSURE;
	if(split_kernel.load_kernels(requested_features)) {
		while(task.acquire_tile(this, tile)) {
			device_memory data;
			split_kernel.path_trace(&task, tile, kgbuffer, data);

			task.release_tile(tile);

			if(task_pool.canceled() && !task.need_finish_queue) {
				break;
			}
		}
	}

	/* Shared cleanup for both the failed-load and the normal path. */
	thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
	mem_free(kgbuffer);
}
|
||||
|
||||
void thread_film_convert(DeviceTask& task)
|
||||
{
|
||||
float sample_scale = 1.0f/(task.sample + 1);
|
||||
@ -510,6 +673,10 @@ protected:
|
||||
|
||||
inline void thread_kernel_globals_free(KernelGlobals *kg)
|
||||
{
|
||||
if(kg == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(kg->transparent_shadow_intersections != NULL) {
|
||||
free(kg->transparent_shadow_intersections);
|
||||
}
|
||||
@ -524,8 +691,170 @@ protected:
|
||||
OSLShader::thread_free(kg);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Stores a copy of the requested feature set in requested_features and
 * always returns true. */
virtual bool load_kernels(DeviceRequestedFeatures& requested_features_)
{
	this->requested_features = requested_features_;
	return true;
}
|
||||
};
|
||||
|
||||
/* split kernel */
|
||||
|
||||
class CPUSplitKernelFunction : public SplitKernelFunction {
|
||||
public:
|
||||
CPUDevice* device;
|
||||
void (*func)(KernelGlobals *kg, KernelData *data);
|
||||
|
||||
CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
|
||||
~CPUSplitKernelFunction() {}
|
||||
|
||||
virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
|
||||
{
|
||||
if(!func) {
|
||||
return false;
|
||||
}
|
||||
|
||||
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
|
||||
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
|
||||
|
||||
for(int y = 0; y < dim.global_size[1]; y++) {
|
||||
for(int x = 0; x < dim.global_size[0]; x++) {
|
||||
kg->global_id = make_int2(x, y);
|
||||
|
||||
func(kg, (KernelData*)data.device_pointer);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
/* Passes the device on to the DeviceSplitKernel base and keeps the typed
 * CPUDevice pointer, which is used for kernel function lookups. */
CPUSplitKernel::CPUSplitKernel(CPUDevice *device)
: DeviceSplitKernel(device),
  device(device)
{
}
|
||||
|
||||
bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
|
||||
RenderTile& rtile,
|
||||
int num_global_elements,
|
||||
device_memory& kernel_globals,
|
||||
device_memory& data,
|
||||
device_memory& split_data,
|
||||
device_memory& ray_state,
|
||||
device_memory& queue_index,
|
||||
device_memory& use_queues_flags,
|
||||
device_memory& work_pool_wgs)
|
||||
{
|
||||
typedef void(*data_init_t)(KernelGlobals *kg,
|
||||
ccl_constant KernelData *data,
|
||||
ccl_global void *split_data_buffer,
|
||||
int num_elements,
|
||||
ccl_global char *ray_state,
|
||||
ccl_global uint *rng_state,
|
||||
int start_sample,
|
||||
int end_sample,
|
||||
int sx, int sy, int sw, int sh, int offset, int stride,
|
||||
ccl_global int *Queue_index,
|
||||
int queuesize,
|
||||
ccl_global char *use_queues_flag,
|
||||
ccl_global unsigned int *work_pool_wgs,
|
||||
unsigned int num_samples,
|
||||
ccl_global float *buffer);
|
||||
|
||||
data_init_t data_init;
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
if(system_cpu_support_avx2()) {
|
||||
data_init = kernel_cpu_avx2_data_init;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
if(system_cpu_support_avx()) {
|
||||
data_init = kernel_cpu_avx_data_init;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
if(system_cpu_support_sse41()) {
|
||||
data_init = kernel_cpu_sse41_data_init;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
if(system_cpu_support_sse3()) {
|
||||
data_init = kernel_cpu_sse3_data_init;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
if(system_cpu_support_sse2()) {
|
||||
data_init = kernel_cpu_sse2_data_init;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
data_init = kernel_cpu_data_init;
|
||||
}
|
||||
|
||||
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
|
||||
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
|
||||
|
||||
for(int y = 0; y < dim.global_size[1]; y++) {
|
||||
for(int x = 0; x < dim.global_size[0]; x++) {
|
||||
kg->global_id = make_int2(x, y);
|
||||
|
||||
data_init((KernelGlobals*)kernel_globals.device_pointer,
|
||||
(KernelData*)data.device_pointer,
|
||||
(void*)split_data.device_pointer,
|
||||
num_global_elements,
|
||||
(char*)ray_state.device_pointer,
|
||||
(uint*)rtile.rng_state,
|
||||
rtile.start_sample,
|
||||
rtile.start_sample + rtile.num_samples,
|
||||
rtile.x,
|
||||
rtile.y,
|
||||
rtile.w,
|
||||
rtile.h,
|
||||
rtile.offset,
|
||||
rtile.stride,
|
||||
(int*)queue_index.device_pointer,
|
||||
dim.global_size[0] * dim.global_size[1],
|
||||
(char*)use_queues_flags.device_pointer,
|
||||
(uint*)work_pool_wgs.device_pointer,
|
||||
rtile.num_samples,
|
||||
(float*)rtile.buffer);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Creates the function object for the split kernel called kernel_name.
 *
 * Returns a newly allocated CPUSplitKernelFunction bound to the registered
 * entry point, or NULL when the device has no function registered under
 * that name. The requested features argument is not used. */
SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
	typedef void (*split_kernel_func_t)(KernelGlobals*, KernelData*);

	/* Resolve the entry point first, so nothing is allocated on failure. */
	split_kernel_func_t entry_point = device->get_kernel_function<split_kernel_func_t>(kernel_name);
	if(entry_point == NULL) {
		return NULL;
	}

	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
	kernel->func = entry_point;

	return kernel;
}
|
||||
|
||||
/* The local size reported for the CPU split kernel is always 1x1. */
int2 CPUSplitKernel::split_kernel_local_size()
{
	const int2 local_size = make_int2(1, 1);
	return local_size;
}
|
||||
|
||||
/* Uses the tile size requested by the task as the global size. */
int2 CPUSplitKernel::split_kernel_global_size(DeviceTask *task)
{
	/* TODO(mai): this needs investigation, but the CPU gives an incorrect
	 * render if the global size doesn't match the tile size. */
	return task->requested_tile_size;
}
|
||||
|
||||
unordered_map<string, void*> CPUDevice::kernel_functions;
|
||||
|
||||
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
|
||||
{
|
||||
return new CPUDevice(info, stats, background);
|
||||
|
@ -13,6 +13,7 @@ set(INC_SYS
|
||||
|
||||
set(SRC
|
||||
kernels/cpu/kernel.cpp
|
||||
kernels/cpu/kernel_split.cpp
|
||||
kernels/opencl/kernel.cl
|
||||
kernels/opencl/kernel_data_init.cl
|
||||
kernels/opencl/kernel_queue_enqueue.cl
|
||||
@ -316,25 +317,35 @@ if(CXX_HAS_SSE)
|
||||
kernels/cpu/kernel_sse2.cpp
|
||||
kernels/cpu/kernel_sse3.cpp
|
||||
kernels/cpu/kernel_sse41.cpp
|
||||
kernels/cpu/kernel_split_sse2.cpp
|
||||
kernels/cpu/kernel_split_sse3.cpp
|
||||
kernels/cpu/kernel_split_sse41.cpp
|
||||
)
|
||||
|
||||
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
if(CXX_HAS_AVX)
|
||||
list(APPEND SRC
|
||||
kernels/cpu/kernel_avx.cpp
|
||||
kernels/cpu/kernel_split_avx.cpp
|
||||
)
|
||||
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
if(CXX_HAS_AVX2)
|
||||
list(APPEND SRC
|
||||
kernels/cpu/kernel_avx2.cpp
|
||||
kernels/cpu/kernel_split_avx2.cpp
|
||||
)
|
||||
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||
endif()
|
||||
|
||||
add_library(cycles_kernel
|
||||
|
@ -20,6 +20,7 @@
|
||||
/* CPU Kernel Interface */
|
||||
|
||||
#include "util_types.h"
|
||||
#include "kernel_types.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN
|
||||
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
|
||||
|
||||
struct KernelGlobals;
|
||||
struct KernelData;
|
||||
|
||||
KernelGlobals *kernel_globals_create();
|
||||
void kernel_globals_free(KernelGlobals *kg);
|
||||
|
@ -44,6 +44,15 @@
|
||||
|
||||
#define ccl_addr_space
|
||||
|
||||
#define ccl_local_id(d) 0
|
||||
#define ccl_global_id(d) (kg->global_id[d])
|
||||
|
||||
#define ccl_local_size(d) 1
|
||||
#define ccl_global_size(d) (kg->global_size[d])
|
||||
|
||||
#define ccl_group_id(d) ccl_global_id(d)
|
||||
#define ccl_num_groups(d) ccl_global_size(d)
|
||||
|
||||
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
|
||||
* much slower than the double version. This was fixed in glibc 2.16.
|
||||
*/
|
||||
|
@ -64,6 +64,13 @@ typedef struct KernelGlobals {
|
||||
/* Storage for decoupled volume steps. */
|
||||
VolumeStep *decoupled_volume_steps[2];
|
||||
int decoupled_volume_steps_index;
|
||||
|
||||
/* split kernel */
|
||||
SplitData split_data;
|
||||
SplitParams split_param_data;
|
||||
|
||||
int2 global_size;
|
||||
int2 global_id;
|
||||
} KernelGlobals;
|
||||
|
||||
#endif /* __KERNEL_CPU__ */
|
||||
|
@ -32,6 +32,11 @@
|
||||
# define ccl_addr_space
|
||||
#endif
|
||||
|
||||
#if defined(__SPLIT_KERNEL__) && !defined(__COMPUTE_DEVICE_GPU__)
|
||||
/* TODO(mai): need to investigate how this affects the kernel, as the CPU kernel crashes without this right now */
|
||||
#define __COMPUTE_DEVICE_GPU__
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* constants */
|
||||
@ -65,17 +70,23 @@ CCL_NAMESPACE_BEGIN
|
||||
# endif
|
||||
# define __KERNEL_SHADING__
|
||||
# define __KERNEL_ADV_SHADING__
|
||||
# define __BRANCHED_PATH__
|
||||
# ifndef __SPLIT_KERNEL__
|
||||
# define __BRANCHED_PATH__
|
||||
# endif
|
||||
# ifdef WITH_OSL
|
||||
# define __OSL__
|
||||
# endif
|
||||
# define __SUBSURFACE__
|
||||
# ifndef __SPLIT_KERNEL__
|
||||
# define __SUBSURFACE__
|
||||
# endif
|
||||
# define __CMJ__
|
||||
# define __VOLUME__
|
||||
# define __VOLUME_DECOUPLED__
|
||||
# define __VOLUME_SCATTER__
|
||||
# define __SHADOW_RECORD_ALL__
|
||||
# define __VOLUME_RECORD_ALL__
|
||||
# ifndef __SPLIT_KERNEL__
|
||||
# define __VOLUME__
|
||||
# define __VOLUME_DECOUPLED__
|
||||
# define __VOLUME_SCATTER__
|
||||
# define __SHADOW_RECORD_ALL__
|
||||
# define __VOLUME_RECORD_ALL__
|
||||
# endif
|
||||
#endif /* __KERNEL_CPU__ */
|
||||
|
||||
#ifdef __KERNEL_CUDA__
|
||||
|
@ -49,4 +49,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
||||
int offset,
|
||||
int sample);
|
||||
|
||||
/* Split kernels */
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(data_init)(
|
||||
KernelGlobals *kg,
|
||||
ccl_constant KernelData *data,
|
||||
ccl_global void *split_data_buffer,
|
||||
int num_elements,
|
||||
ccl_global char *ray_state,
|
||||
ccl_global uint *rng_state,
|
||||
int start_sample,
|
||||
int end_sample,
|
||||
int sx, int sy, int sw, int sh, int offset, int stride,
|
||||
ccl_global int *Queue_index,
|
||||
int queuesize,
|
||||
ccl_global char *use_queues_flag,
|
||||
ccl_global unsigned int *work_pool_wgs,
|
||||
unsigned int num_samples,
|
||||
ccl_global float *buffer);
|
||||
|
||||
#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
|
||||
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data);
|
||||
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
|
||||
DECLARE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
|
||||
|
||||
#undef KERNEL_ARCH
|
||||
|
@ -21,17 +21,39 @@
|
||||
*/
|
||||
|
||||
#include "kernel_compat_cpu.h"
|
||||
#include "kernel_math.h"
|
||||
#include "kernel_types.h"
|
||||
#include "kernel_globals.h"
|
||||
#include "kernel_cpu_image.h"
|
||||
#include "kernel_film.h"
|
||||
#include "kernel_path.h"
|
||||
#include "kernel_path_branched.h"
|
||||
#include "kernel_bake.h"
|
||||
|
||||
#ifndef __SPLIT_KERNEL__
|
||||
# include "kernel_math.h"
|
||||
# include "kernel_types.h"
|
||||
|
||||
# include "split/kernel_split_data.h"
|
||||
# include "kernel_globals.h"
|
||||
|
||||
# include "kernel_cpu_image.h"
|
||||
# include "kernel_film.h"
|
||||
# include "kernel_path.h"
|
||||
# include "kernel_path_branched.h"
|
||||
# include "kernel_bake.h"
|
||||
#else
|
||||
# include "split/kernel_split_common.h"
|
||||
|
||||
# include "split/kernel_data_init.h"
|
||||
# include "split/kernel_scene_intersect.h"
|
||||
# include "split/kernel_lamp_emission.h"
|
||||
# include "split/kernel_queue_enqueue.h"
|
||||
# include "split/kernel_background_buffer_update.h"
|
||||
# include "split/kernel_shader_eval.h"
|
||||
# include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
|
||||
# include "split/kernel_direct_lighting.h"
|
||||
# include "split/kernel_shadow_blocked.h"
|
||||
# include "split/kernel_next_iteration_setup.h"
|
||||
# include "split/kernel_sum_all_radiance.h"
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __SPLIT_KERNEL__
|
||||
|
||||
/* Path Tracing */
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
|
||||
@ -131,4 +153,55 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
||||
}
|
||||
}
|
||||
|
||||
#else /* __SPLIT_KERNEL__ */
|
||||
|
||||
/* Split Kernel Path Tracing */
|
||||
|
||||
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
||||
{ \
|
||||
kernel_##name(kg); \
|
||||
}
|
||||
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
|
||||
DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
|
||||
|
||||
/* Reports every kernel entry point of this architecture to `reg`, passing
 * the fully qualified function name as a string together with the function
 * address cast to void*. */
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
{
	/* Two macro levels are needed so that KERNEL_FUNCTION_FULL_NAME(name) is
	 * expanded before it is turned into a string literal. */
#define REGISTER_STRINGIFY(token) #token
#define REGISTER_EXPAND_AND_STRINGIFY(token) REGISTER_STRINGIFY(token)
#define REGISTER(name) \
	reg(REGISTER_EXPAND_AND_STRINGIFY(KERNEL_FUNCTION_FULL_NAME(name)), \
	    (void*)KERNEL_FUNCTION_FULL_NAME(name))

	/* Regular kernels. */
	REGISTER(path_trace);
	REGISTER(convert_to_byte);
	REGISTER(convert_to_half_float);
	REGISTER(shader);

	/* Split kernels. */
	REGISTER(data_init);
	REGISTER(scene_intersect);
	REGISTER(lamp_emission);
	REGISTER(queue_enqueue);
	REGISTER(background_buffer_update);
	REGISTER(shader_eval);
	REGISTER(holdout_emission_blurring_pathtermination_ao);
	REGISTER(direct_lighting);
	REGISTER(shadow_blocked);
	REGISTER(next_iteration_setup);
	REGISTER(sum_all_radiance);

#undef REGISTER
#undef REGISTER_EXPAND_AND_STRINGIFY
#undef REGISTER_STRINGIFY
}
|
||||
|
||||
#endif /* __SPLIT_KERNEL__ */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
63
intern/cycles/kernel/kernels/cpu/kernel_split.cpp
Normal file
63
intern/cycles/kernel/kernels/cpu/kernel_split.cpp
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* CPU kernel entry points */
|
||||
|
||||
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
|
||||
* one with SSE2 intrinsics.
|
||||
*/
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
# define __KERNEL_SSE2__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
/* When building kernel for native machine detect kernel features from the flags
|
||||
* set by compiler.
|
||||
*/
|
||||
#ifdef WITH_KERNEL_NATIVE
|
||||
# ifdef __SSE2__
|
||||
# ifndef __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE2__
|
||||
# endif
|
||||
# endif
|
||||
# ifdef __SSE3__
|
||||
# define __KERNEL_SSE3__
|
||||
# endif
|
||||
# ifdef __SSSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# endif
|
||||
# ifdef __SSE4_1__
|
||||
# define __KERNEL_SSE41__
|
||||
# endif
|
||||
# ifdef __AVX__
|
||||
# define __KERNEL_AVX__
|
||||
# endif
|
||||
# ifdef __AVX2__
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_AVX2__
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* quiet unused define warnings */
|
||||
#if defined(__KERNEL_SSE2__)
|
||||
/* do nothing */
|
||||
#endif
|
||||
|
||||
#include "kernel.h"
|
||||
#define KERNEL_ARCH cpu
|
||||
#include "kernel_cpu_impl.h"
|
||||
|
38
intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
Normal file
38
intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with AVX
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||
# include "kernel.h"
|
||||
# define KERNEL_ARCH cpu_avx
|
||||
# include "kernel_cpu_impl.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
40
intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
Normal file
40
intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
Normal file
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright 2011-2014 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with AVX2
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
# define __KERNEL_AVX__
|
||||
# define __KERNEL_AVX2__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||
# include "kernel.h"
|
||||
# define KERNEL_ARCH cpu_avx2
|
||||
# include "kernel_cpu_impl.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
34
intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
Normal file
34
intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
Normal file
@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE2
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||
# include "kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse2
|
||||
# include "kernel_cpu_impl.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
36
intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
Normal file
36
intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without for other CPU's. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||
# include "kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse3
|
||||
# include "kernel_cpu_impl.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
37
intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
Normal file
37
intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
Normal file
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright 2011-2013 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without them for other CPUs. */
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
# define __KERNEL_SSE41__
|
||||
#endif
|
||||
|
||||
#define __SPLIT_KERNEL__
|
||||
|
||||
#include "util_optimization.h"
|
||||
|
||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||
# include "kernel.h"
|
||||
# define KERNEL_ARCH cpu_sse41
|
||||
# include "kernel_cpu_impl.h"
|
||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
@ -42,6 +42,7 @@
|
||||
|
||||
#include "kernel_types.h"
|
||||
#include "kernel_compat_cpu.h"
|
||||
#include "split/kernel_split_data.h"
|
||||
#include "kernel_globals.h"
|
||||
#include "kernel_montecarlo.h"
|
||||
#include "kernel_random.h"
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include "util_string.h"
|
||||
|
||||
#include "kernel_compat_cpu.h"
|
||||
#include "split/kernel_split_data.h"
|
||||
#include "kernel_globals.h"
|
||||
#include "kernel_random.h"
|
||||
#include "kernel_projection.h"
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "kernel_compat_cpu.h"
|
||||
#include "kernel_montecarlo.h"
|
||||
#include "kernel_types.h"
|
||||
#include "split/kernel_split_data.h"
|
||||
#include "kernel_globals.h"
|
||||
|
||||
#include "geom/geom_object.h"
|
||||
|
@ -51,7 +51,11 @@ CCL_NAMESPACE_BEGIN
|
||||
* The number of elements in the queues is initialized to 0;
|
||||
*/
|
||||
|
||||
#ifndef __KERNEL_CPU__
|
||||
ccl_device void kernel_data_init(
|
||||
#else
|
||||
void KERNEL_FUNCTION_FULL_NAME(data_init)(
|
||||
#endif
|
||||
KernelGlobals *kg,
|
||||
ccl_constant KernelData *data,
|
||||
ccl_global void *split_data_buffer,
|
||||
|
@ -23,7 +23,17 @@
|
||||
#include "kernel_split_data.h"
|
||||
|
||||
#include "kernel_globals.h"
|
||||
#include "kernel_image_opencl.h"
|
||||
|
||||
#ifdef __OSL__
|
||||
# include "osl_shader.h"
|
||||
#endif
|
||||
|
||||
#ifdef __KERNEL_OPENCL__
|
||||
# include "kernel_image_opencl.h"
|
||||
#endif
|
||||
#ifdef __KERNEL_CPU__
|
||||
# include "../kernels/cpu/kernel_cpu_image.h"
|
||||
#endif
|
||||
|
||||
#include "util_atomic.h"
|
||||
|
||||
|
@ -29,7 +29,8 @@ DebugFlags::CPU::CPU()
|
||||
sse41(true),
|
||||
sse3(true),
|
||||
sse2(true),
|
||||
qbvh(true)
|
||||
qbvh(true),
|
||||
split_kernel(false)
|
||||
{
|
||||
reset();
|
||||
}
|
||||
@ -55,6 +56,7 @@ void DebugFlags::CPU::reset()
|
||||
#undef CHECK_CPU_FLAGS
|
||||
|
||||
qbvh = true;
|
||||
split_kernel = false;
|
||||
}
|
||||
|
||||
DebugFlags::CUDA::CUDA()
|
||||
@ -133,7 +135,9 @@ std::ostream& operator <<(std::ostream &os,
|
||||
<< " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n"
|
||||
<< " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
|
||||
<< " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
|
||||
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n";
|
||||
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
|
||||
<< " QBVH : " << string_from_bool(debug_flags.cpu.qbvh) << "\n"
|
||||
<< " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
|
||||
|
||||
os << "CUDA flags:\n"
|
||||
<< " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
|
||||
|
@ -46,6 +46,9 @@ public:
|
||||
|
||||
/* Whether QBVH usage is allowed or not. */
|
||||
bool qbvh;
|
||||
|
||||
/* Whether the split kernel is used. */
|
||||
bool split_kernel;
|
||||
};
|
||||
|
||||
/* Descriptor of CUDA feature-set to be used. */
|
||||
|
@ -37,6 +37,9 @@
|
||||
#define ccl_device_noinline static
|
||||
#define ccl_global
|
||||
#define ccl_constant
|
||||
#define ccl_local
|
||||
#define ccl_local_param
|
||||
#define ccl_private
|
||||
#define ccl_restrict __restrict
|
||||
#define __KERNEL_WITH_SSE_ALIGN__
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user