forked from bartvdbraak/blender
Cycles: Pass all buffers to each kernel call for OpenCL
Technically not passing all buffers used by a kernel is undefined behavior. We haven't had any issues with this so far on AMD or Nvidia, but it's known to be a problem with Intel and we received a report from AMD that this is a problem on newer hardware, so we need to make this change at some point. Unfortunately there a cost to being correct, about 5% for the benchmark scenes. For low sample counts it's even worse, I've seen up to 50% slowdown. For the latter case I think adjusting tile updating logic can help, but not sure what that would look like yet (it would be just a few lines change however).
This commit is contained in:
parent
6238214159
commit
eb293f59f2
@ -176,17 +176,62 @@ protected:
|
||||
friend class OpenCLSplitKernelFunction;
|
||||
};
|
||||
|
||||
struct CachedSplitMemory {
|
||||
int id;
|
||||
device_memory *split_data;
|
||||
device_memory *ray_state;
|
||||
device_ptr *rng_state;
|
||||
device_memory *queue_index;
|
||||
device_memory *use_queues_flag;
|
||||
device_memory *work_pools;
|
||||
device_ptr *buffer;
|
||||
};
|
||||
|
||||
class OpenCLSplitKernelFunction : public SplitKernelFunction {
|
||||
public:
|
||||
OpenCLDeviceSplitKernel* device;
|
||||
OpenCLDeviceBase::OpenCLProgram program;
|
||||
CachedSplitMemory& cached_memory;
|
||||
int cached_id;
|
||||
|
||||
OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
|
||||
~OpenCLSplitKernelFunction() { program.release(); }
|
||||
OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
|
||||
device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
|
||||
{
|
||||
}
|
||||
|
||||
~OpenCLSplitKernelFunction()
|
||||
{
|
||||
program.release();
|
||||
}
|
||||
|
||||
virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
|
||||
{
|
||||
device->kernel_set_args(program(), 0, kg, data);
|
||||
if(cached_id != cached_memory.id) {
|
||||
cl_uint start_arg_index =
|
||||
device->kernel_set_args(program(),
|
||||
0,
|
||||
kg,
|
||||
data,
|
||||
*cached_memory.split_data,
|
||||
*cached_memory.ray_state,
|
||||
*cached_memory.rng_state);
|
||||
|
||||
/* TODO(sergey): Avoid map lookup here. */
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
device->set_kernel_arg_mem(program(), &start_arg_index, #name);
|
||||
#include "kernel/kernel_textures.h"
|
||||
#undef KERNEL_TEX
|
||||
|
||||
start_arg_index +=
|
||||
device->kernel_set_args(program(),
|
||||
start_arg_index,
|
||||
*cached_memory.queue_index,
|
||||
*cached_memory.use_queues_flag,
|
||||
*cached_memory.work_pools,
|
||||
*cached_memory.buffer);
|
||||
|
||||
cached_id = cached_memory.id;
|
||||
}
|
||||
|
||||
device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
|
||||
program(),
|
||||
@ -213,6 +258,7 @@ public:
|
||||
|
||||
class OpenCLSplitKernel : public DeviceSplitKernel {
|
||||
OpenCLDeviceSplitKernel *device;
|
||||
CachedSplitMemory cached_memory;
|
||||
public:
|
||||
explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
|
||||
}
|
||||
@ -220,7 +266,7 @@ public:
|
||||
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
|
||||
const DeviceRequestedFeatures& requested_features)
|
||||
{
|
||||
OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
|
||||
OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
|
||||
|
||||
bool single_program = OpenCLInfo::use_single_program();
|
||||
kernel->program =
|
||||
@ -349,6 +395,15 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
cached_memory.split_data = &split_data;
|
||||
cached_memory.ray_state = &ray_state;
|
||||
cached_memory.rng_state = &rtile.rng_state;
|
||||
cached_memory.queue_index = &queue_index;
|
||||
cached_memory.use_queues_flag = &use_queues_flag;
|
||||
cached_memory.work_pools = &work_pool_wgs;
|
||||
cached_memory.buffer = &rtile.buffer;
|
||||
cached_memory.id++;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -122,6 +122,10 @@ set(SRC_KERNELS_CUDA_HEADERS
|
||||
kernels/cuda/kernel_config.h
|
||||
)
|
||||
|
||||
set(SRC_KERNELS_OPENCL_HEADERS
|
||||
kernels/opencl/kernel_split_function.h
|
||||
)
|
||||
|
||||
set(SRC_CLOSURE_HEADERS
|
||||
closure/alloc.h
|
||||
closure/bsdf.h
|
||||
@ -452,6 +456,7 @@ add_library(cycles_kernel
|
||||
${SRC_HEADERS}
|
||||
${SRC_KERNELS_CPU_HEADERS}
|
||||
${SRC_KERNELS_CUDA_HEADERS}
|
||||
${SRC_KERNELS_OPENCL_HEADERS}
|
||||
${SRC_BVH_HEADERS}
|
||||
${SRC_CLOSURE_HEADERS}
|
||||
${SRC_FILTER_HEADERS}
|
||||
@ -496,6 +501,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inact
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
|
@ -18,10 +18,9 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_buffer_update.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_buffer_update(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local unsigned int local_queue_atomics;
|
||||
kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
|
||||
}
|
||||
#define KERNEL_NAME buffer_update
|
||||
#define LOCALS_TYPE unsigned int
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -18,10 +18,9 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_direct_lighting.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_direct_lighting(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local unsigned int local_queue_atomics;
|
||||
kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
|
||||
}
|
||||
#define KERNEL_NAME direct_lighting
|
||||
#define LOCALS_TYPE unsigned int
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_do_volume.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_do_volume(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_do_volume((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME do_volume
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,10 +18,9 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_enqueue_inactive.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_enqueue_inactive(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local unsigned int local_queue_atomics;
|
||||
kernel_enqueue_inactive((KernelGlobals*)kg, &local_queue_atomics);
|
||||
}
|
||||
#define KERNEL_NAME enqueue_inactive
|
||||
#define LOCALS_TYPE unsigned int
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -18,12 +18,9 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local BackgroundAOLocals locals;
|
||||
kernel_holdout_emission_blurring_pathtermination_ao(
|
||||
(KernelGlobals*)kg,
|
||||
&locals);
|
||||
}
|
||||
#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
|
||||
#define LOCALS_TYPE BackgroundAOLocals
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_indirect_background.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_indirect_background(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_indirect_background((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME indirect_background
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_indirect_subsurface.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_indirect_subsurface(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_indirect_subsurface((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME indirect_subsurface
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_lamp_emission.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_lamp_emission(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_lamp_emission((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME lamp_emission
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,10 +18,9 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_next_iteration_setup.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_next_iteration_setup(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local unsigned int local_queue_atomics;
|
||||
kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
|
||||
}
|
||||
#define KERNEL_NAME next_iteration_setup
|
||||
#define LOCALS_TYPE unsigned int
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_path_init.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_path_init(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_path_init((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME path_init
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,10 +18,9 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_queue_enqueue.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_queue_enqueue(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local QueueEnqueueLocals locals;
|
||||
kernel_queue_enqueue((KernelGlobals*)kg, &locals);
|
||||
}
|
||||
#define KERNEL_NAME queue_enqueue
|
||||
#define LOCALS_TYPE QueueEnqueueLocals
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_scene_intersect.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_scene_intersect(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_scene_intersect((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME scene_intersect
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_shader_eval.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_shader_eval(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_shader_eval((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME shader_eval
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,10 +18,9 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_shader_setup.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_shader_setup(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local unsigned int local_queue_atomics;
|
||||
kernel_shader_setup((KernelGlobals*)kg, &local_queue_atomics);
|
||||
}
|
||||
#define KERNEL_NAME shader_setup
|
||||
#define LOCALS_TYPE unsigned int
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -19,10 +19,9 @@
|
||||
#include "kernel/split/kernel_shader_sort.h"
|
||||
|
||||
__attribute__((reqd_work_group_size(64, 1, 1)))
|
||||
__kernel void kernel_ocl_path_trace_shader_sort(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
ccl_local ShaderSortLocals locals;
|
||||
kernel_shader_sort((KernelGlobals*)kg, &locals);
|
||||
}
|
||||
#define KERNEL_NAME shader_sort
|
||||
#define LOCALS_TYPE ShaderSortLocals
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
#undef LOCALS_TYPE
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_shadow_blocked_ao.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_shadow_blocked_ao(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_shadow_blocked_ao((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME shadow_blocked_ao
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_shadow_blocked_dl.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_shadow_blocked_dl(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_shadow_blocked_dl((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME shadow_blocked_dl
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
72
intern/cycles/kernel/kernels/opencl/kernel_split_function.h
Normal file
72
intern/cycles/kernel/kernels/opencl/kernel_split_function.h
Normal file
@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Copyright 2011-2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#define KERNEL_NAME_JOIN(a, b) a ## _ ## b
|
||||
#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
|
||||
|
||||
__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
|
||||
ccl_global char *kg_global,
|
||||
ccl_constant KernelData *data,
|
||||
|
||||
ccl_global void *split_data_buffer,
|
||||
ccl_global char *ray_state,
|
||||
ccl_global uint *rng_state,
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
ccl_global type *name,
|
||||
#include "kernel/kernel_textures.h"
|
||||
|
||||
ccl_global int *queue_index,
|
||||
ccl_global char *use_queues_flag,
|
||||
ccl_global unsigned int *work_pools,
|
||||
ccl_global float *buffer
|
||||
)
|
||||
{
|
||||
#ifdef LOCALS_TYPE
|
||||
ccl_local LOCALS_TYPE locals;
|
||||
#endif
|
||||
|
||||
KernelGlobals *kg = (KernelGlobals*)kg_global;
|
||||
|
||||
if(ccl_local_id(0) + ccl_local_id(1) == 0) {
|
||||
kg->data = data;
|
||||
|
||||
kernel_split_params.rng_state = rng_state;
|
||||
kernel_split_params.queue_index = queue_index;
|
||||
kernel_split_params.use_queues_flag = use_queues_flag;
|
||||
kernel_split_params.work_pools = work_pools;
|
||||
kernel_split_params.buffer = buffer;
|
||||
|
||||
split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
kg->name = name;
|
||||
#include "kernel/kernel_textures.h"
|
||||
}
|
||||
|
||||
ccl_barrier(CCL_LOCAL_MEM_FENCE);
|
||||
|
||||
KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
|
||||
kg
|
||||
#ifdef LOCALS_TYPE
|
||||
, &locals
|
||||
#endif
|
||||
);
|
||||
}
|
||||
|
||||
#undef KERNEL_NAME_JOIN
|
||||
#undef KERNEL_NAME_EVAL
|
||||
|
@ -18,9 +18,7 @@
|
||||
#include "kernel/split/kernel_split_common.h"
|
||||
#include "kernel/split/kernel_subsurface_scatter.h"
|
||||
|
||||
__kernel void kernel_ocl_path_trace_subsurface_scatter(
|
||||
ccl_global char *kg,
|
||||
ccl_constant KernelData *data)
|
||||
{
|
||||
kernel_subsurface_scatter((KernelGlobals*)kg);
|
||||
}
|
||||
#define KERNEL_NAME subsurface_scatter
|
||||
#include "kernel/kernels/opencl/kernel_split_function.h"
|
||||
#undef KERNEL_NAME
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user