Cycles: Pass all buffers to each kernel call for OpenCL

Technically not passing all buffers used by a kernel is undefined
behavior. We haven't had any issues with this so far on AMD or
Nvidia, but it's known to be a problem with Intel and we received
a report from AMD that this is a problem on newer hardware, so we
need to make this change at some point.

Unfortunately there a cost to being correct, about 5% for the
benchmark scenes. For low sample counts it's even worse, I've
seen up to 50% slowdown. For the latter case I think adjusting
tile updating logic can help, but not sure what that would look
like yet (it would be just a few lines change however).
This commit is contained in:
Mai Lavelle 2017-05-11 19:23:49 -04:00
parent 6238214159
commit eb293f59f2
21 changed files with 225 additions and 122 deletions

@ -176,17 +176,62 @@ protected:
friend class OpenCLSplitKernelFunction;
};
struct CachedSplitMemory {
int id;
device_memory *split_data;
device_memory *ray_state;
device_ptr *rng_state;
device_memory *queue_index;
device_memory *use_queues_flag;
device_memory *work_pools;
device_ptr *buffer;
};
class OpenCLSplitKernelFunction : public SplitKernelFunction {
public:
OpenCLDeviceSplitKernel* device;
OpenCLDeviceBase::OpenCLProgram program;
CachedSplitMemory& cached_memory;
int cached_id;
OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
~OpenCLSplitKernelFunction() { program.release(); }
OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
{
}
~OpenCLSplitKernelFunction()
{
program.release();
}
virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
{
device->kernel_set_args(program(), 0, kg, data);
if(cached_id != cached_memory.id) {
cl_uint start_arg_index =
device->kernel_set_args(program(),
0,
kg,
data,
*cached_memory.split_data,
*cached_memory.ray_state,
*cached_memory.rng_state);
/* TODO(sergey): Avoid map lookup here. */
#define KERNEL_TEX(type, ttype, name) \
device->set_kernel_arg_mem(program(), &start_arg_index, #name);
#include "kernel/kernel_textures.h"
#undef KERNEL_TEX
start_arg_index +=
device->kernel_set_args(program(),
start_arg_index,
*cached_memory.queue_index,
*cached_memory.use_queues_flag,
*cached_memory.work_pools,
*cached_memory.buffer);
cached_id = cached_memory.id;
}
device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
program(),
@ -213,6 +258,7 @@ public:
class OpenCLSplitKernel : public DeviceSplitKernel {
OpenCLDeviceSplitKernel *device;
CachedSplitMemory cached_memory;
public:
explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
}
@ -220,7 +266,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
const DeviceRequestedFeatures& requested_features)
{
OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
bool single_program = OpenCLInfo::use_single_program();
kernel->program =
@ -349,6 +395,15 @@ public:
return false;
}
cached_memory.split_data = &split_data;
cached_memory.ray_state = &ray_state;
cached_memory.rng_state = &rtile.rng_state;
cached_memory.queue_index = &queue_index;
cached_memory.use_queues_flag = &use_queues_flag;
cached_memory.work_pools = &work_pool_wgs;
cached_memory.buffer = &rtile.buffer;
cached_memory.id++;
return true;
}

@ -122,6 +122,10 @@ set(SRC_KERNELS_CUDA_HEADERS
kernels/cuda/kernel_config.h
)
set(SRC_KERNELS_OPENCL_HEADERS
kernels/opencl/kernel_split_function.h
)
set(SRC_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@ -452,6 +456,7 @@ add_library(cycles_kernel
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
${SRC_KERNELS_OPENCL_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_FILTER_HEADERS}
@ -496,6 +501,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inact
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)

@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_buffer_update.h"
__kernel void kernel_ocl_path_trace_buffer_update(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local unsigned int local_queue_atomics;
kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
}
#define KERNEL_NAME buffer_update
#define LOCALS_TYPE unsigned int
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_direct_lighting.h"
__kernel void kernel_ocl_path_trace_direct_lighting(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local unsigned int local_queue_atomics;
kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
}
#define KERNEL_NAME direct_lighting
#define LOCALS_TYPE unsigned int
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_do_volume.h"
__kernel void kernel_ocl_path_trace_do_volume(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_do_volume((KernelGlobals*)kg);
}
#define KERNEL_NAME do_volume
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_enqueue_inactive.h"
__kernel void kernel_ocl_path_trace_enqueue_inactive(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local unsigned int local_queue_atomics;
kernel_enqueue_inactive((KernelGlobals*)kg, &local_queue_atomics);
}
#define KERNEL_NAME enqueue_inactive
#define LOCALS_TYPE unsigned int
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -18,12 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local BackgroundAOLocals locals;
kernel_holdout_emission_blurring_pathtermination_ao(
(KernelGlobals*)kg,
&locals);
}
#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
#define LOCALS_TYPE BackgroundAOLocals
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_background.h"
__kernel void kernel_ocl_path_trace_indirect_background(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_indirect_background((KernelGlobals*)kg);
}
#define KERNEL_NAME indirect_background
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_subsurface.h"
__kernel void kernel_ocl_path_trace_indirect_subsurface(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_indirect_subsurface((KernelGlobals*)kg);
}
#define KERNEL_NAME indirect_subsurface
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_lamp_emission.h"
__kernel void kernel_ocl_path_trace_lamp_emission(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_lamp_emission((KernelGlobals*)kg);
}
#define KERNEL_NAME lamp_emission
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_next_iteration_setup.h"
__kernel void kernel_ocl_path_trace_next_iteration_setup(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local unsigned int local_queue_atomics;
kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
}
#define KERNEL_NAME next_iteration_setup
#define LOCALS_TYPE unsigned int
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_path_init.h"
__kernel void kernel_ocl_path_trace_path_init(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_path_init((KernelGlobals*)kg);
}
#define KERNEL_NAME path_init
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_queue_enqueue.h"
__kernel void kernel_ocl_path_trace_queue_enqueue(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local QueueEnqueueLocals locals;
kernel_queue_enqueue((KernelGlobals*)kg, &locals);
}
#define KERNEL_NAME queue_enqueue
#define LOCALS_TYPE QueueEnqueueLocals
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_scene_intersect.h"
__kernel void kernel_ocl_path_trace_scene_intersect(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_scene_intersect((KernelGlobals*)kg);
}
#define KERNEL_NAME scene_intersect
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shader_eval.h"
__kernel void kernel_ocl_path_trace_shader_eval(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_shader_eval((KernelGlobals*)kg);
}
#define KERNEL_NAME shader_eval
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shader_setup.h"
__kernel void kernel_ocl_path_trace_shader_setup(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local unsigned int local_queue_atomics;
kernel_shader_setup((KernelGlobals*)kg, &local_queue_atomics);
}
#define KERNEL_NAME shader_setup
#define LOCALS_TYPE unsigned int
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -19,10 +19,9 @@
#include "kernel/split/kernel_shader_sort.h"
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void kernel_ocl_path_trace_shader_sort(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local ShaderSortLocals locals;
kernel_shader_sort((KernelGlobals*)kg, &locals);
}
#define KERNEL_NAME shader_sort
#define LOCALS_TYPE ShaderSortLocals
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME
#undef LOCALS_TYPE

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_ao.h"
__kernel void kernel_ocl_path_trace_shadow_blocked_ao(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_shadow_blocked_ao((KernelGlobals*)kg);
}
#define KERNEL_NAME shadow_blocked_ao
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_dl.h"
__kernel void kernel_ocl_path_trace_shadow_blocked_dl(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_shadow_blocked_dl((KernelGlobals*)kg);
}
#define KERNEL_NAME shadow_blocked_dl
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME

@ -0,0 +1,72 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define KERNEL_NAME_JOIN(a, b) a ## _ ## b
#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
ccl_global char *kg_global,
ccl_constant KernelData *data,
ccl_global void *split_data_buffer,
ccl_global char *ray_state,
ccl_global uint *rng_state,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel/kernel_textures.h"
ccl_global int *queue_index,
ccl_global char *use_queues_flag,
ccl_global unsigned int *work_pools,
ccl_global float *buffer
)
{
#ifdef LOCALS_TYPE
ccl_local LOCALS_TYPE locals;
#endif
KernelGlobals *kg = (KernelGlobals*)kg_global;
if(ccl_local_id(0) + ccl_local_id(1) == 0) {
kg->data = data;
kernel_split_params.rng_state = rng_state;
kernel_split_params.queue_index = queue_index;
kernel_split_params.use_queues_flag = use_queues_flag;
kernel_split_params.work_pools = work_pools;
kernel_split_params.buffer = buffer;
split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "kernel/kernel_textures.h"
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
kg
#ifdef LOCALS_TYPE
, &locals
#endif
);
}
#undef KERNEL_NAME_JOIN
#undef KERNEL_NAME_EVAL

@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_subsurface_scatter.h"
__kernel void kernel_ocl_path_trace_subsurface_scatter(
ccl_global char *kg,
ccl_constant KernelData *data)
{
kernel_subsurface_scatter((KernelGlobals*)kg);
}
#define KERNEL_NAME subsurface_scatter
#include "kernel/kernels/opencl/kernel_split_function.h"
#undef KERNEL_NAME