Code refactor: split displace/background into separate kernels, remove luma.

This commit is contained in:
Brecht Van Lommel 2017-10-05 15:17:09 +02:00
parent d8509b349d
commit fb99ea79f8
19 changed files with 129 additions and 125 deletions

@ -171,10 +171,10 @@ public:
DeviceRequestedFeatures requested_features;
KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel;
KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel;
@ -756,7 +756,6 @@ public:
shader_kernel()(&kg,
(uint4*)task.shader_input,
(float4*)task.shader_output,
(float*)task.shader_output_luma,
task.shader_eval_type,
task.shader_filter,
x,

@ -1424,14 +1424,16 @@ public:
CUfunction cuShader;
CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
/* get kernel function */
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
}
else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
}
else {
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
}
/* do tasks in smaller chunks, so we can cancel it */
@ -1450,9 +1452,6 @@ public:
int arg = 0;
args[arg++] = &d_input;
args[arg++] = &d_output;
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
args[arg++] = &d_output_luma;
}
args[arg++] = &task.shader_eval_type;
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
args[arg++] = &task.shader_filter;

@ -383,7 +383,6 @@ public:
if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma];
sub.device->task_add(subtask);
}

@ -660,10 +660,6 @@ protected:
if(task.shader_output)
task.shader_output = device_ptr_from_client_pointer(task.shader_output);
if(task.shader_output_luma)
task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma);
task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this);

@ -132,7 +132,7 @@ public:
archive & type & task.x & task.y & task.w & task.h;
archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
archive & task.offset & task.stride;
archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
archive & task.shader_input & task.shader_output & task.shader_eval_type;
archive & task.shader_x & task.shader_w;
archive & task.need_finish_queue;
}
@ -291,7 +291,7 @@ public:
*archive & type & task.x & task.y & task.w & task.h;
*archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
*archive & task.offset & task.stride;
*archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
*archive & task.shader_input & task.shader_output & task.shader_eval_type;
*archive & task.shader_x & task.shader_w;
*archive & task.need_finish_queue;

@ -31,7 +31,7 @@ CCL_NAMESPACE_BEGIN
DeviceTask::DeviceTask(Type type_)
: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
sample(0), num_samples(1),
shader_input(0), shader_output(0), shader_output_luma(0),
shader_input(0), shader_output(0),
shader_eval_type(0), shader_filter(0), shader_x(0), shader_w(0)
{
last_update_time = time_dt();

@ -46,7 +46,7 @@ public:
int offset, stride;
device_ptr shader_input;
device_ptr shader_output, shader_output_luma;
device_ptr shader_output;
int shader_eval_type;
int shader_filter;
int shader_x, shader_w;

@ -228,7 +228,8 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
base_program.add_kernel(ustring("convert_to_byte"));
base_program.add_kernel(ustring("convert_to_half_float"));
base_program.add_kernel(ustring("shader"));
base_program.add_kernel(ustring("displace"));
base_program.add_kernel(ustring("background"));
base_program.add_kernel(ustring("bake"));
base_program.add_kernel(ustring("zero_buffer"));
@ -1112,7 +1113,6 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
cl_mem d_input = CL_MEM_PTR(task.shader_input);
cl_mem d_output = CL_MEM_PTR(task.shader_output);
cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
cl_int d_shader_eval_type = task.shader_eval_type;
cl_int d_shader_filter = task.shader_filter;
cl_int d_shader_x = task.shader_x;
@ -1121,10 +1121,15 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
cl_kernel kernel;
if(task.shader_eval_type >= SHADER_EVAL_BAKE)
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
kernel = base_program(ustring("bake"));
else
kernel = base_program(ustring("shader"));
}
else if(task.shader_eval_type >= SHADER_EVAL_DISPLACE) {
kernel = base_program(ustring("displace"));
}
else {
kernel = base_program(ustring("background"));
}
cl_uint start_arg_index =
kernel_set_args(kernel,
@ -1133,12 +1138,6 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
d_input,
d_output);
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
start_arg_index += kernel_set_args(kernel,
start_arg_index,
d_output_luma);
}
set_kernel_arg_buffers(kernel, &start_arg_index);
start_arg_index += kernel_set_args(kernel,

@ -493,78 +493,69 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
#endif /* __BAKING__ */
ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
ccl_global uint4 *input,
ccl_global float4 *output,
ccl_global float *output_luma,
ShaderEvalType type,
int i,
int sample)
ccl_device void kernel_displace_evaluate(KernelGlobals *kg,
ccl_global uint4 *input,
ccl_global float4 *output,
int i)
{
ShaderData sd;
PathState state = {0};
uint4 in = input[i];
float3 out;
if(type == SHADER_EVAL_DISPLACE) {
/* setup shader data */
int object = in.x;
int prim = in.y;
float u = __uint_as_float(in.z);
float v = __uint_as_float(in.w);
/* setup shader data */
int object = in.x;
int prim = in.y;
float u = __uint_as_float(in.z);
float v = __uint_as_float(in.w);
shader_setup_from_displace(kg, &sd, object, prim, u, v);
shader_setup_from_displace(kg, &sd, object, prim, u, v);
/* evaluate */
float3 P = sd.P;
shader_eval_displacement(kg, &sd, &state);
out = sd.P - P;
/* evaluate */
float3 P = sd.P;
shader_eval_displacement(kg, &sd, &state);
float3 D = sd.P - P;
object_inverse_dir_transform(kg, &sd, &out);
}
else { // SHADER_EVAL_BACKGROUND
/* setup ray */
Ray ray;
float u = __uint_as_float(in.x);
float v = __uint_as_float(in.y);
object_inverse_dir_transform(kg, &sd, &D);
ray.P = make_float3(0.0f, 0.0f, 0.0f);
ray.D = equirectangular_to_direction(u, v);
ray.t = 0.0f;
/* write output */
output[i] += make_float4(D.x, D.y, D.z, 0.0f);
}
ccl_device void kernel_background_evaluate(KernelGlobals *kg,
ccl_global uint4 *input,
ccl_global float4 *output,
int i)
{
ShaderData sd;
PathState state = {0};
uint4 in = input[i];
/* setup ray */
Ray ray;
float u = __uint_as_float(in.x);
float v = __uint_as_float(in.y);
ray.P = make_float3(0.0f, 0.0f, 0.0f);
ray.D = equirectangular_to_direction(u, v);
ray.t = 0.0f;
#ifdef __CAMERA_MOTION__
ray.time = 0.5f;
ray.time = 0.5f;
#endif
#ifdef __RAY_DIFFERENTIALS__
ray.dD = differential3_zero();
ray.dP = differential3_zero();
ray.dD = differential3_zero();
ray.dP = differential3_zero();
#endif
/* setup shader data */
shader_setup_from_background(kg, &sd, &ray);
/* setup shader data */
shader_setup_from_background(kg, &sd, &ray);
/* evaluate */
int flag = 0; /* we can't know which type of BSDF this is for */
float3 color = shader_eval_background(kg, &sd, &state, flag);
/* evaluate */
int flag = 0; /* we can't know which type of BSDF this is for */
out = shader_eval_background(kg, &sd, &state, flag);
}
/* write output */
if(sample == 0) {
if(output != NULL) {
output[i] = make_float4(out.x, out.y, out.z, 0.0f);
}
if(output_luma != NULL) {
output_luma[i] = average(out);
}
}
else {
if(output != NULL) {
output[i] += make_float4(out.x, out.y, out.z, 0.0f);
}
if(output_luma != NULL) {
output_luma[i] += average(out);
}
}
output[i] += make_float4(color.x, color.y, color.z, 0.0f);
}
CCL_NAMESPACE_END

@ -1204,7 +1204,7 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_
#ifdef __SVM__
# ifdef __OSL__
if(kg->osl)
OSLShader::eval_displacement(kg, sd);
OSLShader::eval_displacement(kg, sd, state);
else
# endif
{

@ -41,7 +41,6 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
uint4 *input,
float4 *output,
float *output_luma,
int type,
int filter,
int i,

@ -149,7 +149,6 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
uint4 *input,
float4 *output,
float *output_luma,
int type,
int filter,
int i,
@ -160,7 +159,6 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
STUB_ASSERT(KERNEL_ARCH, shader);
#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
@ -172,14 +170,11 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
sample);
# endif
}
else if(type == SHADER_EVAL_DISPLACE) {
kernel_displace_evaluate(kg, input, output, i);
}
else {
kernel_shader_evaluate(kg,
input,
output,
output_luma,
(ShaderEvalType)type,
i,
sample);
kernel_background_evaluate(kg, input, output, i);
}
#endif /* KERNEL_STUB */
}

@ -91,26 +91,37 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_shader(uint4 *input,
float4 *output,
float *output_luma,
int type,
int sx,
int sw,
int offset,
int sample)
kernel_cuda_displace(uint4 *input,
float4 *output,
int type,
int sx,
int sw,
int offset,
int sample)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
if(x < sx + sw) {
KernelGlobals kg;
kernel_shader_evaluate(&kg,
input,
output,
output_luma,
(ShaderEvalType)type,
x,
sample);
kernel_displace_evaluate(&kg, input, output, x);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_background(uint4 *input,
float4 *output,
int type,
int sx,
int sw,
int offset,
int sample)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
if(x < sx + sw) {
KernelGlobals kg;
kernel_background_evaluate(&kg, input, output, x);
}
}

@ -72,11 +72,10 @@ __kernel void kernel_ocl_path_trace(
#else /* __COMPILE_ONLY_MEGAKERNEL__ */
__kernel void kernel_ocl_shader(
__kernel void kernel_ocl_displace(
ccl_constant KernelData *data,
ccl_global uint4 *input,
ccl_global float4 *output,
ccl_global float *output_luma,
KERNEL_BUFFER_PARAMS,
@ -92,13 +91,29 @@ __kernel void kernel_ocl_shader(
int x = sx + ccl_global_id(0);
if(x < sx + sw) {
kernel_shader_evaluate(kg,
input,
output,
output_luma,
(ShaderEvalType)type,
x,
sample);
kernel_displace_evaluate(kg, input, output, x);
}
}
__kernel void kernel_ocl_background(
ccl_constant KernelData *data,
ccl_global uint4 *input,
ccl_global float4 *output,
KERNEL_BUFFER_PARAMS,
int type, int sx, int sw, int offset, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
kg->data = data;
kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
if(x < sx + sw) {
kernel_background_evaluate(kg, input, output, x);
}
}

@ -348,14 +348,12 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
/* Displacement */
void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd)
void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
PathState state = {0};
shaderdata_to_shaderglobals(kg, sd, &state, 0, tdata);
shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
/* execute shader */
OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;

@ -56,7 +56,7 @@ public:
static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
static void eval_displacement(KernelGlobals *kg, ShaderData *sd);
static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state);
/* attributes */
static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc);

@ -174,6 +174,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
device->mem_alloc("bake_input", d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
device->mem_alloc("bake_output", d_output, MEM_READ_WRITE);
device->mem_zero(d_output);
DeviceTask task(DeviceTask::SHADER);
task.shader_input = d_input.device_pointer;

@ -60,6 +60,7 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY);
device->mem_zero(d_output);
DeviceTask main_task(DeviceTask::SHADER);
main_task.shader_input = d_input.device_pointer;

@ -124,6 +124,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
device->mem_alloc("displace_input", d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY);
device->mem_zero(d_output);
DeviceTask task(DeviceTask::SHADER);
task.shader_input = d_input.device_pointer;