From fb99ea79f84b49bf3de2d80c14a08c9040dc4ac1 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 5 Oct 2017 15:17:09 +0200 Subject: [PATCH] Code refactor: split displace/background into separate kernels, remove luma. --- intern/cycles/device/device_cpu.cpp | 9 +- intern/cycles/device/device_cuda.cpp | 9 +- intern/cycles/device/device_multi.cpp | 1 - intern/cycles/device/device_network.cpp | 4 - intern/cycles/device/device_network.h | 4 +- intern/cycles/device/device_task.cpp | 2 +- intern/cycles/device/device_task.h | 2 +- intern/cycles/device/opencl/opencl_base.cpp | 21 ++-- intern/cycles/kernel/kernel_bake.h | 101 ++++++++---------- intern/cycles/kernel/kernel_shader.h | 2 +- intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 1 - .../kernel/kernels/cpu/kernel_cpu_impl.h | 13 +-- intern/cycles/kernel/kernels/cuda/kernel.cu | 41 ++++--- intern/cycles/kernel/kernels/opencl/kernel.cl | 33 ++++-- intern/cycles/kernel/osl/osl_shader.cpp | 6 +- intern/cycles/kernel/osl/osl_shader.h | 2 +- intern/cycles/render/bake.cpp | 1 + intern/cycles/render/light.cpp | 1 + intern/cycles/render/mesh_displace.cpp | 1 + 19 files changed, 129 insertions(+), 125 deletions(-) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index ff34f4f9ce4..19e3c0a9075 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -171,10 +171,10 @@ public: DeviceRequestedFeatures requested_features; - KernelFunctions path_trace_kernel; - KernelFunctions convert_to_half_float_kernel; - KernelFunctions convert_to_byte_kernel; - KernelFunctions shader_kernel; + KernelFunctions path_trace_kernel; + KernelFunctions convert_to_half_float_kernel; + KernelFunctions convert_to_byte_kernel; + KernelFunctions shader_kernel; KernelFunctions filter_divide_shadow_kernel; KernelFunctions filter_get_feature_kernel; @@ -756,7 +756,6 @@ public: shader_kernel()(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, - (float*)task.shader_output_luma, task.shader_eval_type, task.shader_filter, x, diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 8cfc5332e94..734edcff503 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1424,14 +1424,16 @@ public: CUfunction cuShader; CUdeviceptr d_input = cuda_device_ptr(task.shader_input); CUdeviceptr d_output = cuda_device_ptr(task.shader_output); - CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma); /* get kernel function */ if(task.shader_eval_type >= SHADER_EVAL_BAKE) { cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); } + else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); + } else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")); + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); } /* do tasks in smaller chunks, so we can cancel it */ @@ -1450,9 +1452,6 @@ public: int arg = 0; args[arg++] = &d_input; args[arg++] = &d_output; - if(task.shader_eval_type < SHADER_EVAL_BAKE) { - args[arg++] = &d_output_luma; - } args[arg++] = &task.shader_eval_type; if(task.shader_eval_type >= SHADER_EVAL_BAKE) { args[arg++] = &task.shader_filter; diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 164ed50bdf6..b17b972b06f 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -383,7 +383,6 @@ public: if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half]; if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input]; if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output]; - if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma]; sub.device->task_add(subtask); } diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 4ff8647f66b..deea59f1d23 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -660,10 +660,6 @@ protected: if(task.shader_output) task.shader_output = device_ptr_from_client_pointer(task.shader_output); - if(task.shader_output_luma) - task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this); diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index 7bfebaf5aec..3d3bd99dfe7 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -132,7 +132,7 @@ public: archive & type & task.x & task.y & task.w & task.h; archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; archive & task.offset & task.stride; - archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type; + archive & task.shader_input & task.shader_output & task.shader_eval_type; archive & task.shader_x & task.shader_w; archive & task.need_finish_queue; } @@ -291,7 +291,7 @@ public: *archive & type & task.x & task.y & task.w & task.h; *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; *archive & task.offset & task.stride; - *archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type; + *archive & task.shader_input & task.shader_output & task.shader_eval_type; *archive & task.shader_x & task.shader_w; *archive & task.need_finish_queue; diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 3bc4c310283..3c7d24fb5b7 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -31,7 +31,7 @@ CCL_NAMESPACE_BEGIN DeviceTask::DeviceTask(Type type_) : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0), sample(0), num_samples(1), - shader_input(0), shader_output(0), shader_output_luma(0), + shader_input(0), shader_output(0), shader_eval_type(0), shader_filter(0), shader_x(0), shader_w(0) { last_update_time = time_dt(); diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 44a1efff1f5..b9658eb978f 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -46,7 +46,7 @@ public: int offset, stride; device_ptr shader_input; - device_ptr shader_output, shader_output_luma; + device_ptr shader_output; int shader_eval_type; int shader_filter; int shader_x, shader_w; diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 8095611f099..3db3efd1103 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -228,7 +228,8 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features)); base_program.add_kernel(ustring("convert_to_byte")); base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("shader")); + base_program.add_kernel(ustring("displace")); + base_program.add_kernel(ustring("background")); base_program.add_kernel(ustring("bake")); base_program.add_kernel(ustring("zero_buffer")); @@ -1112,7 +1113,6 @@ void OpenCLDeviceBase::shader(DeviceTask& task) cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); cl_mem d_input = CL_MEM_PTR(task.shader_input); cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma); cl_int d_shader_eval_type = task.shader_eval_type; cl_int d_shader_filter = task.shader_filter; cl_int d_shader_x = task.shader_x; @@ -1121,10 +1121,15 @@ void OpenCLDeviceBase::shader(DeviceTask& task) cl_kernel kernel; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) + if(task.shader_eval_type >= SHADER_EVAL_BAKE) { kernel = base_program(ustring("bake")); - else - kernel = base_program(ustring("shader")); + } + else if(task.shader_eval_type >= SHADER_EVAL_DISPLACE) { + kernel = base_program(ustring("displace")); + } + else { + kernel = base_program(ustring("background")); + } cl_uint start_arg_index = kernel_set_args(kernel, @@ -1133,12 +1138,6 @@ void OpenCLDeviceBase::shader(DeviceTask& task) d_input, d_output); - if(task.shader_eval_type < SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_output_luma); - } - set_kernel_arg_buffers(kernel, &start_arg_index); start_arg_index += kernel_set_args(kernel, diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 0d10e17a593..84d8d84d486 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -493,78 +493,69 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, #endif /* __BAKING__ */ -ccl_device void kernel_shader_evaluate(KernelGlobals *kg, - ccl_global uint4 *input, - ccl_global float4 *output, - ccl_global float *output_luma, - ShaderEvalType type, - int i, - int sample) +ccl_device void kernel_displace_evaluate(KernelGlobals *kg, + ccl_global uint4 *input, + ccl_global float4 *output, + int i) { ShaderData sd; PathState state = {0}; uint4 in = input[i]; - float3 out; - if(type == SHADER_EVAL_DISPLACE) { - /* setup shader data */ - int object = in.x; - int prim = in.y; - float u = __uint_as_float(in.z); - float v = __uint_as_float(in.w); + /* setup shader data */ + int object = in.x; + int prim = in.y; + float u = __uint_as_float(in.z); + float v = __uint_as_float(in.w); - shader_setup_from_displace(kg, &sd, object, prim, u, v); + shader_setup_from_displace(kg, &sd, object, prim, u, v); - /* evaluate */ - float3 P = sd.P; - shader_eval_displacement(kg, &sd, &state); - out = sd.P - P; + /* evaluate */ + float3 P = sd.P; + shader_eval_displacement(kg, &sd, &state); + float3 D = sd.P - P; - object_inverse_dir_transform(kg, &sd, &out); - } - else { // SHADER_EVAL_BACKGROUND - /* setup ray */ - Ray ray; - float u = __uint_as_float(in.x); - float v = __uint_as_float(in.y); + object_inverse_dir_transform(kg, &sd, &D); - ray.P = make_float3(0.0f, 0.0f, 0.0f); - ray.D = equirectangular_to_direction(u, v); - ray.t = 0.0f; + /* write output */ + output[i] += make_float4(D.x, D.y, D.z, 0.0f); +} + +ccl_device void kernel_background_evaluate(KernelGlobals *kg, + ccl_global uint4 *input, + ccl_global float4 *output, + int i) +{ + ShaderData sd; + PathState state = {0}; + uint4 in = input[i]; + + /* setup ray */ + Ray ray; + float u = __uint_as_float(in.x); + float v = __uint_as_float(in.y); + + ray.P = make_float3(0.0f, 0.0f, 0.0f); + ray.D = equirectangular_to_direction(u, v); + ray.t = 0.0f; #ifdef __CAMERA_MOTION__ - ray.time = 0.5f; + ray.time = 0.5f; #endif #ifdef __RAY_DIFFERENTIALS__ - ray.dD = differential3_zero(); - ray.dP = differential3_zero(); + ray.dD = differential3_zero(); + ray.dP = differential3_zero(); #endif - /* setup shader data */ - shader_setup_from_background(kg, &sd, &ray); + /* setup shader data */ + shader_setup_from_background(kg, &sd, &ray); + + /* evaluate */ + int flag = 0; /* we can't know which type of BSDF this is for */ + float3 color = shader_eval_background(kg, &sd, &state, flag); - /* evaluate */ - int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag); - } - /* write output */ - if(sample == 0) { - if(output != NULL) { - output[i] = make_float4(out.x, out.y, out.z, 0.0f); - } - if(output_luma != NULL) { - output_luma[i] = average(out); - } - } - else { - if(output != NULL) { - output[i] += make_float4(out.x, out.y, out.z, 0.0f); - } - if(output_luma != NULL) { - output_luma[i] += average(out); - } - } + output[i] += make_float4(color.x, color.y, color.z, 0.0f); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index eeb4eb0097f..695d4fc380a 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -1204,7 +1204,7 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_ #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) - OSLShader::eval_displacement(kg, sd); + OSLShader::eval_displacement(kg, sd, state); else # endif { diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index f5ebf4ad73f..6bdb8546a24 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -41,7 +41,6 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, - float *output_luma, int type, int filter, int i, diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 3fefc1b7e9c..fdeb7dcd3e4 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -149,7 +149,6 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, - float *output_luma, int type, int filter, int i, @@ -160,7 +159,6 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, STUB_ASSERT(KERNEL_ARCH, shader); #else if(type >= SHADER_EVAL_BAKE) { - kernel_assert(output_luma == NULL); # ifdef __BAKING__ kernel_bake_evaluate(kg, input, @@ -172,14 +170,11 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, sample); # endif } + else if(type == SHADER_EVAL_DISPLACE) { + kernel_displace_evaluate(kg, input, output, i); + } else { - kernel_shader_evaluate(kg, - input, - output, - output_luma, - (ShaderEvalType)type, - i, - sample); + kernel_background_evaluate(kg, input, output, i); } #endif /* KERNEL_STUB */ } diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index e72edfa7bdf..1ac6afd167a 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -91,26 +91,37 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_shader(uint4 *input, - float4 *output, - float *output_luma, - int type, - int sx, - int sw, - int offset, - int sample) +kernel_cuda_displace(uint4 *input, + float4 *output, + int type, + int sx, + int sw, + int offset, + int sample) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; if(x < sx + sw) { KernelGlobals kg; - kernel_shader_evaluate(&kg, - input, - output, - output_luma, - (ShaderEvalType)type, - x, - sample); + kernel_displace_evaluate(&kg, input, output, x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_background(uint4 *input, + float4 *output, + int type, + int sx, + int sw, + int offset, + int sample) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + + if(x < sx + sw) { + KernelGlobals kg; + kernel_background_evaluate(&kg, input, output, x); } } diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 521b86121ff..66b6e19de84 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -72,11 +72,10 @@ __kernel void kernel_ocl_path_trace( #else /* __COMPILE_ONLY_MEGAKERNEL__ */ -__kernel void kernel_ocl_shader( +__kernel void kernel_ocl_displace( ccl_constant KernelData *data, ccl_global uint4 *input, ccl_global float4 *output, - ccl_global float *output_luma, KERNEL_BUFFER_PARAMS, @@ -92,13 +91,29 @@ __kernel void kernel_ocl_shader( int x = sx + ccl_global_id(0); if(x < sx + sw) { - kernel_shader_evaluate(kg, - input, - output, - output_luma, - (ShaderEvalType)type, - x, - sample); + kernel_displace_evaluate(kg, input, output, x); + } +} +__kernel void kernel_ocl_background( + ccl_constant KernelData *data, + ccl_global uint4 *input, + ccl_global float4 *output, + + KERNEL_BUFFER_PARAMS, + + int type, int sx, int sw, int offset, int sample) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); + + int x = sx + ccl_global_id(0); + + if(x < sx + sw) { + kernel_background_evaluate(kg, input, output, x); } } diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 9a37e0987aa..6b3a996ca12 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -348,14 +348,12 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* Displacement */ -void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd) +void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; - PathState state = {0}; - - shaderdata_to_shaderglobals(kg, sd, &state, 0, tdata); + shaderdata_to_shaderglobals(kg, sd, state, 0, tdata); /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index f7020d1223d..6b392b25cf7 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -56,7 +56,7 @@ public: static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); - static void eval_displacement(KernelGlobals *kg, ShaderData *sd); + static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state); /* attributes */ static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc); diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index c0fcd517390..2bedf3668f7 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -174,6 +174,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre device->mem_alloc("bake_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); device->mem_alloc("bake_output", d_output, MEM_READ_WRITE); + device->mem_zero(d_output); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 4adc00bc839..6a7f985b756 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -60,6 +60,7 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY); + device->mem_zero(d_output); DeviceTask main_task(DeviceTask::SHADER); main_task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index 4ca20cf7ef3..350a56bf185 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -124,6 +124,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me device->mem_alloc("displace_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY); + device->mem_zero(d_output); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer;