From d2bb0e660bdec00164a9fc72d145e308fb723d16 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Tue, 31 May 2016 17:47:54 +0200 Subject: [PATCH] Fix T46207: Slow OpenCL GPU bake and blown out baking Cycles render --- intern/cycles/device/device_opencl.cpp | 87 +++++++++++++++----------- intern/cycles/kernel/kernel_bake.h | 8 +-- intern/cycles/render/bake.cpp | 2 +- 3 files changed, 53 insertions(+), 44 deletions(-) diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index c7dcf7602df..afe21c49730 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -1224,18 +1224,28 @@ public: CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); - - /* try to divide evenly over 2 dimensions */ + + /* Try to divide evenly over 2 dimensions. */ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; - /* some implementations have max size 1 on 2nd dimension */ + /* Some implementations have max size 1 on 2nd dimension. */ if(local_size[1] > max_work_items[1]) { local_size[0] = workgroup_size/max_work_items[1]; local_size[1] = max_work_items[1]; } - size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)}; + size_t global_size[2] = {global_size_round_up(local_size[0], w), + global_size_round_up(local_size[1], h)}; + + /* Vertical size of 1 is coming from bake/shade kernels where we should + * not round anything up because otherwise we'll either be doing too + * much work per pixel (if we don't check global ID on Y axis) or will + * be checking for global ID to always have Y of 0. + */ + if (h == 1) { + global_size[h] = 1; + } /* run kernel */ opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); @@ -1320,48 +1330,49 @@ public: else kernel = ckShaderKernel; + cl_uint start_arg_index = + kernel_set_args(kernel, + 0, + d_data, + d_input, + d_output); + + if(task.shader_eval_type < SHADER_EVAL_BAKE) { + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_output_luma); + } + +#define KERNEL_TEX(type, ttype, name) \ + set_kernel_arg_mem(kernel, &start_arg_index, #name); +#include "kernel_textures.h" +#undef KERNEL_TEX + + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_shader_eval_type); + if(task.shader_eval_type >= SHADER_EVAL_BAKE) { + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_shader_filter); + } + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_shader_x, + d_shader_w, + d_offset); + for(int sample = 0; sample < task.num_samples; sample++) { if(task.get_cancel()) break; - cl_int d_sample = sample; - - cl_uint start_arg_index = - kernel_set_args(kernel, - 0, - d_data, - d_input, - d_output); - - if(task.shader_eval_type < SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_output_luma); - } - -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(kernel, &start_arg_index, #name); -#include "kernel_textures.h" -#undef KERNEL_TEX - - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_eval_type); - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_x, - d_shader_w, - d_offset, - d_sample); + kernel_set_args(kernel, start_arg_index, sample); enqueue_kernel(kernel, task.shader_w, 1); + clFinish(cqCommandQueue); + task.update_progress(NULL); } } diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 3966a06fe33..8d05befe1d4 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -482,12 +482,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } /* write output */ - float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f; + const float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f; + const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac; - if(sample == 0) - output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac; - else - output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac; + output[i] = (sample == 0)? scaled_result: output[i] + scaled_result; } #endif /* __BAKING__ */ diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index 5bf5e5113ef..13310a61761 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -177,7 +177,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre device->mem_alloc(d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc(d_output, MEM_READ_WRITE); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer;