forked from bartvdbraak/blender
Fix T46207: Slow OpenCL GPU bake and blown out baking Cycles render
This commit is contained in:
parent
ad1c3bef8b
commit
d2bb0e660b
@ -1224,18 +1224,28 @@ public:
|
||||
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
|
||||
clGetDeviceInfo(cdDevice,
|
||||
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
|
||||
|
||||
/* try to divide evenly over 2 dimensions */
|
||||
|
||||
/* Try to divide evenly over 2 dimensions. */
|
||||
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
|
||||
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
|
||||
|
||||
/* some implementations have max size 1 on 2nd dimension */
|
||||
/* Some implementations have max size 1 on 2nd dimension. */
|
||||
if(local_size[1] > max_work_items[1]) {
|
||||
local_size[0] = workgroup_size/max_work_items[1];
|
||||
local_size[1] = max_work_items[1];
|
||||
}
|
||||
|
||||
size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
|
||||
size_t global_size[2] = {global_size_round_up(local_size[0], w),
|
||||
global_size_round_up(local_size[1], h)};
|
||||
|
||||
/* A vertical size of 1 comes from the bake/shade kernels, where we must
|
||||
* not round anything up: otherwise we would either do too much work per
|
||||
* pixel (if the kernel does not check the global ID on the Y axis) or
|
||||
* have to check in the kernel that the global ID on Y is always 0.
|
||||
*/
|
||||
if (h == 1) {
|
||||
global_size[h] = 1;
|
||||
}
|
||||
|
||||
/* run kernel */
|
||||
opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
|
||||
@ -1320,48 +1330,49 @@ public:
|
||||
else
|
||||
kernel = ckShaderKernel;
|
||||
|
||||
cl_uint start_arg_index =
|
||||
kernel_set_args(kernel,
|
||||
0,
|
||||
d_data,
|
||||
d_input,
|
||||
d_output);
|
||||
|
||||
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_output_luma);
|
||||
}
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
set_kernel_arg_mem(kernel, &start_arg_index, #name);
|
||||
#include "kernel_textures.h"
|
||||
#undef KERNEL_TEX
|
||||
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_shader_eval_type);
|
||||
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_shader_filter);
|
||||
}
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_shader_x,
|
||||
d_shader_w,
|
||||
d_offset);
|
||||
|
||||
for(int sample = 0; sample < task.num_samples; sample++) {
|
||||
|
||||
if(task.get_cancel())
|
||||
break;
|
||||
|
||||
cl_int d_sample = sample;
|
||||
|
||||
cl_uint start_arg_index =
|
||||
kernel_set_args(kernel,
|
||||
0,
|
||||
d_data,
|
||||
d_input,
|
||||
d_output);
|
||||
|
||||
if(task.shader_eval_type < SHADER_EVAL_BAKE) {
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_output_luma);
|
||||
}
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
set_kernel_arg_mem(kernel, &start_arg_index, #name);
|
||||
#include "kernel_textures.h"
|
||||
#undef KERNEL_TEX
|
||||
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_shader_eval_type);
|
||||
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_shader_filter);
|
||||
}
|
||||
start_arg_index += kernel_set_args(kernel,
|
||||
start_arg_index,
|
||||
d_shader_x,
|
||||
d_shader_w,
|
||||
d_offset,
|
||||
d_sample);
|
||||
kernel_set_args(kernel, start_arg_index, sample);
|
||||
|
||||
enqueue_kernel(kernel, task.shader_w, 1);
|
||||
|
||||
clFinish(cqCommandQueue);
|
||||
|
||||
task.update_progress(NULL);
|
||||
}
|
||||
}
|
||||
|
@ -482,12 +482,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
|
||||
}
|
||||
|
||||
/* write output */
|
||||
float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
|
||||
const float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
|
||||
const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
|
||||
|
||||
if(sample == 0)
|
||||
output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
|
||||
else
|
||||
output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
|
||||
output[i] = (sample == 0)? scaled_result: output[i] + scaled_result;
|
||||
}
|
||||
|
||||
#endif /* __BAKING__ */
|
||||
|
@ -177,7 +177,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
|
||||
|
||||
device->mem_alloc(d_input, MEM_READ_ONLY);
|
||||
device->mem_copy_to(d_input);
|
||||
device->mem_alloc(d_output, MEM_WRITE_ONLY);
|
||||
device->mem_alloc(d_output, MEM_READ_WRITE);
|
||||
|
||||
DeviceTask task(DeviceTask::SHADER);
|
||||
task.shader_input = d_input.device_pointer;
|
||||
|
Loading…
Reference in New Issue
Block a user