blender/intern/cycles/device/device_cpu.cpp
Lukas Toenne 31ed71cb6b Performance fix for Cycles: Don't wait in the main UI thread when resetting devices.
When the scene is updated Cycles resets the renderer device, cancelling
all existing tasks. The main thread would wait for all running tasks to
finish before continuing. This is ok when tasks can actually cancel in a
timely fashion. For OSL, however, this does not work, since the OSL
shader group optimization takes quite a bit of time and cannot easily
be cancelled once running (on my crappy machine in full debug mode:
~0.12 seconds for simple node trees). This would lead to very
laggy UI behavior and make it difficult to accurately control elements
such as sliders.

This patch removes the wait condition from the device->task_cancel
method. Instead, it just sets the do_cancel flag and returns. To avoid
a backlog in the device's task pool, the BlenderSession::sync function
returns early while the reset is going on (tested in
Session::resetting). Once all existing tasks have finished, the
do_cancel flag is finally cleared again (checked in
TaskPool::num_decrease).
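
In essence the cancel path reduces to the following pattern (a minimal
sketch only, not the actual TaskPool code: std::mutex stands in for the
util_thread wrappers, and the real pool carries more state than shown):

#include <mutex>

class TaskPoolSketch {
public:
	TaskPoolSketch() : num(0), do_cancel(false) {}

	/* non-blocking cancel: raise the flag and return immediately,
	 * running tasks poll it and bail out on their own */
	void cancel()
	{
		std::lock_guard<std::mutex> lock(num_mutex);
		if(num)  /* guard so an empty pool does not keep a stale flag */
			do_cancel = true;
	}

	bool cancelled()
	{
		return do_cancel;
	}

protected:
	/* called as each task finishes; the last one clears the flag */
	void num_decrease(int done)
	{
		std::lock_guard<std::mutex> lock(num_mutex);
		num -= done;
		if(num == 0)
			do_cancel = false;
	}

	std::mutex num_mutex;
	int num;                 /* number of tasks still in flight */
	volatile bool do_cancel; /* polled by workers without the lock */
};

The important property is that cancel() never blocks: the main thread
returns to the UI immediately, and the flag is reset by whichever task
finishes last.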

Care has to be taken to avoid race conditions on the do_cancel flag,
since it can now be modified outside the TaskPool::cancel function
itself. For this purpose the scope of the TaskPool::num_mutex locks has
been extended: in most cases the mutex is now locked by the TaskPool
itself before calling TaskScheduler methods, instead of only inside the
num_increase/num_decrease functions themselves. The only occurrence of
a lock outside of the TaskPool methods is in TaskScheduler::thread_run.
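
Extending the sketch above, pushing a task then looks roughly like this
(Task and scheduler_push are hypothetical stand-ins for the real
TaskScheduler types and calls):

	/* the lock now spans the scheduler call as well, so the task count,
	 * the do_cancel flag and the queue can never be observed out of sync */
	void push(Task *task)
	{
		std::lock_guard<std::mutex> lock(num_mutex);
		scheduler_push(task);
		num++;  /* previously a separate num_increase() took the lock here */
	}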

This patch is most useful in combination with the OSL renderer mode, so
it can probably wait until after the 2.64 release. SVM tasks tend to be
cancelled quickly, so the effect is less noticeable.
2012-09-11 11:41:51 +00:00

304 lines
6.6 KiB
C++

/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "kernel.h"
#include "kernel_types.h"

#include "osl_shader.h"

#include "buffers.h"

#include "util_debug.h"
#include "util_foreach.h"
#include "util_function.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
#include "util_thread.h"
CCL_NAMESPACE_BEGIN

class CPUDevice : public Device
{
public:
	TaskPool task_pool;
	KernelGlobals *kg;

	CPUDevice(int threads_num)
	{
		kg = kernel_globals_create();

		/* do now to avoid thread issues */
		system_cpu_support_optimized();
	}

	~CPUDevice()
	{
		task_pool.stop();

		kernel_globals_free(kg);
	}

	bool support_advanced_shading()
	{
		return true;
	}

	void mem_alloc(device_memory& mem, MemoryType type)
	{
		/* on the CPU the device pointer is simply the host pointer */
		mem.device_pointer = mem.data_pointer;
	}

	void mem_copy_to(device_memory& mem)
	{
		/* no-op, device and host memory are the same */
	}

	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
	{
		/* no-op */
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.device_pointer, 0, mem.memory_size());
	}

	void mem_free(device_memory& mem)
	{
		mem.device_pointer = 0;
	}

	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(kg, name, host, size);
	}

	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		kernel_tex_copy(kg, name, mem.data_pointer, mem.data_width, mem.data_height);
		mem.device_pointer = mem.data_pointer;
	}

	void tex_free(device_memory& mem)
	{
		mem.device_pointer = 0;
	}

	void *osl_memory()
	{
#ifdef WITH_OSL
		return kernel_osl_memory(kg);
#else
		return NULL;
#endif
	}
	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE)
			thread_path_trace(*task);
		else if(task->type == DeviceTask::TONEMAP)
			thread_tonemap(*task);
		else if(task->type == DeviceTask::SHADER)
			thread_shader(*task);
	}

	class CPUDeviceTask : public DeviceTask {
	public:
		CPUDeviceTask(CPUDevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CPUDevice::thread_run, device, this);
		}
	};
	void thread_path_trace(DeviceTask& task)
	{
		if(task_pool.cancelled())
			return;

#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_init(kg);
#endif

		RenderTile tile;

		while(task.acquire_tile(this, tile)) {
			float *render_buffer = (float*)tile.buffer;
			uint *rng_state = (uint*)tile.rng_state;
			int start_sample = tile.start_sample;
			int end_sample = tile.start_sample + tile.num_samples;

#ifdef WITH_OPTIMIZED_KERNEL
			if(system_cpu_support_optimized()) {
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled())
						break;

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_optimized_path_trace(kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;
					task.update_progress(tile);
				}
			}
			else
#endif
			{
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled())
						break;

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_path_trace(kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;
					task.update_progress(tile);
				}
			}

			task.release_tile(tile);

			if(task_pool.cancelled())
				break;
		}

#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_free(kg);
#endif
	}
	void thread_tonemap(DeviceTask& task)
	{
#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_optimized()) {
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
		else
#endif
		{
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
	}

	void thread_shader(DeviceTask& task)
	{
#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_init(kg);
#endif

#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_optimized()) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_optimized_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}
		else
#endif
		{
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}

#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_free(kg);
#endif
	}
	void task_add(DeviceTask& task)
	{
		/* split task into smaller ones, more than number of threads for uneven
		 * workloads where some parts of the image render slower than others */
		list<DeviceTask> tasks;
		task.split(tasks, TaskScheduler::num_threads()+1);

		foreach(DeviceTask& task, tasks)
			task_pool.push(new CPUDeviceTask(this, task));
	}

	void task_wait()
	{
		task_pool.wait_work();
	}

	void task_cancel()
	{
		/* non-blocking: flags the pool as cancelled and returns immediately */
		task_pool.cancel();
	}

	bool task_cancelled()
	{
		return task_pool.cancelled();
	}
};

Device *device_cpu_create(DeviceInfo& info, int threads)
{
	return new CPUDevice(threads);
}

void device_cpu_info(vector<DeviceInfo>& devices)
{
	DeviceInfo info;

	info.type = DEVICE_CPU;
	info.description = system_cpu_brand_string();
	info.id = "CPU";
	info.num = 0;
	info.advanced_shading = true;
	info.pack_images = false;

	devices.insert(devices.begin(), info);
}

CCL_NAMESPACE_END