forked from bartvdbraak/blender
db8024f4b5
Two versions of the CPU kernel are compiled, one SSE optimized and the other not, and it will choose between them at runtime.
/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "kernel.h"
#include "kernel_types.h"

#include "osl_shader.h"

#include "util_debug.h"
#include "util_foreach.h"
#include "util_function.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
#include "util_thread.h"

CCL_NAMESPACE_BEGIN

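/* CPU rendering device: spawns one worker thread per requested CPU thread and
 * processes DeviceTasks from a shared queue. When built with
 * WITH_OPTIMIZED_KERNEL, two kernel versions are compiled, one SSE optimized
 * and the other not, and the supported one is chosen at runtime. */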
class CPUDevice : public Device
{
public:
	vector<thread*> threads;
	ThreadQueue<DeviceTask> tasks;
	KernelGlobals *kg;

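	/* Create kernel globals and launch the worker threads; a thread count of
	 * zero means use all available hardware threads. */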
	CPUDevice(int threads_num)
	{
		kg = kernel_globals_create();

		/* do now to avoid thread issues */
		system_cpu_support_optimized();

		if(threads_num == 0)
			threads_num = system_cpu_thread_count();

		threads.resize(threads_num);

		for(size_t i = 0; i < threads.size(); i++)
			threads[i] = new thread(function_bind(&CPUDevice::thread_run, this, i));
	}

	~CPUDevice()
	{
		tasks.stop();

		foreach(thread *t, threads) {
			t->join();
			delete t;
		}

		kernel_globals_free(kg);
	}

	bool support_full_kernel()
	{
		return true;
	}

	string description()
	{
		return system_cpu_brand_string();
	}

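	/* Device memory is plain host memory on the CPU, so allocation and copies
	 * are essentially no-ops and device_pointer simply aliases the host data
	 * pointer. */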
	void mem_alloc(device_memory& mem, MemoryType type)
	{
		mem.device_pointer = mem.data_pointer;
	}

	void mem_copy_to(device_memory& mem)
	{
		/* no-op */
	}

	void mem_copy_from(device_memory& mem, size_t offset, size_t size)
	{
		/* no-op */
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.device_pointer, 0, mem.memory_size());
	}

	void mem_free(device_memory& mem)
	{
		mem.device_pointer = 0;
	}

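	/* Copy named constant memory into the kernel globals. */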
	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(kg, name, host, size);
	}

	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		kernel_tex_copy(kg, name, mem.data_pointer, mem.data_width, mem.data_height);
		mem.device_pointer = mem.data_pointer;
	}

	void tex_free(device_memory& mem)
	{
		mem.device_pointer = 0;
	}

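	/* OSL memory used for shader system setup; NULL when built without OSL. */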
	void *osl_memory()
	{
#ifdef WITH_OSL
		return kernel_osl_memory(kg);
#else
		return NULL;
#endif
	}

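	/* Worker thread main loop: pop tasks from the shared queue and dispatch
	 * them by type until the queue is stopped. */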
	void thread_run(int t)
	{
		DeviceTask task;

		while(tasks.worker_wait_pop(task)) {
			if(task.type == DeviceTask::PATH_TRACE)
				thread_path_trace(task);
			else if(task.type == DeviceTask::TONEMAP)
				thread_tonemap(task);
			else if(task.type == DeviceTask::DISPLACE)
				thread_displace(task);

			tasks.worker_done();
		}
	}

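	/* Path trace the task's tile scanline by scanline, checking for
	 * cancellation after each scanline. The SSE optimized kernel is used
	 * when the CPU supports it, otherwise the plain kernel. */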
	void thread_path_trace(DeviceTask& task)
	{
		if(tasks.worker_cancel())
			return;

#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_init(kg);
#endif

#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_optimized()) {
			for(int y = task.y; y < task.y + task.h; y++) {
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_optimized_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);

				if(tasks.worker_cancel())
					break;
			}
		}
		else
#endif
		{
			for(int y = task.y; y < task.y + task.h; y++) {
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);

				if(tasks.worker_cancel())
					break;
			}
		}

#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_free(kg);
#endif
	}

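	/* Tonemap the accumulated float render buffer into 8-bit RGBA. */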
	void thread_tonemap(DeviceTask& task)
	{
#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_optimized()) {
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
		}
		else
#endif
		{
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
		}
	}

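	/* Run the displacement kernel over the requested range of input points. */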
	void thread_displace(DeviceTask& task)
	{
#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_init(kg);
#endif

#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_optimized()) {
			for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
				kernel_cpu_optimized_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);

				if(tasks.worker_cancel())
					break;
			}
		}
		else
#endif
		{
			for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
				kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);

				if(tasks.worker_cancel())
					break;
			}
		}

#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_free(kg);
#endif
	}

	void task_add(DeviceTask& task)
	{
		/* split the task into smaller ones, more than the number of threads,
		 * for uneven workloads where some parts of the image render slower
		 * than others */
		task.split(tasks, threads.size()*10);
	}

	void task_wait()
	{
		tasks.wait_done();
	}

	void task_cancel()
	{
		tasks.cancel();
	}
};

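/* Create a CPU device; a thread count of zero autodetects the number of
 * hardware threads. */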
Device *device_cpu_create(int threads)
{
	return new CPUDevice(threads);
}

CCL_NAMESPACE_END