blender/intern/cycles/integrator/shader_eval.cpp
Michael Jones 98a5c924fc Cycles: Metal readiness: Specify DeviceQueue::enqueue arg types
This patch adds new arg-type parameters to `DeviceQueue::enqueue` and its overrides. This is in preparation for the Metal backend which needs this information for correct argument encoding.

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D13357
2021-11-29 14:56:06 +00:00
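In the new convention, visible in `eval_gpu()` below, callers wrap kernel arguments in a `DeviceKernelArguments` object instead of a raw pointer array, so each backend can recover the type and size of every argument. As a rough illustration of why that matters for Metal, the following sketch (hypothetical names, not the actual Cycles implementation) records a type tag alongside each argument's address and size; with that metadata a Metal queue can decide per argument whether to bind a buffer or copy raw bytes, which a plain `void *args[]` cannot express.

/* Hypothetical sketch only -- not the actual DeviceKernelArguments API. */
#include <cstddef>
#include <cstdint>
#include <vector>

enum class ArgTag { INT32, FLOAT32, DEVICE_POINTER };

struct ArgRecord {
  ArgTag tag;        /* Lets the backend pick the right encoding per argument. */
  const void *value; /* Host address of the argument value. */
  std::size_t size;  /* Size in bytes to copy or encode. */
};

struct TypedArgList {
  std::vector<ArgRecord> records;

  void add(const std::int32_t *v) { records.push_back({ArgTag::INT32, v, sizeof(*v)}); }
  void add(const float *v) { records.push_back({ArgTag::FLOAT32, v, sizeof(*v)}); }
  /* A device pointer keeps its tag so a Metal backend could bind a buffer
   * rather than copy the raw pointer bytes. */
  void add(const std::uint64_t *p) { records.push_back({ArgTag::DEVICE_POINTER, p, sizeof(*p)}); }
};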


/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "integrator/shader_eval.h"
#include "device/device.h"
#include "device/queue.h"
#include "device/cpu/kernel.h"
#include "device/cpu/kernel_thread_globals.h"
#include "util/log.h"
#include "util/progress.h"
#include "util/tbb.h"
CCL_NAMESPACE_BEGIN
ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress)
{
DCHECK_NE(device_, nullptr);
}
bool ShaderEval::eval(const ShaderEvalType type,
                      const int max_num_inputs,
                      const int num_channels,
                      const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
                      const function<void(device_vector<float> &)> &read_output)
{
  bool first_device = true;
  bool success = true;

  device_->foreach_device([&](Device *device) {
    if (!first_device) {
      LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a "
                    "single device.";
      return;
    }
    first_device = false;

    device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
    device_vector<float> output(device, "ShaderEval output", MEM_READ_WRITE);

    /* Allocate and copy device buffers. */
    DCHECK_EQ(input.device, device);
    DCHECK_EQ(output.device, device);
    DCHECK_LE(output.size(), input.size());

    input.alloc(max_num_inputs);
    int num_points = fill_input(input);
    if (num_points == 0) {
      return;
    }

    input.copy_to_device();
    output.alloc(num_points * num_channels);
    output.zero_to_device();

    /* Evaluate on CPU or GPU. */
    success = (device->info.type == DEVICE_CPU) ?
                  eval_cpu(device, type, input, output, num_points) :
                  eval_gpu(device, type, input, output, num_points);

    /* Copy data back from device if not canceled. */
    if (success) {
      output.copy_from_device(0, 1, output.size());
      read_output(output);
    }

    input.free();
    output.free();
  });

  return success;
}
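
/* Illustration only: a hypothetical caller of eval(), loosely modeled on how
 * displacement baking drives shader evaluation. The function name and the
 * choice of three output channels per point are assumptions for this sketch,
 * not code from this file's real callers. */
static bool example_eval_usage(Device *device, Progress &progress, const int num_points)
{
  ShaderEval shader_eval(device, progress);
  return shader_eval.eval(
      SHADER_EVAL_DISPLACE,
      num_points, /* max_num_inputs: upper bound used for the input allocation. */
      3,          /* num_channels: assumed three floats (a vector) per point. */
      [&](device_vector<KernelShaderEvalInput> &input) {
        KernelShaderEvalInput *input_data = input.data();
        for (int i = 0; i < num_points; i++) {
          /* Fill input_data[i] with the object/primitive to evaluate. */
          input_data[i] = KernelShaderEvalInput{};
        }
        return num_points;
      },
      [&](device_vector<float> &output) {
        /* Consume num_points * 3 floats from output.data() here. */
        (void)output;
      });
}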
bool ShaderEval::eval_cpu(Device *device,
                          const ShaderEvalType type,
                          device_vector<KernelShaderEvalInput> &input,
                          device_vector<float> &output,
                          const int64_t work_size)
{
  vector<CPUKernelThreadGlobals> kernel_thread_globals;
  device->get_cpu_kernel_thread_globals(kernel_thread_globals);

  /* Find required kernel function. */
  const CPUKernels &kernels = Device::get_cpu_kernels();

  /* Simple parallel_for over all work items. */
  KernelShaderEvalInput *input_data = input.data();
  float *output_data = output.data();
  bool success = true;

  tbb::task_arena local_arena(device->info.cpu_threads);
  local_arena.execute([&]() {
    tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) {
      /* TODO: is this fast enough? */
      if (progress_.get_cancel()) {
        success = false;
        return;
      }

      const int thread_index = tbb::this_task_arena::current_thread_index();
      const KernelGlobalsCPU *kg = &kernel_thread_globals[thread_index];

      switch (type) {
        case SHADER_EVAL_DISPLACE:
          kernels.shader_eval_displace(kg, input_data, output_data, work_index);
          break;
        case SHADER_EVAL_BACKGROUND:
          kernels.shader_eval_background(kg, input_data, output_data, work_index);
          break;
        case SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
          kernels.shader_eval_curve_shadow_transparency(kg, input_data, output_data, work_index);
          break;
      }
    });
  });

  return success;
}
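
/* Illustration only: a standalone sketch of the arena pattern used in
 * eval_cpu() above. Capping the arena at `num_threads` guarantees that
 * tbb::this_task_arena::current_thread_index() returns a slot index smaller
 * than `num_threads` for every worker running inside arena.execute(), so a
 * pre-sized per-thread array can be indexed without locks. The function name
 * is invented for this sketch. */
static void example_per_thread_state(const int num_threads, const int64_t work_size)
{
  vector<int64_t> per_thread_counts(num_threads, 0);

  tbb::task_arena arena(num_threads);
  arena.execute([&]() {
    tbb::parallel_for(int64_t(0), work_size, [&](int64_t /*work_index*/) {
      /* Safe: the index is always < num_threads inside this arena. */
      const int thread_index = tbb::this_task_arena::current_thread_index();
      per_thread_counts[thread_index] += 1;
    });
  });
}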
bool ShaderEval::eval_gpu(Device *device,
                          const ShaderEvalType type,
                          device_vector<KernelShaderEvalInput> &input,
                          device_vector<float> &output,
                          const int64_t work_size)
{
  /* Find required kernel function. */
  DeviceKernel kernel;
  switch (type) {
    case SHADER_EVAL_DISPLACE:
      kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
      break;
    case SHADER_EVAL_BACKGROUND:
      kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
      break;
    case SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
      kernel = DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY;
      break;
  }
  /* Create device queue. */
  unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
  queue->init_execution();

  /* Execute work on the GPU in chunks, so that cancellation can be checked
   * between chunks. TODO: query an appropriate chunk size from the device. */
  const int32_t chunk_size = 65536;
  device_ptr d_input = input.device_pointer;
  device_ptr d_output = output.device_pointer;

  assert(work_size <= 0x7fffffff);
  for (int32_t d_offset = 0; d_offset < int32_t(work_size); d_offset += chunk_size) {
    int32_t d_work_size = std::min(chunk_size, int32_t(work_size) - d_offset);

    DeviceKernelArguments args(&d_input, &d_output, &d_offset, &d_work_size);

    queue->enqueue(kernel, d_work_size, args);
    queue->synchronize();

    if (progress_.get_cancel()) {
      return false;
    }
  }

  return true;
}
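
/* Illustration only: the chunked-dispatch pattern from eval_gpu() in generic
 * form. Splitting one large launch into fixed-size chunks and synchronizing
 * after each bounds how long a cancellation request can go unnoticed, at the
 * cost of extra launch overhead. The function name and the is_canceled
 * callback are invented for this sketch. */
static bool example_chunked_dispatch(DeviceQueue *queue,
                                     const DeviceKernel kernel,
                                     device_ptr d_input,
                                     device_ptr d_output,
                                     const int32_t total_work,
                                     const function<bool()> &is_canceled)
{
  const int32_t chunk_size = 65536;
  for (int32_t offset = 0; offset < total_work; offset += chunk_size) {
    int32_t chunk_work = std::min(chunk_size, total_work - offset);

    /* Same argument convention as eval_gpu() above: the argument list takes
     * the address of each value, and enqueue() consumes them immediately. */
    DeviceKernelArguments args(&d_input, &d_output, &offset, &chunk_work);

    queue->enqueue(kernel, chunk_work, args);
    queue->synchronize();

    if (is_canceled()) {
      return false;
    }
  }
  return true;
}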
CCL_NAMESPACE_END