forked from bartvdbraak/blender
Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion
Adds a bunch of CPU kernel functions to process one row of pixels, and uses those instead of calling unoptimized implementations. Fixes T92598.
This commit is contained in:
parent
d1a9425a2f
commit
97ff37bf54
@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
|
|||||||
{
|
{
|
||||||
/* Pick any kernel, all of them are supposed to have same level of microarchitecture
|
/* Pick any kernel, all of them are supposed to have same level of microarchitecture
|
||||||
* optimization. */
|
* optimization. */
|
||||||
VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
|
VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
|
||||||
|
<< " CPU kernels.";
|
||||||
|
|
||||||
if (info.cpu_threads == 0) {
|
if (info.cpu_threads == 0) {
|
||||||
info.cpu_threads = TaskScheduler::num_threads();
|
info.cpu_threads = TaskScheduler::num_threads();
|
||||||
@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
|||||||
Device::build_bvh(bvh, progress, refit);
|
Device::build_bvh(bvh, progress, refit);
|
||||||
}
|
}
|
||||||
|
|
||||||
const CPUKernels *CPUDevice::get_cpu_kernels() const
|
|
||||||
{
|
|
||||||
return &kernels;
|
|
||||||
}
|
|
||||||
|
|
||||||
void CPUDevice::get_cpu_kernel_thread_globals(
|
void CPUDevice::get_cpu_kernel_thread_globals(
|
||||||
vector<CPUKernelThreadGlobals> &kernel_thread_globals)
|
vector<CPUKernelThreadGlobals> &kernel_thread_globals)
|
||||||
{
|
{
|
||||||
|
@ -57,8 +57,6 @@ class CPUDevice : public Device {
|
|||||||
RTCDevice embree_device;
|
RTCDevice embree_device;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
CPUKernels kernels;
|
|
||||||
|
|
||||||
CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
|
CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
|
||||||
~CPUDevice();
|
~CPUDevice();
|
||||||
|
|
||||||
@ -90,7 +88,6 @@ class CPUDevice : public Device {
|
|||||||
|
|
||||||
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
|
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
|
||||||
|
|
||||||
virtual const CPUKernels *get_cpu_kernels() const override;
|
|
||||||
virtual void get_cpu_kernel_thread_globals(
|
virtual void get_cpu_kernel_thread_globals(
|
||||||
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
|
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
|
||||||
virtual void *get_cpu_osl_memory() override;
|
virtual void *get_cpu_osl_memory() override;
|
||||||
|
@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN
|
|||||||
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
|
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
|
||||||
|
|
||||||
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
|
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
|
||||||
|
#define REGISTER_KERNEL_FILM_CONVERT(name) \
|
||||||
|
film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
|
||||||
|
film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))
|
||||||
|
|
||||||
CPUKernels::CPUKernels()
|
CPUKernels::CPUKernels()
|
||||||
: /* Integrator. */
|
: /* Integrator. */
|
||||||
@ -50,11 +53,25 @@ CPUKernels::CPUKernels()
|
|||||||
REGISTER_KERNEL(adaptive_sampling_filter_x),
|
REGISTER_KERNEL(adaptive_sampling_filter_x),
|
||||||
REGISTER_KERNEL(adaptive_sampling_filter_y),
|
REGISTER_KERNEL(adaptive_sampling_filter_y),
|
||||||
/* Cryptomatte. */
|
/* Cryptomatte. */
|
||||||
REGISTER_KERNEL(cryptomatte_postprocess)
|
REGISTER_KERNEL(cryptomatte_postprocess),
|
||||||
|
/* Film Convert. */
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(depth),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(mist),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(sample_count),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(float),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(light_path),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(float3),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(motion),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(combined),
|
||||||
|
REGISTER_KERNEL_FILM_CONVERT(float4)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef REGISTER_KERNEL
|
#undef REGISTER_KERNEL
|
||||||
|
#undef REGISTER_KERNEL_FILM_CONVERT
|
||||||
#undef KERNEL_FUNCTIONS
|
#undef KERNEL_FUNCTIONS
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
@ -17,11 +17,13 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "device/cpu/kernel_function.h"
|
#include "device/cpu/kernel_function.h"
|
||||||
|
#include "util/half.h"
|
||||||
#include "util/types.h"
|
#include "util/types.h"
|
||||||
|
|
||||||
CCL_NAMESPACE_BEGIN
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
struct KernelGlobalsCPU;
|
struct KernelGlobalsCPU;
|
||||||
|
struct KernelFilmConvert;
|
||||||
struct IntegratorStateCPU;
|
struct IntegratorStateCPU;
|
||||||
struct TileInfo;
|
struct TileInfo;
|
||||||
|
|
||||||
@ -102,6 +104,41 @@ class CPUKernels {
|
|||||||
|
|
||||||
CryptomattePostprocessFunction cryptomatte_postprocess;
|
CryptomattePostprocessFunction cryptomatte_postprocess;
|
||||||
|
|
||||||
|
/* Film Convert. */
|
||||||
|
using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
|
||||||
|
const float *buffer,
|
||||||
|
float *pixel,
|
||||||
|
const int width,
|
||||||
|
const int buffer_stride,
|
||||||
|
const int pixel_stride)>;
|
||||||
|
using FilmConvertHalfRGBAFunction =
|
||||||
|
CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
|
||||||
|
const float *buffer,
|
||||||
|
half4 *pixel,
|
||||||
|
const int width,
|
||||||
|
const int buffer_stride)>;
|
||||||
|
|
||||||
|
#define KERNEL_FILM_CONVERT_FUNCTION(name) \
|
||||||
|
FilmConvertFunction film_convert_##name; \
|
||||||
|
FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(depth)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(mist)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float)
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(light_path)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float3)
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(motion)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(combined)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float4)
|
||||||
|
|
||||||
|
#undef KERNEL_FILM_CONVERT_FUNCTION
|
||||||
|
|
||||||
CPUKernels();
|
CPUKernels();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
#include "device/queue.h"
|
#include "device/queue.h"
|
||||||
|
|
||||||
#include "device/cpu/device.h"
|
#include "device/cpu/device.h"
|
||||||
|
#include "device/cpu/kernel.h"
|
||||||
#include "device/cuda/device.h"
|
#include "device/cuda/device.h"
|
||||||
#include "device/dummy/device.h"
|
#include "device/dummy/device.h"
|
||||||
#include "device/hip/device.h"
|
#include "device/hip/device.h"
|
||||||
@ -363,10 +364,11 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const CPUKernels *Device::get_cpu_kernels() const
|
const CPUKernels &Device::get_cpu_kernels()
|
||||||
{
|
{
|
||||||
LOG(FATAL) << "Device does not support CPU kernels.";
|
/* Initialize CPU kernels once and reuse. */
|
||||||
return nullptr;
|
static CPUKernels kernels;
|
||||||
|
return kernels;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Device::get_cpu_kernel_thread_globals(
|
void Device::get_cpu_kernel_thread_globals(
|
||||||
|
@ -180,7 +180,7 @@ class Device {
|
|||||||
* These may not be used on GPU or multi-devices. */
|
* These may not be used on GPU or multi-devices. */
|
||||||
|
|
||||||
/* Get CPU kernel functions for native instruction set. */
|
/* Get CPU kernel functions for native instruction set. */
|
||||||
virtual const CPUKernels *get_cpu_kernels() const;
|
static const CPUKernels &get_cpu_kernels();
|
||||||
/* Get kernel globals to pass to kernels. */
|
/* Get kernel globals to pass to kernels. */
|
||||||
virtual void get_cpu_kernel_thread_globals(
|
virtual void get_cpu_kernel_thread_globals(
|
||||||
vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
|
vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
|
||||||
|
@ -14,9 +14,12 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "device/device.h"
|
||||||
|
|
||||||
#include "integrator/pass_accessor_cpu.h"
|
#include "integrator/pass_accessor_cpu.h"
|
||||||
|
|
||||||
#include "session/buffers.h"
|
#include "session/buffers.h"
|
||||||
|
|
||||||
#include "util/log.h"
|
#include "util/log.h"
|
||||||
#include "util/tbb.h"
|
#include "util/tbb.h"
|
||||||
|
|
||||||
@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN
|
|||||||
* Kernel processing.
|
* Kernel processing.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template<typename Processor>
|
|
||||||
inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
|
|
||||||
const BufferParams &buffer_params,
|
|
||||||
const Destination &destination,
|
|
||||||
const Processor &processor) const
|
|
||||||
{
|
|
||||||
KernelFilmConvert kfilm_convert;
|
|
||||||
init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
|
|
||||||
|
|
||||||
if (destination.pixels) {
|
|
||||||
/* NOTE: No overlays are applied since they are not used for final renders.
|
|
||||||
* Can be supported via some sort of specialization to avoid code duplication. */
|
|
||||||
|
|
||||||
run_get_pass_kernel_processor_float(
|
|
||||||
&kfilm_convert, render_buffers, buffer_params, destination, processor);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (destination.pixels_half_rgba) {
|
|
||||||
/* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
|
|
||||||
|
|
||||||
if (destination.num_components == 1) {
|
|
||||||
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
|
|
||||||
render_buffers,
|
|
||||||
buffer_params,
|
|
||||||
destination,
|
|
||||||
[&processor](const KernelFilmConvert *kfilm_convert,
|
|
||||||
ccl_global const float *buffer,
|
|
||||||
float *pixel_rgba) {
|
|
||||||
float pixel;
|
|
||||||
processor(kfilm_convert, buffer, &pixel);
|
|
||||||
|
|
||||||
pixel_rgba[0] = pixel;
|
|
||||||
pixel_rgba[1] = pixel;
|
|
||||||
pixel_rgba[2] = pixel;
|
|
||||||
pixel_rgba[3] = 1.0f;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else if (destination.num_components == 3) {
|
|
||||||
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
|
|
||||||
render_buffers,
|
|
||||||
buffer_params,
|
|
||||||
destination,
|
|
||||||
[&processor](const KernelFilmConvert *kfilm_convert,
|
|
||||||
ccl_global const float *buffer,
|
|
||||||
float *pixel_rgba) {
|
|
||||||
processor(kfilm_convert, buffer, pixel_rgba);
|
|
||||||
pixel_rgba[3] = 1.0f;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else if (destination.num_components == 4) {
|
|
||||||
run_get_pass_kernel_processor_half_rgba(
|
|
||||||
&kfilm_convert, render_buffers, buffer_params, destination, processor);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Processor>
|
|
||||||
inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
|
inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
|
||||||
const KernelFilmConvert *kfilm_convert,
|
const KernelFilmConvert *kfilm_convert,
|
||||||
const RenderBuffers *render_buffers,
|
const RenderBuffers *render_buffers,
|
||||||
const BufferParams &buffer_params,
|
const BufferParams &buffer_params,
|
||||||
const Destination &destination,
|
const Destination &destination,
|
||||||
const Processor &processor) const
|
const CPUKernels::FilmConvertFunction func) const
|
||||||
{
|
{
|
||||||
|
/* NOTE: No overlays are applied since they are not used for final renders.
|
||||||
|
* Can be supported via some sort of specialization to avoid code duplication. */
|
||||||
|
|
||||||
DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
|
DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
|
||||||
|
|
||||||
const int64_t pass_stride = buffer_params.pass_stride;
|
const int64_t pass_stride = buffer_params.pass_stride;
|
||||||
@ -112,21 +61,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
|
|||||||
const float *buffer = window_data + y * buffer_row_stride;
|
const float *buffer = window_data + y * buffer_row_stride;
|
||||||
float *pixel = destination.pixels +
|
float *pixel = destination.pixels +
|
||||||
(y * buffer_params.width + destination.offset) * pixel_stride;
|
(y * buffer_params.width + destination.offset) * pixel_stride;
|
||||||
|
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
|
||||||
for (int64_t x = 0; x < buffer_params.window_width;
|
|
||||||
++x, buffer += pass_stride, pixel += pixel_stride) {
|
|
||||||
processor(kfilm_convert, buffer, pixel);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Processor>
|
|
||||||
inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
|
inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
|
||||||
const KernelFilmConvert *kfilm_convert,
|
const KernelFilmConvert *kfilm_convert,
|
||||||
const RenderBuffers *render_buffers,
|
const RenderBuffers *render_buffers,
|
||||||
const BufferParams &buffer_params,
|
const BufferParams &buffer_params,
|
||||||
const Destination &destination,
|
const Destination &destination,
|
||||||
const Processor &processor) const
|
const CPUKernels::FilmConvertHalfRGBAFunction func) const
|
||||||
{
|
{
|
||||||
const int64_t pass_stride = buffer_params.pass_stride;
|
const int64_t pass_stride = buffer_params.pass_stride;
|
||||||
const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
|
const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
|
||||||
@ -141,16 +85,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
|
|||||||
tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
|
tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
|
||||||
const float *buffer = window_data + y * buffer_row_stride;
|
const float *buffer = window_data + y * buffer_row_stride;
|
||||||
half4 *pixel = dst_start + y * destination_stride;
|
half4 *pixel = dst_start + y * destination_stride;
|
||||||
for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) {
|
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
|
||||||
|
|
||||||
float pixel_rgba[4];
|
|
||||||
processor(kfilm_convert, buffer, pixel_rgba);
|
|
||||||
|
|
||||||
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
|
|
||||||
|
|
||||||
*pixel = float4_to_half4_display(
|
|
||||||
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -163,8 +98,25 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
|
|||||||
const BufferParams &buffer_params, \
|
const BufferParams &buffer_params, \
|
||||||
const Destination &destination) const \
|
const Destination &destination) const \
|
||||||
{ \
|
{ \
|
||||||
run_get_pass_kernel_processor( \
|
const CPUKernels &kernels = Device::get_cpu_kernels(); \
|
||||||
render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
|
KernelFilmConvert kfilm_convert; \
|
||||||
|
init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \
|
||||||
|
\
|
||||||
|
if (destination.pixels) { \
|
||||||
|
run_get_pass_kernel_processor_float(&kfilm_convert, \
|
||||||
|
render_buffers, \
|
||||||
|
buffer_params, \
|
||||||
|
destination, \
|
||||||
|
kernels.film_convert_##pass); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if (destination.pixels_half_rgba) { \
|
||||||
|
run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \
|
||||||
|
render_buffers, \
|
||||||
|
buffer_params, \
|
||||||
|
destination, \
|
||||||
|
kernels.film_convert_half_rgba_##pass); \
|
||||||
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Float (scalar) passes. */
|
/* Float (scalar) passes. */
|
||||||
|
@ -16,6 +16,8 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "device/cpu/kernel.h"
|
||||||
|
|
||||||
#include "integrator/pass_accessor.h"
|
#include "integrator/pass_accessor.h"
|
||||||
|
|
||||||
CCL_NAMESPACE_BEGIN
|
CCL_NAMESPACE_BEGIN
|
||||||
@ -28,25 +30,19 @@ class PassAccessorCPU : public PassAccessor {
|
|||||||
using PassAccessor::PassAccessor;
|
using PassAccessor::PassAccessor;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
template<typename Processor>
|
inline void run_get_pass_kernel_processor_float(
|
||||||
inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
|
const KernelFilmConvert *kfilm_convert,
|
||||||
const BufferParams &buffer_params,
|
|
||||||
const Destination &destination,
|
|
||||||
const Processor &processor) const;
|
|
||||||
|
|
||||||
template<typename Processor>
|
|
||||||
inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
|
|
||||||
const RenderBuffers *render_buffers,
|
const RenderBuffers *render_buffers,
|
||||||
const BufferParams &buffer_params,
|
const BufferParams &buffer_params,
|
||||||
const Destination &destination,
|
const Destination &destination,
|
||||||
const Processor &processor) const;
|
const CPUKernels::FilmConvertFunction func) const;
|
||||||
|
|
||||||
template<typename Processor>
|
inline void run_get_pass_kernel_processor_half_rgba(
|
||||||
inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
|
const KernelFilmConvert *kfilm_convert,
|
||||||
const RenderBuffers *render_buffers,
|
const RenderBuffers *render_buffers,
|
||||||
const BufferParams &buffer_params,
|
const BufferParams &buffer_params,
|
||||||
const Destination &destination,
|
const Destination &destination,
|
||||||
const Processor &processor) const;
|
const CPUKernels::FilmConvertHalfRGBAFunction func) const;
|
||||||
|
|
||||||
#define DECLARE_PASS_ACCESSOR(pass) \
|
#define DECLARE_PASS_ACCESSOR(pass) \
|
||||||
virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
|
virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
|
||||||
|
@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
|
|||||||
DeviceScene *device_scene,
|
DeviceScene *device_scene,
|
||||||
bool *cancel_requested_flag)
|
bool *cancel_requested_flag)
|
||||||
: PathTraceWork(device, film, device_scene, cancel_requested_flag),
|
: PathTraceWork(device, film, device_scene, cancel_requested_flag),
|
||||||
kernels_(*(device->get_cpu_kernels()))
|
kernels_(Device::get_cpu_kernels())
|
||||||
{
|
{
|
||||||
DCHECK_EQ(device->info.type, DEVICE_CPU);
|
DCHECK_EQ(device->info.type, DEVICE_CPU);
|
||||||
}
|
}
|
||||||
|
@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device,
|
|||||||
device->get_cpu_kernel_thread_globals(kernel_thread_globals);
|
device->get_cpu_kernel_thread_globals(kernel_thread_globals);
|
||||||
|
|
||||||
/* Find required kernel function. */
|
/* Find required kernel function. */
|
||||||
const CPUKernels &kernels = *(device->get_cpu_kernels());
|
const CPUKernels &kernels = Device::get_cpu_kernels();
|
||||||
|
|
||||||
/* Simple parallel_for over all work items. */
|
/* Simple parallel_for over all work items. */
|
||||||
KernelShaderEvalInput *input_data = input.data();
|
KernelShaderEvalInput *input_data = input.data();
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
/* CPU Kernel Interface */
|
/* CPU Kernel Interface */
|
||||||
|
|
||||||
|
#include "util/half.h"
|
||||||
#include "util/types.h"
|
#include "util/types.h"
|
||||||
|
|
||||||
#include "kernel/types.h"
|
#include "kernel/types.h"
|
||||||
|
@ -52,6 +52,37 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
|
|||||||
#undef KERNEL_INTEGRATOR_INIT_FUNCTION
|
#undef KERNEL_INTEGRATOR_INIT_FUNCTION
|
||||||
#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
|
#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
|
||||||
|
|
||||||
|
#define KERNEL_FILM_CONVERT_FUNCTION(name) \
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
|
||||||
|
const float *buffer, \
|
||||||
|
float *pixel, \
|
||||||
|
const int width, \
|
||||||
|
const int buffer_stride, \
|
||||||
|
const int pixel_stride); \
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
|
||||||
|
const KernelFilmConvert *kfilm_convert, \
|
||||||
|
const float *buffer, \
|
||||||
|
half4 *pixel, \
|
||||||
|
const int width, \
|
||||||
|
const int buffer_stride);
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(depth)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(mist)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float)
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(light_path)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float3)
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(motion)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(combined)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float4)
|
||||||
|
|
||||||
|
#undef KERNEL_FILM_CONVERT_FUNCTION
|
||||||
|
|
||||||
/* --------------------------------------------------------------------
|
/* --------------------------------------------------------------------
|
||||||
* Shader evaluation.
|
* Shader evaluation.
|
||||||
*/
|
*/
|
||||||
|
@ -47,8 +47,8 @@
|
|||||||
# include "kernel/integrator/megakernel.h"
|
# include "kernel/integrator/megakernel.h"
|
||||||
|
|
||||||
# include "kernel/film/adaptive_sampling.h"
|
# include "kernel/film/adaptive_sampling.h"
|
||||||
# include "kernel/film/read.h"
|
|
||||||
# include "kernel/film/id_passes.h"
|
# include "kernel/film/id_passes.h"
|
||||||
|
# include "kernel/film/read.h"
|
||||||
|
|
||||||
# include "kernel/bake/bake.h"
|
# include "kernel/bake/bake.h"
|
||||||
|
|
||||||
@ -232,6 +232,85 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --------------------------------------------------------------------
|
||||||
|
* Film Convert.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
|
||||||
|
# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
|
||||||
|
const float *buffer, \
|
||||||
|
float *pixel, \
|
||||||
|
const int width, \
|
||||||
|
const int buffer_stride, \
|
||||||
|
const int pixel_stride) \
|
||||||
|
{ \
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
|
||||||
|
} \
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
|
||||||
|
const KernelFilmConvert *kfilm_convert, \
|
||||||
|
const float *buffer, \
|
||||||
|
half4 *pixel, \
|
||||||
|
const int width, \
|
||||||
|
const int buffer_stride) \
|
||||||
|
{ \
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
|
||||||
|
const float *buffer, \
|
||||||
|
float *pixel, \
|
||||||
|
const int width, \
|
||||||
|
const int buffer_stride, \
|
||||||
|
const int pixel_stride) \
|
||||||
|
{ \
|
||||||
|
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
|
||||||
|
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
|
||||||
|
const KernelFilmConvert *kfilm_convert, \
|
||||||
|
const float *buffer, \
|
||||||
|
half4 *pixel, \
|
||||||
|
const int width, \
|
||||||
|
const int buffer_stride) \
|
||||||
|
{ \
|
||||||
|
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
|
||||||
|
float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \
|
||||||
|
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
|
||||||
|
if (is_float) { \
|
||||||
|
pixel_rgba[1] = pixel_rgba[0]; \
|
||||||
|
pixel_rgba[2] = pixel_rgba[0]; \
|
||||||
|
} \
|
||||||
|
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
|
||||||
|
*pixel = float4_to_half4_display( \
|
||||||
|
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(depth, true)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(mist, true)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float, true)
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float3, false)
|
||||||
|
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(motion, false)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(combined, false)
|
||||||
|
KERNEL_FILM_CONVERT_FUNCTION(float4, false)
|
||||||
|
|
||||||
|
#undef KERNEL_FILM_CONVERT_FUNCTION
|
||||||
|
|
||||||
#undef KERNEL_INVOKE
|
#undef KERNEL_INVOKE
|
||||||
#undef DEFINE_INTEGRATOR_KERNEL
|
#undef DEFINE_INTEGRATOR_KERNEL
|
||||||
#undef DEFINE_INTEGRATOR_SHADE_KERNEL
|
#undef DEFINE_INTEGRATOR_SHADE_KERNEL
|
||||||
|
Loading…
Reference in New Issue
Block a user