diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp index d494b40f71d..68dec7f0af2 100644 --- a/intern/cycles/device/cpu/device_impl.cpp +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_ { /* Pick any kernel, all of them are supposed to have same level of microarchitecture * optimization. */ - VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels."; + VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name() + << " CPU kernels."; if (info.cpu_threads == 0) { info.cpu_threads = TaskScheduler::num_threads(); @@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) Device::build_bvh(bvh, progress, refit); } -const CPUKernels *CPUDevice::get_cpu_kernels() const -{ - return &kernels; -} - void CPUDevice::get_cpu_kernel_thread_globals( vector &kernel_thread_globals) { diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h index 553728ccc3b..90d217bb624 100644 --- a/intern/cycles/device/cpu/device_impl.h +++ b/intern/cycles/device/cpu/device_impl.h @@ -57,8 +57,6 @@ class CPUDevice : public Device { RTCDevice embree_device; #endif - CPUKernels kernels; - CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_); ~CPUDevice(); @@ -90,7 +88,6 @@ class CPUDevice : public Device { void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - virtual const CPUKernels *get_cpu_kernels() const override; virtual void get_cpu_kernel_thread_globals( vector &kernel_thread_globals) override; virtual void *get_cpu_osl_memory() override; diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp index 3b253c094fd..91c472d41e8 100644 --- a/intern/cycles/device/cpu/kernel.cpp +++ b/intern/cycles/device/cpu/kernel.cpp @@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN 
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) +#define REGISTER_KERNEL_FILM_CONVERT(name) \ + film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \ + film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name)) CPUKernels::CPUKernels() : /* Integrator. */ @@ -50,11 +53,25 @@ CPUKernels::CPUKernels() REGISTER_KERNEL(adaptive_sampling_filter_x), REGISTER_KERNEL(adaptive_sampling_filter_y), /* Cryptomatte. */ - REGISTER_KERNEL(cryptomatte_postprocess) + REGISTER_KERNEL(cryptomatte_postprocess), + /* Film Convert. */ + REGISTER_KERNEL_FILM_CONVERT(depth), + REGISTER_KERNEL_FILM_CONVERT(mist), + REGISTER_KERNEL_FILM_CONVERT(sample_count), + REGISTER_KERNEL_FILM_CONVERT(float), + REGISTER_KERNEL_FILM_CONVERT(light_path), + REGISTER_KERNEL_FILM_CONVERT(float3), + REGISTER_KERNEL_FILM_CONVERT(motion), + REGISTER_KERNEL_FILM_CONVERT(cryptomatte), + REGISTER_KERNEL_FILM_CONVERT(shadow_catcher), + REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow), + REGISTER_KERNEL_FILM_CONVERT(combined), + REGISTER_KERNEL_FILM_CONVERT(float4) { } #undef REGISTER_KERNEL +#undef REGISTER_KERNEL_FILM_CONVERT #undef KERNEL_FUNCTIONS CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h index 5beeaf148a1..406bd07ab3d 100644 --- a/intern/cycles/device/cpu/kernel.h +++ b/intern/cycles/device/cpu/kernel.h @@ -17,11 +17,13 @@ #pragma once #include "device/cpu/kernel_function.h" +#include "util/half.h" #include "util/types.h" CCL_NAMESPACE_BEGIN struct KernelGlobalsCPU; +struct KernelFilmConvert; struct IntegratorStateCPU; struct TileInfo; @@ -102,6 +104,41 @@ class CPUKernels { CryptomattePostprocessFunction cryptomatte_postprocess; + /* Film Convert. 
*/ + using FilmConvertFunction = CPUKernelFunction; + using FilmConvertHalfRGBAFunction = + CPUKernelFunction; + +#define KERNEL_FILM_CONVERT_FUNCTION(name) \ + FilmConvertFunction film_convert_##name; \ + FilmConvertHalfRGBAFunction film_convert_half_rgba_##name; + + KERNEL_FILM_CONVERT_FUNCTION(depth) + KERNEL_FILM_CONVERT_FUNCTION(mist) + KERNEL_FILM_CONVERT_FUNCTION(sample_count) + KERNEL_FILM_CONVERT_FUNCTION(float) + + KERNEL_FILM_CONVERT_FUNCTION(light_path) + KERNEL_FILM_CONVERT_FUNCTION(float3) + + KERNEL_FILM_CONVERT_FUNCTION(motion) + KERNEL_FILM_CONVERT_FUNCTION(cryptomatte) + KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher) + KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow) + KERNEL_FILM_CONVERT_FUNCTION(combined) + KERNEL_FILM_CONVERT_FUNCTION(float4) + +#undef KERNEL_FILM_CONVERT_FUNCTION + CPUKernels(); }; diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 69e959b6f7b..63d0a49d3eb 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -23,6 +23,7 @@ #include "device/queue.h" #include "device/cpu/device.h" +#include "device/cpu/kernel.h" #include "device/cuda/device.h" #include "device/dummy/device.h" #include "device/hip/device.h" @@ -363,10 +364,11 @@ unique_ptr Device::gpu_queue_create() return nullptr; } -const CPUKernels *Device::get_cpu_kernels() const +const CPUKernels &Device::get_cpu_kernels() { - LOG(FATAL) << "Device does not support CPU kernels."; - return nullptr; + /* Initialize CPU kernels once and reuse. */ + static CPUKernels kernels; + return kernels; } void Device::get_cpu_kernel_thread_globals( diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 3cb177adde7..65188459c2c 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -180,7 +180,7 @@ class Device { * These may not be used on GPU or multi-devices. */ /* Get CPU kernel functions for native instruction set. 
*/ - virtual const CPUKernels *get_cpu_kernels() const; + static const CPUKernels &get_cpu_kernels(); /* Get kernel globals to pass to kernels. */ virtual void get_cpu_kernel_thread_globals( vector & /*kernel_thread_globals*/); diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp index 820da757be0..77ca332d142 100644 --- a/intern/cycles/integrator/pass_accessor_cpu.cpp +++ b/intern/cycles/integrator/pass_accessor_cpu.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "device/device.h" + #include "integrator/pass_accessor_cpu.h" #include "session/buffers.h" + #include "util/log.h" #include "util/tbb.h" @@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN * Kernel processing. */ -template -inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const -{ - KernelFilmConvert kfilm_convert; - init_kernel_film_convert(&kfilm_convert, buffer_params, destination); - - if (destination.pixels) { - /* NOTE: No overlays are applied since they are not used for final renders. - * Can be supported via some sort of specialization to avoid code duplication. */ - - run_get_pass_kernel_processor_float( - &kfilm_convert, render_buffers, buffer_params, destination, processor); - } - - if (destination.pixels_half_rgba) { - /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. 
*/ - - if (destination.num_components == 1) { - run_get_pass_kernel_processor_half_rgba(&kfilm_convert, - render_buffers, - buffer_params, - destination, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - float pixel; - processor(kfilm_convert, buffer, &pixel); - - pixel_rgba[0] = pixel; - pixel_rgba[1] = pixel; - pixel_rgba[2] = pixel; - pixel_rgba[3] = 1.0f; - }); - } - else if (destination.num_components == 3) { - run_get_pass_kernel_processor_half_rgba(&kfilm_convert, - render_buffers, - buffer_params, - destination, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - processor(kfilm_convert, buffer, pixel_rgba); - pixel_rgba[3] = 1.0f; - }); - } - else if (destination.num_components == 4) { - run_get_pass_kernel_processor_half_rgba( - &kfilm_convert, render_buffers, buffer_params, destination, processor); - } - } -} - -template inline void PassAccessorCPU::run_get_pass_kernel_processor_float( const KernelFilmConvert *kfilm_convert, const RenderBuffers *render_buffers, const BufferParams &buffer_params, const Destination &destination, - const Processor &processor) const + const CPUKernels::FilmConvertFunction func) const { + /* NOTE: No overlays are applied since they are not used for final renders. + * Can be supported via some sort of specialization to avoid code duplication. 
*/ + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; const int64_t pass_stride = buffer_params.pass_stride; @@ -112,21 +61,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float( const float *buffer = window_data + y * buffer_row_stride; float *pixel = destination.pixels + (y * buffer_params.width + destination.offset) * pixel_stride; - - for (int64_t x = 0; x < buffer_params.window_width; - ++x, buffer += pass_stride, pixel += pixel_stride) { - processor(kfilm_convert, buffer, pixel); - } + func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride); }); } -template inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( const KernelFilmConvert *kfilm_convert, const RenderBuffers *render_buffers, const BufferParams &buffer_params, const Destination &destination, - const Processor &processor) const + const CPUKernels::FilmConvertHalfRGBAFunction func) const { const int64_t pass_stride = buffer_params.pass_stride; const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride; @@ -141,16 +85,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) { const float *buffer = window_data + y * buffer_row_stride; half4 *pixel = dst_start + y * destination_stride; - for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) { - - float pixel_rgba[4]; - processor(kfilm_convert, buffer, pixel_rgba); - - film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); - - *pixel = float4_to_half4_display( - make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); - } + func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride); }); } @@ -163,8 +98,25 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( const BufferParams &buffer_params, \ const Destination &destination) const \ { 
\ - run_get_pass_kernel_processor( \ - render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \ + const CPUKernels &kernels = Device::get_cpu_kernels(); \ + KernelFilmConvert kfilm_convert; \ + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \ +\ + if (destination.pixels) { \ + run_get_pass_kernel_processor_float(&kfilm_convert, \ + render_buffers, \ + buffer_params, \ + destination, \ + kernels.film_convert_##pass); \ + } \ +\ + if (destination.pixels_half_rgba) { \ + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \ + render_buffers, \ + buffer_params, \ + destination, \ + kernels.film_convert_half_rgba_##pass); \ + } \ } /* Float (scalar) passes. */ diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h index 0313dc5bb0d..9ed38ab256e 100644 --- a/intern/cycles/integrator/pass_accessor_cpu.h +++ b/intern/cycles/integrator/pass_accessor_cpu.h @@ -16,6 +16,8 @@ #pragma once +#include "device/cpu/kernel.h" + #include "integrator/pass_accessor.h" CCL_NAMESPACE_BEGIN @@ -28,25 +30,19 @@ class PassAccessorCPU : public PassAccessor { using PassAccessor::PassAccessor; protected: - template - inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const; + inline void run_get_pass_kernel_processor_float( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const CPUKernels::FilmConvertFunction func) const; - template - inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert, - const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const; - - template - inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert, 
- const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const; + inline void run_get_pass_kernel_processor_half_rgba( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const CPUKernels::FilmConvertHalfRGBAFunction func) const; #define DECLARE_PASS_ACCESSOR(pass) \ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index 541a7eca02f..36ce2be9f6d 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device, DeviceScene *device_scene, bool *cancel_requested_flag) : PathTraceWork(device, film, device_scene, cancel_requested_flag), - kernels_(*(device->get_cpu_kernels())) + kernels_(Device::get_cpu_kernels()) { DCHECK_EQ(device->info.type, DEVICE_CPU); } diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp index 42cbf87f254..9ec530c81df 100644 --- a/intern/cycles/integrator/shader_eval.cpp +++ b/intern/cycles/integrator/shader_eval.cpp @@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device, device->get_cpu_kernel_thread_globals(kernel_thread_globals); /* Find required kernel function. */ - const CPUKernels &kernels = *(device->get_cpu_kernels()); + const CPUKernels &kernels = Device::get_cpu_kernels(); /* Simple parallel_for over all work items. 
*/ KernelShaderEvalInput *input_data = input.data(); diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index c49d7ca445a..6af8094b1ea 100644 --- a/intern/cycles/kernel/device/cpu/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -18,6 +18,7 @@ /* CPU Kernel Interface */ +#include "util/half.h" #include "util/types.h" #include "kernel/types.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h index 432ac5e15a9..2f9a3f7c59d 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -52,6 +52,37 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); #undef KERNEL_INTEGRATOR_INIT_FUNCTION #undef KERNEL_INTEGRATOR_SHADE_FUNCTION +#define KERNEL_FILM_CONVERT_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride); \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride); + +KERNEL_FILM_CONVERT_FUNCTION(depth) +KERNEL_FILM_CONVERT_FUNCTION(mist) +KERNEL_FILM_CONVERT_FUNCTION(sample_count) +KERNEL_FILM_CONVERT_FUNCTION(float) + +KERNEL_FILM_CONVERT_FUNCTION(light_path) +KERNEL_FILM_CONVERT_FUNCTION(float3) + +KERNEL_FILM_CONVERT_FUNCTION(motion) +KERNEL_FILM_CONVERT_FUNCTION(cryptomatte) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow) +KERNEL_FILM_CONVERT_FUNCTION(combined) +KERNEL_FILM_CONVERT_FUNCTION(float4) + +#undef KERNEL_FILM_CONVERT_FUNCTION + /* -------------------------------------------------------------------- * Shader evaluation. 
*/ diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 6df5d7787fc..1ea5002e300 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -47,8 +47,8 @@ # include "kernel/integrator/megakernel.h" # include "kernel/film/adaptive_sampling.h" -# include "kernel/film/read.h" # include "kernel/film/id_passes.h" +# include "kernel/film/read.h" # include "kernel/bake/bake.h" @@ -232,6 +232,85 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU * #endif } +/* -------------------------------------------------------------------- + * Film Convert: per-pass float and half4 RGBA kernels, each converting `width` pixels. + */ + +#ifdef KERNEL_STUB + +# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride) \ + { \ + STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \ + } \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride) \ + { \ + STUB_ASSERT(KERNEL_ARCH, film_convert_half_rgba_##name); \ + } + +#else + +# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride) \ + { \ + for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \ + film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \ + } \ + } \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride) \ + { 
\ + for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \ + float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \ + film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \ + if (is_float) { \ + pixel_rgba[1] = pixel_rgba[0]; \ + pixel_rgba[2] = pixel_rgba[0]; \ + } \ + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \ + *pixel = float4_to_half4_display( \ + make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \ + } \ + } + +#endif + +KERNEL_FILM_CONVERT_FUNCTION(depth, true) +KERNEL_FILM_CONVERT_FUNCTION(mist, true) +KERNEL_FILM_CONVERT_FUNCTION(sample_count, true) +KERNEL_FILM_CONVERT_FUNCTION(float, true) + +KERNEL_FILM_CONVERT_FUNCTION(light_path, false) +KERNEL_FILM_CONVERT_FUNCTION(float3, false) + +KERNEL_FILM_CONVERT_FUNCTION(motion, false) +KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false) +KERNEL_FILM_CONVERT_FUNCTION(combined, false) +KERNEL_FILM_CONVERT_FUNCTION(float4, false) + +#undef KERNEL_FILM_CONVERT_FUNCTION + #undef KERNEL_INVOKE #undef DEFINE_INTEGRATOR_KERNEL #undef DEFINE_INTEGRATOR_SHADE_KERNEL