From 9937d5379ca936b4ba93534185477fa7e529181c Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Tue, 16 Nov 2021 14:03:59 +0100
Subject: [PATCH] Cycles: add packed_float3 type for storage

Introduce a packed_float3 type for smaller storage, which is exactly 3
floats instead of 4. For computation float3 is still used, since it can
take advantage of SIMD instructions.

Ref T92212

Differential Revision: https://developer.blender.org/D13243
---
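Usage note: the intended pattern is that kernel structs keep packed_float3 members for
storage, and values are converted to float3 whenever arithmetic is done so the CPU can
keep using SIMD. Below is a minimal sketch of that pattern, assuming only the conversions
and operators added by this patch; ExampleVertex and example_offset_vertex are hypothetical
names for illustration, not actual Cycles code.

  /* Hypothetical storage struct: the packed_float3 member occupies 12 bytes instead of 16. */
  struct ExampleVertex {
    packed_float3 P;
    float radius;
  };

  ccl_device_inline void example_offset_vertex(ExampleVertex &vert, const float3 &offset)
  {
    /* Convert to float3 for arithmetic (SIMD on the CPU), mirroring the float3(...)
     * conversion used for INTEGRATOR_STATE(state, path, throughput) in accumulate.h. */
    const float3 P = float3(vert.P) + offset;

    /* Assign back to packed storage; on the CPU this goes through the packed_float3
     * operator= added in util/types_float3.h. */
    vert.P = P;
  }
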
 intern/cycles/device/memory.cpp               |   2 +-
 intern/cycles/device/memory.h                 | 125 ++++++++----
 intern/cycles/kernel/device/cuda/compat.h     |   1 +
 intern/cycles/kernel/device/hip/compat.h      |   1 +
 intern/cycles/kernel/device/metal/compat.h    |   1 +
 intern/cycles/kernel/device/optix/compat.h    |   1 +
 intern/cycles/kernel/film/accumulate.h        |   2 +-
 .../cycles/kernel/integrator/shade_surface.h  |  13 +-
 .../cycles/kernel/integrator/shade_volume.h   |   7 +-
 .../kernel/integrator/shadow_state_template.h |  12 +-
 .../cycles/kernel/integrator/state_template.h |  18 +--
 intern/cycles/util/defines.h                  |   2 +
 intern/cycles/util/math_float3.h              |  26 ++++
 intern/cycles/util/types_float3.h             |  35 +++++
 14 files changed, 147 insertions(+), 99 deletions(-)

diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp
index f162b00d9f7..259bc2e5334 100644
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
 
 device_memory::device_memory(Device *device, const char *name, MemoryType type)
     : data_type(device_type_traits<uchar>::data_type),
-      data_elements(device_type_traits<uchar>::num_elements_cpu),
+      data_elements(device_type_traits<uchar>::num_elements),
       data_size(0),
       device_size(0),
       data_width(0),
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index 281c54cc6a5..b2aa88b4e97 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -81,155 +81,140 @@ static constexpr size_t datatype_size(DataType datatype)
 
 template<typename T> struct device_type_traits {
   static const DataType data_type = TYPE_UNKNOWN;
-  static const size_t num_elements_cpu = sizeof(T);
-  static const size_t num_elements_gpu = sizeof(T);
+  static const size_t num_elements = sizeof(T);
 };
 
 template<> struct device_type_traits<uchar> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uchar) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar2> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uchar2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar3> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uchar3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar4> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uchar4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint2> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uint2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint3> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uint3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint4> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uint4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(int) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int2> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(int2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int3> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int4> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(float) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float2> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(float2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float3> {
+  /* float3 has a different size depending on the device, so it can't be used for
+   * interchanging memory between CPU and GPU.
+   *
+   * Leave body empty to trigger a compile error if used. */
+};
+
+template<> struct device_type_traits<packed_float3> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(packed_float3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float4> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(float4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<half> {
   static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(half) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<ushort4> {
   static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(ushort4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint16_t> {
   static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint16_t) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<half4> {
   static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(half4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint64_t> {
   static const DataType data_type = TYPE_UINT64;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint64_t) == num_elements * datatype_size(data_type));
 };
 
 /* Device Memory
@@ -320,9 +305,7 @@ template<typename T> class device_only_memory : public device_memory {
       : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
   {
     data_type = device_type_traits<T>::data_type;
-    data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
-                                          device_type_traits<T>::num_elements_gpu,
-                        1);
+    data_elements = max(device_type_traits<T>::num_elements, 1);
   }
 
   device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -378,15 +361,11 @@ template<typename T> class device_only_memory : public device_memory {
 
 template<typename T> class device_vector : public device_memory {
  public:
-  /* Can only use this for types that have the same size on CPU and GPU. */
-  static_assert(device_type_traits<T>::num_elements_cpu ==
-                device_type_traits<T>::num_elements_gpu);
-
   device_vector(Device *device, const char *name, MemoryType type)
       : device_memory(device, name, type)
   {
     data_type = device_type_traits<T>::data_type;
-    data_elements = device_type_traits<T>::num_elements_cpu;
+    data_elements = device_type_traits<T>::num_elements;
 
     modified = true;
     need_realloc_ = true;
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 2feebad074f..ba3aefa43bf 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -52,6 +52,7 @@ typedef unsigned long long uint64_t;
 #endif
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
 #define ccl_static_constant __constant__
 #define ccl_device_constant __constant__ __device__
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index fb07602539b..b58179e12ff 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,6 +45,7 @@ typedef unsigned long long uint64_t;
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
 #define ccl_static_constant __constant__
 #define ccl_device_constant __constant__ __device__
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 4a2c39d90fd..19358e063d8 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -42,6 +42,7 @@ using namespace metal;
 #define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device __attribute__((noinline))
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global device
 #define ccl_static_constant static constant constexpr
 #define ccl_device_constant constant
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index 482b921a1a8..c7a7be7309a 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -49,6 +49,7 @@ typedef unsigned long long uint64_t;
     __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h
index 6bdf1f2f3a1..c9303088e3f 100644
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -552,7 +552,7 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
                                                const bool is_transparent_background_ray,
ccl_global float *ccl_restrict render_buffer) { - float3 contribution = INTEGRATOR_STATE(state, path, throughput) * L; + float3 contribution = float3(INTEGRATOR_STATE(state, path, throughput)) * L; kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1); ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer); diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h index 2793dd3e218..2c478784bc9 100644 --- a/intern/cycles/kernel/integrator/shade_surface.h +++ b/intern/cycles/kernel/integrator/shade_surface.h @@ -195,12 +195,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg, const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval); if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { - const float3 pass_diffuse_weight = (bounce == 0) ? - bsdf_eval_pass_diffuse_weight(&bsdf_eval) : - INTEGRATOR_STATE(state, path, pass_diffuse_weight); - const float3 pass_glossy_weight = (bounce == 0) ? - bsdf_eval_pass_glossy_weight(&bsdf_eval) : - INTEGRATOR_STATE(state, path, pass_glossy_weight); + const packed_float3 pass_diffuse_weight = + (bounce == 0) ? packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval)) : + INTEGRATOR_STATE(state, path, pass_diffuse_weight); + const packed_float3 pass_glossy_weight = (bounce == 0) ? + packed_float3( + bsdf_eval_pass_glossy_weight(&bsdf_eval)) : + INTEGRATOR_STATE(state, path, pass_glossy_weight); INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight; INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight; } diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h index c5a80eb336f..141433c37a8 100644 --- a/intern/cycles/kernel/integrator/shade_volume.h +++ b/intern/cycles/kernel/integrator/shade_volume.h @@ -792,9 +792,10 @@ ccl_device_forceinline void integrate_volume_direct_light( const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval); if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { - const float3 pass_diffuse_weight = (bounce == 0) ? - one_float3() : - INTEGRATOR_STATE(state, path, pass_diffuse_weight); + const packed_float3 pass_diffuse_weight = (bounce == 0) ? + packed_float3(one_float3()) : + INTEGRATOR_STATE( + state, path, pass_diffuse_weight); INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight; INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3(); } diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h index 667ab88c8c4..625a429d3db 100644 --- a/intern/cycles/kernel/integrator/shadow_state_template.h +++ b/intern/cycles/kernel/integrator/shadow_state_template.h @@ -40,15 +40,15 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T /* enum PathRayFlag */ KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING) /* Throughput. */ -KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING) /* Throughput for shadow pass. 
*/ KERNEL_STRUCT_MEMBER(shadow_path, - float3, + packed_float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE) /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */ -KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) -KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) /* Number of intersections found by ray-tracing. */ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_END(shadow_path) @@ -56,8 +56,8 @@ KERNEL_STRUCT_END(shadow_path) /********************************** Shadow Ray *******************************/ KERNEL_STRUCT_BEGIN(shadow_ray) -KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h index 3299f973713..bd18a7498a3 100644 --- a/intern/cycles/kernel/integrator/state_template.h +++ b/intern/cycles/kernel/integrator/state_template.h @@ -59,12 +59,12 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING) /* Continuation probability for path termination. */ KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING) /* Throughput. */ -KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING) /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */ -KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) -KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) /* Denoising. */ -KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING) +KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING) /* Shader sorting. */ /* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? 
*/ KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING) @@ -73,8 +73,8 @@ KERNEL_STRUCT_END(path) /************************************** Ray ***********************************/ KERNEL_STRUCT_BEGIN(ray) -KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING) @@ -96,10 +96,10 @@ KERNEL_STRUCT_END(isect) /*************** Subsurface closure state for subsurface kernel ***************/ KERNEL_STRUCT_BEGIN(subsurface) -KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE) -KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE) KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE) -KERNEL_STRUCT_MEMBER(subsurface, float3, Ng, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE) KERNEL_STRUCT_END(subsurface) /********************************** Volume Stack ******************************/ diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h index a778bef52b2..edc36b14745 100644 --- a/intern/cycles/util/defines.h +++ b/intern/cycles/util/defines.h @@ -44,6 +44,7 @@ # if defined(_WIN32) && !defined(FREE_WINDOWS) # define ccl_device_inline static __forceinline # define ccl_device_forceinline static __forceinline +# define ccl_device_inline_method __forceinline # define ccl_align(...) __declspec(align(__VA_ARGS__)) # ifdef __KERNEL_64_BIT__ # define ccl_try_align(...) __declspec(align(__VA_ARGS__)) @@ -58,6 +59,7 @@ # else /* _WIN32 && !FREE_WINDOWS */ # define ccl_device_inline static inline __attribute__((always_inline)) # define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_device_inline_method __attribute__((always_inline)) # define ccl_align(...) 
__attribute__((aligned(__VA_ARGS__))) # ifndef FREE_WINDOWS64 # define __forceinline inline __attribute__((always_inline)) diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index 81550c5d03c..031aac1b5d4 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -222,6 +222,32 @@ ccl_device_inline float3 operator/=(float3 &a, float f) return a = a * invf; } +#if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__)) +ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b) +{ + a = float3(a) * b; + return a; +} + +ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f) +{ + a = float3(a) * f; + return a; +} + +ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b) +{ + a = float3(a) / b; + return a; +} + +ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f) +{ + a = float3(a) / f; + return a; +} +#endif + ccl_device_inline bool operator==(const float3 &a, const float3 &b) { #ifdef __KERNEL_SSE__ diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h index f990367e7b8..fc0f35fa87f 100644 --- a/intern/cycles/util/types_float3.h +++ b/intern/cycles/util/types_float3.h @@ -55,6 +55,41 @@ ccl_device_inline float3 make_float3(float x, float y, float z); ccl_device_inline void print_float3(const char *label, const float3 &a); #endif /* __KERNEL_GPU__ */ +/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the + * CPU SIMD instructions can be used. */ +#if defined(__KERNEL_METAL__) +/* Metal has native packed_float3. */ +#elif defined(__KERNEL_CUDA__) +/* CUDA float3 is already packed. */ +typedef float3 packed_float3; +#else +/* HIP float3 is not packed (https://github.com/ROCm-Developer-Tools/HIP/issues/706). */ +struct packed_float3 { + ccl_device_inline_method packed_float3(){}; + + ccl_device_inline_method packed_float3(const float3 &a) : x(a.x), y(a.y), z(a.z) + { + } + + ccl_device_inline_method operator float3() const + { + return make_float3(x, y, z); + } + + ccl_device_inline_method packed_float3 &operator=(const float3 &a) + { + x = a.x; + y = a.y; + z = a.z; + return *this; + } + + float x, y, z; +}; +#endif + +static_assert(sizeof(packed_float3) == 12); + CCL_NAMESPACE_END #endif /* __UTIL_TYPES_FLOAT3_H__ */