forked from bartvdbraak/blender
Cleanup: refactor float/half conversions for clarity
This commit is contained in:
parent
65dbeb1d81
commit
282516e53e
@ -115,7 +115,7 @@ static void pad_pixels(const BufferParams &buffer_params,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (destination.pixels_half_rgba) {
|
if (destination.pixels_half_rgba) {
|
||||||
const half one = float_to_half(1.0f);
|
const half one = float_to_half_display(1.0f);
|
||||||
half4 *pixel = destination.pixels_half_rgba + destination.offset;
|
half4 *pixel = destination.pixels_half_rgba + destination.offset;
|
||||||
|
|
||||||
for (size_t i = 0; i < size; i++, pixel++) {
|
for (size_t i = 0; i < size; i++, pixel++) {
|
||||||
|
@ -148,7 +148,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
|
|||||||
|
|
||||||
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
|
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
|
||||||
|
|
||||||
float4_store_half(&pixel->x,
|
*pixel = float4_to_half4_display(
|
||||||
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
|
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -72,12 +72,12 @@ template<typename T> struct TextureInterpolator {
|
|||||||
|
|
||||||
static ccl_always_inline float4 read(half4 r)
|
static ccl_always_inline float4 read(half4 r)
|
||||||
{
|
{
|
||||||
return half4_to_float4(r);
|
return half4_to_float4_image(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ccl_always_inline float4 read(half r)
|
static ccl_always_inline float4 read(half r)
|
||||||
{
|
{
|
||||||
float f = half_to_float(r);
|
float f = half_to_float_image(r);
|
||||||
return make_float4(f, f, f, 1.0f);
|
return make_float4(f, f, f, 1.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,6 +128,13 @@ __device__ half __float2half(const float f)
|
|||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ float __half2float(const half h)
|
||||||
|
{
|
||||||
|
float val;
|
||||||
|
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
/* Types */
|
/* Types */
|
||||||
|
|
||||||
#include "util/util_half.h"
|
#include "util/util_half.h"
|
||||||
|
@ -516,7 +516,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
|
|||||||
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
|
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
|
||||||
|
|
||||||
ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
|
ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
|
||||||
float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
|
*out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Common implementation for half4 destination and 3-channel input pass. */
|
/* Common implementation for half4 destination and 3-channel input pass. */
|
||||||
|
@ -120,6 +120,13 @@ __device__ half __float2half(const float f)
|
|||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ float __half2float(const half h)
|
||||||
|
{
|
||||||
|
float val;
|
||||||
|
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
/* Types */
|
/* Types */
|
||||||
|
|
||||||
#include "util/util_half.h"
|
#include "util/util_half.h"
|
||||||
|
@ -59,99 +59,16 @@ struct half4 {
|
|||||||
half x, y, z, w;
|
half x, y, z, w;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Conversion to/from half float for image textures
|
||||||
|
*
|
||||||
|
* Simplified float to half for fast sampling on processor without a native
|
||||||
|
* instruction, and eliminating any NaN and inf values. */
|
||||||
|
|
||||||
|
ccl_device_inline half float_to_half_image(float f)
|
||||||
|
{
|
||||||
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||||
|
return __float2half(f);
|
||||||
ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
|
|
||||||
{
|
|
||||||
h[0] = __float2half(f.x);
|
|
||||||
h[1] = __float2half(f.y);
|
|
||||||
h[2] = __float2half(f.z);
|
|
||||||
h[3] = __float2half(f.w);
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
|
|
||||||
{
|
|
||||||
|
|
||||||
# ifndef __KERNEL_SSE2__
|
|
||||||
for (int i = 0; i < 4; i++) {
|
|
||||||
/* optimized float to half for pixels:
|
|
||||||
* assumes no negative, no nan, no inf, and sets denormal to 0 */
|
|
||||||
union {
|
|
||||||
uint i;
|
|
||||||
float f;
|
|
||||||
} in;
|
|
||||||
in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f;
|
|
||||||
int x = in.i;
|
|
||||||
|
|
||||||
int absolute = x & 0x7FFFFFFF;
|
|
||||||
int Z = absolute + 0xC8000000;
|
|
||||||
int result = (absolute < 0x38800000) ? 0 : Z;
|
|
||||||
int rshift = (result >> 13);
|
|
||||||
|
|
||||||
h[i] = (rshift & 0x7FFF);
|
|
||||||
}
|
|
||||||
# else
|
|
||||||
/* same as above with SSE */
|
|
||||||
ssef x = min(max(load4f(f), 0.0f), 65504.0f);
|
|
||||||
|
|
||||||
# ifdef __KERNEL_AVX2__
|
|
||||||
ssei rpack = _mm_cvtps_ph(x, 0);
|
|
||||||
# else
|
|
||||||
ssei absolute = cast(x) & 0x7FFFFFFF;
|
|
||||||
ssei Z = absolute + 0xC8000000;
|
|
||||||
ssei result = andnot(absolute < 0x38800000, Z);
|
|
||||||
ssei rshift = (result >> 13) & 0x7FFF;
|
|
||||||
ssei rpack = _mm_packs_epi32(rshift, rshift);
|
|
||||||
# endif
|
|
||||||
|
|
||||||
_mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack));
|
|
||||||
# endif
|
|
||||||
}
|
|
||||||
|
|
||||||
# ifndef __KERNEL_HIP__
|
|
||||||
|
|
||||||
ccl_device_inline float half_to_float(half h)
|
|
||||||
{
|
|
||||||
float f;
|
|
||||||
|
|
||||||
*((int *)&f) = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
|
|
||||||
|
|
||||||
return f;
|
|
||||||
}
|
|
||||||
# else
|
|
||||||
|
|
||||||
ccl_device_inline float half_to_float(std::uint32_t a) noexcept
|
|
||||||
{
|
|
||||||
|
|
||||||
std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
|
|
||||||
|
|
||||||
std::uint32_t v = __float_as_uint(__uint_as_float(u) *
|
|
||||||
__uint_as_float(0x77800000U) /*0x1.0p+112f*/) +
|
|
||||||
0x38000000U;
|
|
||||||
|
|
||||||
u = (a & 0x7fff) != 0 ? v : u;
|
|
||||||
|
|
||||||
return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/;
|
|
||||||
}
|
|
||||||
|
|
||||||
# endif /* __KERNEL_HIP__ */
|
|
||||||
|
|
||||||
ccl_device_inline float4 half4_to_float4(half4 h)
|
|
||||||
{
|
|
||||||
float4 f;
|
|
||||||
|
|
||||||
f.x = half_to_float(h.x);
|
|
||||||
f.y = half_to_float(h.y);
|
|
||||||
f.z = half_to_float(h.z);
|
|
||||||
f.w = half_to_float(h.w);
|
|
||||||
|
|
||||||
return f;
|
|
||||||
}
|
|
||||||
|
|
||||||
ccl_device_inline half float_to_half(float f)
|
|
||||||
{
|
|
||||||
const uint u = __float_as_uint(f);
|
const uint u = __float_as_uint(f);
|
||||||
/* Sign bit, shifted to its position. */
|
/* Sign bit, shifted to its position. */
|
||||||
uint sign_bit = u & 0x80000000;
|
uint sign_bit = u & 0x80000000;
|
||||||
@ -170,9 +87,82 @@ ccl_device_inline half float_to_half(float f)
|
|||||||
value_bits = (exponent_bits == 0 ? 0 : value_bits);
|
value_bits = (exponent_bits == 0 ? 0 : value_bits);
|
||||||
/* Re-insert sign bit and return. */
|
/* Re-insert sign bit and return. */
|
||||||
return (value_bits | sign_bit);
|
return (value_bits | sign_bit);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ccl_device_inline float half_to_float_image(half h)
|
||||||
|
{
|
||||||
|
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||||
|
return __half2float(h);
|
||||||
|
#else
|
||||||
|
const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
|
||||||
|
return __int_as_float(x);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ccl_device_inline float4 half4_to_float4_image(const half4 h)
|
||||||
|
{
|
||||||
|
/* Unable to use because it gives different results half_to_float_image, can we
|
||||||
|
* modify float_to_half_image so the conversion results are identical? */
|
||||||
|
#if 0 /* defined(__KERNEL_AVX2__) */
|
||||||
|
/* CPU: AVX. */
|
||||||
|
__m128i x = _mm_castpd_si128(_mm_load_sd((const double *)&h));
|
||||||
|
return float4(_mm_cvtph_ps(x));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const float4 f = make_float4(half_to_float_image(h.x),
|
||||||
|
half_to_float_image(h.y),
|
||||||
|
half_to_float_image(h.z),
|
||||||
|
half_to_float_image(h.w));
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Conversion to half float texture for display.
|
||||||
|
*
|
||||||
|
* Simplified float to half for fast display texture conversion on processors
|
||||||
|
* without a native instruction. Assumes no negative, no NaN, no inf, and sets
|
||||||
|
* denormal to 0. */
|
||||||
|
|
||||||
|
ccl_device_inline half float_to_half_display(const float f)
|
||||||
|
{
|
||||||
|
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||||
|
return __float2half(f);
|
||||||
|
#else
|
||||||
|
const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
|
||||||
|
const int absolute = x & 0x7FFFFFFF;
|
||||||
|
const int Z = absolute + 0xC8000000;
|
||||||
|
const int result = (absolute < 0x38800000) ? 0 : Z;
|
||||||
|
const int rshift = (result >> 13);
|
||||||
|
return (rshift & 0x7FFF);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ccl_device_inline half4 float4_to_half4_display(const float4 f)
|
||||||
|
{
|
||||||
|
#ifdef __KERNEL_SSE2__
|
||||||
|
/* CPU: SSE and AVX. */
|
||||||
|
ssef x = min(max(load4f(f), 0.0f), 65504.0f);
|
||||||
|
# ifdef __KERNEL_AVX2__
|
||||||
|
ssei rpack = _mm_cvtps_ph(x, 0);
|
||||||
|
# else
|
||||||
|
ssei absolute = cast(x) & 0x7FFFFFFF;
|
||||||
|
ssei Z = absolute + 0xC8000000;
|
||||||
|
ssei result = andnot(absolute < 0x38800000, Z);
|
||||||
|
ssei rshift = (result >> 13) & 0x7FFF;
|
||||||
|
ssei rpack = _mm_packs_epi32(rshift, rshift);
|
||||||
|
# endif
|
||||||
|
half4 h;
|
||||||
|
_mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
|
||||||
|
return h;
|
||||||
|
#else
|
||||||
|
/* GPU and scalar fallback. */
|
||||||
|
const half4 h = {float_to_half_display(f.x),
|
||||||
|
float_to_half_display(f.y),
|
||||||
|
float_to_half_display(f.z),
|
||||||
|
float_to_half_display(f.w)};
|
||||||
|
return h;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ template<> inline float util_image_cast_to_float(uint16_t value)
|
|||||||
}
|
}
|
||||||
template<> inline float util_image_cast_to_float(half value)
|
template<> inline float util_image_cast_to_float(half value)
|
||||||
{
|
{
|
||||||
return half_to_float(value);
|
return half_to_float_image(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Cast float value to output pixel type. */
|
/* Cast float value to output pixel type. */
|
||||||
@ -88,7 +88,7 @@ template<> inline uint16_t util_image_cast_from_float(float value)
|
|||||||
}
|
}
|
||||||
template<> inline half util_image_cast_from_float(float value)
|
template<> inline half util_image_cast_from_float(float value)
|
||||||
{
|
{
|
||||||
return float_to_half(value);
|
return float_to_half_image(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
Loading…
Reference in New Issue
Block a user