From 360cf8393a22f7beec81340ddd0aaa3bfd22275c Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Thu, 13 Apr 2017 15:07:07 +0200 Subject: [PATCH] Cycles: Make vectorized types constructor from register explicit This is not a cheap operation which we don't want to happen silently. --- intern/cycles/kernel/geom/geom_curve.h | 2 +- intern/cycles/util/util_math.h | 79 +++++++++++++++----------- intern/cycles/util/util_types.h | 8 +-- 3 files changed, 51 insertions(+), 38 deletions(-) diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 8888000f0e6..5c3b0ee3c15 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -565,7 +565,7 @@ ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, I r_ext = mw_extension + r_curr; #ifdef __KERNEL_SSE__ const float3 p_curr_sq = p_curr * p_curr; - const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)); + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); float d = dxxx.x; #else float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 47fa181af83..77781ed4574 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -606,7 +606,7 @@ ccl_device_inline float3 normalize(const float3& a) { #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); - return _mm_div_ps(a.m128, norm); + return float3(_mm_div_ps(a.m128, norm)); #else return a/len(a); #endif @@ -657,7 +657,7 @@ ccl_device_inline bool operator!=(const float3& a, const float3& b) ccl_device_inline float3 min(const float3& a, const float3& b) { #ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); + return float3(_mm_min_ps(a.m128, b.m128)); #else return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); #endif @@ -666,7 +666,7 @@ 
ccl_device_inline float3 min(const float3& a, const float3& b) ccl_device_inline float3 max(const float3& a, const float3& b) { #ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); + return float3(_mm_max_ps(a.m128, b.m128)); #else return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); #endif @@ -681,7 +681,7 @@ ccl_device_inline float3 fabs(const float3& a) { #ifdef __KERNEL_SSE__ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); + return float3(_mm_and_ps(a.m128, mask)); #else return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); #endif @@ -714,8 +714,9 @@ ccl_device_inline void print_float3(const char *label, const float3& a) ccl_device_inline float3 rcp(const float3& a) { #ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + const float4 r(_mm_rcp_ps(a.m128)); + return float3(_mm_sub_ps(_mm_add_ps(r, r), + _mm_mul_ps(_mm_mul_ps(r, r), a))); #else return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); #endif @@ -769,26 +770,29 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b) #ifdef __KERNEL_SSE__ -template __forceinline const float4 shuffle(const float4& b) +template +__forceinline const float4 shuffle(const float4& b) { - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))); + return float4(_mm_castsi128_ps( + _mm_shuffle_epi32(_mm_castps_si128(b), + _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); } #if defined(__KERNEL_SSE3__) template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) { - return _mm_moveldup_ps(b); + return float4(_mm_moveldup_ps(b)); } template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) { - return _mm_movehdup_ps(b); + return float4(_mm_movehdup_ps(b)); } #endif template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) { - return 
_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); + return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); } #endif @@ -799,7 +803,7 @@ ccl_device_inline float4 operator-(const float4& a) { #ifdef __KERNEL_SSE__ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); + return float4(_mm_xor_ps(a.m128, mask)); #else return make_float4(-a.x, -a.y, -a.z, -a.w); #endif @@ -808,7 +812,7 @@ ccl_device_inline float4 operator-(const float4& a) ccl_device_inline float4 operator*(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_mul_ps(a.m128, b.m128); + return float4(_mm_mul_ps(a.m128, b.m128)); #else return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); #endif @@ -831,8 +835,9 @@ ccl_device_inline float4 operator*(float f, const float4& a) ccl_device_inline float4 rcp(const float4& a) { #ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + float4 r(_mm_rcp_ps(a.m128)); + return float4(_mm_sub_ps(_mm_add_ps(r, r), + _mm_mul_ps(_mm_mul_ps(r, r), a))); #else return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); #endif @@ -856,7 +861,7 @@ ccl_device_inline float4 operator/(const float4& a, const float4& b) ccl_device_inline float4 operator+(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_add_ps(a.m128, b.m128); + return float4(_mm_add_ps(a.m128, b.m128)); #else return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); #endif @@ -865,7 +870,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b) ccl_device_inline float4 operator-(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_sub_ps(a.m128, b.m128); + return float4(_mm_sub_ps(a.m128, b.m128)); #else return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); #endif @@ -889,7 +894,8 @@ ccl_device_inline float4 operator/=(float4& a, float f) ccl_device_inline int4 operator<(const float4& a, const float4& b) { #ifdef 
__KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */ + /* TODO(sergey): avoid cvt. */ + return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128))); #else return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); #endif @@ -898,7 +904,8 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b) ccl_device_inline int4 operator>=(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */ + /* TODO(sergey): avoid cvt. */ + return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128))); #else return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); #endif @@ -907,7 +914,8 @@ ccl_device_inline int4 operator>=(const float4& a, const float4& b) ccl_device_inline int4 operator<=(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */ + /* TODO(sergey): avoid cvt. */ + return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128))); #else return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); #endif @@ -943,8 +951,9 @@ ccl_device_inline bool is_zero(const float4& a) ccl_device_inline float reduce_add(const float4& a) { #ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */ + float4 h(shuffle<1,0,3,2>(a) + a); + /* TODO(sergey): Investigate efficiency. 
*/ + return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); #else return ((a.x + a.y) + (a.z + a.w)); #endif @@ -974,7 +983,7 @@ ccl_device_inline float4 safe_normalize(const float4& a) ccl_device_inline float4 min(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); + return float4(_mm_min_ps(a.m128, b.m128)); #else return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); #endif @@ -983,7 +992,7 @@ ccl_device_inline float4 min(const float4& a, const float4& b) ccl_device_inline float4 max(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); + return float4(_mm_max_ps(a.m128, b.m128)); #else return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); #endif @@ -996,7 +1005,9 @@ ccl_device_inline float4 max(const float4& a, const float4& b) ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */ + /* TODO(sergey): avoid cvt. */ + return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), + _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); #else return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? 
a.w: b.w); #endif @@ -1079,7 +1090,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b) ccl_device_inline int3 min(int3 a, int3 b) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); + return int3(_mm_min_epi32(a.m128, b.m128)); #else return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); #endif @@ -1088,7 +1099,7 @@ ccl_device_inline int3 min(int3 a, int3 b) ccl_device_inline int3 max(int3 a, int3 b) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); + return int3(_mm_max_epi32(a.m128, b.m128)); #else return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); #endif @@ -1130,7 +1141,7 @@ ccl_device_inline void print_int3(const char *label, const int3& a) ccl_device_inline int4 operator+(const int4& a, const int4& b) { #ifdef __KERNEL_SSE__ - return _mm_add_epi32(a.m128, b.m128); + return int4(_mm_add_epi32(a.m128, b.m128)); #else return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); #endif @@ -1144,7 +1155,7 @@ ccl_device_inline int4 operator+=(int4& a, const int4& b) ccl_device_inline int4 operator>>(const int4& a, int i) { #ifdef __KERNEL_SSE__ - return _mm_srai_epi32(a.m128, i); + return int4(_mm_srai_epi32(a.m128, i)); #else return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); #endif @@ -1153,7 +1164,7 @@ ccl_device_inline int4 operator>>(const int4& a, int i) ccl_device_inline int4 min(int4 a, int4 b) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); + return int4(_mm_min_epi32(a.m128, b.m128)); #else return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); #endif @@ -1162,7 +1173,7 @@ ccl_device_inline int4 min(int4 a, int4 b) ccl_device_inline int4 max(int4 a, int4 b) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); + return int4(_mm_max_epi32(a.m128, b.m128)); #else return make_int4(max(a.x, b.x), max(a.y, b.y), 
max(a.z, b.z), max(a.w, b.w)); #endif @@ -1176,8 +1187,10 @@ ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx) ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) { #ifdef __KERNEL_SSE__ - __m128 m = _mm_cvtepi32_ps(mask); - return _mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */ + const __m128 m = _mm_cvtepi32_ps(mask); + /* TODO(sergey): avoid cvt. */ + return int4(_mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), + _mm_andnot_ps(m, _mm_castsi128_ps(b))))); #else return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); #endif diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index aa22f6a2c57..5a8f11694a7 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -182,7 +182,7 @@ struct ccl_try_align(16) int3 { }; __forceinline int3() {} - __forceinline int3(const __m128i& a) : m128(a) {} + __forceinline explicit int3(const __m128i& a) : m128(a) {} __forceinline operator const __m128i&(void) const { return m128; } __forceinline operator __m128i&(void) { return m128; } @@ -204,7 +204,7 @@ struct ccl_try_align(16) int4 { }; __forceinline int4() {} - __forceinline int4(const __m128i& a) : m128(a) {} + __forceinline explicit int4(const __m128i& a) : m128(a) {} __forceinline operator const __m128i&(void) const { return m128; } __forceinline operator __m128i&(void) { return m128; } @@ -254,7 +254,7 @@ struct ccl_try_align(16) float3 { }; __forceinline float3() {} - __forceinline float3(const __m128& a) : m128(a) {} + __forceinline explicit float3(const __m128& a) : m128(a) {} __forceinline operator const __m128&(void) const { return m128; } __forceinline operator __m128&(void) { return m128; } @@ -276,7 +276,7 @@ struct ccl_try_align(16) float4 { }; __forceinline float4() {} - __forceinline float4(const __m128& a) : m128(a) {} + 
__forceinline explicit float4(const __m128& a) : m128(a) {} __forceinline operator const __m128&(void) const { return m128; } __forceinline operator __m128&(void) { return m128; }