Cycles: Implement SSE-optimized path of util_max_axis()

The idea here is to avoid if statements, which could cause branch
mispredictions.

Gives a small but measurable speedup of up to ~1%. Still nice :)

Inspired by Maxym Dmytrychenko, thanks!
This commit is contained in:
Sergey Sharybin 2016-10-25 13:54:17 +02:00
parent 3e71006448
commit af411d918e

@ -1629,6 +1629,14 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
ccl_device_inline int util_max_axis(float3 vec)
{
#ifdef __KERNEL_SSE__
__m128 a = shuffle<0,0,1,1>(vec.m128);
__m128 b = shuffle<1,2,2,1>(vec.m128);
__m128 c = _mm_cmpgt_ps(a, b);
int mask = _mm_movemask_ps(c) & 0x7;
static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0};
return tab[mask];
#else
if(vec.x > vec.y) {
if(vec.x > vec.z)
return 0;
@ -1641,6 +1649,7 @@ ccl_device_inline int util_max_axis(float3 vec)
else
return 2;
}
#endif
}
CCL_NAMESPACE_END