Cycles: Implement SSE-optimized path of util_max_axis()

The idea here is to avoid if statements, which can cause branch mispredictions. This gives a small but measurable speedup of up to ~1%. Still nice :) Inspired by Maxym Dmytrychenko, thanks!
2016-10-25 13:54:17 +02:00 · 2016-10-25 13:54:17 +02:00 · af411d918e
commit af411d918e
parent 3e71006448
1 changed files with 9 additions and 0 deletions
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -1629,6 +1629,14 @@ ccl_device_inline float2 map_to_sphere(const float3 co)

 ccl_device_inline int util_max_axis(float3 vec)
 {
+#ifdef __KERNEL_SSE__
+	__m128 a = shuffle<0,0,1,1>(vec.m128);
+	__m128 b = shuffle<1,2,2,1>(vec.m128);
+	__m128 c = _mm_cmpgt_ps(a, b);
+	int mask = _mm_movemask_ps(c) & 0x7;
+	static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0};
+	return tab[mask];
+#else
 	if(vec.x > vec.y) {
 		if(vec.x > vec.z)
 			return 0;
@@ -1641,6 +1649,7 @@ ccl_device_inline int util_max_axis(float3 vec)
 		else
 			return 2;
 	}
+#endif
 }

 CCL_NAMESPACE_END