forked from bartvdbraak/blender
Fix Cycles crash on some processors. We actually need SSSE3 support for this
new BVH traversal code, not just SSE3.
This commit is contained in:
parent
e6c54c26ae
commit
649dd6f648
@ -80,7 +80,7 @@ __device bool BVH_FUNCTION_NAME
|
||||
isect->u = 0.0f;
|
||||
isect->v = 0.0f;
|
||||
|
||||
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
#if defined(__KERNEL_SSSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
const __m128i shuffle_identity = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
const __m128i shuffle_swap = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
|
||||
|
||||
@ -111,10 +111,10 @@ __device bool BVH_FUNCTION_NAME
|
||||
{
|
||||
bool traverseChild0, traverseChild1;
|
||||
int nodeAddrChild1;
|
||||
float t = isect->t;
|
||||
|
||||
#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
#if !defined(__KERNEL_SSSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
/* Intersect two child bounding boxes, non-SSE version */
|
||||
float t = isect->t;
|
||||
|
||||
/* fetch node data */
|
||||
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
|
||||
@ -166,8 +166,8 @@ __device bool BVH_FUNCTION_NAME
|
||||
traverseChild1 = (c1max >= c1min);
|
||||
#endif
|
||||
|
||||
#else // __KERNEL_SSE3__
|
||||
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
|
||||
#else // __KERNEL_SSSE3__
|
||||
/* Intersect two child bounding boxes, SSSE3 version adapted from Embree */
|
||||
|
||||
/* fetch node data */
|
||||
__m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
|
||||
@ -190,14 +190,14 @@ __device bool BVH_FUNCTION_NAME
|
||||
traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
|
||||
traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
|
||||
#endif
|
||||
#endif // __KERNEL_SSE3__
|
||||
#endif // __KERNEL_SSSE3__
|
||||
|
||||
nodeAddr = __float_as_int(cnodes.x);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.y);
|
||||
|
||||
if(traverseChild0 && traverseChild1) {
|
||||
/* both children were intersected, push the farther one */
|
||||
#if !defined(__KERNEL_SSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
#if !defined(__KERNEL_SSSE3__) || FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
bool closestChild1 = (c1min < c0min);
|
||||
#else
|
||||
union { __m128 m128; float v[4]; } uminmax;
|
||||
@ -282,7 +282,7 @@ __device bool BVH_FUNCTION_NAME
|
||||
hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
|
||||
|
||||
/* shadow ray early termination */
|
||||
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
#if defined(__KERNEL_SSSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
if(hit) {
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE)
|
||||
return true;
|
||||
@ -315,7 +315,7 @@ __device bool BVH_FUNCTION_NAME
|
||||
bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
|
||||
#endif
|
||||
|
||||
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
#if defined(__KERNEL_SSSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
Psplat[0] = _mm_set_ps1(P.x);
|
||||
Psplat[1] = _mm_set_ps1(P.y);
|
||||
Psplat[2] = _mm_set_ps1(P.z);
|
||||
@ -359,7 +359,7 @@ __device bool BVH_FUNCTION_NAME
|
||||
bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
|
||||
#endif
|
||||
|
||||
#if defined(__KERNEL_SSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
#if defined(__KERNEL_SSSE3__) && !FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
Psplat[0] = _mm_set_ps1(P.x);
|
||||
Psplat[1] = _mm_set_ps1(P.y);
|
||||
Psplat[2] = _mm_set_ps1(P.z);
|
||||
|
@ -16,13 +16,14 @@
|
||||
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE3
|
||||
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
|
||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||
* is compiled without them for other CPUs. */
|
||||
|
||||
#ifdef WITH_OPTIMIZED_KERNEL
|
||||
|
||||
#define __KERNEL_SSE3__
|
||||
#define __KERNEL_SSSE3__
|
||||
|
||||
#include "kernel.h"
|
||||
#include "kernel_compat_cpu.h"
|
||||
|
@ -194,7 +194,7 @@ bool system_cpu_support_sse2()
|
||||
bool system_cpu_support_sse3()
|
||||
{
|
||||
CPUCapabilities& caps = system_cpu_capabilities();
|
||||
return caps.sse && caps.sse2 && caps.sse3;
|
||||
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
|
||||
}
|
||||
|
||||
#else
|
||||
|
@ -69,15 +69,19 @@
|
||||
#include <xmmintrin.h> /* SSE 1 */
|
||||
#include <emmintrin.h> /* SSE 2 */
|
||||
#include <pmmintrin.h> /* SSE 3 */
|
||||
#include <tmmintrin.h> /* SSE 3 */
|
||||
#include <tmmintrin.h> /* SSSE 3 */
|
||||
#include <smmintrin.h> /* SSE 4 */
|
||||
|
||||
#ifndef __KERNEL_SSE2__
|
||||
#define __KERNEL_SSE2__
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL_SSE3__
|
||||
#define __KERNEL_SSE3__
|
||||
#ifndef __KERNEL_SSSE3__
|
||||
#define __KERNEL_SSSE3__
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL_SSSE3__
|
||||
#define __KERNEL_SSSE3__
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL_SSE4__
|
||||
@ -86,7 +90,7 @@
|
||||
|
||||
#else
|
||||
|
||||
#if defined(__x86_64__) || defined(__KERNEL_SSE3__)
|
||||
#if defined(__x86_64__) || defined(__KERNEL_SSSE3__)
|
||||
|
||||
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
|
||||
* Since we can't avoid including <windows.h>, better only include that */
|
||||
@ -96,9 +100,11 @@
|
||||
#include <xmmintrin.h> /* SSE 1 */
|
||||
#include <emmintrin.h> /* SSE 2 */
|
||||
|
||||
#ifdef __KERNEL_SSE3__
|
||||
#ifdef __KERNEL_SSSE3__
|
||||
#include <pmmintrin.h> /* SSE 3 */
|
||||
#include <tmmintrin.h> /* SSE 3 */
|
||||
#endif
|
||||
#ifdef __KERNEL_SSSE3__
|
||||
#include <tmmintrin.h> /* SSSE 3 */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -110,10 +116,9 @@
|
||||
|
||||
#endif
|
||||
|
||||
/* int8_t, uint16_t, and friends */
|
||||
#ifndef _WIN32
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -486,7 +491,7 @@ __device_inline int4 make_int4(const float3& f)
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __KERNEL_SSE3__
|
||||
#ifdef __KERNEL_SSSE3__
|
||||
|
||||
/* SSE shuffle utility functions */
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user