Cycles: Implement unaligned nodes BVH traversal

This commit implements traversal of unaligned BVH nodes. QBVH traversal is fully SIMD optimized and calculates the orientation for all 4 children at a time; the regular BVH traversal could probably be optimized a bit more.
2016-07-07 12:23:13 +02:00 · 2016-07-07 12:23:13 +02:00 · a08e2179f1
commit a08e2179f1
parent b03e66e75f
14 changed files with 1574 additions and 533 deletions
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@ -141,6 +141,7 @@ set(SRC_GEOM_HEADERS
 	geom/geom.h
 	geom/geom_attribute.h
 	geom/geom_bvh.h
+	geom/geom_bvh_nodes.h
 	geom/geom_bvh_shadow.h
 	geom/geom_bvh_subsurface.h
 	geom/geom_bvh_traversal.h
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@ -77,6 +77,8 @@ CCL_NAMESPACE_BEGIN

 /* Regular BVH traversal */

+#include "geom_bvh_nodes.h"
+
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
 #include "geom_bvh_traversal.h"
@ -109,13 +111,13 @@ CCL_NAMESPACE_BEGIN

 #if defined(__SUBSURFACE__)
 #  define BVH_FUNCTION_NAME bvh_intersect_subsurface
-#  define BVH_FUNCTION_FEATURES 0
+#  define BVH_FUNCTION_FEATURES BVH_HAIR
 #  include "geom_bvh_subsurface.h"
 #endif

 #if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
-#  define BVH_FUNCTION_FEATURES BVH_MOTION
+#  define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
 #  include "geom_bvh_subsurface.h"
 #endif

@ -123,19 +125,19 @@ CCL_NAMESPACE_BEGIN

 #if defined(__VOLUME__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume
-#  define BVH_FUNCTION_FEATURES 0
+#  define BVH_FUNCTION_FEATURES BVH_HAIR
 #  include "geom_bvh_volume.h"
 #endif

 #if defined(__VOLUME__) && defined(__INSTANCING__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
 #  include "geom_bvh_volume.h"
 #endif

 #if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
 #  include "geom_bvh_volume.h"
 #endif

@ -175,19 +177,19 @@ CCL_NAMESPACE_BEGIN

 #if defined(__VOLUME_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all
-#  define BVH_FUNCTION_FEATURES 0
+#  define BVH_FUNCTION_FEATURES BVH_HAIR
 #  include "geom_bvh_volume_all.h"
 #endif

 #if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
 #  include "geom_bvh_volume_all.h"
 #endif

 #if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
 #  include "geom_bvh_volume_all.h"
 #endif

--- a/intern/cycles/kernel/geom/geom_bvh_nodes.h
+++ b/intern/cycles/kernel/geom/geom_bvh_nodes.h
@ -0,0 +1,659 @@
+/*
+ * Copyright 2011-2016, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO(sergey): Look into avoiding the use of a full Transform; a 3x3 matrix
+// and a 3-vector might be faster.
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+                                                           int nodeAddr,
+                                                           int child)
+{
+	Transform space;
+	const int child_addr = nodeAddr + child * 3;
+	space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);
+	space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
+	space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
+	space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
+	return space;
+}
+
+#if !defined(__KERNEL_SSE2__)
+ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+                                                 const float3 P,
+                                                 const float3 idir,
+                                                 const float t,
+                                                 const int nodeAddr,
+                                                 const uint visibility,
+                                                 float *dist)
+{
+
+	/* fetch node data */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
+	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
+	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
+
+	/* intersect ray against child nodes */
+	float c0lox = (node0.x - P.x) * idir.x;
+	float c0hix = (node0.z - P.x) * idir.x;
+	float c0loy = (node1.x - P.y) * idir.y;
+	float c0hiy = (node1.z - P.y) * idir.y;
+	float c0loz = (node2.x - P.z) * idir.z;
+	float c0hiz = (node2.z - P.z) * idir.z;
+	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+	float c1lox = (node0.y - P.x) * idir.x;
+	float c1hix = (node0.w - P.x) * idir.x;
+	float c1loy = (node1.y - P.y) * idir.y;
+	float c1hiy = (node1.w - P.y) * idir.y;
+	float c1loz = (node2.y - P.z) * idir.z;
+	float c1hiz = (node2.w - P.z) * idir.z;
+	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+	dist[0] = c0min;
+	dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+	return ((c0max >= c0min)? 1: 0) |
+	       ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+/* Intersect a ray with the two child boxes of an axis-aligned BVH node and
+ * widen the entry/exit interval of every child whose flags have
+ * PATH_RAY_CURVE set.
+ *
+ * P, idir     - ray origin and per-axis multiplier applied to (bound - P).
+ * t           - upper clamp for the exit distances.
+ * difl/extmax - relative and absolute widening of the interval; when difl is
+ *               zero no widening is applied at all.
+ * nodeAddr    - index of the node's first float4 in __bvh_nodes.
+ * visibility  - bits tested against each child's flags when
+ *               __VISIBILITY_FLAG__ is defined.
+ * dist        - dist[0] / dist[1] receive the entry distance of child 0 / 1.
+ *
+ * Returns a bit mask: bit 0 set when child 0 is hit (and visible), bit 1 set
+ * when child 1 is hit (and visible).
+ */
+ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+                                                        const float3 P,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const float difl,
+                                                        const float extmax,
+                                                        const int nodeAddr,
+                                                        const uint visibility,
+                                                        float *dist)
+{
+
+	/* fetch node data */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
+	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
+	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
+
+	/* intersect ray against child nodes */
+	float c0lox = (node0.x - P.x) * idir.x;
+	float c0hix = (node0.z - P.x) * idir.x;
+	float c0loy = (node1.x - P.y) * idir.y;
+	float c0hiy = (node1.z - P.y) * idir.y;
+	float c0loz = (node2.x - P.z) * idir.z;
+	float c0hiz = (node2.z - P.z) * idir.z;
+	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+	float c1lox = (node0.y - P.x) * idir.x;
+	float c1hix = (node0.w - P.x) * idir.x;
+	float c1loy = (node1.y - P.y) * idir.y;
+	float c1hiy = (node1.w - P.y) * idir.y;
+	float c1loz = (node2.y - P.z) * idir.z;
+	float c1hiz = (node2.w - P.z) * idir.z;
+	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+	if(difl != 0.0f) {
+		float hdiff = 1.0f + difl;
+		float ldiff = 1.0f - difl;
+		/* The per-child flag words are cnodes.x (child 0) and cnodes.y
+		 * (child 1): the same words the visibility test below reads and the
+		 * same ones the SSE2 variant of this function checks.
+		 */
+		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
+			c0min = max(ldiff * c0min, c0min - extmax);
+			c0max = min(hdiff * c0max, c0max + extmax);
+		}
+		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
+			c1min = max(ldiff * c1min, c1min - extmax);
+			c1max = min(hdiff * c1max, c1max + extmax);
+		}
+	}
+
+	dist[0] = c0min;
+	dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+	return ((c0max >= c0min)? 1: 0) |
+	       ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child(
+        KernelGlobals *kg,
+        const float3 P,
+        const float3 dir,
+        const float t,
+        int nodeAddr,
+        int child,
+        float *dist)
+{
+	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+	float3 aligned_dir = transform_direction(&space, dir);
+	float3 aligned_P = transform_point(&space, P);
+	float3 nrdir = -bvh_inverse_direction(aligned_dir);
+	float3 tLowerXYZ = aligned_P * nrdir;
+	float3 tUpperXYZ = tLowerXYZ - nrdir;
+	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
+	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
+	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
+	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
+	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
+	*dist = tNear;
+	return tNear <= tFar;
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+        KernelGlobals *kg,
+        const float3 P,
+        const float3 dir,
+        const float t,
+        const float difl,
+        const float /*extmax*/,
+        int nodeAddr,
+        int child,
+        float *dist)
+{
+	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+	float3 aligned_dir = transform_direction(&space, dir);
+	float3 aligned_P = transform_point(&space, P);
+	float3 nrdir = -bvh_inverse_direction(aligned_dir);
+	float3 tLowerXYZ = aligned_P * nrdir;
+	float3 tUpperXYZ = tLowerXYZ - nrdir;
+	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
+	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
+	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
+	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
+	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
+	*dist = tNear;
+	if(difl != 0.0f) {
+		/* TODO(sergey): Same as for QBVH, needs a proper use. */
+		const float round_down = 1.0f - difl;
+		const float round_up = 1.0f + difl;
+		return round_down*tNear <= round_up*tFar;
+	}
+	else {
+		return tNear <= tFar;
+	}
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+                                                   const float3 P,
+                                                   const float3 dir,
+                                                   const float3 idir,
+                                                   const float t,
+                                                   const int nodeAddr,
+                                                   const uint visibility,
+                                                   float *dist)
+{
+	int mask = 0;
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.x) & visibility))
+#endif
+		{
+			mask |= 1;
+		}
+	}
+	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.y) & visibility))
+#endif
+		{
+			mask |= 2;
+		}
+	}
+	return mask;
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+                                                          const float3 P,
+                                                          const float3 dir,
+                                                          const float3 idir,
+                                                          const float t,
+                                                          const float difl,
+                                                          const float extmax,
+                                                          const int nodeAddr,
+                                                          const uint visibility,
+                                                          float *dist)
+{
+	int mask = 0;
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, extmax, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.x) & visibility))
+#endif
+		{
+			mask |= 1;
+		}
+	}
+	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, extmax, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.y) & visibility))
+#endif
+		{
+			mask |= 2;
+		}
+	}
+	return mask;
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+                                         const float3 P,
+                                         const float3 dir,
+                                         const float3 idir,
+                                         const float t,
+                                         const int nodeAddr,
+                                         const uint visibility,
+                                         float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect(kg,
+		                                    P,
+		                                    dir,
+		                                    idir,
+		                                    t,
+		                                    nodeAddr,
+		                                    visibility,
+		                                    dist);
+	}
+	else {
+		return bvh_aligned_node_intersect(kg,
+		                                  P,
+		                                  idir,
+		                                  t,
+		                                  nodeAddr,
+		                                  visibility,
+		                                  dist);
+	}
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+                                                const float3 P,
+                                                const float3 dir,
+                                                const float3 idir,
+                                                const float t,
+                                                const float difl,
+                                                const float extmax,
+                                                const int nodeAddr,
+                                                const uint visibility,
+                                                float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect_robust(kg,
+		                                           P,
+		                                           dir,
+		                                           idir,
+		                                           t,
+		                                           difl,
+		                                           extmax,
+		                                           nodeAddr,
+		                                           visibility,
+		                                           dist);
+	}
+	else {
+		return bvh_aligned_node_intersect_robust(kg,
+		                                         P,
+		                                         idir,
+		                                         t,
+		                                         difl,
+		                                         extmax,
+		                                         nodeAddr,
+		                                         visibility,
+		                                         dist);
+	}
+}
+#else  /* !defined(__KERNEL_SSE2__) */
+
+int ccl_device_inline bvh_aligned_node_intersect(
+        KernelGlobals *kg,
+        const float3& P,
+        const float3& dir,
+        const ssef& tsplat,
+        const ssef Psplat[3],
+        const ssef idirsplat[3],
+        const shuffle_swap_t shufflexyz[3],
+        const int nodeAddr,
+        const uint visibility,
+        float dist[2])
+{
+	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+	/* fetch node data */
+	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+	/* intersect ray against child nodes */
+	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+	/* calculate { c0min, c1min, -c0max, -c1max} */
+	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+	const ssef tminmax = minmax ^ pn;
+	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+	dist[0] = tminmax[0];
+	dist[1] = tminmax[1];
+
+	int mask = movemask(lrhit);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+/* SSE2 variant: intersect a ray with the two child boxes of an axis-aligned
+ * BVH node and widen the entry/exit interval of every child whose flags have
+ * PATH_RAY_CURVE set.
+ *
+ * P, dir      - not used by this variant.
+ * tsplat, Psplat, idirsplat, shufflexyz
+ *             - precomputed per-ray SSE values supplied by the caller; they
+ *               are combined with the node bounds to produce the four lanes
+ *               { c0min, c1min, c0max, c1max } of tminmax.
+ * difl/extmax - relative and absolute widening of the interval; when difl is
+ *               zero no widening is applied at all.
+ * nodeAddr    - index of the node's first float4 in __bvh_nodes.
+ * visibility  - bits tested against each child's flags when
+ *               __VISIBILITY_FLAG__ is defined.
+ * dist        - dist[0] / dist[1] receive lanes 0 / 1 of tminmax.
+ *
+ * Returns a bit mask: bit 0 set when child 0 is hit (and visible), bit 1 set
+ * when child 1 is hit (and visible).
+ */
+int ccl_device_inline bvh_aligned_node_intersect_robust(
+        KernelGlobals *kg,
+        const float3& P,
+        const float3& dir,
+        const ssef& tsplat,
+        const ssef Psplat[3],
+        const ssef idirsplat[3],
+        const shuffle_swap_t shufflexyz[3],
+        const float difl,
+        const float extmax,
+        const int nodeAddr,
+        const uint visibility,
+        float dist[2])
+{
+	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+	/* fetch node data */
+	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+	/* intersect ray against child nodes */
+	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+	/* calculate { c0min, c1min, -c0max, -c1max} */
+	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+	/* Must not be const: the widening below writes to its lanes, and writing
+	 * to a const object is undefined behaviour (the comparison further down
+	 * could legally keep using the unwidened value).
+	 */
+	ssef tminmax = minmax ^ pn;
+
+	if(difl != 0.0f) {
+		float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+		/* Per-lane scalar access to the vector computed above. */
+		float4 *tminmaxview = (float4*)&tminmax;
+		float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
+		float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
+		float hdiff = 1.0f + difl;
+		float ldiff = 1.0f - difl;
+		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
+			c0min = max(ldiff * c0min, c0min - extmax);
+			c0max = min(hdiff * c0max, c0max + extmax);
+		}
+		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
+			c1min = max(ldiff * c1min, c1min - extmax);
+			c1max = min(hdiff * c1max, c1max + extmax);
+		}
+	}
+
+	/* Lanes 0/1 compare each child's min against its max (lanes 2/3). */
+	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+	dist[0] = tminmax[0];
+	dist[1] = tminmax[1];
+
+	int mask = movemask(lrhit);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	/* Only the two low lanes describe children. */
+	return mask & 3;
+#  endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+                                                   const float3 P,
+                                                   const float3 dir,
+                                                   const ssef& tnear,
+                                                   const ssef& tfar,
+                                                   const int nodeAddr,
+                                                   const uint visibility,
+                                                   float dist[2])
+{
+	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+	float3 aligned_dir0 = transform_direction(&space0, dir),
+	       aligned_dir1 = transform_direction(&space1, dir);;
+	float3 aligned_P0 = transform_point(&space0, P),
+	       aligned_P1 = transform_point(&space1, P);
+	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+	ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+	                    aligned_P1.x * nrdir1.x,
+	                    0.0f, 0.0f),
+	     tLowerY = ssef(aligned_P0.y * nrdir0.y,
+	                    aligned_P1.y * nrdir1.y,
+	                    0.0f,
+	                    0.0f),
+	     tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+	                    aligned_P1.z * nrdir1.z,
+	                    0.0f,
+	                    0.0f);
+
+	ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+	     tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+	     tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+	ssef tnear_x = min(tLowerX, tUpperX);
+	ssef tnear_y = min(tLowerY, tUpperY);
+	ssef tnear_z = min(tLowerZ, tUpperZ);
+	ssef tfar_x = max(tLowerX, tUpperX);
+	ssef tfar_y = max(tLowerY, tUpperY);
+	ssef tfar_z = max(tLowerZ, tUpperZ);
+
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	sseb vmask = tNear <= tFar;
+	dist[0] = tNear.f[0];
+	dist[1] = tNear.f[1];
+
+	int mask = (int)movemask(vmask);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+                                                          const float3 P,
+                                                          const float3 dir,
+                                                          const ssef& tnear,
+                                                          const ssef& tfar,
+                                                          const float difl,
+                                                          const float /*extmax*/,
+                                                          const int nodeAddr,
+                                                          const uint visibility,
+                                                          float dist[2])
+{
+	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+	float3 aligned_dir0 = transform_direction(&space0, dir),
+	       aligned_dir1 = transform_direction(&space1, dir);;
+	float3 aligned_P0 = transform_point(&space0, P),
+	       aligned_P1 = transform_point(&space1, P);
+	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+	ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+	                    aligned_P1.x * nrdir1.x,
+	                    0.0f, 0.0f),
+	     tLowerY = ssef(aligned_P0.y * nrdir0.y,
+	                    aligned_P1.y * nrdir1.y,
+	                    0.0f,
+	                    0.0f),
+	     tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+	                    aligned_P1.z * nrdir1.z,
+	                    0.0f,
+	                    0.0f);
+
+	ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+	     tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+	     tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+	ssef tnear_x = min(tLowerX, tUpperX);
+	ssef tnear_y = min(tLowerY, tUpperY);
+	ssef tnear_z = min(tLowerZ, tUpperZ);
+	ssef tfar_x = max(tLowerX, tUpperX);
+	ssef tfar_y = max(tLowerY, tUpperY);
+	ssef tfar_z = max(tLowerZ, tUpperZ);
+
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	sseb vmask;
+	if(difl != 0.0f) {
+		const float round_down = 1.0f - difl;
+		const float round_up = 1.0f + difl;
+		vmask = round_down*tNear <= round_up*tFar;
+	}
+	else {
+		vmask = tNear <= tFar;
+	}
+
+	dist[0] = tNear.f[0];
+	dist[1] = tNear.f[1];
+
+	int mask = (int)movemask(vmask);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+                                         const float3& P,
+                                         const float3& dir,
+                                         const ssef& tnear,
+                                         const ssef& tfar,
+                                         const ssef& tsplat,
+                                         const ssef Psplat[3],
+                                         const ssef idirsplat[3],
+                                         const shuffle_swap_t shufflexyz[3],
+                                         const int nodeAddr,
+                                         const uint visibility,
+                                         float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect(kg,
+		                                    P,
+		                                    dir,
+		                                    tnear,
+		                                    tfar,
+		                                    nodeAddr,
+		                                    visibility,
+		                                    dist);
+	}
+	else {
+		return bvh_aligned_node_intersect(kg,
+		                                  P,
+		                                  dir,
+		                                  tsplat,
+		                                  Psplat,
+		                                  idirsplat,
+		                                  shufflexyz,
+		                                  nodeAddr,
+		                                  visibility,
+		                                  dist);
+	}
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+                                                const float3& P,
+                                                const float3& dir,
+                                                const ssef& tnear,
+                                                const ssef& tfar,
+                                                const ssef& tsplat,
+                                                const ssef Psplat[3],
+                                                const ssef idirsplat[3],
+                                                const shuffle_swap_t shufflexyz[3],
+                                                const float difl,
+                                                const float extmax,
+                                                const int nodeAddr,
+                                                const uint visibility,
+                                                float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect_robust(kg,
+		                                           P,
+		                                           dir,
+		                                           tnear,
+		                                           tfar,
+		                                           difl,
+		                                           extmax,
+		                                           nodeAddr,
+		                                           visibility,
+		                                           dist);
+	}
+	else {
+		return bvh_aligned_node_intersect_robust(kg,
+		                                         P,
+		                                         dir,
+		                                         tsplat,
+		                                         Psplat,
+		                                         idirsplat,
+		                                         shufflexyz,
+		                                         difl,
+		                                         extmax,
+		                                         nodeAddr,
+		                                         visibility,
+		                                         dist);
+	}
+}
+#endif  /* !defined(__KERNEL_SSE2__) */
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@ -21,6 +21,12 @@
 #  include "geom_qbvh_shadow.h"
 #endif

+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT bvh_node_intersect
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
 /* This is a template BVH traversal function, where various features can be
 * enabled/disabled. This way we can compile optimized versions for each case
 * without new features slowing things down.
@ -41,7 +47,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	 * - likely and unlikely for if() statements
 	 * - test restrict attribute for pointers
 	 */
-	
+
 	/* traversal stack in CUDA thread-local memory */
 	int traversalStack[BVH_STACK_SIZE];
 	traversalStack[0] = ENTRYPOINT_SENTINEL;
@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-	
+
 	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
 	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect_t);
+#  endif
 	shuffle_swap_t shufflexyz[3];

 	Psplat[0] = ssef(P.x);
@ -94,86 +103,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 		do {
 			/* traverse internal nodes */
 			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				bool traverseChild0, traverseChild1;
-				int nodeAddrChild1;
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

 #if !defined(__KERNEL_SSE2__)
-				/* Intersect two child bounding boxes, non-SSE version */
-				float t = isect_t;
-
-				/* fetch node data */
-				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
-				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
-				/* intersect ray against child nodes */
-				float c0lox = (node0.x - P.x) * idir.x;
-				float c0hix = (node0.z - P.x) * idir.x;
-				float c0loy = (node1.x - P.y) * idir.y;
-				float c0hiy = (node1.z - P.y) * idir.y;
-				float c0loz = (node2.x - P.z) * idir.z;
-				float c0hiz = (node2.z - P.z) * idir.z;
-				float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-				float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-				float c1lox = (node0.y - P.x) * idir.x;
-				float c1hix = (node0.w - P.x) * idir.x;
-				float c1loy = (node1.y - P.y) * idir.y;
-				float c1hiy = (node1.w - P.y) * idir.y;
-				float c1loz = (node2.y - P.z) * idir.z;
-				float c1hiz = (node2.w - P.z) * idir.z;
-				float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-				float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-				/* decide which nodes to traverse next */
-#  ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
-				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
-#  else
-				traverseChild0 = (c0max >= c0min);
-				traverseChild1 = (c1max >= c1min);
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
 #  endif
-
+				                               idir,
+				                               isect_t,
+				                               nodeAddr,
+				                               PATH_RAY_SHADOW,
+				                               dist);
 #else // __KERNEL_SSE2__
-				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
-				/* fetch node data */
-				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-				const float4 cnodes = ((float4*)bvh_nodes)[3];
-
-				/* intersect ray against child nodes */
-				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-				/* calculate { c0min, c1min, -c0max, -c1max} */
-				const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-				const ssef tminmax = minmax ^ pn;
-				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-				/* decide which nodes to traverse next */
-#  ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
-				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
-#  else
-				traverseChild0 = (movemask(lrhit) & 1);
-				traverseChild1 = (movemask(lrhit) & 2);
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
 #  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               PATH_RAY_SHADOW,
+				                               dist);
 #endif // __KERNEL_SSE2__

-				nodeAddr = __float_as_int(cnodes.x);
-				nodeAddrChild1 = __float_as_int(cnodes.y);
+				nodeAddr = __float_as_int(cnodes.z);
+				nodeAddrChild1 = __float_as_int(cnodes.w);

-				if(traverseChild0 && traverseChild1) {
-					/* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
-					bool closestChild1 = (c1min < c0min);
-#else
-					bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);

 					if(closestChild1) {
 						int tmp = nodeAddr;
@ -186,12 +153,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
-					/* one child was intersected */
-					if(traverseChild1) {
+					/* One child was intersected. */
+					if(traverse_mask == 2) {
 						nodeAddr = nodeAddrChild1;
 					}
-					else if(!traverseChild0) {
-						/* neither child was intersected */
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
 						nodeAddr = traversalStack[stackPtr];
 						--stackPtr;
 					}
@ -238,7 +205,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
 									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
 								else
 									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
@ -317,6 +284,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					Psplat[2] = ssef(P.z);

 					tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+					tfar = ssef(isect_t);
+#    endif
 					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif

@ -369,6 +339,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 			Psplat[2] = ssef(P.z);

 			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect_t);
+#    endif
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif

@ -410,3 +383,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,

 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT  /* Reset so the next inclusion of this template header can redefine it. */
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@ -21,6 +21,12 @@
 #  include "geom_qbvh_subsurface.h"
 #endif

+#if BVH_FEATURE(BVH_HAIR)  /* Select the node intersector called by the traversal loop below. */
+#  define NODE_INTERSECT bvh_node_intersect  /* NOTE(review): presumably also covers unaligned nodes; confirm in geom_bvh_nodes.h. */
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect  /* Variants compiled without BVH_HAIR use the aligned-node intersector. */
+#endif
+
 /* This is a template BVH traversal function for subsurface scattering, where
 * various features can be enabled/disabled. This way we can compile optimized
 * versions for each case without new features slowing things down.
@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,

 	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
 	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect_t);
+#  endif
 	shuffle_swap_t shufflexyz[3];

 	Psplat[0] = ssef(P.x);
@ -100,79 +109,47 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,

 	/* traversal loop */
 	do {
-		do
-		{
+		do {
 			/* traverse internal nodes */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
-			{
-				bool traverseChild0, traverseChild1;
-				int nodeAddrChild1;
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

 #if !defined(__KERNEL_SSE2__)
-				/* Intersect two child bounding boxes, non-SSE version */
-				float t = isect_t;
-
-				/* fetch node data */
-				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
-				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
-				/* intersect ray against child nodes */
-				float c0lox = (node0.x - P.x) * idir.x;
-				float c0hix = (node0.z - P.x) * idir.x;
-				float c0loy = (node1.x - P.y) * idir.y;
-				float c0hiy = (node1.z - P.y) * idir.y;
-				float c0loz = (node2.x - P.z) * idir.z;
-				float c0hiz = (node2.z - P.z) * idir.z;
-				float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-				float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-				float c1lox = (node0.y - P.x) * idir.x;
-				float c1hix = (node0.w - P.x) * idir.x;
-				float c1loy = (node1.y - P.y) * idir.y;
-				float c1hiy = (node1.w - P.y) * idir.y;
-				float c1loz = (node2.y - P.z) * idir.z;
-				float c1hiz = (node2.w - P.z) * idir.z;
-				float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-				float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-				/* decide which nodes to traverse next */
-				traverseChild0 = (c0max >= c0min);
-				traverseChild1 = (c1max >= c1min);
-
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
+#  endif
+				                               idir,
+				                               isect_t,
+				                               nodeAddr,
+				                               PATH_RAY_ALL_VISIBILITY,
+				                               dist);
 #else // __KERNEL_SSE2__
-				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
-				/* fetch node data */
-				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-				const float4 cnodes = ((float4*)bvh_nodes)[3];
-
-				/* intersect ray against child nodes */
-				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-				/* calculate { c0min, c1min, -c0max, -c1max} */
-				const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-				const ssef tminmax = minmax ^ pn;
-				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-				/* decide which nodes to traverse next */
-				traverseChild0 = (movemask(lrhit) & 1);
-				traverseChild1 = (movemask(lrhit) & 2);
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
+#  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               PATH_RAY_ALL_VISIBILITY,
+				                               dist);
 #endif // __KERNEL_SSE2__

-				nodeAddr = __float_as_int(cnodes.x);
-				nodeAddrChild1 = __float_as_int(cnodes.y);
+				nodeAddr = __float_as_int(cnodes.z);
+				nodeAddrChild1 = __float_as_int(cnodes.w);

-				if(traverseChild0 && traverseChild1) {
-					/* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
-					bool closestChild1 = (c1min < c0min);
-#else
-					bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);

 					if(closestChild1) {
 						int tmp = nodeAddr;
@ -185,12 +162,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
-					/* one child was intersected */
-					if(traverseChild1) {
+					/* One child was intersected. */
+					if(traverse_mask == 2) {
 						nodeAddr = nodeAddrChild1;
 					}
-					else if(!traverseChild0) {
-						/* neither child was intersected */
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
 						nodeAddr = traversalStack[stackPtr];
 						--stackPtr;
 					}
@ -286,3 +263,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,

 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT  /* Reset so the next inclusion of this template header can redefine it. */
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@ -21,6 +21,14 @@
 #  include "geom_qbvh_traversal.h"
 #endif

+#if BVH_FEATURE(BVH_HAIR)  /* Select the node intersectors called by the traversal loop below. */
+#  define NODE_INTERSECT bvh_node_intersect  /* NOTE(review): presumably also covers unaligned nodes; confirm in geom_bvh_nodes.h. */
+#  define NODE_INTERSECT_ROBUST bvh_node_intersect_robust  /* Called instead of NODE_INTERSECT when difl != 0.0f (BVH_HAIR_MINIMUM_WIDTH builds). */
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect  /* Variants compiled without BVH_HAIR use the aligned-node intersector. */
+#  define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust  /* Aligned-node counterpart taking the extra difl/extmax arguments. */
+#endif
+
 /* This is a template BVH traversal function, where various features can be
 * enabled/disabled. This way we can compile optimized versions for each case
 * without new features slowing things down.
@ -49,7 +57,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	 * - likely and unlikely for if() statements
 	 * - test restrict attribute for pointers
 	 */
-	
+
 	/* traversal stack in CUDA thread-local memory */
 	int traversalStack[BVH_STACK_SIZE];
 	traversalStack[0] = ENTRYPOINT_SENTINEL;
@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-	
+
 	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
 	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect->t);
+#  endif
 	shuffle_swap_t shufflexyz[3];

 	Psplat[0] = ssef(P.x);
@ -101,121 +112,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 		do {
 			/* traverse internal nodes */
 			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				bool traverseChild0, traverseChild1;
-				int nodeAddrChild1;
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

 #if !defined(__KERNEL_SSE2__)
-				/* Intersect two child bounding boxes, non-SSE version */
-				float t = isect->t;
-
-				/* fetch node data */
-				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
-				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
-				/* intersect ray against child nodes */
-				float c0lox = (node0.x - P.x) * idir.x;
-				float c0hix = (node0.z - P.x) * idir.x;
-				float c0loy = (node1.x - P.y) * idir.y;
-				float c0hiy = (node1.z - P.y) * idir.y;
-				float c0loz = (node2.x - P.z) * idir.z;
-				float c0hiz = (node2.z - P.z) * idir.z;
-				float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-				float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-				float c1lox = (node0.y - P.x) * idir.x;
-				float c1hix = (node0.w - P.x) * idir.x;
-				float c1loy = (node1.y - P.y) * idir.y;
-				float c1hiy = (node1.w - P.y) * idir.y;
-				float c1loz = (node2.y - P.z) * idir.z;
-				float c1hiz = (node2.w - P.z) * idir.z;
-				float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-				float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
 #  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
-					float hdiff = 1.0f + difl;
-					float ldiff = 1.0f - difl;
-					if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-						c0min = max(ldiff * c0min, c0min - extmax);
-						c0max = min(hdiff * c0max, c0max + extmax);
-					}
-					if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-						c1min = max(ldiff * c1min, c1min - extmax);
-						c1max = min(hdiff * c1max, c1max + extmax);
-					}
+					traverse_mask = NODE_INTERSECT_ROBUST(kg,
+					                                      P,
+#    if BVH_FEATURE(BVH_HAIR)
+					                                      dir,
+#    endif
+					                                      idir,
+					                                      isect->t,
+					                                      difl,
+					                                      extmax,
+					                                      nodeAddr,
+					                                      visibility,
+					                                      dist);
 				}
+				else
 #  endif
-
-				/* decide which nodes to traverse next */
-#  ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#  else
-				traverseChild0 = (c0max >= c0min);
-				traverseChild1 = (c1max >= c1min);
-#  endif
-
+				{
+					traverse_mask = NODE_INTERSECT(kg,
+					                               P,
+#    if BVH_FEATURE(BVH_HAIR)
+					                               dir,
+#    endif
+					                               idir,
+					                               isect->t,
+					                               nodeAddr,
+					                               visibility,
+					                               dist);
+				}
 #else // __KERNEL_SSE2__
-				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
-				/* fetch node data */
-				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-				const float4 cnodes = ((float4*)bvh_nodes)[3];
-
-				/* intersect ray against child nodes */
-				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-				/* calculate { c0min, c1min, -c0max, -c1max} */
-				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-				const ssef tminmax = minmax ^ pn;
-
 #  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
-					float4 *tminmaxview = (float4*)&tminmax;
-					float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
-					float &c0max = tminmaxview->z, &c1max = tminmaxview->w;
-
-					float hdiff = 1.0f + difl;
-					float ldiff = 1.0f - difl;
-					if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-						c0min = max(ldiff * c0min, c0min - extmax);
-						c0max = min(hdiff * c0max, c0max + extmax);
-					}
-					if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-						c1min = max(ldiff * c1min, c1min - extmax);
-						c1max = min(hdiff * c1max, c1max + extmax);
-					}
+					traverse_mask = NODE_INTERSECT_ROBUST(kg,
+					                                      P,
+					                                      dir,
+#    if BVH_FEATURE(BVH_HAIR)
+					                                      tnear,
+					                                      tfar,
+#    endif
+					                                      tsplat,
+					                                      Psplat,
+					                                      idirsplat,
+					                                      shufflexyz,
+					                                      difl,
+					                                      extmax,
+					                                      nodeAddr,
+					                                      visibility,
+					                                      dist);
 				}
+				else
 #  endif
-
-				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-				/* decide which nodes to traverse next */
-#  ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
-#  else
-				traverseChild0 = (movemask(lrhit) & 1);
-				traverseChild1 = (movemask(lrhit) & 2);
-#  endif
+				{
+					traverse_mask = NODE_INTERSECT(kg,
+					                               P,
+					                               dir,
+#    if BVH_FEATURE(BVH_HAIR)
+					                               tnear,
+					                               tfar,
+#    endif
+					                               tsplat,
+					                               Psplat,
+					                               idirsplat,
+					                               shufflexyz,
+					                               nodeAddr,
+					                               visibility,
+					                               dist);
+				}
 #endif // __KERNEL_SSE2__

-				nodeAddr = __float_as_int(cnodes.x);
-				nodeAddrChild1 = __float_as_int(cnodes.y);
+				nodeAddr = __float_as_int(cnodes.z);
+				nodeAddrChild1 = __float_as_int(cnodes.w);

-				if(traverseChild0 && traverseChild1) {
-					/* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
-					bool closestChild1 = (c1min < c0min);
-#else
-					bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);

 					if(closestChild1) {
 						int tmp = nodeAddr;
@ -228,12 +204,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
-					/* one child was intersected */
-					if(traverseChild1) {
+					/* One child was intersected. */
+					if(traverse_mask == 2) {
 						nodeAddr = nodeAddrChild1;
 					}
-					else if(!traverseChild0) {
-						/* neither child was intersected */
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
 						nodeAddr = traversalStack[stackPtr];
 						--stackPtr;
 					}
@ -268,6 +244,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+									tfar = ssef(isect->t);
+#  endif
 #else
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
 										return true;
@ -287,6 +266,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+									tfar = ssef(isect->t);
+#    endif
 #  else
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
 										return true;
@ -313,6 +295,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+									tfar = ssef(isect->t);
+#    endif
 #  else
 									if(visibility == PATH_RAY_SHADOW_OPAQUE)
 										return true;
@ -342,6 +327,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					Psplat[2] = ssef(P.z);

 					tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+					tfar = ssef(isect->t);
+#    endif

 					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif
@ -376,6 +364,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 			Psplat[2] = ssef(P.z);

 			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect->t);
+#    endif

 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif
@ -433,3 +424,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,

 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT  /* Reset so the next inclusion of this template header can redefine it. */
+#undef NODE_INTERSECT_ROBUST  /* Likewise for the robust (difl/extmax) variant. */
--- a/intern/cycles/kernel/geom/geom_bvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@ -18,7 +18,13 @@
 */

 #ifdef __QBVH__
-#include "geom_qbvh_volume.h"
+#  include "geom_qbvh_volume.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)  /* Select the node intersector called by the traversal loop below. */
+#  define NODE_INTERSECT bvh_node_intersect  /* NOTE(review): presumably also covers unaligned nodes; confirm in geom_bvh_nodes.h. */
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect  /* Variants compiled without BVH_HAIR use the aligned-node intersector. */
 #endif

 /* This is a template BVH traversal function for volumes, where
@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-	
+
 	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
 	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect->t);
+#  endif
 	shuffle_swap_t shufflexyz[3];

 	Psplat[0] = ssef(P.x);
@ -91,75 +100,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 		do {
 			/* traverse internal nodes */
 			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				bool traverseChild0, traverseChild1;
-				int nodeAddrChild1;
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

 #if !defined(__KERNEL_SSE2__)
-				/* Intersect two child bounding boxes, non-SSE version */
-				float t = isect->t;
-
-				/* fetch node data */
-				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
-				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
-				/* intersect ray against child nodes */
-				float c0lox = (node0.x - P.x) * idir.x;
-				float c0hix = (node0.z - P.x) * idir.x;
-				float c0loy = (node1.x - P.y) * idir.y;
-				float c0hiy = (node1.z - P.y) * idir.y;
-				float c0loz = (node2.x - P.z) * idir.z;
-				float c0hiz = (node2.z - P.z) * idir.z;
-				float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-				float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-				float c1lox = (node0.y - P.x) * idir.x;
-				float c1hix = (node0.w - P.x) * idir.x;
-				float c1loy = (node1.y - P.y) * idir.y;
-				float c1hiy = (node1.w - P.y) * idir.y;
-				float c1loz = (node2.y - P.z) * idir.z;
-				float c1hiz = (node2.w - P.z) * idir.z;
-				float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-				float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-				/* decide which nodes to traverse next */
-				traverseChild0 = (c0max >= c0min);
-				traverseChild1 = (c1max >= c1min);
-
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
+#  endif
+				                               idir,
+				                               isect->t,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
 #else // __KERNEL_SSE2__
-				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
-				/* fetch node data */
-				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-				const float4 cnodes = ((float4*)bvh_nodes)[3];
-
-				/* intersect ray against child nodes */
-				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-				/* calculate { c0min, c1min, -c0max, -c1max} */
-				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-				const ssef tminmax = minmax ^ pn;
-
-				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-				/* decide which nodes to traverse next */
-				traverseChild0 = (movemask(lrhit) & 1);
-				traverseChild1 = (movemask(lrhit) & 2);
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
+#  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
 #endif // __KERNEL_SSE2__

-				nodeAddr = __float_as_int(cnodes.x);
-				nodeAddrChild1 = __float_as_int(cnodes.y);
+				nodeAddr = __float_as_int(cnodes.z);
+				nodeAddrChild1 = __float_as_int(cnodes.w);

-				if(traverseChild0 && traverseChild1) {
-					/* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
-					bool closestChild1 = (c1min < c0min);
-#else
-					bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);

 					if(closestChild1) {
 						int tmp = nodeAddr;
@ -172,12 +150,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
-					/* one child was intersected */
-					if(traverseChild1) {
+					/* One child was intersected. */
+					if(traverse_mask == 2) {
 						nodeAddr = nodeAddrChild1;
 					}
-					else if(!traverseChild0) {
-						/* neither child was intersected */
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
 						nodeAddr = traversalStack[stackPtr];
 						--stackPtr;
 					}
@ -258,6 +236,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						Psplat[2] = ssef(P.z);

 						tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+						tfar = ssef(isect->t);
+#    endif

 						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif
@ -298,6 +279,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 			Psplat[2] = ssef(P.z);

 			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect->t);
+#    endif

 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif
@ -337,3 +321,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,

 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT  /* Reset so the next inclusion of this template header can redefine it. */
--- a/intern/cycles/kernel/geom/geom_bvh_volume_all.h
+++ b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
@ -18,7 +18,13 @@
 */

 #ifdef __QBVH__
-#include "geom_qbvh_volume_all.h"
+#  include "geom_qbvh_volume_all.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)  /* Select the node intersector called by the traversal loop below. */
+#  define NODE_INTERSECT bvh_node_intersect  /* NOTE(review): presumably also covers unaligned nodes; confirm in geom_bvh_nodes.h. */
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect  /* Variants compiled without BVH_HAIR use the aligned-node intersector. */
 #endif

 /* This is a template BVH traversal function for volumes, where
@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-	
+
 	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
 	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect_t);
+#  endif
 	shuffle_swap_t shufflexyz[3];

 	Psplat[0] = ssef(P.x);
@ -95,75 +104,44 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 		do {
 			/* traverse internal nodes */
 			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				bool traverseChild0, traverseChild1;
-				int nodeAddrChild1;
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

 #if !defined(__KERNEL_SSE2__)
-				/* Intersect two child bounding boxes, non-SSE version */
-				float t = isect_array->t;
-
-				/* fetch node data */
-				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
-				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
-				/* intersect ray against child nodes */
-				float c0lox = (node0.x - P.x) * idir.x;
-				float c0hix = (node0.z - P.x) * idir.x;
-				float c0loy = (node1.x - P.y) * idir.y;
-				float c0hiy = (node1.z - P.y) * idir.y;
-				float c0loz = (node2.x - P.z) * idir.z;
-				float c0hiz = (node2.z - P.z) * idir.z;
-				float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-				float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-				float c1lox = (node0.y - P.x) * idir.x;
-				float c1hix = (node0.w - P.x) * idir.x;
-				float c1loy = (node1.y - P.y) * idir.y;
-				float c1hiy = (node1.w - P.y) * idir.y;
-				float c1loz = (node2.y - P.z) * idir.z;
-				float c1hiz = (node2.w - P.z) * idir.z;
-				float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-				float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-				/* decide which nodes to traverse next */
-				traverseChild0 = (c0max >= c0min);
-				traverseChild1 = (c1max >= c1min);
-
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
+#  endif
+				                               idir,
+				                               isect_t,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
 #else // __KERNEL_SSE2__
-				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
-				/* fetch node data */
-				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-				const float4 cnodes = ((float4*)bvh_nodes)[3];
-
-				/* intersect ray against child nodes */
-				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-				/* calculate { c0min, c1min, -c0max, -c1max} */
-				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-				const ssef tminmax = minmax ^ pn;
-
-				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-				/* decide which nodes to traverse next */
-				traverseChild0 = (movemask(lrhit) & 1);
-				traverseChild1 = (movemask(lrhit) & 2);
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
+#  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
 #endif // __KERNEL_SSE2__

-				nodeAddr = __float_as_int(cnodes.x);
-				nodeAddrChild1 = __float_as_int(cnodes.y);
+				nodeAddr = __float_as_int(cnodes.z);
+				nodeAddrChild1 = __float_as_int(cnodes.w);

-				if(traverseChild0 && traverseChild1) {
-					/* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
-					bool closestChild1 = (c1min < c0min);
-#else
-					bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);

 					if(closestChild1) {
 						int tmp = nodeAddr;
@ -176,12 +154,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
-					/* one child was intersected */
-					if(traverseChild1) {
+					/* One child was intersected. */
+					if(traverse_mask == 2) {
 						nodeAddr = nodeAddrChild1;
 					}
-					else if(!traverseChild0) {
-						/* neither child was intersected */
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
 						nodeAddr = traversalStack[stackPtr];
 						--stackPtr;
 					}
@ -311,6 +289,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						Psplat[2] = ssef(P.z);

 						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+						tfar = ssef(isect_t);
+#    endif

 						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif
@ -368,6 +349,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 			Psplat[2] = ssef(P.z);

 			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect_t);
+#    endif

 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #  endif
@ -410,3 +394,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,

 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
--- a/intern/cycles/kernel/geom/geom_qbvh.h
+++ b/intern/cycles/kernel/geom/geom_qbvh.h
@ -51,23 +51,25 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
 	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
 }

-ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
-                                          const ssef& tnear,
-                                          const ssef& tfar,
+/* Axis-aligned nodes intersection */
+
+ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
+                                                  const ssef& tnear,
+                                                  const ssef& tfar,
 #ifdef __KERNEL_AVX2__
-                                          const sse3f& org_idir,
+                                                  const sse3f& org_idir,
 #else
-                                          const sse3f& org,
+                                                  const sse3f& org,
 #endif
-                                          const sse3f& idir,
-                                          const int near_x,
-                                          const int near_y,
-                                          const int near_z,
-                                          const int far_x,
-                                          const int far_y,
-                                          const int far_z,
-                                          const int nodeAddr,
-                                          ssef *__restrict dist)
+                                                  const sse3f& idir,
+                                                  const int near_x,
+                                                  const int near_y,
+                                                  const int near_z,
+                                                  const int far_x,
+                                                  const int far_y,
+                                                  const int far_z,
+                                                  const int nodeAddr,
+                                                  ssef *__restrict dist)
 {
 	const int offset = nodeAddr + 1;
 #ifdef __KERNEL_AVX2__
@ -101,24 +103,25 @@ ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
 	return mask;
 }

-ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
-                                                 const ssef& tnear,
-                                                 const ssef& tfar,
+ccl_device_inline int qbvh_aligned_node_intersect_robust(
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
 #ifdef __KERNEL_AVX2__
-                                                 const sse3f& P_idir,
+        const sse3f& P_idir,
 #else
-                                                 const sse3f& P,
+        const sse3f& P,
 #endif
-                                                 const sse3f& idir,
-                                                 const int near_x,
-                                                 const int near_y,
-                                                 const int near_z,
-                                                 const int far_x,
-                                                 const int far_y,
-                                                 const int far_z,
-                                                 const int nodeAddr,
-                                                 const float difl,
-                                                 ssef *__restrict dist)
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,
+        const float difl,
+        ssef *__restrict dist)
 {
 	const int offset = nodeAddr + 1;
 #ifdef __KERNEL_AVX2__
@ -145,3 +148,286 @@ ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
 	*dist = tNear;
 	return (int)movemask(vmask);
 }
+
+/* Unaligned nodes intersection */
+
+ccl_device_inline int qbvh_unaligned_node_intersect(  /* Tests the ray against the unaligned child boxes stored in the QBVH node at nodeAddr; returns the movemask of the per-lane hit test and writes the per-lane entry distance to *dist. */
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,  /* Lower bound folded into the entry distance (max4 below). */
+        const ssef& tfar,  /* Upper bound folded into the exit distance (min4 below). */
+#ifdef __KERNEL_AVX2__
+        const sse3f& org_idir,  /* Not read in this body. */
+#endif
+        const sse3f& org,  /* Ray origin; transformed into each child's space below. */
+        const sse3f& dir,  /* Ray direction; transformed into each child's space below. */
+        const sse3f& idir,  /* Not read in this body. */
+        const int near_x,  /* near_* and far_* are not read in this body; presumably kept so the signature parallels the aligned variant -- TODO confirm. */
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,  /* Index of the node's first entry in __bvh_nodes. */
+        ssef *__restrict dist)  /* Out: receives tNear for every lane, whether or not that lane hit. */
+{
+	const int offset = nodeAddr;  /* Transform data is read from entries offset+1 .. offset+12. */
+	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);  /* offset+1..+3: coefficients that produce the transformed x coordinate. */
+	const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
+	const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
+
+	const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);  /* offset+4..+6: coefficients that produce the transformed y coordinate. */
+	const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
+	const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
+
+	const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);  /* offset+7..+9: coefficients that produce the transformed z coordinate. */
+	const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
+	const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
+
+	const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);  /* offset+10..+12: translation, added to the transformed origin only. */
+	const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
+	const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
+
+	const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,  /* Direction in node space: linear part only, no translation. */
+	           aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+	           aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+	const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,  /* Origin in node space: linear part plus translation. */
+	           aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
+	           aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
+
+	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
+	const ssef nrdir_x = neg_one / aligned_dir_x,  /* Negated reciprocal direction, -1/d; no guard here against a zero direction component. */
+	           nrdir_y = neg_one / aligned_dir_y,
+	           nrdir_z = neg_one / aligned_dir_z;
+
+	const ssef tlower_x = aligned_P_x * nrdir_x,  /* -P/d: ray parameter at which the transformed coordinate equals 0. */
+	           tlower_y = aligned_P_y * nrdir_y,
+	           tlower_z = aligned_P_z * nrdir_z;
+
+	const ssef tupper_x = tlower_x - nrdir_x,  /* (1-P)/d: ray parameter at which the transformed coordinate equals 1, so the box is the unit cube in node space. */
+	           tupper_y = tlower_y - nrdir_y,
+	           tupper_z = tlower_z - nrdir_z;
+
+#ifdef __KERNEL_SSE41__
+	const ssef tnear_x = mini(tlower_x, tupper_x);  /* NOTE(review): mini/maxi are defined elsewhere; presumably the SSE4.1 min/max helpers -- confirm. */
+	const ssef tnear_y = mini(tlower_y, tupper_y);
+	const ssef tnear_z = mini(tlower_z, tupper_z);
+	const ssef tfar_x = maxi(tlower_x, tupper_x);
+	const ssef tfar_y = maxi(tlower_y, tupper_y);
+	const ssef tfar_z = maxi(tlower_z, tupper_z);
+	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);  /* Latest slab entry, clamped from below by tnear. */
+	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);  /* Earliest slab exit, clamped from above by tfar. */
+	const sseb vmask = tNear <= tFar;  /* A lane hits when its entry does not come after its exit. */
+	*dist = tNear;
+	return movemask(vmask);
+#else
+	const ssef tnear_x = min(tlower_x, tupper_x);  /* Same computation as the branch above, using min/max instead of mini/maxi. */
+	const ssef tnear_y = min(tlower_y, tupper_y);
+	const ssef tnear_z = min(tlower_z, tupper_z);
+	const ssef tfar_x = max(tlower_x, tupper_x);
+	const ssef tfar_y = max(tlower_y, tupper_y);
+	const ssef tfar_z = max(tlower_z, tupper_z);
+	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);
+	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);
+	const sseb vmask = tNear <= tFar;
+	*dist = tNear;
+	return movemask(vmask);
+#endif
+}
+
+ccl_device_inline int qbvh_unaligned_node_intersect_robust(  /* Same unaligned-box test as qbvh_unaligned_node_intersect, but the hit comparison is relaxed by the factor difl; returns the movemask of the per-lane test and writes the entry distance to *dist. */
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,  /* Lower bound folded into the entry distance (max4 below). */
+        const ssef& tfar,  /* Upper bound folded into the exit distance (min4 below). */
+#ifdef __KERNEL_AVX2__
+        const sse3f& P_idir,  /* Not read in this body. */
+#endif
+        const sse3f& P,  /* Ray origin; transformed into each child's space below. */
+        const sse3f& dir,  /* Ray direction; transformed into each child's space below. */
+        const sse3f& idir,  /* Not read in this body. */
+        const int near_x,  /* near_* and far_* are not read in this body; presumably kept so the signature parallels the aligned variant -- TODO confirm. */
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,  /* Index of the node's first entry in __bvh_nodes. */
+        const float difl,  /* Relative tolerance: entry is scaled by (1 - difl), exit by (1 + difl) in the hit comparison. */
+        ssef *__restrict dist)  /* Out: receives the unscaled tNear for every lane. */
+{
+	const int offset = nodeAddr;  /* Transform data is read from entries offset+1 .. offset+12. */
+	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);  /* offset+1..+3: coefficients that produce the transformed x coordinate. */
+	const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
+	const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
+
+	const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);  /* offset+4..+6: coefficients that produce the transformed y coordinate. */
+	const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
+	const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
+
+	const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);  /* offset+7..+9: coefficients that produce the transformed z coordinate. */
+	const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
+	const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
+
+	const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);  /* offset+10..+12: translation, added to the transformed origin only. */
+	const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
+	const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
+
+	const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,  /* Direction in node space: linear part only, no translation. */
+	           aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+	           aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+	const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,  /* Origin in node space: linear part plus translation. */
+	           aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
+	           aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
+
+	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
+	const ssef nrdir_x = neg_one / aligned_dir_x,  /* Negated reciprocal direction, -1/d; no guard here against a zero direction component. */
+	           nrdir_y = neg_one / aligned_dir_y,
+	           nrdir_z = neg_one / aligned_dir_z;
+
+	const ssef tlower_x = aligned_P_x * nrdir_x,  /* -P/d: ray parameter at which the transformed coordinate equals 0. */
+	           tlower_y = aligned_P_y * nrdir_y,
+	           tlower_z = aligned_P_z * nrdir_z;
+
+	const ssef tupper_x = tlower_x - nrdir_x,  /* (1-P)/d: ray parameter at which the transformed coordinate equals 1. */
+	           tupper_y = tlower_y - nrdir_y,
+	           tupper_z = tlower_z - nrdir_z;
+
+	const float round_down = 1.0f - difl;  /* Scale applied to the entry distance in the comparison below. */
+	const float round_up = 1.0f + difl;  /* Scale applied to the exit distance in the comparison below. */
+
+#ifdef __KERNEL_SSE41__
+	const ssef tnear_x = mini(tlower_x, tupper_x);  /* NOTE(review): mini/maxi are defined elsewhere; presumably the SSE4.1 min/max helpers -- confirm. */
+	const ssef tnear_y = mini(tlower_y, tupper_y);
+	const ssef tnear_z = mini(tlower_z, tupper_z);
+	const ssef tfar_x = maxi(tlower_x, tupper_x);
+	const ssef tfar_y = maxi(tlower_y, tupper_y);
+	const ssef tfar_z = maxi(tlower_z, tupper_z);
+#else
+	const ssef tnear_x = min(tlower_x, tupper_x);  /* Same per-axis entry/exit selection using min/max. */
+	const ssef tnear_y = min(tlower_y, tupper_y);
+	const ssef tnear_z = min(tlower_z, tupper_z);
+	const ssef tfar_x = max(tlower_x, tupper_x);
+	const ssef tfar_y = max(tlower_y, tupper_y);
+	const ssef tfar_z = max(tlower_z, tupper_z);
+#endif
+	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);  /* Latest slab entry, clamped from below by tnear. */
+	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);  /* Earliest slab exit, clamped from above by tfar. */
+	const sseb vmask = round_down*tNear <= round_up*tFar;  /* Relaxed hit test; only the comparison is scaled, not the stored distance. */
+	*dist = tNear;
+	return movemask(vmask);
+}
+
+/* Intersector wrappers.
+ *
+ * They check the node type and call the appropriate intersection code.
+ */
+
+ccl_device_inline int qbvh_node_intersect(  /* Reads the node's first entry and forwards to the unaligned or the aligned intersector depending on the PATH_RAY_NODE_UNALIGNED bit; returns whatever the chosen intersector returns. */
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+        const sse3f& org_idir,  /* AVX2 builds only; the aligned intersector takes this in place of org. */
+#endif
+        const sse3f& org,  /* Always forwarded to the unaligned path; forwarded to the aligned path only in non-AVX2 builds. */
+        const sse3f& dir,  /* Forwarded to the unaligned path only. */
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,  /* Index of the node's first entry in __bvh_nodes. */
+        ssef *__restrict dist)  /* Out: filled in by the chosen intersector. */
+{
+	const int offset = nodeAddr;
+	const float4 node = kernel_tex_fetch(__bvh_nodes, offset);  /* First entry of the node; its x component is reinterpreted as flag bits below. */
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {  /* Flag set: node stores unaligned boxes. */
+		return qbvh_unaligned_node_intersect(kg,
+		                                     tnear,
+		                                     tfar,
+#ifdef __KERNEL_AVX2__
+		                                     org_idir,
+#endif
+		                                     org,
+		                                     dir,
+		                                     idir,
+		                                     near_x, near_y, near_z,
+		                                     far_x, far_y, far_z,
+		                                     nodeAddr,
+		                                     dist);
+	}
+	else {
+		return qbvh_aligned_node_intersect(kg,  /* Aligned signature takes either org_idir (AVX2) or org, never both, hence the #ifdef/#else below. */
+		                                   tnear,
+		                                   tfar,
+#ifdef __KERNEL_AVX2__
+		                                   org_idir,
+#else
+		                                   org,
+#endif
+		                                   idir,
+		                                   near_x, near_y, near_z,
+		                                   far_x, far_y, far_z,
+		                                   nodeAddr,
+		                                   dist);
+	}
+}
+
+ccl_device_inline int qbvh_node_intersect_robust(  /* Reads the node's first entry and forwards to the unaligned or the aligned robust intersector depending on the PATH_RAY_NODE_UNALIGNED bit; returns whatever the chosen intersector returns. */
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+        const sse3f& P_idir,  /* AVX2 builds only; the aligned intersector takes this in place of P. */
+#endif
+        const sse3f& P,  /* Always forwarded to the unaligned path; forwarded to the aligned path only in non-AVX2 builds. */
+        const sse3f& dir,  /* Forwarded to the unaligned path only. */
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,  /* Index of the node's first entry in __bvh_nodes. */
+        const float difl,  /* Tolerance passed through unchanged to the chosen robust intersector. */
+        ssef *__restrict dist)  /* Out: filled in by the chosen intersector. */
+{
+	const int offset = nodeAddr;
+	const float4 node = kernel_tex_fetch(__bvh_nodes, offset);  /* First entry of the node; its x component is reinterpreted as flag bits below. */
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {  /* Flag set: node stores unaligned boxes. */
+		return qbvh_unaligned_node_intersect_robust(kg,
+		                                            tnear,
+		                                            tfar,
+#ifdef __KERNEL_AVX2__
+		                                            P_idir,
+#endif
+		                                            P,
+		                                            dir,
+		                                            idir,
+		                                            near_x, near_y, near_z,
+		                                            far_x, far_y, far_z,
+		                                            nodeAddr,
+		                                            difl,
+		                                            dist);
+	}
+	else {
+		return qbvh_aligned_node_intersect_robust(kg,  /* Aligned signature takes either P_idir (AVX2) or P, never both, hence the #ifdef/#else below. */
+		                                          tnear,
+		                                          tfar,
+#ifdef __KERNEL_AVX2__
+		                                          P_idir,
+#else
+		                                          P,
+#endif
+		                                          idir,
+		                                          near_x, near_y, near_z,
+		                                          far_x, far_y, far_z,
+		                                          nodeAddr,
+		                                          difl,
+		                                          dist);
+	}
+}
--- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
@ -27,6 +27,12 @@
 *
 */

+#if BVH_FEATURE(BVH_HAIR)  /* Pick the node intersector this traversal template calls through NODE_INTERSECT. */
+#  define NODE_INTERSECT qbvh_node_intersect  /* Wrapper that checks PATH_RAY_NODE_UNALIGNED per node and dispatches to the aligned or unaligned test. */
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect  /* Without BVH_HAIR the aligned-only intersector is called directly, skipping the per-node flag check. */
+#endif
+
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                             const Ray *ray,
                                             Intersection *isect_array,
@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif

 	ssef tnear(0.0f), tfar(tmax);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
 	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));

 #ifdef __KERNEL_AVX2__
 	float3 P_idir = P*idir;
-	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
-	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
 #endif

 	/* Offsets to select the side that becomes the lower or upper bound. */
@ -109,22 +119,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif

 				ssef dist;
-				int traverseChild = qbvh_node_intersect(kg,
-				                                        tnear,
-				                                        tfar,
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
 #ifdef __KERNEL_AVX2__
-				                                        P_idir4,
-#else
-				                                        org,
+				                                   P_idir4,
 #endif
-				                                        idir4,
-				                                        near_x, near_y, near_z,
-				                                        far_x, far_y, far_z,
-				                                        nodeAddr,
-				                                        &dist);
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#  endif
+#  if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#  endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);

 				if(traverseChild != 0) {
-					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}

 					/* One child is hit, continue with that child. */
 					int r = __bscf(traverseChild);
@ -340,13 +363,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 					tfar = ssef(isect_t);
+#  if BVH_FEATURE(BVH_HAIR)
+					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  ifdef __KERNEL_AVX2__
 					P_idir = P*idir;
 					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 					triangle_intersect_precalc(dir, &isect_precalc);

 					++stackPtr;
@ -394,13 +422,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 			tfar = ssef(tmax);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  ifdef __KERNEL_AVX2__
 			P_idir = P*idir;
 			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 			triangle_intersect_precalc(dir, &isect_precalc);

 			object = OBJECT_NONE;
@ -412,3 +445,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 	return false;
 }
+
+#undef NODE_INTERSECT
--- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
@ -25,6 +25,12 @@
 *
 */

+#if BVH_FEATURE(BVH_HAIR)  /* Pick the node intersector this traversal template calls through NODE_INTERSECT. */
+#  define NODE_INTERSECT qbvh_node_intersect  /* Wrapper that checks PATH_RAY_NODE_UNALIGNED per node and dispatches to the aligned or unaligned test. */
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect  /* Without BVH_HAIR the aligned-only intersector is called directly, skipping the per-node flag check. */
+#endif
+
 ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                             const Ray *ray,
                                             SubsurfaceIntersection *ss_isect,
@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif

 	ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
 	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));

 #ifdef __KERNEL_AVX2__
 	float3 P_idir = P*idir;
-	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
-	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
 #endif

 	/* Offsets to select the side that becomes the lower or upper bound. */
@ -108,22 +118,37 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
 				ssef dist;
-				int traverseChild = qbvh_node_intersect(kg,
-				                                        tnear,
-				                                        tfar,
+
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
 #ifdef __KERNEL_AVX2__
-				                                        P_idir4,
-#else
-				                                        org,
+				                                   P_idir4,
 #endif
-				                                        idir4,
-				                                        near_x, near_y, near_z,
-				                                        far_x, far_y, far_z,
-				                                        nodeAddr,
-				                                        &dist);
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);

 				if(traverseChild != 0) {
-					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+					float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}

 					/* One child is hit, continue with that child. */
 					int r = __bscf(traverseChild);
@ -270,3 +295,5 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 		} while(nodeAddr != ENTRYPOINT_SENTINEL);
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 }
+
+#undef NODE_INTERSECT
--- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
@ -28,6 +28,14 @@
 *
 */

+#if BVH_FEATURE(BVH_HAIR)  /* Pick the node intersectors this traversal template calls through NODE_INTERSECT and NODE_INTERSECT_ROBUST. */
+#  define NODE_INTERSECT qbvh_node_intersect  /* Wrapper that checks PATH_RAY_NODE_UNALIGNED per node and dispatches to the aligned or unaligned test. */
+#  define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust  /* Same dispatch for the variant that takes the extra difl tolerance. */
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect  /* Without BVH_HAIR the aligned-only intersectors are called directly, skipping the per-node flag check. */
+#  define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
+#endif
+
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                             const Ray *ray,
                                             Intersection *isect,
@ -81,13 +89,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	BVH_DEBUG_INIT();

 	ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
 	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));

 #ifdef __KERNEL_AVX2__
 	float3 P_idir = P*idir;
 	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
-	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #endif

 	/* Offsets to select the side that becomes the lower or upper bound. */
@ -132,41 +144,62 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					 *
 					 * Need to test if doing opposite would be any faster.
 					 */
-					traverseChild = qbvh_node_intersect_robust(kg,
-					                                           tnear,
-					                                           tfar,
+					traverseChild = NODE_INTERSECT_ROBUST(kg,
+					                                      tnear,
+					                                      tfar,
 #  ifdef __KERNEL_AVX2__
-					                                           P_idir4,
-#  else
-					                                           org,
+					                                      P_idir4,
 #  endif
-					                                           idir4,
-					                                           near_x, near_y, near_z,
-					                                           far_x, far_y, far_z,
-					                                           nodeAddr,
-					                                           difl,
-					                                           &dist);
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					                                      org4,
+#  endif
+#  if BVH_FEATURE(BVH_HAIR)
+					                                      dir4,
+#  endif
+					                                      idir4,
+					                                      near_x, near_y, near_z,
+					                                      far_x, far_y, far_z,
+					                                      nodeAddr,
+					                                      difl,
+					                                      &dist);
 				}
 				else
 #endif  /* BVH_HAIR_MINIMUM_WIDTH */
 				{
-					traverseChild = qbvh_node_intersect(kg,
-					                                    tnear,
-					                                    tfar,
+					traverseChild = NODE_INTERSECT(kg,
+					                               tnear,
+					                               tfar,
 #ifdef __KERNEL_AVX2__
-					                                    P_idir4,
-#else
-					                                    org,
+					                               P_idir4,
 #endif
-					                                    idir4,
-					                                    near_x, near_y, near_z,
-					                                    far_x, far_y, far_z,
-					                                    nodeAddr,
-					                                    &dist);
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					                               org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+					                               dir4,
+#endif
+					                               idir4,
+					                               near_x, near_y, near_z,
+					                               far_x, far_y, far_z,
+					                               nodeAddr,
+					                               &dist);
 				}

 				if(traverseChild != 0) {
-					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					float4 cnodes;
+					/* TODO(sergey): Investigate whether moving the cnodes fetch
+					 * upwards gives a speedup (it will give a different cache
+					 * pattern, but will avoid the extra check here).
+					 */
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}

 					/* One child is hit, continue with that child. */
 					int r = __bscf(traverseChild);
@ -361,13 +394,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 					tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  ifdef __KERNEL_AVX2__
 					P_idir = P*idir;
 					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 					triangle_intersect_precalc(dir, &isect_precalc);

 					++stackPtr;
@ -398,13 +436,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 			tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  ifdef __KERNEL_AVX2__
 			P_idir = P*idir;
 			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 			triangle_intersect_precalc(dir, &isect_precalc);

 			object = OBJECT_NONE;
@ -417,3 +460,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 	return (isect->prim != PRIM_NONE);
 }
+
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
--- a/intern/cycles/kernel/geom/geom_qbvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h
@ -26,6 +26,12 @@
 *
 */

+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT qbvh_node_intersect
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                             const Ray *ray,
                                             Intersection *isect,
@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	isect->object = OBJECT_NONE;

 	ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
 	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));

 #ifdef __KERNEL_AVX2__
 	float3 P_idir = P*idir;
-	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
-	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
 #endif

 	/* Offsets to select the side that becomes the lower or upper bound. */
@ -104,22 +114,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif

 				ssef dist;
-				int traverseChild = qbvh_node_intersect(kg,
-				                                        tnear,
-				                                        tfar,
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
 #ifdef __KERNEL_AVX2__
-				                                        P_idir4,
-#else
-				                                        org,
+				                                   P_idir4,
 #endif
-				                                        idir4,
-				                                        near_x, near_y, near_z,
-				                                        far_x, far_y, far_z,
-				                                        nodeAddr,
-				                                        &dist);
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);

 				if(traverseChild != 0) {
-					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}

 					/* One child is hit, continue with that child. */
 					int r = __bscf(traverseChild);
@ -278,13 +301,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 						tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  ifdef __KERNEL_AVX2__
 						P_idir = P*idir;
 						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 						triangle_intersect_precalc(dir, &isect_precalc);

 						++stackPtr;
@ -319,13 +347,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 			tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  ifdef __KERNEL_AVX2__
 			P_idir = P*idir;
 			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 			triangle_intersect_precalc(dir, &isect_precalc);

 			object = OBJECT_NONE;
@ -337,3 +370,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 	return (isect->prim != PRIM_NONE);
 }
+
+#undef NODE_INTERSECT
--- a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
@ -26,6 +26,12 @@
 *
 */

+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT qbvh_node_intersect
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
 ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                             const Ray *ray,
                                             Intersection *isect_array,
@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif

 	ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
 	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));

 #ifdef __KERNEL_AVX2__
 	float3 P_idir = P*idir;
-	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
-	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
 #endif

 	/* Offsets to select the side that becomes the lower or upper bound. */
@ -108,22 +118,35 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #endif

 				ssef dist;
-				int traverseChild = qbvh_node_intersect(kg,
-				                                        tnear,
-				                                        tfar,
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
 #ifdef __KERNEL_AVX2__
-				                                        P_idir4,
-#else
-				                                        org,
+				                                   P_idir4,
 #endif
-				                                        idir4,
-				                                        near_x, near_y, near_z,
-				                                        far_x, far_y, far_z,
-				                                        nodeAddr,
-				                                        &dist);
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);

 				if(traverseChild != 0) {
-					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}

 					/* One child is hit, continue with that child. */
 					int r = __bscf(traverseChild);
@ -330,12 +353,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 						tfar = ssef(isect_t);
 						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  if BVH_FEATURE(BVH_HAIR)
+						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 #  ifdef __KERNEL_AVX2__
 						P_idir = P*idir;
 						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
@ -389,13 +417,18 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
 			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
 			tfar = ssef(isect_t);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
 			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  ifdef __KERNEL_AVX2__
 			P_idir = P*idir;
 			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  else
-			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
 			triangle_intersect_precalc(dir, &isect_precalc);
 			isect_t = tmax;
 			isect_array->t = isect_t;
@ -409,3 +442,5 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 	return num_hits;
 }
+
+#undef NODE_INTERSECT