Cycles: Implement unaligned nodes BVH traversal

This commit implements traversal of unaligned BVH nodes.

QBVH traversal is fully SIMD-optimized and calculates orientation
for all 4 children at a time; regular BVH traversal could probably be
optimized a bit more.
This commit is contained in:
Sergey Sharybin 2016-07-07 12:23:13 +02:00
parent b03e66e75f
commit a08e2179f1
14 changed files with 1574 additions and 533 deletions

@ -141,6 +141,7 @@ set(SRC_GEOM_HEADERS
geom/geom.h
geom/geom_attribute.h
geom/geom_bvh.h
geom/geom_bvh_nodes.h
geom/geom_bvh_shadow.h
geom/geom_bvh_subsurface.h
geom/geom_bvh_traversal.h

@ -77,6 +77,8 @@ CCL_NAMESPACE_BEGIN
/* Regular BVH traversal */
#include "geom_bvh_nodes.h"
#define BVH_FUNCTION_NAME bvh_intersect
#define BVH_FUNCTION_FEATURES 0
#include "geom_bvh_traversal.h"
@ -109,13 +111,13 @@ CCL_NAMESPACE_BEGIN
#if defined(__SUBSURFACE__)
# define BVH_FUNCTION_NAME bvh_intersect_subsurface
# define BVH_FUNCTION_FEATURES 0
# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "geom_bvh_subsurface.h"
#endif
#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
# define BVH_FUNCTION_FEATURES BVH_MOTION
# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
# include "geom_bvh_subsurface.h"
#endif
@ -123,19 +125,19 @@ CCL_NAMESPACE_BEGIN
#if defined(__VOLUME__)
# define BVH_FUNCTION_NAME bvh_intersect_volume
# define BVH_FUNCTION_FEATURES 0
# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "geom_bvh_volume.h"
#endif
#if defined(__VOLUME__) && defined(__INSTANCING__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
# define BVH_FUNCTION_FEATURES BVH_INSTANCING
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
# include "geom_bvh_volume.h"
#endif
#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_motion
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
# include "geom_bvh_volume.h"
#endif
@ -175,19 +177,19 @@ CCL_NAMESPACE_BEGIN
#if defined(__VOLUME_RECORD_ALL__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_all
# define BVH_FUNCTION_FEATURES 0
# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "geom_bvh_volume_all.h"
#endif
#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
# define BVH_FUNCTION_FEATURES BVH_INSTANCING
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
# include "geom_bvh_volume_all.h"
#endif
#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
# include "geom_bvh_volume_all.h"
#endif

@ -0,0 +1,659 @@
/*
* Copyright 2011-2016, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Fetch the orientation transform stored for one child of an unaligned node.
 *
 * Rows x, y and z are read from __bvh_nodes at
 * nodeAddr + child*3 + 1 .. nodeAddr + child*3 + 3; the w row is not stored
 * and is set to (0, 0, 0, 1).
 *
 * TODO(sergey): Look into avoiding the full Transform and using a 3x3 matrix
 * plus a 3-vector instead, which might be faster.
 */
ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
                                                           int nodeAddr,
                                                           int child)
{
	/* Each child occupies three consecutive rows after the first node row. */
	const int row_base = nodeAddr + child * 3;
	Transform tfm;
	tfm.x = kernel_tex_fetch(__bvh_nodes, row_base + 1);
	tfm.y = kernel_tex_fetch(__bvh_nodes, row_base + 2);
	tfm.z = kernel_tex_fetch(__bvh_nodes, row_base + 3);
	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
	return tfm;
}
#if !defined(__KERNEL_SSE2__)
/* Scalar slab test of a ray against both children of an axis-aligned node.
 *
 * P, idir:    ray origin and the direction reciprocal used for the slab test.
 * t:          upper bound of the ray interval.
 * nodeAddr:   index of the first row of the node in __bvh_nodes.
 * visibility: mask tested against the per-child flags (only when
 *             __VISIBILITY_FLAG__ is defined).
 * dist:       receives the entry distance of child 0 and child 1; both are
 *             written whether or not the child is hit.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1.
 */
ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
                                                 const float3 P,
                                                 const float3 idir,
                                                 const float t,
                                                 const int nodeAddr,
                                                 const uint visibility,
                                                 float *dist)
{
	/* Row 0: per-child flags, rows 1..3: X/Y/Z bounds of both children. */
	const float4 flags = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	const float4 bounds_x = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
	const float4 bounds_y = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
	const float4 bounds_z = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);

	/* Child 0 reads the .x/.z components of every bounds row. */
	const float a0x = (bounds_x.x - P.x) * idir.x;
	const float b0x = (bounds_x.z - P.x) * idir.x;
	const float a0y = (bounds_y.x - P.y) * idir.y;
	const float b0y = (bounds_y.z - P.y) * idir.y;
	const float a0z = (bounds_z.x - P.z) * idir.z;
	const float b0z = (bounds_z.z - P.z) * idir.z;
	const float enter0 = max4(min(a0x, b0x), min(a0y, b0y), min(a0z, b0z), 0.0f);
	const float exit0 = min4(max(a0x, b0x), max(a0y, b0y), max(a0z, b0z), t);

	/* Child 1 reads the .y/.w components of every bounds row. */
	const float a1x = (bounds_x.y - P.x) * idir.x;
	const float b1x = (bounds_x.w - P.x) * idir.x;
	const float a1y = (bounds_y.y - P.y) * idir.y;
	const float b1y = (bounds_y.w - P.y) * idir.y;
	const float a1z = (bounds_z.y - P.z) * idir.z;
	const float b1z = (bounds_z.w - P.z) * idir.z;
	const float enter1 = max4(min(a1x, b1x), min(a1y, b1y), min(a1z, b1z), 0.0f);
	const float exit1 = min4(max(a1x, b1x), max(a1y, b1y), max(a1z, b1z), t);

	dist[0] = enter0;
	dist[1] = enter1;

	int mask = 0;
#ifdef __VISIBILITY_FLAG__
	/* NOTE: this visibility test gives a 5% performance hit, how to solve? */
	if((exit0 >= enter0) && (__float_as_uint(flags.x) & visibility)) {
		mask |= 1;
	}
	if((exit1 >= enter1) && (__float_as_uint(flags.y) & visibility)) {
		mask |= 2;
	}
#else
	if(exit0 >= enter0) {
		mask |= 1;
	}
	if(exit1 >= enter1) {
		mask |= 2;
	}
#endif
	return mask;
}
/* Scalar slab test of a ray against both children of an axis-aligned node,
 * with the hit interval widened for children flagged as curves.
 *
 * P, idir:      ray origin and the direction reciprocal used for the slab test.
 * t:            upper bound of the ray interval.
 * difl, extmax: when difl is non-zero, a child whose flags contain
 *               PATH_RAY_CURVE gets its interval widened to
 *               [max((1 - difl)*min, min - extmax),
 *                min((1 + difl)*max, max + extmax)].
 * nodeAddr:     index of the first row of the node in __bvh_nodes.
 * visibility:   mask tested against the per-child flags (only when
 *               __VISIBILITY_FLAG__ is defined).
 * dist:         receives the (possibly widened) entry distance of child 0 and
 *               child 1; both are written whether or not the child is hit.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1.
 */
ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
                                                        const float3 P,
                                                        const float3 idir,
                                                        const float t,
                                                        const float difl,
                                                        const float extmax,
                                                        const int nodeAddr,
                                                        const uint visibility,
                                                        float *dist)
{
	/* fetch node data */
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);

	/* intersect ray against child nodes */
	float c0lox = (node0.x - P.x) * idir.x;
	float c0hix = (node0.z - P.x) * idir.x;
	float c0loy = (node1.x - P.y) * idir.y;
	float c0hiy = (node1.z - P.y) * idir.y;
	float c0loz = (node2.x - P.z) * idir.z;
	float c0hiz = (node2.z - P.z) * idir.z;
	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);

	float c1lox = (node0.y - P.x) * idir.x;
	float c1hix = (node0.w - P.x) * idir.x;
	float c1loy = (node1.y - P.y) * idir.y;
	float c1hiy = (node1.w - P.y) * idir.y;
	float c1loz = (node2.y - P.z) * idir.z;
	float c1hiz = (node2.w - P.z) * idir.z;
	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);

	if(difl != 0.0f) {
		float hdiff = 1.0f + difl;
		float ldiff = 1.0f - difl;
		/* The per-child flags live in cnodes.x / cnodes.y, the same components
		 * the visibility test below reads. cnodes.z / cnodes.w are read as
		 * child node addresses by the traversal code, so testing
		 * PATH_RAY_CURVE against them is meaningless.
		 */
		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
			c0min = max(ldiff * c0min, c0min - extmax);
			c0max = min(hdiff * c0max, c0max + extmax);
		}
		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
			c1min = max(ldiff * c1min, c1min - extmax);
			c1max = min(hdiff * c1max, c1max + extmax);
		}
	}

	dist[0] = c0min;
	dist[1] = c1min;

#ifdef __VISIBILITY_FLAG__
	/* this visibility test gives a 5% performance hit, how to solve? */
	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
#else
	return ((c0max >= c0min)? 1: 0) |
	       ((c1max >= c1min)? 2: 0);
#endif
}
/* Test a ray against a single child of an unaligned node.
 *
 * The ray origin and direction are transformed by the child's stored space,
 * then two slab distances per axis are computed from the negated result of
 * bvh_inverse_direction().
 * NOTE(review): presumably bvh_inverse_direction() is a component-wise
 * reciprocal, in which case the slabs are the planes 0 and 1 of each local
 * axis -- confirm against its definition.
 *
 * dist receives the entry distance (clamped to be >= 0), written whether or
 * not the child is hit. Returns true when the interval clipped to [0, t] is
 * non-empty.
 */
ccl_device_inline bool bvh_unaligned_node_intersect_child(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
        const float t,
        int nodeAddr,
        int child,
        float *dist)
{
	Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
	const float3 local_dir = transform_direction(&space, dir);
	const float3 local_P = transform_point(&space, P);
	const float3 neg_inv_dir = -bvh_inverse_direction(local_dir);

	/* Per-axis distances to the two slab planes. */
	const float3 t_first = local_P * neg_inv_dir;
	const float3 t_second = t_first - neg_inv_dir;

	const float t_enter = max4(0.0f,
	                           min(t_first.x, t_second.x),
	                           min(t_first.y, t_second.y),
	                           min(t_first.z, t_second.z));
	const float t_exit = min4(t,
	                          max(t_first.x, t_second.x),
	                          max(t_first.y, t_second.y),
	                          max(t_first.z, t_second.z));

	*dist = t_enter;
	return t_enter <= t_exit;
}
/* Test a ray against a single child of an unaligned node, with a relaxed
 * comparison when difl is non-zero.
 *
 * The ray is transformed by the child's stored space and slab-tested the same
 * way as bvh_unaligned_node_intersect_child(). When difl != 0 the final
 * comparison becomes (1 - difl)*tNear <= (1 + difl)*tFar.
 *
 * extmax is accepted so the signature lines up with the aligned robust
 * variant, but it is not used here. The parameter is named (rather than left
 * anonymous) because C before C23 requires an identifier for every parameter
 * in a function definition.
 *
 * dist receives the unscaled entry distance, written whether or not the child
 * is hit. Returns true when the child is to be traversed.
 */
ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
        const float t,
        const float difl,
        const float extmax,
        int nodeAddr,
        int child,
        float *dist)
{
	/* Intentionally unused, see the function comment. */
	(void)extmax;

	Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
	float3 aligned_dir = transform_direction(&space, dir);
	float3 aligned_P = transform_point(&space, P);
	float3 nrdir = -bvh_inverse_direction(aligned_dir);
	float3 tLowerXYZ = aligned_P * nrdir;
	float3 tUpperXYZ = tLowerXYZ - nrdir;
	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
	const float tFarX = max(tLowerXYZ.x, tUpperXYZ.x);
	const float tFarY = max(tLowerXYZ.y, tUpperXYZ.y);
	const float tFarZ = max(tLowerXYZ.z, tUpperXYZ.z);
	const float tNear = max4(0.0f, tNearX, tNearY, tNearZ);
	const float tFar = min4(t, tFarX, tFarY, tFarZ);
	*dist = tNear;
	if(difl != 0.0f) {
		/* TODO(sergey): Same as for QBVH, needs a proper use. */
		const float round_down = 1.0f - difl;
		const float round_up = 1.0f + difl;
		return round_down*tNear <= round_up*tFar;
	}
	else {
		return tNear <= tFar;
	}
}
/* Test a ray against both children of an unaligned node (scalar path).
 *
 * Each child is tested with bvh_unaligned_node_intersect_child(); idir is
 * not used by this function. dist[0] and dist[1] receive the entry distances
 * of child 0 and child 1.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1. When
 * __VISIBILITY_FLAG__ is defined a child additionally has to pass the
 * visibility test against its flags in the first node row.
 */
ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const float3 idir,
                                                   const float t,
                                                   const int nodeAddr,
                                                   const uint visibility,
                                                   float *dist)
{
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

	/* Both children are always tested so that both distances get written. */
	const bool hit0 = bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0]);
	const bool hit1 = bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1]);

#ifdef __VISIBILITY_FLAG__
	const bool take0 = hit0 && (__float_as_uint(cnodes.x) & visibility);
	const bool take1 = hit1 && (__float_as_uint(cnodes.y) & visibility);
#else
	const bool take0 = hit0;
	const bool take1 = hit1;
#endif

	return (take0 ? 1 : 0) | (take1 ? 2 : 0);
}
/* Test a ray against both children of an unaligned node using the robust
 * per-child test (scalar path).
 *
 * Each child is tested with bvh_unaligned_node_intersect_child_robust(), to
 * which difl and extmax are forwarded; idir is not used by this function.
 * dist[0] and dist[1] receive the entry distances of child 0 and child 1.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1. When
 * __VISIBILITY_FLAG__ is defined a child additionally has to pass the
 * visibility test against its flags in the first node row.
 */
ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const float3 idir,
                                                          const float t,
                                                          const float difl,
                                                          const float extmax,
                                                          const int nodeAddr,
                                                          const uint visibility,
                                                          float *dist)
{
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

	/* Both children are always tested so that both distances get written. */
	const bool hit0 = bvh_unaligned_node_intersect_child_robust(
	        kg, P, dir, t, difl, extmax, nodeAddr, 0, &dist[0]);
	const bool hit1 = bvh_unaligned_node_intersect_child_robust(
	        kg, P, dir, t, difl, extmax, nodeAddr, 1, &dist[1]);

#ifdef __VISIBILITY_FLAG__
	const bool take0 = hit0 && (__float_as_uint(cnodes.x) & visibility);
	const bool take1 = hit1 && (__float_as_uint(cnodes.y) & visibility);
#else
	const bool take0 = hit0;
	const bool take1 = hit1;
#endif

	return (take0 ? 1 : 0) | (take1 ? 2 : 0);
}
/* Intersect a ray with both children of a node, picking the aligned or the
 * unaligned test (scalar path).
 *
 * The node is treated as unaligned when PATH_RAY_NODE_UNALIGNED is set in the
 * x component of its first row. All other arguments are forwarded; the
 * aligned test does not take dir.
 *
 * Returns the child bitmask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3 P,
                                         const float3 dir,
                                         const float3 idir,
                                         const float t,
                                         const int nodeAddr,
                                         const uint visibility,
                                         float dist[2])
{
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(!is_unaligned) {
		return bvh_aligned_node_intersect(kg, P, idir, t, nodeAddr, visibility, dist);
	}
	return bvh_unaligned_node_intersect(kg, P, dir, idir, t, nodeAddr, visibility, dist);
}
/* Robust counterpart of bvh_node_intersect() (scalar path).
 *
 * The node is treated as unaligned when PATH_RAY_NODE_UNALIGNED is set in the
 * x component of its first row. difl and extmax are forwarded to whichever
 * robust test is selected; the aligned test does not take dir.
 *
 * Returns the child bitmask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3 P,
                                                const float3 dir,
                                                const float3 idir,
                                                const float t,
                                                const float difl,
                                                const float extmax,
                                                const int nodeAddr,
                                                const uint visibility,
                                                float dist[2])
{
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(!is_unaligned) {
		return bvh_aligned_node_intersect_robust(
		        kg, P, idir, t, difl, extmax, nodeAddr, visibility, dist);
	}
	return bvh_unaligned_node_intersect_robust(
	        kg, P, dir, idir, t, difl, extmax, nodeAddr, visibility, dist);
}
#else /* !defined(__KERNEL_SSE2__) */
/* SSE slab test of a ray against both children of an axis-aligned node
 * (adapted from Embree).
 *
 * tsplat, Psplat, idirsplat and shufflexyz are the pre-splatted ray data
 * prepared by the caller; P and dir are not used by this function.
 * dist[0] and dist[1] receive lanes 0 and 1 of the combined result, i.e. the
 * entry distances of child 0 and child 1.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1.
 */
int ccl_device_inline bvh_aligned_node_intersect(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
        const ssef& tsplat,
        const ssef Psplat[3],
        const ssef idirsplat[3],
        const shuffle_swap_t shufflexyz[3],
        const int nodeAddr,
        const uint visibility,
        float dist[2])
{
	/* Sign-bit mask for the two *max lanes (see the layout comment below). */
	const ssef max_sign_mask = cast(ssei(0, 0, 0x80000000, 0x80000000));

	/* Rows 1..3 of the node hold the X/Y/Z bounds of both children. */
	const ssef *rows = (ssef*)kg->__bvh_nodes.data + nodeAddr;

	const ssef slab_x = (shuffle_swap(rows[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
	const ssef slab_y = (shuffle_swap(rows[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
	const ssef slab_z = (shuffle_swap(rows[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];

	/* { c0min, c1min, -c0max, -c1max } before the sign flip. */
	const ssef packed = max(max(slab_x, slab_y), max(slab_z, tsplat));
	const ssef near_far = packed ^ max_sign_mask;

	/* Compare each *min lane against the matching *max lane. */
	const sseb hit_lanes = near_far <= shuffle<2, 3, 0, 1>(near_far);

	dist[0] = near_far[0];
	dist[1] = near_far[1];

	const int hits = movemask(hit_lanes);
# ifdef __VISIBILITY_FLAG__
	/* NOTE: this visibility test gives a 5% performance hit, how to solve? */
	const float4 flags = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int visible_hits = 0;
	if((hits & 1) && (__float_as_uint(flags.x) & visibility)) {
		visible_hits |= 1;
	}
	if((hits & 2) && (__float_as_uint(flags.y) & visibility)) {
		visible_hits |= 2;
	}
	return visible_hits;
# else
	return hits & 3;
# endif
}
/* SSE slab test of a ray against both children of an axis-aligned node, with
 * the hit interval widened for children flagged as curves (adapted from
 * Embree).
 *
 * tsplat, Psplat, idirsplat and shufflexyz are the pre-splatted ray data
 * prepared by the caller; P and dir are not used by this function.
 * When difl is non-zero, a child whose flags contain PATH_RAY_CURVE gets its
 * interval widened to
 *   [max((1 - difl)*min, min - extmax), min((1 + difl)*max, max + extmax)].
 * dist[0] and dist[1] receive the (possibly widened) entry distances of
 * child 0 and child 1.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1.
 */
int ccl_device_inline bvh_aligned_node_intersect_robust(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
        const ssef& tsplat,
        const ssef Psplat[3],
        const ssef idirsplat[3],
        const shuffle_swap_t shufflexyz[3],
        const float difl,
        const float extmax,
        const int nodeAddr,
        const uint visibility,
        float dist[2])
{
	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));

	/* fetch node data */
	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;

	/* intersect ray against child nodes */
	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];

	/* calculate { c0min, c1min, -c0max, -c1max} */
	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));

	/* Deliberately not const: the lanes are adjusted in place below, and
	 * writing to an object defined const is undefined behavior.
	 */
	ssef tminmax = minmax ^ pn;

	if(difl != 0.0f) {
		float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
		/* View the four lanes as { c0min, c1min, c0max, c1max }. */
		float4 *tminmaxview = (float4*)&tminmax;
		float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
		float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
		float hdiff = 1.0f + difl;
		float ldiff = 1.0f - difl;
		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
			c0min = max(ldiff * c0min, c0min - extmax);
			c0max = min(hdiff * c0max, c0max + extmax);
		}
		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
			c1min = max(ldiff * c1min, c1min - extmax);
			c1max = min(hdiff * c1max, c1max + extmax);
		}
	}

	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
	dist[0] = tminmax[0];
	dist[1] = tminmax[1];

	int mask = movemask(lrhit);

# ifdef __VISIBILITY_FLAG__
	/* this visibility test gives a 5% performance hit, how to solve? */
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
	return cmask;
# else
	return mask & 3;
# endif
}
/* SSE test of a ray against both children of an unaligned node.
 *
 * The ray is transformed by each child's stored space, then both children are
 * slab-tested at once in lanes 0 and 1; lanes 2 and 3 are filled with zeros
 * and masked out of the result. tnear and tfar are the splatted bounds of the
 * ray interval supplied by the caller.
 * dist[0] and dist[1] receive the entry distances of child 0 and child 1.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1.
 */
int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const ssef& tnear,
                                                   const ssef& tfar,
                                                   const int nodeAddr,
                                                   const uint visibility,
                                                   float dist[2])
{
	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);

	/* Ray expressed in the space of each child. */
	const float3 local_dir0 = transform_direction(&space0, dir);
	const float3 local_dir1 = transform_direction(&space1, dir);
	const float3 local_P0 = transform_point(&space0, P);
	const float3 local_P1 = transform_point(&space1, P);
	const float3 neg_inv0 = -bvh_inverse_direction(local_dir0);
	const float3 neg_inv1 = -bvh_inverse_direction(local_dir1);

	/* Lane 0: child 0, lane 1: child 1, lanes 2..3: unused. */
	const ssef lower_x = ssef(local_P0.x * neg_inv0.x, local_P1.x * neg_inv1.x, 0.0f, 0.0f);
	const ssef lower_y = ssef(local_P0.y * neg_inv0.y, local_P1.y * neg_inv1.y, 0.0f, 0.0f);
	const ssef lower_z = ssef(local_P0.z * neg_inv0.z, local_P1.z * neg_inv1.z, 0.0f, 0.0f);
	const ssef upper_x = lower_x - ssef(neg_inv0.x, neg_inv1.x, 0.0f, 0.0f);
	const ssef upper_y = lower_y - ssef(neg_inv0.y, neg_inv1.y, 0.0f, 0.0f);
	const ssef upper_z = lower_z - ssef(neg_inv0.z, neg_inv1.z, 0.0f, 0.0f);

	const ssef enter = max4(min(lower_x, upper_x),
	                        min(lower_y, upper_y),
	                        min(lower_z, upper_z),
	                        tnear);
	const ssef leave = min4(max(lower_x, upper_x),
	                        max(lower_y, upper_y),
	                        max(lower_z, upper_z),
	                        tfar);
	const sseb hit_lanes = enter <= leave;

	dist[0] = enter.f[0];
	dist[1] = enter.f[1];

	const int hits = (int)movemask(hit_lanes);
# ifdef __VISIBILITY_FLAG__
	/* NOTE: this visibility test gives a 5% performance hit, how to solve? */
	const float4 flags = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int visible_hits = 0;
	if((hits & 1) && (__float_as_uint(flags.x) & visibility)) {
		visible_hits |= 1;
	}
	if((hits & 2) && (__float_as_uint(flags.y) & visibility)) {
		visible_hits |= 2;
	}
	return visible_hits;
# else
	return hits & 3;
# endif
}
/* SSE test of a ray against both children of an unaligned node, with a
 * relaxed comparison when difl is non-zero.
 *
 * The slab test matches bvh_unaligned_node_intersect(); when difl != 0 the
 * final comparison becomes (1 - difl)*tNear <= (1 + difl)*tFar. The unnamed
 * extmax parameter is accepted but not used.
 * dist[0] and dist[1] receive the unscaled entry distances of child 0 and
 * child 1.
 *
 * Returns a bitmask: 1 when child 0 is to be traversed, 2 for child 1.
 */
int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const ssef& tnear,
                                                          const ssef& tfar,
                                                          const float difl,
                                                          const float /*extmax*/,
                                                          const int nodeAddr,
                                                          const uint visibility,
                                                          float dist[2])
{
	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);

	/* Ray expressed in the space of each child. */
	const float3 local_dir0 = transform_direction(&space0, dir);
	const float3 local_dir1 = transform_direction(&space1, dir);
	const float3 local_P0 = transform_point(&space0, P);
	const float3 local_P1 = transform_point(&space1, P);
	const float3 neg_inv0 = -bvh_inverse_direction(local_dir0);
	const float3 neg_inv1 = -bvh_inverse_direction(local_dir1);

	/* Lane 0: child 0, lane 1: child 1, lanes 2..3: unused. */
	const ssef lower_x = ssef(local_P0.x * neg_inv0.x, local_P1.x * neg_inv1.x, 0.0f, 0.0f);
	const ssef lower_y = ssef(local_P0.y * neg_inv0.y, local_P1.y * neg_inv1.y, 0.0f, 0.0f);
	const ssef lower_z = ssef(local_P0.z * neg_inv0.z, local_P1.z * neg_inv1.z, 0.0f, 0.0f);
	const ssef upper_x = lower_x - ssef(neg_inv0.x, neg_inv1.x, 0.0f, 0.0f);
	const ssef upper_y = lower_y - ssef(neg_inv0.y, neg_inv1.y, 0.0f, 0.0f);
	const ssef upper_z = lower_z - ssef(neg_inv0.z, neg_inv1.z, 0.0f, 0.0f);

	const ssef enter = max4(min(lower_x, upper_x),
	                        min(lower_y, upper_y),
	                        min(lower_z, upper_z),
	                        tnear);
	const ssef leave = min4(max(lower_x, upper_x),
	                        max(lower_y, upper_y),
	                        max(lower_z, upper_z),
	                        tfar);

	sseb hit_lanes;
	if(difl == 0.0f) {
		hit_lanes = enter <= leave;
	}
	else {
		/* Scale both ends of the interval apart before comparing. */
		const float shrink = 1.0f - difl;
		const float grow = 1.0f + difl;
		hit_lanes = shrink*enter <= grow*leave;
	}

	dist[0] = enter.f[0];
	dist[1] = enter.f[1];

	const int hits = (int)movemask(hit_lanes);
# ifdef __VISIBILITY_FLAG__
	/* NOTE: this visibility test gives a 5% performance hit, how to solve? */
	const float4 flags = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int visible_hits = 0;
	if((hits & 1) && (__float_as_uint(flags.x) & visibility)) {
		visible_hits |= 1;
	}
	if((hits & 2) && (__float_as_uint(flags.y) & visibility)) {
		visible_hits |= 2;
	}
	return visible_hits;
# else
	return hits & 3;
# endif
}
/* Intersect a ray with both children of a node, picking the aligned or the
 * unaligned test (SSE path).
 *
 * The node is treated as unaligned when PATH_RAY_NODE_UNALIGNED is set in the
 * x component of its first row. The unaligned test receives tnear/tfar, the
 * aligned test receives the splatted ray data.
 *
 * Returns the child bitmask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3& P,
                                         const float3& dir,
                                         const ssef& tnear,
                                         const ssef& tfar,
                                         const ssef& tsplat,
                                         const ssef Psplat[3],
                                         const ssef idirsplat[3],
                                         const shuffle_swap_t shufflexyz[3],
                                         const int nodeAddr,
                                         const uint visibility,
                                         float dist[2])
{
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(!is_unaligned) {
		return bvh_aligned_node_intersect(
		        kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, nodeAddr, visibility, dist);
	}
	return bvh_unaligned_node_intersect(
	        kg, P, dir, tnear, tfar, nodeAddr, visibility, dist);
}
/* Robust counterpart of bvh_node_intersect() (SSE path).
 *
 * The node is treated as unaligned when PATH_RAY_NODE_UNALIGNED is set in the
 * x component of its first row. difl and extmax are forwarded to whichever
 * robust test is selected; the unaligned test receives tnear/tfar, the
 * aligned test receives the splatted ray data.
 *
 * Returns the child bitmask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3& P,
                                                const float3& dir,
                                                const ssef& tnear,
                                                const ssef& tfar,
                                                const ssef& tsplat,
                                                const ssef Psplat[3],
                                                const ssef idirsplat[3],
                                                const shuffle_swap_t shufflexyz[3],
                                                const float difl,
                                                const float extmax,
                                                const int nodeAddr,
                                                const uint visibility,
                                                float dist[2])
{
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(!is_unaligned) {
		return bvh_aligned_node_intersect_robust(
		        kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz,
		        difl, extmax, nodeAddr, visibility, dist);
	}
	return bvh_unaligned_node_intersect_robust(
	        kg, P, dir, tnear, tfar, difl, extmax, nodeAddr, visibility, dist);
}
#endif /* !defined(__KERNEL_SSE2__) */

@ -21,6 +21,12 @@
# include "geom_qbvh_shadow.h"
#endif
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
# define NODE_INTERSECT bvh_aligned_node_intersect
#endif
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
@ -41,7 +47,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
* - likely and unlikely for if() statements
* - test restrict attribute for pointers
*/
/* traversal stack in CUDA thread-local memory */
int traversalStack[BVH_STACK_SIZE];
traversalStack[0] = ENTRYPOINT_SENTINEL;
@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
# if BVH_FEATURE(BVH_HAIR)
ssef tnear(0.0f), tfar(isect_t);
# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@ -94,86 +103,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
bool traverseChild0, traverseChild1;
int nodeAddrChild1;
int nodeAddrChild1, traverse_mask;
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
/* Intersect two child bounding boxes, non-SSE version */
float t = isect_t;
/* fetch node data */
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
/* intersect ray against child nodes */
float c0lox = (node0.x - P.x) * idir.x;
float c0hix = (node0.z - P.x) * idir.x;
float c0loy = (node1.x - P.y) * idir.y;
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
float c1loy = (node1.y - P.y) * idir.y;
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
/* decide which nodes to traverse next */
# ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
# else
traverseChild0 = (c0max >= c0min);
traverseChild1 = (c1max >= c1min);
traverse_mask = NODE_INTERSECT(kg,
P,
# if BVH_FEATURE(BVH_HAIR)
dir,
# endif
idir,
isect_t,
nodeAddr,
PATH_RAY_SHADOW,
dist);
#else // __KERNEL_SSE2__
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
const ssef tminmax = minmax ^ pn;
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
# ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
# else
traverseChild0 = (movemask(lrhit) & 1);
traverseChild1 = (movemask(lrhit) & 2);
traverse_mask = NODE_INTERSECT(kg,
P,
dir,
# if BVH_FEATURE(BVH_HAIR)
tnear,
tfar,
# endif
tsplat,
Psplat,
idirsplat,
shufflexyz,
nodeAddr,
PATH_RAY_SHADOW,
dist);
#endif // __KERNEL_SSE2__
nodeAddr = __float_as_int(cnodes.x);
nodeAddrChild1 = __float_as_int(cnodes.y);
nodeAddr = __float_as_int(cnodes.z);
nodeAddrChild1 = __float_as_int(cnodes.w);
if(traverseChild0 && traverseChild1) {
/* both children were intersected, push the farther one */
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(traverse_mask == 3) {
/* Both children were intersected, push the farther one. */
bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@ -186,12 +153,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
/* one child was intersected */
if(traverseChild1) {
/* One child was intersected. */
if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
else if(!traverseChild0) {
/* neither child was intersected */
else if(traverse_mask == 0) {
/* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@ -238,7 +205,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if BVH_FEATURE(BVH_HAIR)
case PRIMITIVE_CURVE:
case PRIMITIVE_MOTION_CURVE: {
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
else
hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
@ -317,6 +284,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect_t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -369,6 +339,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect_t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -410,3 +383,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
#undef NODE_INTERSECT

@ -21,6 +21,12 @@
# include "geom_qbvh_subsurface.h"
#endif
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
# define NODE_INTERSECT bvh_aligned_node_intersect
#endif
/* This is a template BVH traversal function for subsurface scattering, where
* various features can be enabled/disabled. This way we can compile optimized
* versions for each case without new features slowing things down.
@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
# if BVH_FEATURE(BVH_HAIR)
ssef tnear(0.0f), tfar(isect_t);
# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@ -100,79 +109,47 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
/* traversal loop */
do {
do
{
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
{
bool traverseChild0, traverseChild1;
int nodeAddrChild1;
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
int nodeAddrChild1, traverse_mask;
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
/* Intersect two child bounding boxes, non-SSE version */
float t = isect_t;
/* fetch node data */
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
/* intersect ray against child nodes */
float c0lox = (node0.x - P.x) * idir.x;
float c0hix = (node0.z - P.x) * idir.x;
float c0loy = (node1.x - P.y) * idir.y;
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
float c1loy = (node1.y - P.y) * idir.y;
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
/* decide which nodes to traverse next */
traverseChild0 = (c0max >= c0min);
traverseChild1 = (c1max >= c1min);
traverse_mask = NODE_INTERSECT(kg,
P,
# if BVH_FEATURE(BVH_HAIR)
dir,
# endif
idir,
isect_t,
nodeAddr,
PATH_RAY_ALL_VISIBILITY,
dist);
#else // __KERNEL_SSE2__
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
const ssef tminmax = minmax ^ pn;
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
traverseChild0 = (movemask(lrhit) & 1);
traverseChild1 = (movemask(lrhit) & 2);
traverse_mask = NODE_INTERSECT(kg,
P,
dir,
# if BVH_FEATURE(BVH_HAIR)
tnear,
tfar,
# endif
tsplat,
Psplat,
idirsplat,
shufflexyz,
nodeAddr,
PATH_RAY_ALL_VISIBILITY,
dist);
#endif // __KERNEL_SSE2__
nodeAddr = __float_as_int(cnodes.x);
nodeAddrChild1 = __float_as_int(cnodes.y);
nodeAddr = __float_as_int(cnodes.z);
nodeAddrChild1 = __float_as_int(cnodes.w);
if(traverseChild0 && traverseChild1) {
/* both children were intersected, push the farther one */
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(traverse_mask == 3) {
/* Both children were intersected, push the farther one. */
bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@ -185,12 +162,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
/* one child was intersected */
if(traverseChild1) {
/* One child was intersected. */
if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
else if(!traverseChild0) {
/* neither child was intersected */
else if(traverse_mask == 0) {
/* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@ -286,3 +263,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
#undef NODE_INTERSECT

@ -21,6 +21,14 @@
# include "geom_qbvh_traversal.h"
#endif
/* Choose the node intersection routines for this traversal variant.
 *
 * When the variant is compiled with BVH_HAIR, the generic bvh_node_intersect*
 * functions are used; otherwise the aligned-only functions are called
 * directly. Both macros are #undef'ed again at the end of this header.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust
#else
# define NODE_INTERSECT bvh_aligned_node_intersect
# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust
#endif
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
@ -49,7 +57,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
* - likely and unlikely for if() statements
* - test restrict attribute for pointers
*/
/* traversal stack in CUDA thread-local memory */
int traversalStack[BVH_STACK_SIZE];
traversalStack[0] = ENTRYPOINT_SENTINEL;
@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
# if BVH_FEATURE(BVH_HAIR)
ssef tnear(0.0f), tfar(isect->t);
# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@ -101,121 +112,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
bool traverseChild0, traverseChild1;
int nodeAddrChild1;
int nodeAddrChild1, traverse_mask;
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
/* Intersect two child bounding boxes, non-SSE version */
float t = isect->t;
/* fetch node data */
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
/* intersect ray against child nodes */
float c0lox = (node0.x - P.x) * idir.x;
float c0hix = (node0.z - P.x) * idir.x;
float c0loy = (node1.x - P.y) * idir.y;
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
float c1loy = (node1.y - P.y) * idir.y;
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(difl != 0.0f) {
float hdiff = 1.0f + difl;
float ldiff = 1.0f - difl;
if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
c0min = max(ldiff * c0min, c0min - extmax);
c0max = min(hdiff * c0max, c0max + extmax);
}
if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
c1min = max(ldiff * c1min, c1min - extmax);
c1max = min(hdiff * c1max, c1max + extmax);
}
traverse_mask = NODE_INTERSECT_ROBUST(kg,
P,
# if BVH_FEATURE(BVH_HAIR)
dir,
# endif
idir,
isect->t,
difl,
extmax,
nodeAddr,
visibility,
dist);
}
else
# endif
/* decide which nodes to traverse next */
# ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
# else
traverseChild0 = (c0max >= c0min);
traverseChild1 = (c1max >= c1min);
# endif
{
traverse_mask = NODE_INTERSECT(kg,
P,
# if BVH_FEATURE(BVH_HAIR)
dir,
# endif
idir,
isect->t,
nodeAddr,
visibility,
dist);
}
#else // __KERNEL_SSE2__
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
const ssef tminmax = minmax ^ pn;
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(difl != 0.0f) {
float4 *tminmaxview = (float4*)&tminmax;
float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
float &c0max = tminmaxview->z, &c1max = tminmaxview->w;
float hdiff = 1.0f + difl;
float ldiff = 1.0f - difl;
if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
c0min = max(ldiff * c0min, c0min - extmax);
c0max = min(hdiff * c0max, c0max + extmax);
}
if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
c1min = max(ldiff * c1min, c1min - extmax);
c1max = min(hdiff * c1max, c1max + extmax);
}
traverse_mask = NODE_INTERSECT_ROBUST(kg,
P,
dir,
# if BVH_FEATURE(BVH_HAIR)
tnear,
tfar,
# endif
tsplat,
Psplat,
idirsplat,
shufflexyz,
difl,
extmax,
nodeAddr,
visibility,
dist);
}
else
# endif
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
# ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
# else
traverseChild0 = (movemask(lrhit) & 1);
traverseChild1 = (movemask(lrhit) & 2);
# endif
{
traverse_mask = NODE_INTERSECT(kg,
P,
dir,
# if BVH_FEATURE(BVH_HAIR)
tnear,
tfar,
# endif
tsplat,
Psplat,
idirsplat,
shufflexyz,
nodeAddr,
visibility,
dist);
}
#endif // __KERNEL_SSE2__
nodeAddr = __float_as_int(cnodes.x);
nodeAddrChild1 = __float_as_int(cnodes.y);
nodeAddr = __float_as_int(cnodes.z);
nodeAddrChild1 = __float_as_int(cnodes.w);
if(traverseChild0 && traverseChild1) {
/* both children were intersected, push the farther one */
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(traverse_mask == 3) {
/* Both children were intersected, push the farther one. */
bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@ -228,12 +204,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
/* one child was intersected */
if(traverseChild1) {
/* One child was intersected. */
if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
else if(!traverseChild0) {
/* neither child was intersected */
else if(traverse_mask == 0) {
/* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@ -268,6 +244,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
#else
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
@ -287,6 +266,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
# else
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
@ -313,6 +295,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
# else
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
@ -342,6 +327,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -376,6 +364,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -433,3 +424,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
#undef NODE_INTERSECT
#undef NODE_INTERSECT_ROBUST

@ -18,7 +18,13 @@
*/
#ifdef __QBVH__
#include "geom_qbvh_volume.h"
# include "geom_qbvh_volume.h"
#endif
/* Choose the node intersection routine for this traversal variant:
 * bvh_node_intersect when compiled with BVH_HAIR, otherwise the aligned-only
 * bvh_aligned_node_intersect. The macro is #undef'ed at the end of this header.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
# define NODE_INTERSECT bvh_aligned_node_intersect
#endif
/* This is a template BVH traversal function for volumes, where
@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
# if BVH_FEATURE(BVH_HAIR)
ssef tnear(0.0f), tfar(isect->t);
# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@ -91,75 +100,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
bool traverseChild0, traverseChild1;
int nodeAddrChild1;
int nodeAddrChild1, traverse_mask;
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
/* Intersect two child bounding boxes, non-SSE version */
float t = isect->t;
/* fetch node data */
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
/* intersect ray against child nodes */
float c0lox = (node0.x - P.x) * idir.x;
float c0hix = (node0.z - P.x) * idir.x;
float c0loy = (node1.x - P.y) * idir.y;
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
float c1loy = (node1.y - P.y) * idir.y;
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
/* decide which nodes to traverse next */
traverseChild0 = (c0max >= c0min);
traverseChild1 = (c1max >= c1min);
traverse_mask = NODE_INTERSECT(kg,
P,
# if BVH_FEATURE(BVH_HAIR)
dir,
# endif
idir,
isect->t,
nodeAddr,
visibility,
dist);
#else // __KERNEL_SSE2__
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
const ssef tminmax = minmax ^ pn;
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
traverseChild0 = (movemask(lrhit) & 1);
traverseChild1 = (movemask(lrhit) & 2);
traverse_mask = NODE_INTERSECT(kg,
P,
dir,
# if BVH_FEATURE(BVH_HAIR)
tnear,
tfar,
# endif
tsplat,
Psplat,
idirsplat,
shufflexyz,
nodeAddr,
visibility,
dist);
#endif // __KERNEL_SSE2__
nodeAddr = __float_as_int(cnodes.x);
nodeAddrChild1 = __float_as_int(cnodes.y);
nodeAddr = __float_as_int(cnodes.z);
nodeAddrChild1 = __float_as_int(cnodes.w);
if(traverseChild0 && traverseChild1) {
/* both children were intersected, push the farther one */
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(traverse_mask == 3) {
/* Both children were intersected, push the farther one. */
bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@ -172,12 +150,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
/* one child was intersected */
if(traverseChild1) {
/* One child was intersected. */
if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
else if(!traverseChild0) {
/* neither child was intersected */
else if(traverse_mask == 0) {
/* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@ -258,6 +236,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -298,6 +279,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -337,3 +321,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
#undef NODE_INTERSECT

@ -18,7 +18,13 @@
*/
#ifdef __QBVH__
#include "geom_qbvh_volume_all.h"
# include "geom_qbvh_volume_all.h"
#endif
/* Choose the node intersection routine for this traversal variant:
 * bvh_node_intersect when compiled with BVH_HAIR, otherwise the aligned-only
 * bvh_aligned_node_intersect. The macro is #undef'ed at the end of this header.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
# define NODE_INTERSECT bvh_aligned_node_intersect
#endif
/* This is a template BVH traversal function for volumes, where
@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
# if BVH_FEATURE(BVH_HAIR)
ssef tnear(0.0f), tfar(isect_t);
# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@ -95,75 +104,44 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
bool traverseChild0, traverseChild1;
int nodeAddrChild1;
int nodeAddrChild1, traverse_mask;
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
/* Intersect two child bounding boxes, non-SSE version */
float t = isect_array->t;
/* fetch node data */
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
/* intersect ray against child nodes */
float c0lox = (node0.x - P.x) * idir.x;
float c0hix = (node0.z - P.x) * idir.x;
float c0loy = (node1.x - P.y) * idir.y;
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
float c1loy = (node1.y - P.y) * idir.y;
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
/* decide which nodes to traverse next */
traverseChild0 = (c0max >= c0min);
traverseChild1 = (c1max >= c1min);
traverse_mask = NODE_INTERSECT(kg,
P,
# if BVH_FEATURE(BVH_HAIR)
dir,
# endif
idir,
isect_t,
nodeAddr,
visibility,
dist);
#else // __KERNEL_SSE2__
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
const ssef tminmax = minmax ^ pn;
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
traverseChild0 = (movemask(lrhit) & 1);
traverseChild1 = (movemask(lrhit) & 2);
traverse_mask = NODE_INTERSECT(kg,
P,
dir,
# if BVH_FEATURE(BVH_HAIR)
tnear,
tfar,
# endif
tsplat,
Psplat,
idirsplat,
shufflexyz,
nodeAddr,
visibility,
dist);
#endif // __KERNEL_SSE2__
nodeAddr = __float_as_int(cnodes.x);
nodeAddrChild1 = __float_as_int(cnodes.y);
nodeAddr = __float_as_int(cnodes.z);
nodeAddrChild1 = __float_as_int(cnodes.w);
if(traverseChild0 && traverseChild1) {
/* both children were intersected, push the farther one */
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(traverse_mask == 3) {
/* Both children were intersected, push the farther one. */
bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@ -176,12 +154,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
/* one child was intersected */
if(traverseChild1) {
/* One child was intersected. */
if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
else if(!traverseChild0) {
/* neither child was intersected */
else if(traverse_mask == 0) {
/* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@ -311,6 +289,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect_t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -368,6 +349,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect_t);
# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@ -410,3 +394,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
#undef NODE_INTERSECT

@ -51,23 +51,25 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
}
ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
const ssef& tnear,
const ssef& tfar,
/* Axis-aligned nodes intersection */
ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
const sse3f& org_idir,
const sse3f& org_idir,
#else
const sse3f& org,
const sse3f& org,
#endif
const sse3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int nodeAddr,
ssef *__restrict dist)
const sse3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int nodeAddr,
ssef *__restrict dist)
{
const int offset = nodeAddr + 1;
#ifdef __KERNEL_AVX2__
@ -101,24 +103,25 @@ ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
return mask;
}
ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
const ssef& tnear,
const ssef& tfar,
ccl_device_inline int qbvh_aligned_node_intersect_robust(
KernelGlobals *__restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
const sse3f& P_idir,
const sse3f& P_idir,
#else
const sse3f& P,
const sse3f& P,
#endif
const sse3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int nodeAddr,
const float difl,
ssef *__restrict dist)
const sse3f& idir,
const int near_x,
const int near_y,
const int near_z,
const int far_x,
const int far_y,
const int far_z,
const int nodeAddr,
const float difl,
ssef *__restrict dist)
{
const int offset = nodeAddr + 1;
#ifdef __KERNEL_AVX2__
@ -145,3 +148,286 @@ ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
*dist = tNear;
return (int)movemask(vmask);
}
/* Unaligned nodes intersection */
ccl_device_inline int qbvh_unaligned_node_intersect(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& org_idir,
#endif
        const sse3f& org,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        ssef *__restrict dist)
{
	/* Intersect a ray with the unaligned (oriented) children of a QBVH node.
	 *
	 * The ray is moved into the local space described by the node's
	 * transform and a slab test is run against the [0, 1] range on every
	 * local axis. All values are ssef, so the test is evaluated for every
	 * SIMD lane at once (presumably one lane per child -- confirm against
	 * the node packing code).
	 *
	 * Writes the per-lane entry distance to *dist and returns movemask() of
	 * the per-lane "entry <= exit" comparison.
	 *
	 * org_idir, idir and the near_x .. far_z arguments are not read here.
	 */

	/* Rows nodeAddr+1 .. nodeAddr+12 of __bvh_nodes: a 3x3 linear part
	 * (lin_*) followed by a translation (ofs_*). */
	const ssef lin_xx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+1);
	const ssef lin_xy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+2);
	const ssef lin_xz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+3);

	const ssef lin_yx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+4);
	const ssef lin_yy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+5);
	const ssef lin_yz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+6);

	const ssef lin_zx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+7);
	const ssef lin_zy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+8);
	const ssef lin_zz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+9);

	const ssef ofs_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+10);
	const ssef ofs_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+11);
	const ssef ofs_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+12);

	/* Ray direction in local space (linear part only). */
	const ssef local_dir_x = dir.x*lin_xx + dir.y*lin_xy + dir.z*lin_xz;
	const ssef local_dir_y = dir.x*lin_yx + dir.y*lin_yy + dir.z*lin_yz;
	const ssef local_dir_z = dir.x*lin_zx + dir.y*lin_zy + dir.z*lin_zz;

	/* Ray origin in local space (linear part plus translation). */
	const ssef local_org_x = org.x*lin_xx + org.y*lin_xy + org.z*lin_xz + ofs_x;
	const ssef local_org_y = org.x*lin_yx + org.y*lin_yy + org.z*lin_yz + ofs_y;
	const ssef local_org_z = org.x*lin_zx + org.y*lin_zy + org.z*lin_zz + ofs_z;

	/* Negated reciprocal of the local direction. */
	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
	const ssef neg_rcp_x = neg_one / local_dir_x;
	const ssef neg_rcp_y = neg_one / local_dir_y;
	const ssef neg_rcp_z = neg_one / local_dir_z;

	/* Ray parameter where the local coordinate equals 0 ... */
	const ssef t_zero_x = local_org_x * neg_rcp_x;
	const ssef t_zero_y = local_org_y * neg_rcp_y;
	const ssef t_zero_z = local_org_z * neg_rcp_z;

	/* ... and where it equals 1. */
	const ssef t_one_x = t_zero_x - neg_rcp_x;
	const ssef t_one_y = t_zero_y - neg_rcp_y;
	const ssef t_one_z = t_zero_z - neg_rcp_z;

	/* Order the two crossings per axis. SSE4.1 builds use mini/maxi, all
	 * other builds use min/max. */
#ifdef __KERNEL_SSE41__
	const ssef slab_in_x = mini(t_zero_x, t_one_x);
	const ssef slab_in_y = mini(t_zero_y, t_one_y);
	const ssef slab_in_z = mini(t_zero_z, t_one_z);
	const ssef slab_out_x = maxi(t_zero_x, t_one_x);
	const ssef slab_out_y = maxi(t_zero_y, t_one_y);
	const ssef slab_out_z = maxi(t_zero_z, t_one_z);
#else
	const ssef slab_in_x = min(t_zero_x, t_one_x);
	const ssef slab_in_y = min(t_zero_y, t_one_y);
	const ssef slab_in_z = min(t_zero_z, t_one_z);
	const ssef slab_out_x = max(t_zero_x, t_one_x);
	const ssef slab_out_y = max(t_zero_y, t_one_y);
	const ssef slab_out_z = max(t_zero_z, t_one_z);
#endif

	/* Clip against the caller supplied [tnear, tfar] interval. */
	const ssef t_entry = max4(tnear, slab_in_x, slab_in_y, slab_in_z);
	const ssef t_exit = min4(tfar, slab_out_x, slab_out_y, slab_out_z);
	const sseb hit = t_entry <= t_exit;
	*dist = t_entry;
	return movemask(hit);
}
ccl_device_inline int qbvh_unaligned_node_intersect_robust(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& P_idir,
#endif
        const sse3f& P,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        const float difl,
        ssef *__restrict dist)
{
	/* Variant of the unaligned node intersection with a tolerance.
	 *
	 * Performs the same local-space slab test against the [0, 1] range on
	 * every local axis, but the final comparison scales the entry distance
	 * by (1 - difl) and the exit distance by (1 + difl).
	 *
	 * Writes the unscaled per-lane entry distance to *dist and returns
	 * movemask() of the per-lane comparison.
	 *
	 * P_idir, idir and the near_x .. far_z arguments are not read here.
	 */

	/* Rows nodeAddr+1 .. nodeAddr+12 of __bvh_nodes: a 3x3 linear part
	 * (lin_*) followed by a translation (ofs_*). */
	const ssef lin_xx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+1);
	const ssef lin_xy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+2);
	const ssef lin_xz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+3);

	const ssef lin_yx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+4);
	const ssef lin_yy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+5);
	const ssef lin_yz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+6);

	const ssef lin_zx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+7);
	const ssef lin_zy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+8);
	const ssef lin_zz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+9);

	const ssef ofs_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+10);
	const ssef ofs_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+11);
	const ssef ofs_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+12);

	/* Ray direction in local space (linear part only). */
	const ssef local_dir_x = dir.x*lin_xx + dir.y*lin_xy + dir.z*lin_xz;
	const ssef local_dir_y = dir.x*lin_yx + dir.y*lin_yy + dir.z*lin_yz;
	const ssef local_dir_z = dir.x*lin_zx + dir.y*lin_zy + dir.z*lin_zz;

	/* Ray origin in local space (linear part plus translation). */
	const ssef local_org_x = P.x*lin_xx + P.y*lin_xy + P.z*lin_xz + ofs_x;
	const ssef local_org_y = P.x*lin_yx + P.y*lin_yy + P.z*lin_yz + ofs_y;
	const ssef local_org_z = P.x*lin_zx + P.y*lin_zy + P.z*lin_zz + ofs_z;

	/* Negated reciprocal of the local direction. */
	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
	const ssef neg_rcp_x = neg_one / local_dir_x;
	const ssef neg_rcp_y = neg_one / local_dir_y;
	const ssef neg_rcp_z = neg_one / local_dir_z;

	/* Ray parameter where the local coordinate equals 0 ... */
	const ssef t_zero_x = local_org_x * neg_rcp_x;
	const ssef t_zero_y = local_org_y * neg_rcp_y;
	const ssef t_zero_z = local_org_z * neg_rcp_z;

	/* ... and where it equals 1. */
	const ssef t_one_x = t_zero_x - neg_rcp_x;
	const ssef t_one_y = t_zero_y - neg_rcp_y;
	const ssef t_one_z = t_zero_z - neg_rcp_z;

	/* Tolerance factors applied to the final comparison only. */
	const float entry_scale = 1.0f - difl;
	const float exit_scale = 1.0f + difl;

	/* Order the two crossings per axis. SSE4.1 builds use mini/maxi, all
	 * other builds use min/max. */
#ifdef __KERNEL_SSE41__
	const ssef slab_in_x = mini(t_zero_x, t_one_x);
	const ssef slab_in_y = mini(t_zero_y, t_one_y);
	const ssef slab_in_z = mini(t_zero_z, t_one_z);
	const ssef slab_out_x = maxi(t_zero_x, t_one_x);
	const ssef slab_out_y = maxi(t_zero_y, t_one_y);
	const ssef slab_out_z = maxi(t_zero_z, t_one_z);
#else
	const ssef slab_in_x = min(t_zero_x, t_one_x);
	const ssef slab_in_y = min(t_zero_y, t_one_y);
	const ssef slab_in_z = min(t_zero_z, t_one_z);
	const ssef slab_out_x = max(t_zero_x, t_one_x);
	const ssef slab_out_y = max(t_zero_y, t_one_y);
	const ssef slab_out_z = max(t_zero_z, t_one_z);
#endif

	/* Clip against the caller supplied [tnear, tfar] interval. */
	const ssef t_entry = max4(tnear, slab_in_x, slab_in_y, slab_in_z);
	const ssef t_exit = min4(tfar, slab_out_x, slab_out_y, slab_out_z);
	const sseb hit = entry_scale*t_entry <= exit_scale*t_exit;
	*dist = t_entry;
	return movemask(hit);
}
/* Intersector wrappers.
 *
 * They check the node type and call the appropriate intersection code.
 */
ccl_device_inline int qbvh_node_intersect(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& org_idir,
#endif
        const sse3f& org,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        ssef *__restrict dist)
{
	/* Dispatch to the aligned or unaligned node intersector.
	 *
	 * The x component of the first row of the node is reinterpreted as an
	 * unsigned integer and tested against PATH_RAY_NODE_UNALIGNED. The
	 * result of the selected intersector (hit mask as return value,
	 * distances through *dist) is forwarded unchanged.
	 */
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned =
	        (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(!is_unaligned) {
		/* The aligned intersector takes either the precomputed
		 * origin*idir (AVX2) or the plain origin, never both. */
		return qbvh_aligned_node_intersect(kg,
		                                   tnear,
		                                   tfar,
#ifdef __KERNEL_AVX2__
		                                   org_idir,
#else
		                                   org,
#endif
		                                   idir,
		                                   near_x, near_y, near_z,
		                                   far_x, far_y, far_z,
		                                   nodeAddr,
		                                   dist);
	}

	return qbvh_unaligned_node_intersect(kg,
	                                     tnear,
	                                     tfar,
#ifdef __KERNEL_AVX2__
	                                     org_idir,
#endif
	                                     org,
	                                     dir,
	                                     idir,
	                                     near_x, near_y, near_z,
	                                     far_x, far_y, far_z,
	                                     nodeAddr,
	                                     dist);
}
ccl_device_inline int qbvh_node_intersect_robust(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& P_idir,
#endif
        const sse3f& P,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        const float difl,
        ssef *__restrict dist)
{
	/* Dispatch to the aligned or unaligned "robust" node intersector.
	 *
	 * The x component of the first row of the node is reinterpreted as an
	 * unsigned integer and tested against PATH_RAY_NODE_UNALIGNED. difl is
	 * passed through to the selected intersector, whose result (hit mask as
	 * return value, distances through *dist) is forwarded unchanged.
	 */
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned =
	        (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(!is_unaligned) {
		/* The aligned intersector takes either the precomputed
		 * P*idir (AVX2) or the plain origin, never both. */
		return qbvh_aligned_node_intersect_robust(kg,
		                                          tnear,
		                                          tfar,
#ifdef __KERNEL_AVX2__
		                                          P_idir,
#else
		                                          P,
#endif
		                                          idir,
		                                          near_x, near_y, near_z,
		                                          far_x, far_y, far_z,
		                                          nodeAddr,
		                                          difl,
		                                          dist);
	}

	return qbvh_unaligned_node_intersect_robust(kg,
	                                            tnear,
	                                            tfar,
#ifdef __KERNEL_AVX2__
	                                            P_idir,
#endif
	                                            P,
	                                            dir,
	                                            idir,
	                                            near_x, near_y, near_z,
	                                            far_x, far_y, far_z,
	                                            nodeAddr,
	                                            difl,
	                                            dist);
}

@ -27,6 +27,12 @@
*
*/
/* Choose the node intersection routine for this traversal variant.
 *
 * With BVH_HAIR the qbvh_node_intersect wrapper is used, which tests each
 * node for PATH_RAY_NODE_UNALIGNED and dispatches accordingly. Without it the
 * aligned-only intersector is called directly. The two functions take
 * different argument lists, so call sites guard the extra arguments with the
 * same preprocessor conditions.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT qbvh_node_intersect
#else
# define NODE_INTERSECT qbvh_aligned_node_intersect
#endif
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef tnear(0.0f), tfar(tmax);
#if BVH_FEATURE(BVH_HAIR)
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
#else
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@ -109,22 +119,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef dist;
int traverseChild = qbvh_node_intersect(kg,
tnear,
tfar,
int traverseChild = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#else
org,
P_idir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
# endif
# if BVH_FEATURE(BVH_HAIR)
dir4,
# endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
if(traverseChild != 0) {
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
float4 cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
}
else
#endif
{
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
}
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@ -340,13 +363,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect_t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
++stackPtr;
@ -394,13 +422,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(tmax);
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
object = OBJECT_NONE;
@ -412,3 +445,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return false;
}
#undef NODE_INTERSECT

@ -25,6 +25,12 @@
*
*/
/* Pick the node intersection routine used by the traversal loop below.
 *
 * When this variant is compiled with BVH_HAIR, NODE_INTERSECT expands to
 * qbvh_node_intersect, and the call site additionally passes the ray
 * origin (org4) and direction (dir4). Without BVH_HAIR it expands to
 * qbvh_aligned_node_intersect.
 *
 * NOTE(review): presumably qbvh_node_intersect is the one that also handles
 * nodes flagged PATH_RAY_NODE_UNALIGNED -- confirm against its definition.
 *
 * The macro is #undef'd again after the function body.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT qbvh_node_intersect
#else
# define NODE_INTERSECT qbvh_aligned_node_intersect
#endif
ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
SubsurfaceIntersection *ss_isect,
@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef tnear(0.0f), tfar(isect_t);
#if BVH_FEATURE(BVH_HAIR)
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
#else
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@ -108,22 +118,37 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
/* Traverse internal nodes. */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
ssef dist;
int traverseChild = qbvh_node_intersect(kg,
tnear,
tfar,
int traverseChild = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#else
org,
P_idir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
if(traverseChild != 0) {
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
float4 cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
}
else
#endif
{
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
}
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@ -270,3 +295,5 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
} while(nodeAddr != ENTRYPOINT_SENTINEL);
} while(nodeAddr != ENTRYPOINT_SENTINEL);
}
#undef NODE_INTERSECT

@ -28,6 +28,14 @@
*
*/
/* Pick the node intersection routines used by the traversal loop below.
 *
 * When this variant is compiled with BVH_HAIR, the macros expand to
 * qbvh_node_intersect / qbvh_node_intersect_robust, and the call sites
 * additionally pass the ray origin (org4) and direction (dir4). Without
 * BVH_HAIR they expand to the qbvh_aligned_* counterparts.
 *
 * NODE_INTERSECT_ROBUST is the routine invoked with the extra `difl`
 * argument; NODE_INTERSECT is used in the other branch.
 *
 * NOTE(review): presumably the non-"aligned" routines are the ones that
 * also handle nodes flagged PATH_RAY_NODE_UNALIGNED -- confirm against
 * their definitions.
 *
 * Both macros are #undef'd again after the function body.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT qbvh_node_intersect
# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust
#else
# define NODE_INTERSECT qbvh_aligned_node_intersect
# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
#endif
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
@ -81,13 +89,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
BVH_DEBUG_INIT();
ssef tnear(0.0f), tfar(ray->t);
#if BVH_FEATURE(BVH_HAIR)
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
#else
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@ -132,41 +144,62 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
*
* Need to test if doing opposite would be any faster.
*/
traverseChild = qbvh_node_intersect_robust(kg,
tnear,
tfar,
traverseChild = NODE_INTERSECT_ROBUST(kg,
tnear,
tfar,
# ifdef __KERNEL_AVX2__
P_idir4,
# else
org,
P_idir4,
# endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
difl,
&dist);
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
# endif
# if BVH_FEATURE(BVH_HAIR)
dir4,
# endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
difl,
&dist);
}
else
#endif /* BVH_HAIR_MINIMUM_WIDTH */
{
traverseChild = qbvh_node_intersect(kg,
tnear,
tfar,
traverseChild = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#else
org,
P_idir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
}
if(traverseChild != 0) {
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
float4 cnodes;
/* TODO(sergey): Investigate whether moving cnodes upwards
* gives a speedup (it will be a different cache pattern, but
* will avoid the extra check here).
*/
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
}
else
#endif
{
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
}
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@ -361,13 +394,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
++stackPtr;
@ -398,13 +436,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
object = OBJECT_NONE;
@ -417,3 +460,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
#undef NODE_INTERSECT
#undef NODE_INTERSECT_ROBUST

@ -26,6 +26,12 @@
*
*/
/* Pick the node intersection routine used by the traversal loop below.
 *
 * When this variant is compiled with BVH_HAIR, NODE_INTERSECT expands to
 * qbvh_node_intersect, and the call site additionally passes the ray
 * origin (org4) and direction (dir4). Without BVH_HAIR it expands to
 * qbvh_aligned_node_intersect.
 *
 * NOTE(review): presumably qbvh_node_intersect is the one that also handles
 * nodes flagged PATH_RAY_NODE_UNALIGNED -- confirm against its definition.
 *
 * The macro is #undef'd again after the function body.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT qbvh_node_intersect
#else
# define NODE_INTERSECT qbvh_aligned_node_intersect
#endif
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
isect->object = OBJECT_NONE;
ssef tnear(0.0f), tfar(ray->t);
#if BVH_FEATURE(BVH_HAIR)
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
#else
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@ -104,22 +114,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef dist;
int traverseChild = qbvh_node_intersect(kg,
tnear,
tfar,
int traverseChild = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#else
org,
P_idir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
if(traverseChild != 0) {
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
float4 cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
}
else
#endif
{
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
}
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@ -278,13 +301,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
++stackPtr;
@ -319,13 +347,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
object = OBJECT_NONE;
@ -337,3 +370,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
#undef NODE_INTERSECT

@ -26,6 +26,12 @@
*
*/
/* Pick the node intersection routine used by the traversal loop below.
 *
 * When this variant is compiled with BVH_HAIR, NODE_INTERSECT expands to
 * qbvh_node_intersect, and the call site additionally passes the ray
 * origin (org4) and direction (dir4). Without BVH_HAIR it expands to
 * qbvh_aligned_node_intersect.
 *
 * NOTE(review): presumably qbvh_node_intersect is the one that also handles
 * nodes flagged PATH_RAY_NODE_UNALIGNED -- confirm against its definition.
 *
 * The macro is #undef'd again after the function body.
 */
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT qbvh_node_intersect
#else
# define NODE_INTERSECT qbvh_aligned_node_intersect
#endif
ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef tnear(0.0f), tfar(isect_t);
#if BVH_FEATURE(BVH_HAIR)
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
#else
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
#endif
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@ -108,22 +118,35 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef dist;
int traverseChild = qbvh_node_intersect(kg,
tnear,
tfar,
int traverseChild = NODE_INTERSECT(kg,
tnear,
tfar,
#ifdef __KERNEL_AVX2__
P_idir4,
#else
org,
P_idir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4,
#endif
#if BVH_FEATURE(BVH_HAIR)
dir4,
#endif
idir4,
near_x, near_y, near_z,
far_x, far_y, far_z,
nodeAddr,
&dist);
if(traverseChild != 0) {
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
float4 cnodes;
#if BVH_FEATURE(BVH_HAIR)
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
}
else
#endif
{
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
}
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@ -330,12 +353,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect_t);
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
num_hits_in_instance = 0;
isect_array->t = isect_t;
@ -389,13 +417,18 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect_t);
# if BVH_FEATURE(BVH_HAIR)
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
# else
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
triangle_intersect_precalc(dir, &isect_precalc);
isect_t = tmax;
isect_array->t = isect_t;
@ -409,3 +442,5 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return num_hits;
}
#undef NODE_INTERSECT