Cycles: Implement unaligned nodes BVH traversal
This commit implements traversal of unaligned BVH nodes. QBVH traversal is fully SIMD optimized and calculates the orientation for all 4 children at a time; the regular BVH traversal could probably be optimized a bit more.
This commit is contained in:
parent
b03e66e75f
commit
a08e2179f1
@ -141,6 +141,7 @@ set(SRC_GEOM_HEADERS
|
||||
geom/geom.h
|
||||
geom/geom_attribute.h
|
||||
geom/geom_bvh.h
|
||||
geom/geom_bvh_nodes.h
|
||||
geom/geom_bvh_shadow.h
|
||||
geom/geom_bvh_subsurface.h
|
||||
geom/geom_bvh_traversal.h
|
||||
|
@ -77,6 +77,8 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Regular BVH traversal */
|
||||
|
||||
#include "geom_bvh_nodes.h"
|
||||
|
||||
#define BVH_FUNCTION_NAME bvh_intersect
|
||||
#define BVH_FUNCTION_FEATURES 0
|
||||
#include "geom_bvh_traversal.h"
|
||||
@ -109,13 +111,13 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#if defined(__SUBSURFACE__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_subsurface
|
||||
# define BVH_FUNCTION_FEATURES 0
|
||||
# define BVH_FUNCTION_FEATURES BVH_HAIR
|
||||
# include "geom_bvh_subsurface.h"
|
||||
#endif
|
||||
|
||||
#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
|
||||
# define BVH_FUNCTION_FEATURES BVH_MOTION
|
||||
# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
|
||||
# include "geom_bvh_subsurface.h"
|
||||
#endif
|
||||
|
||||
@ -123,19 +125,19 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#if defined(__VOLUME__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_volume
|
||||
# define BVH_FUNCTION_FEATURES 0
|
||||
# define BVH_FUNCTION_FEATURES BVH_HAIR
|
||||
# include "geom_bvh_volume.h"
|
||||
#endif
|
||||
|
||||
#if defined(__VOLUME__) && defined(__INSTANCING__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
|
||||
# include "geom_bvh_volume.h"
|
||||
#endif
|
||||
|
||||
#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_volume_motion
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
|
||||
# include "geom_bvh_volume.h"
|
||||
#endif
|
||||
|
||||
@ -175,19 +177,19 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#if defined(__VOLUME_RECORD_ALL__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_volume_all
|
||||
# define BVH_FUNCTION_FEATURES 0
|
||||
# define BVH_FUNCTION_FEATURES BVH_HAIR
|
||||
# include "geom_bvh_volume_all.h"
|
||||
#endif
|
||||
|
||||
#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
|
||||
# include "geom_bvh_volume_all.h"
|
||||
#endif
|
||||
|
||||
#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
|
||||
# define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
|
||||
# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
|
||||
# include "geom_bvh_volume_all.h"
|
||||
#endif
|
||||
|
||||
|
659
intern/cycles/kernel/geom/geom_bvh_nodes.h
Normal file
659
intern/cycles/kernel/geom/geom_bvh_nodes.h
Normal file
@ -0,0 +1,659 @@
|
||||
/*
|
||||
* Copyright 2011-2016, Blender Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// TODO(sergey): Look into avoiding the use of a full Transform in favor of a 3x3 matrix and
|
||||
// 3-vector which might be faster.
|
||||
/* Fetch the orientation space of one child of an unaligned BVH node.
 *
 * Three consecutive float4 rows are read from __bvh_nodes, starting one
 * entry past nodeAddr + child*3, and the fourth row is set to (0, 0, 0, 1).
 *
 * nodeAddr: index of the node's first entry in __bvh_nodes.
 * child:    child index; each child owns three rows.
 */
ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
                                                           int nodeAddr,
                                                           int child)
{
	/* Entry 0 of the node is skipped, rows of the child follow it. */
	const int first_row = nodeAddr + child * 3 + 1;
	Transform space;
	space.x = kernel_tex_fetch(__bvh_nodes, first_row + 0);
	space.y = kernel_tex_fetch(__bvh_nodes, first_row + 1);
	space.z = kernel_tex_fetch(__bvh_nodes, first_row + 2);
	space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
	return space;
}
|
||||
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
/* Slab test of a ray against both children of an axis-aligned BVH node
 * (scalar, non-SSE version).
 *
 * Entry nodeAddr+0 holds the per-child flag words, entries +1, +2 and +3
 * hold the X, Y and Z bounds; in each bounds row .x/.z belong to child 0
 * and .y/.w belong to child 1.
 *
 * P, idir:    ray origin and inverse direction.
 * t:          upper limit of the ray interval.
 * visibility: mask tested against the per-child flags when
 *             __VISIBILITY_FLAG__ is defined.
 * dist:       receives the entry distance of child 0 and child 1.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
                                                 const float3 P,
                                                 const float3 idir,
                                                 const float t,
                                                 const int nodeAddr,
                                                 const uint visibility,
                                                 float *dist)
{
	/* Node data. */
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	const float4 bounds_x = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
	const float4 bounds_y = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
	const float4 bounds_z = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);

	/* Child 0 slabs. */
	const float x0_lo = (bounds_x.x - P.x) * idir.x;
	const float x0_hi = (bounds_x.z - P.x) * idir.x;
	const float y0_lo = (bounds_y.x - P.y) * idir.y;
	const float y0_hi = (bounds_y.z - P.y) * idir.y;
	const float z0_lo = (bounds_z.x - P.z) * idir.z;
	const float z0_hi = (bounds_z.z - P.z) * idir.z;
	const float near0 = max4(min(x0_lo, x0_hi), min(y0_lo, y0_hi), min(z0_lo, z0_hi), 0.0f);
	const float far0 = min4(max(x0_lo, x0_hi), max(y0_lo, y0_hi), max(z0_lo, z0_hi), t);

	/* Child 1 slabs. */
	const float x1_lo = (bounds_x.y - P.x) * idir.x;
	const float x1_hi = (bounds_x.w - P.x) * idir.x;
	const float y1_lo = (bounds_y.y - P.y) * idir.y;
	const float y1_hi = (bounds_y.w - P.y) * idir.y;
	const float z1_lo = (bounds_z.y - P.z) * idir.z;
	const float z1_hi = (bounds_z.w - P.z) * idir.z;
	const float near1 = max4(min(x1_lo, x1_hi), min(y1_lo, y1_hi), min(z1_lo, z1_hi), 0.0f);
	const float far1 = min4(max(x1_lo, x1_hi), max(y1_lo, y1_hi), max(z1_lo, z1_hi), t);

	dist[0] = near0;
	dist[1] = near1;

	bool hit0 = (far0 >= near0);
	bool hit1 = (far1 >= near1);
#ifdef __VISIBILITY_FLAG__
	/* The original author measured roughly a 5% cost for this test. */
	hit0 = hit0 && (__float_as_uint(cnodes.x) & visibility);
	hit1 = hit1 && (__float_as_uint(cnodes.y) & visibility);
#endif
	return (hit0? 1: 0) | (hit1? 2: 0);
}
|
||||
|
||||
/* Slab test of a ray against both children of an axis-aligned BVH node,
 * with the hit interval widened for children flagged PATH_RAY_CURVE
 * (scalar, non-SSE version).
 *
 * Entry nodeAddr+0 holds the per-child flag words in .x/.y, entries +1, +2
 * and +3 hold the X, Y and Z bounds; in each bounds row .x/.z belong to
 * child 0 and .y/.w belong to child 1.
 *
 * P, idir:    ray origin and inverse direction.
 * t:          upper limit of the ray interval.
 * difl:       relative widening factor; 0 disables the widening.
 * extmax:     absolute cap on the widening.
 * visibility: mask tested against the per-child flags when
 *             __VISIBILITY_FLAG__ is defined.
 * dist:       receives the (possibly widened) entry distance per child.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
                                                        const float3 P,
                                                        const float3 idir,
                                                        const float t,
                                                        const float difl,
                                                        const float extmax,
                                                        const int nodeAddr,
                                                        const uint visibility,
                                                        float *dist)
{

	/* fetch node data */
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);

	/* intersect ray against child nodes */
	float c0lox = (node0.x - P.x) * idir.x;
	float c0hix = (node0.z - P.x) * idir.x;
	float c0loy = (node1.x - P.y) * idir.y;
	float c0hiy = (node1.z - P.y) * idir.y;
	float c0loz = (node2.x - P.z) * idir.z;
	float c0hiz = (node2.z - P.z) * idir.z;
	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);

	float c1lox = (node0.y - P.x) * idir.x;
	float c1hix = (node0.w - P.x) * idir.x;
	float c1loy = (node1.y - P.y) * idir.y;
	float c1hiy = (node1.w - P.y) * idir.y;
	float c1loz = (node2.y - P.z) * idir.z;
	float c1hiz = (node2.w - P.z) * idir.z;
	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);

	if(difl != 0.0f) {
		float hdiff = 1.0f + difl;
		float ldiff = 1.0f - difl;
		/* The per-child flag words are cnodes.x/.y, the same words the
		 * visibility test below reads. The curve bit has to be tested
		 * there, not on cnodes.z/.w.
		 */
		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
			c0min = max(ldiff * c0min, c0min - extmax);
			c0max = min(hdiff * c0max, c0max + extmax);
		}
		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
			c1min = max(ldiff * c1min, c1min - extmax);
			c1max = min(hdiff * c1max, c1max + extmax);
		}
	}

	dist[0] = c0min;
	dist[1] = c1min;

#ifdef __VISIBILITY_FLAG__
	/* this visibility test gives a 5% performance hit, how to solve? */
	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
#else
	return ((c0max >= c0min)? 1: 0) |
	       ((c1max >= c1min)? 2: 0);
#endif
}
|
||||
|
||||
/* Intersect a ray with a single child of an unaligned BVH node.
 *
 * The ray is transformed by the child's space and slab-tested there. The
 * two slab planes per axis work out to local coordinates 0 and 1, assuming
 * bvh_inverse_direction() yields the per-component reciprocal.
 *
 * t:    upper limit of the ray interval.
 * dist: receives the entry distance (written even on a miss).
 *
 * Returns true when the clamped interval is non-empty.
 */
ccl_device_inline bool bvh_unaligned_node_intersect_child(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
        const float t,
        int nodeAddr,
        int child,
        float *dist)
{
	Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
	const float3 local_dir = transform_direction(&space, dir);
	const float3 local_P = transform_point(&space, P);
	const float3 neg_rdir = -bvh_inverse_direction(local_dir);

	/* Ray parameters at the two slab planes of every axis. */
	const float3 t_lower = local_P * neg_rdir;
	const float3 t_upper = t_lower - neg_rdir;

	const float t_entry = max4(0.0f,
	                           min(t_lower.x, t_upper.x),
	                           min(t_lower.y, t_upper.y),
	                           min(t_lower.z, t_upper.z));
	const float t_exit = min4(t,
	                          max(t_lower.x, t_upper.x),
	                          max(t_lower.y, t_upper.y),
	                          max(t_lower.z, t_upper.z));

	*dist = t_entry;
	return t_entry <= t_exit;
}
|
||||
|
||||
/* Intersect a ray with a single child of an unaligned BVH node, relaxing
 * the final interval comparison by difl.
 *
 * The ray is transformed by the child's space and slab-tested there.
 * The unnamed float parameter (extmax) is accepted but not used.
 *
 * t:    upper limit of the ray interval.
 * difl: relative tolerance; when 0 the plain comparison is used.
 * dist: receives the unscaled entry distance (written even on a miss).
 *
 * Returns true when the (possibly relaxed) interval is non-empty.
 */
ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
        const float t,
        const float difl,
        const float /*extmax*/,
        int nodeAddr,
        int child,
        float *dist)
{
	Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
	const float3 local_dir = transform_direction(&space, dir);
	const float3 local_P = transform_point(&space, P);
	const float3 neg_rdir = -bvh_inverse_direction(local_dir);

	/* Ray parameters at the two slab planes of every axis. */
	const float3 t_lower = local_P * neg_rdir;
	const float3 t_upper = t_lower - neg_rdir;

	const float t_entry = max4(0.0f,
	                           min(t_lower.x, t_upper.x),
	                           min(t_lower.y, t_upper.y),
	                           min(t_lower.z, t_upper.z));
	const float t_exit = min4(t,
	                          max(t_lower.x, t_upper.x),
	                          max(t_lower.y, t_upper.y),
	                          max(t_lower.z, t_upper.z));

	*dist = t_entry;

	if(difl == 0.0f) {
		return t_entry <= t_exit;
	}

	/* TODO(sergey): Same as for QBVH, needs a proper use. */
	const float shrink = 1.0f - difl;
	const float grow = 1.0f + difl;
	return shrink*t_entry <= grow*t_exit;
}
|
||||
|
||||
/* Intersect a ray with both children of an unaligned BVH node
 * (scalar, non-SSE version).
 *
 * Each child is tested by bvh_unaligned_node_intersect_child(); idir is
 * accepted but not used here. With __VISIBILITY_FLAG__ defined, a child
 * only counts as hit when its flag word (cnodes.x for child 0, cnodes.y
 * for child 1) shares a bit with visibility.
 *
 * dist: receives the entry distance of child 0 and child 1.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const float3 idir,
                                                   const float t,
                                                   const int nodeAddr,
                                                   const uint visibility,
                                                   float *dist)
{
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

	/* Both children are always tested so that both distances get written. */
	const bool hit0 = bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0]);
	const bool hit1 = bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1]);

#ifdef __VISIBILITY_FLAG__
	const bool visible0 = (__float_as_uint(cnodes.x) & visibility) != 0;
	const bool visible1 = (__float_as_uint(cnodes.y) & visibility) != 0;
	return ((hit0 && visible0)? 1: 0) | ((hit1 && visible1)? 2: 0);
#else
	return (hit0? 1: 0) | (hit1? 2: 0);
#endif
}
|
||||
|
||||
/* Intersect a ray with both children of an unaligned BVH node using the
 * tolerance-aware child test (scalar, non-SSE version).
 *
 * Each child is tested by bvh_unaligned_node_intersect_child_robust();
 * idir is accepted but not used here. With __VISIBILITY_FLAG__ defined, a
 * child only counts as hit when its flag word (cnodes.x for child 0,
 * cnodes.y for child 1) shares a bit with visibility.
 *
 * dist: receives the entry distance of child 0 and child 1.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const float3 idir,
                                                          const float t,
                                                          const float difl,
                                                          const float extmax,
                                                          const int nodeAddr,
                                                          const uint visibility,
                                                          float *dist)
{
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);

	/* Both children are always tested so that both distances get written. */
	const bool hit0 = bvh_unaligned_node_intersect_child_robust(
	        kg, P, dir, t, difl, extmax, nodeAddr, 0, &dist[0]);
	const bool hit1 = bvh_unaligned_node_intersect_child_robust(
	        kg, P, dir, t, difl, extmax, nodeAddr, 1, &dist[1]);

#ifdef __VISIBILITY_FLAG__
	const bool visible0 = (__float_as_uint(cnodes.x) & visibility) != 0;
	const bool visible1 = (__float_as_uint(cnodes.y) & visibility) != 0;
	return ((hit0 && visible0)? 1: 0) | ((hit1 && visible1)? 2: 0);
#else
	return (hit0? 1: 0) | (hit1? 2: 0);
#endif
}
|
||||
|
||||
/* Intersect a ray with both children of a BVH node, choosing the aligned
 * or unaligned test (scalar, non-SSE version).
 *
 * The node is treated as unaligned when the first word of its first entry
 * has the PATH_RAY_NODE_UNALIGNED bit set.
 *
 * Returns the child hit mask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3 P,
                                         const float3 dir,
                                         const float3 idir,
                                         const float t,
                                         const int nodeAddr,
                                         const uint visibility,
                                         float dist[2])
{
	const float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(is_unaligned) {
		return bvh_unaligned_node_intersect(
		        kg, P, dir, idir, t, nodeAddr, visibility, dist);
	}
	return bvh_aligned_node_intersect(
	        kg, P, idir, t, nodeAddr, visibility, dist);
}
|
||||
|
||||
/* Intersect a ray with both children of a BVH node, choosing the aligned
 * or unaligned tolerance-aware test (scalar, non-SSE version).
 *
 * The node is treated as unaligned when the first word of its first entry
 * has the PATH_RAY_NODE_UNALIGNED bit set.
 *
 * Returns the child hit mask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3 P,
                                                const float3 dir,
                                                const float3 idir,
                                                const float t,
                                                const float difl,
                                                const float extmax,
                                                const int nodeAddr,
                                                const uint visibility,
                                                float dist[2])
{
	const float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(is_unaligned) {
		return bvh_unaligned_node_intersect_robust(
		        kg, P, dir, idir, t, difl, extmax, nodeAddr, visibility, dist);
	}
	return bvh_aligned_node_intersect_robust(
	        kg, P, idir, t, difl, extmax, nodeAddr, visibility, dist);
}
|
||||
#else /* !defined(__KERNEL_SSE2__) */
|
||||
|
||||
/* Slab test of a ray against both children of an axis-aligned BVH node
 * (SSE version, adapted from Embree per the original comment).
 *
 * Entries nodeAddr+1, +2 and +3 hold the X, Y and Z bounds. P and dir are
 * accepted but not used here; the ray arrives pre-splatted in Psplat,
 * idirsplat, tsplat and shufflexyz.
 *
 * dist: receives lanes 0 and 1 of the combined result, the entry distances
 *       of child 0 and child 1.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
int ccl_device_inline bvh_aligned_node_intersect(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
        const ssef& tsplat,
        const ssef Psplat[3],
        const ssef idirsplat[3],
        const shuffle_swap_t shufflexyz[3],
        const int nodeAddr,
        const uint visibility,
        float dist[2])
{
	/* Sign mask for the two upper lanes. */
	const ssef sign_hi = cast(ssei(0, 0, 0x80000000, 0x80000000));

	/* Node data. */
	const ssef *node_rows = (ssef*)kg->__bvh_nodes.data + nodeAddr;

	/* Per-axis slab distances for both children. */
	const ssef slab_x = (shuffle_swap(node_rows[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
	const ssef slab_y = (shuffle_swap(node_rows[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
	const ssef slab_z = (shuffle_swap(node_rows[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];

	/* { c0min, c1min, -c0max, -c1max }, then flip the sign of the max lanes. */
	ssef packed = max(max(slab_x, slab_y), max(slab_z, tsplat));
	const ssef tminmax = packed ^ sign_hi;
	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);

	dist[0] = tminmax[0];
	dist[1] = tminmax[1];

	const int hit_bits = movemask(lrhit);

# ifdef __VISIBILITY_FLAG__
	/* The original author measured roughly a 5% cost for this test. */
	const float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int visible_bits = 0;
	if((hit_bits & 1) && (__float_as_uint(cnodes.x) & visibility)) {
		visible_bits |= 1;
	}
	if((hit_bits & 2) && (__float_as_uint(cnodes.y) & visibility)) {
		visible_bits |= 2;
	}
	return visible_bits;
# else
	return hit_bits & 3;
# endif
}
|
||||
|
||||
/* Slab test of a ray against both children of an axis-aligned BVH node,
 * with the hit interval widened for children flagged PATH_RAY_CURVE
 * (SSE version, adapted from Embree per the original comment).
 *
 * Entry nodeAddr+0 holds the per-child flag words in .x/.y, entries +1, +2
 * and +3 hold the X, Y and Z bounds. P and dir are accepted but not used
 * here; the ray arrives pre-splatted in Psplat, idirsplat, tsplat and
 * shufflexyz.
 *
 * difl:   relative widening factor; 0 disables the widening.
 * extmax: absolute cap on the widening.
 * dist:   receives the (possibly widened) entry distance per child.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
int ccl_device_inline bvh_aligned_node_intersect_robust(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
        const ssef& tsplat,
        const ssef Psplat[3],
        const ssef idirsplat[3],
        const shuffle_swap_t shufflexyz[3],
        const float difl,
        const float extmax,
        const int nodeAddr,
        const uint visibility,
        float dist[2])
{
	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));

	/* fetch node data */
	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;

	/* intersect ray against child nodes */
	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];

	/* calculate { c0min, c1min, -c0max, -c1max} */
	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
	/* Deliberately not const: the lanes are adjusted in place below, and
	 * writing to an object defined const would be undefined behavior.
	 */
	ssef tminmax = minmax ^ pn;

	if(difl != 0.0f) {
		float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
		float4 *tminmaxview = (float4*)&tminmax;
		float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
		float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
		float hdiff = 1.0f + difl;
		float ldiff = 1.0f - difl;
		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
			c0min = max(ldiff * c0min, c0min - extmax);
			c0max = min(hdiff * c0max, c0max + extmax);
		}
		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
			c1min = max(ldiff * c1min, c1min - extmax);
			c1max = min(hdiff * c1max, c1max + extmax);
		}
	}

	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);

	dist[0] = tminmax[0];
	dist[1] = tminmax[1];

	int mask = movemask(lrhit);

# ifdef __VISIBILITY_FLAG__
	/* this visibility test gives a 5% performance hit, how to solve? */
	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
	return cmask;
# else
	return mask & 3;
# endif
}
|
||||
|
||||
/* Intersect a ray with both children of an unaligned BVH node (SSE version).
 *
 * The ray is transformed by each child's space; the slab arithmetic for
 * the two children is then carried out together in lanes 0 and 1.
 *
 * tnear, tfar: ray interval limits folded into the final max/min.
 * dist:        receives the entry distance of child 0 and child 1.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const ssef& tnear,
                                                   const ssef& tfar,
                                                   const int nodeAddr,
                                                   const uint visibility,
                                                   float dist[2])
{
	/* Bring the ray into the local space of each child. */
	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);

	const float3 local_dir0 = transform_direction(&space0, dir);
	const float3 local_dir1 = transform_direction(&space1, dir);
	const float3 local_P0 = transform_point(&space0, P);
	const float3 local_P1 = transform_point(&space1, P);
	const float3 neg_rdir0 = -bvh_inverse_direction(local_dir0);
	const float3 neg_rdir1 = -bvh_inverse_direction(local_dir1);

	/* Lane 0 is child 0, lane 1 is child 1, lanes 2 and 3 are zero. */
	const ssef lower_x(local_P0.x * neg_rdir0.x, local_P1.x * neg_rdir1.x, 0.0f, 0.0f);
	const ssef lower_y(local_P0.y * neg_rdir0.y, local_P1.y * neg_rdir1.y, 0.0f, 0.0f);
	const ssef lower_z(local_P0.z * neg_rdir0.z, local_P1.z * neg_rdir1.z, 0.0f, 0.0f);

	const ssef upper_x = lower_x - ssef(neg_rdir0.x, neg_rdir1.x, 0.0f, 0.0f);
	const ssef upper_y = lower_y - ssef(neg_rdir0.y, neg_rdir1.y, 0.0f, 0.0f);
	const ssef upper_z = lower_z - ssef(neg_rdir0.z, neg_rdir1.z, 0.0f, 0.0f);

	const ssef tNear = max4(min(lower_x, upper_x),
	                        min(lower_y, upper_y),
	                        min(lower_z, upper_z),
	                        tnear);
	const ssef tFar = min4(max(lower_x, upper_x),
	                       max(lower_y, upper_y),
	                       max(lower_z, upper_z),
	                       tfar);
	const sseb vmask = tNear <= tFar;

	dist[0] = tNear.f[0];
	dist[1] = tNear.f[1];

	const int hit_bits = (int)movemask(vmask);

# ifdef __VISIBILITY_FLAG__
	/* The original author measured roughly a 5% cost for this test. */
	const float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int visible_bits = 0;
	if((hit_bits & 1) && (__float_as_uint(cnodes.x) & visibility)) {
		visible_bits |= 1;
	}
	if((hit_bits & 2) && (__float_as_uint(cnodes.y) & visibility)) {
		visible_bits |= 2;
	}
	return visible_bits;
# else
	return hit_bits & 3;
# endif
}
|
||||
|
||||
/* Intersect a ray with both children of an unaligned BVH node, relaxing
 * the final interval comparison by difl (SSE version).
 *
 * The ray is transformed by each child's space; the slab arithmetic for
 * the two children is then carried out together in lanes 0 and 1.
 * The unnamed float parameter (extmax) is accepted but not used.
 *
 * tnear, tfar: ray interval limits folded into the final max/min.
 * difl:        relative tolerance; when 0 the plain comparison is used.
 * dist:        receives the unscaled entry distance of child 0 and child 1.
 *
 * Returns a bit mask: bit 0 set when child 0 is hit, bit 1 for child 1.
 */
int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const ssef& tnear,
                                                          const ssef& tfar,
                                                          const float difl,
                                                          const float /*extmax*/,
                                                          const int nodeAddr,
                                                          const uint visibility,
                                                          float dist[2])
{
	/* Bring the ray into the local space of each child. */
	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);

	const float3 local_dir0 = transform_direction(&space0, dir);
	const float3 local_dir1 = transform_direction(&space1, dir);
	const float3 local_P0 = transform_point(&space0, P);
	const float3 local_P1 = transform_point(&space1, P);
	const float3 neg_rdir0 = -bvh_inverse_direction(local_dir0);
	const float3 neg_rdir1 = -bvh_inverse_direction(local_dir1);

	/* Lane 0 is child 0, lane 1 is child 1, lanes 2 and 3 are zero. */
	const ssef lower_x(local_P0.x * neg_rdir0.x, local_P1.x * neg_rdir1.x, 0.0f, 0.0f);
	const ssef lower_y(local_P0.y * neg_rdir0.y, local_P1.y * neg_rdir1.y, 0.0f, 0.0f);
	const ssef lower_z(local_P0.z * neg_rdir0.z, local_P1.z * neg_rdir1.z, 0.0f, 0.0f);

	const ssef upper_x = lower_x - ssef(neg_rdir0.x, neg_rdir1.x, 0.0f, 0.0f);
	const ssef upper_y = lower_y - ssef(neg_rdir0.y, neg_rdir1.y, 0.0f, 0.0f);
	const ssef upper_z = lower_z - ssef(neg_rdir0.z, neg_rdir1.z, 0.0f, 0.0f);

	const ssef tNear = max4(min(lower_x, upper_x),
	                        min(lower_y, upper_y),
	                        min(lower_z, upper_z),
	                        tnear);
	const ssef tFar = min4(max(lower_x, upper_x),
	                       max(lower_y, upper_y),
	                       max(lower_z, upper_z),
	                       tfar);

	sseb vmask;
	if(difl == 0.0f) {
		vmask = tNear <= tFar;
	}
	else {
		const float shrink = 1.0f - difl;
		const float grow = 1.0f + difl;
		vmask = shrink*tNear <= grow*tFar;
	}

	dist[0] = tNear.f[0];
	dist[1] = tNear.f[1];

	const int hit_bits = (int)movemask(vmask);

# ifdef __VISIBILITY_FLAG__
	/* The original author measured roughly a 5% cost for this test. */
	const float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
	int visible_bits = 0;
	if((hit_bits & 1) && (__float_as_uint(cnodes.x) & visibility)) {
		visible_bits |= 1;
	}
	if((hit_bits & 2) && (__float_as_uint(cnodes.y) & visibility)) {
		visible_bits |= 2;
	}
	return visible_bits;
# else
	return hit_bits & 3;
# endif
}
|
||||
|
||||
/* Intersect a ray with both children of a BVH node, choosing the aligned
 * or unaligned test (SSE version).
 *
 * The node is treated as unaligned when the first word of its first entry
 * has the PATH_RAY_NODE_UNALIGNED bit set. The unaligned test receives
 * tnear/tfar; the aligned test receives the splatted ray data instead.
 *
 * Returns the child hit mask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3& P,
                                         const float3& dir,
                                         const ssef& tnear,
                                         const ssef& tfar,
                                         const ssef& tsplat,
                                         const ssef Psplat[3],
                                         const ssef idirsplat[3],
                                         const shuffle_swap_t shufflexyz[3],
                                         const int nodeAddr,
                                         const uint visibility,
                                         float dist[2])
{
	const float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(is_unaligned) {
		return bvh_unaligned_node_intersect(
		        kg, P, dir, tnear, tfar, nodeAddr, visibility, dist);
	}
	return bvh_aligned_node_intersect(
	        kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz,
	        nodeAddr, visibility, dist);
}
|
||||
|
||||
/* Intersect a ray with both children of a BVH node, choosing the aligned
 * or unaligned tolerance-aware test (SSE version).
 *
 * The node is treated as unaligned when the first word of its first entry
 * has the PATH_RAY_NODE_UNALIGNED bit set. The unaligned test receives
 * tnear/tfar; the aligned test receives the splatted ray data instead.
 *
 * Returns the child hit mask produced by the selected test.
 */
ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3& P,
                                                const float3& dir,
                                                const ssef& tnear,
                                                const ssef& tfar,
                                                const ssef& tsplat,
                                                const ssef Psplat[3],
                                                const ssef idirsplat[3],
                                                const shuffle_swap_t shufflexyz[3],
                                                const float difl,
                                                const float extmax,
                                                const int nodeAddr,
                                                const uint visibility,
                                                float dist[2])
{
	const float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned = (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(is_unaligned) {
		return bvh_unaligned_node_intersect_robust(
		        kg, P, dir, tnear, tfar, difl, extmax,
		        nodeAddr, visibility, dist);
	}
	return bvh_aligned_node_intersect_robust(
	        kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz,
	        difl, extmax, nodeAddr, visibility, dist);
}
|
||||
#endif /* !defined(__KERNEL_SSE2__) */
|
@ -21,6 +21,12 @@
|
||||
# include "geom_qbvh_shadow.h"
|
||||
#endif
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT bvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT bvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
/* This is a template BVH traversal function, where various features can be
|
||||
* enabled/disabled. This way we can compile optimized versions for each case
|
||||
* without new features slowing things down.
|
||||
@ -41,7 +47,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
* - likely and unlikely for if() statements
|
||||
* - test restrict attribute for pointers
|
||||
*/
|
||||
|
||||
|
||||
/* traversal stack in CUDA thread-local memory */
|
||||
int traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0] = ENTRYPOINT_SENTINEL;
|
||||
@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
#if defined(__KERNEL_SSE2__)
|
||||
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
|
||||
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
|
||||
|
||||
|
||||
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
|
||||
ssef Psplat[3], idirsplat[3];
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
ssef tnear(0.0f), tfar(isect_t);
|
||||
# endif
|
||||
shuffle_swap_t shufflexyz[3];
|
||||
|
||||
Psplat[0] = ssef(P.x);
|
||||
@ -94,86 +103,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
do {
|
||||
/* traverse internal nodes */
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
|
||||
bool traverseChild0, traverseChild1;
|
||||
int nodeAddrChild1;
|
||||
int nodeAddrChild1, traverse_mask;
|
||||
float dist[2];
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
/* Intersect two child bounding boxes, non-SSE version */
|
||||
float t = isect_t;
|
||||
|
||||
/* fetch node data */
|
||||
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
|
||||
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
float c0lox = (node0.x - P.x) * idir.x;
|
||||
float c0hix = (node0.z - P.x) * idir.x;
|
||||
float c0loy = (node1.x - P.y) * idir.y;
|
||||
float c0hiy = (node1.z - P.y) * idir.y;
|
||||
float c0loz = (node2.x - P.z) * idir.z;
|
||||
float c0hiz = (node2.z - P.z) * idir.z;
|
||||
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
|
||||
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
|
||||
|
||||
float c1lox = (node0.y - P.x) * idir.x;
|
||||
float c1hix = (node0.w - P.x) * idir.x;
|
||||
float c1loy = (node1.y - P.y) * idir.y;
|
||||
float c1hiy = (node1.w - P.y) * idir.y;
|
||||
float c1loz = (node2.y - P.z) * idir.z;
|
||||
float c1hiz = (node2.w - P.z) * idir.z;
|
||||
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
|
||||
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
# ifdef __VISIBILITY_FLAG__
|
||||
/* this visibility test gives a 5% performance hit, how to solve? */
|
||||
traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
|
||||
traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
|
||||
# else
|
||||
traverseChild0 = (c0max >= c0min);
|
||||
traverseChild1 = (c1max >= c1min);
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir,
|
||||
# endif
|
||||
|
||||
idir,
|
||||
isect_t,
|
||||
nodeAddr,
|
||||
PATH_RAY_SHADOW,
|
||||
dist);
|
||||
#else // __KERNEL_SSE2__
|
||||
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
|
||||
|
||||
/* fetch node data */
|
||||
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
|
||||
const float4 cnodes = ((float4*)bvh_nodes)[3];
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
|
||||
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
|
||||
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
|
||||
|
||||
/* calculate { c0min, c1min, -c0max, -c1max} */
|
||||
const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
|
||||
const ssef tminmax = minmax ^ pn;
|
||||
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
# ifdef __VISIBILITY_FLAG__
|
||||
/* this visibility test gives a 5% performance hit, how to solve? */
|
||||
traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
|
||||
traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
|
||||
# else
|
||||
traverseChild0 = (movemask(lrhit) & 1);
|
||||
traverseChild1 = (movemask(lrhit) & 2);
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
dir,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tnear,
|
||||
tfar,
|
||||
# endif
|
||||
tsplat,
|
||||
Psplat,
|
||||
idirsplat,
|
||||
shufflexyz,
|
||||
nodeAddr,
|
||||
PATH_RAY_SHADOW,
|
||||
dist);
|
||||
#endif // __KERNEL_SSE2__
|
||||
|
||||
nodeAddr = __float_as_int(cnodes.x);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.y);
|
||||
nodeAddr = __float_as_int(cnodes.z);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.w);
|
||||
|
||||
if(traverseChild0 && traverseChild1) {
|
||||
/* both children were intersected, push the farther one */
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
bool closestChild1 = (c1min < c0min);
|
||||
#else
|
||||
bool closestChild1 = tminmax[1] < tminmax[0];
|
||||
#endif
|
||||
if(traverse_mask == 3) {
|
||||
/* Both children were intersected, push the farther one. */
|
||||
bool closestChild1 = (dist[1] < dist[0]);
|
||||
|
||||
if(closestChild1) {
|
||||
int tmp = nodeAddr;
|
||||
@ -186,12 +153,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
traversalStack[stackPtr] = nodeAddrChild1;
|
||||
}
|
||||
else {
|
||||
/* one child was intersected */
|
||||
if(traverseChild1) {
|
||||
/* One child was intersected. */
|
||||
if(traverse_mask == 2) {
|
||||
nodeAddr = nodeAddrChild1;
|
||||
}
|
||||
else if(!traverseChild0) {
|
||||
/* neither child was intersected */
|
||||
else if(traverse_mask == 0) {
|
||||
/* Neither child was intersected. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
--stackPtr;
|
||||
}
|
||||
@ -238,7 +205,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
case PRIMITIVE_CURVE:
|
||||
case PRIMITIVE_MOTION_CURVE: {
|
||||
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
|
||||
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
|
||||
hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
|
||||
else
|
||||
hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
|
||||
@ -317,6 +284,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect_t);
|
||||
# endif
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
|
||||
@ -369,6 +339,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect_t);
|
||||
# endif
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
|
||||
@ -410,3 +383,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
|
||||
|
||||
#undef BVH_FUNCTION_NAME
|
||||
#undef BVH_FUNCTION_FEATURES
|
||||
#undef NODE_INTERSECT
|
||||
|
@ -21,6 +21,12 @@
|
||||
# include "geom_qbvh_subsurface.h"
|
||||
#endif
|
||||
|
||||
/* Select the child-node intersection routine that the traversal loop calls
 * through NODE_INTERSECT: bvh_node_intersect when this variant is compiled
 * with the BVH_HAIR feature, bvh_aligned_node_intersect otherwise.
 * NOTE(review): presumably bvh_node_intersect is the variant that also
 * handles unaligned nodes -- confirm against geom_bvh_nodes.h. */
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT bvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT bvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
/* This is a template BVH traversal function for subsurface scattering, where
|
||||
* various features can be enabled/disabled. This way we can compile optimized
|
||||
* versions for each case without new features slowing things down.
|
||||
@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
|
||||
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
|
||||
ssef Psplat[3], idirsplat[3];
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
ssef tnear(0.0f), tfar(isect_t);
|
||||
# endif
|
||||
shuffle_swap_t shufflexyz[3];
|
||||
|
||||
Psplat[0] = ssef(P.x);
|
||||
@ -100,79 +109,47 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
|
||||
/* traversal loop */
|
||||
do {
|
||||
do
|
||||
{
|
||||
do {
|
||||
/* traverse internal nodes */
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
|
||||
{
|
||||
bool traverseChild0, traverseChild1;
|
||||
int nodeAddrChild1;
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
|
||||
int nodeAddrChild1, traverse_mask;
|
||||
float dist[2];
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
/* Intersect two child bounding boxes, non-SSE version */
|
||||
float t = isect_t;
|
||||
|
||||
/* fetch node data */
|
||||
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
|
||||
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
float c0lox = (node0.x - P.x) * idir.x;
|
||||
float c0hix = (node0.z - P.x) * idir.x;
|
||||
float c0loy = (node1.x - P.y) * idir.y;
|
||||
float c0hiy = (node1.z - P.y) * idir.y;
|
||||
float c0loz = (node2.x - P.z) * idir.z;
|
||||
float c0hiz = (node2.z - P.z) * idir.z;
|
||||
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
|
||||
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
|
||||
|
||||
float c1lox = (node0.y - P.x) * idir.x;
|
||||
float c1hix = (node0.w - P.x) * idir.x;
|
||||
float c1loy = (node1.y - P.y) * idir.y;
|
||||
float c1hiy = (node1.w - P.y) * idir.y;
|
||||
float c1loz = (node2.y - P.z) * idir.z;
|
||||
float c1hiz = (node2.w - P.z) * idir.z;
|
||||
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
|
||||
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
traverseChild0 = (c0max >= c0min);
|
||||
traverseChild1 = (c1max >= c1min);
|
||||
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir,
|
||||
# endif
|
||||
idir,
|
||||
isect_t,
|
||||
nodeAddr,
|
||||
PATH_RAY_ALL_VISIBILITY,
|
||||
dist);
|
||||
#else // __KERNEL_SSE2__
|
||||
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
|
||||
|
||||
/* fetch node data */
|
||||
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
|
||||
const float4 cnodes = ((float4*)bvh_nodes)[3];
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
|
||||
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
|
||||
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
|
||||
|
||||
/* calculate { c0min, c1min, -c0max, -c1max} */
|
||||
const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
|
||||
const ssef tminmax = minmax ^ pn;
|
||||
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
traverseChild0 = (movemask(lrhit) & 1);
|
||||
traverseChild1 = (movemask(lrhit) & 2);
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
dir,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tnear,
|
||||
tfar,
|
||||
# endif
|
||||
tsplat,
|
||||
Psplat,
|
||||
idirsplat,
|
||||
shufflexyz,
|
||||
nodeAddr,
|
||||
PATH_RAY_ALL_VISIBILITY,
|
||||
dist);
|
||||
#endif // __KERNEL_SSE2__
|
||||
|
||||
nodeAddr = __float_as_int(cnodes.x);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.y);
|
||||
nodeAddr = __float_as_int(cnodes.z);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.w);
|
||||
|
||||
if(traverseChild0 && traverseChild1) {
|
||||
/* both children were intersected, push the farther one */
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
bool closestChild1 = (c1min < c0min);
|
||||
#else
|
||||
bool closestChild1 = tminmax[1] < tminmax[0];
|
||||
#endif
|
||||
if(traverse_mask == 3) {
|
||||
/* Both children were intersected, push the farther one. */
|
||||
bool closestChild1 = (dist[1] < dist[0]);
|
||||
|
||||
if(closestChild1) {
|
||||
int tmp = nodeAddr;
|
||||
@ -185,12 +162,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
traversalStack[stackPtr] = nodeAddrChild1;
|
||||
}
|
||||
else {
|
||||
/* one child was intersected */
|
||||
if(traverseChild1) {
|
||||
/* One child was intersected. */
|
||||
if(traverse_mask == 2) {
|
||||
nodeAddr = nodeAddrChild1;
|
||||
}
|
||||
else if(!traverseChild0) {
|
||||
/* neither child was intersected */
|
||||
else if(traverse_mask == 0) {
|
||||
/* Neither child was intersected. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
--stackPtr;
|
||||
}
|
||||
@ -286,3 +263,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
|
||||
|
||||
#undef BVH_FUNCTION_NAME
|
||||
#undef BVH_FUNCTION_FEATURES
|
||||
#undef NODE_INTERSECT
|
||||
|
@ -21,6 +21,14 @@
|
||||
# include "geom_qbvh_traversal.h"
|
||||
#endif
|
||||
|
||||
/* Select the child-node intersection routines used by the traversal loop.
 * NODE_INTERSECT is the regular test; NODE_INTERSECT_ROBUST is the variant
 * called with the extra difl/extmax arguments when difl != 0.0f under
 * BVH_HAIR_MINIMUM_WIDTH. With the BVH_HAIR feature both map to the
 * bvh_node_intersect* functions, otherwise to the bvh_aligned_node_intersect*
 * functions.
 * NOTE(review): presumably the non-"aligned" functions also handle unaligned
 * nodes -- confirm against geom_bvh_nodes.h. */
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT bvh_node_intersect
|
||||
# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust
|
||||
#else
|
||||
# define NODE_INTERSECT bvh_aligned_node_intersect
|
||||
# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust
|
||||
#endif
|
||||
|
||||
/* This is a template BVH traversal function, where various features can be
|
||||
* enabled/disabled. This way we can compile optimized versions for each case
|
||||
* without new features slowing things down.
|
||||
@ -49,7 +57,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
* - likely and unlikely for if() statements
|
||||
* - test restrict attribute for pointers
|
||||
*/
|
||||
|
||||
|
||||
/* traversal stack in CUDA thread-local memory */
|
||||
int traversalStack[BVH_STACK_SIZE];
|
||||
traversalStack[0] = ENTRYPOINT_SENTINEL;
|
||||
@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
#if defined(__KERNEL_SSE2__)
|
||||
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
|
||||
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
|
||||
|
||||
|
||||
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
|
||||
ssef Psplat[3], idirsplat[3];
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
ssef tnear(0.0f), tfar(isect->t);
|
||||
# endif
|
||||
shuffle_swap_t shufflexyz[3];
|
||||
|
||||
Psplat[0] = ssef(P.x);
|
||||
@ -101,121 +112,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
do {
|
||||
/* traverse internal nodes */
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
|
||||
bool traverseChild0, traverseChild1;
|
||||
int nodeAddrChild1;
|
||||
int nodeAddrChild1, traverse_mask;
|
||||
float dist[2];
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
/* Intersect two child bounding boxes, non-SSE version */
|
||||
float t = isect->t;
|
||||
|
||||
/* fetch node data */
|
||||
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
|
||||
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
float c0lox = (node0.x - P.x) * idir.x;
|
||||
float c0hix = (node0.z - P.x) * idir.x;
|
||||
float c0loy = (node1.x - P.y) * idir.y;
|
||||
float c0hiy = (node1.z - P.y) * idir.y;
|
||||
float c0loz = (node2.x - P.z) * idir.z;
|
||||
float c0hiz = (node2.z - P.z) * idir.z;
|
||||
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
|
||||
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
|
||||
|
||||
float c1lox = (node0.y - P.x) * idir.x;
|
||||
float c1hix = (node0.w - P.x) * idir.x;
|
||||
float c1loy = (node1.y - P.y) * idir.y;
|
||||
float c1hiy = (node1.w - P.y) * idir.y;
|
||||
float c1loz = (node2.y - P.z) * idir.z;
|
||||
float c1hiz = (node2.w - P.z) * idir.z;
|
||||
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
|
||||
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
|
||||
|
||||
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
if(difl != 0.0f) {
|
||||
float hdiff = 1.0f + difl;
|
||||
float ldiff = 1.0f - difl;
|
||||
if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
|
||||
c0min = max(ldiff * c0min, c0min - extmax);
|
||||
c0max = min(hdiff * c0max, c0max + extmax);
|
||||
}
|
||||
if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
|
||||
c1min = max(ldiff * c1min, c1min - extmax);
|
||||
c1max = min(hdiff * c1max, c1max + extmax);
|
||||
}
|
||||
traverse_mask = NODE_INTERSECT_ROBUST(kg,
|
||||
P,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir,
|
||||
# endif
|
||||
idir,
|
||||
isect->t,
|
||||
difl,
|
||||
extmax,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
# ifdef __VISIBILITY_FLAG__
|
||||
/* this visibility test gives a 5% performance hit, how to solve? */
|
||||
traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
|
||||
traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
|
||||
# else
|
||||
traverseChild0 = (c0max >= c0min);
|
||||
traverseChild1 = (c1max >= c1min);
|
||||
# endif
|
||||
|
||||
{
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir,
|
||||
# endif
|
||||
idir,
|
||||
isect->t,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
}
|
||||
#else // __KERNEL_SSE2__
|
||||
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
|
||||
|
||||
/* fetch node data */
|
||||
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
|
||||
const float4 cnodes = ((float4*)bvh_nodes)[3];
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
|
||||
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
|
||||
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
|
||||
|
||||
/* calculate { c0min, c1min, -c0max, -c1max} */
|
||||
ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
|
||||
const ssef tminmax = minmax ^ pn;
|
||||
|
||||
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
|
||||
if(difl != 0.0f) {
|
||||
float4 *tminmaxview = (float4*)&tminmax;
|
||||
float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
|
||||
float &c0max = tminmaxview->z, &c1max = tminmaxview->w;
|
||||
|
||||
float hdiff = 1.0f + difl;
|
||||
float ldiff = 1.0f - difl;
|
||||
if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
|
||||
c0min = max(ldiff * c0min, c0min - extmax);
|
||||
c0max = min(hdiff * c0max, c0max + extmax);
|
||||
}
|
||||
if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
|
||||
c1min = max(ldiff * c1min, c1min - extmax);
|
||||
c1max = min(hdiff * c1max, c1max + extmax);
|
||||
}
|
||||
traverse_mask = NODE_INTERSECT_ROBUST(kg,
|
||||
P,
|
||||
dir,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tnear,
|
||||
tfar,
|
||||
# endif
|
||||
tsplat,
|
||||
Psplat,
|
||||
idirsplat,
|
||||
shufflexyz,
|
||||
difl,
|
||||
extmax,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
||||
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
# ifdef __VISIBILITY_FLAG__
|
||||
/* this visibility test gives a 5% performance hit, how to solve? */
|
||||
traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
|
||||
traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
|
||||
# else
|
||||
traverseChild0 = (movemask(lrhit) & 1);
|
||||
traverseChild1 = (movemask(lrhit) & 2);
|
||||
# endif
|
||||
{
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
dir,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tnear,
|
||||
tfar,
|
||||
# endif
|
||||
tsplat,
|
||||
Psplat,
|
||||
idirsplat,
|
||||
shufflexyz,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
}
|
||||
#endif // __KERNEL_SSE2__
|
||||
|
||||
nodeAddr = __float_as_int(cnodes.x);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.y);
|
||||
nodeAddr = __float_as_int(cnodes.z);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.w);
|
||||
|
||||
if(traverseChild0 && traverseChild1) {
|
||||
/* both children were intersected, push the farther one */
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
bool closestChild1 = (c1min < c0min);
|
||||
#else
|
||||
bool closestChild1 = tminmax[1] < tminmax[0];
|
||||
#endif
|
||||
if(traverse_mask == 3) {
|
||||
/* Both children were intersected, push the farther one. */
|
||||
bool closestChild1 = (dist[1] < dist[0]);
|
||||
|
||||
if(closestChild1) {
|
||||
int tmp = nodeAddr;
|
||||
@ -228,12 +204,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
traversalStack[stackPtr] = nodeAddrChild1;
|
||||
}
|
||||
else {
|
||||
/* one child was intersected */
|
||||
if(traverseChild1) {
|
||||
/* One child was intersected. */
|
||||
if(traverse_mask == 2) {
|
||||
nodeAddr = nodeAddrChild1;
|
||||
}
|
||||
else if(!traverseChild0) {
|
||||
/* neither child was intersected */
|
||||
else if(traverse_mask == 0) {
|
||||
/* Neither child was intersected. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
--stackPtr;
|
||||
}
|
||||
@ -268,6 +244,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE)
|
||||
return true;
|
||||
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect->t);
|
||||
# endif
|
||||
#else
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE)
|
||||
return true;
|
||||
@ -287,6 +266,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE)
|
||||
return true;
|
||||
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect->t);
|
||||
# endif
|
||||
# else
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE)
|
||||
return true;
|
||||
@ -313,6 +295,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE)
|
||||
return true;
|
||||
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect->t);
|
||||
# endif
|
||||
# else
|
||||
if(visibility == PATH_RAY_SHADOW_OPAQUE)
|
||||
return true;
|
||||
@ -342,6 +327,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect->t);
|
||||
# endif
|
||||
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
@ -376,6 +364,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect->t);
|
||||
# endif
|
||||
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
@ -433,3 +424,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
|
||||
|
||||
#undef BVH_FUNCTION_NAME
|
||||
#undef BVH_FUNCTION_FEATURES
|
||||
#undef NODE_INTERSECT
|
||||
#undef NODE_INTERSECT_ROBUST
|
||||
|
@ -18,7 +18,13 @@
|
||||
*/
|
||||
|
||||
#ifdef __QBVH__
|
||||
#include "geom_qbvh_volume.h"
|
||||
# include "geom_qbvh_volume.h"
|
||||
#endif
|
||||
|
||||
/* Select the child-node intersection routine that the volume traversal loop
 * calls through NODE_INTERSECT: bvh_node_intersect when this variant is
 * compiled with the BVH_HAIR feature, bvh_aligned_node_intersect otherwise.
 * NOTE(review): presumably bvh_node_intersect is the variant that also
 * handles unaligned nodes -- confirm against geom_bvh_nodes.h. */
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT bvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT bvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
/* This is a template BVH traversal function for volumes, where
|
||||
@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
#if defined(__KERNEL_SSE2__)
|
||||
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
|
||||
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
|
||||
|
||||
|
||||
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
|
||||
ssef Psplat[3], idirsplat[3];
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
ssef tnear(0.0f), tfar(isect->t);
|
||||
# endif
|
||||
shuffle_swap_t shufflexyz[3];
|
||||
|
||||
Psplat[0] = ssef(P.x);
|
||||
@ -91,75 +100,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
do {
|
||||
/* traverse internal nodes */
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
|
||||
bool traverseChild0, traverseChild1;
|
||||
int nodeAddrChild1;
|
||||
int nodeAddrChild1, traverse_mask;
|
||||
float dist[2];
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
/* Intersect two child bounding boxes, non-SSE version */
|
||||
float t = isect->t;
|
||||
|
||||
/* fetch node data */
|
||||
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
|
||||
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
float c0lox = (node0.x - P.x) * idir.x;
|
||||
float c0hix = (node0.z - P.x) * idir.x;
|
||||
float c0loy = (node1.x - P.y) * idir.y;
|
||||
float c0hiy = (node1.z - P.y) * idir.y;
|
||||
float c0loz = (node2.x - P.z) * idir.z;
|
||||
float c0hiz = (node2.z - P.z) * idir.z;
|
||||
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
|
||||
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
|
||||
|
||||
float c1lox = (node0.y - P.x) * idir.x;
|
||||
float c1hix = (node0.w - P.x) * idir.x;
|
||||
float c1loy = (node1.y - P.y) * idir.y;
|
||||
float c1hiy = (node1.w - P.y) * idir.y;
|
||||
float c1loz = (node2.y - P.z) * idir.z;
|
||||
float c1hiz = (node2.w - P.z) * idir.z;
|
||||
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
|
||||
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
traverseChild0 = (c0max >= c0min);
|
||||
traverseChild1 = (c1max >= c1min);
|
||||
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir,
|
||||
# endif
|
||||
idir,
|
||||
isect->t,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
#else // __KERNEL_SSE2__
|
||||
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
|
||||
|
||||
/* fetch node data */
|
||||
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
|
||||
const float4 cnodes = ((float4*)bvh_nodes)[3];
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
|
||||
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
|
||||
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
|
||||
|
||||
/* calculate { c0min, c1min, -c0max, -c1max} */
|
||||
ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
|
||||
const ssef tminmax = minmax ^ pn;
|
||||
|
||||
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
traverseChild0 = (movemask(lrhit) & 1);
|
||||
traverseChild1 = (movemask(lrhit) & 2);
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
dir,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tnear,
|
||||
tfar,
|
||||
# endif
|
||||
tsplat,
|
||||
Psplat,
|
||||
idirsplat,
|
||||
shufflexyz,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
#endif // __KERNEL_SSE2__
|
||||
|
||||
nodeAddr = __float_as_int(cnodes.x);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.y);
|
||||
nodeAddr = __float_as_int(cnodes.z);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.w);
|
||||
|
||||
if(traverseChild0 && traverseChild1) {
|
||||
/* both children were intersected, push the farther one */
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
bool closestChild1 = (c1min < c0min);
|
||||
#else
|
||||
bool closestChild1 = tminmax[1] < tminmax[0];
|
||||
#endif
|
||||
if(traverse_mask == 3) {
|
||||
/* Both children were intersected, push the farther one. */
|
||||
bool closestChild1 = (dist[1] < dist[0]);
|
||||
|
||||
if(closestChild1) {
|
||||
int tmp = nodeAddr;
|
||||
@ -172,12 +150,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
traversalStack[stackPtr] = nodeAddrChild1;
|
||||
}
|
||||
else {
|
||||
/* one child was intersected */
|
||||
if(traverseChild1) {
|
||||
/* One child was intersected. */
|
||||
if(traverse_mask == 2) {
|
||||
nodeAddr = nodeAddrChild1;
|
||||
}
|
||||
else if(!traverseChild0) {
|
||||
/* neither child was intersected */
|
||||
else if(traverse_mask == 0) {
|
||||
/* Neither child was intersected. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
--stackPtr;
|
||||
}
|
||||
@ -258,6 +236,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect->t);
|
||||
# endif
|
||||
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
@ -298,6 +279,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect->t);
|
||||
# endif
|
||||
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
@ -337,3 +321,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
|
||||
|
||||
#undef BVH_FUNCTION_NAME
|
||||
#undef BVH_FUNCTION_FEATURES
|
||||
#undef NODE_INTERSECT
|
||||
|
@ -18,7 +18,13 @@
|
||||
*/
|
||||
|
||||
#ifdef __QBVH__
|
||||
#include "geom_qbvh_volume_all.h"
|
||||
# include "geom_qbvh_volume_all.h"
|
||||
#endif
|
||||
|
||||
/* Select the child-node intersection routine that the all-hits volume
 * traversal loop calls through NODE_INTERSECT: bvh_node_intersect when this
 * variant is compiled with the BVH_HAIR feature, bvh_aligned_node_intersect
 * otherwise.
 * NOTE(review): presumably bvh_node_intersect is the variant that also
 * handles unaligned nodes -- confirm against geom_bvh_nodes.h. */
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT bvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT bvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
/* This is a template BVH traversal function for volumes, where
|
||||
@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
#if defined(__KERNEL_SSE2__)
|
||||
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
|
||||
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
|
||||
|
||||
|
||||
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
|
||||
ssef Psplat[3], idirsplat[3];
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
ssef tnear(0.0f), tfar(isect_t);
|
||||
# endif
|
||||
shuffle_swap_t shufflexyz[3];
|
||||
|
||||
Psplat[0] = ssef(P.x);
|
||||
@ -95,75 +104,44 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
do {
|
||||
/* traverse internal nodes */
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
|
||||
bool traverseChild0, traverseChild1;
|
||||
int nodeAddrChild1;
|
||||
int nodeAddrChild1, traverse_mask;
|
||||
float dist[2];
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
/* Intersect two child bounding boxes, non-SSE version */
|
||||
float t = isect_array->t;
|
||||
|
||||
/* fetch node data */
|
||||
float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
|
||||
float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
float c0lox = (node0.x - P.x) * idir.x;
|
||||
float c0hix = (node0.z - P.x) * idir.x;
|
||||
float c0loy = (node1.x - P.y) * idir.y;
|
||||
float c0hiy = (node1.z - P.y) * idir.y;
|
||||
float c0loz = (node2.x - P.z) * idir.z;
|
||||
float c0hiz = (node2.z - P.z) * idir.z;
|
||||
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
|
||||
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
|
||||
|
||||
float c1lox = (node0.y - P.x) * idir.x;
|
||||
float c1hix = (node0.w - P.x) * idir.x;
|
||||
float c1loy = (node1.y - P.y) * idir.y;
|
||||
float c1hiy = (node1.w - P.y) * idir.y;
|
||||
float c1loz = (node2.y - P.z) * idir.z;
|
||||
float c1hiz = (node2.w - P.z) * idir.z;
|
||||
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
|
||||
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
traverseChild0 = (c0max >= c0min);
|
||||
traverseChild1 = (c1max >= c1min);
|
||||
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir,
|
||||
# endif
|
||||
idir,
|
||||
isect_t,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
#else // __KERNEL_SSE2__
|
||||
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
|
||||
|
||||
/* fetch node data */
|
||||
const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
|
||||
const float4 cnodes = ((float4*)bvh_nodes)[3];
|
||||
|
||||
/* intersect ray against child nodes */
|
||||
const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
|
||||
const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
|
||||
const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
|
||||
|
||||
/* calculate { c0min, c1min, -c0max, -c1max} */
|
||||
ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
|
||||
const ssef tminmax = minmax ^ pn;
|
||||
|
||||
const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
|
||||
|
||||
/* decide which nodes to traverse next */
|
||||
traverseChild0 = (movemask(lrhit) & 1);
|
||||
traverseChild1 = (movemask(lrhit) & 2);
|
||||
traverse_mask = NODE_INTERSECT(kg,
|
||||
P,
|
||||
dir,
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tnear,
|
||||
tfar,
|
||||
# endif
|
||||
tsplat,
|
||||
Psplat,
|
||||
idirsplat,
|
||||
shufflexyz,
|
||||
nodeAddr,
|
||||
visibility,
|
||||
dist);
|
||||
#endif // __KERNEL_SSE2__
|
||||
|
||||
nodeAddr = __float_as_int(cnodes.x);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.y);
|
||||
nodeAddr = __float_as_int(cnodes.z);
|
||||
nodeAddrChild1 = __float_as_int(cnodes.w);
|
||||
|
||||
if(traverseChild0 && traverseChild1) {
|
||||
/* both children were intersected, push the farther one */
|
||||
#if !defined(__KERNEL_SSE2__)
|
||||
bool closestChild1 = (c1min < c0min);
|
||||
#else
|
||||
bool closestChild1 = tminmax[1] < tminmax[0];
|
||||
#endif
|
||||
if(traverse_mask == 3) {
|
||||
/* Both children were intersected, push the farther one. */
|
||||
bool closestChild1 = (dist[1] < dist[0]);
|
||||
|
||||
if(closestChild1) {
|
||||
int tmp = nodeAddr;
|
||||
@ -176,12 +154,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
traversalStack[stackPtr] = nodeAddrChild1;
|
||||
}
|
||||
else {
|
||||
/* one child was intersected */
|
||||
if(traverseChild1) {
|
||||
/* One child was intersected. */
|
||||
if(traverse_mask == 2) {
|
||||
nodeAddr = nodeAddrChild1;
|
||||
}
|
||||
else if(!traverseChild0) {
|
||||
/* neither child was intersected */
|
||||
else if(traverse_mask == 0) {
|
||||
/* Neither child was intersected. */
|
||||
nodeAddr = traversalStack[stackPtr];
|
||||
--stackPtr;
|
||||
}
|
||||
@ -311,6 +289,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect_t);
|
||||
# endif
|
||||
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
@ -368,6 +349,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
|
||||
Psplat[2] = ssef(P.z);
|
||||
|
||||
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
tfar = ssef(isect_t);
|
||||
# endif
|
||||
|
||||
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
|
||||
# endif
|
||||
@ -410,3 +394,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
|
||||
|
||||
#undef BVH_FUNCTION_NAME
|
||||
#undef BVH_FUNCTION_FEATURES
|
||||
#undef NODE_INTERSECT
|
||||
|
@ -51,23 +51,25 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
|
||||
if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
|
||||
}
|
||||
|
||||
ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
|
||||
const ssef& tnear,
|
||||
const ssef& tfar,
|
||||
/* Axis-aligned nodes intersection */
|
||||
|
||||
ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
|
||||
const ssef& tnear,
|
||||
const ssef& tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const sse3f& org_idir,
|
||||
const sse3f& org_idir,
|
||||
#else
|
||||
const sse3f& org,
|
||||
const sse3f& org,
|
||||
#endif
|
||||
const sse3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int nodeAddr,
|
||||
ssef *__restrict dist)
|
||||
const sse3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int nodeAddr,
|
||||
ssef *__restrict dist)
|
||||
{
|
||||
const int offset = nodeAddr + 1;
|
||||
#ifdef __KERNEL_AVX2__
|
||||
@ -101,24 +103,25 @@ ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
|
||||
return mask;
|
||||
}
|
||||
|
||||
ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
|
||||
const ssef& tnear,
|
||||
const ssef& tfar,
|
||||
ccl_device_inline int qbvh_aligned_node_intersect_robust(
|
||||
KernelGlobals *__restrict kg,
|
||||
const ssef& tnear,
|
||||
const ssef& tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
const sse3f& P_idir,
|
||||
const sse3f& P_idir,
|
||||
#else
|
||||
const sse3f& P,
|
||||
const sse3f& P,
|
||||
#endif
|
||||
const sse3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int nodeAddr,
|
||||
const float difl,
|
||||
ssef *__restrict dist)
|
||||
const sse3f& idir,
|
||||
const int near_x,
|
||||
const int near_y,
|
||||
const int near_z,
|
||||
const int far_x,
|
||||
const int far_y,
|
||||
const int far_z,
|
||||
const int nodeAddr,
|
||||
const float difl,
|
||||
ssef *__restrict dist)
|
||||
{
|
||||
const int offset = nodeAddr + 1;
|
||||
#ifdef __KERNEL_AVX2__
|
||||
@ -145,3 +148,286 @@ ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
|
||||
*dist = tNear;
|
||||
return (int)movemask(vmask);
|
||||
}
|
||||
|
||||
/* Unaligned nodes intersection */
|
||||
|
||||
/* Intersect a ray against the children of an unaligned QBVH node.
 *
 * Twelve ssef values are fetched from __bvh_nodes at nodeAddr+1 .. nodeAddr+12
 * and used as an affine transform (nine linear coefficients followed by three
 * translation terms). Ray origin and direction are transformed with it, and
 * the transformed ray is then slab-tested against the [0, 1] range on every
 * axis. All arithmetic is done on whole ssef registers; according to the
 * commit description this evaluates all 4 children at a time.
 *
 * tnear, tfar: ray interval the slab intervals are clipped against.
 * org, dir:    ray origin and direction, already splatted into sse3f.
 * nodeAddr:    index of the node header in __bvh_nodes.
 * dist:        receives the clipped entry distance of every lane.
 *
 * org_idir (AVX2 only), idir and near_x..far_z are not referenced by this
 * function. kg is not referenced directly either; presumably it is consumed
 * by the kernel_tex_fetch_ssef macro (TODO confirm).
 *
 * Returns movemask() of the lanes whose entry distance does not exceed their
 * exit distance.
 */
ccl_device_inline int qbvh_unaligned_node_intersect(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& org_idir,
#endif
        const sse3f& org,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        ssef *__restrict dist)
{
	/* Linear part of the transform, one output axis per group of three. */
	const ssef tfm_xx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+1);
	const ssef tfm_xy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+2);
	const ssef tfm_xz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+3);

	const ssef tfm_yx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+4);
	const ssef tfm_yy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+5);
	const ssef tfm_yz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+6);

	const ssef tfm_zx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+7);
	const ssef tfm_zy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+8);
	const ssef tfm_zz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+9);

	/* Translation part of the transform. */
	const ssef tfm_tx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+10);
	const ssef tfm_ty = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+11);
	const ssef tfm_tz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+12);

	/* Ray direction in the transformed space (no translation applied). */
	const ssef local_dir_x = dir.x*tfm_xx + dir.y*tfm_xy + dir.z*tfm_xz;
	const ssef local_dir_y = dir.x*tfm_yx + dir.y*tfm_yy + dir.z*tfm_yz;
	const ssef local_dir_z = dir.x*tfm_zx + dir.y*tfm_zy + dir.z*tfm_zz;

	/* Ray origin in the transformed space. */
	const ssef local_org_x = org.x*tfm_xx + org.y*tfm_xy + org.z*tfm_xz + tfm_tx;
	const ssef local_org_y = org.x*tfm_yx + org.y*tfm_yy + org.z*tfm_yz + tfm_ty;
	const ssef local_org_z = org.x*tfm_zx + org.y*tfm_zy + org.z*tfm_zz + tfm_tz;

	/* Negated reciprocal of the transformed direction. */
	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
	const ssef neg_inv_dir_x = neg_one / local_dir_x;
	const ssef neg_inv_dir_y = neg_one / local_dir_y;
	const ssef neg_inv_dir_z = neg_one / local_dir_z;

	/* Distance at which the transformed ray reaches coordinate 0 ... */
	const ssef t_lo_x = local_org_x * neg_inv_dir_x;
	const ssef t_lo_y = local_org_y * neg_inv_dir_y;
	const ssef t_lo_z = local_org_z * neg_inv_dir_z;

	/* ... and coordinate 1: (1 - P)/d == -P/d + 1/d. */
	const ssef t_hi_x = t_lo_x - neg_inv_dir_x;
	const ssef t_hi_y = t_lo_y - neg_inv_dir_y;
	const ssef t_hi_z = t_lo_z - neg_inv_dir_z;

	/* Order both crossings per axis; only the min/max flavour depends on
	 * the instruction set.
	 */
#ifdef __KERNEL_SSE41__
	const ssef slab_near_x = mini(t_lo_x, t_hi_x);
	const ssef slab_near_y = mini(t_lo_y, t_hi_y);
	const ssef slab_near_z = mini(t_lo_z, t_hi_z);
	const ssef slab_far_x = maxi(t_lo_x, t_hi_x);
	const ssef slab_far_y = maxi(t_lo_y, t_hi_y);
	const ssef slab_far_z = maxi(t_lo_z, t_hi_z);
#else
	const ssef slab_near_x = min(t_lo_x, t_hi_x);
	const ssef slab_near_y = min(t_lo_y, t_hi_y);
	const ssef slab_near_z = min(t_lo_z, t_hi_z);
	const ssef slab_far_x = max(t_lo_x, t_hi_x);
	const ssef slab_far_y = max(t_lo_y, t_hi_y);
	const ssef slab_far_z = max(t_lo_z, t_hi_z);
#endif

	/* Clip the three slab intervals against the incoming ray interval. */
	const ssef entry_t = max4(tnear, slab_near_x, slab_near_y, slab_near_z);
	const ssef exit_t = min4(tfar, slab_far_x, slab_far_y, slab_far_z);
	const sseb hit_mask = entry_t <= exit_t;
	*dist = entry_t;
	return movemask(hit_mask);
}
|
||||
|
||||
/* Variant of the unaligned QBVH node intersection with a relaxed hit test.
 *
 * Twelve ssef values fetched from __bvh_nodes at nodeAddr+1 .. nodeAddr+12
 * are used as an affine transform for the ray origin P and direction dir.
 * The transformed ray is slab-tested against the [0, 1] range on every axis,
 * using whole ssef registers (all 4 children at a time, per the commit
 * description).
 *
 * The final comparison scales the entry distance by (1 - difl) and the exit
 * distance by (1 + difl) before comparing them.
 *
 * tnear, tfar: ray interval the slab intervals are clipped against.
 * P, dir:      ray origin and direction, already splatted into sse3f.
 * nodeAddr:    index of the node header in __bvh_nodes.
 * difl:        relative tolerance applied to the comparison.
 * dist:        receives the clipped (unscaled) entry distance of every lane.
 *
 * P_idir (AVX2 only), idir and near_x..far_z are not referenced by this
 * function. kg is not referenced directly either; presumably it is consumed
 * by the kernel_tex_fetch_ssef macro (TODO confirm).
 *
 * Returns movemask() of the lanes passing the relaxed comparison.
 */
ccl_device_inline int qbvh_unaligned_node_intersect_robust(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& P_idir,
#endif
        const sse3f& P,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        const float difl,
        ssef *__restrict dist)
{
	/* Linear part of the transform, one output axis per group of three. */
	const ssef tfm_xx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+1);
	const ssef tfm_xy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+2);
	const ssef tfm_xz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+3);

	const ssef tfm_yx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+4);
	const ssef tfm_yy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+5);
	const ssef tfm_yz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+6);

	const ssef tfm_zx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+7);
	const ssef tfm_zy = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+8);
	const ssef tfm_zz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+9);

	/* Translation part of the transform. */
	const ssef tfm_tx = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+10);
	const ssef tfm_ty = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+11);
	const ssef tfm_tz = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+12);

	/* Ray direction in the transformed space (no translation applied). */
	const ssef local_dir_x = dir.x*tfm_xx + dir.y*tfm_xy + dir.z*tfm_xz;
	const ssef local_dir_y = dir.x*tfm_yx + dir.y*tfm_yy + dir.z*tfm_yz;
	const ssef local_dir_z = dir.x*tfm_zx + dir.y*tfm_zy + dir.z*tfm_zz;

	/* Ray origin in the transformed space. */
	const ssef local_org_x = P.x*tfm_xx + P.y*tfm_xy + P.z*tfm_xz + tfm_tx;
	const ssef local_org_y = P.x*tfm_yx + P.y*tfm_yy + P.z*tfm_yz + tfm_ty;
	const ssef local_org_z = P.x*tfm_zx + P.y*tfm_zy + P.z*tfm_zz + tfm_tz;

	/* Negated reciprocal of the transformed direction. */
	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
	const ssef neg_inv_dir_x = neg_one / local_dir_x;
	const ssef neg_inv_dir_y = neg_one / local_dir_y;
	const ssef neg_inv_dir_z = neg_one / local_dir_z;

	/* Distance at which the transformed ray reaches coordinate 0 ... */
	const ssef t_lo_x = local_org_x * neg_inv_dir_x;
	const ssef t_lo_y = local_org_y * neg_inv_dir_y;
	const ssef t_lo_z = local_org_z * neg_inv_dir_z;

	/* ... and coordinate 1: (1 - P)/d == -P/d + 1/d. */
	const ssef t_hi_x = t_lo_x - neg_inv_dir_x;
	const ssef t_hi_y = t_lo_y - neg_inv_dir_y;
	const ssef t_hi_z = t_lo_z - neg_inv_dir_z;

	/* Order both crossings per axis; only the min/max flavour depends on
	 * the instruction set.
	 */
#ifdef __KERNEL_SSE41__
	const ssef slab_near_x = mini(t_lo_x, t_hi_x);
	const ssef slab_near_y = mini(t_lo_y, t_hi_y);
	const ssef slab_near_z = mini(t_lo_z, t_hi_z);
	const ssef slab_far_x = maxi(t_lo_x, t_hi_x);
	const ssef slab_far_y = maxi(t_lo_y, t_hi_y);
	const ssef slab_far_z = maxi(t_lo_z, t_hi_z);
#else
	const ssef slab_near_x = min(t_lo_x, t_hi_x);
	const ssef slab_near_y = min(t_lo_y, t_hi_y);
	const ssef slab_near_z = min(t_lo_z, t_hi_z);
	const ssef slab_far_x = max(t_lo_x, t_hi_x);
	const ssef slab_far_y = max(t_lo_y, t_hi_y);
	const ssef slab_far_z = max(t_lo_z, t_hi_z);
#endif

	/* Clip the three slab intervals against the incoming ray interval. */
	const ssef entry_t = max4(tnear, slab_near_x, slab_near_y, slab_near_z);
	const ssef exit_t = min4(tfar, slab_far_x, slab_far_y, slab_far_z);

	/* Relaxed comparison: both ends are scaled by the difl tolerance. */
	const float round_down = 1.0f - difl;
	const float round_up = 1.0f + difl;
	const sseb hit_mask = round_down*entry_t <= round_up*exit_t;

	*dist = entry_t;
	return movemask(hit_mask);
}
|
||||
|
||||
/* Intersector wrappers.
|
||||
*
|
||||
* They check the node type and call the appropriate intersection code.
|
||||
*/
|
||||
|
||||
/* Dispatching QBVH node intersector.
 *
 * Reads the node header at nodeAddr from __bvh_nodes and tests its x
 * component, reinterpreted as an unsigned integer, against
 * PATH_RAY_NODE_UNALIGNED. When the flag is set the call is forwarded to
 * qbvh_unaligned_node_intersect(), otherwise to qbvh_aligned_node_intersect().
 *
 * The aligned intersector is given org_idir on AVX2 builds and org on all
 * other builds; the unaligned one is always given org and dir, plus org_idir
 * on AVX2 builds.
 *
 * Returns whatever the selected intersector returns; dist is written by it.
 */
ccl_device_inline int qbvh_node_intersect(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& org_idir,
#endif
        const sse3f& org,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        ssef *__restrict dist)
{
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned =
	        (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(is_unaligned) {
		return qbvh_unaligned_node_intersect(kg,
		                                     tnear,
		                                     tfar,
#ifdef __KERNEL_AVX2__
		                                     org_idir,
#endif
		                                     org,
		                                     dir,
		                                     idir,
		                                     near_x, near_y, near_z,
		                                     far_x, far_y, far_z,
		                                     nodeAddr,
		                                     dist);
	}

	/* Axis-aligned node. */
	return qbvh_aligned_node_intersect(kg,
	                                   tnear,
	                                   tfar,
#ifdef __KERNEL_AVX2__
	                                   org_idir,
#else
	                                   org,
#endif
	                                   idir,
	                                   near_x, near_y, near_z,
	                                   far_x, far_y, far_z,
	                                   nodeAddr,
	                                   dist);
}
|
||||
|
||||
/* Dispatching QBVH node intersector, robust flavour.
 *
 * Reads the node header at nodeAddr from __bvh_nodes and tests its x
 * component, reinterpreted as an unsigned integer, against
 * PATH_RAY_NODE_UNALIGNED. When the flag is set the call is forwarded to
 * qbvh_unaligned_node_intersect_robust(), otherwise to
 * qbvh_aligned_node_intersect_robust(). difl is passed through unchanged.
 *
 * The aligned intersector is given P_idir on AVX2 builds and P on all other
 * builds; the unaligned one is always given P and dir, plus P_idir on AVX2
 * builds.
 *
 * Returns whatever the selected intersector returns; dist is written by it.
 */
ccl_device_inline int qbvh_node_intersect_robust(
        KernelGlobals *__restrict kg,
        const ssef& tnear,
        const ssef& tfar,
#ifdef __KERNEL_AVX2__
        const sse3f& P_idir,
#endif
        const sse3f& P,
        const sse3f& dir,
        const sse3f& idir,
        const int near_x,
        const int near_y,
        const int near_z,
        const int far_x,
        const int far_y,
        const int far_z,
        const int nodeAddr,
        const float difl,
        ssef *__restrict dist)
{
	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
	const bool is_unaligned =
	        (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;

	if(is_unaligned) {
		return qbvh_unaligned_node_intersect_robust(kg,
		                                            tnear,
		                                            tfar,
#ifdef __KERNEL_AVX2__
		                                            P_idir,
#endif
		                                            P,
		                                            dir,
		                                            idir,
		                                            near_x, near_y, near_z,
		                                            far_x, far_y, far_z,
		                                            nodeAddr,
		                                            difl,
		                                            dist);
	}

	/* Axis-aligned node. */
	return qbvh_aligned_node_intersect_robust(kg,
	                                          tnear,
	                                          tfar,
#ifdef __KERNEL_AVX2__
	                                          P_idir,
#else
	                                          P,
#endif
	                                          idir,
	                                          near_x, near_y, near_z,
	                                          far_x, far_y, far_z,
	                                          nodeAddr,
	                                          difl,
	                                          dist);
}
|
||||
|
@ -27,6 +27,12 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT qbvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT qbvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect_array,
|
||||
@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
#endif
|
||||
|
||||
ssef tnear(0.0f), tfar(tmax);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
#endif
|
||||
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
#else
|
||||
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
@ -109,22 +119,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
#endif
|
||||
|
||||
ssef dist;
|
||||
int traverseChild = qbvh_node_intersect(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
int traverseChild = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#else
|
||||
org,
|
||||
P_idir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
# endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
|
||||
if(traverseChild != 0) {
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
float4 cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(traverseChild);
|
||||
@ -340,13 +363,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
++stackPtr;
|
||||
@ -394,13 +422,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(tmax);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
object = OBJECT_NONE;
|
||||
@ -412,3 +445,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
|
||||
|
@ -25,6 +25,12 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT qbvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT qbvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
SubsurfaceIntersection *ss_isect,
|
||||
@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
#endif
|
||||
|
||||
ssef tnear(0.0f), tfar(isect_t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
#endif
|
||||
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
#else
|
||||
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
@ -108,22 +118,37 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
/* Traverse internal nodes. */
|
||||
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
|
||||
ssef dist;
|
||||
int traverseChild = qbvh_node_intersect(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
|
||||
int traverseChild = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#else
|
||||
org,
|
||||
P_idir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
|
||||
if(traverseChild != 0) {
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
|
||||
float4 cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(traverseChild);
|
||||
@ -270,3 +295,5 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
} while(nodeAddr != ENTRYPOINT_SENTINEL);
|
||||
} while(nodeAddr != ENTRYPOINT_SENTINEL);
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
|
||||
|
@ -28,6 +28,14 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT qbvh_node_intersect
|
||||
# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust
|
||||
#else
|
||||
# define NODE_INTERSECT qbvh_aligned_node_intersect
|
||||
# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
|
||||
#endif
|
||||
|
||||
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect,
|
||||
@ -81,13 +89,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
BVH_DEBUG_INIT();
|
||||
|
||||
ssef tnear(0.0f), tfar(ray->t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
#endif
|
||||
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
#else
|
||||
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
@ -132,41 +144,62 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
*
|
||||
* Need to test if doing opposite would be any faster.
|
||||
*/
|
||||
traverseChild = qbvh_node_intersect_robust(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
traverseChild = NODE_INTERSECT_ROBUST(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
# else
|
||||
org,
|
||||
P_idir4,
|
||||
# endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
difl,
|
||||
&dist);
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
# endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
difl,
|
||||
&dist);
|
||||
}
|
||||
else
|
||||
#endif /* BVH_HAIR_MINIMUM_WIDTH */
|
||||
{
|
||||
traverseChild = qbvh_node_intersect(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
traverseChild = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#else
|
||||
org,
|
||||
P_idir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
}
|
||||
|
||||
if(traverseChild != 0) {
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
float4 cnodes;
|
||||
/* TODO(sergey): Investigate whether moving cnodes upwards
|
||||
* gives a speedup (will be different cache pattern but will
|
||||
* avoid extra check here),
|
||||
*/
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(traverseChild);
|
||||
@ -361,13 +394,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
++stackPtr;
|
||||
@ -398,13 +436,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
object = OBJECT_NONE;
|
||||
@ -417,3 +460,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
|
||||
return (isect->prim != PRIM_NONE);
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
|
||||
#undef NODE_INTERSECT_ROBUST
|
||||
|
@ -26,6 +26,12 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT qbvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT qbvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect,
|
||||
@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
isect->object = OBJECT_NONE;
|
||||
|
||||
ssef tnear(0.0f), tfar(ray->t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
#endif
|
||||
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
#else
|
||||
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
@ -104,22 +114,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
#endif
|
||||
|
||||
ssef dist;
|
||||
int traverseChild = qbvh_node_intersect(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
int traverseChild = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#else
|
||||
org,
|
||||
P_idir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
|
||||
if(traverseChild != 0) {
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
float4 cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(traverseChild);
|
||||
@ -278,13 +301,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
++stackPtr;
|
||||
@ -319,13 +347,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(isect->t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
|
||||
object = OBJECT_NONE;
|
||||
@ -337,3 +370,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
|
||||
return (isect->prim != PRIM_NONE);
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
|
||||
|
@ -26,6 +26,12 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
# define NODE_INTERSECT qbvh_node_intersect
|
||||
#else
|
||||
# define NODE_INTERSECT qbvh_aligned_node_intersect
|
||||
#endif
|
||||
|
||||
ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
const Ray *ray,
|
||||
Intersection *isect_array,
|
||||
@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
#endif
|
||||
|
||||
ssef tnear(0.0f), tfar(isect_t);
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
#endif
|
||||
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
|
||||
#ifdef __KERNEL_AVX2__
|
||||
float3 P_idir = P*idir;
|
||||
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
#else
|
||||
sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
#endif
|
||||
|
||||
/* Offsets to select the side that becomes the lower or upper bound. */
|
||||
@ -108,22 +118,35 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
#endif
|
||||
|
||||
ssef dist;
|
||||
int traverseChild = qbvh_node_intersect(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
int traverseChild = NODE_INTERSECT(kg,
|
||||
tnear,
|
||||
tfar,
|
||||
#ifdef __KERNEL_AVX2__
|
||||
P_idir4,
|
||||
#else
|
||||
org,
|
||||
P_idir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4,
|
||||
#endif
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
dir4,
|
||||
#endif
|
||||
idir4,
|
||||
near_x, near_y, near_z,
|
||||
far_x, far_y, far_z,
|
||||
nodeAddr,
|
||||
&dist);
|
||||
|
||||
if(traverseChild != 0) {
|
||||
float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
float4 cnodes;
|
||||
#if BVH_FEATURE(BVH_HAIR)
|
||||
if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
|
||||
}
|
||||
|
||||
/* One child is hit, continue with that child. */
|
||||
int r = __bscf(traverseChild);
|
||||
@ -330,12 +353,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(isect_t);
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
num_hits_in_instance = 0;
|
||||
isect_array->t = isect_t;
|
||||
@ -389,13 +417,18 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
|
||||
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
|
||||
tfar = ssef(isect_t);
|
||||
# if BVH_FEATURE(BVH_HAIR)
|
||||
dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
|
||||
# endif
|
||||
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
|
||||
# ifdef __KERNEL_AVX2__
|
||||
P_idir = P*idir;
|
||||
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
|
||||
# else
|
||||
org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
|
||||
org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
|
||||
# endif
|
||||
|
||||
triangle_intersect_precalc(dir, &isect_precalc);
|
||||
isect_t = tmax;
|
||||
isect_array->t = isect_t;
|
||||
@ -409,3 +442,5 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
|
||||
|
||||
return num_hits;
|
||||
}
|
||||
|
||||
#undef NODE_INTERSECT
|
||||
|
Loading…
Reference in New Issue
Block a user