Cycles: Fix speed regression on GPU

Avoid constructing a temporary array and make the utility function force-inlined.
Additionally, avoid calling float4_to_float3() twice.

This brings render times back to the same values as before the current patch series.
This commit is contained in:
Sergey Sharybin 2017-03-23 17:15:54 +01:00
parent 2a5d7b5b1e
commit a1348dde2e
2 changed files with 28 additions and 26 deletions

@ -51,19 +51,22 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
const ssef *verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
#else #else
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
const float3 verts[3] = {float4_to_float3(tri_a),
float4_to_float3(tri_b),
float4_to_float3(tri_c)};
#endif #endif
float t, u, v; float t, u, v;
if(ray_triangle_intersect(isect_precalc, if(ray_triangle_intersect(isect_precalc,
P, isect->t, P, isect->t,
verts, #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
ssef_verts,
#else
float4_to_float3(tri_a),
float4_to_float3(tri_b),
float4_to_float3(tri_c),
#endif
&u, &v, &t)) &u, &v, &t))
{ {
#ifdef __VISIBILITY_FLAG__ #ifdef __VISIBILITY_FLAG__
@ -105,19 +108,22 @@ ccl_device_inline void triangle_intersect_subsurface(
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
const ssef *verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
#else #else
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
const float3 verts[3] = {float4_to_float3(tri_a),
float4_to_float3(tri_b),
float4_to_float3(tri_c)};
#endif #endif
float t, u, v; float t, u, v;
if(!ray_triangle_intersect(isect_precalc, if(!ray_triangle_intersect(isect_precalc,
P, tmax, P, tmax,
verts, #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
ssef_verts,
#else
tri_a,
tri_b,
tri_c,
#endif
&u, &v, &t)) &u, &v, &t))
{ {
return; return;
@ -156,15 +162,11 @@ ccl_device_inline void triangle_intersect_subsurface(
/* Record geometric normal. */ /* Record geometric normal. */
/* TODO(sergey): Check whether it's faster to re-use ssef verts. */ /* TODO(sergey): Check whether it's faster to re-use ssef verts. */
#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
#endif #endif
/* TODO(sergey): Use float4_to_float3() on just an edges. */ ss_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
const float3 v0 = float4_to_float3(tri_a);
const float3 v1 = float4_to_float3(tri_b);
const float3 v2 = float4_to_float3(tri_c);
ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0));
} }
#endif #endif

@ -153,13 +153,13 @@ void ray_triangle_intersect_precalc(float3 dir,
isect_precalc->kz = kz; isect_precalc->kz = kz;
} }
ccl_device_inline bool ray_triangle_intersect( ccl_device_forceinline bool ray_triangle_intersect(
const TriangleIsectPrecalc *isect_precalc, const TriangleIsectPrecalc *isect_precalc,
float3 ray_P, float ray_t, float3 ray_P, float ray_t,
#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
const ssef *ssef_verts, const ssef *ssef_verts,
#else #else
const float3 *verts, const float3 tri_a, const float3 tri_b, const float3 tri_c,
#endif #endif
float *isect_u, float *isect_v, float *isect_t) float *isect_u, float *isect_v, float *isect_t)
{ {
@ -230,9 +230,9 @@ ccl_device_inline bool ray_triangle_intersect(
} }
#else #else
/* Calculate vertices relative to ray origin. */ /* Calculate vertices relative to ray origin. */
const float3 A = verts[0] - ray_P; const float3 A = make_float3(tri_a.x - ray_P.x, tri_a.y - ray_P.y, tri_a.z - ray_P.z);
const float3 B = verts[1] - ray_P; const float3 B = make_float3(tri_b.x - ray_P.x, tri_b.y - ray_P.y, tri_b.z - ray_P.z);
const float3 C = verts[2] - ray_P; const float3 C = make_float3(tri_c.x - ray_P.x, tri_c.y - ray_P.y, tri_c.z - ray_P.z);
const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);