From bf11e362c5418ec40dc0437d329efceb225eb5ef Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Wed, 8 Apr 2015 18:44:31 +0500 Subject: [PATCH] Fix T44046: Cycles speed regression in 2.74 (CPU only) The issue was caused by MSVC not being able to optimize some code out in the same way as GCC/Clang do, so those parts of the code are now explicitly unfolded in order to help compilers out. This makes the speed loss much less drastic on my laptop. That's probably as good as we can do with MSVC without investing an infinite amount of time trying to work around the optimizer. --- .../kernel/geom/geom_triangle_intersect.h | 56 ++++++++----------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index a0e36058bf8..3990bae5478 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -110,14 +110,12 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. 
*/ - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2)); - - const float3 A = tri[0] - P; - const float3 B = tri[1] - P; - const float3 C = tri[2] - P; + const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); + const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); + const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); + const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); @@ -205,14 +203,12 @@ ccl_device_inline void triangle_intersect_subsurface( const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. 
*/ - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2)); - - const float3 A = tri[0] - P; - const float3 B = tri[1] - P; - const float3 C = tri[2] - P; + const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); + const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); + const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); + const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); @@ -322,14 +318,12 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, P = P + D*t; - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2)); - - float3 edge1 = tri[0] - tri[2]; - float3 edge2 = tri[1] - tri[2]; - float3 tvec = P - tri[2]; + const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2); + float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); + float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); + float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); float3 qvec = cross(tvec, edge1); float3 pvec = cross(D, edge2); float rt = dot(edge2, qvec) / dot(edge1, pvec); 
@@ -381,14 +375,12 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, P = P + D*t; - float3 tri[3]; - tri[0] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0)); - tri[1] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1)); - tri[2] = float4_to_float3(kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2)); - - float3 edge1 = tri[0] - tri[2]; - float3 edge2 = tri[1] - tri[2]; - float3 tvec = P - tri[2]; + const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0), + tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1), + tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2); + float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); + float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); + float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); float3 qvec = cross(tvec, edge1); float3 pvec = cross(D, edge2); float rt = dot(edge2, qvec) / dot(edge1, pvec);