Cycles: more closely match some math and intersection operations in Embree

This helps with debugging, and gives a slightly closer match between CPU and CUDA/HIP/Metal renders when it comes to ray tracing precision.
2022-07-21 15:49:00 +02:00 · 2022-07-21 15:49:00 +02:00 · 023eb2ea7c
commit 023eb2ea7c
parent d567785658
8 changed files with 77 additions and 76 deletions
--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@ -55,7 +55,7 @@ static bool ObtainCacheParticleData(
    return false;

  Transform tfm = get_transform(b_ob->matrix_world());
-  Transform itfm = transform_quick_inverse(tfm);
+  Transform itfm = transform_inverse(tfm);

  for (BL::Modifier &b_mod : b_ob->modifiers) {
    if ((b_mod.type() == b_mod.type_PARTICLE_SYSTEM) &&
--- a/intern/cycles/kernel/geom/object.h
+++ b/intern/cycles/kernel/geom/object.h
@ -86,7 +86,7 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals kg,
    Transform tfm = object_fetch_transform_motion(kg, object, time);

    if (itfm)
-      *itfm = transform_quick_inverse(tfm);
+      *itfm = transform_inverse(tfm);

    return tfm;
  }
--- a/intern/cycles/kernel/geom/shader_data.h
+++ b/intern/cycles/kernel/geom/shader_data.h
@ -18,7 +18,7 @@ ccl_device void shader_setup_object_transforms(KernelGlobals kg,
 {
  if (sd->object_flag & SD_OBJECT_MOTION) {
    sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time);
-    sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion);
+    sd->ob_itfm_motion = transform_inverse(sd->ob_tfm_motion);
  }
 }
 #endif
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@ -511,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
  return make_float4(a.x, a.y, a.z, 1.0f);
 }

+ccl_device_inline float4 float3_to_float4(const float3 a, const float w)
+{
+  return make_float4(a.x, a.y, a.z, w);
+}
+
 ccl_device_inline float inverse_lerp(float a, float b, float x)
 {
  return (x - a) / (b - a);
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@ -147,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)

 ccl_device_inline float3 operator/(const float3 &a, const float f)
 {
-  float invf = 1.0f / f;
-  return a * invf;
+#  if defined(__KERNEL_SSE__)
+  return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
+#  else
+  return make_float3(a.x / f, a.y / f, a.z / f);
+#  endif
 }

 ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
@ -284,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b)

 ccl_device_inline float3 cross(const float3 &a, const float3 &b)
 {
-  float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
-  return r;
+#  ifdef __KERNEL_SSE__
+  return float3(shuffle<1, 2, 0, 3>(
+      msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+#  else
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#  endif
 }

 ccl_device_inline float3 normalize(const float3 &a)
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@ -105,10 +105,10 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
  return false;
 }

-ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
-                                                   float3 ray_dir,
-                                                   float ray_tmin,
-                                                   float ray_tmax,
+ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
+                                                   const float3 ray_D,
+                                                   const float ray_tmin,
+                                                   const float ray_tmax,
                                                   const float3 tri_a,
                                                   const float3 tri_b,
                                                   const float3 tri_c,
@ -116,14 +116,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
                                                   ccl_private float *isect_v,
                                                   ccl_private float *isect_t)
 {
-#define dot3(a, b) dot(a, b)
-  const float3 P = ray_P;
-  const float3 dir = ray_dir;
+  /* This implementation matches the Plücker coordinates triangle intersection
+   * in Embree. */

  /* Calculate vertices relative to ray origin. */
-  const float3 v0 = tri_c - P;
-  const float3 v1 = tri_a - P;
-  const float3 v2 = tri_b - P;
+  const float3 v0 = tri_c - ray_P;
+  const float3 v1 = tri_a - ray_P;
+  const float3 v2 = tri_b - ray_P;

  /* Calculate triangle edges. */
  const float3 e0 = v2 - v0;
@ -131,29 +130,29 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
  const float3 e2 = v1 - v2;

  /* Perform edge tests. */
-  const float U = dot(cross(v2 + v0, e0), ray_dir);
-  const float V = dot(cross(v0 + v1, e1), ray_dir);
-  const float W = dot(cross(v1 + v2, e2), ray_dir);
+  const float U = dot(cross(v2 + v0, e0), ray_D);
+  const float V = dot(cross(v0 + v1, e1), ray_D);
+  const float W = dot(cross(v1 + v2, e2), ray_D);

+  const float eps = FLT_EPSILON * fabsf(U + V + W);
  const float minUVW = min(U, min(V, W));
  const float maxUVW = max(U, max(V, W));

-  if (minUVW < 0.0f && maxUVW > 0.0f) {
+  if (!(minUVW >= -eps || maxUVW <= eps)) {
    return false;
  }

  /* Calculate geometry normal and denominator. */
  const float3 Ng1 = cross(e1, e0);
-  // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
  const float3 Ng = Ng1 + Ng1;
-  const float den = dot3(Ng, dir);
+  const float den = dot(Ng, ray_D);
  /* Avoid division by 0. */
  if (UNLIKELY(den == 0.0f)) {
    return false;
  }

  /* Perform depth test. */
-  const float T = dot3(v0, Ng);
+  const float T = dot(v0, Ng);
  const float t = T / den;
  if (!(t >= ray_tmin && t <= ray_tmax)) {
    return false;
@ -163,8 +162,6 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
  *isect_v = V / den;
  *isect_t = t;
  return true;
-
-#undef dot3
 }

 /* Tests for an intersection between a ray and a quad defined by
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@ -99,15 +99,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
  memcpy(M, &tfm, sizeof(M));

  if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-    /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
-     * never be in this situation, but try to invert it anyway with tweak */
-    M[0][0] += 1e-8f;
-    M[1][1] += 1e-8f;
-    M[2][2] += 1e-8f;
-
-    if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-      return projection_identity();
-    }
+    return projection_identity();
  }

  memcpy(&tfmR, R, sizeof(R));
@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
  return tfmR;
 }

-Transform transform_inverse(const Transform &tfm)
-{
-  ProjectionTransform projection(tfm);
-  return projection_to_transform(projection_inverse(projection));
-}
-
 Transform transform_transposed_inverse(const Transform &tfm)
 {
-  ProjectionTransform projection(tfm);
-  ProjectionTransform iprojection = projection_inverse(projection);
+  ProjectionTransform iprojection(transform_inverse(tfm));
  return projection_to_transform(projection_transpose(iprojection));
 }

--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@ -63,10 +63,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f

  _MM_TRANSPOSE4_PS(x, y, z, w);

-  ssef tmp = shuffle<0>(aa) * x;
-  tmp = madd(shuffle<1>(aa), y, tmp);
+  ssef tmp = w;
  tmp = madd(shuffle<2>(aa), z, tmp);
-  tmp += w;
+  tmp = madd(shuffle<1>(aa), y, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);

  return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@ -93,9 +93,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con

  _MM_TRANSPOSE4_PS(x, y, z, w);

-  ssef tmp = shuffle<0>(aa) * x;
+  ssef tmp = shuffle<2>(aa) * z;
  tmp = madd(shuffle<1>(aa), y, tmp);
-  tmp = madd(shuffle<2>(aa), z, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);

  return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@ -312,7 +312,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val
  t->z[column] = value.z;
 }

-Transform transform_inverse(const Transform &a);
 Transform transform_transposed_inverse(const Transform &a);

 ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale)
@ -392,39 +391,47 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 #endif /* defined(__KERNEL_GPU_RAYTRACING__) */
 }

-ccl_device_inline Transform transform_quick_inverse(Transform M)
+ccl_device_inline Transform transform_inverse(const Transform tfm)
 {
-  /* possible optimization: can we avoid doing this altogether and construct
-   * the inverse matrix directly from negated translation, transposed rotation,
-   * scale can be inverted but what about shearing? */
-  Transform R;
-  float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-              M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+  /* This implementation matches the one in Embree exactly, to ensure consistent
+   * results with the ray intersection of instances. */
+  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+  /* Compute determinant. */
+  float det = dot(x, cross(y, z));
+
  if (det == 0.0f) {
-    M.x.x += 1e-8f;
-    M.y.y += 1e-8f;
-    M.z.z += 1e-8f;
-    det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-          M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+     * never be in this situation, but try to invert it anyway with tweak.
+     *
+     * This logic does not match Embree which would just give an invalid
+     * matrix. A better solution would be to remove this and ensure any object
+     * matrix is valid. */
+    x.x += 1e-8f;
+    y.y += 1e-8f;
+    z.z += 1e-8f;
+
+    det = dot(x, cross(y, z));
+    if (det == 0.0f) {
+      det = FLT_MAX;
+    }
  }
-  det = (det != 0.0f) ? 1.0f / det : 0.0f;

-  float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z,
-                                M.z.y * M.x.z - M.z.z * M.x.y,
-                                M.y.z * M.x.y - M.y.y * M.x.z);
-  float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x,
-                                M.z.z * M.x.x - M.z.x * M.x.z,
-                                M.y.x * M.x.z - M.y.z * M.x.x);
-  float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y,
-                                M.z.x * M.x.y - M.z.y * M.x.x,
-                                M.y.y * M.x.x - M.y.x * M.x.y);
-  float3 T = -make_float3(M.x.w, M.y.w, M.z.w);
+  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+  const float3 inverse_x = cross(y, z) / det;
+  const float3 inverse_y = cross(z, x) / det;
+  const float3 inverse_z = cross(x, y) / det;

-  R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T));
-  R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T));
-  R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T));
+  /* Compute translation and fill transform. */
+  Transform itfm;
+  itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
+  itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
+  itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));

-  return R;
+  return itfm;
 }

 ccl_device_inline void transform_compose(ccl_private Transform *tfm,