diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 3c5a10540d5..4094e173da9 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -71,6 +71,7 @@ __device__ half __float2half(const float f) #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ +#define ccl_loop_no_unroll /* TODO(sergey): In theory we might use references with CUDA, however * performance impact yet to be investigated. */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 4963f1cd196..35dc95ca10d 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -43,6 +43,7 @@ #define ccl_local __local #define ccl_local_param __local #define ccl_private __private +#define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1))) #define ccl_restrict restrict #define ccl_ref #define ccl_align(n) __attribute__((aligned(n))) diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/kernel_compat_optix.h index 7068acc3a32..970f5cf864c 100644 --- a/intern/cycles/kernel/kernel_compat_optix.h +++ b/intern/cycles/kernel/kernel_compat_optix.h @@ -70,6 +70,7 @@ __device__ half __float2half(const float f) #define ccl_private #define ccl_may_alias #define ccl_addr_space +#define ccl_loop_no_unroll #define ccl_restrict __restrict__ #define ccl_ref #define ccl_align(n) __align__(n) diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h index 2ad22592eef..f0fc0068fa2 100644 --- a/intern/cycles/kernel/svm/svm_voronoi.h +++ b/intern/cycles/kernel/svm/svm_voronoi.h @@ -684,7 +684,8 @@ ccl_device void voronoi_f1_4d(float4 coord, float4 targetPosition = make_float4(0.0f, 0.0f, 0.0f, 0.0f); for (int u = -1; u <= 1; u++) { for (int k = -1; k <= 1; k++) { - for (int j = -1; j <= 1; j++) { + ccl_loop_no_unroll for (int j = -1; j <= 1; j++) + { for (int i = -1; i <= 1; i++) { float4 cellOffset = make_float4(i, j, k, u); float4 pointPosition = cellOffset + @@ -722,7 +723,8 @@ ccl_device void voronoi_smooth_f1_4d(float4 coord, float4 smoothPosition = make_float4(0.0f, 0.0f, 0.0f, 0.0f); for (int u = -2; u <= 2; u++) { for (int k = -2; k <= 2; k++) { - for (int j = -2; j <= 2; j++) { + ccl_loop_no_unroll for (int j = -2; j <= 2; j++) + { for (int i = -2; i <= 2; i++) { float4 cellOffset = make_float4(i, j, k, u); float4 pointPosition = cellOffset + @@ -765,7 +767,8 @@ ccl_device void voronoi_f2_4d(float4 coord, float4 positionF2 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); for (int u = -1; u <= 1; u++) { for (int k = -1; k <= 1; k++) { - for (int j = -1; j <= 1; j++) { + ccl_loop_no_unroll for (int j = -1; j <= 1; j++) + { for (int i = -1; i <= 1; i++) { float4 cellOffset = make_float4(i, j, k, u); float4 pointPosition = cellOffset + @@ -803,7 +806,8 @@ ccl_device void voronoi_distance_to_edge_4d(float4 coord, float randomness, floa float minDistance = 8.0f; for (int u = -1; u <= 1; u++) { for (int k = -1; k <= 1; k++) { - for (int j = -1; j <= 1; j++) { + ccl_loop_no_unroll for (int j = -1; j <= 1; j++) + { for (int i = -1; i <= 1; i++) { float4 cellOffset = make_float4(i, j, k, u); float4 vectorToPoint = cellOffset + @@ -822,7 +826,8 @@ ccl_device void voronoi_distance_to_edge_4d(float4 coord, float randomness, floa minDistance = 8.0f; for (int u = -1; u <= 1; u++) { for (int k = -1; k <= 1; k++) { - for (int j = -1; j <= 1; j++) { + ccl_loop_no_unroll for (int j = -1; j <= 1; j++) + { for (int i = -1; i <= 1; i++) { float4 cellOffset = make_float4(i, j, k, u); float4 vectorToPoint = cellOffset + @@ -851,7 +856,8 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float float minDistance = 8.0f; for (int u = -1; u <= 1; u++) { for (int k = -1; k <= 1; k++) { - for (int j = -1; j <= 1; j++) { + ccl_loop_no_unroll for (int j = -1; j <= 1; j++) + { for (int i = -1; i <= 1; i++) { float4 cellOffset = make_float4(i, j, k, u); float4 pointPosition = cellOffset + @@ -871,7 +877,8 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float float4 closestPointToClosestPoint = make_float4(0.0f, 0.0f, 0.0f, 0.0f); for (int u = -1; u <= 1; u++) { for (int k = -1; k <= 1; k++) { - for (int j = -1; j <= 1; j++) { + ccl_loop_no_unroll for (int j = -1; j <= 1; j++) + { for (int i = -1; i <= 1; i++) { if (i == 0 && j == 0 && k == 0 && u == 0) { continue; diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h index 24a20a969ab..e8e414587fb 100644 --- a/intern/cycles/util/util_defines.h +++ b/intern/cycles/util/util_defines.h @@ -45,6 +45,7 @@ # define ccl_restrict __restrict # define ccl_ref & # define ccl_optional_struct_init +# define ccl_loop_no_unroll # define __KERNEL_WITH_SSE_ALIGN__ # if defined(_WIN32) && !defined(FREE_WINDOWS)