Cycles: Exploit non-uniform threadgroup sizes on Metal

This patch replaces `dispatchThreadgroups` with `dispatchThreads`, which handles non-uniform threadgroup sizes. This lets the bounds guards in the integrator kernel entry points compile away on Metal: they now go through the new `ccl_gpu_kernel_within_bounds` macro, which expands to `true` on Metal and to a real bounds check on the other GPU backends.

Pull Request: https://projects.blender.org/blender/blender/pulls/106217
Authored by Michael Jones on 2023-03-29 21:46:11 +02:00; committed by Michael Jones (Apple)
parent 8eb9d5342f
commit 5f61eca7af
6 changed files with 20 additions and 17 deletions

@@ -590,11 +590,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
     [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
   }
-  MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
-      divide_up(work_size, num_threads_per_block), 1, 1);
+  MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1);
   MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
-  [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
-                           threadsPerThreadgroup:size_threads_per_threadgroup];
+  [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
+                      threadsPerThreadgroup:size_threads_per_threadgroup];
   [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
     NSString *kernel_name = metal_kernel_pso->function.label;
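
The hunk above is the core of the change. As a rough Objective-C++ sketch of the difference (not taken from the patch; `dispatch_1d`, `encoder`, `pipeline`, and `work_size` are illustrative names): `dispatchThreadgroups:` launches whole threadgroups, so the grid must be rounded up and the kernel has to guard against the padding threads, whereas `dispatchThreads:` takes an exact thread count and lets Metal form a non-uniform final threadgroup.

#import <Metal/Metal.h>

/* Illustrative helper: dispatch `work_size` threads in one dimension.
 * The compute pipeline is assumed to already be set on `encoder`. */
static void dispatch_1d(id<MTLComputeCommandEncoder> encoder,
                        id<MTLComputePipelineState> pipeline,
                        NSUInteger work_size,
                        bool use_non_uniform)
{
  const NSUInteger threads_per_group = pipeline.maxTotalThreadsPerThreadgroup;
  const MTLSize group_size = MTLSizeMake(threads_per_group, 1, 1);

  if (use_non_uniform) {
    /* Exact thread count: Metal clips the final threadgroup, so the kernel
     * does not need an explicit bounds check. */
    [encoder dispatchThreads:MTLSizeMake(work_size, 1, 1)
        threadsPerThreadgroup:group_size];
  }
  else {
    /* Round up to whole threadgroups: threads beyond `work_size` still run,
     * so the kernel must compare its global id against `work_size`. */
    const NSUInteger num_groups = (work_size + threads_per_group - 1) / threads_per_group;
    [encoder dispatchThreadgroups:MTLSizeMake(num_groups, 1, 1)
            threadsPerThreadgroup:group_size];
  }
}

Note that `dispatchThreads:` requires GPU support for non-uniform threadgroup sizes, while `dispatchThreadgroups:` works everywhere, which is why the guard pattern stays in the shared kernel source for the other backends.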

@@ -91,6 +91,7 @@
 #define ccl_gpu_kernel_postfix
 #define ccl_gpu_kernel_call(x) x
+#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
 /* Define a function object where "func" is the lambda body, and additional parameters are used to
  * specify captured state */

@@ -136,7 +136,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
   }
@@ -150,7 +150,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
   }
@@ -164,7 +164,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
   }
@@ -178,7 +178,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
   }
@@ -193,7 +193,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
   }
@@ -208,7 +208,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
   }
@@ -223,7 +223,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
   }
@@ -238,7 +238,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
   }
@@ -257,7 +257,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
 #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
@@ -281,7 +281,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer));
   }
@@ -296,7 +296,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
   }
@@ -492,7 +492,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];
@@ -526,7 +526,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];

@@ -34,6 +34,7 @@
 #define ccl_gpu_kernel_postfix
 #define ccl_gpu_kernel_call(x) x
+#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
 /* Define a function object where "func" is the lambda body, and additional parameters are used to
  * specify captured state */

@@ -143,6 +143,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 #define ccl_gpu_kernel_postfix
 #define ccl_gpu_kernel_call(x) context.x
+#define ccl_gpu_kernel_within_bounds(i, n) true
 /* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */
 #define ccl_gpu_kernel_lambda(func, ...) \

@@ -101,6 +101,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
 #endif
 #define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
+#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
 #define ccl_gpu_kernel_lambda(func, ...) \
   struct KernelLambda \
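
Taken together, the compat-header definitions added above let the same guarded kernel source compile differently per backend: on Metal the guard folds away to `if (true)`, while the other GPU backends keep a real bounds check because their grids are still padded to whole thread blocks. A minimal standalone C++ sketch of that mechanism (the `DEMO_METAL_LIKE` switch and `process_item` are illustrative, not Cycles code):

#include <cstdio>

/* Mirror of the two kinds of definitions added in the compat headers. */
#ifdef DEMO_METAL_LIKE
#  define ccl_gpu_kernel_within_bounds(i, n) true /* grid is exact; no padding threads */
#else
#  define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n)) /* grid padded to whole blocks */
#endif

/* Shared "kernel body": written once, guarded only where the backend needs it. */
static void process_item(int global_index, int work_size)
{
  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    std::printf("processing %d of %d\n", global_index, work_size);
  }
}

int main()
{
  /* Simulate a padded launch of 4 threads for a work size of 3. */
  for (int i = 0; i < 4; i++) {
    process_item(i, 3); /* without DEMO_METAL_LIKE, index 3 is filtered out */
  }
  return 0;
}

Building this sketch with -DDEMO_METAL_LIKE processes all four indices, mirroring a launch where the runtime itself guarantees the exact thread count; the default build filters out the padding index, as the CUDA-style backends must.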