Cycles: Exploit non-uniform threadgroup sizes on Metal

This patch replaces `dispatchThreadgroups` with `dispatchThreads`, which handles non-uniform threadgroup sizes. This lets the bounds guards in the integrator kernel entry points compile away on Metal: they now go through the new `ccl_gpu_kernel_within_bounds` macro, which expands to `true` on Metal and to a real bounds check on the other GPU backends.

Pull Request: https://projects.blender.org/blender/blender/pulls/106217
Authored by Michael Jones on 2023-03-29 21:46:11 +02:00; committed by Michael Jones (Apple)
parent 8eb9d5342f
commit 5f61eca7af
6 changed files with 20 additions and 17 deletions

@@ -590,11 +590,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
     [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
   }
-  MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
-      divide_up(work_size, num_threads_per_block), 1, 1);
+  MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1);
   MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
-  [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
-                           threadsPerThreadgroup:size_threads_per_threadgroup];
+  [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
+                      threadsPerThreadgroup:size_threads_per_threadgroup];
   [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
     NSString *kernel_name = metal_kernel_pso->function.label;
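
The hunk above is the core of the change. As a rough Objective-C++ sketch of the difference (not taken from the patch; `dispatch_1d`, `encoder`, `pipeline`, and `work_size` are illustrative names): `dispatchThreadgroups:` launches whole threadgroups, so the grid must be rounded up and the kernel has to guard against the padding threads, whereas `dispatchThreads:` takes an exact thread count and lets Metal form a non-uniform final threadgroup.

#import <Metal/Metal.h>

/* Illustrative helper: dispatch `work_size` threads in one dimension.
 * The compute pipeline is assumed to already be set on `encoder`. */
static void dispatch_1d(id<MTLComputeCommandEncoder> encoder,
                        id<MTLComputePipelineState> pipeline,
                        NSUInteger work_size,
                        bool use_non_uniform)
{
  const NSUInteger threads_per_group = pipeline.maxTotalThreadsPerThreadgroup;
  const MTLSize group_size = MTLSizeMake(threads_per_group, 1, 1);

  if (use_non_uniform) {
    /* Exact thread count: Metal clips the final threadgroup, so the kernel
     * does not need an explicit bounds check. */
    [encoder dispatchThreads:MTLSizeMake(work_size, 1, 1)
        threadsPerThreadgroup:group_size];
  }
  else {
    /* Round up to whole threadgroups: threads beyond `work_size` still run,
     * so the kernel must compare its global id against `work_size`. */
    const NSUInteger num_groups = (work_size + threads_per_group - 1) / threads_per_group;
    [encoder dispatchThreadgroups:MTLSizeMake(num_groups, 1, 1)
            threadsPerThreadgroup:group_size];
  }
}

Note that `dispatchThreads:` requires GPU support for non-uniform threadgroup sizes, while `dispatchThreadgroups:` works everywhere, which is why the guard pattern stays in the shared kernel source for the other backends.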

@@ -91,6 +91,7 @@
 #define ccl_gpu_kernel_postfix
 #define ccl_gpu_kernel_call(x) x
+#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
 /* Define a function object where "func" is the lambda body, and additional parameters are used to
  * specify captured state */

@@ -136,7 +136,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
   }
@@ -150,7 +150,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
   }
@@ -164,7 +164,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
   }
@@ -178,7 +178,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
   }
@@ -193,7 +193,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
   }
@@ -208,7 +208,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
   }
@@ -223,7 +223,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
   }
@@ -238,7 +238,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
   }
@@ -257,7 +257,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
 #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
@@ -281,7 +281,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer));
   }
@@ -296,7 +296,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
     ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
   }
@@ -492,7 +492,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];
@@ -526,7 +526,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
 {
   const int global_index = ccl_gpu_global_id_x();
-  if (global_index < work_size) {
+  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];

@@ -34,6 +34,7 @@
 #define ccl_gpu_kernel_postfix
 #define ccl_gpu_kernel_call(x) x
+#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
 /* Define a function object where "func" is the lambda body, and additional parameters are used to
  * specify captured state */

@@ -143,6 +143,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 #define ccl_gpu_kernel_postfix
 #define ccl_gpu_kernel_call(x) context.x
+#define ccl_gpu_kernel_within_bounds(i, n) true
 /* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */
 #define ccl_gpu_kernel_lambda(func, ...) \

@@ -101,6 +101,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
 #endif
 #define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
+#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
 #define ccl_gpu_kernel_lambda(func, ...) \
   struct KernelLambda \
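
Taken together, the compat-header definitions added above let the same guarded kernel source compile differently per backend: on Metal the guard folds away to `if (true)`, while the other GPU backends keep a real bounds check because their grids are still padded to whole thread blocks. A minimal standalone C++ sketch of that mechanism (the `DEMO_METAL_LIKE` switch and `process_item` are illustrative, not Cycles code):

#include <cstdio>

/* Mirror of the two kinds of definitions added in the compat headers. */
#ifdef DEMO_METAL_LIKE
#  define ccl_gpu_kernel_within_bounds(i, n) true /* grid is exact; no padding threads */
#else
#  define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n)) /* grid padded to whole blocks */
#endif

/* Shared "kernel body": written once, guarded only where the backend needs it. */
static void process_item(int global_index, int work_size)
{
  if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
    std::printf("processing %d of %d\n", global_index, work_size);
  }
}

int main()
{
  /* Simulate a padded launch of 4 threads for a work size of 3. */
  for (int i = 0; i < 4; i++) {
    process_item(i, 3); /* without DEMO_METAL_LIKE, index 3 is filtered out */
  }
  return 0;
}

Building this sketch with -DDEMO_METAL_LIKE processes all four indices, mirroring a launch where the runtime itself guarantees the exact thread count; the default build filters out the padding index, as the CUDA-style backends must.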