diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 0792e20296f..800639691dc 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -590,11 +590,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0]; } - MTLSize size_threadgroups_per_dispatch = MTLSizeMake( - divide_up(work_size, num_threads_per_block), 1, 1); + MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1); MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1); - [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch - threadsPerThreadgroup:size_threads_per_threadgroup]; + [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch + threadsPerThreadgroup:size_threads_per_threadgroup]; [mtlCommandBuffer_ addCompletedHandler:^(id command_buffer) { NSString *kernel_name = metal_kernel_pso->function.label; diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h index 88149e92ec9..2dbe69cbf01 100644 --- a/intern/cycles/kernel/device/cuda/config.h +++ b/intern/cycles/kernel/device/cuda/config.h @@ -91,6 +91,7 @@ #define ccl_gpu_kernel_postfix #define ccl_gpu_kernel_call(x) x +#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n)) /* Define a function object where "func" is the lambda body, and additional parameters are used to * specify captured state */ diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 0c2fd76fcbd..d31d8c46d4a 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -136,7 +136,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = 
(path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer)); } @@ -150,7 +150,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state)); } @@ -164,7 +164,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state)); } @@ -178,7 +178,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state)); } @@ -193,7 +193,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer)); } @@ -208,7 +208,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer)); } @@ -223,7 +223,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer)); } @@ -238,7 +238,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer)); } @@ -257,7 +257,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) @@ -281,7 +281,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer)); } @@ -296,7 +296,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer)); } @@ -492,7 +492,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; @@ -526,7 +526,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) { const int global_index = ccl_gpu_global_id_x(); - if (global_index < work_size) { + if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h index c7e7306d628..6b1a9464b34 100644 --- a/intern/cycles/kernel/device/hip/config.h +++ b/intern/cycles/kernel/device/hip/config.h @@ -34,6 +34,7 @@ #define ccl_gpu_kernel_postfix #define ccl_gpu_kernel_call(x) x +#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n)) /* Define a function object where "func" is the lambda body, and additional parameters are used to * specify captured state */ diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 27fca24b92c..ea6cb9be10d 100644 --- 
a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -143,6 +143,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \ #define ccl_gpu_kernel_postfix #define ccl_gpu_kernel_call(x) context.x +#define ccl_gpu_kernel_within_bounds(i, n) true /* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */ #define ccl_gpu_kernel_lambda(func, ...) \ diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index e7b73306962..b113faf9761 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -101,6 +101,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ #endif #define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x +#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n)) #define ccl_gpu_kernel_lambda(func, ...) \ struct KernelLambda \