forked from bartvdbraak/blender
Cycles: Exploit non-uniform threadgroup sizes on Metal
This patch replaces `dispatchThreadgroups` with `dispatchThreads` which takes care of non-uniform threadgroup bounds. This allows us to remove the bounds guards in the integrator kernel entry points. Pull Request: https://projects.blender.org/blender/blender/pulls/106217
This commit is contained in:
parent
8eb9d5342f
commit
5f61eca7af
@ -590,11 +590,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
||||
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
|
||||
}
|
||||
|
||||
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
|
||||
divide_up(work_size, num_threads_per_block), 1, 1);
|
||||
MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1);
|
||||
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
|
||||
[mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
|
||||
threadsPerThreadgroup:size_threads_per_threadgroup];
|
||||
[mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
|
||||
threadsPerThreadgroup:size_threads_per_threadgroup];
|
||||
|
||||
[mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
|
||||
NSString *kernel_name = metal_kernel_pso->function.label;
|
||||
|
@ -91,6 +91,7 @@
|
||||
#define ccl_gpu_kernel_postfix
|
||||
|
||||
#define ccl_gpu_kernel_call(x) x
|
||||
#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
|
||||
|
||||
/* Define a function object where "func" is the lambda body, and additional parameters are used to
|
||||
* specify captured state */
|
||||
|
@ -136,7 +136,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
|
||||
}
|
||||
@ -150,7 +150,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
|
||||
}
|
||||
@ -164,7 +164,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
|
||||
}
|
||||
@ -178,7 +178,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
|
||||
}
|
||||
@ -193,7 +193,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
|
||||
}
|
||||
@ -208,7 +208,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
|
||||
}
|
||||
@ -223,7 +223,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
|
||||
}
|
||||
@ -238,7 +238,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
|
||||
}
|
||||
@ -257,7 +257,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
|
||||
#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
|
||||
@ -281,7 +281,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer));
|
||||
}
|
||||
@ -296,7 +296,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
|
||||
ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
|
||||
}
|
||||
@ -492,7 +492,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int from_state = active_terminated_states[active_states_offset + global_index];
|
||||
const int to_state = active_terminated_states[terminated_states_offset + global_index];
|
||||
|
||||
@ -526,7 +526,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
|
||||
{
|
||||
const int global_index = ccl_gpu_global_id_x();
|
||||
|
||||
if (global_index < work_size) {
|
||||
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
|
||||
const int from_state = active_terminated_states[active_states_offset + global_index];
|
||||
const int to_state = active_terminated_states[terminated_states_offset + global_index];
|
||||
|
||||
|
@ -34,6 +34,7 @@
|
||||
#define ccl_gpu_kernel_postfix
|
||||
|
||||
#define ccl_gpu_kernel_call(x) x
|
||||
#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
|
||||
|
||||
/* Define a function object where "func" is the lambda body, and additional parameters are used to
|
||||
* specify captured state */
|
||||
|
@ -143,6 +143,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
|
||||
|
||||
#define ccl_gpu_kernel_postfix
|
||||
#define ccl_gpu_kernel_call(x) context.x
|
||||
#define ccl_gpu_kernel_within_bounds(i,n) true
|
||||
|
||||
/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */
|
||||
#define ccl_gpu_kernel_lambda(func, ...) \
|
||||
|
@ -101,6 +101,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
|
||||
#endif
|
||||
|
||||
#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
|
||||
#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
|
||||
|
||||
#define ccl_gpu_kernel_lambda(func, ...) \
|
||||
struct KernelLambda \
|
||||
|
Loading…
Reference in New Issue
Block a user