blender/intern/cycles/kernel/kernel_work_stealing.h

/*
* Copyright 2011-2015 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __KERNEL_WORK_STEALING_H__
#define __KERNEL_WORK_STEALING_H__

/* Utility functions for work stealing. */

#ifdef __WORK_STEALING__

#ifdef __KERNEL_OPENCL__
#  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif

/* Map a linear ray index to the id of the work group that owns it,
 * along the requested dimension (dim 0 = x, dim 1 = y).
 */
uint get_group_id_with_ray_index(uint ray_index,
                                 uint tile_dim_x,
                                 uint tile_dim_y,
                                 uint parallel_samples,
                                 int dim)
{
    if(dim == 0) {
        uint x_span = ray_index % (tile_dim_x * parallel_samples);
        return x_span / get_local_size(0);
    }
    else /*if(dim == 1)*/ {
        kernel_assert(dim == 1);
        uint y_span = ray_index / (tile_dim_x * parallel_samples);
        return y_span / get_local_size(1);
    }
}
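
/* Worked example (illustrative numbers, not taken from the kernel):
 * with get_local_size(0) == 16, tile_dim_x == 64 and
 * parallel_samples == 1, ray_index == 300 gives
 * x_span = 300 % 64 = 44, so the ray belongs to work-group
 * 44 / 16 = 2 along x.
 */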

/* Compute how many work units (border-clamped pixels times samples)
 * the work group owning this thread is responsible for.
 */
uint get_total_work(uint tile_dim_x,
                    uint tile_dim_y,
                    uint grp_idx,
                    uint grp_idy,
                    uint num_samples)
{
    /* The last work group along each axis may extend past the tile
     * border; clamp its thread count to the pixels actually inside.
     */
    uint threads_within_tile_border_x =
        (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
                                             : get_local_size(0);
    uint threads_within_tile_border_y =
        (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
                                             : get_local_size(1);

    /* A remainder of 0 means the tile dimension is an exact multiple
     * of the work-group size, so the full group lies inside the tile.
     */
    threads_within_tile_border_x =
        (threads_within_tile_border_x == 0) ? get_local_size(0)
                                            : threads_within_tile_border_x;
    threads_within_tile_border_y =
        (threads_within_tile_border_y == 0) ? get_local_size(1)
                                            : threads_within_tile_border_y;

    return threads_within_tile_border_x *
           threads_within_tile_border_y *
           num_samples;
}
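
/* Worked example (illustrative numbers): for tile_dim_x == 70 and
 * get_local_size(0) == 16, the last group along x covers only
 * 70 % 16 == 6 columns; for tile_dim_x == 64 the remainder is 0 and
 * the clamp above restores the full 16. With 16 rows and
 * num_samples == 4, a full interior group owns 16 * 16 * 4 == 1024
 * work units.
 */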

/* Atomically grab the next work unit from this work group's pool.
 * Returns 1 and stores the unit in *my_work if work is available,
 * 0 when the pool for this group is exhausted.
 */
int get_next_work(ccl_global uint *work_pool,
                  ccl_private uint *my_work,
                  uint tile_dim_x,
                  uint tile_dim_y,
                  uint num_samples,
                  uint parallel_samples,
                  uint ray_index)
{
    uint grp_idx = get_group_id_with_ray_index(ray_index,
                                               tile_dim_x,
                                               tile_dim_y,
                                               parallel_samples,
                                               0);
    uint grp_idy = get_group_id_with_ray_index(ray_index,
                                               tile_dim_x,
                                               tile_dim_y,
                                               parallel_samples,
                                               1);
    uint total_work = get_total_work(tile_dim_x,
                                     tile_dim_y,
                                     grp_idx,
                                     grp_idy,
                                     num_samples);
    uint group_index = grp_idy * get_num_groups(0) + grp_idx;
    /* Each group owns one counter in work_pool; the value returned by
     * the atomic pre-increment is this thread's work unit.
     */
    *my_work = atomic_inc(&work_pool[group_index]);
    return (*my_work < total_work) ? 1 : 0;
}
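
/* A minimal sketch of the intended call pattern, assuming a kernel
 * that knows its own ray_index. process_work() is a hypothetical
 * placeholder, not part of this header:
 *
 *   uint my_work;
 *   while(get_next_work(work_pool, &my_work, tile_dim_x, tile_dim_y,
 *                       num_samples, parallel_samples, ray_index))
 *   {
 *       process_work(my_work);
 *   }
 *
 * Threads in the same work group keep pulling units from the shared
 * counter until the group's pool is exhausted.
 */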

/* Decode the sample number from an assigned work unit.
 * Assumes the passed my_work is valid, i.e. get_next_work() returned 1.
 */
uint get_my_sample(uint my_work,
                   uint tile_dim_x,
                   uint tile_dim_y,
                   uint parallel_samples,
                   uint ray_index)
{
    uint grp_idx = get_group_id_with_ray_index(ray_index,
                                               tile_dim_x,
                                               tile_dim_y,
                                               parallel_samples,
                                               0);
    uint grp_idy = get_group_id_with_ray_index(ray_index,
                                               tile_dim_x,
                                               tile_dim_y,
                                               parallel_samples,
                                               1);
    /* Same border clamping as in get_total_work(). */
    uint threads_within_tile_border_x =
        (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
                                             : get_local_size(0);
    uint threads_within_tile_border_y =
        (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
                                             : get_local_size(1);
    threads_within_tile_border_x =
        (threads_within_tile_border_x == 0) ? get_local_size(0)
                                            : threads_within_tile_border_x;
    threads_within_tile_border_y =
        (threads_within_tile_border_y == 0) ? get_local_size(1)
                                            : threads_within_tile_border_y;

    /* Work units are laid out pixel-major within a group, so integer
     * division by the group's pixel count yields the sample index.
     */
    return my_work /
           (threads_within_tile_border_x * threads_within_tile_border_y);
}
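
/* Worked example (illustrative numbers): a full 16x16 interior group
 * owns 256 pixels per sample, so my_work == 300 decodes to sample
 * 300 / 256 == 1, i.e. the second pass over the group's pixels.
 */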

/* Decode the pixel and tile position from an assigned work unit.
 * Assumes the passed my_work is valid.
 */
void get_pixel_tile_position(ccl_private uint *pixel_x,
                             ccl_private uint *pixel_y,
                             ccl_private uint *tile_x,
                             ccl_private uint *tile_y,
                             uint my_work,
                             uint tile_dim_x,
                             uint tile_dim_y,
                             uint tile_offset_x,
                             uint tile_offset_y,
                             uint parallel_samples,
                             uint ray_index)
{
    uint grp_idx = get_group_id_with_ray_index(ray_index,
                                               tile_dim_x,
                                               tile_dim_y,
                                               parallel_samples,
                                               0);
    uint grp_idy = get_group_id_with_ray_index(ray_index,
                                               tile_dim_x,
                                               tile_dim_y,
                                               parallel_samples,
                                               1);
    /* Same border clamping as in get_total_work(). */
    uint threads_within_tile_border_x =
        (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
                                             : get_local_size(0);
    uint threads_within_tile_border_y =
        (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
                                             : get_local_size(1);
    threads_within_tile_border_x =
        (threads_within_tile_border_x == 0) ? get_local_size(0)
                                            : threads_within_tile_border_x;
    threads_within_tile_border_y =
        (threads_within_tile_border_y == 0) ? get_local_size(1)
                                            : threads_within_tile_border_y;

    /* Position of the work unit within its group's pixel block. */
    uint total_associated_pixels =
        threads_within_tile_border_x * threads_within_tile_border_y;
    uint work_group_pixel_index = my_work % total_associated_pixels;
    uint work_group_pixel_x =
        work_group_pixel_index % threads_within_tile_border_x;
    uint work_group_pixel_y =
        work_group_pixel_index / threads_within_tile_border_x;

    /* Absolute pixel position, then tile-relative position. */
    *pixel_x =
        tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
    *pixel_y =
        tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
    *tile_x = *pixel_x - tile_offset_x;
    *tile_y = *pixel_y - tile_offset_y;
}
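
/* Worked example (illustrative numbers, continuing the 16x16 interior
 * group above): my_work == 300 gives work_group_pixel_index =
 * 300 % 256 == 44, i.e. local pixel (44 % 16, 44 / 16) == (12, 2),
 * which is then offset by the group's origin within the tile and by
 * the tile offset to obtain the absolute pixel coordinates.
 */
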
#endif /* __WORK_STEALING__ */
#endif /* __KERNEL_WORK_STEALING_H__ */