forked from bartvdbraak/blender
Code refactor: add WorkTile struct for passing work to kernel.
This makes sharing some code between mega/split in following commits a bit easier, and also paves the way for rendering multiple tiles later.
This commit is contained in:
parent
660e8e59e7
commit
5b7d6ea54b
@ -1293,8 +1293,6 @@ public:
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
CUfunction cuPathTrace;
|
||||
CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
|
||||
CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
|
||||
|
||||
/* get kernel function */
|
||||
if(branched) {
|
||||
@ -1308,40 +1306,48 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
/* pass in parameters */
|
||||
void *args[] = {&d_buffer,
|
||||
&d_rng_state,
|
||||
&sample,
|
||||
&rtile.x,
|
||||
&rtile.y,
|
||||
&rtile.w,
|
||||
&rtile.h,
|
||||
&rtile.offset,
|
||||
&rtile.stride};
|
||||
|
||||
/* launch kernel */
|
||||
int threads_per_block;
|
||||
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
|
||||
|
||||
/*int num_registers;
|
||||
cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
|
||||
|
||||
printf("threads_per_block %d\n", threads_per_block);
|
||||
printf("num_registers %d\n", num_registers);*/
|
||||
|
||||
int xthreads = (int)sqrt(threads_per_block);
|
||||
int ythreads = (int)sqrt(threads_per_block);
|
||||
int xblocks = (rtile.w + xthreads - 1)/xthreads;
|
||||
int yblocks = (rtile.h + ythreads - 1)/ythreads;
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
/* allocate work tile */
|
||||
device_vector<WorkTile> work_tiles;
|
||||
work_tiles.resize(1);
|
||||
|
||||
WorkTile *wtile = work_tiles.get_data();
|
||||
wtile->x = rtile.x;
|
||||
wtile->y = rtile.y;
|
||||
wtile->w = rtile.w;
|
||||
wtile->h = rtile.h;
|
||||
wtile->offset = rtile.offset;
|
||||
wtile->stride = rtile.stride;
|
||||
wtile->start_sample = sample;
|
||||
wtile->num_samples = 1;
|
||||
wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
|
||||
wtile->rng_state = (uint*)cuda_device_ptr(rtile.rng_state);
|
||||
|
||||
mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
|
||||
mem_copy_to(work_tiles);
|
||||
|
||||
CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
|
||||
|
||||
uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
|
||||
|
||||
/* pass in parameters */
|
||||
void *args[] = {&d_work_tiles,
|
||||
&total_work_size};
|
||||
|
||||
/* launch kernel */
|
||||
int num_threads_per_block;
|
||||
cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
|
||||
int num_blocks = divide_up(total_work_size, num_threads_per_block);
|
||||
|
||||
cuda_assert(cuLaunchKernel(cuPathTrace,
|
||||
xblocks , yblocks, 1, /* blocks */
|
||||
xthreads, ythreads, 1, /* threads */
|
||||
num_blocks, 1, 1,
|
||||
num_threads_per_block, 1, 1,
|
||||
0, 0, args, 0));
|
||||
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
mem_free(work_tiles);
|
||||
}
|
||||
|
||||
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
|
||||
|
@ -46,6 +46,7 @@ enum MemoryType {
|
||||
/* Supported Data Types */
|
||||
|
||||
enum DataType {
|
||||
TYPE_UNKNOWN,
|
||||
TYPE_UCHAR,
|
||||
TYPE_UINT,
|
||||
TYPE_INT,
|
||||
@ -57,6 +58,7 @@ enum DataType {
|
||||
static inline size_t datatype_size(DataType datatype)
|
||||
{
|
||||
switch(datatype) {
|
||||
case TYPE_UNKNOWN: return 1;
|
||||
case TYPE_UCHAR: return sizeof(uchar);
|
||||
case TYPE_FLOAT: return sizeof(float);
|
||||
case TYPE_UINT: return sizeof(uint);
|
||||
@ -70,8 +72,8 @@ static inline size_t datatype_size(DataType datatype)
|
||||
/* Traits for data types */
|
||||
|
||||
template<typename T> struct device_type_traits {
|
||||
static const DataType data_type = TYPE_UCHAR;
|
||||
static const int num_elements = 0;
|
||||
static const DataType data_type = TYPE_UNKNOWN;
|
||||
static const int num_elements = sizeof(T);
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uchar> {
|
||||
|
@ -1448,6 +1448,21 @@ enum RayState {
|
||||
#define PATCH_MAP_NODE_IS_LEAF (1u << 31)
|
||||
#define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF))
|
||||
|
||||
/* Work Tiles */
|
||||
|
||||
typedef struct WorkTile {
|
||||
uint x, y, w, h;
|
||||
|
||||
uint start_sample;
|
||||
uint num_samples;
|
||||
|
||||
uint offset;
|
||||
uint stride;
|
||||
|
||||
ccl_global float *buffer;
|
||||
ccl_global uint *rng_state;
|
||||
} WorkTile;
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __KERNEL_TYPES_H__ */
|
||||
|
@ -27,29 +27,28 @@ CCL_NAMESPACE_BEGIN
|
||||
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
||||
#endif
|
||||
|
||||
#ifdef __SPLIT_KERNEL__
|
||||
/* Returns true if there is work */
|
||||
ccl_device bool get_next_work(KernelGlobals *kg,
|
||||
uint thread_index,
|
||||
ccl_global uint *work_pools,
|
||||
uint total_work_size,
|
||||
uint ray_index,
|
||||
ccl_private uint *global_work_index)
|
||||
{
|
||||
uint total_work_size = kernel_split_params.w
|
||||
* kernel_split_params.h
|
||||
* kernel_split_params.num_samples;
|
||||
|
||||
/* With a small amount of work there may be more threads than work due to
|
||||
* rounding up of global size, stop such threads immediately. */
|
||||
if(thread_index >= total_work_size) {
|
||||
if(ray_index >= total_work_size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Increase atomic work index counter in pool. */
|
||||
uint pool = thread_index / WORK_POOL_SIZE;
|
||||
uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
|
||||
uint pool = ray_index / WORK_POOL_SIZE;
|
||||
uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
|
||||
|
||||
/* Map per-pool work index to a global work index. */
|
||||
uint global_size = ccl_global_size(0) * ccl_global_size(1);
|
||||
kernel_assert(global_size % WORK_POOL_SIZE == 0);
|
||||
kernel_assert(thread_index < global_size);
|
||||
kernel_assert(ray_index < global_size);
|
||||
|
||||
*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
|
||||
+ (pool * WORK_POOL_SIZE)
|
||||
@ -58,23 +57,24 @@ ccl_device bool get_next_work(KernelGlobals *kg,
|
||||
/* Test if all work for this pool is done. */
|
||||
return (*global_work_index < total_work_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Map global work index to pixel X/Y and sample. */
|
||||
ccl_device_inline void get_work_pixel(KernelGlobals *kg,
|
||||
/* Map global work index to tile, pixel X/Y and sample. */
|
||||
ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
|
||||
uint global_work_index,
|
||||
ccl_private uint *x,
|
||||
ccl_private uint *y,
|
||||
ccl_private uint *sample)
|
||||
{
|
||||
uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
|
||||
uint tile_pixels = tile->w * tile->h;
|
||||
uint sample_offset = global_work_index / tile_pixels;
|
||||
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
|
||||
uint y_offset = pixel_offset / kernel_split_params.w;
|
||||
uint x_offset = pixel_offset - y_offset * kernel_split_params.w;
|
||||
uint y_offset = pixel_offset / tile->w;
|
||||
uint x_offset = pixel_offset - y_offset * tile->w;
|
||||
|
||||
*x = kernel_split_params.x + x_offset;
|
||||
*y = kernel_split_params.y + y_offset;
|
||||
*sample = kernel_split_params.start_sample + sample_offset;
|
||||
*x = tile->x + x_offset;
|
||||
*y = tile->y + y_offset;
|
||||
*sample = tile->start_sample + sample_offset;
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include "kernel/kernel_compat_cuda.h"
|
||||
#include "kernel_config.h"
|
||||
|
||||
#include "kernel/kernel_math.h"
|
||||
#include "kernel/kernel_types.h"
|
||||
#include "kernel/kernel_globals.h"
|
||||
@ -27,32 +28,37 @@
|
||||
#include "kernel/kernel_path.h"
|
||||
#include "kernel/kernel_path_branched.h"
|
||||
#include "kernel/kernel_bake.h"
|
||||
#include "kernel/kernel_work_stealing.h"
|
||||
|
||||
/* kernels */
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
|
||||
kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
|
||||
{
|
||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||
int work_index = ccl_global_id(0);
|
||||
|
||||
if(work_index < total_work_size) {
|
||||
uint x, y, sample;
|
||||
get_work_pixel(tile, work_index, &x, &y, &sample);
|
||||
|
||||
if(x < sx + sw && y < sy + sh) {
|
||||
KernelGlobals kg;
|
||||
kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
|
||||
kernel_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset, tile->stride);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __BRANCHED_PATH__
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
|
||||
kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
|
||||
kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
|
||||
{
|
||||
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
|
||||
int work_index = ccl_global_id(0);
|
||||
|
||||
if(work_index < total_work_size) {
|
||||
uint x, y, sample;
|
||||
get_work_pixel(tile, work_index, &x, &y, &sample);
|
||||
|
||||
if(x < sx + sw && y < sy + sh) {
|
||||
KernelGlobals kg;
|
||||
kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
|
||||
kernel_branched_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset, tile->stride);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -42,11 +42,11 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
|
||||
if(ccl_local_id(0) + ccl_local_id(1) == 0) {
|
||||
kg->data = data;
|
||||
|
||||
kernel_split_params.rng_state = rng_state;
|
||||
kernel_split_params.tile.rng_state = rng_state;
|
||||
kernel_split_params.queue_index = queue_index;
|
||||
kernel_split_params.use_queues_flag = use_queues_flag;
|
||||
kernel_split_params.work_pools = work_pools;
|
||||
kernel_split_params.buffer = buffer;
|
||||
kernel_split_params.tile.buffer = buffer;
|
||||
|
||||
split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
|
||||
|
||||
|
@ -75,8 +75,6 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
|
||||
if(ray_index != QUEUE_EMPTY_SLOT) {
|
||||
#endif
|
||||
|
||||
int stride = kernel_split_params.stride;
|
||||
|
||||
ccl_global char *ray_state = kernel_split_state.ray_state;
|
||||
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
|
||||
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
|
||||
@ -86,7 +84,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
|
||||
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
|
||||
uint sample = state->sample;
|
||||
uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
|
||||
ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
|
||||
ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
|
||||
|
||||
/* accumulate result in output buffer */
|
||||
kernel_write_result(kg, buffer, sample, L);
|
||||
@ -96,22 +94,27 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
|
||||
|
||||
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
|
||||
/* We have completed current work; So get next work */
|
||||
ccl_global uint *work_pools = kernel_split_params.work_pools;
|
||||
uint total_work_size = kernel_split_params.total_work_size;
|
||||
uint work_index;
|
||||
if(!get_next_work(kg, ray_index, &work_index)) {
|
||||
|
||||
if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
|
||||
/* If work is invalid, this means no more work is available and the thread may exit */
|
||||
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
|
||||
}
|
||||
|
||||
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
|
||||
ccl_global WorkTile *tile = &kernel_split_params.tile;
|
||||
uint x, y, sample;
|
||||
get_work_pixel(kg, work_index, &x, &y, &sample);
|
||||
get_work_pixel(tile, work_index, &x, &y, &sample);
|
||||
|
||||
/* Remap rng_state to current pixel. */
|
||||
ccl_global uint *rng_state = kernel_split_params.rng_state;
|
||||
rng_state += kernel_split_params.offset + x + y*stride;
|
||||
ccl_global uint *rng_state = kernel_split_params.tile.rng_state;
|
||||
rng_state += tile->offset + x + y*tile->stride;
|
||||
|
||||
/* Store buffer offset for writing to passes. */
|
||||
uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride;
|
||||
uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
|
||||
ccl_global float *buffer = tile->buffer + buffer_offset;
|
||||
kernel_split_state.buffer_offset[ray_index] = buffer_offset;
|
||||
|
||||
/* Initialize random numbers and ray. */
|
||||
@ -135,7 +138,6 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
|
||||
/* These rays do not participate in path-iteration. */
|
||||
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
/* Accumulate result in output buffer. */
|
||||
ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
|
||||
kernel_write_pass_float4(buffer, sample, L_rad);
|
||||
|
||||
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
|
||||
|
@ -73,28 +73,28 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
|
||||
kg->data = data;
|
||||
#endif
|
||||
|
||||
kernel_split_params.x = sx;
|
||||
kernel_split_params.y = sy;
|
||||
kernel_split_params.w = sw;
|
||||
kernel_split_params.h = sh;
|
||||
kernel_split_params.tile.x = sx;
|
||||
kernel_split_params.tile.y = sy;
|
||||
kernel_split_params.tile.w = sw;
|
||||
kernel_split_params.tile.h = sh;
|
||||
|
||||
kernel_split_params.offset = offset;
|
||||
kernel_split_params.stride = stride;
|
||||
kernel_split_params.tile.start_sample = start_sample;
|
||||
kernel_split_params.tile.num_samples = num_samples;
|
||||
|
||||
kernel_split_params.rng_state = rng_state;
|
||||
kernel_split_params.tile.offset = offset;
|
||||
kernel_split_params.tile.stride = stride;
|
||||
|
||||
kernel_split_params.start_sample = start_sample;
|
||||
kernel_split_params.end_sample = end_sample;
|
||||
kernel_split_params.tile.rng_state = rng_state;
|
||||
kernel_split_params.tile.buffer = buffer;
|
||||
|
||||
kernel_split_params.total_work_size = sw * sh * num_samples;
|
||||
|
||||
kernel_split_params.work_pools = work_pools;
|
||||
kernel_split_params.num_samples = num_samples;
|
||||
|
||||
kernel_split_params.queue_index = Queue_index;
|
||||
kernel_split_params.queue_size = queuesize;
|
||||
kernel_split_params.use_queues_flag = use_queues_flag;
|
||||
|
||||
kernel_split_params.buffer = buffer;
|
||||
|
||||
split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
|
||||
|
||||
#ifdef __KERNEL_OPENCL__
|
||||
|
@ -98,7 +98,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
|
||||
|
||||
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
|
||||
uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
|
||||
ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
|
||||
ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
|
||||
|
||||
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
|
||||
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
|
||||
|
@ -30,23 +30,28 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
|
||||
kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
|
||||
|
||||
/* Get work. */
|
||||
ccl_global uint *work_pools = kernel_split_params.work_pools;
|
||||
uint total_work_size = kernel_split_params.total_work_size;
|
||||
uint work_index;
|
||||
if(!get_next_work(kg, ray_index, &work_index)) {
|
||||
|
||||
if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
|
||||
/* No more work, mark ray as inactive */
|
||||
kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
ccl_global WorkTile *tile = &kernel_split_params.tile;
|
||||
uint x, y, sample;
|
||||
get_work_pixel(kg, work_index, &x, &y, &sample);
|
||||
get_work_pixel(tile, work_index, &x, &y, &sample);
|
||||
|
||||
/* Remap rng_state and buffer to current pixel. */
|
||||
ccl_global uint *rng_state = kernel_split_params.rng_state;
|
||||
rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride;
|
||||
ccl_global uint *rng_state = kernel_split_params.tile.rng_state;
|
||||
rng_state += tile->offset + x + y*tile->stride;
|
||||
|
||||
/* Store buffer offset for writing to passes. */
|
||||
uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride;
|
||||
uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
|
||||
ccl_global float *buffer = tile->buffer + buffer_offset;
|
||||
kernel_split_state.buffer_offset[ray_index] = buffer_offset;
|
||||
|
||||
/* Initialize random numbers and ray. */
|
||||
@ -78,7 +83,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
|
||||
/* These rays do not participate in path-iteration. */
|
||||
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
/* Accumulate result in output buffer. */
|
||||
ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
|
||||
kernel_write_pass_float4(buffer, sample, L_rad);
|
||||
ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
|
||||
}
|
||||
|
@ -22,28 +22,15 @@ CCL_NAMESPACE_BEGIN
|
||||
/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */
|
||||
|
||||
typedef struct SplitParams {
|
||||
int x;
|
||||
int y;
|
||||
int w;
|
||||
int h;
|
||||
|
||||
int offset;
|
||||
int stride;
|
||||
|
||||
ccl_global uint *rng_state;
|
||||
|
||||
int start_sample;
|
||||
int end_sample;
|
||||
WorkTile tile;
|
||||
uint total_work_size;
|
||||
|
||||
ccl_global unsigned int *work_pools;
|
||||
unsigned int num_samples;
|
||||
|
||||
ccl_global int *queue_index;
|
||||
int queue_size;
|
||||
ccl_global char *use_queues_flag;
|
||||
|
||||
ccl_global float *buffer;
|
||||
|
||||
/* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
|
||||
int dummy_sd_flag;
|
||||
} SplitParams;
|
||||
|
Loading…
Reference in New Issue
Block a user