forked from bartvdbraak/blender
Cycles: Metal readiness: Specify DeviceQueue::enqueue arg types
This patch adds new arg-type parameters to `DeviceQueue::enqueue` and its overrides. This is in preparation for the Metal backend, which needs this information for correct argument encoding.

Ref: T92212
Reviewed By: brecht
Maniphest Tasks: T92212
Differential Revision: https://developer.blender.org/D13357
This commit is contained in:
parent
f9add2d63e
commit
98a5c924fc
@ -477,10 +477,10 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
|
||||
* still to make it faster. */
|
||||
CUDADeviceQueue queue(this);
|
||||
|
||||
void *d_path_index = nullptr;
|
||||
void *d_render_buffer = nullptr;
|
||||
device_ptr d_path_index = 0;
|
||||
device_ptr d_render_buffer = 0;
|
||||
int d_work_size = 0;
|
||||
void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
|
||||
DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
|
||||
|
||||
queue.init_execution();
|
||||
queue.enqueue(test_kernel, 1, args);
|
||||
|
@ -89,7 +89,9 @@ bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
|
||||
return cuda_device_->kernels.available(kernel);
|
||||
}
|
||||
|
||||
bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
|
||||
bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
|
||||
const int work_size,
|
||||
DeviceKernelArguments const &args)
|
||||
{
|
||||
if (cuda_device_->have_error()) {
|
||||
return false;
|
||||
@ -133,7 +135,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
|
||||
1,
|
||||
shared_mem_bytes,
|
||||
cuda_stream_,
|
||||
args,
|
||||
const_cast<void**>(args.values),
|
||||
0),
|
||||
"enqueue");
|
||||
|
||||
|
@ -42,7 +42,9 @@ class CUDADeviceQueue : public DeviceQueue {
|
||||
|
||||
virtual bool kernel_available(DeviceKernel kernel) const override;
|
||||
|
||||
virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
|
||||
virtual bool enqueue(DeviceKernel kernel,
|
||||
const int work_size,
|
||||
DeviceKernelArguments const &args) override;
|
||||
|
||||
virtual bool synchronize() override;
|
||||
|
||||
|
@ -440,10 +440,10 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
|
||||
* still to make it faster. */
|
||||
HIPDeviceQueue queue(this);
|
||||
|
||||
void *d_path_index = nullptr;
|
||||
void *d_render_buffer = nullptr;
|
||||
device_ptr d_path_index = 0;
|
||||
device_ptr d_render_buffer = 0;
|
||||
int d_work_size = 0;
|
||||
void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
|
||||
DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
|
||||
|
||||
queue.init_execution();
|
||||
queue.enqueue(test_kernel, 1, args);
|
||||
|
@ -89,7 +89,9 @@ bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
|
||||
return hip_device_->kernels.available(kernel);
|
||||
}
|
||||
|
||||
bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
|
||||
bool HIPDeviceQueue::enqueue(DeviceKernel kernel,
|
||||
const int work_size,
|
||||
DeviceKernelArguments const &args)
|
||||
{
|
||||
if (hip_device_->have_error()) {
|
||||
return false;
|
||||
@ -132,7 +134,7 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
|
||||
1,
|
||||
shared_mem_bytes,
|
||||
hip_stream_,
|
||||
args,
|
||||
const_cast<void**>(args.values),
|
||||
0),
|
||||
"enqueue");
|
||||
|
||||
|
@ -42,7 +42,9 @@ class HIPDeviceQueue : public DeviceQueue {
|
||||
|
||||
virtual bool kernel_available(DeviceKernel kernel) const override;
|
||||
|
||||
virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
|
||||
virtual bool enqueue(DeviceKernel kernel,
|
||||
const int work_size,
|
||||
DeviceKernelArguments const &args) override;
|
||||
|
||||
virtual bool synchronize() override;
|
||||
|
||||
|
@ -667,22 +667,22 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
|
||||
|
||||
const int work_size = buffer_params.width * buffer_params.height;
|
||||
|
||||
void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
|
||||
const_cast<int *>(&context.guiding_params.pass_stride),
|
||||
const_cast<int *>(&context.guiding_params.pass_albedo),
|
||||
const_cast<int *>(&context.guiding_params.pass_normal),
|
||||
DeviceKernelArguments args(&context.guiding_params.device_pointer,
|
||||
&context.guiding_params.pass_stride,
|
||||
&context.guiding_params.pass_albedo,
|
||||
&context.guiding_params.pass_normal,
|
||||
&context.render_buffers->buffer.device_pointer,
|
||||
const_cast<int *>(&buffer_params.offset),
|
||||
const_cast<int *>(&buffer_params.stride),
|
||||
const_cast<int *>(&buffer_params.pass_stride),
|
||||
const_cast<int *>(&context.pass_sample_count),
|
||||
const_cast<int *>(&context.pass_denoising_albedo),
|
||||
const_cast<int *>(&context.pass_denoising_normal),
|
||||
const_cast<int *>(&buffer_params.full_x),
|
||||
const_cast<int *>(&buffer_params.full_y),
|
||||
const_cast<int *>(&buffer_params.width),
|
||||
const_cast<int *>(&buffer_params.height),
|
||||
const_cast<int *>(&context.num_samples)};
|
||||
&buffer_params.offset,
|
||||
&buffer_params.stride,
|
||||
&buffer_params.pass_stride,
|
||||
&context.pass_sample_count,
|
||||
&context.pass_denoising_albedo,
|
||||
&context.pass_denoising_normal,
|
||||
&buffer_params.full_x,
|
||||
&buffer_params.full_y,
|
||||
&buffer_params.width,
|
||||
&buffer_params.height,
|
||||
&context.num_samples);
|
||||
|
||||
return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
|
||||
}
|
||||
@ -693,11 +693,11 @@ bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context
|
||||
|
||||
const int work_size = buffer_params.width * buffer_params.height;
|
||||
|
||||
void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
|
||||
const_cast<int *>(&context.guiding_params.pass_stride),
|
||||
const_cast<int *>(&context.guiding_params.pass_albedo),
|
||||
const_cast<int *>(&buffer_params.width),
|
||||
const_cast<int *>(&buffer_params.height)};
|
||||
DeviceKernelArguments args(&context.guiding_params.device_pointer,
|
||||
&context.guiding_params.pass_stride,
|
||||
&context.guiding_params.pass_albedo,
|
||||
&buffer_params.width,
|
||||
&buffer_params.height);
|
||||
|
||||
return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
|
||||
}
|
||||
@ -793,15 +793,15 @@ bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const
|
||||
|
||||
const int work_size = buffer_params.width * buffer_params.height;
|
||||
|
||||
void *args[] = {&context.render_buffers->buffer.device_pointer,
|
||||
const_cast<int *>(&buffer_params.full_x),
|
||||
const_cast<int *>(&buffer_params.full_y),
|
||||
const_cast<int *>(&buffer_params.width),
|
||||
const_cast<int *>(&buffer_params.height),
|
||||
const_cast<int *>(&buffer_params.offset),
|
||||
const_cast<int *>(&buffer_params.stride),
|
||||
const_cast<int *>(&buffer_params.pass_stride),
|
||||
const_cast<int *>(&pass.denoised_offset)};
|
||||
DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
|
||||
&buffer_params.full_x,
|
||||
&buffer_params.full_y,
|
||||
&buffer_params.width,
|
||||
&buffer_params.height,
|
||||
&buffer_params.offset,
|
||||
&buffer_params.stride,
|
||||
&buffer_params.pass_stride,
|
||||
&pass.denoised_offset);
|
||||
|
||||
return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
|
||||
}
|
||||
@ -813,20 +813,20 @@ bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
|
||||
|
||||
const int work_size = buffer_params.width * buffer_params.height;
|
||||
|
||||
void *args[] = {&context.render_buffers->buffer.device_pointer,
|
||||
const_cast<int *>(&buffer_params.full_x),
|
||||
const_cast<int *>(&buffer_params.full_y),
|
||||
const_cast<int *>(&buffer_params.width),
|
||||
const_cast<int *>(&buffer_params.height),
|
||||
const_cast<int *>(&buffer_params.offset),
|
||||
const_cast<int *>(&buffer_params.stride),
|
||||
const_cast<int *>(&buffer_params.pass_stride),
|
||||
const_cast<int *>(&context.num_samples),
|
||||
const_cast<int *>(&pass.noisy_offset),
|
||||
const_cast<int *>(&pass.denoised_offset),
|
||||
const_cast<int *>(&context.pass_sample_count),
|
||||
const_cast<int *>(&pass.num_components),
|
||||
const_cast<bool *>(&pass.use_compositing)};
|
||||
DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
|
||||
&buffer_params.full_x,
|
||||
&buffer_params.full_y,
|
||||
&buffer_params.width,
|
||||
&buffer_params.height,
|
||||
&buffer_params.offset,
|
||||
&buffer_params.stride,
|
||||
&buffer_params.pass_stride,
|
||||
&context.num_samples,
|
||||
&pass.noisy_offset,
|
||||
&pass.denoised_offset,
|
||||
&context.pass_sample_count,
|
||||
&pass.num_components,
|
||||
&pass.use_compositing);
|
||||
|
||||
return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
|
||||
}
|
||||
|
@ -47,7 +47,9 @@ static bool is_optix_specific_kernel(DeviceKernel kernel)
|
||||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
|
||||
}
|
||||
|
||||
bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
|
||||
bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
|
||||
const int work_size,
|
||||
DeviceKernelArguments const &args)
|
||||
{
|
||||
if (!is_optix_specific_kernel(kernel)) {
|
||||
return CUDADeviceQueue::enqueue(kernel, work_size, args);
|
||||
@ -69,7 +71,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
|
||||
args[0], // &d_path_index
|
||||
args.values[0], // &d_path_index
|
||||
sizeof(device_ptr),
|
||||
cuda_stream_));
|
||||
|
||||
@ -78,7 +80,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
|
||||
args[1], // &d_render_buffer
|
||||
args.values[1], // &d_render_buffer
|
||||
sizeof(device_ptr),
|
||||
cuda_stream_));
|
||||
}
|
||||
|
@ -31,7 +31,9 @@ class OptiXDeviceQueue : public CUDADeviceQueue {
|
||||
|
||||
virtual void init_execution() override;
|
||||
|
||||
virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
|
||||
virtual bool enqueue(DeviceKernel kernel,
|
||||
const int work_size,
|
||||
DeviceKernelArguments const &args) override;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -31,6 +31,72 @@ class device_memory;
|
||||
|
||||
struct KernelWorkTile;
|
||||
|
||||
/* Container for device kernel arguments with type correctness ensured by API. */
|
||||
struct DeviceKernelArguments {
|
||||
|
||||
enum Type {
|
||||
POINTER,
|
||||
INT32,
|
||||
FLOAT32,
|
||||
BOOLEAN,
|
||||
KERNEL_FILM_CONVERT,
|
||||
};
|
||||
|
||||
static const int MAX_ARGS = 16;
|
||||
Type types[MAX_ARGS];
|
||||
void *values[MAX_ARGS];
|
||||
size_t sizes[MAX_ARGS];
|
||||
size_t count = 0;
|
||||
|
||||
DeviceKernelArguments()
|
||||
{
|
||||
}
|
||||
|
||||
template<class T> DeviceKernelArguments(const T *arg)
|
||||
{
|
||||
add(arg);
|
||||
}
|
||||
|
||||
template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
|
||||
{
|
||||
add(first);
|
||||
add(args...);
|
||||
}
|
||||
|
||||
void add(const KernelFilmConvert *value)
|
||||
{
|
||||
add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
|
||||
}
|
||||
void add(const device_ptr *value)
|
||||
{
|
||||
add(POINTER, value, sizeof(device_ptr));
|
||||
}
|
||||
void add(const int32_t *value)
|
||||
{
|
||||
add(INT32, value, sizeof(int32_t));
|
||||
}
|
||||
void add(const float *value)
|
||||
{
|
||||
add(FLOAT32, value, sizeof(float));
|
||||
}
|
||||
void add(const bool *value)
|
||||
{
|
||||
add(BOOLEAN, value, 4);
|
||||
}
|
||||
void add(const Type type, const void *value, size_t size)
|
||||
{
|
||||
types[count] = type;
|
||||
values[count] = (void *)value;
|
||||
sizes[count] = size;
|
||||
count++;
|
||||
}
|
||||
template<typename T, typename... Args> void add(const T *first, Args... args)
|
||||
{
|
||||
add(first);
|
||||
add(args...);
|
||||
}
|
||||
};
|
||||
|
||||
/* Abstraction of a command queue for a device.
|
||||
* Provides API to schedule kernel execution in a specific queue with minimal possible overhead
|
||||
* from driver side.
|
||||
@ -66,7 +132,9 @@ class DeviceQueue {
|
||||
* - int: pass pointer to the int
|
||||
* - device memory: pass pointer to device_memory.device_pointer
|
||||
* Return false if there was an error executing this or a previous kernel. */
|
||||
virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0;
|
||||
virtual bool enqueue(DeviceKernel kernel,
|
||||
const int work_size,
|
||||
DeviceKernelArguments const &args) = 0;
|
||||
|
||||
/* Wait unit all enqueued kernels have finished execution.
|
||||
* Return false if there was an error executing any of the enqueued kernels. */
|
||||
|
@ -54,30 +54,30 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
|
||||
if (destination.d_pixels) {
|
||||
DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
|
||||
|
||||
void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
|
||||
const_cast<device_ptr *>(&destination.d_pixels),
|
||||
const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
|
||||
const_cast<int *>(&work_size),
|
||||
const_cast<int *>(&buffer_params.window_width),
|
||||
const_cast<int *>(&offset),
|
||||
const_cast<int *>(&buffer_params.stride),
|
||||
const_cast<int *>(&destination.offset),
|
||||
const_cast<int *>(&destination_stride)};
|
||||
DeviceKernelArguments args(&kfilm_convert,
|
||||
&destination.d_pixels,
|
||||
&render_buffers->buffer.device_pointer,
|
||||
&work_size,
|
||||
&buffer_params.window_width,
|
||||
&offset,
|
||||
&buffer_params.stride,
|
||||
&destination.offset,
|
||||
&destination_stride);
|
||||
|
||||
queue_->enqueue(kernel, work_size, args);
|
||||
}
|
||||
if (destination.d_pixels_half_rgba) {
|
||||
const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
|
||||
|
||||
void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
|
||||
const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
|
||||
const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
|
||||
const_cast<int *>(&work_size),
|
||||
const_cast<int *>(&buffer_params.window_width),
|
||||
const_cast<int *>(&offset),
|
||||
const_cast<int *>(&buffer_params.stride),
|
||||
const_cast<int *>(&destination.offset),
|
||||
const_cast<int *>(&destination_stride)};
|
||||
DeviceKernelArguments args(&kfilm_convert,
|
||||
&destination.d_pixels_half_rgba,
|
||||
&render_buffers->buffer.device_pointer,
|
||||
&work_size,
|
||||
&buffer_params.window_width,
|
||||
&offset,
|
||||
&buffer_params.stride,
|
||||
&destination.offset,
|
||||
&destination_stride);
|
||||
|
||||
queue_->enqueue(kernel_half_float, work_size, args);
|
||||
}
|
||||
|
@ -334,7 +334,8 @@ DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
|
||||
|
||||
void PathTraceWorkGPU::enqueue_reset()
|
||||
{
|
||||
void *args[] = {&max_num_paths_};
|
||||
DeviceKernelArguments args(&max_num_paths_);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
|
||||
queue_->zero_to_device(integrator_queue_counter_);
|
||||
queue_->zero_to_device(integrator_shader_sort_counter_);
|
||||
@ -405,7 +406,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
|
||||
|
||||
void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
|
||||
{
|
||||
void *d_path_index = (void *)NULL;
|
||||
device_ptr d_path_index = 0;
|
||||
|
||||
/* Create array of path indices for which this kernel is queued to be executed. */
|
||||
int work_size = kernel_max_active_main_path_index(kernel);
|
||||
@ -416,14 +417,14 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
|
||||
if (kernel_uses_sorting(kernel)) {
|
||||
/* Compute array of active paths, sorted by shader. */
|
||||
work_size = num_queued;
|
||||
d_path_index = (void *)queued_paths_.device_pointer;
|
||||
d_path_index = queued_paths_.device_pointer;
|
||||
|
||||
compute_sorted_queued_paths(
|
||||
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
|
||||
}
|
||||
else if (num_queued < work_size) {
|
||||
work_size = num_queued;
|
||||
d_path_index = (void *)queued_paths_.device_pointer;
|
||||
d_path_index = queued_paths_.device_pointer;
|
||||
|
||||
if (kernel_is_shadow_path(kernel)) {
|
||||
/* Compute array of active shadow paths for specific kernel. */
|
||||
@ -442,8 +443,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
|
||||
switch (kernel) {
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
|
||||
/* Closest ray intersection kernels with integrator state and render buffer. */
|
||||
void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
|
||||
void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
|
||||
DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);
|
||||
|
||||
queue_->enqueue(kernel, work_size, args);
|
||||
break;
|
||||
@ -453,7 +453,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
|
||||
/* Ray intersection kernels with integrator state. */
|
||||
void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
|
||||
DeviceKernelArguments args(&d_path_index, &work_size);
|
||||
|
||||
queue_->enqueue(kernel, work_size, args);
|
||||
break;
|
||||
@ -465,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
|
||||
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
|
||||
/* Shading kernels with integrator state and render buffer. */
|
||||
void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
|
||||
void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
|
||||
DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);
|
||||
|
||||
queue_->enqueue(kernel, work_size, args);
|
||||
break;
|
||||
@ -484,15 +483,17 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
|
||||
const int num_paths_limit)
|
||||
{
|
||||
int d_queued_kernel = queued_kernel;
|
||||
void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
|
||||
void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
|
||||
device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
|
||||
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
|
||||
assert(d_counter != nullptr && d_prefix_sum != nullptr);
|
||||
|
||||
/* Compute prefix sum of number of active paths with each shader. */
|
||||
{
|
||||
const int work_size = 1;
|
||||
int max_shaders = device_scene_->data.max_shaders;
|
||||
void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
|
||||
|
||||
DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
|
||||
}
|
||||
|
||||
@ -507,15 +508,16 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
|
||||
* end of the array since compaction would need to do less work. */
|
||||
const int work_size = kernel_max_active_main_path_index(queued_kernel);
|
||||
|
||||
void *d_queued_paths = (void *)queued_paths_.device_pointer;
|
||||
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
|
||||
void *args[] = {const_cast<int *>(&work_size),
|
||||
const_cast<int *>(&num_paths_limit),
|
||||
device_ptr d_queued_paths = queued_paths_.device_pointer;
|
||||
device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
|
||||
|
||||
DeviceKernelArguments args(&work_size,
|
||||
&num_paths_limit,
|
||||
&d_queued_paths,
|
||||
&d_num_queued_paths,
|
||||
&d_counter,
|
||||
&d_prefix_sum,
|
||||
&d_queued_kernel};
|
||||
&d_queued_kernel);
|
||||
|
||||
queue_->enqueue(kernel, work_size, args);
|
||||
}
|
||||
@ -527,10 +529,10 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
|
||||
|
||||
/* Launch kernel to fill the active paths arrays. */
|
||||
const int work_size = kernel_max_active_main_path_index(queued_kernel);
|
||||
void *d_queued_paths = (void *)queued_paths_.device_pointer;
|
||||
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
|
||||
void *args[] = {
|
||||
const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
|
||||
device_ptr d_queued_paths = queued_paths_.device_pointer;
|
||||
device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
|
||||
|
||||
DeviceKernelArguments args(&work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);
|
||||
|
||||
queue_->zero_to_device(num_queued_paths_);
|
||||
queue_->enqueue(kernel, work_size, args);
|
||||
@ -606,15 +608,17 @@ void PathTraceWorkGPU::compact_paths(const int num_active_paths,
|
||||
{
|
||||
/* Compact fragmented path states into the start of the array, moving any paths
|
||||
* with index higher than the number of active paths into the gaps. */
|
||||
void *d_compact_paths = (void *)queued_paths_.device_pointer;
|
||||
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
|
||||
device_ptr d_compact_paths = queued_paths_.device_pointer;
|
||||
device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
|
||||
|
||||
/* Create array with terminated paths that we can write to. */
|
||||
{
|
||||
/* TODO: can the work size be reduced here? */
|
||||
int offset = num_active_paths;
|
||||
int work_size = num_active_paths;
|
||||
void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
|
||||
|
||||
DeviceKernelArguments args(&work_size, &d_compact_paths, &d_num_queued_paths, &offset);
|
||||
|
||||
queue_->zero_to_device(num_queued_paths_);
|
||||
queue_->enqueue(terminated_paths_kernel, work_size, args);
|
||||
}
|
||||
@ -623,8 +627,10 @@ void PathTraceWorkGPU::compact_paths(const int num_active_paths,
|
||||
* than the number of active paths. */
|
||||
{
|
||||
int work_size = max_active_path_index;
|
||||
void *args[] = {
|
||||
&work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
|
||||
|
||||
DeviceKernelArguments args(
|
||||
&work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);
|
||||
|
||||
queue_->zero_to_device(num_queued_paths_);
|
||||
queue_->enqueue(compact_paths_kernel, work_size, args);
|
||||
}
|
||||
@ -639,8 +645,10 @@ void PathTraceWorkGPU::compact_paths(const int num_active_paths,
|
||||
int work_size = num_compact_paths;
|
||||
int active_states_offset = 0;
|
||||
int terminated_states_offset = num_active_paths;
|
||||
void *args[] = {
|
||||
&d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
|
||||
|
||||
DeviceKernelArguments args(
|
||||
&d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size);
|
||||
|
||||
queue_->enqueue(compact_kernel, work_size, args);
|
||||
}
|
||||
}
|
||||
@ -769,14 +777,12 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
|
||||
|
||||
queue_->copy_to_device(work_tiles_);
|
||||
|
||||
void *d_work_tiles = (void *)work_tiles_.device_pointer;
|
||||
void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
|
||||
device_ptr d_work_tiles = work_tiles_.device_pointer;
|
||||
device_ptr d_render_buffer = buffers_->buffer.device_pointer;
|
||||
|
||||
/* Launch kernel. */
|
||||
void *args[] = {&d_work_tiles,
|
||||
const_cast<int *>(&num_work_tiles),
|
||||
&d_render_buffer,
|
||||
const_cast<int *>(&max_tile_work_size)};
|
||||
DeviceKernelArguments args(
|
||||
&d_work_tiles, &num_work_tiles, &d_render_buffer, &max_tile_work_size);
|
||||
|
||||
queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
|
||||
|
||||
@ -966,16 +972,16 @@ int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float thr
|
||||
|
||||
const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
|
||||
|
||||
void *args[] = {&buffers_->buffer.device_pointer,
|
||||
const_cast<int *>(&effective_buffer_params_.full_x),
|
||||
const_cast<int *>(&effective_buffer_params_.full_y),
|
||||
const_cast<int *>(&effective_buffer_params_.width),
|
||||
const_cast<int *>(&effective_buffer_params_.height),
|
||||
DeviceKernelArguments args(&buffers_->buffer.device_pointer,
|
||||
&effective_buffer_params_.full_x,
|
||||
&effective_buffer_params_.full_y,
|
||||
&effective_buffer_params_.width,
|
||||
&effective_buffer_params_.height,
|
||||
&threshold,
|
||||
&reset,
|
||||
&effective_buffer_params_.offset,
|
||||
&effective_buffer_params_.stride,
|
||||
&num_active_pixels.device_pointer};
|
||||
&num_active_pixels.device_pointer);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
|
||||
|
||||
@ -989,13 +995,13 @@ void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
|
||||
{
|
||||
const int work_size = effective_buffer_params_.height;
|
||||
|
||||
void *args[] = {&buffers_->buffer.device_pointer,
|
||||
DeviceKernelArguments args(&buffers_->buffer.device_pointer,
|
||||
&effective_buffer_params_.full_x,
|
||||
&effective_buffer_params_.full_y,
|
||||
&effective_buffer_params_.width,
|
||||
&effective_buffer_params_.height,
|
||||
&effective_buffer_params_.offset,
|
||||
&effective_buffer_params_.stride};
|
||||
&effective_buffer_params_.stride);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
|
||||
}
|
||||
@ -1004,13 +1010,13 @@ void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
|
||||
{
|
||||
const int work_size = effective_buffer_params_.width;
|
||||
|
||||
void *args[] = {&buffers_->buffer.device_pointer,
|
||||
DeviceKernelArguments args(&buffers_->buffer.device_pointer,
|
||||
&effective_buffer_params_.full_x,
|
||||
&effective_buffer_params_.full_y,
|
||||
&effective_buffer_params_.width,
|
||||
&effective_buffer_params_.height,
|
||||
&effective_buffer_params_.offset,
|
||||
&effective_buffer_params_.stride};
|
||||
&effective_buffer_params_.stride);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
|
||||
}
|
||||
@ -1019,10 +1025,10 @@ void PathTraceWorkGPU::cryptomatte_postproces()
|
||||
{
|
||||
const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
|
||||
|
||||
void *args[] = {&buffers_->buffer.device_pointer,
|
||||
const_cast<int *>(&work_size),
|
||||
DeviceKernelArguments args(&buffers_->buffer.device_pointer,
|
||||
&work_size,
|
||||
&effective_buffer_params_.offset,
|
||||
&effective_buffer_params_.stride};
|
||||
&effective_buffer_params_.stride);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
|
||||
}
|
||||
@ -1071,8 +1077,9 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
|
||||
queue_->zero_to_device(num_queued_paths_);
|
||||
|
||||
const int work_size = max_active_main_path_index_;
|
||||
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
|
||||
void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
|
||||
device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
|
||||
|
||||
DeviceKernelArguments args(&work_size, &d_num_queued_paths);
|
||||
|
||||
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
|
||||
queue_->copy_from_device(num_queued_paths_);
|
||||
|
@ -158,14 +158,16 @@ bool ShaderEval::eval_gpu(Device *device,
|
||||
|
||||
/* Execute work on GPU in chunk, so we can cancel.
|
||||
* TODO : query appropriate size from device.*/
|
||||
const int64_t chunk_size = 65536;
|
||||
const int32_t chunk_size = 65536;
|
||||
|
||||
void *d_input = (void *)input.device_pointer;
|
||||
void *d_output = (void *)output.device_pointer;
|
||||
device_ptr d_input = input.device_pointer;
|
||||
device_ptr d_output = output.device_pointer;
|
||||
|
||||
for (int64_t d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
|
||||
int64_t d_work_size = std::min(chunk_size, work_size - d_offset);
|
||||
void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
|
||||
assert(work_size <= 0x7fffffff);
|
||||
for (int32_t d_offset = 0; d_offset < int32_t(work_size); d_offset += chunk_size) {
|
||||
int32_t d_work_size = std::min(chunk_size, int32_t(work_size) - d_offset);
|
||||
|
||||
DeviceKernelArguments args(&d_input, &d_output, &d_offset, &d_work_size);
|
||||
|
||||
queue->enqueue(kernel, d_work_size, args);
|
||||
queue->synchronize();
|
||||
|
Loading…
Reference in New Issue
Block a user