diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index e05fef3897c..ee55e6dc632 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -477,10 +477,10 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
    * still to make it faster. */
   CUDADeviceQueue queue(this);
 
-  void *d_path_index = nullptr;
-  void *d_render_buffer = nullptr;
+  device_ptr d_path_index = 0;
+  device_ptr d_render_buffer = 0;
   int d_work_size = 0;
-  void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+  DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
 
   queue.init_execution();
   queue.enqueue(test_kernel, 1, args);
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 09352a84181..880d7ca4cf2 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -89,7 +89,9 @@ bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
   return cuda_device_->kernels.available(kernel);
 }
 
-bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
+                              const int work_size,
+                              DeviceKernelArguments const &args)
 {
   if (cuda_device_->have_error()) {
     return false;
@@ -133,7 +135,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
                                     1,
                                     shared_mem_bytes,
                                     cuda_stream_,
-                                    args,
+                                    const_cast<void **>(args.values),
                                     0),
                      "enqueue");
 
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 28613cda071..0836af12098 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -42,7 +42,9 @@ class CUDADeviceQueue : public DeviceQueue {
 
   virtual bool kernel_available(DeviceKernel kernel) const override;
 
-  virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+  virtual bool enqueue(DeviceKernel kernel,
+                       const int work_size,
+                       DeviceKernelArguments const &args) override;
 
   virtual bool synchronize() override;
 
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 53c4f3f0b3f..4f1cbabc89b 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -440,10 +440,10 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
    * still to make it faster. */
   HIPDeviceQueue queue(this);
 
-  void *d_path_index = nullptr;
-  void *d_render_buffer = nullptr;
+  device_ptr d_path_index = 0;
+  device_ptr d_render_buffer = 0;
   int d_work_size = 0;
-  void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+  DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
 
   queue.init_execution();
   queue.enqueue(test_kernel, 1, args);
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 0f053ccbeb5..42841324ed6 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -89,7 +89,9 @@ bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
   return hip_device_->kernels.available(kernel);
 }
 
-bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+bool HIPDeviceQueue::enqueue(DeviceKernel kernel,
+                             const int work_size,
+                             DeviceKernelArguments const &args)
 {
   if (hip_device_->have_error()) {
     return false;
@@ -132,7 +134,7 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
                                     1,
                                     shared_mem_bytes,
                                     hip_stream_,
-                                    args,
+                                    const_cast<void **>(args.values),
                                     0),
                      "enqueue");
 
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 95d1afaff0f..8040d367798 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -42,7 +42,9 @@ class HIPDeviceQueue : public DeviceQueue {
 
   virtual bool kernel_available(DeviceKernel kernel) const override;
 
-  virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+  virtual bool enqueue(DeviceKernel kernel,
+                       const int work_size,
+                       DeviceKernelArguments const &args) override;
 
   virtual bool synchronize() override;
 
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index b82b1281eb8..1d893d9c65b 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -667,22 +667,22 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
-                  const_cast<int *>(&context.guiding_params.pass_stride),
-                  const_cast<int *>(&context.guiding_params.pass_albedo),
-                  const_cast<int *>(&context.guiding_params.pass_normal),
-                  &context.render_buffers->buffer.device_pointer,
-                  const_cast<int *>(&buffer_params.offset),
-                  const_cast<int *>(&buffer_params.stride),
-                  const_cast<int *>(&buffer_params.pass_stride),
-                  const_cast<int *>(&context.pass_sample_count),
-                  const_cast<int *>(&context.pass_denoising_albedo),
-                  const_cast<int *>(&context.pass_denoising_normal),
-                  const_cast<int *>(&buffer_params.full_x),
-                  const_cast<int *>(&buffer_params.full_y),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height),
-                  const_cast<int *>(&context.num_samples)};
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &context.guiding_params.pass_normal,
+                             &context.render_buffers->buffer.device_pointer,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.pass_sample_count,
+                             &context.pass_denoising_albedo,
+                             &context.pass_denoising_normal,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &context.num_samples);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
 }
@@ -693,11 +693,11 @@ bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
-                  const_cast<int *>(&context.guiding_params.pass_stride),
-                  const_cast<int *>(&context.guiding_params.pass_albedo),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height)};
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &buffer_params.width,
+                             &buffer_params.height);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
 }
@@ -793,15 +793,15 @@ bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {&context.render_buffers->buffer.device_pointer,
-                  const_cast<int *>(&buffer_params.full_x),
-                  const_cast<int *>(&buffer_params.full_y),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height),
-                  const_cast<int *>(&buffer_params.offset),
-                  const_cast<int *>(&buffer_params.stride),
-                  const_cast<int *>(&buffer_params.pass_stride),
-                  const_cast<int *>(&pass.denoised_offset)};
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &pass.denoised_offset);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
 }
@@ -813,20 +813,20 @@ bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {&context.render_buffers->buffer.device_pointer,
-                  const_cast<int *>(&buffer_params.full_x),
-                  const_cast<int *>(&buffer_params.full_y),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height),
-                  const_cast<int *>(&buffer_params.offset),
-                  const_cast<int *>(&buffer_params.stride),
-                  const_cast<int *>(&buffer_params.pass_stride),
-                  const_cast<int *>(&context.num_samples),
-                  const_cast<int *>(&pass.noisy_offset),
-                  const_cast<int *>(&pass.denoised_offset),
-                  const_cast<int *>(&context.pass_sample_count),
-                  const_cast<int *>(&pass.num_components),
-                  const_cast<bool *>(&pass.use_compositing)};
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.num_samples,
+                             &pass.noisy_offset,
+                             &pass.denoised_offset,
+                             &context.pass_sample_count,
+                             &pass.num_components,
+                             &pass.use_compositing);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
 }
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index e3946d94f5d..1a437878b5f 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -47,7 +47,9 @@ static bool is_optix_specific_kernel(DeviceKernel kernel)
          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
 }
 
-bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
+                               const int work_size,
+                               DeviceKernelArguments const &args)
 {
   if (!is_optix_specific_kernel(kernel)) {
     return CUDADeviceQueue::enqueue(kernel, work_size, args);
@@ -69,7 +71,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
     cuda_device_assert(
         cuda_device_,
         cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
-                          args[0],  // &d_path_index
+                          args.values[0],  // &d_path_index
                           sizeof(device_ptr),
                           cuda_stream_));
 
@@ -78,7 +80,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
     cuda_device_assert(
         cuda_device_,
         cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
-                          args[1],  // &d_render_buffer
+                          args.values[1],  // &d_render_buffer
                           sizeof(device_ptr),
                           cuda_stream_));
   }
diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h
index 0de422ccc71..5f0e09dff2c 100644
--- a/intern/cycles/device/optix/queue.h
+++ b/intern/cycles/device/optix/queue.h
@@ -31,7 +31,9 @@ class OptiXDeviceQueue : public CUDADeviceQueue {
 
   virtual void init_execution() override;
 
-  virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+  virtual bool enqueue(DeviceKernel kernel,
+                       const int work_size,
+                       DeviceKernelArguments const &args) override;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 188162f4b74..4e9f41f7875 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -31,6 +31,72 @@ class device_memory;
 
 struct KernelWorkTile;
 
+/* Container for device kernel arguments with type correctness ensured by API. */
+struct DeviceKernelArguments {
+
+  enum Type {
+    POINTER,
+    INT32,
+    FLOAT32,
+    BOOLEAN,
+    KERNEL_FILM_CONVERT,
+  };
+
+  static const int MAX_ARGS = 16;
+  Type types[MAX_ARGS];
+  void *values[MAX_ARGS];
+  size_t sizes[MAX_ARGS];
+  size_t count = 0;
+
+  DeviceKernelArguments()
+  {
+  }
+
+  template<typename T> DeviceKernelArguments(const T *arg)
+  {
+    add(arg);
+  }
+
+  template<typename T, typename... Args> DeviceKernelArguments(const T *first, Args... args)
+  {
+    add(first);
+    add(args...);
+  }
+
+  void add(const KernelFilmConvert *value)
+  {
+    add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
+  }
+  void add(const device_ptr *value)
+  {
+    add(POINTER, value, sizeof(device_ptr));
+  }
+  void add(const int32_t *value)
+  {
+    add(INT32, value, sizeof(int32_t));
+  }
+  void add(const float *value)
+  {
+    add(FLOAT32, value, sizeof(float));
+  }
+  void add(const bool *value)
+  {
+    add(BOOLEAN, value, 4);
+  }
+  void add(const Type type, const void *value, size_t size)
+  {
+    types[count] = type;
+    values[count] = (void *)value;
+    sizes[count] = size;
+    count++;
+  }
+  template<typename T, typename... Args> void add(const T *first, Args... args)
+  {
+    add(first);
+    add(args...);
+  }
+};
+
 /* Abstraction of a command queue for a device.
  * Provides API to schedule kernel execution in a specific queue with minimal possible overhead
  * from driver side.
@@ -66,7 +132,9 @@ class DeviceQueue {
    * - int: pass pointer to the int
    * - device memory: pass pointer to device_memory.device_pointer
    * Return false if there was an error executing this or a previous kernel. */
-  virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0;
+  virtual bool enqueue(DeviceKernel kernel,
+                       const int work_size,
+                       DeviceKernelArguments const &args) = 0;
 
   /* Wait unit all enqueued kernels have finished execution.
    * Return false if there was an error executing any of the enqueued kernels. */
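The struct added to device/queue.h above is the whole mechanism: callers hand over plain pointers, each pointer is routed to a typed add() overload, and the backend consumes the collected values (and, if it needs the raw bytes, types/sizes). The standalone C++ sketch below illustrates that flow outside of Cycles. The typedef, the trimmed-down container and the pack_arguments() helper are stand-ins invented for this example, not Cycles code; a CUDA-style backend would instead hand values straight to the driver, exactly as the cuLaunchKernel change earlier in this patch does.

// Standalone sketch of the DeviceKernelArguments idea; stand-in types, not the Cycles headers.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

typedef uint64_t device_ptr;  // stand-in for Cycles' device_ptr (a GPU address)

struct ArgsSketch {
  enum Type { POINTER, INT32, FLOAT32 };

  static const int MAX_ARGS = 16;
  Type types[MAX_ARGS];
  void *values[MAX_ARGS];  // host pointers to the individual arguments
  size_t sizes[MAX_ARGS];  // size in bytes of each pointed-to argument
  size_t count = 0;

  /* Single-argument and variadic constructors mirror the recursive overload
   * dispatch used by the real header. */
  template<typename T> ArgsSketch(const T *arg)
  {
    add(arg);
  }
  template<typename T, typename... Rest> ArgsSketch(const T *first, Rest... rest)
  {
    add(first);
    add(rest...);
  }

  void add(const device_ptr *value)
  {
    push(POINTER, value, sizeof(device_ptr));
  }
  void add(const int32_t *value)
  {
    push(INT32, value, sizeof(int32_t));
  }
  void add(const float *value)
  {
    push(FLOAT32, value, sizeof(float));
  }
  template<typename T, typename... Rest> void add(const T *first, Rest... rest)
  {
    add(first);
    add(rest...);
  }

  void push(Type type, const void *value, size_t size)
  {
    types[count] = type;
    values[count] = (void *)value;
    sizes[count] = size;
    count++;
  }
};

/* A CUDA-style backend can pass `values` directly to the driver as the kernel
 * parameter array. A backend that has to marshal arguments by value could pack
 * the bytes instead; this helper is hypothetical and only shows what `sizes`
 * makes possible. */
static size_t pack_arguments(const ArgsSketch &args, uint8_t *buffer)
{
  size_t offset = 0;
  for (size_t i = 0; i < args.count; i++) {
    memcpy(buffer + offset, args.values[i], args.sizes[i]);
    offset += args.sizes[i];
  }
  return offset;
}

int main()
{
  device_ptr d_render_buffer = 0x1000; /* made-up device address */
  int32_t work_size = 1024;
  float threshold = 0.05f;

  ArgsSketch args(&d_render_buffer, &work_size, &threshold);

  uint8_t packed[64];
  const size_t packed_bytes = pack_arguments(args, packed);
  printf("%zu args, %zu bytes packed\n", args.count, packed_bytes);
  return 0;
}

Because the overloads take pointers to concrete types, passing an unsupported argument type fails to compile instead of silently launching a kernel with mismatched parameters, which is the "type correctness ensured by API" the new struct's comment refers to.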
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
index c03ef64a2b2..3fd973749b8 100644
--- a/intern/cycles/integrator/pass_accessor_gpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -54,30 +54,30 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
   if (destination.d_pixels) {
     DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
 
-    void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
-                    const_cast<device_ptr *>(&destination.d_pixels),
-                    const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
-                    const_cast<int *>(&work_size),
-                    const_cast<int *>(&buffer_params.window_width),
-                    const_cast<int *>(&offset),
-                    const_cast<int *>(&buffer_params.stride),
-                    const_cast<int *>(&destination.offset),
-                    const_cast<int *>(&destination_stride)};
+    DeviceKernelArguments args(&kfilm_convert,
+                               &destination.d_pixels,
+                               &render_buffers->buffer.device_pointer,
+                               &work_size,
+                               &buffer_params.window_width,
+                               &offset,
+                               &buffer_params.stride,
+                               &destination.offset,
+                               &destination_stride);
 
     queue_->enqueue(kernel, work_size, args);
   }
 
   if (destination.d_pixels_half_rgba) {
     const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
-    void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
-                    const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
-                    const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
-                    const_cast<int *>(&work_size),
-                    const_cast<int *>(&buffer_params.window_width),
-                    const_cast<int *>(&offset),
-                    const_cast<int *>(&buffer_params.stride),
-                    const_cast<int *>(&destination.offset),
-                    const_cast<int *>(&destination_stride)};
+    DeviceKernelArguments args(&kfilm_convert,
+                               &destination.d_pixels_half_rgba,
+                               &render_buffers->buffer.device_pointer,
+                               &work_size,
+                               &buffer_params.window_width,
+                               &offset,
+                               &buffer_params.stride,
+                               &destination.offset,
+                               &destination_stride);
 
     queue_->enqueue(kernel_half_float, work_size, args);
   }
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 05e53f816a0..5f3d30f09ad 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -334,7 +334,8 @@ DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
 
 void PathTraceWorkGPU::enqueue_reset()
 {
-  void *args[] = {&max_num_paths_};
+  DeviceKernelArguments args(&max_num_paths_);
+
   queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
   queue_->zero_to_device(integrator_queue_counter_);
   queue_->zero_to_device(integrator_shader_sort_counter_);
@@ -405,7 +406,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
 
 void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
 {
-  void *d_path_index = (void *)NULL;
+  device_ptr d_path_index = 0;
 
   /* Create array of path indices for which this kernel is queued to be executed. */
   int work_size = kernel_max_active_main_path_index(kernel);
@@ -416,14 +417,14 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
   if (kernel_uses_sorting(kernel)) {
     /* Compute array of active paths, sorted by shader. */
     work_size = num_queued;
-    d_path_index = (void *)queued_paths_.device_pointer;
+    d_path_index = queued_paths_.device_pointer;
 
     compute_sorted_queued_paths(
         DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
   }
   else if (num_queued < work_size) {
     work_size = num_queued;
-    d_path_index = (void *)queued_paths_.device_pointer;
+    d_path_index = queued_paths_.device_pointer;
 
     if (kernel_is_shadow_path(kernel)) {
       /* Compute array of active shadow paths for specific kernel. */
@@ -442,8 +443,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
   switch (kernel) {
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
       /* Closest ray intersection kernels with integrator state and render buffer. */
-      void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
-      void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+      DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);
 
       queue_->enqueue(kernel, work_size, args);
       break;
@@ -453,7 +453,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
       /* Ray intersection kernels with integrator state. */
-      void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
+      DeviceKernelArguments args(&d_path_index, &work_size);
 
       queue_->enqueue(kernel, work_size, args);
       break;
@@ -465,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
       /* Shading kernels with integrator state and render buffer. */
-      void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
-      void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+      DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);
 
       queue_->enqueue(kernel, work_size, args);
       break;
@@ -484,15 +483,17 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
                                                    const int num_paths_limit)
 {
   int d_queued_kernel = queued_kernel;
-  void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
-  void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
+  device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+  device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
   assert(d_counter != nullptr && d_prefix_sum != nullptr);
 
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
     int max_shaders = device_scene_->data.max_shaders;
-    void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
+
+    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
+
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
 
@@ -507,15 +508,16 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
    * end of the array since compaction would need to do less work. */
   const int work_size = kernel_max_active_main_path_index(queued_kernel);
 
-  void *d_queued_paths = (void *)queued_paths_.device_pointer;
-  void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
-  void *args[] = {const_cast<int *>(&work_size),
-                  const_cast<int *>(&num_paths_limit),
-                  &d_queued_paths,
-                  &d_num_queued_paths,
-                  &d_counter,
-                  &d_prefix_sum,
-                  &d_queued_kernel};
+  device_ptr d_queued_paths = queued_paths_.device_pointer;
+  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
+
+  DeviceKernelArguments args(&work_size,
+                             &num_paths_limit,
+                             &d_queued_paths,
+                             &d_num_queued_paths,
+                             &d_counter,
+                             &d_prefix_sum,
+                             &d_queued_kernel);
 
   queue_->enqueue(kernel, work_size, args);
 }
@@ -527,10 +529,10 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
   /* Launch kernel to fill the active paths arrays. */
   const int work_size = kernel_max_active_main_path_index(queued_kernel);
 
-  void *d_queued_paths = (void *)queued_paths_.device_pointer;
-  void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
-  void *args[] = {
-      const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
+  device_ptr d_queued_paths = queued_paths_.device_pointer;
+  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
+
+  DeviceKernelArguments args(&work_size, &d_queued_paths, &d_num_queued_paths, &d_queued_kernel);
 
   queue_->zero_to_device(num_queued_paths_);
   queue_->enqueue(kernel, work_size, args);
@@ -606,15 +608,17 @@ void PathTraceWorkGPU::compact_paths(const int num_active_paths,
 {
   /* Compact fragmented path states into the start of the array, moving any paths
    * with index higher than the number of active paths into the gaps. */
-  void *d_compact_paths = (void *)queued_paths_.device_pointer;
-  void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+  device_ptr d_compact_paths = queued_paths_.device_pointer;
+  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
 
   /* Create array with terminated paths that we can write to. */
   {
     /* TODO: can the work size be reduced here? */
    int offset = num_active_paths;
     int work_size = num_active_paths;
-    void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+
+    DeviceKernelArguments args(&work_size, &d_compact_paths, &d_num_queued_paths, &offset);
+
     queue_->zero_to_device(num_queued_paths_);
     queue_->enqueue(terminated_paths_kernel, work_size, args);
   }
@@ -623,8 +627,10 @@ void PathTraceWorkGPU::compact_paths(const int num_active_paths,
    * than the number of active paths. */
   {
     int work_size = max_active_path_index;
-    void *args[] = {
-        &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+
+    DeviceKernelArguments args(
+        &work_size, &d_compact_paths, &d_num_queued_paths, &num_active_paths);
+
     queue_->zero_to_device(num_queued_paths_);
     queue_->enqueue(compact_paths_kernel, work_size, args);
   }
@@ -639,8 +645,10 @@ void PathTraceWorkGPU::compact_paths(const int num_active_paths,
     int work_size = num_compact_paths;
     int active_states_offset = 0;
     int terminated_states_offset = num_active_paths;
-    void *args[] = {
-        &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+
+    DeviceKernelArguments args(
+        &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size);
+
     queue_->enqueue(compact_kernel, work_size, args);
   }
 }
@@ -769,14 +777,12 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
 
   queue_->copy_to_device(work_tiles_);
 
-  void *d_work_tiles = (void *)work_tiles_.device_pointer;
-  void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+  device_ptr d_work_tiles = work_tiles_.device_pointer;
+  device_ptr d_render_buffer = buffers_->buffer.device_pointer;
 
   /* Launch kernel. */
-  void *args[] = {&d_work_tiles,
-                  const_cast<int *>(&num_work_tiles),
-                  &d_render_buffer,
-                  const_cast<int *>(&max_tile_work_size)};
+  DeviceKernelArguments args(
+      &d_work_tiles, &num_work_tiles, &d_render_buffer, &max_tile_work_size);
 
   queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
 
@@ -966,16 +972,16 @@ int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float thr
 
   const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
 
-  void *args[] = {&buffers_->buffer.device_pointer,
-                  const_cast<int *>(&effective_buffer_params_.full_x),
-                  const_cast<int *>(&effective_buffer_params_.full_y),
-                  const_cast<int *>(&effective_buffer_params_.width),
-                  const_cast<int *>(&effective_buffer_params_.height),
-                  &threshold,
-                  &reset,
-                  &effective_buffer_params_.offset,
-                  &effective_buffer_params_.stride,
-                  &num_active_pixels.device_pointer};
+  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
+                             &effective_buffer_params_.full_x,
+                             &effective_buffer_params_.full_y,
+                             &effective_buffer_params_.width,
+                             &effective_buffer_params_.height,
+                             &threshold,
+                             &reset,
+                             &effective_buffer_params_.offset,
+                             &effective_buffer_params_.stride,
+                             &num_active_pixels.device_pointer);
 
   queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
 
@@ -989,13 +995,13 @@ void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
 {
   const int work_size = effective_buffer_params_.height;
 
-  void *args[] = {&buffers_->buffer.device_pointer,
-                  &effective_buffer_params_.full_x,
-                  &effective_buffer_params_.full_y,
-                  &effective_buffer_params_.width,
-                  &effective_buffer_params_.height,
-                  &effective_buffer_params_.offset,
-                  &effective_buffer_params_.stride};
+  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
+                             &effective_buffer_params_.full_x,
+                             &effective_buffer_params_.full_y,
+                             &effective_buffer_params_.width,
+                             &effective_buffer_params_.height,
+                             &effective_buffer_params_.offset,
+                             &effective_buffer_params_.stride);
 
   queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
 }
@@ -1004,13 +1010,13 @@ void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
 {
   const int work_size = effective_buffer_params_.width;
 
-  void *args[] = {&buffers_->buffer.device_pointer,
-                  &effective_buffer_params_.full_x,
-                  &effective_buffer_params_.full_y,
-                  &effective_buffer_params_.width,
-                  &effective_buffer_params_.height,
-                  &effective_buffer_params_.offset,
-                  &effective_buffer_params_.stride};
+  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
+                             &effective_buffer_params_.full_x,
+                             &effective_buffer_params_.full_y,
+                             &effective_buffer_params_.width,
+                             &effective_buffer_params_.height,
+                             &effective_buffer_params_.offset,
+                             &effective_buffer_params_.stride);
 
   queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
 }
@@ -1019,10 +1025,10 @@ void PathTraceWorkGPU::cryptomatte_postproces()
 {
   const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
 
-  void *args[] = {&buffers_->buffer.device_pointer,
-                  const_cast<int *>(&work_size),
-                  &effective_buffer_params_.offset,
-                  &effective_buffer_params_.stride};
+  DeviceKernelArguments args(&buffers_->buffer.device_pointer,
+                             &work_size,
+                             &effective_buffer_params_.offset,
+                             &effective_buffer_params_.stride);
 
   queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
 }
@@ -1071,8 +1077,9 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
   queue_->zero_to_device(num_queued_paths_);
 
   const int work_size = max_active_main_path_index_;
-  void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
-  void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+  device_ptr d_num_queued_paths = num_queued_paths_.device_pointer;
+
+  DeviceKernelArguments args(&work_size, &d_num_queued_paths);
 
   queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
   queue_->copy_from_device(num_queued_paths_);
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
index 9ec530c81df..95a1adeb016 100644
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -158,14 +158,16 @@ bool ShaderEval::eval_gpu(Device *device,
 
   /* Execute work on GPU in chunk, so we can cancel.
    * TODO : query appropriate size from device.*/
-  const int64_t chunk_size = 65536;
+  const int32_t chunk_size = 65536;
 
-  void *d_input = (void *)input.device_pointer;
-  void *d_output = (void *)output.device_pointer;
+  device_ptr d_input = input.device_pointer;
+  device_ptr d_output = output.device_pointer;
 
-  for (int64_t d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
-    int64_t d_work_size = std::min(chunk_size, work_size - d_offset);
-    void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+  assert(work_size <= 0x7fffffff);
+  for (int32_t d_offset = 0; d_offset < int32_t(work_size); d_offset += chunk_size) {
+    int32_t d_work_size = std::min(chunk_size, int32_t(work_size) - d_offset);
+
+    DeviceKernelArguments args(&d_input, &d_output, &d_offset, &d_work_size);
 
     queue->enqueue(kernel, d_work_size, args);
     queue->synchronize();
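The shader_eval.cpp hunk above also narrows the chunked dispatch to 32-bit offsets, which is why the assert on work_size appears. The standalone sketch below reproduces only that loop shape with invented names (MockQueue, eval_in_chunks, is_canceled are stand-ins for illustration, not Cycles API); the point is that each chunk is a separate enqueue/synchronize pair, so the host can stop between launches, matching the "so we can cancel" comment in the source.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

/* Stand-in for the real device queue: enqueue() would launch a kernel over
 * `work_size` elements starting at `offset`, synchronize() waits for it. */
struct MockQueue {
  void enqueue(int32_t offset, int32_t work_size)
  {
    printf("launch: offset=%d work_size=%d\n", offset, work_size);
  }
  void synchronize()
  {
  }
};

/* Process `work_size` elements in fixed-size chunks, synchronizing after each
 * chunk so a cancel request can be honored between launches. */
static bool eval_in_chunks(MockQueue &queue, const int64_t work_size, bool (*is_canceled)())
{
  const int32_t chunk_size = 65536;

  /* Offsets are passed to the kernel as 32-bit ints, so the total size must fit. */
  assert(work_size <= 0x7fffffff);

  for (int32_t d_offset = 0; d_offset < int32_t(work_size); d_offset += chunk_size) {
    const int32_t d_work_size = std::min(chunk_size, int32_t(work_size) - d_offset);

    queue.enqueue(d_offset, d_work_size);
    queue.synchronize();

    if (is_canceled && is_canceled()) {
      return false;
    }
  }
  return true;
}

int main()
{
  MockQueue queue;
  eval_in_chunks(queue, 200000, nullptr);
  return 0;
}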