/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __DEVICE_DENOISING_H__
#define __DEVICE_DENOISING_H__

#include "device/device.h"

#include "render/buffers.h"

#include "kernel/filter/filter_defines.h"

#include "util/util_profiling.h"

CCL_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
class DenoisingTask {
|
|
|
|
public:
|
|
|
|
/* Parameters of the denoising algorithm. */
|
|
|
|
int radius;
|
|
|
|
float nlm_k_2;
|
|
|
|
float pca_threshold;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2018-07-04 12:26:15 +00:00
|
|
|
/* Parameters of the RenderBuffers. */
|
2017-05-07 12:40:58 +00:00
|
|
|
struct RenderBuffers {
|
2018-07-04 12:26:15 +00:00
|
|
|
int offset;
|
2017-05-07 12:40:58 +00:00
|
|
|
int pass_stride;
|
2019-02-06 13:19:20 +00:00
|
|
|
int frame_stride;
|
2018-07-04 12:26:15 +00:00
|
|
|
int samples;
|
|
|
|
} render_buffer;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2018-07-04 12:26:15 +00:00
|
|
|
/* Pointer and parameters of the target buffer. */
|
|
|
|
struct TargetBuffer {
|
2017-05-07 12:40:58 +00:00
|
|
|
int offset;
|
|
|
|
int stride;
|
2018-07-04 12:26:15 +00:00
|
|
|
int pass_stride;
|
|
|
|
int denoising_clean_offset;
|
2019-02-06 11:42:10 +00:00
|
|
|
int denoising_output_offset;
|
2017-05-07 12:40:58 +00:00
|
|
|
device_ptr ptr;
|
2018-07-04 12:26:15 +00:00
|
|
|
} target_buffer;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2018-07-04 12:26:42 +00:00
|
|
|
TileInfo *tile_info;
|
|
|
|
device_vector<int> tile_info_mem;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2018-11-29 01:06:30 +00:00
|
|
|
ProfilingState *profiler;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
int4 rect;
|
|
|
|
int4 filter_area;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2020-05-31 21:49:10 +00:00
|
|
|
bool do_prefilter;
|
2019-02-06 11:42:10 +00:00
|
|
|
bool do_filter;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
struct DeviceFunctions {
|
|
|
|
function<bool(
|
|
|
|
device_ptr image_ptr, /* Contains the values that are smoothed. */
|
|
|
|
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
|
|
|
|
device_ptr variance_ptr, /* Contains the variance of the guide image. */
|
|
|
|
device_ptr out_ptr /* The filtered output is written into this image. */
|
|
|
|
)>
|
|
|
|
non_local_means;
|
|
|
|
function<bool(
|
|
|
|
device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)>
|
2019-02-06 11:42:10 +00:00
|
|
|
accumulate;
|
|
|
|
function<bool(device_ptr output_ptr)> solve;
|
2017-05-07 12:40:58 +00:00
|
|
|
function<bool()> construct_transform;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
function<bool(device_ptr a_ptr,
|
|
|
|
device_ptr b_ptr,
|
|
|
|
device_ptr mean_ptr,
|
|
|
|
device_ptr variance_ptr,
|
|
|
|
int r,
|
|
|
|
int4 rect)>
|
|
|
|
combine_halves;
|
|
|
|
function<bool(device_ptr a_ptr,
|
|
|
|
device_ptr b_ptr,
|
|
|
|
device_ptr sample_variance_ptr,
|
|
|
|
device_ptr sv_variance_ptr,
|
|
|
|
device_ptr buffer_variance_ptr)>
|
|
|
|
divide_shadow;
|
|
|
|
function<bool(int mean_offset,
|
|
|
|
int variance_offset,
|
|
|
|
device_ptr mean_ptr,
|
2019-02-06 11:42:10 +00:00
|
|
|
device_ptr variance_ptr,
|
|
|
|
float scale)>
|
2017-05-07 12:40:58 +00:00
|
|
|
get_feature;
|
2017-05-18 01:03:18 +00:00
|
|
|
function<bool(device_ptr image_ptr,
|
|
|
|
device_ptr variance_ptr,
|
|
|
|
device_ptr depth_ptr,
|
|
|
|
device_ptr output_ptr)>
|
|
|
|
detect_outliers;
|
2019-02-06 11:42:10 +00:00
|
|
|
function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature;
|
2020-07-09 18:01:22 +00:00
|
|
|
function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
|
|
|
|
function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
|
2017-05-07 12:40:58 +00:00
|
|
|
} functions;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
/* Stores state of the current Reconstruction operation,
|
|
|
|
* which is accessed by the device in order to perform the operation. */
|
|
|
|
struct ReconstructionState {
|
Cycles: Improve denoising speed on GPUs with small tile sizes
Previously, the NLM kernels would be launched once per offset with one thread per pixel.
However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs which results in a significant slowdown.
Therefore, the kernels are now launched in a single call that handles all offsets at once.
This has two downsides: Memory accesses to accumulating buffers are now atomic, and more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory.
On the other hand, of course, the smaller tiles significantly reduce the size of the memory.
The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there, one thread per pixel is the maximum.
I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere.
To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented, it should be easier to understand what's going on now.
Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.
2017-11-10 03:34:14 +00:00
|
|
|
int4 filter_window;
|
2017-05-07 12:40:58 +00:00
|
|
|
int4 buffer_params;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
int source_w;
|
|
|
|
int source_h;
|
|
|
|
} reconstruction_state;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
/* Stores state of the current NLM operation,
|
|
|
|
* which is accessed by the device in order to perform the operation. */
|
|
|
|
struct NLMState {
|
|
|
|
int r; /* Search radius of the filter. */
|
|
|
|
int f; /* Patch size of the filter. */
|
|
|
|
float a; /* Variance compensation factor in the MSE estimation. */
|
|
|
|
float k_2; /* Squared value of the k parameter of the filter. */
|
2019-02-06 11:42:10 +00:00
|
|
|
bool is_color;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2019-02-06 11:42:10 +00:00
|
|
|
void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
|
|
|
|
{
|
|
|
|
r = r_;
|
|
|
|
f = f_;
|
|
|
|
a = a_, k_2 = k_2_;
|
|
|
|
is_color = is_color_;
|
|
|
|
}
|
2017-05-07 12:40:58 +00:00
|
|
|
} nlm_state;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
struct Storage {
|
|
|
|
device_only_memory<float> transform;
|
|
|
|
device_only_memory<int> rank;
|
|
|
|
device_only_memory<float> XtWX;
|
|
|
|
device_only_memory<float3> XtWY;
|
|
|
|
int w;
|
|
|
|
int h;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-10-20 21:31:13 +00:00
|
|
|
Storage(Device *device)
|
|
|
|
: transform(device, "denoising transform"),
|
|
|
|
rank(device, "denoising rank"),
|
|
|
|
XtWX(device, "denoising XtWX"),
|
2018-08-25 19:19:44 +00:00
|
|
|
XtWY(device, "denoising XtWY")
|
2017-10-20 21:31:13 +00:00
|
|
|
{
|
|
|
|
}
|
2017-05-07 12:40:58 +00:00
|
|
|
} storage;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2018-07-04 12:22:38 +00:00
|
|
|
DenoisingTask(Device *device, const DeviceTask &task);
|
2017-11-08 19:15:38 +00:00
|
|
|
~DenoisingTask();
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2020-07-09 18:01:22 +00:00
|
|
|
void run_denoising(RenderTile &tile);
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
struct DenoiseBuffers {
|
|
|
|
int pass_stride;
|
|
|
|
int passes;
|
Cycles: Improve denoising speed on GPUs with small tile sizes
Previously, the NLM kernels would be launched once per offset with one thread per pixel.
However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs which results in a significant slowdown.
Therefore, the kernels are now launched in a single call that handles all offsets at once.
This has two downsides: Memory accesses to accumulating buffers are now atomic, and more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory.
On the other hand, of course, the smaller tiles significantly reduce the size of the memory.
The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there, one thread per pixel is the maximum.
I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere.
To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented, it should be easier to understand what's going on now.
Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.
2017-11-10 03:34:14 +00:00
|
|
|
int stride;
|
2017-05-07 12:40:58 +00:00
|
|
|
int h;
|
Cycles: Improve denoising speed on GPUs with small tile sizes
Previously, the NLM kernels would be launched once per offset with one thread per pixel.
However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs which results in a significant slowdown.
Therefore, the kernels are now launched in a single call that handles all offsets at once.
This has two downsides: Memory accesses to accumulating buffers are now atomic, and more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory.
On the other hand, of course, the smaller tiles significantly reduce the size of the memory.
The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there, one thread per pixel is the maximum.
I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere.
To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented, it should be easier to understand what's going on now.
Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.
2017-11-10 03:34:14 +00:00
|
|
|
int width;
|
2019-02-06 13:19:20 +00:00
|
|
|
int frame_stride;
|
2017-05-07 12:40:58 +00:00
|
|
|
device_only_memory<float> mem;
|
2018-08-25 19:19:44 +00:00
|
|
|
device_only_memory<float> temporary_mem;
|
2019-02-06 13:19:20 +00:00
|
|
|
bool use_time;
|
2019-02-06 11:42:10 +00:00
|
|
|
bool use_intensity;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2018-08-25 19:19:44 +00:00
|
|
|
bool gpu_temporary_mem;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-10-20 21:31:13 +00:00
|
|
|
DenoiseBuffers(Device *device)
|
2021-03-03 13:35:50 +00:00
|
|
|
: mem(device, "denoising pixel buffer"),
|
|
|
|
temporary_mem(device, "denoising temporary mem", true)
|
2017-10-20 21:31:13 +00:00
|
|
|
{
|
|
|
|
}
|
2017-05-07 12:40:58 +00:00
|
|
|
} buffer;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2017-05-07 12:40:58 +00:00
|
|
|
protected:
|
|
|
|
Device *device;
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2020-07-09 18:01:22 +00:00
|
|
|
void set_render_buffer(RenderTileNeighbors &neighbors);
|
2018-07-04 12:22:38 +00:00
|
|
|
void setup_denoising_buffer();
|
|
|
|
void prefilter_shadowing();
|
|
|
|
void prefilter_features();
|
|
|
|
void prefilter_color();
|
|
|
|
void construct_transform();
|
|
|
|
void reconstruct();
|
2019-04-17 04:17:24 +00:00
|
|
|
|
2019-02-06 13:19:20 +00:00
|
|
|
void load_buffer();
|
2019-02-06 11:42:10 +00:00
|
|
|
void write_buffer();
|
2017-05-07 12:40:58 +00:00
|
|
|
};

CCL_NAMESPACE_END

#endif /* __DEVICE_DENOISING_H__ */