forked from bartvdbraak/blender
\0;115;0cCycles: Cleanup, use ccl_restrict instead of ccl_restrict_ptr
There were following issues with ccl_restrict_ptr: - We already had ccl_restrict for all platforms. - It was secretly adding `const` qualifier to the declaration, which is quite weird since non-const pointer can also be declared as restricted. - We never in Blender are using foo_ptr or FooPtr type definitions, so not sure why we should introduce such a thing here. - It is absolutely wrong from semantic point of view to put pointer into the restrict macro -- const is a part of type, not part of hint for compiler that some pointer is never aliased.
This commit is contained in:
parent
8e655446d1
commit
803337f3f6
@ -28,7 +28,11 @@
|
||||
pixel_buffer += buffer_w - (high.x - low.x); \
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
|
||||
ccl_device_inline void filter_get_features(int2 pixel,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
float *features,
|
||||
const float *ccl_restrict mean,
|
||||
int pass_stride)
|
||||
{
|
||||
features[0] = pixel.x;
|
||||
features[1] = pixel.y;
|
||||
@ -46,7 +50,11 @@ ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_rest
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
|
||||
ccl_device_inline void filter_get_feature_scales(int2 pixel,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
float *scales,
|
||||
const float *ccl_restrict mean,
|
||||
int pass_stride)
|
||||
{
|
||||
scales[0] = fabsf(pixel.x - mean[0]);
|
||||
scales[1] = fabsf(pixel.y - mean[1]);
|
||||
@ -70,19 +78,21 @@ ccl_device_inline void filter_calculate_scale(float *scale)
|
||||
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
|
||||
}
|
||||
|
||||
ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
|
||||
ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer,
|
||||
int pass_stride)
|
||||
{
|
||||
return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
|
||||
}
|
||||
|
||||
ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
|
||||
ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer,
|
||||
int pass_stride)
|
||||
{
|
||||
return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
|
||||
}
|
||||
|
||||
ccl_device_inline void design_row_add(float *design_row,
|
||||
int rank,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
const ccl_global float *ccl_restrict transform,
|
||||
int stride,
|
||||
int row,
|
||||
float feature)
|
||||
@ -94,13 +104,13 @@ ccl_device_inline void design_row_add(float *design_row,
|
||||
|
||||
/* Fill the design row. */
|
||||
ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
|
||||
ccl_global float ccl_restrict_ptr p_buffer,
|
||||
const ccl_global float *ccl_restrict p_buffer,
|
||||
int2 q_pixel,
|
||||
ccl_global float ccl_restrict_ptr q_buffer,
|
||||
const ccl_global float *ccl_restrict q_buffer,
|
||||
int pass_stride,
|
||||
int rank,
|
||||
float *design_row,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
const ccl_global float *ccl_restrict transform,
|
||||
int stride)
|
||||
{
|
||||
design_row[0] = 1.0f;
|
||||
|
@ -33,7 +33,12 @@ CCL_NAMESPACE_BEGIN
|
||||
pixel_buffer += buffer_w - (pixel.x - low.x); \
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
|
||||
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
|
||||
__m128 active_pixels,
|
||||
const float *ccl_restrict buffer,
|
||||
__m128 *features,
|
||||
const __m128 ccl_restrict *mean,
|
||||
int pass_stride)
|
||||
{
|
||||
features[0] = x;
|
||||
features[1] = y;
|
||||
@ -53,7 +58,12 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active
|
||||
features[i] = _mm_mask_ps(features[i], active_pixels);
|
||||
}
|
||||
|
||||
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
|
||||
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
|
||||
__m128 active_pixels,
|
||||
const float *ccl_restrict buffer,
|
||||
__m128 *scales,
|
||||
const __m128 *ccl_restrict mean,
|
||||
int pass_stride)
|
||||
{
|
||||
scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
|
||||
scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
|
||||
|
@ -16,7 +16,15 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
|
||||
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
|
||||
const float *ccl_restrict weightImage,
|
||||
const float *ccl_restrict varianceImage,
|
||||
float *differenceImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
@ -36,7 +44,11 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float c
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
|
||||
ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differenceImage,
|
||||
float *outImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int f)
|
||||
{
|
||||
#ifdef __KERNEL_SSE3__
|
||||
int aligned_lowx = (rect.x & ~(3));
|
||||
@ -65,7 +77,11 @@ ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceI
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
|
||||
ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict differenceImage,
|
||||
float *outImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int f)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
@ -90,7 +106,14 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr diff
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
|
||||
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
|
||||
const float *ccl_restrict differenceImage,
|
||||
const float *ccl_restrict image,
|
||||
float *outImage,
|
||||
float *accumImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
int f)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
@ -108,8 +131,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
|
||||
float ccl_restrict_ptr differenceImage,
|
||||
float ccl_restrict_ptr buffer,
|
||||
const float *ccl_restrict differenceImage,
|
||||
const float *ccl_restrict buffer,
|
||||
float *color_pass,
|
||||
float *variance_pass,
|
||||
float *transform,
|
||||
@ -151,7 +174,10 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
|
||||
ccl_device_inline void kernel_filter_nlm_normalize(float *outImage,
|
||||
const float *ccl_restrict accumImage,
|
||||
int4 rect,
|
||||
int w)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
|
@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
|
||||
int dx, int dy,
|
||||
ccl_global float ccl_restrict_ptr weightImage,
|
||||
ccl_global float ccl_restrict_ptr varianceImage,
|
||||
const ccl_global float *ccl_restrict weightImage,
|
||||
const ccl_global float *ccl_restrict varianceImage,
|
||||
ccl_global float *differenceImage,
|
||||
int4 rect, int w,
|
||||
int channel_offset,
|
||||
@ -40,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
const ccl_global float *ccl_restrict differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect, int w, int f)
|
||||
{
|
||||
@ -55,7 +55,7 @@ ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
const ccl_global float *ccl_restrict differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect, int w, int f)
|
||||
{
|
||||
@ -71,8 +71,8 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
|
||||
int dx, int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr image,
|
||||
const ccl_global float *ccl_restrict differenceImage,
|
||||
const ccl_global float *ccl_restrict image,
|
||||
ccl_global float *outImage,
|
||||
ccl_global float *accumImage,
|
||||
int4 rect, int w, int f)
|
||||
@ -95,11 +95,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
|
||||
int dx, int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr buffer,
|
||||
const ccl_global float *ccl_restrict differenceImage,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
ccl_global float *color_pass,
|
||||
ccl_global float *variance_pass,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
const ccl_global float *ccl_restrict transform,
|
||||
ccl_global int *rank,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
@ -138,7 +138,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
|
||||
ccl_global float *outImage,
|
||||
ccl_global float ccl_restrict_ptr accumImage,
|
||||
const ccl_global float *ccl_restrict accumImage,
|
||||
int4 rect, int w)
|
||||
{
|
||||
outImage[y*w+x] /= accumImage[y*w+x];
|
||||
|
@ -44,7 +44,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
|
||||
|
||||
int offset = tiles->offsets[tile];
|
||||
int stride = tiles->strides[tile];
|
||||
ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
|
||||
const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile];
|
||||
center_buffer += (y*stride + x + offset)*buffer_pass_stride;
|
||||
center_buffer += buffer_denoising_offset + 14;
|
||||
|
||||
|
@ -21,10 +21,10 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
|
||||
int dx, int dy,
|
||||
int w, int h,
|
||||
int pass_stride,
|
||||
ccl_global float ccl_restrict_ptr buffer,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
ccl_global float *color_pass,
|
||||
ccl_global float *variance_pass,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
const ccl_global float *ccl_restrict transform,
|
||||
ccl_global int *rank,
|
||||
float weight,
|
||||
ccl_global float *XtWX,
|
||||
|
@ -16,7 +16,7 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
|
||||
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
|
||||
int x, int y, int4 rect,
|
||||
int pass_stride,
|
||||
float *transform, int *rank,
|
||||
@ -29,7 +29,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
|
||||
/* Temporary storage, used in different steps of the algorithm. */
|
||||
float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
|
||||
float tempvector[2*DENOISE_FEATURES];
|
||||
float ccl_restrict_ptr pixel_buffer;
|
||||
const float *ccl_restrict pixel_buffer;
|
||||
int2 pixel;
|
||||
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
|
||||
ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
|
||||
int x, int y, int4 rect,
|
||||
int pass_stride,
|
||||
ccl_global float *transform,
|
||||
@ -38,7 +38,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_
|
||||
max(rect.y, y - radius));
|
||||
int2 high = make_int2(min(rect.z, x + radius + 1),
|
||||
min(rect.w, y + radius + 1));
|
||||
ccl_global float ccl_restrict_ptr pixel_buffer;
|
||||
const ccl_global float *ccl_restrict pixel_buffer;
|
||||
int2 pixel;
|
||||
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
|
||||
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
|
||||
int x, int y, int4 rect,
|
||||
int pass_stride,
|
||||
float *transform, int *rank,
|
||||
@ -25,7 +25,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
|
||||
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||
|
||||
__m128 features[DENOISE_FEATURES];
|
||||
float ccl_restrict_ptr pixel_buffer;
|
||||
const float *ccl_restrict pixel_buffer;
|
||||
int2 pixel;
|
||||
|
||||
int2 low = make_int2(max(rect.x, x - radius),
|
||||
|
@ -42,8 +42,6 @@
|
||||
#include "util/util_types.h"
|
||||
#include "util/util_texture.h"
|
||||
|
||||
#define ccl_restrict_ptr const * __restrict
|
||||
|
||||
#define ccl_addr_space
|
||||
|
||||
#define ccl_local_id(d) 0
|
||||
|
@ -55,7 +55,6 @@
|
||||
#define ccl_restrict __restrict__
|
||||
#define ccl_align(n) __align__(n)
|
||||
|
||||
#define ccl_restrict_ptr const * __restrict__
|
||||
#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
|
||||
|
||||
|
||||
|
@ -50,8 +50,6 @@
|
||||
# define ccl_addr_space
|
||||
#endif
|
||||
|
||||
#define ccl_restrict_ptr const * __restrict__
|
||||
|
||||
#define ccl_local_id(d) get_local_id(d)
|
||||
#define ccl_global_id(d) get_global_id(d)
|
||||
|
||||
|
@ -139,8 +139,8 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
|
||||
float ccl_restrict_ptr weightImage,
|
||||
float ccl_restrict_ptr varianceImage,
|
||||
const float *ccl_restrict weightImage,
|
||||
const float *ccl_restrict varianceImage,
|
||||
float *differenceImage,
|
||||
int4 rect, int w,
|
||||
int channel_offset,
|
||||
@ -154,7 +154,7 @@ kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
|
||||
kernel_cuda_filter_nlm_blur(const float *ccl_restrict differenceImage, float *outImage, int4 rect, int w, int f) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
@ -164,7 +164,7 @@ kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outIm
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
|
||||
kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict differenceImage, float *outImage, int4 rect, int w, int f) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
@ -175,8 +175,8 @@ kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_update_output(int dx, int dy,
|
||||
float ccl_restrict_ptr differenceImage,
|
||||
float ccl_restrict_ptr image,
|
||||
const float *ccl_restrict differenceImage,
|
||||
const float *ccl_restrict image,
|
||||
float *outImage, float *accumImage,
|
||||
int4 rect, int w,
|
||||
int f) {
|
||||
@ -189,7 +189,7 @@ kernel_cuda_filter_nlm_update_output(int dx, int dy,
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
|
||||
kernel_cuda_filter_nlm_normalize(float *outImage, const float *ccl_restrict accumImage, int4 rect, int w) {
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
@ -200,8 +200,8 @@ kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumIm
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
|
||||
float ccl_restrict_ptr differenceImage,
|
||||
float ccl_restrict_ptr buffer,
|
||||
const float *ccl_restrict differenceImage,
|
||||
const float *ccl_restrict buffer,
|
||||
float *color_pass,
|
||||
float *variance_pass,
|
||||
float const* __restrict__ transform,
|
||||
|
@ -106,7 +106,7 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
|
||||
__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
|
||||
ccl_global float *transform,
|
||||
ccl_global int *rank,
|
||||
int4 filter_area,
|
||||
@ -132,8 +132,8 @@ __kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restric
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
|
||||
int dy,
|
||||
ccl_global float ccl_restrict_ptr weightImage,
|
||||
ccl_global float ccl_restrict_ptr varianceImage,
|
||||
const ccl_global float *ccl_restrict weightImage,
|
||||
const ccl_global float *ccl_restrict varianceImage,
|
||||
ccl_global float *differenceImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
@ -147,7 +147,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
|
||||
__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
@ -159,7 +159,7 @@ __kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr diffe
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
|
||||
__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict differenceImage,
|
||||
ccl_global float *outImage,
|
||||
int4 rect,
|
||||
int w,
|
||||
@ -173,8 +173,8 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_pt
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_update_output(int dx,
|
||||
int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr image,
|
||||
const ccl_global float *ccl_restrict differenceImage,
|
||||
const ccl_global float *ccl_restrict image,
|
||||
ccl_global float *outImage,
|
||||
ccl_global float *accumImage,
|
||||
int4 rect,
|
||||
@ -188,7 +188,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(int dx,
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
|
||||
ccl_global float ccl_restrict_ptr accumImage,
|
||||
const ccl_global float *ccl_restrict accumImage,
|
||||
int4 rect,
|
||||
int w) {
|
||||
int x = get_global_id(0) + rect.x;
|
||||
@ -200,11 +200,11 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
|
||||
int dy,
|
||||
ccl_global float ccl_restrict_ptr differenceImage,
|
||||
ccl_global float ccl_restrict_ptr buffer,
|
||||
const ccl_global float *ccl_restrict differenceImage,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
ccl_global float *color_pass,
|
||||
ccl_global float *variance_pass,
|
||||
ccl_global float ccl_restrict_ptr transform,
|
||||
const ccl_global float *ccl_restrict transform,
|
||||
ccl_global int *rank,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
|
@ -50,19 +50,19 @@ ccl_device_inline void math_matrix_zero(float *A, int n)
|
||||
|
||||
/* Elementary vector operations. */
|
||||
|
||||
ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n)
|
||||
ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n)
|
||||
{
|
||||
for(int i = 0; i < n; i++)
|
||||
a[i] += b[i];
|
||||
}
|
||||
|
||||
ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n)
|
||||
ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n)
|
||||
{
|
||||
for(int i = 0; i < n; i++)
|
||||
a[i] *= b[i];
|
||||
}
|
||||
|
||||
ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n)
|
||||
ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n)
|
||||
{
|
||||
for(int i = 0; i < n; i++)
|
||||
a[i*astride] *= b[i];
|
||||
@ -74,7 +74,7 @@ ccl_device_inline void math_vector_scale(float *a, float b, int n)
|
||||
a[i] *= b;
|
||||
}
|
||||
|
||||
ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n)
|
||||
ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n)
|
||||
{
|
||||
for(int i = 0; i < n; i++)
|
||||
a[i] = max(a[i], b[i]);
|
||||
@ -105,7 +105,7 @@ ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, f
|
||||
* The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
|
||||
ccl_device_inline void math_matrix_add_gramian(float *A,
|
||||
int n,
|
||||
float ccl_restrict_ptr v,
|
||||
const float *ccl_restrict v,
|
||||
float weight)
|
||||
{
|
||||
for(int row = 0; row < n; row++)
|
||||
@ -117,7 +117,7 @@ ccl_device_inline void math_matrix_add_gramian(float *A,
|
||||
* The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
|
||||
ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
|
||||
int n,
|
||||
float ccl_restrict_ptr v,
|
||||
const float *ccl_restrict v,
|
||||
float weight,
|
||||
int stride)
|
||||
{
|
||||
@ -342,32 +342,32 @@ ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
|
||||
|
||||
/* Add Gramian matrix of v to A.
|
||||
* The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
|
||||
ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight)
|
||||
ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
|
||||
{
|
||||
for(int row = 0; row < n; row++)
|
||||
for(int col = 0; col <= row; col++)
|
||||
MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
|
||||
}
|
||||
|
||||
ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
|
||||
ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
|
||||
{
|
||||
for(int i = 0; i < n; i++)
|
||||
V[i] = _mm_add_ps(V[i], a[i]);
|
||||
}
|
||||
|
||||
ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
|
||||
ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
|
||||
{
|
||||
for(int i = 0; i < n; i++)
|
||||
V[i] = _mm_mul_ps(V[i], a[i]);
|
||||
}
|
||||
|
||||
ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n)
|
||||
ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
|
||||
{
|
||||
for(int i = 0; i < n; i++)
|
||||
a[i] = _mm_max_ps(a[i], b[i]);
|
||||
}
|
||||
|
||||
ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B)
|
||||
ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
|
||||
{
|
||||
for(int row = 0; row < n; row++)
|
||||
for(int col = 0; col <= row; col++)
|
||||
|
Loading…
Reference in New Issue
Block a user