Cycles: Implement vectorized NLM kernels for faster CPU denoising
This commit is contained in:
parent 6e08b01bd9
commit a0cc7bd961
@@ -184,11 +184,11 @@ public:
  KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
  KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;

  KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
  KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
  KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
  KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
  KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
  KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
  KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
  KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
  KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
  KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;

  KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
  KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
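Note: the KernelFunctions<...> members above are, in effect, Cycles' runtime dispatchers: each one stores a pointer to the widest SIMD build of a kernel that the host CPU supports, which is why call sites use the double-call form kernel()(args...) seen further down in this diff. A minimal standalone sketch of that dispatch idea (hypothetical names, not the actual Cycles implementation; assumes GCC/Clang for __builtin_cpu_supports):

// Illustrative sketch only: pick a kernel variant once at startup and store it
// behind a function pointer. Names are hypothetical, not the Cycles API.
#include <cstdio>

static bool cpu_has_avx2() { return __builtin_cpu_supports("avx2"); }  // GCC/Clang builtin

static void filter_nlm_blur_sse2(float *img, int n) { (void)img; (void)n; /* baseline build */ }
static void filter_nlm_blur_avx2(float *img, int n) { (void)img; (void)n; /* AVX2 build of the same kernel */ }

template<typename F> struct KernelFunctionsSketch {
  F kernel;
  KernelFunctionsSketch(F sse2, F avx2) { kernel = cpu_has_avx2() ? avx2 : sse2; }
  F operator()() const { return kernel; }  // call site reads: funcs()(args...)
};

int main() {
  KernelFunctionsSketch<void (*)(float *, int)> blur(filter_nlm_blur_sse2, filter_nlm_blur_avx2);
  float image[16] = {0};
  blur()(image, 16);  // dispatches to the variant picked at construction
  printf("dispatched\n");
}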
@@ -475,6 +475,7 @@ public:
    float *blurDifference = temporary_mem;
    float *difference = temporary_mem + task->buffer.pass_stride;
    float *weightAccum = temporary_mem + 2*task->buffer.pass_stride;
    float *temp_image = temporary_mem + 3*task->buffer.pass_stride;

    memset(weightAccum, 0, sizeof(float)*w*h);
    memset((float*) out_ptr, 0, sizeof(float)*w*h);
@@ -499,6 +500,7 @@ public:
        filter_nlm_update_output_kernel()(dx, dy,
                                          blurDifference,
                                          (float*) image_ptr,
                                          temp_image,
                                          (float*) out_ptr,
                                          weightAccum,
                                          local_rect,
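For context: the pointers in the two hunks above carve the CPU device's single temporary_mem allocation into consecutive image-sized planes, one pass_stride apart, and temp_image is the extra plane this commit adds as scratch space for the new horizontal blur inside the update-output kernel. A small illustrative sketch of that layout (names are hypothetical, not the Cycles API):

// Sketch of the temporary-plane layout, assuming pass_stride is the number of
// floats in one image-sized plane. Illustrative only, not Cycles code.
#include <vector>
#include <cstddef>

struct NLMTemporaries {
  float *blur_difference;  // plane 0: blurred per-pixel differences
  float *difference;       // plane 1: raw per-pixel differences
  float *weight_accum;     // plane 2: running sum of weights per pixel
  float *temp_image;       // plane 3: new in this commit, scratch for the horizontal blur
};

NLMTemporaries partition(float *temporary_mem, std::size_t pass_stride) {
  // Four consecutive planes inside one allocation; mirrors the offsets
  // temporary_mem + 0/1/2/3 * pass_stride in the hunk above.
  return {temporary_mem,
          temporary_mem + pass_stride,
          temporary_mem + 2 * pass_stride,
          temporary_mem + 3 * pass_stride};
}

int main() {
  const std::size_t pass_stride = 1920 * 1080;    // example plane size
  std::vector<float> mem(4 * pass_stride, 0.0f);  // matches num_layers = 4 on the CPU path below
  NLMTemporaries t = partition(mem.data(), pass_stride);
  (void)t;
}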
@@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer()
  buffer.mem.alloc_to_device(mem_size, false);

  /* CPUs process shifts sequentially while GPUs process them in parallel. */
  int num_shifts = 1;
  int num_layers;
  if(buffer.gpu_temporary_mem) {
    /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
    int max_radius = max(radius, 6);
    num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
    int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
    num_layers = 2*num_shifts + 1;
  }
  else {
    num_layers = 4;
  }
  /* Allocate two layers per shift as well as one for the weight accumulation. */
  buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride);
  buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
}

void DenoisingTask::prefilter_shadowing()
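A quick sanity check of the sizes this change implies: on the GPU path every shift of the NLM search window keeps its own planes in flight, while the CPU path now works through shifts one at a time and only ever needs four planes, which is where the memory saving comes from. A short arithmetic sketch (the radius value is just an example, not from the commit):

// Illustrative check of the allocation sizes implied above.
#include <cstdio>
#include <algorithm>

int main() {
  int radius = 8;                        // example user-facing denoise radius
  int max_radius = std::max(radius, 6);  // shadow prefiltering needs at least 6
  int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
  int gpu_layers = 2 * num_shifts + 1;   // two planes per shift plus weight accumulation
  int cpu_layers = 4;                    // shifts processed sequentially
  printf("GPU: %d shifts -> %d planes, CPU: %d planes\n", num_shifts, gpu_layers, cpu_layers);
  // radius 8 -> 17*17 = 289 shifts -> 579 planes on the GPU path, 4 on the CPU path.
}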
@@ -16,6 +16,9 @@

CCL_NAMESPACE_BEGIN

#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
#define load4_u(buf, ofs) load_float4((buf)+(ofs))

ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                         const float *ccl_restrict weight_image,
                                                         const float *ccl_restrict variance_image,
@@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                         float a,
                                                         float k_2)
{
  /* Strides need to be aligned to 16 bytes. */
  kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);

  int aligned_lowx = rect.x & (~3);
  const int numChannels = (channel_offset > 0)? 3 : 1;
  const float4 channel_fac = make_float4(1.0f / numChannels);

  for(int y = rect.y; y < rect.w; y++) {
    for(int x = rect.x; x < rect.z; x++) {
      float diff = 0.0f;
      int numChannels = channel_offset? 3 : 1;
      for(int c = 0; c < numChannels; c++) {
        float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
        float pvar = variance_image[c*channel_offset + y*stride + x];
        float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
        diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
    int idx_p = y*stride + aligned_lowx;
    int idx_q = (y+dy)*stride + aligned_lowx + dx;
    for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
      float4 diff = make_float4(0.0f);
      for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
        /* idx_p is guaranteed to be aligned, but idx_q isn't. */
        float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
        float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
        float4 cdiff = color_p - color_q;
        float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
        float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
        diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
      }
      if(numChannels > 1) {
        diff *= 1.0f/numChannels;
      }
      difference_image[y*stride + x] = diff;
      load4_a(difference_image, idx_p) = diff*channel_fac;
    }
  }
}
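The load4_a/load4_u split above is the core of the vectorization: the destination index idx_p starts at aligned_lowx and both stride and channel_offset are asserted to be multiples of 4, so it stays 16-byte aligned, while idx_q is shifted by an arbitrary dx and therefore needs an unaligned load. A standalone illustration of the same idea using raw SSE intrinsics (assumes x86 with SSE; these helpers are stand-ins, not the Cycles macros):

// Aligned vs. unaligned 4-float loads, mirroring the load4_a / load4_u roles.
#include <immintrin.h>
#include <cstdio>

// Aligned load: the address must be 16-byte aligned, as idx_p is by construction.
static inline __m128 load4_a_demo(const float *buf, int ofs) { return _mm_load_ps(buf + ofs); }
// Unaligned load: safe for idx_q, which is offset by an arbitrary dx.
static inline __m128 load4_u_demo(const float *buf, int ofs) { return _mm_loadu_ps(buf + ofs); }

int main() {
  alignas(16) float row[12];
  for (int i = 0; i < 12; i++) row[i] = float(i);

  __m128 p = load4_a_demo(row, 4);      // offset of 4 floats = 16 bytes, still aligned
  __m128 q = load4_u_demo(row, 4 + 3);  // shifted by dx = 3, no longer aligned
  __m128 diff = _mm_sub_ps(p, q);

  float out[4];
  _mm_storeu_ps(out, diff);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints -3 -3 -3 -3
}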
@@ -50,23 +61,61 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                              int stride,
                                              int f)
{
  int aligned_lowx = rect.x / 4;
  int aligned_highx = (rect.z + 3) / 4;
  int aligned_lowx = round_down(rect.x, 4);
  for(int y = rect.y; y < rect.w; y++) {
    const int low = max(rect.y, y-f);
    const int high = min(rect.w, y+f+1);
    for(int x = rect.x; x < rect.z; x++) {
      out_image[y*stride + x] = 0.0f;
    for(int x = aligned_lowx; x < rect.z; x += 4) {
      load4_a(out_image, y*stride + x) = make_float4(0.0f);
    }
    for(int y1 = low; y1 < high; y1++) {
      float4* out_image4 = (float4*)(out_image + y*stride);
      float4* difference_image4 = (float4*)(difference_image + y1*stride);
      for(int x = aligned_lowx; x < aligned_highx; x++) {
        out_image4[x] += difference_image4[x];
      for(int x = aligned_lowx; x < rect.z; x += 4) {
        load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
      }
    }
    for(int x = rect.x; x < rect.z; x++) {
      out_image[y*stride + x] *= 1.0f/(high - low);
    float fac = 1.0f/(high - low);
    for(int x = aligned_lowx; x < rect.z; x += 4) {
      load4_a(out_image, y*stride + x) *= fac;
    }
  }
}

ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
                                           float *out_image,
                                           int4 rect,
                                           int stride,
                                           int f)
{
  int aligned_lowx = round_down(rect.x, 4);
  for(int y = rect.y; y < rect.w; y++) {
    for(int x = aligned_lowx; x < rect.z; x += 4) {
      load4_a(out_image, y*stride + x) = make_float4(0.0f);
    }
  }

  for(int dx = -f; dx <= f; dx++) {
    aligned_lowx = round_down(rect.x - min(0, dx), 4);
    int highx = rect.z - max(0, dx);
    int4 lowx4 = make_int4(rect.x - min(0, dx));
    int4 highx4 = make_int4(rect.z - max(0, dx));
    for(int y = rect.y; y < rect.w; y++) {
      for(int x = aligned_lowx; x < highx; x += 4) {
        int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
        int4 active = (x4 >= lowx4) & (x4 < highx4);

        float4 diff = load4_u(difference_image, y*stride + x + dx);
        load4_a(out_image, y*stride + x) += mask(active, diff);
      }
    }
  }

  aligned_lowx = round_down(rect.x, 4);
  for(int y = rect.y; y < rect.w; y++) {
    for(int x = aligned_lowx; x < rect.z; x += 4) {
      float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
      float4 low = max(make_float4(rect.x), x4 - make_float4(f));
      float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
      load4_a(out_image, y*stride + x) *= rcp(high - low);
    }
  }
}
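nlm_blur_horizontal always steps four pixels at a time even though the valid range [lowx, highx) rarely starts or ends on a multiple of four; the per-lane compares build an "active" mask and mask(active, ...) zeroes the contribution of lanes that fall outside the range. A standalone sketch of that trick (assumes SSE2; mask_demo is an illustrative stand-in, since the diff does not show how Cycles implements mask()):

// Per-lane masking of a partial 4-wide vector.
#include <emmintrin.h>
#include <cstdio>

// Keep value only in lanes whose mask bits are all-ones, zero elsewhere.
static inline __m128 mask_demo(__m128i active, __m128 value) {
  return _mm_and_ps(_mm_castsi128_ps(active), value);
}

int main() {
  int x = 4, lowx = 5, highx = 7;  // only pixels 5 and 6 are valid in this batch
  __m128i x4 = _mm_add_epi32(_mm_set1_epi32(x), _mm_setr_epi32(0, 1, 2, 3));
  // (x4 >= lowx) & (x4 < highx): SSE compares yield 0xFFFFFFFF per true lane.
  __m128i ge = _mm_xor_si128(_mm_set1_epi32(-1), _mm_cmplt_epi32(x4, _mm_set1_epi32(lowx)));
  __m128i lt = _mm_cmplt_epi32(x4, _mm_set1_epi32(highx));
  __m128i active = _mm_and_si128(ge, lt);

  __m128 contribution = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  // values for pixels x..x+3
  float out[4];
  _mm_storeu_ps(out, mask_demo(active, contribution));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints 0 2 3 0
}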
@@ -77,25 +126,12 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
                                                     int stride,
                                                     int f)
{
  nlm_blur_horizontal(difference_image, out_image, rect, stride, f);

  int aligned_lowx = round_down(rect.x, 4);
  for(int y = rect.y; y < rect.w; y++) {
    for(int x = rect.x; x < rect.z; x++) {
      out_image[y*stride + x] = 0.0f;
    }
  }
  for(int dx = -f; dx <= f; dx++) {
    int pos_dx = max(0, dx);
    int neg_dx = min(0, dx);
    for(int y = rect.y; y < rect.w; y++) {
      for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
        out_image[y*stride + x] += difference_image[y*stride + x+dx];
      }
    }
  }
  for(int y = rect.y; y < rect.w; y++) {
    for(int x = rect.x; x < rect.z; x++) {
      const int low = max(rect.x, x-f);
      const int high = min(rect.z, x+f+1);
      out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
    for(int x = aligned_lowx; x < rect.z; x += 4) {
      load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
    }
  }
}
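With the horizontal pass factored out into nlm_blur_horizontal, the weight kernel reduces to one blur call plus a vectorized w = exp(-max(d, 0)). A single horizontal pass is enough because the (2f+1) x (2f+1) box average used here is separable: the vertical half is handled by kernel_filter_nlm_blur above and the horizontal half by nlm_blur_horizontal. A minimal scalar check of that separability identity (illustrative only, not Cycles code):

// Separable box blur: vertical pass then horizontal pass equals the direct 2D average.
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <algorithm>
#include <vector>

int main() {
  const int w = 8, h = 6, f = 2;
  std::vector<float> img(w * h), vert(w * h), sep(w * h);
  for (int i = 0; i < w * h; i++) img[i] = float(rand() % 100);

  // Vertical pass (the role of kernel_filter_nlm_blur).
  for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++) {
      int lo = std::max(0, y - f), hi = std::min(h, y + f + 1);
      float s = 0.0f;
      for (int y1 = lo; y1 < hi; y1++) s += img[y1 * w + x];
      vert[y * w + x] = s / (hi - lo);
    }
  // Horizontal pass (the role of nlm_blur_horizontal).
  for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++) {
      int lo = std::max(0, x - f), hi = std::min(w, x + f + 1);
      float s = 0.0f;
      for (int x1 = lo; x1 < hi; x1++) s += vert[y * w + x1];
      sep[y * w + x] = s / (hi - lo);
    }
  // Direct 2D box average for comparison.
  float max_err = 0.0f;
  for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++) {
      int ylo = std::max(0, y - f), yhi = std::min(h, y + f + 1);
      int xlo = std::max(0, x - f), xhi = std::min(w, x + f + 1);
      float s = 0.0f;
      for (int y1 = ylo; y1 < yhi; y1++)
        for (int x1 = xlo; x1 < xhi; x1++) s += img[y1 * w + x1];
      max_err = std::max(max_err, std::fabs(s / float((yhi - ylo) * (xhi - xlo)) - sep[y * w + x]));
    }
  printf("max difference, separable vs direct: %g\n", max_err);  // near zero (rounding only)
}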
@@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                       const float *ccl_restrict difference_image,
                                                       const float *ccl_restrict image,
                                                       float *temp_image,
                                                       float *out_image,
                                                       float *accum_image,
                                                       int4 rect,
                                                       int stride,
                                                       int f)
{
  nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);

  int aligned_lowx = round_down(rect.x, 4);
  for(int y = rect.y; y < rect.w; y++) {
    for(int x = rect.x; x < rect.z; x++) {
      const int low = max(rect.x, x-f);
      const int high = min(rect.z, x+f+1);
      float sum = 0.0f;
      for(int x1 = low; x1 < high; x1++) {
        sum += difference_image[y*stride + x1];
      }
      float weight = sum * (1.0f/(high - low));
      accum_image[y*stride + x] += weight;
      out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
    for(int x = aligned_lowx; x < rect.z; x += 4) {
      int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
      int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));

      int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);

      float4 weight = load4_a(temp_image, idx_p);
      load4_a(accum_image, idx_p) += mask(active, weight);

      float4 val = load4_u(image, idx_q);

      load4_a(out_image, idx_p) += mask(active, weight*val);
    }
  }
}
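What the vectorized loop accumulates is the usual NLM weighted average, just four pixels at a time: each shift (dx, dy) adds weight * shifted_pixel into out_image and weight into accum_image, and the normalize kernel divides the two at the end. A scalar sketch of that accumulation scheme (illustrative, not Cycles code):

// Weighted-average accumulation across shifts, followed by normalization.
#include <cstdio>

int main() {
  const int num_shifts = 3;
  float neighbor[num_shifts] = {0.9f, 1.1f, 2.0f};  // image value at p + (dx, dy)
  float weight[num_shifts]   = {0.8f, 0.7f, 0.1f};  // NLM weight for that shift

  float out = 0.0f, accum = 0.0f;
  for (int s = 0; s < num_shifts; s++) {
    out   += weight[s] * neighbor[s];  // what update_output adds to out_image
    accum += weight[s];                // what it adds to accum_image
  }
  printf("denoised value: %g\n", out / accum);  // what the normalize pass computes
}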
@@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
  }
}

#undef load4_a
#undef load4_u

CCL_NAMESPACE_END
@@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                         int dy,
                                                         float *difference_image,
                                                         float *image,
                                                         float *temp_image,
                                                         float *out_image,
                                                         float *accum_image,
                                                         int* rect,
@@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                         int dy,
                                                         float *difference_image,
                                                         float *image,
                                                         float *temp_image,
                                                         float *out_image,
                                                         float *accum_image,
                                                         int *rect,
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
#ifdef KERNEL_STUB
  STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
#else
  kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
  kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
#endif
}
@@ -220,6 +220,30 @@ ccl_device_inline float __uint_as_float(uint i)
  u.i = i;
  return u.f;
}

ccl_device_inline int4 __float4_as_int4(float4 f)
{
#ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(f.m128));
#else
  return make_int4(__float_as_int(f.x),
                   __float_as_int(f.y),
                   __float_as_int(f.z),
                   __float_as_int(f.w));
#endif
}

ccl_device_inline float4 __int4_as_float4(int4 i)
{
#ifdef __KERNEL_SSE__
  return float4(_mm_castsi128_ps(i.m128));
#else
  return make_float4(__int_as_float(i.x),
                     __int_as_float(i.y),
                     __int_as_float(i.z),
                     __int_as_float(i.w));
#endif
}
#endif  /* __KERNEL_OPENCL__ */

/* Versions of functions which are safe for fast math. */
@@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c)
  return a * b + c;
}

ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
{
  return a * b + c;
}

/*
 * FAST & APPROXIMATE MATH
 *
@@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x)
  return fast_exp2f(x / M_LN2_F);
}

#ifndef __KERNEL_GPU__
ccl_device float4 fast_exp2f4(float4 x)
{
  const float4 one = make_float4(1.0f);
  const float4 limit = make_float4(126.0f);
  x = clamp(x, -limit, limit);
  int4 m = make_int4(x);
  x = one - (one - (x - make_float4(m)));
  float4 r = make_float4(1.33336498402e-3f);
  r = madd4(x, r, make_float4(9.810352697968e-3f));
  r = madd4(x, r, make_float4(5.551834031939e-2f));
  r = madd4(x, r, make_float4(0.2401793301105f));
  r = madd4(x, r, make_float4(0.693144857883f));
  r = madd4(x, r, make_float4(1.0f));
  return __int4_as_float4(__float4_as_int4(r) + (m << 23));
}

ccl_device_inline float4 fast_expf4(float4 x)
{
  return fast_exp2f4(x / M_LN2_F);
}
#endif

ccl_device_inline float fast_exp10(float x)
{
  /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
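fast_exp2f4 follows the usual fast-exp recipe: clamp, split the argument into an integer part m and a fraction, evaluate a degree-5 polynomial for 2^fraction with madd4, then multiply by 2^m by adding m directly to the float's 8-bit exponent field through the new bit-cast helpers. A scalar walk-through of the same recipe, compared against libm (illustrative only; this sketch truncates when taking the integer part, while the SSE path of make_int4 below rounds to nearest, and the polynomial tolerates either):

// Scalar version of the range-reduction + polynomial + exponent-bit trick.
#include <cstdio>
#include <cstring>
#include <cmath>
#include <algorithm>

static float fast_exp2_scalar(float x) {
  x = std::min(std::max(x, -126.0f), 126.0f);  // keep the exponent addition in range
  int m = (int)x;                      // integer part (truncated here)
  x = 1.0f - (1.0f - (x - (float)m));  // fractional part, written as in the kernel
  float r = 1.33336498402e-3f;         // Horner evaluation of 2^fraction
  r = x * r + 9.810352697968e-3f;
  r = x * r + 5.551834031939e-2f;
  r = x * r + 0.2401793301105f;
  r = x * r + 0.693144857883f;
  r = x * r + 1.0f;
  int bits;
  std::memcpy(&bits, &r, sizeof(bits));
  bits += m << 23;                     // bump the exponent field by m, i.e. multiply by 2^m
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}

int main() {
  for (float x = -10.0f; x <= 10.0f; x += 2.5f)
    printf("x=%5.1f  fast=%.6g  exact=%.6g\n", x, fast_exp2_scalar(x), std::exp2(x));
}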
@@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b);
ccl_device_inline float4 operator-(const float4& a, const float4& b);
ccl_device_inline float4 operator+=(float4& a, const float4& b);
ccl_device_inline float4 operator*=(float4& a, const float4& b);
ccl_device_inline float4 operator*=(float4& a, float f);
ccl_device_inline float4 operator/=(float4& a, float f);

ccl_device_inline int4 operator<(const float4& a, const float4& b);
@@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a);
ccl_device_inline float4 safe_normalize(const float4& a);
ccl_device_inline float4 min(const float4& a, const float4& b);
ccl_device_inline float4 max(const float4& a, const float4& b);
ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx);
ccl_device_inline float4 fabs(const float4& a);
#endif  /* !__KERNEL_OPENCL__*/
@@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b)
  return a = a * b;
}

ccl_device_inline float4 operator*=(float4& a, float f)
{
  return a = a * f;
}

ccl_device_inline float4 operator/=(float4& a, float f)
{
  return a = a / f;
@@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
#endif
}

ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx)
{
  return min(max(a, mn), mx);
}

ccl_device_inline float4 fabs(const float4& a)
{
#ifdef __KERNEL_SSE__
@@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline int4 operator+(const int4& a, const int4& b);
ccl_device_inline int4 operator+=(int4& a, const int4& b);
ccl_device_inline int4 operator>>(const int4& a, int i);
ccl_device_inline int4 operator<<(const int4& a, int i);
ccl_device_inline int4 operator<(const int4& a, const int4& b);
ccl_device_inline int4 operator>=(const int4& a, const int4& b);
ccl_device_inline int4 operator&(const int4& a, const int4& b);
ccl_device_inline int4 min(int4 a, int4 b);
ccl_device_inline int4 max(int4 a, int4 b);
ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
@@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i)
#endif
}

ccl_device_inline int4 operator<<(const int4& a, int i)
{
#ifdef __KERNEL_SSE__
  return int4(_mm_slli_epi32(a.m128, i));
#else
  return make_int4(a.x << i, a.y << i, a.z << i, a.w << i);
#endif
}

ccl_device_inline int4 operator<(const int4& a, const int4& b)
{
#ifdef __KERNEL_SSE__
  return int4(_mm_cmplt_epi32(a.m128, b.m128));
#else
  return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
#endif
}

ccl_device_inline int4 operator>=(const int4& a, const int4& b)
{
#ifdef __KERNEL_SSE__
  return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
#else
  return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
#endif
}

ccl_device_inline int4 operator&(const int4& a, const int4& b)
{
#ifdef __KERNEL_SSE__
  return int4(_mm_and_si128(a.m128, b.m128));
#else
  return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
#endif
}

ccl_device_inline int4 min(int4 a, int4 b)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
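The operator>= above works around a gap in SSE2: there is no packed greater-or-equal compare for 32-bit integers, so a >= b is formed as the bitwise NOT of a < b, with NOT spelled as an XOR against all-ones. On the SSE path these compares return all-ones (-1) per true lane, which is what makes the bitwise masking in the NLM kernels possible. A standalone check (assumes SSE2; not Cycles code):

// Building >= from NOT(<) with packed 32-bit integer compares.
#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128i a = _mm_setr_epi32(1, 5, 7, 9);
  __m128i b = _mm_setr_epi32(2, 5, 8, 3);
  __m128i lt = _mm_cmplt_epi32(a, b);                  // all-ones where a < b
  __m128i ge = _mm_xor_si128(_mm_set1_epi32(-1), lt);  // invert: all-ones where a >= b
  int out[4];
  _mm_storeu_si128((__m128i *)out, ge);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // prints 0 -1 0 -1
}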
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_GPU__

struct float3;
struct float4;

struct ccl_try_align(16) int4 {
#ifdef __KERNEL_SSE__
@@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 {
ccl_device_inline int4 make_int4(int i);
ccl_device_inline int4 make_int4(int x, int y, int z, int w);
ccl_device_inline int4 make_int4(const float3& f);
ccl_device_inline int4 make_int4(const float4& f);
ccl_device_inline void print_int4(const char *label, const int4& a);
#endif  /* __KERNEL_GPU__ */
|
@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f)
|
||||
return a;
|
||||
}
|
||||
|
||||
ccl_device_inline int4 make_int4(const float4& f)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
int4 a(_mm_cvtps_epi32(f.m128));
|
||||
#else
|
||||
int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
|
||||
ccl_device_inline void print_int4(const char *label, const int4& a)
|
||||
{
|
||||
printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
|
||||
|
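One subtlety in the new make_int4(const float4&): the SSE path uses _mm_cvtps_epi32, which rounds to nearest under the default rounding mode, while the scalar fallback casts and therefore truncates toward zero, so the two branches can differ by one for negative or half-way inputs. A small standalone check of that difference (assumes SSE2; not Cycles code):

// Rounding (_mm_cvtps_epi32) vs truncating (_mm_cvttps_epi32 / C-style cast) conversion.
#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128 f = _mm_setr_ps(1.5f, 2.5f, -1.5f, -2.7f);
  int rounded[4], truncated[4];
  _mm_storeu_si128((__m128i *)rounded, _mm_cvtps_epi32(f));    // round to nearest (ties to even)
  _mm_storeu_si128((__m128i *)truncated, _mm_cvttps_epi32(f)); // truncate toward zero
  printf("rounded:   %d %d %d %d\n", rounded[0], rounded[1], rounded[2], rounded[3]);   // 2 2 -2 -3
  printf("truncated: %d %d %d %d\n", truncated[0], truncated[1], truncated[2], truncated[3]);  // 1 2 -1 -2
}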