forked from bartvdbraak/blender
Cycles: Enable half float support (4 channels and 1 channel) on CUDA.
At the moment, OpenEXR half-float files benefit from this and now use only half of the memory. More space for HDRs! Part of my GSoC 2016.
This commit is contained in:
parent
5ac7ef873b
commit
9d236ac06c
@ -576,6 +576,7 @@ public:
|
||||
case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
|
||||
case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
|
||||
case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
|
||||
case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
|
||||
default: assert(0); return;
|
||||
}
|
||||
|
||||
@ -747,8 +748,12 @@ public:
|
||||
}
|
||||
|
||||
/* Resize once */
|
||||
if(flat_slot >= bindless_mapping.size())
|
||||
bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */
|
||||
if(flat_slot >= bindless_mapping.size()) {
|
||||
/* Allocate some slots in advance, to reduce amount
|
||||
* of re-allocations.
|
||||
*/
|
||||
bindless_mapping.resize(flat_slot + 128);
|
||||
}
|
||||
|
||||
/* Set Mapping and tag that we need to (re-)upload to device */
|
||||
bindless_mapping.get_data()[flat_slot] = (uint)tex;
|
||||
|
@ -31,6 +31,7 @@
|
||||
#endif
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <float.h>
|
||||
|
||||
/* Qualifier wrappers for different names on different devices */
|
||||
|
@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Float4 textures on various devices. */
|
||||
#if defined(__KERNEL_CPU__)
|
||||
# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU
|
||||
# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU
|
||||
#elif defined(__KERNEL_CUDA__)
|
||||
# if __CUDA_ARCH__ < 300
|
||||
# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA
|
||||
@ -277,8 +277,10 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
|
||||
}
|
||||
# else
|
||||
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
|
||||
/* float4, byte4 and half4 */
|
||||
if(id < TEX_START_FLOAT_CUDA_KEPLER)
|
||||
r = kernel_tex_image_interp_float4(tex, x, y);
|
||||
/* float, byte and half */
|
||||
else {
|
||||
float f = kernel_tex_image_interp_float(tex, x, y);
|
||||
r = make_float4(f, f, f, 1.0);
|
||||
|
@ -33,17 +33,21 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#else
|
||||
|
||||
/* CUDA has its own half data type, so there is no need to define it in that case. */
|
||||
#ifndef __KERNEL_CUDA__
|
||||
typedef unsigned short half;
|
||||
#endif
|
||||
|
||||
struct half4 { half x, y, z, w; };
|
||||
|
||||
#ifdef __KERNEL_CUDA__
|
||||
|
||||
ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
|
||||
{
|
||||
h[0] = __float2half_rn(f.x * scale);
|
||||
h[1] = __float2half_rn(f.y * scale);
|
||||
h[2] = __float2half_rn(f.z * scale);
|
||||
h[3] = __float2half_rn(f.w * scale);
|
||||
h[0] = __float2half(f.x * scale);
|
||||
h[1] = __float2half(f.y * scale);
|
||||
h[2] = __float2half(f.z * scale);
|
||||
h[3] = __float2half(f.w * scale);
|
||||
}
|
||||
|
||||
#else
|
||||
|
@ -52,10 +52,10 @@ CCL_NAMESPACE_BEGIN
|
||||
/* CUDA (Kepler, Geforce 6xx and above) */
|
||||
#define TEX_NUM_FLOAT4_CUDA_KEPLER 1024
|
||||
#define TEX_NUM_BYTE4_CUDA_KEPLER 1024
|
||||
#define TEX_NUM_HALF4_CUDA_KEPLER 0
|
||||
#define TEX_NUM_HALF4_CUDA_KEPLER 1024
|
||||
#define TEX_NUM_FLOAT_CUDA_KEPLER 1024
|
||||
#define TEX_NUM_BYTE_CUDA_KEPLER 1024
|
||||
#define TEX_NUM_HALF_CUDA_KEPLER 0
|
||||
#define TEX_NUM_HALF_CUDA_KEPLER 1024
|
||||
#define TEX_START_FLOAT4_CUDA_KEPLER 0
|
||||
#define TEX_START_BYTE4_CUDA_KEPLER TEX_NUM_FLOAT4_CUDA_KEPLER
|
||||
#define TEX_START_HALF4_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER)
|
||||
|
Loading…
Reference in New Issue
Block a user