forked from bartvdbraak/blender
Code refactor: make texture code more consistent between devices.
* Use common TextureInfo struct for all devices, except CUDA fermi. * Move image sampling code to kernels/*/kernel_*_image.h files. * Use arrays for data textures on Fermi too, so device_vector<Struct> works.
This commit is contained in:
parent
d013b56dde
commit
23098cda99
@ -26,6 +26,7 @@
|
||||
#include "util/util_stats.h"
|
||||
#include "util/util_string.h"
|
||||
#include "util/util_thread.h"
|
||||
#include "util/util_texture.h"
|
||||
#include "util/util_types.h"
|
||||
#include "util/util_vector.h"
|
||||
|
||||
|
@ -163,6 +163,9 @@ public:
|
||||
TaskPool task_pool;
|
||||
KernelGlobals kernel_globals;
|
||||
|
||||
device_vector<TextureInfo> texture_info;
|
||||
bool need_texture_info;
|
||||
|
||||
#ifdef WITH_OSL
|
||||
OSLGlobals osl_globals;
|
||||
#endif
|
||||
@ -235,6 +238,8 @@ public:
|
||||
VLOG(1) << "Will be using split kernel.";
|
||||
}
|
||||
|
||||
need_texture_info = false;
|
||||
|
||||
#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
|
||||
REGISTER_SPLIT_KERNEL(path_init);
|
||||
REGISTER_SPLIT_KERNEL(scene_intersect);
|
||||
@ -261,6 +266,7 @@ public:
|
||||
~CPUDevice()
|
||||
{
|
||||
task_pool.stop();
|
||||
tex_free(texture_info);
|
||||
}
|
||||
|
||||
virtual bool show_samples() const
|
||||
@ -268,6 +274,15 @@ public:
|
||||
return (TaskScheduler::num_threads() == 1);
|
||||
}
|
||||
|
||||
void load_texture_info()
|
||||
{
|
||||
if(need_texture_info) {
|
||||
tex_free(texture_info);
|
||||
tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT);
|
||||
need_texture_info = false;
|
||||
}
|
||||
}
|
||||
|
||||
void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
|
||||
{
|
||||
if(name) {
|
||||
@ -333,14 +348,47 @@ public:
|
||||
VLOG(1) << "Texture allocate: " << name << ", "
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ")";
|
||||
kernel_tex_copy(&kernel_globals,
|
||||
name,
|
||||
mem.data_pointer,
|
||||
mem.data_width,
|
||||
mem.data_height,
|
||||
mem.data_depth,
|
||||
interpolation,
|
||||
extension);
|
||||
|
||||
if(interpolation == INTERPOLATION_NONE) {
|
||||
/* Data texture. */
|
||||
kernel_tex_copy(&kernel_globals,
|
||||
name,
|
||||
mem.data_pointer,
|
||||
mem.data_width,
|
||||
mem.data_height,
|
||||
mem.data_depth,
|
||||
interpolation,
|
||||
extension);
|
||||
}
|
||||
else {
|
||||
/* Image Texture. */
|
||||
int flat_slot = 0;
|
||||
if(string_startswith(name, "__tex_image")) {
|
||||
int pos = string(name).rfind("_");
|
||||
flat_slot = atoi(name + pos + 1);
|
||||
}
|
||||
else {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
if(flat_slot >= texture_info.size()) {
|
||||
/* Allocate some slots in advance, to reduce amount
|
||||
* of re-allocations. */
|
||||
texture_info.resize(flat_slot + 128);
|
||||
}
|
||||
|
||||
TextureInfo& info = texture_info.get_data()[flat_slot];
|
||||
info.data = (uint64_t)mem.data_pointer;
|
||||
info.cl_buffer = 0;
|
||||
info.interpolation = interpolation;
|
||||
info.extension = extension;
|
||||
info.width = mem.data_width;
|
||||
info.height = mem.data_height;
|
||||
info.depth = mem.data_depth;
|
||||
|
||||
need_texture_info = true;
|
||||
}
|
||||
|
||||
mem.device_pointer = mem.data_pointer;
|
||||
mem.device_size = mem.memory_size();
|
||||
stats.mem_alloc(mem.device_size);
|
||||
@ -352,6 +400,7 @@ public:
|
||||
mem.device_pointer = 0;
|
||||
stats.mem_free(mem.device_size);
|
||||
mem.device_size = 0;
|
||||
need_texture_info = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -784,6 +833,9 @@ public:
|
||||
|
||||
void task_add(DeviceTask& task)
|
||||
{
|
||||
/* Load texture info. */
|
||||
load_texture_info();
|
||||
|
||||
/* split task into smaller ones */
|
||||
list<DeviceTask> tasks;
|
||||
|
||||
|
@ -129,7 +129,7 @@ public:
|
||||
CUcontext cuContext;
|
||||
CUmodule cuModule, cuFilterModule;
|
||||
map<device_ptr, bool> tex_interp_map;
|
||||
map<device_ptr, uint> tex_bindless_map;
|
||||
map<device_ptr, CUtexObject> tex_bindless_map;
|
||||
int cuDevId;
|
||||
int cuDevArchitecture;
|
||||
bool first_error;
|
||||
@ -145,8 +145,8 @@ public:
|
||||
map<device_ptr, PixelMem> pixel_mem_map;
|
||||
|
||||
/* Bindless Textures */
|
||||
device_vector<uint> bindless_mapping;
|
||||
bool need_bindless_mapping;
|
||||
device_vector<TextureInfo> texture_info;
|
||||
bool need_texture_info;
|
||||
|
||||
CUdeviceptr cuda_device_ptr(device_ptr mem)
|
||||
{
|
||||
@ -231,7 +231,7 @@ public:
|
||||
|
||||
split_kernel = NULL;
|
||||
|
||||
need_bindless_mapping = false;
|
||||
need_texture_info = false;
|
||||
|
||||
/* intialize */
|
||||
if(cuda_error(cuInit(0)))
|
||||
@ -274,7 +274,7 @@ public:
|
||||
delete split_kernel;
|
||||
|
||||
if(info.has_bindless_textures) {
|
||||
tex_free(bindless_mapping);
|
||||
tex_free(texture_info);
|
||||
}
|
||||
|
||||
cuda_assert(cuCtxDestroy(cuContext));
|
||||
@ -544,12 +544,12 @@ public:
|
||||
return (result == CUDA_SUCCESS);
|
||||
}
|
||||
|
||||
void load_bindless_mapping()
|
||||
void load_texture_info()
|
||||
{
|
||||
if(info.has_bindless_textures && need_bindless_mapping) {
|
||||
tex_free(bindless_mapping);
|
||||
tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
|
||||
need_bindless_mapping = false;
|
||||
if(info.has_bindless_textures && need_texture_info) {
|
||||
tex_free(texture_info);
|
||||
tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT);
|
||||
need_texture_info = false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -646,8 +646,7 @@ public:
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ")";
|
||||
|
||||
/* Check if we are on sm_30 or above.
|
||||
* We use arrays and bindles textures for storage there */
|
||||
/* Check if we are on sm_30 or above, for bindless textures. */
|
||||
bool has_bindless_textures = info.has_bindless_textures;
|
||||
|
||||
/* General variables for both architectures */
|
||||
@ -679,20 +678,10 @@ public:
|
||||
filter_mode = CU_TR_FILTER_MODE_LINEAR;
|
||||
}
|
||||
|
||||
CUarray_format_enum format;
|
||||
switch(mem.data_type) {
|
||||
case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
|
||||
case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
|
||||
case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
|
||||
case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
|
||||
case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
|
||||
default: assert(0); return;
|
||||
}
|
||||
|
||||
/* General variables for Fermi */
|
||||
CUtexref texref = NULL;
|
||||
|
||||
if(!has_bindless_textures) {
|
||||
if(!has_bindless_textures && interpolation != INTERPOLATION_NONE) {
|
||||
if(mem.data_depth > 1) {
|
||||
/* Kernel uses different bind names for 2d and 3d float textures,
|
||||
* so we have to adjust couple of things here.
|
||||
@ -711,41 +700,41 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
/* Data Storage */
|
||||
if(interpolation == INTERPOLATION_NONE) {
|
||||
if(has_bindless_textures) {
|
||||
mem_alloc(NULL, mem, MEM_READ_ONLY);
|
||||
mem_copy_to(mem);
|
||||
/* Data Storage */
|
||||
mem_alloc(NULL, mem, MEM_READ_ONLY);
|
||||
mem_copy_to(mem);
|
||||
|
||||
CUdeviceptr cumem;
|
||||
size_t cubytes;
|
||||
CUdeviceptr cumem;
|
||||
size_t cubytes;
|
||||
|
||||
cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
|
||||
cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
|
||||
|
||||
if(cubytes == 8) {
|
||||
/* 64 bit device pointer */
|
||||
uint64_t ptr = mem.device_pointer;
|
||||
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
|
||||
}
|
||||
else {
|
||||
/* 32 bit device pointer */
|
||||
uint32_t ptr = (uint32_t)mem.device_pointer;
|
||||
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
|
||||
}
|
||||
if(cubytes == 8) {
|
||||
/* 64 bit device pointer */
|
||||
uint64_t ptr = mem.device_pointer;
|
||||
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
|
||||
}
|
||||
else {
|
||||
mem_alloc(NULL, mem, MEM_READ_ONLY);
|
||||
mem_copy_to(mem);
|
||||
|
||||
cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
|
||||
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
|
||||
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
|
||||
/* 32 bit device pointer */
|
||||
uint32_t ptr = (uint32_t)mem.device_pointer;
|
||||
cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
|
||||
}
|
||||
}
|
||||
/* Texture Storage */
|
||||
else {
|
||||
/* Texture Storage */
|
||||
CUarray handle = NULL;
|
||||
|
||||
CUarray_format_enum format;
|
||||
switch(mem.data_type) {
|
||||
case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
|
||||
case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
|
||||
case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
|
||||
case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
|
||||
case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
|
||||
default: assert(0); return;
|
||||
}
|
||||
|
||||
if(mem.data_depth > 1) {
|
||||
CUDA_ARRAY3D_DESCRIPTOR desc;
|
||||
|
||||
@ -810,8 +799,8 @@ public:
|
||||
|
||||
stats.mem_alloc(size);
|
||||
|
||||
/* Bindless Textures - Kepler */
|
||||
if(has_bindless_textures) {
|
||||
/* Bindless Textures - Kepler */
|
||||
int flat_slot = 0;
|
||||
if(string_startswith(name, "__tex_image")) {
|
||||
int pos = string(name).rfind("_");
|
||||
@ -844,35 +833,39 @@ public:
|
||||
}
|
||||
|
||||
/* Resize once */
|
||||
if(flat_slot >= bindless_mapping.size()) {
|
||||
if(flat_slot >= texture_info.size()) {
|
||||
/* Allocate some slots in advance, to reduce amount
|
||||
* of re-allocations.
|
||||
*/
|
||||
bindless_mapping.resize(flat_slot + 128);
|
||||
* of re-allocations. */
|
||||
texture_info.resize(flat_slot + 128);
|
||||
}
|
||||
|
||||
/* Set Mapping and tag that we need to (re-)upload to device */
|
||||
bindless_mapping.get_data()[flat_slot] = (uint)tex;
|
||||
tex_bindless_map[mem.device_pointer] = (uint)tex;
|
||||
need_bindless_mapping = true;
|
||||
TextureInfo& info = texture_info.get_data()[flat_slot];
|
||||
info.data = (uint64_t)tex;
|
||||
info.cl_buffer = 0;
|
||||
info.interpolation = interpolation;
|
||||
info.extension = extension;
|
||||
info.width = mem.data_width;
|
||||
info.height = mem.data_height;
|
||||
info.depth = mem.data_depth;
|
||||
|
||||
tex_bindless_map[mem.device_pointer] = tex;
|
||||
need_texture_info = true;
|
||||
}
|
||||
/* Regular Textures - Fermi */
|
||||
else {
|
||||
/* Regular Textures - Fermi */
|
||||
cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
|
||||
cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
|
||||
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
|
||||
}
|
||||
}
|
||||
|
||||
/* Fermi, Data and Image Textures */
|
||||
if(!has_bindless_textures) {
|
||||
cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
|
||||
cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
|
||||
if(mem.data_depth > 1) {
|
||||
cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
|
||||
}
|
||||
cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
|
||||
cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
|
||||
if(mem.data_depth > 1) {
|
||||
cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
|
||||
}
|
||||
|
||||
cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
|
||||
cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
|
||||
}
|
||||
}
|
||||
|
||||
/* Fermi and Kepler */
|
||||
@ -888,8 +881,8 @@ public:
|
||||
|
||||
/* Free CUtexObject (Bindless Textures) */
|
||||
if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
|
||||
uint flat_slot = tex_bindless_map[mem.device_pointer];
|
||||
cuTexObjectDestroy(flat_slot);
|
||||
CUtexObject tex = tex_bindless_map[mem.device_pointer];
|
||||
cuTexObjectDestroy(tex);
|
||||
}
|
||||
|
||||
tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
|
||||
@ -1716,9 +1709,6 @@ public:
|
||||
if(task->type == DeviceTask::RENDER) {
|
||||
RenderTile tile;
|
||||
|
||||
/* Upload Bindless Mapping */
|
||||
load_bindless_mapping();
|
||||
|
||||
DeviceRequestedFeatures requested_features;
|
||||
if(use_split_kernel()) {
|
||||
if(!use_adaptive_compilation()) {
|
||||
@ -1759,9 +1749,6 @@ public:
|
||||
}
|
||||
}
|
||||
else if(task->type == DeviceTask::SHADER) {
|
||||
/* Upload Bindless Mapping */
|
||||
load_bindless_mapping();
|
||||
|
||||
shader(*task);
|
||||
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
@ -1784,9 +1771,12 @@ public:
|
||||
|
||||
void task_add(DeviceTask& task)
|
||||
{
|
||||
if(task.type == DeviceTask::FILM_CONVERT) {
|
||||
CUDAContextScope scope(this);
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
/* Load texture info. */
|
||||
load_texture_info();
|
||||
|
||||
if(task.type == DeviceTask::FILM_CONVERT) {
|
||||
/* must be done in main thread due to opengl access */
|
||||
film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
@ -545,15 +545,10 @@ private:
|
||||
MemoryManager memory_manager;
|
||||
friend class MemoryManager;
|
||||
|
||||
struct tex_info_t {
|
||||
uint buffer, padding;
|
||||
cl_ulong offset;
|
||||
uint width, height, depth, options;
|
||||
};
|
||||
static_assert_align(tex_info_t, 16);
|
||||
static_assert_align(TextureInfo, 16);
|
||||
|
||||
vector<tex_info_t> texture_descriptors;
|
||||
device_memory texture_descriptors_buffer;
|
||||
vector<TextureInfo> texture_info;
|
||||
device_memory texture_info_buffer;
|
||||
|
||||
struct Texture {
|
||||
Texture() {}
|
||||
|
@ -136,11 +136,11 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
|
||||
return;
|
||||
}
|
||||
|
||||
/* Allocate this right away so that texture_descriptors_buffer is placed at offset 0 in the device memory buffers */
|
||||
texture_descriptors.resize(1);
|
||||
texture_descriptors_buffer.resize(1);
|
||||
texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
|
||||
memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
|
||||
/* Allocate this right away so that texture_info_buffer is placed at offset 0 in the device memory buffers */
|
||||
texture_info.resize(1);
|
||||
texture_info_buffer.resize(1);
|
||||
texture_info_buffer.data_pointer = (device_ptr)&texture_info[0];
|
||||
memory_manager.alloc("texture_info", texture_info_buffer);
|
||||
|
||||
fprintf(stderr, "Device init success\n");
|
||||
device_initialized = true;
|
||||
@ -625,7 +625,7 @@ void OpenCLDeviceBase::flush_texture_buffers()
|
||||
|
||||
vector<texture_slot_t> texture_slots;
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
#define KERNEL_TEX(type, name) \
|
||||
if(textures.find(#name) != textures.end()) { \
|
||||
texture_slots.push_back(texture_slot_t(#name, num_slots)); \
|
||||
} \
|
||||
@ -647,55 +647,38 @@ void OpenCLDeviceBase::flush_texture_buffers()
|
||||
}
|
||||
|
||||
/* Realloc texture descriptors buffer. */
|
||||
memory_manager.free(texture_descriptors_buffer);
|
||||
memory_manager.free(texture_info_buffer);
|
||||
|
||||
texture_descriptors.resize(num_slots);
|
||||
texture_descriptors_buffer.resize(num_slots * sizeof(tex_info_t));
|
||||
texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
|
||||
texture_info.resize(num_slots);
|
||||
texture_info_buffer.resize(num_slots * sizeof(TextureInfo));
|
||||
texture_info_buffer.data_pointer = (device_ptr)&texture_info[0];
|
||||
|
||||
memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
|
||||
memory_manager.alloc("texture_info", texture_info_buffer);
|
||||
|
||||
/* Fill in descriptors */
|
||||
foreach(texture_slot_t& slot, texture_slots) {
|
||||
Texture& tex = textures[slot.name];
|
||||
|
||||
tex_info_t& info = texture_descriptors[slot.slot];
|
||||
TextureInfo& info = texture_info[slot.slot];
|
||||
|
||||
MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
|
||||
|
||||
info.offset = desc.offset;
|
||||
info.buffer = desc.device_buffer;
|
||||
info.data = desc.offset;
|
||||
info.cl_buffer = desc.device_buffer;
|
||||
|
||||
if(string_startswith(slot.name, "__tex_image")) {
|
||||
info.width = tex.mem->data_width;
|
||||
info.height = tex.mem->data_height;
|
||||
info.depth = tex.mem->data_depth;
|
||||
|
||||
info.options = 0;
|
||||
|
||||
if(tex.interpolation == INTERPOLATION_CLOSEST) {
|
||||
info.options |= (1 << 0);
|
||||
}
|
||||
|
||||
switch(tex.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
info.options |= (1 << 1);
|
||||
break;
|
||||
case EXTENSION_EXTEND:
|
||||
info.options |= (1 << 2);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
info.options |= (1 << 3);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
info.interpolation = tex.interpolation;
|
||||
info.extension = tex.extension;
|
||||
}
|
||||
}
|
||||
|
||||
/* Force write of descriptors. */
|
||||
memory_manager.free(texture_descriptors_buffer);
|
||||
memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
|
||||
memory_manager.free(texture_info_buffer);
|
||||
memory_manager.alloc("texture_info", texture_info_buffer);
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
|
||||
|
@ -117,14 +117,8 @@ public:
|
||||
ccl_constant KernelData *data;
|
||||
ccl_global char *buffers[8];
|
||||
|
||||
typedef struct _tex_info_t {
|
||||
uint buffer, padding;
|
||||
uint64_t offset;
|
||||
uint width, height, depth, options;
|
||||
} _tex_info_t;
|
||||
|
||||
#define KERNEL_TEX(type, ttype, name) \
|
||||
_tex_info_t name;
|
||||
#define KERNEL_TEX(type, name) \
|
||||
TextureInfo name;
|
||||
#include "kernel/kernel_textures.h"
|
||||
#undef KERNEL_TEX
|
||||
|
||||
|
@ -83,7 +83,6 @@ set(SRC_HEADERS
|
||||
kernel_emission.h
|
||||
kernel_film.h
|
||||
kernel_globals.h
|
||||
kernel_image_opencl.h
|
||||
kernel_jitter.h
|
||||
kernel_light.h
|
||||
kernel_math.h
|
||||
@ -119,10 +118,12 @@ set(SRC_KERNELS_CPU_HEADERS
|
||||
|
||||
set(SRC_KERNELS_CUDA_HEADERS
|
||||
kernels/cuda/kernel_config.h
|
||||
kernels/cuda/kernel_cuda_image.h
|
||||
)
|
||||
|
||||
set(SRC_KERNELS_OPENCL_HEADERS
|
||||
kernels/opencl/kernel_split_function.h
|
||||
kernels/opencl/kernel_opencl_image.h
|
||||
)
|
||||
|
||||
set(SRC_CLOSURE_HEADERS
|
||||
@ -507,6 +508,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteratio
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_opencl_image.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||
|
@ -29,21 +29,6 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Return position normalized to 0..1 in mesh bounds */
|
||||
|
||||
#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
|
||||
ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
|
||||
{
|
||||
float4 r;
|
||||
switch(id) {
|
||||
case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break;
|
||||
case 8: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_008, x, y, z); break;
|
||||
case 16: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_016, x, y, z); break;
|
||||
case 24: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_024, x, y, z); break;
|
||||
case 32: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_032, x, y, z); break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
#endif /* __KERNEL_CUDA__ */
|
||||
|
||||
ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
|
||||
const ShaderData *sd,
|
||||
float3 P)
|
||||
@ -65,22 +50,14 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
|
||||
ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
|
||||
{
|
||||
float3 P = volume_normalized_position(kg, sd, sd->P);
|
||||
#ifdef __KERNEL_CUDA__
|
||||
# if __CUDA_ARCH__ >= 300
|
||||
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
|
||||
float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
|
||||
float4 r = make_float4(f, f, f, 1.0f);
|
||||
# else
|
||||
float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
|
||||
# endif
|
||||
#elif defined(__KERNEL_OPENCL__)
|
||||
#ifdef __KERNEL_GPU__
|
||||
float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
|
||||
#else
|
||||
float4 r;
|
||||
if(sd->flag & SD_VOLUME_CUBIC)
|
||||
r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
|
||||
r = kernel_tex_image_interp_3d_ex(kg, desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
|
||||
else
|
||||
r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z);
|
||||
r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
|
||||
#endif
|
||||
|
||||
if(dx) *dx = 0.0f;
|
||||
@ -92,21 +69,14 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
|
||||
ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
|
||||
{
|
||||
float3 P = volume_normalized_position(kg, sd, sd->P);
|
||||
#ifdef __KERNEL_CUDA__
|
||||
# if __CUDA_ARCH__ >= 300
|
||||
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
|
||||
float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
|
||||
# else
|
||||
float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
|
||||
# endif
|
||||
#elif defined(__KERNEL_OPENCL__)
|
||||
#ifdef __KERNEL_GPU__
|
||||
float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
|
||||
#else
|
||||
float4 r;
|
||||
if(sd->flag & SD_VOLUME_CUBIC)
|
||||
r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
|
||||
r = kernel_tex_image_interp_3d_ex(kg, desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC);
|
||||
else
|
||||
r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z);
|
||||
r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
|
||||
#endif
|
||||
|
||||
if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
|
||||
|
@ -74,7 +74,7 @@ CCL_NAMESPACE_BEGIN
|
||||
* pointer lookup. */
|
||||
|
||||
template<typename T> struct texture {
|
||||
ccl_always_inline T fetch(int index)
|
||||
ccl_always_inline const T& fetch(int index)
|
||||
{
|
||||
kernel_assert(index >= 0 && index < width);
|
||||
return data[index];
|
||||
@ -112,449 +112,6 @@ template<typename T> struct texture {
|
||||
int width;
|
||||
};
|
||||
|
||||
template<typename T> struct texture_image {
|
||||
#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
|
||||
{ \
|
||||
u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
|
||||
u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \
|
||||
u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
|
||||
u[3] = (1.0f / 6.0f) * t * t * t; \
|
||||
} (void)0
|
||||
|
||||
ccl_always_inline float4 read(float4 r)
|
||||
{
|
||||
return r;
|
||||
}
|
||||
|
||||
ccl_always_inline float4 read(uchar4 r)
|
||||
{
|
||||
float f = 1.0f/255.0f;
|
||||
return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
|
||||
}
|
||||
|
||||
ccl_always_inline float4 read(uchar r)
|
||||
{
|
||||
float f = r*(1.0f/255.0f);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
|
||||
ccl_always_inline float4 read(float r)
|
||||
{
|
||||
/* TODO(dingto): Optimize this, so interpolation
|
||||
* happens on float instead of float4 */
|
||||
return make_float4(r, r, r, 1.0f);
|
||||
}
|
||||
|
||||
ccl_always_inline float4 read(half4 r)
|
||||
{
|
||||
return half4_to_float4(r);
|
||||
}
|
||||
|
||||
ccl_always_inline float4 read(half r)
|
||||
{
|
||||
float f = half_to_float(r);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
|
||||
ccl_always_inline int wrap_periodic(int x, int width)
|
||||
{
|
||||
x %= width;
|
||||
if(x < 0)
|
||||
x += width;
|
||||
return x;
|
||||
}
|
||||
|
||||
ccl_always_inline int wrap_clamp(int x, int width)
|
||||
{
|
||||
return clamp(x, 0, width-1);
|
||||
}
|
||||
|
||||
ccl_always_inline float frac(float x, int *ix)
|
||||
{
|
||||
int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
|
||||
*ix = i;
|
||||
return x - (float)i;
|
||||
}
|
||||
|
||||
ccl_always_inline float4 interp(float x, float y)
|
||||
{
|
||||
if(UNLIKELY(!data))
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
int ix, iy, nix, niy;
|
||||
|
||||
if(interpolation == INTERPOLATION_CLOSEST) {
|
||||
frac(x*(float)width, &ix);
|
||||
frac(y*(float)height, &iy);
|
||||
switch(extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
return read(data[ix + iy*width]);
|
||||
}
|
||||
else if(interpolation == INTERPOLATION_LINEAR) {
|
||||
float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
|
||||
switch(extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
|
||||
r += (1.0f - ty)*tx*read(data[nix + iy*width]);
|
||||
r += ty*(1.0f - tx)*read(data[ix + niy*width]);
|
||||
r += ty*tx*read(data[nix + niy*width]);
|
||||
|
||||
return r;
|
||||
}
|
||||
else {
|
||||
/* Bicubic b-spline interpolation. */
|
||||
float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
int pix, piy, nnix, nniy;
|
||||
switch(extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
|
||||
pix = wrap_periodic(ix-1, width);
|
||||
piy = wrap_periodic(iy-1, height);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
|
||||
nnix = wrap_periodic(ix+2, width);
|
||||
nniy = wrap_periodic(iy+2, height);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
pix = wrap_clamp(ix-1, width);
|
||||
piy = wrap_clamp(iy-1, height);
|
||||
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
|
||||
nnix = wrap_clamp(ix+2, width);
|
||||
nniy = wrap_clamp(iy+2, height);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
const int xc[4] = {pix, ix, nix, nnix};
|
||||
const int yc[4] = {width * piy,
|
||||
width * iy,
|
||||
width * niy,
|
||||
width * nniy};
|
||||
float u[4], v[4];
|
||||
/* Some helper macro to keep code reasonable size,
|
||||
* let compiler to inline all the matrix multiplications.
|
||||
*/
|
||||
#define DATA(x, y) (read(data[xc[x] + yc[y]]))
|
||||
#define TERM(col) \
|
||||
(v[col] * (u[0] * DATA(0, col) + \
|
||||
u[1] * DATA(1, col) + \
|
||||
u[2] * DATA(2, col) + \
|
||||
u[3] * DATA(3, col)))
|
||||
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
|
||||
/* Actual interpolation. */
|
||||
return TERM(0) + TERM(1) + TERM(2) + TERM(3);
|
||||
|
||||
#undef TERM
|
||||
#undef DATA
|
||||
}
|
||||
}
|
||||
|
||||
ccl_always_inline float4 interp_3d(float x, float y, float z)
|
||||
{
|
||||
return interp_3d_ex(x, y, z, interpolation);
|
||||
}
|
||||
|
||||
ccl_always_inline float4 interp_3d_ex_closest(float x, float y, float z)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
frac(x*(float)width, &ix);
|
||||
frac(y*(float)height, &iy);
|
||||
frac(z*(float)depth, &iz);
|
||||
|
||||
switch(extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
iz = wrap_periodic(iz, depth);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || z < 0.0f ||
|
||||
x > 1.0f || y > 1.0f || z > 1.0f)
|
||||
{
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
iz = wrap_clamp(iz, depth);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
return read(data[ix + iy*width + iz*width*height]);
|
||||
}
|
||||
|
||||
ccl_always_inline float4 interp_3d_ex_linear(float x, float y, float z)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
int nix, niy, niz;
|
||||
|
||||
float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
float tz = frac(z*(float)depth - 0.5f, &iz);
|
||||
|
||||
switch(extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
iz = wrap_periodic(iz, depth);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
niz = wrap_periodic(iz+1, depth);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || z < 0.0f ||
|
||||
x > 1.0f || y > 1.0f || z > 1.0f)
|
||||
{
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
niz = wrap_clamp(iz+1, depth);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
iz = wrap_clamp(iz, depth);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
float4 r;
|
||||
|
||||
r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
|
||||
r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
|
||||
r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
|
||||
r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
|
||||
|
||||
r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
|
||||
r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
|
||||
r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
|
||||
r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are
|
||||
* causing stack overflow issue in this function unless it is inlined.
|
||||
*
|
||||
* Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization
|
||||
* enabled.
|
||||
*/
|
||||
#ifdef __GNUC__
|
||||
ccl_always_inline
|
||||
#else
|
||||
ccl_never_inline
|
||||
#endif
|
||||
float4 interp_3d_ex_tricubic(float x, float y, float z)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
int nix, niy, niz;
|
||||
/* Tricubic b-spline interpolation. */
|
||||
const float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
const float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
const float tz = frac(z*(float)depth - 0.5f, &iz);
|
||||
int pix, piy, piz, nnix, nniy, nniz;
|
||||
|
||||
switch(extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
iz = wrap_periodic(iz, depth);
|
||||
|
||||
pix = wrap_periodic(ix-1, width);
|
||||
piy = wrap_periodic(iy-1, height);
|
||||
piz = wrap_periodic(iz-1, depth);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
niz = wrap_periodic(iz+1, depth);
|
||||
|
||||
nnix = wrap_periodic(ix+2, width);
|
||||
nniy = wrap_periodic(iy+2, height);
|
||||
nniz = wrap_periodic(iz+2, depth);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || z < 0.0f ||
|
||||
x > 1.0f || y > 1.0f || z > 1.0f)
|
||||
{
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
pix = wrap_clamp(ix-1, width);
|
||||
piy = wrap_clamp(iy-1, height);
|
||||
piz = wrap_clamp(iz-1, depth);
|
||||
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
niz = wrap_clamp(iz+1, depth);
|
||||
|
||||
nnix = wrap_clamp(ix+2, width);
|
||||
nniy = wrap_clamp(iy+2, height);
|
||||
nniz = wrap_clamp(iz+2, depth);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
iz = wrap_clamp(iz, depth);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
const int xc[4] = {pix, ix, nix, nnix};
|
||||
const int yc[4] = {width * piy,
|
||||
width * iy,
|
||||
width * niy,
|
||||
width * nniy};
|
||||
const int zc[4] = {width * height * piz,
|
||||
width * height * iz,
|
||||
width * height * niz,
|
||||
width * height * nniz};
|
||||
float u[4], v[4], w[4];
|
||||
|
||||
/* Some helper macro to keep code reasonable size,
|
||||
* let compiler to inline all the matrix multiplications.
|
||||
*/
|
||||
#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
|
||||
#define COL_TERM(col, row) \
|
||||
(v[col] * (u[0] * DATA(0, col, row) + \
|
||||
u[1] * DATA(1, col, row) + \
|
||||
u[2] * DATA(2, col, row) + \
|
||||
u[3] * DATA(3, col, row)))
|
||||
#define ROW_TERM(row) \
|
||||
(w[row] * (COL_TERM(0, row) + \
|
||||
COL_TERM(1, row) + \
|
||||
COL_TERM(2, row) + \
|
||||
COL_TERM(3, row)))
|
||||
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(w, tz);
|
||||
|
||||
/* Actual interpolation. */
|
||||
return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
|
||||
|
||||
#undef COL_TERM
|
||||
#undef ROW_TERM
|
||||
#undef DATA
|
||||
}
|
||||
|
||||
ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
|
||||
int interpolation = INTERPOLATION_LINEAR)
|
||||
{
|
||||
if(UNLIKELY(!data))
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
switch(interpolation) {
|
||||
case INTERPOLATION_CLOSEST:
|
||||
return interp_3d_ex_closest(x, y, z);
|
||||
case INTERPOLATION_LINEAR:
|
||||
return interp_3d_ex_linear(x, y, z);
|
||||
default:
|
||||
return interp_3d_ex_tricubic(x, y, z);
|
||||
}
|
||||
}
|
||||
|
||||
ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
|
||||
{
|
||||
width = width_;
|
||||
height = height_;
|
||||
depth = depth_;
|
||||
}
|
||||
|
||||
T *data;
|
||||
int interpolation;
|
||||
ExtensionType extension;
|
||||
int width, height, depth;
|
||||
#undef SET_CUBIC_SPLINE_WEIGHTS
|
||||
};
|
||||
|
||||
typedef texture<float4> texture_float4;
|
||||
typedef texture<float2> texture_float2;
|
||||
typedef texture<float> texture_float;
|
||||
typedef texture<uint> texture_uint;
|
||||
typedef texture<int> texture_int;
|
||||
typedef texture<uint4> texture_uint4;
|
||||
typedef texture<uchar4> texture_uchar4;
|
||||
typedef texture<uchar> texture_uchar;
|
||||
typedef texture_image<float> texture_image_float;
|
||||
typedef texture_image<uchar> texture_image_uchar;
|
||||
typedef texture_image<half> texture_image_half;
|
||||
typedef texture_image<float4> texture_image_float4;
|
||||
typedef texture_image<uchar4> texture_image_uchar4;
|
||||
typedef texture_image<half4> texture_image_half4;
|
||||
|
||||
/* Macros to handle different memory storage on different devices */
|
||||
|
||||
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
|
||||
@ -563,10 +120,6 @@ typedef texture_image<half4> texture_image_half4;
|
||||
#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
|
||||
#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
|
||||
|
||||
#define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y)
|
||||
#define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z)
|
||||
#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation)
|
||||
|
||||
#define kernel_data (kg->__data)
|
||||
|
||||
#ifdef __KERNEL_SSE2__
|
||||
|
@ -126,42 +126,16 @@ ccl_device_inline uint ccl_num_groups(uint d)
|
||||
|
||||
/* Textures */
|
||||
|
||||
typedef texture<float4, 1> texture_float4;
|
||||
typedef texture<float2, 1> texture_float2;
|
||||
typedef texture<float, 1> texture_float;
|
||||
typedef texture<uint, 1> texture_uint;
|
||||
typedef texture<int, 1> texture_int;
|
||||
typedef texture<uint4, 1> texture_uint4;
|
||||
typedef texture<uchar, 1> texture_uchar;
|
||||
typedef texture<uchar4, 1> texture_uchar4;
|
||||
/* Use arrays for regular data. This is a little slower than textures on Fermi,
|
||||
* but allows for cleaner code and we will stop supporting Fermi soon. */
|
||||
#define kernel_tex_fetch(t, index) t[(index)]
|
||||
|
||||
/* On Kepler (6xx) and above, we use Bindless Textures for images.
|
||||
* On Fermi cards (4xx and 5xx), we have to use regular textures. */
|
||||
#if __CUDA_ARCH__ < 300
|
||||
typedef texture<float4, 2> texture_image_float4;
|
||||
typedef texture<float4, 3> texture_image3d_float4;
|
||||
typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
|
||||
|
||||
/* Macros to handle different memory storage on different devices */
|
||||
|
||||
/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
|
||||
* On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
|
||||
*
|
||||
* Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
|
||||
* Using Arrays on Fermi turned out to be slower.*/
|
||||
|
||||
/* Fermi */
|
||||
#if __CUDA_ARCH__ < 300
|
||||
# define __KERNEL_CUDA_TEX_STORAGE__
|
||||
# define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
|
||||
|
||||
# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
|
||||
# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
|
||||
|
||||
/* Kepler */
|
||||
#else
|
||||
# define kernel_tex_fetch(t, index) t[(index)]
|
||||
|
||||
# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
|
||||
# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
|
||||
# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
|
||||
# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
|
||||
#endif
|
||||
|
||||
#define kernel_data __data
|
||||
|
@ -144,7 +144,7 @@
|
||||
|
||||
/* data lookup defines */
|
||||
#define kernel_data (*kg->data)
|
||||
#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)]
|
||||
#define kernel_tex_fetch(tex, index) ((const ccl_global tex##_t*)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))[(index)]
|
||||
|
||||
/* define NULL */
|
||||
#define NULL 0
|
||||
|
@ -46,14 +46,7 @@ struct Intersection;
|
||||
struct VolumeStep;
|
||||
|
||||
typedef struct KernelGlobals {
|
||||
vector<texture_image_float4> texture_float4_images;
|
||||
vector<texture_image_uchar4> texture_byte4_images;
|
||||
vector<texture_image_half4> texture_half4_images;
|
||||
vector<texture_image_float> texture_float_images;
|
||||
vector<texture_image_uchar> texture_byte_images;
|
||||
vector<texture_image_half> texture_half_images;
|
||||
|
||||
# define KERNEL_TEX(type, ttype, name) ttype name;
|
||||
# define KERNEL_TEX(type, name) texture<type> name;
|
||||
# define KERNEL_IMAGE_TEX(type, ttype, name)
|
||||
# include "kernel/kernel_textures.h"
|
||||
|
||||
@ -99,11 +92,7 @@ typedef struct KernelGlobals {
|
||||
Intersection hits_stack[64];
|
||||
} KernelGlobals;
|
||||
|
||||
# ifdef __KERNEL_CUDA_TEX_STORAGE__
|
||||
# define KERNEL_TEX(type, ttype, name) ttype name;
|
||||
# else
|
||||
# define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
|
||||
# endif
|
||||
# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
|
||||
# define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
|
||||
# include "kernel/kernel_textures.h"
|
||||
|
||||
@ -113,22 +102,16 @@ typedef struct KernelGlobals {
|
||||
|
||||
#ifdef __KERNEL_OPENCL__
|
||||
|
||||
# define KERNEL_TEX(type, ttype, name) \
|
||||
# define KERNEL_TEX(type, name) \
|
||||
typedef type name##_t;
|
||||
# include "kernel/kernel_textures.h"
|
||||
|
||||
typedef struct tex_info_t {
|
||||
uint buffer, padding;
|
||||
uint64_t offset;
|
||||
uint width, height, depth, options;
|
||||
} tex_info_t;
|
||||
|
||||
typedef ccl_addr_space struct KernelGlobals {
|
||||
ccl_constant KernelData *data;
|
||||
ccl_global char *buffers[8];
|
||||
|
||||
# define KERNEL_TEX(type, ttype, name) \
|
||||
tex_info_t name;
|
||||
# define KERNEL_TEX(type, name) \
|
||||
TextureInfo name;
|
||||
# include "kernel/kernel_textures.h"
|
||||
|
||||
# ifdef __SPLIT_KERNEL__
|
||||
@ -176,9 +159,9 @@ ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
|
||||
if(ccl_local_id(0) + ccl_local_id(1) == 0)
|
||||
# endif
|
||||
{
|
||||
ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0];
|
||||
ccl_global TextureInfo *info = (ccl_global TextureInfo*)kg->buffers[0];
|
||||
|
||||
# define KERNEL_TEX(type, ttype, name) \
|
||||
# define KERNEL_TEX(type, name) \
|
||||
kg->name = *(info++);
|
||||
# include "kernel/kernel_textures.h"
|
||||
}
|
||||
|
@ -15,7 +15,7 @@
|
||||
*/
|
||||
|
||||
#ifndef KERNEL_TEX
|
||||
# define KERNEL_TEX(type, ttype, name)
|
||||
# define KERNEL_TEX(type, name)
|
||||
#endif
|
||||
|
||||
#ifndef KERNEL_IMAGE_TEX
|
||||
@ -23,63 +23,65 @@
|
||||
#endif
|
||||
|
||||
/* bvh */
|
||||
KERNEL_TEX(float4, texture_float4, __bvh_nodes)
|
||||
KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes)
|
||||
KERNEL_TEX(float4, texture_float4, __prim_tri_verts)
|
||||
KERNEL_TEX(uint, texture_uint, __prim_tri_index)
|
||||
KERNEL_TEX(uint, texture_uint, __prim_type)
|
||||
KERNEL_TEX(uint, texture_uint, __prim_visibility)
|
||||
KERNEL_TEX(uint, texture_uint, __prim_index)
|
||||
KERNEL_TEX(uint, texture_uint, __prim_object)
|
||||
KERNEL_TEX(uint, texture_uint, __object_node)
|
||||
KERNEL_TEX(float2, texture_float2, __prim_time)
|
||||
KERNEL_TEX(float4, __bvh_nodes)
|
||||
KERNEL_TEX(float4, __bvh_leaf_nodes)
|
||||
KERNEL_TEX(float4, __prim_tri_verts)
|
||||
KERNEL_TEX(uint, __prim_tri_index)
|
||||
KERNEL_TEX(uint, __prim_type)
|
||||
KERNEL_TEX(uint, __prim_visibility)
|
||||
KERNEL_TEX(uint, __prim_index)
|
||||
KERNEL_TEX(uint, __prim_object)
|
||||
KERNEL_TEX(uint, __object_node)
|
||||
KERNEL_TEX(float2, __prim_time)
|
||||
|
||||
/* objects */
|
||||
KERNEL_TEX(float4, texture_float4, __objects)
|
||||
KERNEL_TEX(float4, texture_float4, __objects_vector)
|
||||
KERNEL_TEX(float4, __objects)
|
||||
KERNEL_TEX(float4, __objects_vector)
|
||||
|
||||
/* triangles */
|
||||
KERNEL_TEX(uint, texture_uint, __tri_shader)
|
||||
KERNEL_TEX(float4, texture_float4, __tri_vnormal)
|
||||
KERNEL_TEX(uint4, texture_uint4, __tri_vindex)
|
||||
KERNEL_TEX(uint, texture_uint, __tri_patch)
|
||||
KERNEL_TEX(float2, texture_float2, __tri_patch_uv)
|
||||
KERNEL_TEX(uint, __tri_shader)
|
||||
KERNEL_TEX(float4, __tri_vnormal)
|
||||
KERNEL_TEX(uint4, __tri_vindex)
|
||||
KERNEL_TEX(uint, __tri_patch)
|
||||
KERNEL_TEX(float2, __tri_patch_uv)
|
||||
|
||||
/* curves */
|
||||
KERNEL_TEX(float4, texture_float4, __curves)
|
||||
KERNEL_TEX(float4, texture_float4, __curve_keys)
|
||||
KERNEL_TEX(float4, __curves)
|
||||
KERNEL_TEX(float4, __curve_keys)
|
||||
|
||||
/* patches */
|
||||
KERNEL_TEX(uint, texture_uint, __patches)
|
||||
KERNEL_TEX(uint, __patches)
|
||||
|
||||
/* attributes */
|
||||
KERNEL_TEX(uint4, texture_uint4, __attributes_map)
|
||||
KERNEL_TEX(float, texture_float, __attributes_float)
|
||||
KERNEL_TEX(float4, texture_float4, __attributes_float3)
|
||||
KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4)
|
||||
KERNEL_TEX(uint4, __attributes_map)
|
||||
KERNEL_TEX(float, __attributes_float)
|
||||
KERNEL_TEX(float4, __attributes_float3)
|
||||
KERNEL_TEX(uchar4, __attributes_uchar4)
|
||||
|
||||
/* lights */
|
||||
KERNEL_TEX(float4, texture_float4, __light_distribution)
|
||||
KERNEL_TEX(float4, texture_float4, __light_data)
|
||||
KERNEL_TEX(float2, texture_float2, __light_background_marginal_cdf)
|
||||
KERNEL_TEX(float2, texture_float2, __light_background_conditional_cdf)
|
||||
KERNEL_TEX(float4, __light_distribution)
|
||||
KERNEL_TEX(float4, __light_data)
|
||||
KERNEL_TEX(float2, __light_background_marginal_cdf)
|
||||
KERNEL_TEX(float2, __light_background_conditional_cdf)
|
||||
|
||||
/* particles */
|
||||
KERNEL_TEX(float4, texture_float4, __particles)
|
||||
KERNEL_TEX(float4, __particles)
|
||||
|
||||
/* shaders */
|
||||
KERNEL_TEX(uint4, texture_uint4, __svm_nodes)
|
||||
KERNEL_TEX(uint, texture_uint, __shader_flag)
|
||||
KERNEL_TEX(uint, texture_uint, __object_flag)
|
||||
KERNEL_TEX(uint4, __svm_nodes)
|
||||
KERNEL_TEX(uint, __shader_flag)
|
||||
KERNEL_TEX(uint, __object_flag)
|
||||
|
||||
/* lookup tables */
|
||||
KERNEL_TEX(float, texture_float, __lookup_table)
|
||||
KERNEL_TEX(float, __lookup_table)
|
||||
|
||||
/* sobol */
|
||||
KERNEL_TEX(uint, texture_uint, __sobol_directions)
|
||||
KERNEL_TEX(uint, __sobol_directions)
|
||||
|
||||
#ifdef __KERNEL_CUDA__
|
||||
# if __CUDA_ARCH__ < 300
|
||||
#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ >= 300
|
||||
/* image textures */
|
||||
KERNEL_TEX(TextureInfo, __texture_info)
|
||||
#else
|
||||
/* full-float image */
|
||||
KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
|
||||
KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008)
|
||||
@ -180,12 +182,7 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641)
|
||||
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649)
|
||||
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657)
|
||||
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
|
||||
|
||||
# else
|
||||
/* bindless textures */
|
||||
KERNEL_TEX(uint, texture_uint, __bindless_mapping)
|
||||
# endif /* __CUDA_ARCH__ */
|
||||
#endif /* __KERNEL_CUDA__ */
|
||||
#endif /* defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 */
|
||||
|
||||
#undef KERNEL_TEX
|
||||
#undef KERNEL_IMAGE_TEX
|
||||
|
@ -84,130 +84,16 @@ void kernel_tex_copy(KernelGlobals *kg,
|
||||
if(0) {
|
||||
}
|
||||
|
||||
#define KERNEL_TEX(type, ttype, tname) \
|
||||
#define KERNEL_TEX(type, tname) \
|
||||
else if(strcmp(name, #tname) == 0) { \
|
||||
kg->tname.data = (type*)mem; \
|
||||
kg->tname.width = width; \
|
||||
}
|
||||
#define KERNEL_IMAGE_TEX(type, ttype, tname)
|
||||
#define KERNEL_IMAGE_TEX(type, tname)
|
||||
#include "kernel/kernel_textures.h"
|
||||
|
||||
else if(strstr(name, "__tex_image_float4")) {
|
||||
texture_image_float4 *tex = NULL;
|
||||
int id = atoi(name + strlen("__tex_image_float4_"));
|
||||
int array_index = kernel_tex_index(id);
|
||||
|
||||
if(array_index >= 0) {
|
||||
if(array_index >= kg->texture_float4_images.size()) {
|
||||
kg->texture_float4_images.resize(array_index+1);
|
||||
}
|
||||
tex = &kg->texture_float4_images[array_index];
|
||||
}
|
||||
|
||||
if(tex) {
|
||||
tex->data = (float4*)mem;
|
||||
tex->dimensions_set(width, height, depth);
|
||||
tex->interpolation = interpolation;
|
||||
tex->extension = extension;
|
||||
}
|
||||
}
|
||||
else if(strstr(name, "__tex_image_float")) {
|
||||
texture_image_float *tex = NULL;
|
||||
int id = atoi(name + strlen("__tex_image_float_"));
|
||||
int array_index = kernel_tex_index(id);
|
||||
|
||||
if(array_index >= 0) {
|
||||
if(array_index >= kg->texture_float_images.size()) {
|
||||
kg->texture_float_images.resize(array_index+1);
|
||||
}
|
||||
tex = &kg->texture_float_images[array_index];
|
||||
}
|
||||
|
||||
if(tex) {
|
||||
tex->data = (float*)mem;
|
||||
tex->dimensions_set(width, height, depth);
|
||||
tex->interpolation = interpolation;
|
||||
tex->extension = extension;
|
||||
}
|
||||
}
|
||||
else if(strstr(name, "__tex_image_byte4")) {
|
||||
texture_image_uchar4 *tex = NULL;
|
||||
int id = atoi(name + strlen("__tex_image_byte4_"));
|
||||
int array_index = kernel_tex_index(id);
|
||||
|
||||
if(array_index >= 0) {
|
||||
if(array_index >= kg->texture_byte4_images.size()) {
|
||||
kg->texture_byte4_images.resize(array_index+1);
|
||||
}
|
||||
tex = &kg->texture_byte4_images[array_index];
|
||||
}
|
||||
|
||||
if(tex) {
|
||||
tex->data = (uchar4*)mem;
|
||||
tex->dimensions_set(width, height, depth);
|
||||
tex->interpolation = interpolation;
|
||||
tex->extension = extension;
|
||||
}
|
||||
}
|
||||
else if(strstr(name, "__tex_image_byte")) {
|
||||
texture_image_uchar *tex = NULL;
|
||||
int id = atoi(name + strlen("__tex_image_byte_"));
|
||||
int array_index = kernel_tex_index(id);
|
||||
|
||||
if(array_index >= 0) {
|
||||
if(array_index >= kg->texture_byte_images.size()) {
|
||||
kg->texture_byte_images.resize(array_index+1);
|
||||
}
|
||||
tex = &kg->texture_byte_images[array_index];
|
||||
}
|
||||
|
||||
if(tex) {
|
||||
tex->data = (uchar*)mem;
|
||||
tex->dimensions_set(width, height, depth);
|
||||
tex->interpolation = interpolation;
|
||||
tex->extension = extension;
|
||||
}
|
||||
}
|
||||
else if(strstr(name, "__tex_image_half4")) {
|
||||
texture_image_half4 *tex = NULL;
|
||||
int id = atoi(name + strlen("__tex_image_half4_"));
|
||||
int array_index = kernel_tex_index(id);
|
||||
|
||||
if(array_index >= 0) {
|
||||
if(array_index >= kg->texture_half4_images.size()) {
|
||||
kg->texture_half4_images.resize(array_index+1);
|
||||
}
|
||||
tex = &kg->texture_half4_images[array_index];
|
||||
}
|
||||
|
||||
if(tex) {
|
||||
tex->data = (half4*)mem;
|
||||
tex->dimensions_set(width, height, depth);
|
||||
tex->interpolation = interpolation;
|
||||
tex->extension = extension;
|
||||
}
|
||||
}
|
||||
else if(strstr(name, "__tex_image_half")) {
|
||||
texture_image_half *tex = NULL;
|
||||
int id = atoi(name + strlen("__tex_image_half_"));
|
||||
int array_index = kernel_tex_index(id);
|
||||
|
||||
if(array_index >= 0) {
|
||||
if(array_index >= kg->texture_half_images.size()) {
|
||||
kg->texture_half_images.resize(array_index+1);
|
||||
}
|
||||
tex = &kg->texture_half_images[array_index];
|
||||
}
|
||||
|
||||
if(tex) {
|
||||
tex->data = (half*)mem;
|
||||
tex->dimensions_set(width, height, depth);
|
||||
tex->interpolation = interpolation;
|
||||
tex->extension = extension;
|
||||
}
|
||||
}
|
||||
else
|
||||
else {
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@ -17,70 +17,500 @@
|
||||
#ifndef __KERNEL_CPU_IMAGE_H__
|
||||
#define __KERNEL_CPU_IMAGE_H__
|
||||
|
||||
#ifdef __KERNEL_CPU__
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y)
|
||||
template<typename T> struct TextureInterpolator {
|
||||
#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
|
||||
{ \
|
||||
u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
|
||||
u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \
|
||||
u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
|
||||
u[3] = (1.0f / 6.0f) * t * t * t; \
|
||||
} (void)0
|
||||
|
||||
static ccl_always_inline float4 read(float4 r)
|
||||
{
|
||||
return r;
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 read(uchar4 r)
|
||||
{
|
||||
float f = 1.0f/255.0f;
|
||||
return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 read(uchar r)
|
||||
{
|
||||
float f = r*(1.0f/255.0f);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 read(float r)
|
||||
{
|
||||
/* TODO(dingto): Optimize this, so interpolation
|
||||
* happens on float instead of float4 */
|
||||
return make_float4(r, r, r, 1.0f);
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 read(half4 r)
|
||||
{
|
||||
return half4_to_float4(r);
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 read(half r)
|
||||
{
|
||||
float f = half_to_float(r);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
|
||||
static ccl_always_inline int wrap_periodic(int x, int width)
|
||||
{
|
||||
x %= width;
|
||||
if(x < 0)
|
||||
x += width;
|
||||
return x;
|
||||
}
|
||||
|
||||
static ccl_always_inline int wrap_clamp(int x, int width)
|
||||
{
|
||||
return clamp(x, 0, width-1);
|
||||
}
|
||||
|
||||
static ccl_always_inline float frac(float x, int *ix)
|
||||
{
|
||||
int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
|
||||
*ix = i;
|
||||
return x - (float)i;
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 interp(const TextureInfo& info, float x, float y)
|
||||
{
|
||||
if(UNLIKELY(!info.data))
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
const T *data = (const T*)info.data;
|
||||
int width = info.width;
|
||||
int height = info.height;
|
||||
int ix, iy, nix, niy;
|
||||
|
||||
if(info.interpolation == INTERPOLATION_CLOSEST) {
|
||||
frac(x*(float)width, &ix);
|
||||
frac(y*(float)height, &iy);
|
||||
switch(info.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
return read(data[ix + iy*width]);
|
||||
}
|
||||
else if(info.interpolation == INTERPOLATION_LINEAR) {
|
||||
float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
|
||||
switch(info.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
|
||||
r += (1.0f - ty)*tx*read(data[nix + iy*width]);
|
||||
r += ty*(1.0f - tx)*read(data[ix + niy*width]);
|
||||
r += ty*tx*read(data[nix + niy*width]);
|
||||
|
||||
return r;
|
||||
}
|
||||
else {
|
||||
/* Bicubic b-spline interpolation. */
|
||||
float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
int pix, piy, nnix, nniy;
|
||||
switch(info.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
|
||||
pix = wrap_periodic(ix-1, width);
|
||||
piy = wrap_periodic(iy-1, height);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
|
||||
nnix = wrap_periodic(ix+2, width);
|
||||
nniy = wrap_periodic(iy+2, height);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
pix = wrap_clamp(ix-1, width);
|
||||
piy = wrap_clamp(iy-1, height);
|
||||
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
|
||||
nnix = wrap_clamp(ix+2, width);
|
||||
nniy = wrap_clamp(iy+2, height);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
const int xc[4] = {pix, ix, nix, nnix};
|
||||
const int yc[4] = {width * piy,
|
||||
width * iy,
|
||||
width * niy,
|
||||
width * nniy};
|
||||
float u[4], v[4];
|
||||
/* Some helper macro to keep code reasonable size,
|
||||
* let compiler to inline all the matrix multiplications.
|
||||
*/
|
||||
#define DATA(x, y) (read(data[xc[x] + yc[y]]))
|
||||
#define TERM(col) \
|
||||
(v[col] * (u[0] * DATA(0, col) + \
|
||||
u[1] * DATA(1, col) + \
|
||||
u[2] * DATA(2, col) + \
|
||||
u[3] * DATA(3, col)))
|
||||
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
|
||||
/* Actual interpolation. */
|
||||
return TERM(0) + TERM(1) + TERM(2) + TERM(3);
|
||||
|
||||
#undef TERM
|
||||
#undef DATA
|
||||
}
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info, float x, float y, float z)
|
||||
{
|
||||
int width = info.width;
|
||||
int height = info.height;
|
||||
int depth = info.depth;
|
||||
int ix, iy, iz;
|
||||
|
||||
frac(x*(float)width, &ix);
|
||||
frac(y*(float)height, &iy);
|
||||
frac(z*(float)depth, &iz);
|
||||
|
||||
switch(info.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
iz = wrap_periodic(iz, depth);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || z < 0.0f ||
|
||||
x > 1.0f || y > 1.0f || z > 1.0f)
|
||||
{
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
iz = wrap_clamp(iz, depth);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
const T *data = (const T*)info.data;
|
||||
return read(data[ix + iy*width + iz*width*height]);
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info, float x, float y, float z)
|
||||
{
|
||||
int width = info.width;
|
||||
int height = info.height;
|
||||
int depth = info.depth;
|
||||
int ix, iy, iz;
|
||||
int nix, niy, niz;
|
||||
|
||||
float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
float tz = frac(z*(float)depth - 0.5f, &iz);
|
||||
|
||||
switch(info.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
iz = wrap_periodic(iz, depth);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
niz = wrap_periodic(iz+1, depth);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || z < 0.0f ||
|
||||
x > 1.0f || y > 1.0f || z > 1.0f)
|
||||
{
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
niz = wrap_clamp(iz+1, depth);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
iz = wrap_clamp(iz, depth);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
const T *data = (const T*)info.data;
|
||||
float4 r;
|
||||
|
||||
r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
|
||||
r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
|
||||
r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
|
||||
r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
|
||||
|
||||
r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
|
||||
r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
|
||||
r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
|
||||
r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are
|
||||
* causing stack overflow issue in this function unless it is inlined.
|
||||
*
|
||||
* Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization
|
||||
* enabled.
|
||||
*/
|
||||
#ifdef __GNUC__
|
||||
static ccl_always_inline
|
||||
#else
|
||||
static ccl_never_inline
|
||||
#endif
|
||||
float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z)
|
||||
{
|
||||
int width = info.width;
|
||||
int height = info.height;
|
||||
int depth = info.depth;
|
||||
int ix, iy, iz;
|
||||
int nix, niy, niz;
|
||||
/* Tricubic b-spline interpolation. */
|
||||
const float tx = frac(x*(float)width - 0.5f, &ix);
|
||||
const float ty = frac(y*(float)height - 0.5f, &iy);
|
||||
const float tz = frac(z*(float)depth - 0.5f, &iz);
|
||||
int pix, piy, piz, nnix, nniy, nniz;
|
||||
|
||||
switch(info.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
ix = wrap_periodic(ix, width);
|
||||
iy = wrap_periodic(iy, height);
|
||||
iz = wrap_periodic(iz, depth);
|
||||
|
||||
pix = wrap_periodic(ix-1, width);
|
||||
piy = wrap_periodic(iy-1, height);
|
||||
piz = wrap_periodic(iz-1, depth);
|
||||
|
||||
nix = wrap_periodic(ix+1, width);
|
||||
niy = wrap_periodic(iy+1, height);
|
||||
niz = wrap_periodic(iz+1, depth);
|
||||
|
||||
nnix = wrap_periodic(ix+2, width);
|
||||
nniy = wrap_periodic(iy+2, height);
|
||||
nniz = wrap_periodic(iz+2, depth);
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
if(x < 0.0f || y < 0.0f || z < 0.0f ||
|
||||
x > 1.0f || y > 1.0f || z > 1.0f)
|
||||
{
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
ATTR_FALLTHROUGH;
|
||||
case EXTENSION_EXTEND:
|
||||
pix = wrap_clamp(ix-1, width);
|
||||
piy = wrap_clamp(iy-1, height);
|
||||
piz = wrap_clamp(iz-1, depth);
|
||||
|
||||
nix = wrap_clamp(ix+1, width);
|
||||
niy = wrap_clamp(iy+1, height);
|
||||
niz = wrap_clamp(iz+1, depth);
|
||||
|
||||
nnix = wrap_clamp(ix+2, width);
|
||||
nniy = wrap_clamp(iy+2, height);
|
||||
nniz = wrap_clamp(iz+2, depth);
|
||||
|
||||
ix = wrap_clamp(ix, width);
|
||||
iy = wrap_clamp(iy, height);
|
||||
iz = wrap_clamp(iz, depth);
|
||||
break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
const int xc[4] = {pix, ix, nix, nnix};
|
||||
const int yc[4] = {width * piy,
|
||||
width * iy,
|
||||
width * niy,
|
||||
width * nniy};
|
||||
const int zc[4] = {width * height * piz,
|
||||
width * height * iz,
|
||||
width * height * niz,
|
||||
width * height * nniz};
|
||||
float u[4], v[4], w[4];
|
||||
|
||||
/* Some helper macro to keep code reasonable size,
|
||||
* let compiler to inline all the matrix multiplications.
|
||||
*/
|
||||
#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
|
||||
#define COL_TERM(col, row) \
|
||||
(v[col] * (u[0] * DATA(0, col, row) + \
|
||||
u[1] * DATA(1, col, row) + \
|
||||
u[2] * DATA(2, col, row) + \
|
||||
u[3] * DATA(3, col, row)))
|
||||
#define ROW_TERM(row) \
|
||||
(w[row] * (COL_TERM(0, row) + \
|
||||
COL_TERM(1, row) + \
|
||||
COL_TERM(2, row) + \
|
||||
COL_TERM(3, row)))
|
||||
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(w, tz);
|
||||
|
||||
/* Actual interpolation. */
|
||||
const T *data = (const T*)info.data;
|
||||
return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
|
||||
|
||||
#undef COL_TERM
|
||||
#undef ROW_TERM
|
||||
#undef DATA
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 interp_3d(const TextureInfo& info,
|
||||
float x, float y, float z,
|
||||
int interpolation = INTERPOLATION_LINEAR)
|
||||
{
|
||||
if(UNLIKELY(!info.data))
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
switch(interpolation) {
|
||||
case INTERPOLATION_CLOSEST:
|
||||
return interp_3d_closest(info, x, y, z);
|
||||
case INTERPOLATION_LINEAR:
|
||||
return interp_3d_linear(info, x, y, z);
|
||||
default:
|
||||
return interp_3d_tricubic(info, x, y, z);
|
||||
}
|
||||
}
|
||||
#undef SET_CUBIC_SPLINE_WEIGHTS
|
||||
};
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
|
||||
{
|
||||
switch(kernel_tex_type(tex)) {
|
||||
const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
|
||||
|
||||
switch(kernel_tex_type(id)) {
|
||||
case IMAGE_DATA_TYPE_HALF:
|
||||
return kg->texture_half_images[kernel_tex_index(tex)].interp(x, y);
|
||||
return TextureInterpolator<half>::interp(info, x, y);
|
||||
case IMAGE_DATA_TYPE_BYTE:
|
||||
return kg->texture_byte_images[kernel_tex_index(tex)].interp(x, y);
|
||||
return TextureInterpolator<uchar>::interp(info, x, y);
|
||||
case IMAGE_DATA_TYPE_FLOAT:
|
||||
return kg->texture_float_images[kernel_tex_index(tex)].interp(x, y);
|
||||
return TextureInterpolator<float>::interp(info, x, y);
|
||||
case IMAGE_DATA_TYPE_HALF4:
|
||||
return kg->texture_half4_images[kernel_tex_index(tex)].interp(x, y);
|
||||
return TextureInterpolator<half4>::interp(info, x, y);
|
||||
case IMAGE_DATA_TYPE_BYTE4:
|
||||
return kg->texture_byte4_images[kernel_tex_index(tex)].interp(x, y);
|
||||
return TextureInterpolator<uchar4>::interp(info, x, y);
|
||||
case IMAGE_DATA_TYPE_FLOAT4:
|
||||
default:
|
||||
return kg->texture_float4_images[kernel_tex_index(tex)].interp(x, y);
|
||||
return TextureInterpolator<float4>::interp(info, x, y);
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z)
|
||||
ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
|
||||
{
|
||||
switch(kernel_tex_type(tex)) {
|
||||
const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
|
||||
InterpolationType interp = (InterpolationType)info.interpolation;
|
||||
|
||||
switch(kernel_tex_type(id)) {
|
||||
case IMAGE_DATA_TYPE_HALF:
|
||||
return kg->texture_half_images[kernel_tex_index(tex)].interp_3d(x, y, z);
|
||||
return TextureInterpolator<half>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_BYTE:
|
||||
return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d(x, y, z);
|
||||
return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_FLOAT:
|
||||
return kg->texture_float_images[kernel_tex_index(tex)].interp_3d(x, y, z);
|
||||
return TextureInterpolator<float>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_HALF4:
|
||||
return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
|
||||
return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_BYTE4:
|
||||
return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
|
||||
return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_FLOAT4:
|
||||
default:
|
||||
return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d(x, y, z);
|
||||
return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp);
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation)
|
||||
ccl_device float4 kernel_tex_image_interp_3d_ex(KernelGlobals *kg, int id, float x, float y, float z, int interp)
|
||||
{
|
||||
switch(kernel_tex_type(tex)) {
|
||||
const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
|
||||
|
||||
switch(kernel_tex_type(id)) {
|
||||
case IMAGE_DATA_TYPE_HALF:
|
||||
return kg->texture_half_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
|
||||
return TextureInterpolator<half>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_BYTE:
|
||||
return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
|
||||
return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_FLOAT:
|
||||
return kg->texture_float_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
|
||||
return TextureInterpolator<float>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_HALF4:
|
||||
return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
|
||||
return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_BYTE4:
|
||||
return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
|
||||
return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp);
|
||||
case IMAGE_DATA_TYPE_FLOAT4:
|
||||
default:
|
||||
return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation);
|
||||
return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp);
|
||||
}
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif // __KERNEL_CPU__
|
||||
|
||||
|
||||
#endif // __KERNEL_CPU_IMAGE_H__
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "kernel/kernel_math.h"
|
||||
#include "kernel/kernel_types.h"
|
||||
#include "kernel/kernel_globals.h"
|
||||
#include "kernel/kernels/cuda/kernel_cuda_image.h"
|
||||
#include "kernel/kernel_film.h"
|
||||
#include "kernel/kernel_path.h"
|
||||
#include "kernel/kernel_path_branched.h"
|
||||
|
175
intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
Normal file
175
intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
Normal file
@ -0,0 +1,175 @@
|
||||
/*
|
||||
* Copyright 2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
|
||||
/* Kepler */
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp(void *kg, int id, float x, float y)
|
||||
{
|
||||
const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
|
||||
CUtexObject tex = (CUtexObject)info.data;
|
||||
|
||||
/* float4, byte4 and half4 */
|
||||
const int texture_type = kernel_tex_type(id);
|
||||
if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_BYTE4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_HALF4)
|
||||
{
|
||||
return tex2D<float4>(tex, x, y);
|
||||
}
|
||||
/* float, byte and half */
|
||||
else {
|
||||
float f = tex2D<float>(tex, x, y);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_3d(void *kg, int id, float x, float y, float z)
|
||||
{
|
||||
const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
|
||||
CUtexObject tex = (CUtexObject)info.data;
|
||||
|
||||
const int texture_type = kernel_tex_type(id);
|
||||
if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_BYTE4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_HALF4)
|
||||
{
|
||||
return tex3D<float4>(tex, x, y, z);
|
||||
}
|
||||
else {
|
||||
float f = tex3D<float>(tex, x, y, z);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Fermi */
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp(void *kg, int id, float x, float y)
|
||||
{
|
||||
float4 r;
|
||||
switch(id) {
|
||||
case 0: r = tex2D(__tex_image_float4_000, x, y); break;
|
||||
case 8: r = tex2D(__tex_image_float4_008, x, y); break;
|
||||
case 16: r = tex2D(__tex_image_float4_016, x, y); break;
|
||||
case 24: r = tex2D(__tex_image_float4_024, x, y); break;
|
||||
case 32: r = tex2D(__tex_image_float4_032, x, y); break;
|
||||
case 1: r = tex2D(__tex_image_byte4_001, x, y); break;
|
||||
case 9: r = tex2D(__tex_image_byte4_009, x, y); break;
|
||||
case 17: r = tex2D(__tex_image_byte4_017, x, y); break;
|
||||
case 25: r = tex2D(__tex_image_byte4_025, x, y); break;
|
||||
case 33: r = tex2D(__tex_image_byte4_033, x, y); break;
|
||||
case 41: r = tex2D(__tex_image_byte4_041, x, y); break;
|
||||
case 49: r = tex2D(__tex_image_byte4_049, x, y); break;
|
||||
case 57: r = tex2D(__tex_image_byte4_057, x, y); break;
|
||||
case 65: r = tex2D(__tex_image_byte4_065, x, y); break;
|
||||
case 73: r = tex2D(__tex_image_byte4_073, x, y); break;
|
||||
case 81: r = tex2D(__tex_image_byte4_081, x, y); break;
|
||||
case 89: r = tex2D(__tex_image_byte4_089, x, y); break;
|
||||
case 97: r = tex2D(__tex_image_byte4_097, x, y); break;
|
||||
case 105: r = tex2D(__tex_image_byte4_105, x, y); break;
|
||||
case 113: r = tex2D(__tex_image_byte4_113, x, y); break;
|
||||
case 121: r = tex2D(__tex_image_byte4_121, x, y); break;
|
||||
case 129: r = tex2D(__tex_image_byte4_129, x, y); break;
|
||||
case 137: r = tex2D(__tex_image_byte4_137, x, y); break;
|
||||
case 145: r = tex2D(__tex_image_byte4_145, x, y); break;
|
||||
case 153: r = tex2D(__tex_image_byte4_153, x, y); break;
|
||||
case 161: r = tex2D(__tex_image_byte4_161, x, y); break;
|
||||
case 169: r = tex2D(__tex_image_byte4_169, x, y); break;
|
||||
case 177: r = tex2D(__tex_image_byte4_177, x, y); break;
|
||||
case 185: r = tex2D(__tex_image_byte4_185, x, y); break;
|
||||
case 193: r = tex2D(__tex_image_byte4_193, x, y); break;
|
||||
case 201: r = tex2D(__tex_image_byte4_201, x, y); break;
|
||||
case 209: r = tex2D(__tex_image_byte4_209, x, y); break;
|
||||
case 217: r = tex2D(__tex_image_byte4_217, x, y); break;
|
||||
case 225: r = tex2D(__tex_image_byte4_225, x, y); break;
|
||||
case 233: r = tex2D(__tex_image_byte4_233, x, y); break;
|
||||
case 241: r = tex2D(__tex_image_byte4_241, x, y); break;
|
||||
case 249: r = tex2D(__tex_image_byte4_249, x, y); break;
|
||||
case 257: r = tex2D(__tex_image_byte4_257, x, y); break;
|
||||
case 265: r = tex2D(__tex_image_byte4_265, x, y); break;
|
||||
case 273: r = tex2D(__tex_image_byte4_273, x, y); break;
|
||||
case 281: r = tex2D(__tex_image_byte4_281, x, y); break;
|
||||
case 289: r = tex2D(__tex_image_byte4_289, x, y); break;
|
||||
case 297: r = tex2D(__tex_image_byte4_297, x, y); break;
|
||||
case 305: r = tex2D(__tex_image_byte4_305, x, y); break;
|
||||
case 313: r = tex2D(__tex_image_byte4_313, x, y); break;
|
||||
case 321: r = tex2D(__tex_image_byte4_321, x, y); break;
|
||||
case 329: r = tex2D(__tex_image_byte4_329, x, y); break;
|
||||
case 337: r = tex2D(__tex_image_byte4_337, x, y); break;
|
||||
case 345: r = tex2D(__tex_image_byte4_345, x, y); break;
|
||||
case 353: r = tex2D(__tex_image_byte4_353, x, y); break;
|
||||
case 361: r = tex2D(__tex_image_byte4_361, x, y); break;
|
||||
case 369: r = tex2D(__tex_image_byte4_369, x, y); break;
|
||||
case 377: r = tex2D(__tex_image_byte4_377, x, y); break;
|
||||
case 385: r = tex2D(__tex_image_byte4_385, x, y); break;
|
||||
case 393: r = tex2D(__tex_image_byte4_393, x, y); break;
|
||||
case 401: r = tex2D(__tex_image_byte4_401, x, y); break;
|
||||
case 409: r = tex2D(__tex_image_byte4_409, x, y); break;
|
||||
case 417: r = tex2D(__tex_image_byte4_417, x, y); break;
|
||||
case 425: r = tex2D(__tex_image_byte4_425, x, y); break;
|
||||
case 433: r = tex2D(__tex_image_byte4_433, x, y); break;
|
||||
case 441: r = tex2D(__tex_image_byte4_441, x, y); break;
|
||||
case 449: r = tex2D(__tex_image_byte4_449, x, y); break;
|
||||
case 457: r = tex2D(__tex_image_byte4_457, x, y); break;
|
||||
case 465: r = tex2D(__tex_image_byte4_465, x, y); break;
|
||||
case 473: r = tex2D(__tex_image_byte4_473, x, y); break;
|
||||
case 481: r = tex2D(__tex_image_byte4_481, x, y); break;
|
||||
case 489: r = tex2D(__tex_image_byte4_489, x, y); break;
|
||||
case 497: r = tex2D(__tex_image_byte4_497, x, y); break;
|
||||
case 505: r = tex2D(__tex_image_byte4_505, x, y); break;
|
||||
case 513: r = tex2D(__tex_image_byte4_513, x, y); break;
|
||||
case 521: r = tex2D(__tex_image_byte4_521, x, y); break;
|
||||
case 529: r = tex2D(__tex_image_byte4_529, x, y); break;
|
||||
case 537: r = tex2D(__tex_image_byte4_537, x, y); break;
|
||||
case 545: r = tex2D(__tex_image_byte4_545, x, y); break;
|
||||
case 553: r = tex2D(__tex_image_byte4_553, x, y); break;
|
||||
case 561: r = tex2D(__tex_image_byte4_561, x, y); break;
|
||||
case 569: r = tex2D(__tex_image_byte4_569, x, y); break;
|
||||
case 577: r = tex2D(__tex_image_byte4_577, x, y); break;
|
||||
case 585: r = tex2D(__tex_image_byte4_585, x, y); break;
|
||||
case 593: r = tex2D(__tex_image_byte4_593, x, y); break;
|
||||
case 601: r = tex2D(__tex_image_byte4_601, x, y); break;
|
||||
case 609: r = tex2D(__tex_image_byte4_609, x, y); break;
|
||||
case 617: r = tex2D(__tex_image_byte4_617, x, y); break;
|
||||
case 625: r = tex2D(__tex_image_byte4_625, x, y); break;
|
||||
case 633: r = tex2D(__tex_image_byte4_633, x, y); break;
|
||||
case 641: r = tex2D(__tex_image_byte4_641, x, y); break;
|
||||
case 649: r = tex2D(__tex_image_byte4_649, x, y); break;
|
||||
case 657: r = tex2D(__tex_image_byte4_657, x, y); break;
|
||||
case 665: r = tex2D(__tex_image_byte4_665, x, y); break;
|
||||
default: r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_3d(void *kg, int id, float x, float y, float z)
|
||||
{
|
||||
float4 r;
|
||||
switch(id) {
|
||||
case 0: r = tex3D(__tex_image_float4_3d_000, x, y, z); break;
|
||||
case 8: r = tex3D(__tex_image_float4_3d_008, x, y, z); break;
|
||||
case 16: r = tex3D(__tex_image_float4_3d_016, x, y, z); break;
|
||||
case 24: r = tex3D(__tex_image_float4_3d_024, x, y, z); break;
|
||||
case 32: r = tex3D(__tex_image_float4_3d_032, x, y, z); break;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -20,7 +20,7 @@
|
||||
#include "kernel/kernel_math.h"
|
||||
#include "kernel/kernel_types.h"
|
||||
#include "kernel/kernel_globals.h"
|
||||
#include "kernel/kernel_image_opencl.h"
|
||||
#include "kernel/kernels/opencl/kernel_opencl_image.h"
|
||||
|
||||
#include "kernel/kernel_film.h"
|
||||
|
||||
|
@ -14,23 +14,22 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/* For OpenCL we do manual lookup and interpolation. */
|
||||
|
||||
ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) {
|
||||
ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uint id) {
|
||||
const uint tex_offset = id
|
||||
#define KERNEL_TEX(type, ttype, name) + 1
|
||||
#define KERNEL_TEX(type, name) + 1
|
||||
#include "kernel/kernel_textures.h"
|
||||
;
|
||||
|
||||
return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset];
|
||||
return &((ccl_global TextureInfo*)kg->buffers[0])[tex_offset];
|
||||
}
|
||||
|
||||
#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)]
|
||||
#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)]
|
||||
|
||||
ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
|
||||
{
|
||||
const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
|
||||
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
|
||||
const int texture_type = kernel_tex_type(id);
|
||||
|
||||
/* Float4 */
|
||||
@ -76,35 +75,15 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
|
||||
return x - (float)i;
|
||||
}
|
||||
|
||||
ccl_device_inline uint kernel_decode_image_interpolation(uint info)
|
||||
{
|
||||
return (info & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
|
||||
}
|
||||
|
||||
ccl_device_inline uint kernel_decode_image_extension(uint info)
|
||||
{
|
||||
if(info & (1 << 1)) {
|
||||
return EXTENSION_REPEAT;
|
||||
}
|
||||
else if(info & (1 << 2)) {
|
||||
return EXTENSION_EXTEND;
|
||||
}
|
||||
else {
|
||||
return EXTENSION_CLIP;
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
|
||||
{
|
||||
const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
|
||||
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
|
||||
|
||||
uint width = info->width;
|
||||
uint height = info->height;
|
||||
uint offset = 0;
|
||||
|
||||
/* Decode image options. */
|
||||
uint interpolation = kernel_decode_image_interpolation(info->options);
|
||||
uint extension = kernel_decode_image_extension(info->options);
|
||||
uint interpolation = info->interpolation;
|
||||
uint extension = info->extension;
|
||||
|
||||
/* Actual sampling. */
|
||||
float4 r;
|
||||
@ -165,16 +144,14 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
|
||||
{
|
||||
const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
|
||||
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
|
||||
|
||||
uint width = info->width;
|
||||
uint height = info->height;
|
||||
uint offset = 0;
|
||||
uint depth = info->depth;
|
||||
|
||||
/* Decode image options. */
|
||||
uint interpolation = kernel_decode_image_interpolation(info->options);
|
||||
uint extension = kernel_decode_image_extension(info->options);
|
||||
uint interpolation = info->interpolation;
|
||||
uint extension = info->extension;
|
||||
|
||||
/* Actual sampling. */
|
||||
float4 r;
|
@ -962,7 +962,7 @@ bool OSLRenderServices::texture(ustring filename,
|
||||
|
||||
if(filename.length() && filename[0] == '@') {
|
||||
int slot = atoi(filename.c_str() + 1);
|
||||
float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
|
||||
float4 rgba = kernel_tex_image_interp(kg, slot, s, 1.0f - t);
|
||||
|
||||
result[0] = rgba[0];
|
||||
if(nchannels > 1)
|
||||
@ -1043,7 +1043,7 @@ bool OSLRenderServices::texture3d(ustring filename,
|
||||
bool status;
|
||||
if(filename.length() && filename[0] == '@') {
|
||||
int slot = atoi(filename.c_str() + 1);
|
||||
float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z);
|
||||
float4 rgba = kernel_tex_image_interp_3d(kg, slot, P.x, P.y, P.z);
|
||||
|
||||
result[0] = rgba[0];
|
||||
if(nchannels > 1)
|
||||
|
@ -29,7 +29,10 @@
|
||||
#endif
|
||||
|
||||
#ifdef __KERNEL_OPENCL__
|
||||
# include "kernel/kernel_image_opencl.h"
|
||||
# include "kernel/kernels/opencl/kernel_opencl_image.h"
|
||||
#endif
|
||||
#ifdef __KERNEL_CUDA__
|
||||
# include "kernel/kernels/cuda/kernel_cuda_image.h"
|
||||
#endif
|
||||
#ifdef __KERNEL_CPU__
|
||||
# include "kernel/kernels/cpu/kernel_cpu_image.h"
|
||||
|
@ -18,135 +18,7 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
|
||||
{
|
||||
#ifdef __KERNEL_CPU__
|
||||
float4 r = kernel_tex_image_interp(id, x, y);
|
||||
#elif defined(__KERNEL_OPENCL__)
|
||||
float4 r = kernel_tex_image_interp(kg, id, x, y);
|
||||
#else
|
||||
float4 r;
|
||||
|
||||
# if __CUDA_ARCH__ < 300
|
||||
/* not particularly proud of this massive switch, what are the
|
||||
* alternatives?
|
||||
* - use a single big 1D texture, and do our own lookup/filtering
|
||||
* - group by size and use a 3d texture, performance impact
|
||||
* - group into larger texture with some padding for correct lerp
|
||||
*
|
||||
* also note that cuda has a textures limit (128 for Fermi, 256 for Kepler),
|
||||
* and we cannot use all since we still need some for other storage */
|
||||
|
||||
switch(id) {
|
||||
case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break;
|
||||
case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break;
|
||||
case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break;
|
||||
case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break;
|
||||
case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break;
|
||||
case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break;
|
||||
case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break;
|
||||
case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break;
|
||||
case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break;
|
||||
case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break;
|
||||
case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break;
|
||||
case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break;
|
||||
case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break;
|
||||
case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break;
|
||||
case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break;
|
||||
case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break;
|
||||
case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
|
||||
case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
|
||||
case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
|
||||
case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
|
||||
case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
|
||||
case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
|
||||
case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
|
||||
case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
|
||||
case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break;
|
||||
case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break;
|
||||
case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break;
|
||||
case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break;
|
||||
case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break;
|
||||
case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break;
|
||||
case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break;
|
||||
case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break;
|
||||
case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break;
|
||||
case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break;
|
||||
case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break;
|
||||
case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break;
|
||||
case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break;
|
||||
case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break;
|
||||
case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break;
|
||||
case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break;
|
||||
case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break;
|
||||
case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break;
|
||||
case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break;
|
||||
case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break;
|
||||
case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break;
|
||||
case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break;
|
||||
case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break;
|
||||
case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break;
|
||||
case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break;
|
||||
case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break;
|
||||
case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break;
|
||||
case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break;
|
||||
case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break;
|
||||
case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break;
|
||||
case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break;
|
||||
case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break;
|
||||
case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break;
|
||||
case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break;
|
||||
case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break;
|
||||
case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break;
|
||||
case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break;
|
||||
case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break;
|
||||
case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break;
|
||||
case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break;
|
||||
case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break;
|
||||
case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break;
|
||||
case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break;
|
||||
case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break;
|
||||
case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break;
|
||||
case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break;
|
||||
case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break;
|
||||
case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break;
|
||||
case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break;
|
||||
case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break;
|
||||
case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break;
|
||||
case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break;
|
||||
case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break;
|
||||
case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break;
|
||||
case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break;
|
||||
case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break;
|
||||
case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break;
|
||||
case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break;
|
||||
case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break;
|
||||
case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break;
|
||||
case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break;
|
||||
case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break;
|
||||
case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break;
|
||||
case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break;
|
||||
case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break;
|
||||
default:
|
||||
kernel_assert(0);
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
# else
|
||||
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
|
||||
/* float4, byte4 and half4 */
|
||||
const int texture_type = kernel_tex_type(id);
|
||||
if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_BYTE4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_HALF4)
|
||||
{
|
||||
r = kernel_tex_image_interp_float4(tex, x, y);
|
||||
}
|
||||
/* float, byte and half */
|
||||
else {
|
||||
float f = kernel_tex_image_interp_float(tex, x, y);
|
||||
r = make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
# endif
|
||||
#endif
|
||||
|
||||
const float alpha = r.w;
|
||||
|
||||
if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
|
||||
|
@ -42,29 +42,8 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
|
||||
tfm.w = read_node_float(kg, offset);
|
||||
co = transform_point(&tfm, co);
|
||||
}
|
||||
float4 r;
|
||||
# if defined(__KERNEL_CUDA__)
|
||||
# if __CUDA_ARCH__ >= 300
|
||||
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
|
||||
const int texture_type = kernel_tex_type(id);
|
||||
if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_BYTE4 ||
|
||||
texture_type == IMAGE_DATA_TYPE_HALF4)
|
||||
{
|
||||
r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
|
||||
}
|
||||
else {
|
||||
float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
|
||||
r = make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
# else /* __CUDA_ARCH__ >= 300 */
|
||||
r = volume_image_texture_3d(id, co.x, co.y, co.z);
|
||||
# endif
|
||||
# elif defined(__KERNEL_OPENCL__)
|
||||
r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z);
|
||||
# else
|
||||
r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
|
||||
# endif /* __KERNEL_CUDA__ */
|
||||
|
||||
float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z);
|
||||
#else
|
||||
float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
#endif
|
||||
|
@ -46,12 +46,64 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#if defined (__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
|
||||
# define kernel_tex_type(tex) (tex < TEX_START_BYTE4_CUDA ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_BYTE4)
|
||||
# define kernel_tex_index(tex) (tex)
|
||||
#else
|
||||
# define kernel_tex_type(tex) (tex & IMAGE_DATA_TYPE_MASK)
|
||||
# define kernel_tex_index(tex) (tex >> IMAGE_DATA_TYPE_SHIFT)
|
||||
#endif
|
||||
|
||||
/* Interpolation types for textures
|
||||
* cuda also use texture space to store other objects */
|
||||
enum InterpolationType {
|
||||
INTERPOLATION_NONE = -1,
|
||||
INTERPOLATION_LINEAR = 0,
|
||||
INTERPOLATION_CLOSEST = 1,
|
||||
INTERPOLATION_CUBIC = 2,
|
||||
INTERPOLATION_SMART = 3,
|
||||
|
||||
INTERPOLATION_NUM_TYPES,
|
||||
};
|
||||
|
||||
/* Texture types
|
||||
* Since we store the type in the lower bits of a flat index,
|
||||
* the shift and bit mask constant below need to be kept in sync. */
|
||||
enum ImageDataType {
|
||||
IMAGE_DATA_TYPE_FLOAT4 = 0,
|
||||
IMAGE_DATA_TYPE_BYTE4 = 1,
|
||||
IMAGE_DATA_TYPE_HALF4 = 2,
|
||||
IMAGE_DATA_TYPE_FLOAT = 3,
|
||||
IMAGE_DATA_TYPE_BYTE = 4,
|
||||
IMAGE_DATA_TYPE_HALF = 5,
|
||||
|
||||
IMAGE_DATA_NUM_TYPES
|
||||
};
|
||||
|
||||
#define IMAGE_DATA_TYPE_SHIFT 3
|
||||
#define IMAGE_DATA_TYPE_MASK 0x7
|
||||
|
||||
/* Extension types for textures.
|
||||
*
|
||||
* Defines how the image is extrapolated past its original bounds. */
|
||||
enum ExtensionType {
|
||||
/* Cause the image to repeat horizontally and vertically. */
|
||||
EXTENSION_REPEAT = 0,
|
||||
/* Extend by repeating edge pixels of the image. */
|
||||
EXTENSION_EXTEND = 1,
|
||||
/* Clip to image size and set exterior pixels as transparent. */
|
||||
EXTENSION_CLIP = 2,
|
||||
|
||||
EXTENSION_NUM_TYPES,
|
||||
};
|
||||
|
||||
typedef struct TextureInfo {
|
||||
/* Pointer, offset or texture depending on device. */
|
||||
uint64_t data;
|
||||
/* Buffer number for OpenCL. */
|
||||
uint cl_buffer;
|
||||
/* Interpolation and extension type. */
|
||||
uint interpolation, extension;
|
||||
/* Dimensions. */
|
||||
uint width, height, depth;
|
||||
} TextureInfo;
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __UTIL_TEXTURE_H__ */
|
||||
|
@ -101,52 +101,6 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple)
|
||||
return (x / multiple) * multiple;
|
||||
}
|
||||
|
||||
/* Interpolation types for textures
|
||||
* cuda also use texture space to store other objects */
|
||||
enum InterpolationType {
|
||||
INTERPOLATION_NONE = -1,
|
||||
INTERPOLATION_LINEAR = 0,
|
||||
INTERPOLATION_CLOSEST = 1,
|
||||
INTERPOLATION_CUBIC = 2,
|
||||
INTERPOLATION_SMART = 3,
|
||||
|
||||
INTERPOLATION_NUM_TYPES,
|
||||
};
|
||||
|
||||
/* Texture types
|
||||
* Since we store the type in the lower bits of a flat index,
|
||||
* the shift and bit mask constant below need to be kept in sync.
|
||||
*/
|
||||
|
||||
enum ImageDataType {
|
||||
IMAGE_DATA_TYPE_FLOAT4 = 0,
|
||||
IMAGE_DATA_TYPE_BYTE4 = 1,
|
||||
IMAGE_DATA_TYPE_HALF4 = 2,
|
||||
IMAGE_DATA_TYPE_FLOAT = 3,
|
||||
IMAGE_DATA_TYPE_BYTE = 4,
|
||||
IMAGE_DATA_TYPE_HALF = 5,
|
||||
|
||||
IMAGE_DATA_NUM_TYPES
|
||||
};
|
||||
|
||||
#define IMAGE_DATA_TYPE_SHIFT 3
|
||||
#define IMAGE_DATA_TYPE_MASK 0x7
|
||||
|
||||
/* Extension types for textures.
|
||||
*
|
||||
* Defines how the image is extrapolated past its original bounds.
|
||||
*/
|
||||
enum ExtensionType {
|
||||
/* Cause the image to repeat horizontally and vertically. */
|
||||
EXTENSION_REPEAT = 0,
|
||||
/* Extend by repeating edge pixels of the image. */
|
||||
EXTENSION_EXTEND = 1,
|
||||
/* Clip to image size and set exterior pixels as transparent. */
|
||||
EXTENSION_CLIP = 2,
|
||||
|
||||
EXTENSION_NUM_TYPES,
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
/* Vectorized types declaration. */
|
||||
|
Loading…
Reference in New Issue
Block a user