blender/intern/cycles/kernel/kernel_primitive.h
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __KERNEL_ATTRIBUTE_CL__
#define __KERNEL_ATTRIBUTE_CL__

CCL_NAMESPACE_BEGIN

/* attribute lookup */
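
/* Look up an attribute by its unique id in the per-object attribute map.
 * On success the element type (vertex/corner/curve key/...) is written to
 * *elem and the offset into the attribute data arrays is returned;
 * ATTR_STD_NOT_FOUND is returned when the object has no such attribute. */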
__device_inline int find_attribute(KernelGlobals *kg, ShaderData *sd, uint id, AttributeElement *elem)
{
	if(sd->object == ~0)
		return (int)ATTR_STD_NOT_FOUND;

#ifdef __OSL__
	if (kg->osl) {
		return OSLShader::find_attribute(kg, sd, id, elem);
	}
	else
#endif
	{
		/* for SVM, find attribute by unique id */
		uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
#ifdef __HAIR__
		attr_offset = (sd->segment == ~0)? attr_offset: attr_offset + ATTR_PRIM_CURVE;
#endif
		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);

		while(attr_map.x != id) {
			attr_offset += ATTR_PRIM_TYPES;
			attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
		}

		*elem = (AttributeElement)attr_map.y;

		/* return result */
		return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
	}
}
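
/* Read an attribute value at the current shading point, dispatching to the
 * triangle or curve implementation depending on the primitive type. dx/dy
 * optionally receive screen-space derivatives and may be NULL. */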
__device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
{
#ifdef __HAIR__
	if(sd->segment == ~0)
#endif
		return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
#ifdef __HAIR__
	else
		return curve_attribute_float(kg, sd, elem, offset, dx, dy);
#endif
}

__device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
{
#ifdef __HAIR__
	if(sd->segment == ~0)
#endif
		return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
#ifdef __HAIR__
	else
		return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
#endif
}
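
/* Fetch the standard UV coordinate attribute. Returns (0, 0, 0) when the
 * primitive has no UV map; otherwise the UV with the third component set to
 * 1.0f, which also distinguishes a valid result from the all-zero fallback. */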
__device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
{
	AttributeElement elem_uv;
	int offset_uv = find_attribute(kg, sd, ATTR_STD_UV, &elem_uv);

	if(offset_uv == ATTR_STD_NOT_FOUND)
		return make_float3(0.0f, 0.0f, 0.0f);

	float3 uv = primitive_attribute_float3(kg, sd, elem_uv, offset_uv, NULL, NULL);
	uv.z = 1.0f;

	return uv;
}
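
/* Default tangent at the shading point: hair strands use the curve direction
 * (dPdu); meshes derive a spherical tangent from the generated texture
 * coordinates when available, otherwise fall back to the surface dPdu. */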
__device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
	if(sd->segment != ~0)
#ifdef __DPDU__
		return normalize(sd->dPdu);
#else
		return make_float3(0.0f, 0.0f, 0.0f);
#endif
#endif

	/* try to create spherical tangent from generated coordinates */
	AttributeElement attr_elem;
	int attr_offset = find_attribute(kg, sd, ATTR_STD_GENERATED, &attr_elem);

	if(attr_offset != ATTR_STD_NOT_FOUND) {
		float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
		object_normal_transform(kg, sd, &data);
		return cross(sd->N, normalize(cross(data, sd->N)));
	}
	else {
		/* otherwise use surface derivatives */
#ifdef __DPDU__
		return normalize(sd->dPdu);
#else
		return make_float3(0.0f, 0.0f, 0.0f);
#endif
	}
}

/* motion */
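
/* Compute a screen-space (raster) motion vector at the shading point.
 * Deformation motion is read from the pre/post position attributes, combined
 * with the object and camera motion transforms, and projected to the screen;
 * the result packs the pre and post 2D offsets relative to the current
 * projected position P. */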
__device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
{
	float3 motion_pre = sd->P, motion_post = sd->P;

	/* deformation motion */
	AttributeElement elem_pre, elem_post;
	int offset_pre = find_attribute(kg, sd, ATTR_STD_MOTION_PRE, &elem_pre);
	int offset_post = find_attribute(kg, sd, ATTR_STD_MOTION_POST, &elem_post);

	if(offset_pre != ATTR_STD_NOT_FOUND)
		motion_pre = primitive_attribute_float3(kg, sd, elem_pre, offset_pre, NULL, NULL);
	if(offset_post != ATTR_STD_NOT_FOUND)
		motion_post = primitive_attribute_float3(kg, sd, elem_post, offset_post, NULL, NULL);

	/* object motion. note that depending on the mesh having motion vectors, this
	 * transformation was set to match the world/object space of motion_pre/post */
	Transform tfm;

	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
	motion_pre = transform_point(&tfm, motion_pre);

	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
	motion_post = transform_point(&tfm, motion_post);

	float3 P;

	/* camera motion, for perspective/orthographic motion.pre/post will be a
	 * world-to-raster matrix, for panorama it's world-to-camera */
	if (kernel_data.cam.type != CAMERA_PANORAMA) {
		tfm = kernel_data.cam.worldtoraster;
		P = transform_perspective(&tfm, sd->P);

		tfm = kernel_data.cam.motion.pre;
		motion_pre = transform_perspective(&tfm, motion_pre);

		tfm = kernel_data.cam.motion.post;
		motion_post = transform_perspective(&tfm, motion_post);
	}
	else {
		tfm = kernel_data.cam.worldtocamera;
		P = normalize(transform_point(&tfm, sd->P));
		P = float2_to_float3(direction_to_panorama(kg, P));
		P.x *= kernel_data.cam.width;
		P.y *= kernel_data.cam.height;

		tfm = kernel_data.cam.motion.pre;
		motion_pre = normalize(transform_point(&tfm, motion_pre));
		motion_pre = float2_to_float3(direction_to_panorama(kg, motion_pre));
		motion_pre.x *= kernel_data.cam.width;
		motion_pre.y *= kernel_data.cam.height;

		tfm = kernel_data.cam.motion.post;
		motion_post = normalize(transform_point(&tfm, motion_post));
		motion_post = float2_to_float3(direction_to_panorama(kg, motion_post));
		motion_post.x *= kernel_data.cam.width;
		motion_post.y *= kernel_data.cam.height;
	}

	motion_pre = motion_pre - P;
	motion_post = P - motion_post;

	return make_float4(motion_pre.x, motion_pre.y, motion_post.x, motion_post.y);
}

CCL_NAMESPACE_END

#endif /* __KERNEL_ATTRIBUTE_CL__ */