forked from bartvdbraak/blender
Cycles: fixes to make CUDA 4.2 work. Compiling with it gave errors in shadows and
other places, mainly because instancing was not working, but issues were also found in procedural textures. The problem was --use_fast_math, which now seems to have much lower precision for some operations. This flag is now disabled and fast math functions are used selectively instead. No performance regression was found on a GTX 460 after this change.
This commit is contained in:
parent
c2be2fd408
commit
131de4352b
@ -304,7 +304,6 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated)
|
||||
void BlenderSync::sync_mesh_motion(BL::Object b_ob, Mesh *mesh, int motion)
|
||||
{
|
||||
/* todo: displacement, subdivision */
|
||||
BL::ID b_ob_data = b_ob.data();
|
||||
size_t size = mesh->verts.size();
|
||||
|
||||
/* skip objects without deforming modifiers. this is not a totally reliable,
|
||||
|
@ -135,7 +135,6 @@ void BlenderSync::sync_data(BL::SpaceView3D b_v3d, BL::Object b_override, const
|
||||
|
||||
void BlenderSync::sync_integrator()
|
||||
{
|
||||
BL::RenderSettings r = b_scene.render();
|
||||
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
|
||||
|
||||
experimental = (RNA_enum_get(&cscene, "feature_set") != 0);
|
||||
|
@ -259,7 +259,7 @@ public:
|
||||
|
||||
path_create_directories(cubin);
|
||||
|
||||
string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" --use_fast_math "
|
||||
string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
|
||||
"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
|
||||
nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
|
||||
|
||||
|
@ -114,7 +114,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${cuda_cubin}
|
||||
COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu --use_fast_math -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
|
||||
COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
|
||||
DEPENDS ${cuda_sources})
|
||||
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
|
@ -74,10 +74,10 @@ __device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray
|
||||
|
||||
__device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
|
||||
{
|
||||
if(*t != FLT_MAX) {
|
||||
Transform tfm = object_fetch_transform(kg, object, ray->time, OBJECT_TRANSFORM);
|
||||
|
||||
if(*t != FLT_MAX)
|
||||
*t *= len(transform_direction(&tfm, 1.0f/(*idir)));
|
||||
}
|
||||
|
||||
*P = ray->P;
|
||||
*idir = bvh_inverse_direction(ray->D);
|
||||
|
@ -62,5 +62,15 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
|
||||
|
||||
#define kernel_data __data
|
||||
|
||||
/* Use fast math functions
 *
 * Map the single precision math functions used by the kernel onto the
 * CUDA fast intrinsics, one function at a time.
 *
 * Each argument is parenthesized before the float cast, so that a call
 * such as cosf(a + b) or powf(n/2, y) converts the whole expression
 * instead of casting only its first operand. */

#define cosf(x) __cosf(((float)(x)))
#define sinf(x) __sinf(((float)(x)))
#define powf(x, y) __powf(((float)(x)), ((float)(y)))
#define tanf(x) __tanf(((float)(x)))
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))
|
||||
|
||||
#endif /* __KERNEL_COMPAT_CUDA_H__ */
|
||||
|
||||
|
@ -69,20 +69,20 @@ __device float3 equirectangular_to_direction(float u, float v)
|
||||
float theta = M_PI_F*(1.0f - v);
|
||||
|
||||
return make_float3(
|
||||
sin(theta)*cos(phi),
|
||||
sin(theta)*sin(phi),
|
||||
cos(theta));
|
||||
sinf(theta)*cosf(phi),
|
||||
sinf(theta)*sinf(phi),
|
||||
cosf(theta));
|
||||
}
|
||||
|
||||
/* Fisheye <-> Cartesian direction */
|
||||
|
||||
__device float2 direction_to_fisheye(float3 dir, float fov)
|
||||
{
|
||||
float r = atan2f(sqrt(dir.y*dir.y + dir.z*dir.z), dir.x) / fov;
|
||||
float phi = atan2(dir.z, dir.y);
|
||||
float r = atan2f(sqrtf(dir.y*dir.y + dir.z*dir.z), dir.x) / fov;
|
||||
float phi = atan2f(dir.z, dir.y);
|
||||
|
||||
float u = r * cos(phi) + 0.5f;
|
||||
float v = r * sin(phi) + 0.5f;
|
||||
float u = r * cosf(phi) + 0.5f;
|
||||
float v = r * sinf(phi) + 0.5f;
|
||||
|
||||
return make_float2(u, v);
|
||||
}
|
||||
@ -92,7 +92,7 @@ __device float3 fisheye_to_direction(float u, float v, float fov)
|
||||
u = (u - 0.5f) * 2.0f;
|
||||
v = (v - 0.5f) * 2.0f;
|
||||
|
||||
float r = sqrt(u*u + v*v);
|
||||
float r = sqrtf(u*u + v*v);
|
||||
|
||||
if(r > 1.0f)
|
||||
return make_float3(0.0f, 0.0f, 0.0f);
|
||||
@ -127,7 +127,7 @@ __device float3 fisheye_equisolid_to_direction(float u, float v, float lens, flo
|
||||
v = (v - 0.5f) * height;
|
||||
|
||||
float rmax = 2.0f * lens * sinf(fov * 0.25f);
|
||||
float r = sqrt(u*u + v*v);
|
||||
float r = sqrtf(u*u + v*v);
|
||||
|
||||
if(r > rmax)
|
||||
return make_float3(0.0f, 0.0f, 0.0f);
|
||||
@ -153,7 +153,7 @@ __device float3 mirrorball_to_direction(float u, float v)
|
||||
|
||||
dir.x = 2.0f*u - 1.0f;
|
||||
dir.z = 2.0f*v - 1.0f;
|
||||
dir.y = -sqrt(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f));
|
||||
dir.y = -sqrtf(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f));
|
||||
|
||||
/* reflection */
|
||||
float3 I = make_float3(0.0f, -1.0f, 0.0f);
|
||||
@ -166,7 +166,7 @@ __device float2 direction_to_mirrorball(float3 dir)
|
||||
/* inverse of mirrorball_to_direction */
|
||||
dir.y -= 1.0f;
|
||||
|
||||
float div = 2.0f*sqrt(max(-0.5f*dir.y, 0.0f));
|
||||
float div = 2.0f*sqrtf(max(-0.5f*dir.y, 0.0f));
|
||||
if(div > 0.0f)
|
||||
dir /= div;
|
||||
|
||||
|
@ -61,16 +61,20 @@ __device_inline float3 transform_perspective(const Transform *t, const float3 a)
|
||||
|
||||
__device_inline float3 transform_point(const Transform *t, const float3 a)
|
||||
{
|
||||
float4 b = make_float4(a.x, a.y, a.z, 1.0f);
|
||||
float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b));
|
||||
float3 c = make_float3(
|
||||
a.x*t->x.x + a.y*t->x.y + a.z*t->x.z + t->x.w,
|
||||
a.x*t->y.x + a.y*t->y.y + a.z*t->y.z + t->y.w,
|
||||
a.x*t->z.x + a.y*t->z.y + a.z*t->z.z + t->z.w);
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
__device_inline float3 transform_direction(const Transform *t, const float3 a)
|
||||
{
|
||||
float4 b = make_float4(a.x, a.y, a.z, 0.0f);
|
||||
float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b));
|
||||
float3 c = make_float3(
|
||||
a.x*t->x.x + a.y*t->x.y + a.z*t->x.z,
|
||||
a.x*t->y.x + a.y*t->y.y + a.z*t->y.z,
|
||||
a.x*t->z.x + a.y*t->z.y + a.z*t->z.z);
|
||||
|
||||
return c;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user