Fix #35665: more CUDA issues with recent kernel changes. Tested on sm_20, sm_21
and sm_30 cards, so hopefully it should all work now. Also includes some warning fixes related to nvcc compiler arguments; these should make no difference otherwise.
This commit is contained in:
parent
3d21bf9688
commit
37f92119e4
@ -129,9 +129,20 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
|
||||
set(cuda_cubin kernel_${arch}.cubin)
|
||||
|
||||
if(${arch} MATCHES "sm_1[0-9]")
|
||||
# sm_1x
|
||||
set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
|
||||
elseif(${arch} MATCHES "sm_2[0-9]")
|
||||
# sm_2x
|
||||
set(cuda_arch_flags "--maxrregcount=24")
|
||||
else()
|
||||
# sm_3x
|
||||
set(cuda_arch_flags "--maxrregcount=32")
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${cuda_cubin}
|
||||
COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
|
||||
COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
|
||||
DEPENDS ${cuda_sources})
|
||||
|
||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
|
||||
|
@ -146,7 +146,7 @@ __device_noinline float cmj_sample_1D(int s, int N, int p)
|
||||
return (x + jx)*invN;
|
||||
}
|
||||
|
||||
__device_noinline float2 cmj_sample_2D(int s, int N, int p)
|
||||
__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
|
||||
{
|
||||
int m = float_to_int(sqrtf(N));
|
||||
int n = (N + m - 1)/m;
|
||||
@ -173,7 +173,8 @@ __device_noinline float2 cmj_sample_2D(int s, int N, int p)
|
||||
float jx = cmj_randfloat(s, p * 0x967a889b);
|
||||
float jy = cmj_randfloat(s, p * 0x368cc8b7);
|
||||
|
||||
return make_float2((sx + (sy + jx)*invn)*invm, (s + jy)*invN);
|
||||
*fx = (sx + (sy + jx)*invn)*invm;
|
||||
*fy = (s + jy)*invN;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -409,9 +409,8 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
|
||||
/* ambient occlusion */
|
||||
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
||||
/* todo: solve correlation */
|
||||
float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U);
|
||||
float bsdf_u = bsdf_uv.x;
|
||||
float bsdf_v = bsdf_uv.y;
|
||||
float bsdf_u, bsdf_v;
|
||||
path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
|
||||
|
||||
float ao_factor = kernel_data.background.ao_factor;
|
||||
float3 ao_N;
|
||||
@ -450,9 +449,8 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
|
||||
#else
|
||||
float light_o = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_F);
|
||||
#endif
|
||||
float2 light_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_U);
|
||||
float light_u = light_uv.x;
|
||||
float light_v = light_uv.y;
|
||||
float light_u, light_v;
|
||||
path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
|
||||
|
||||
Ray light_ray;
|
||||
BsdfEval L_light;
|
||||
@ -484,9 +482,8 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
|
||||
BsdfEval bsdf_eval;
|
||||
float3 bsdf_omega_in;
|
||||
differential3 bsdf_domega_in;
|
||||
float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U);
|
||||
float bsdf_u = bsdf_uv.x;
|
||||
float bsdf_v = bsdf_uv.y;
|
||||
float bsdf_u, bsdf_v;
|
||||
path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
|
||||
int label;
|
||||
|
||||
label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
|
||||
@ -653,10 +650,8 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
|
||||
#ifdef __AO__
|
||||
/* ambient occlusion */
|
||||
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
||||
/* todo: solve correlation */
|
||||
float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U);
|
||||
float bsdf_u = bsdf_uv.x;
|
||||
float bsdf_v = bsdf_uv.y;
|
||||
float bsdf_u, bsdf_v;
|
||||
path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
|
||||
|
||||
float ao_factor = kernel_data.background.ao_factor;
|
||||
float3 ao_N;
|
||||
@ -695,9 +690,8 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
|
||||
#else
|
||||
float light_o = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_F);
|
||||
#endif
|
||||
float2 light_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_U);
|
||||
float light_u = light_uv.x;
|
||||
float light_v = light_uv.y;
|
||||
float light_u, light_v;
|
||||
path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
|
||||
|
||||
Ray light_ray;
|
||||
BsdfEval L_light;
|
||||
@ -730,9 +724,8 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
|
||||
BsdfEval bsdf_eval;
|
||||
float3 bsdf_omega_in;
|
||||
differential3 bsdf_domega_in;
|
||||
float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U);
|
||||
float bsdf_u = bsdf_uv.x;
|
||||
float bsdf_v = bsdf_uv.y;
|
||||
float bsdf_u, bsdf_v;
|
||||
path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
|
||||
int label;
|
||||
|
||||
label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
|
||||
@ -784,10 +777,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
|
||||
float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
|
||||
|
||||
for(int j = 0; j < num_samples; j++) {
|
||||
/* todo: solve correlation */
|
||||
float2 bsdf_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U);
|
||||
float bsdf_u = bsdf_uv.x;
|
||||
float bsdf_v = bsdf_uv.y;
|
||||
float bsdf_u, bsdf_v;
|
||||
path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
|
||||
|
||||
float3 ao_D;
|
||||
float ao_pdf;
|
||||
@ -836,9 +827,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
|
||||
num_samples_inv *= 0.5f;
|
||||
|
||||
for(int j = 0; j < num_samples; j++) {
|
||||
float2 light_uv = path_rng_2D(kg, &lamp_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U);
|
||||
float light_u = light_uv.x;
|
||||
float light_v = light_uv.y;
|
||||
float light_u, light_v;
|
||||
path_rng_2D(kg, &lamp_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
|
||||
|
||||
if(direct_emission(kg, sd, i, 0.0f, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp)) {
|
||||
/* trace shadow ray */
|
||||
@ -862,9 +852,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
|
||||
|
||||
for(int j = 0; j < num_samples; j++) {
|
||||
float light_t = path_rng_1D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT);
|
||||
float2 light_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U);
|
||||
float light_u = light_uv.x;
|
||||
float light_v = light_uv.y;
|
||||
float light_u, light_v;
|
||||
path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v);
|
||||
|
||||
/* only sample triangle lights */
|
||||
if(kernel_data.integrator.num_all_lights)
|
||||
@ -913,9 +902,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
|
||||
BsdfEval bsdf_eval;
|
||||
float3 bsdf_omega_in;
|
||||
differential3 bsdf_domega_in;
|
||||
float2 bsdf_uv = path_rng_2D(kg, &bsdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U);
|
||||
float bsdf_u = bsdf_uv.x;
|
||||
float bsdf_v = bsdf_uv.y;
|
||||
float bsdf_u, bsdf_v;
|
||||
path_rng_2D(kg, &bsdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v);
|
||||
int label;
|
||||
|
||||
label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
|
||||
@ -1162,11 +1150,8 @@ __device void kernel_path_trace(KernelGlobals *kg,
|
||||
|
||||
float lens_u = 0.0f, lens_v = 0.0f;
|
||||
|
||||
if(kernel_data.cam.aperturesize > 0.0f) {
|
||||
float2 lens_uv = path_rng_2D(kg, &rng, sample, num_samples, PRNG_LENS_U);
|
||||
lens_u = lens_uv.x;
|
||||
lens_v = lens_uv.y;
|
||||
}
|
||||
if(kernel_data.cam.aperturesize > 0.0f)
|
||||
path_rng_2D(kg, &rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
|
||||
|
||||
float time = 0.0f;
|
||||
|
||||
|
@ -102,8 +102,16 @@ __device uint sobol_lookup(const uint m, const uint frame, const uint ex, const
|
||||
return index;
|
||||
}
|
||||
|
||||
__device_inline float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dimension)
|
||||
__device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
|
||||
{
|
||||
#ifdef __CMJ__
|
||||
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
|
||||
/* correlated multi-jittered */
|
||||
int p = *rng + dimension;
|
||||
return cmj_sample_1D(sample, num_samples, p);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __SOBOL_FULL_SCREEN__
|
||||
uint result = sobol_dimension(kg, *rng, dimension);
|
||||
float r = (float)result * (1.0f/(float)0xFFFFFFFF);
|
||||
@ -117,41 +125,27 @@ __device_inline float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dime
|
||||
float shift;
|
||||
|
||||
if(dimension & 1)
|
||||
shift = (*rng >> 16)*(1.0f/(float)0xFFFF);
|
||||
shift = (*rng >> 16)/((float)0xFFFF);
|
||||
else
|
||||
shift = (*rng & 0xFFFF)*(1.0f/(float)0xFFFF);
|
||||
shift = (*rng & 0xFFFF)/((float)0xFFFF);
|
||||
|
||||
return r + shift - floorf(r + shift);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
|
||||
__device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
|
||||
{
|
||||
#ifdef __CMJ__
|
||||
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
|
||||
/* correlated multi-jittered */
|
||||
int p = *rng + dimension;
|
||||
return cmj_sample_1D(sample, num_samples, p);
|
||||
cmj_sample_2D(sample, num_samples, p, fx, fy);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* sobol */
|
||||
return path_rng(kg, rng, sample, dimension);
|
||||
}
|
||||
|
||||
__device_inline float2 path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
|
||||
{
|
||||
#ifdef __CMJ__
|
||||
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
|
||||
/* correlated multi-jittered */
|
||||
int p = *rng + dimension;
|
||||
return cmj_sample_2D(sample, num_samples, p);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* sobol */
|
||||
return make_float2(path_rng(kg, rng, sample, dimension),
|
||||
path_rng(kg, rng, sample, dimension + 1));
|
||||
*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
|
||||
*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
|
||||
}
|
||||
|
||||
__device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
|
||||
@ -184,10 +178,7 @@ __device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state,
|
||||
*fy = 0.5f;
|
||||
}
|
||||
else {
|
||||
float2 fxy = path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U);
|
||||
|
||||
*fx = fxy.x;
|
||||
*fy = fxy.y;
|
||||
path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -202,21 +193,20 @@ __device void path_rng_end(KernelGlobals *kg, __global uint *rng_state, RNG rng)
|
||||
/* Linear Congruential Generator */
|
||||
|
||||
__device float path_rng(KernelGlobals *kg, RNG& rng, int sample, int dimension)
|
||||
{
|
||||
}
|
||||
|
||||
__device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
|
||||
{
|
||||
/* implicit mod 2^32 */
|
||||
rng = (1103515245*(rng) + 12345);
|
||||
return (float)rng * (1.0f/(float)0xFFFFFFFF);
|
||||
}
|
||||
|
||||
__device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
|
||||
__device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
|
||||
{
|
||||
return path_rng(kg, rng, sample, dimension);
|
||||
}
|
||||
|
||||
__device_inline float2 path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
|
||||
{
|
||||
return make_float2(path_rng(kg, rng, sample, dimension),
|
||||
path_rng(kg, rng, sample, dimension + 1));
|
||||
*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
|
||||
*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
|
||||
}
|
||||
|
||||
__device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
|
||||
@ -231,10 +221,7 @@ __device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sam
|
||||
*fy = 0.5f;
|
||||
}
|
||||
else {
|
||||
float2 fxy = path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U);
|
||||
|
||||
*fx = fxy.x;
|
||||
*fy = fxy.y;
|
||||
path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user