forked from bartvdbraak/blender
Code cleanup: tweak inlining for 2% better CUDA performance with hair.
This commit is contained in:
parent
76e7c039ec
commit
c4c450045d
@ -424,26 +424,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
|
||||
|
||||
#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
|
||||
|
||||
ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
uint rng_hash,
|
||||
int sample,
|
||||
Ray ray,
|
||||
ccl_global float *buffer,
|
||||
PathRadiance *L,
|
||||
bool *is_shadow_catcher)
|
||||
ccl_device_forceinline void kernel_path_integrate(
|
||||
KernelGlobals *kg,
|
||||
PathState *state,
|
||||
float3 throughput,
|
||||
Ray *ray,
|
||||
PathRadiance *L,
|
||||
float *buffer,
|
||||
ShaderData *emission_sd,
|
||||
bool *is_shadow_catcher)
|
||||
{
|
||||
/* initialize */
|
||||
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
||||
|
||||
path_radiance_init(L, kernel_data.film.use_light_pass);
|
||||
|
||||
/* shader data memory used for both volumes and surfaces, saves stack space */
|
||||
/* Shader data memory used for both volumes and surfaces, saves stack space. */
|
||||
ShaderData sd;
|
||||
/* shader data used by emission, shadows, volume stacks */
|
||||
ShaderData emission_sd;
|
||||
|
||||
PathState state;
|
||||
path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
|
||||
|
||||
#ifdef __SUBSURFACE__
|
||||
SubsurfaceIndirectRays ss_indirect;
|
||||
@ -456,40 +448,40 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
for(;;) {
|
||||
/* intersect scene */
|
||||
Intersection isect;
|
||||
uint visibility = path_state_ray_visibility(kg, &state);
|
||||
uint visibility = path_state_ray_visibility(kg, state);
|
||||
|
||||
#ifdef __HAIR__
|
||||
float difl = 0.0f, extmax = 0.0f;
|
||||
uint lcg_state = 0;
|
||||
|
||||
if(kernel_data.bvh.have_curves) {
|
||||
if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
|
||||
float3 pixdiff = ray.dD.dx + ray.dD.dy;
|
||||
/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
|
||||
if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) {
|
||||
float3 pixdiff = ray->dD.dx + ray->dD.dy;
|
||||
/*pixdiff = pixdiff - dot(pixdiff, ray->D)*ray->D;*/
|
||||
difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
|
||||
}
|
||||
|
||||
extmax = kernel_data.curve.maximum_width;
|
||||
lcg_state = lcg_state_init(&state, 0x51633e2d);
|
||||
lcg_state = lcg_state_init(state, 0x51633e2d);
|
||||
}
|
||||
|
||||
if(path_state_ao_bounce(kg, &state)) {
|
||||
if(path_state_ao_bounce(kg, state)) {
|
||||
visibility = PATH_RAY_SHADOW;
|
||||
ray.t = kernel_data.background.ao_distance;
|
||||
ray->t = kernel_data.background.ao_distance;
|
||||
}
|
||||
|
||||
bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
|
||||
bool hit = scene_intersect(kg, *ray, visibility, &isect, &lcg_state, difl, extmax);
|
||||
#else
|
||||
if(path_state_ao_bounce(kg, state)) {
|
||||
visibility = PATH_RAY_SHADOW;
|
||||
ray.t = kernel_data.background.ao_distance;
|
||||
ray->t = kernel_data.background.ao_distance;
|
||||
}
|
||||
|
||||
bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
|
||||
#endif /* __HAIR__ */
|
||||
|
||||
#ifdef __KERNEL_DEBUG__
|
||||
if(state.flag & PATH_RAY_CAMERA) {
|
||||
if(state->flag & PATH_RAY_CAMERA) {
|
||||
L->debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
|
||||
L->debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
|
||||
L->debug_data.num_bvh_intersections += isect.num_intersections;
|
||||
@ -498,40 +490,40 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
#endif /* __KERNEL_DEBUG__ */
|
||||
|
||||
#ifdef __LAMP_MIS__
|
||||
if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
|
||||
if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
|
||||
/* ray starting from previous non-transparent bounce */
|
||||
Ray light_ray;
|
||||
|
||||
light_ray.P = ray.P - state.ray_t*ray.D;
|
||||
state.ray_t += isect.t;
|
||||
light_ray.D = ray.D;
|
||||
light_ray.t = state.ray_t;
|
||||
light_ray.time = ray.time;
|
||||
light_ray.dD = ray.dD;
|
||||
light_ray.dP = ray.dP;
|
||||
light_ray.P = ray->P - state->ray_t*ray->D;
|
||||
state->ray_t += isect.t;
|
||||
light_ray.D = ray->D;
|
||||
light_ray.t = state->ray_t;
|
||||
light_ray.time = ray->time;
|
||||
light_ray.dD = ray->dD;
|
||||
light_ray.dP = ray->dP;
|
||||
|
||||
/* intersect with lamp */
|
||||
float3 emission;
|
||||
|
||||
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
|
||||
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||
if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission))
|
||||
path_radiance_accum_emission(L, throughput, emission, state->bounce);
|
||||
}
|
||||
#endif /* __LAMP_MIS__ */
|
||||
|
||||
#ifdef __VOLUME__
|
||||
/* Sanitize volume stack. */
|
||||
if(!hit) {
|
||||
kernel_volume_clean_stack(kg, state.volume_stack);
|
||||
kernel_volume_clean_stack(kg, state->volume_stack);
|
||||
}
|
||||
/* volume attenuation, emission, scatter */
|
||||
if(state.volume_stack[0].shader != SHADER_NONE) {
|
||||
Ray volume_ray = ray;
|
||||
if(state->volume_stack[0].shader != SHADER_NONE) {
|
||||
Ray volume_ray = *ray;
|
||||
volume_ray.t = (hit)? isect.t: FLT_MAX;
|
||||
|
||||
bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
|
||||
bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
|
||||
|
||||
# ifdef __VOLUME_DECOUPLED__
|
||||
int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
|
||||
int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
|
||||
bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
|
||||
|
||||
if(decoupled) {
|
||||
@ -539,14 +531,14 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
VolumeSegment volume_segment;
|
||||
|
||||
shader_setup_from_volume(kg, &sd, &volume_ray);
|
||||
kernel_volume_decoupled_record(kg, &state,
|
||||
kernel_volume_decoupled_record(kg, state,
|
||||
&volume_ray, &sd, &volume_segment, heterogeneous);
|
||||
|
||||
volume_segment.sampling_method = sampling_method;
|
||||
|
||||
/* emission */
|
||||
if(volume_segment.closure_flag & SD_EMISSION)
|
||||
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
|
||||
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state->bounce);
|
||||
|
||||
/* scattering */
|
||||
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
|
||||
@ -556,17 +548,17 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
|
||||
/* direct light sampling */
|
||||
kernel_branched_path_volume_connect_light(kg, &sd,
|
||||
&emission_sd, throughput, &state, L, all,
|
||||
emission_sd, throughput, state, L, all,
|
||||
&volume_ray, &volume_segment);
|
||||
|
||||
/* indirect sample. if we use distance sampling and take just
|
||||
* one sample for direct and indirect light, we could share
|
||||
* this computation, but makes code a bit complex */
|
||||
float rphase = path_state_rng_1D_for_decision(kg, &state, PRNG_PHASE);
|
||||
float rscatter = path_state_rng_1D_for_decision(kg, &state, PRNG_SCATTER_DISTANCE);
|
||||
float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE);
|
||||
float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE);
|
||||
|
||||
result = kernel_volume_decoupled_scatter(kg,
|
||||
&state, &volume_ray, &sd, &throughput,
|
||||
state, &volume_ray, &sd, &throughput,
|
||||
rphase, rscatter, &volume_segment, NULL, true);
|
||||
}
|
||||
|
||||
@ -574,7 +566,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
kernel_volume_decoupled_free(kg, &volume_segment);
|
||||
|
||||
if(result == VOLUME_PATH_SCATTERED) {
|
||||
if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray))
|
||||
if(kernel_path_volume_bounce(kg, &sd, &throughput, state, L, ray))
|
||||
continue;
|
||||
else
|
||||
break;
|
||||
@ -588,15 +580,15 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
{
|
||||
/* integrate along volume segment with distance sampling */
|
||||
VolumeIntegrateResult result = kernel_volume_integrate(
|
||||
kg, &state, &sd, &volume_ray, L, &throughput, heterogeneous);
|
||||
kg, state, &sd, &volume_ray, L, &throughput, heterogeneous);
|
||||
|
||||
# ifdef __VOLUME_SCATTER__
|
||||
if(result == VOLUME_PATH_SCATTERED) {
|
||||
/* direct lighting */
|
||||
kernel_path_volume_connect_light(kg, &sd, &emission_sd, throughput, &state, L);
|
||||
kernel_path_volume_connect_light(kg, &sd, emission_sd, throughput, state, L);
|
||||
|
||||
/* indirect light bounce */
|
||||
if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray))
|
||||
if(kernel_path_volume_bounce(kg, &sd, &throughput, state, L, ray))
|
||||
continue;
|
||||
else
|
||||
break;
|
||||
@ -608,7 +600,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
|
||||
if(!hit) {
|
||||
/* eval background shader if nothing hit */
|
||||
if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) {
|
||||
if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
|
||||
L->transparent += average(throughput);
|
||||
|
||||
#ifdef __PASSES__
|
||||
@ -619,35 +611,35 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
|
||||
#ifdef __BACKGROUND__
|
||||
/* sample background shader */
|
||||
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
|
||||
path_radiance_accum_background(L, &state, throughput, L_background);
|
||||
float3 L_background = indirect_background(kg, emission_sd, state, ray);
|
||||
path_radiance_accum_background(L, state, throughput, L_background);
|
||||
#endif /* __BACKGROUND__ */
|
||||
|
||||
break;
|
||||
}
|
||||
else if(path_state_ao_bounce(kg, &state)) {
|
||||
else if(path_state_ao_bounce(kg, state)) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* setup shading */
|
||||
shader_setup_from_ray(kg, &sd, &isect, &ray);
|
||||
float rbsdf = path_state_rng_1D_for_decision(kg, &state, PRNG_BSDF);
|
||||
shader_eval_surface(kg, &sd, &state, rbsdf, state.flag);
|
||||
shader_setup_from_ray(kg, &sd, &isect, ray);
|
||||
float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF);
|
||||
shader_eval_surface(kg, &sd, state, rbsdf, state->flag);
|
||||
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
||||
if(state.flag & PATH_RAY_CAMERA) {
|
||||
state.flag |= (PATH_RAY_SHADOW_CATCHER |
|
||||
if(state->flag & PATH_RAY_CAMERA) {
|
||||
state->flag |= (PATH_RAY_SHADOW_CATCHER |
|
||||
PATH_RAY_STORE_SHADOW_INFO);
|
||||
if(!kernel_data.background.transparent) {
|
||||
L->shadow_background_color =
|
||||
indirect_background(kg, &emission_sd, &state, &ray);
|
||||
indirect_background(kg, emission_sd, state, ray);
|
||||
}
|
||||
L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L);
|
||||
L->shadow_throughput = average(throughput);
|
||||
}
|
||||
}
|
||||
else if(state.flag & PATH_RAY_SHADOW_CATCHER) {
|
||||
else if(state->flag & PATH_RAY_SHADOW_CATCHER) {
|
||||
/* Only update transparency after shadow catcher bounce. */
|
||||
L->shadow_transparency *=
|
||||
average(shader_bsdf_transparency(kg, &sd));
|
||||
@ -658,7 +650,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
#ifdef __HOLDOUT__
|
||||
if(((sd.flag & SD_HOLDOUT) ||
|
||||
(sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
|
||||
(state.flag & PATH_RAY_CAMERA))
|
||||
(state->flag & PATH_RAY_CAMERA))
|
||||
{
|
||||
if(kernel_data.background.transparent) {
|
||||
float3 holdout_weight;
|
||||
@ -679,12 +671,12 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
#endif /* __HOLDOUT__ */
|
||||
|
||||
/* holdout mask objects do not write data passes */
|
||||
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
|
||||
kernel_write_data_passes(kg, buffer, L, &sd, state->sample, state, throughput);
|
||||
|
||||
/* blurring of bsdf after bounces, for rays that have a small likelihood
|
||||
* of following this particular path (diffuse, rough glossy) */
|
||||
if(kernel_data.integrator.filter_glossy != FLT_MAX) {
|
||||
float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf;
|
||||
float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
|
||||
|
||||
if(blur_pdf < 1.0f) {
|
||||
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
|
||||
@ -696,33 +688,33 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
/* emission */
|
||||
if(sd.flag & SD_EMISSION) {
|
||||
/* todo: is isect.t wrong here for transparent surfaces? */
|
||||
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
|
||||
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state->flag, state->ray_pdf);
|
||||
path_radiance_accum_emission(L, throughput, emission, state->bounce);
|
||||
}
|
||||
#endif /* __EMISSION__ */
|
||||
|
||||
/* path termination. this is a strange place to put the termination, it's
|
||||
* mainly due to the mixed in MIS that we use. gives too many unneeded
|
||||
* shader evaluations, only need emission if we are going to terminate */
|
||||
float probability = path_state_continuation_probability(kg, &state, throughput);
|
||||
float probability = path_state_continuation_probability(kg, state, throughput);
|
||||
|
||||
if(probability == 0.0f) {
|
||||
break;
|
||||
}
|
||||
else if(probability != 1.0f) {
|
||||
float terminate = path_state_rng_1D_for_decision(kg, &state, PRNG_TERMINATE);
|
||||
float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE);
|
||||
if(terminate >= probability)
|
||||
break;
|
||||
|
||||
throughput /= probability;
|
||||
}
|
||||
|
||||
kernel_update_denoising_features(kg, &sd, &state, L);
|
||||
kernel_update_denoising_features(kg, &sd, state, L);
|
||||
|
||||
#ifdef __AO__
|
||||
/* ambient occlusion */
|
||||
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
||||
kernel_path_ao(kg, &sd, &emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, &sd));
|
||||
kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
|
||||
}
|
||||
#endif /* __AO__ */
|
||||
|
||||
@ -732,10 +724,10 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
if(sd.flag & SD_BSSRDF) {
|
||||
if(kernel_path_subsurface_scatter(kg,
|
||||
&sd,
|
||||
&emission_sd,
|
||||
emission_sd,
|
||||
L,
|
||||
&state,
|
||||
&ray,
|
||||
state,
|
||||
ray,
|
||||
&throughput,
|
||||
&ss_indirect))
|
||||
{
|
||||
@ -745,10 +737,10 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
#endif /* __SUBSURFACE__ */
|
||||
|
||||
/* direct lighting */
|
||||
kernel_path_surface_connect_light(kg, &sd, &emission_sd, throughput, &state, L);
|
||||
kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
|
||||
|
||||
/* compute direct lighting and next bounce */
|
||||
if(!kernel_path_surface_bounce(kg, &sd, &throughput, &state, L, &ray))
|
||||
if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, L, ray))
|
||||
break;
|
||||
}
|
||||
|
||||
@ -761,8 +753,8 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
if(ss_indirect.num_rays) {
|
||||
kernel_path_subsurface_setup_indirect(kg,
|
||||
&ss_indirect,
|
||||
&state,
|
||||
&ray,
|
||||
state,
|
||||
ray,
|
||||
L,
|
||||
&throughput);
|
||||
}
|
||||
@ -773,7 +765,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
|
||||
#endif /* __SUBSURFACE__ */
|
||||
|
||||
#ifdef __SHADOW_TRICKS__
|
||||
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER) != 0;
|
||||
*is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER) != 0;
|
||||
#endif /* __SHADOW_TRICKS__ */
|
||||
}
|
||||
|
||||
@ -788,23 +780,40 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
|
||||
rng_state += index;
|
||||
buffer += index*pass_stride;
|
||||
|
||||
/* initialize random numbers and ray */
|
||||
/* Initialize random numbers and sample ray. */
|
||||
uint rng_hash;
|
||||
Ray ray;
|
||||
|
||||
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray);
|
||||
|
||||
/* integrate */
|
||||
PathRadiance L;
|
||||
bool is_shadow_catcher;
|
||||
|
||||
if(ray.t != 0.0f) {
|
||||
kernel_path_integrate(kg, rng_hash, sample, ray, buffer, &L, &is_shadow_catcher);
|
||||
kernel_write_result(kg, buffer, sample, &L, is_shadow_catcher);
|
||||
}
|
||||
else {
|
||||
if(ray.t == 0.0f) {
|
||||
kernel_write_result(kg, buffer, sample, NULL, false);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Initialize state. */
|
||||
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
||||
|
||||
PathRadiance L;
|
||||
path_radiance_init(&L, kernel_data.film.use_light_pass);
|
||||
|
||||
ShaderData emission_sd;
|
||||
|
||||
PathState state;
|
||||
path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
|
||||
|
||||
/* Integrate. */
|
||||
bool is_shadow_catcher;
|
||||
kernel_path_integrate(kg,
|
||||
&state,
|
||||
throughput,
|
||||
&ray,
|
||||
&L,
|
||||
buffer,
|
||||
&emission_sd,
|
||||
&is_shadow_catcher);
|
||||
|
||||
kernel_write_result(kg, buffer, sample, &L, is_shadow_catcher);
|
||||
}
|
||||
|
||||
#endif /* __SPLIT_KERNEL__ */
|
||||
|
Loading…
Reference in New Issue
Block a user