Cycles: Delay shooting SSS indirect rays

The idea is to delay shooting indirect rays for the SSS sampling and
trace them after the main integration loop was finished.

This reduces GPU stack usage even further and brings it down to around
652MB (compared to 722MB before the change and 946MB with the previous
stable release).

This also solves the speed regression that happened in the previous commit,
and now a simple SSS scene (SSS Suzanne on the floor) renders in 0:50
(compared to 1:16 with the previous commit and 1:03 with the official release).
This commit is contained in:
Sergey Sharybin 2015-11-22 15:48:33 +05:00
parent 8bca34fe32
commit 2a5c1fc9cc
4 changed files with 353 additions and 112 deletions

@ -64,9 +64,20 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
/* sample subsurface scattering */ /* sample subsurface scattering */
if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) { if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
if(kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput)) SubsurfaceIndirectRays ss_indirect;
if(kernel_path_subsurface_scatter(kg,
sd,
&L_sample,
&state,
&rng,
&ray,
&throughput,
&ss_indirect))
{
kernel_path_subsurface_scatter_indirect(kg, &L_sample, &state, &rng, &ray, &ss_indirect);
is_sss_sample = true; is_sss_sample = true;
} }
}
#endif #endif
/* sample light and BSDF */ /* sample light and BSDF */
@ -84,7 +95,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
state.ray_t = 0.0f; state.ray_t = 0.0f;
#endif #endif
/* compute indirect light */ /* compute indirect light */
kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample); kernel_path_indirect(kg, &rng, &ray, throughput, 1, &state, &L_sample);
/* sum and reset indirect light pass variables for the next samples */ /* sum and reset indirect light pass variables for the next samples */
path_radiance_sum_indirect(&L_sample); path_radiance_sum_indirect(&L_sample);

@ -52,47 +52,64 @@
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_device void kernel_path_indirect(KernelGlobals *kg,
float3 throughput, int num_samples, PathState state, PathRadiance *L) RNG *rng,
Ray *ray,
float3 throughput,
int num_samples,
PathState *state,
PathRadiance *L)
{ {
/* path iteration */ /* path iteration */
for(;;) { for(;;) {
/* intersect scene */ /* intersect scene */
Intersection isect; Intersection isect;
uint visibility = path_state_ray_visibility(kg, &state); uint visibility = path_state_ray_visibility(kg, state);
bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); bool hit = scene_intersect(kg,
ray,
visibility,
&isect,
NULL,
0.0f, 0.0f);
#ifdef __LAMP_MIS__ #ifdef __LAMP_MIS__
if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
/* ray starting from previous non-transparent bounce */ /* ray starting from previous non-transparent bounce */
Ray light_ray; Ray light_ray;
light_ray.P = ray.P - state.ray_t*ray.D; light_ray.P = ray->P - state->ray_t*ray->D;
state.ray_t += isect.t; state->ray_t += isect.t;
light_ray.D = ray.D; light_ray.D = ray->D;
light_ray.t = state.ray_t; light_ray.t = state->ray_t;
light_ray.time = ray.time; light_ray.time = ray->time;
light_ray.dD = ray.dD; light_ray.dD = ray->dD;
light_ray.dP = ray.dP; light_ray.dP = ray->dP;
/* intersect with lamp */ /* intersect with lamp */
float3 emission; float3 emission;
if(indirect_lamp_emission(kg, state, &light_ray, &emission)) {
if(indirect_lamp_emission(kg, &state, &light_ray, &emission)) path_radiance_accum_emission(L,
path_radiance_accum_emission(L, throughput, emission, state.bounce); throughput,
emission,
state->bounce);
}
} }
#endif #endif
#ifdef __VOLUME__ #ifdef __VOLUME__
/* volume attenuation, emission, scatter */ /* volume attenuation, emission, scatter */
if(state.volume_stack[0].shader != SHADER_NONE) { if(state->volume_stack[0].shader != SHADER_NONE) {
Ray volume_ray = ray; Ray volume_ray = *ray;
volume_ray.t = (hit)? isect.t: FLT_MAX; volume_ray.t = (hit)? isect.t: FLT_MAX;
bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); bool heterogeneous =
volume_stack_is_heterogeneous(kg,
state->volume_stack);
#ifdef __VOLUME_DECOUPLED__ #ifdef __VOLUME_DECOUPLED__
int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); int sampling_method =
volume_stack_sampling_method(kg,
state->volume_stack);
bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method); bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
if(decoupled) { if(decoupled) {
@ -100,15 +117,27 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
VolumeSegment volume_segment; VolumeSegment volume_segment;
ShaderData volume_sd; ShaderData volume_sd;
shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); shader_setup_from_volume(kg,
kernel_volume_decoupled_record(kg, &state, &volume_sd,
&volume_ray, &volume_sd, &volume_segment, heterogeneous); &volume_ray,
state->bounce,
state->transparent_bounce);
kernel_volume_decoupled_record(kg,
state,
&volume_ray,
&volume_sd,
&volume_segment,
heterogeneous);
volume_segment.sampling_method = sampling_method; volume_segment.sampling_method = sampling_method;
/* emission */ /* emission */
if(volume_segment.closure_flag & SD_EMISSION) if(volume_segment.closure_flag & SD_EMISSION) {
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); path_radiance_accum_emission(L,
throughput,
volume_segment.accum_emission,
state->bounce);
}
/* scattering */ /* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@ -117,29 +146,52 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
bool all = kernel_data.integrator.sample_all_lights_indirect; bool all = kernel_data.integrator.sample_all_lights_indirect;
/* direct light sampling */ /* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, kernel_branched_path_volume_connect_light(kg,
throughput, &state, L, all, &volume_ray, &volume_segment); rng,
&volume_sd,
throughput,
state,
L,
all,
&volume_ray,
&volume_segment);
/* indirect sample. if we use distance sampling and take just /* indirect sample. if we use distance sampling and take just
* one sample for direct and indirect light, we could share * one sample for direct and indirect light, we could share
* this computation, but makes code a bit complex */ * this computation, but makes code a bit complex */
float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
result = kernel_volume_decoupled_scatter(kg, result = kernel_volume_decoupled_scatter(kg,
&state, &volume_ray, &volume_sd, &throughput, state,
rphase, rscatter, &volume_segment, NULL, true); &volume_ray,
&volume_sd,
&throughput,
rphase,
rscatter,
&volume_segment,
NULL,
true);
} }
/* free cached steps */ /* free cached steps */
kernel_volume_decoupled_free(kg, &volume_segment); kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) { if(result == VOLUME_PATH_SCATTERED) {
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) if(kernel_path_volume_bounce(kg,
rng,
&volume_sd,
&throughput,
state,
L,
ray))
{
continue; continue;
else }
else {
break; break;
} }
}
else { else {
throughput *= volume_segment.accum_transmittance; throughput *= volume_segment.accum_transmittance;
} }
@ -150,19 +202,33 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* integrate along volume segment with distance sampling */ /* integrate along volume segment with distance sampling */
ShaderData volume_sd; ShaderData volume_sd;
VolumeIntegrateResult result = kernel_volume_integrate( VolumeIntegrateResult result = kernel_volume_integrate(
kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous); kg, state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__ #ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) { if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */ /* direct lighting */
kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L); kernel_path_volume_connect_light(kg,
rng,
&volume_sd,
throughput,
state,
L);
/* indirect light bounce */ /* indirect light bounce */
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) if(kernel_path_volume_bounce(kg,
rng,
&volume_sd,
&throughput,
state,
L,
ray))
{
continue; continue;
else }
else {
break; break;
} }
}
#endif #endif
} }
} }
@ -171,8 +237,11 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
if(!hit) { if(!hit) {
#ifdef __BACKGROUND__ #ifdef __BACKGROUND__
/* sample background shader */ /* sample background shader */
float3 L_background = indirect_background(kg, &state, &ray); float3 L_background = indirect_background(kg, state, ray);
path_radiance_accum_background(L, throughput, L_background, state.bounce); path_radiance_accum_background(L,
throughput,
L_background,
state->bounce);
#endif #endif
break; break;
@ -180,9 +249,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* setup shading */ /* setup shading */
ShaderData sd; ShaderData sd;
shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); shader_setup_from_ray(kg,
float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); &sd,
shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT); &isect,
ray,
state->bounce,
state->transparent_bounce);
float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
shader_eval_surface(kg, &sd, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
#ifdef __BRANCHED_PATH__ #ifdef __BRANCHED_PATH__
shader_merge_closures(&sd); shader_merge_closures(&sd);
#endif #endif
@ -190,7 +264,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* blurring of bsdf after bounces, for rays that have a small likelihood /* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */ * of following this particular path (diffuse, rough glossy) */
if(kernel_data.integrator.filter_glossy != FLT_MAX) { if(kernel_data.integrator.filter_glossy != FLT_MAX) {
float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf; float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
if(blur_pdf < 1.0f) { if(blur_pdf < 1.0f) {
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@ -201,21 +275,28 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
#ifdef __EMISSION__ #ifdef __EMISSION__
/* emission */ /* emission */
if(sd.flag & SD_EMISSION) { if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); float3 emission = indirect_primitive_emission(kg,
path_radiance_accum_emission(L, throughput, emission, state.bounce); &sd,
isect.t,
state->flag,
state->ray_pdf);
path_radiance_accum_emission(L, throughput, emission, state->bounce);
} }
#endif #endif
/* path termination. this is a strange place to put the termination, it's /* path termination. this is a strange place to put the termination, it's
* mainly due to the mixed in MIS that we use. gives too many unneeded * mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate */ * shader evaluations, only need emission if we are going to terminate */
float probability = path_state_terminate_probability(kg, &state, throughput*num_samples); float probability =
path_state_terminate_probability(kg,
state,
throughput*num_samples);
if(probability == 0.0f) { if(probability == 0.0f) {
break; break;
} }
else if(probability != 1.0f) { else if(probability != 1.0f) {
float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
if(terminate >= probability) if(terminate >= probability)
break; break;
@ -227,7 +308,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* ambient occlusion */ /* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
float bsdf_u, bsdf_v; float bsdf_u, bsdf_v;
path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
float ao_factor = kernel_data.background.ao_factor; float ao_factor = kernel_data.background.ao_factor;
float3 ao_N; float3 ao_N;
@ -251,8 +332,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
light_ray.dP = sd.dP; light_ray.dP = sd.dP;
light_ray.dD = differential3_zero(); light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow)) if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state.bounce); path_radiance_accum_ao(L,
throughput,
ao_alpha,
ao_bsdf,
ao_shadow,
state->bounce);
}
} }
} }
#endif #endif
@ -269,11 +356,21 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* do bssrdf scatter step if we picked a bssrdf closure */ /* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) { if(sc) {
uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb); uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
float bssrdf_u, bssrdf_v; float bssrdf_u, bssrdf_v;
path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); path_state_rng_2D(kg,
subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false); rng,
state,
PRNG_BSDF_U,
&bssrdf_u, &bssrdf_v);
subsurface_scatter_step(kg,
&sd,
state->flag,
sc,
&lcg_state,
bssrdf_u, bssrdf_v,
false);
} }
} }
#endif #endif
@ -281,11 +378,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
if(kernel_data.integrator.use_direct_light) { if(kernel_data.integrator.use_direct_light) {
bool all = kernel_data.integrator.sample_all_lights_indirect; bool all = kernel_data.integrator.sample_all_lights_indirect;
kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all); kernel_branched_path_surface_connect_light(kg,
rng,
&sd,
state,
throughput,
1.0f,
L,
all);
} }
#endif #endif
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, state, L, ray))
break; break;
} }
} }
@ -326,7 +430,15 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput) ccl_device bool kernel_path_subsurface_scatter(
KernelGlobals *kg,
ShaderData *sd,
PathRadiance *L,
PathState *state,
RNG *rng,
Ray *ray,
float3 *throughput,
SubsurfaceIndirectRays *ss_indirect)
{ {
float bssrdf_probability; float bssrdf_probability;
ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
@ -349,8 +461,8 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
bssrdf_u, bssrdf_v, bssrdf_u, bssrdf_v,
false); false);
#ifdef __VOLUME__ #ifdef __VOLUME__
Ray volume_ray = *ray; ss_indirect->need_update_volume_stack =
bool need_update_volume_stack = kernel_data.integrator.use_volumes && kernel_data.integrator.use_volumes &&
ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
#endif #endif
@ -367,47 +479,82 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
sc, sc,
false); false);
float3 tp = *throughput; PathState *hit_state = &ss_indirect->state;
PathState hit_state = *state; Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
Ray hit_ray = *ray; float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
hit_state.rng_offset += PRNG_BOUNCE_NUM; *hit_state = *state;
*hit_ray = *ray;
*hit_tp = *throughput;
kernel_path_surface_connect_light(kg, rng, sd, tp, state, L); hit_state->rng_offset += PRNG_BOUNCE_NUM;
if(kernel_path_surface_bounce(kg, rng, sd, &tp, &hit_state, L, &hit_ray)) { kernel_path_surface_connect_light(kg, rng, sd, *hit_tp, state, L);
if(kernel_path_surface_bounce(kg,
rng,
sd,
hit_tp,
hit_state,
L,
hit_ray))
{
#ifdef __LAMP_MIS__ #ifdef __LAMP_MIS__
hit_state.ray_t = 0.0f; hit_state->ray_t = 0.0f;
#endif #endif
ss_indirect->num_rays++;
#ifdef __VOLUME__
if(need_update_volume_stack) {
/* Setup ray from previous surface point to the new one. */
volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
&volume_ray.t);
kernel_volume_stack_update_for_subsurface(
kg,
&volume_ray,
hit_state.volume_stack);
/* Move volume ray forward. */
volume_ray.P = hit_ray.P;
}
#endif
kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
} }
} }
return true; return true;
} }
return false; return false;
} }
/* Trace subsurface indirect rays separately after the path loop, to reduce
* GPU stack memory usage. this way ShaderData and other data structures
* used during the loop are not needed during kernel_path_indirect.
*/
ccl_device void kernel_path_subsurface_scatter_indirect(
KernelGlobals *kg,
PathRadiance *L,
PathState *state,
RNG *rng,
Ray *ray,
SubsurfaceIndirectRays *ss_indirect)
{
for (int i = 0; i < ss_indirect->num_rays; i++) {
Ray *indirect_ray = &ss_indirect->rays[i];
float3 indirect_throughput = ss_indirect->throughputs[i];
*state = ss_indirect->state;
#ifdef __VOLUME__
if(ss_indirect->need_update_volume_stack) {
/* TODO(sergey): Single assignment per scatter. */
Ray volume_ray = *ray;
/* Setup ray from previous surface point to the new one. */
volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
&volume_ray.t);
kernel_volume_stack_update_for_subsurface(
kg,
&volume_ray,
state->volume_stack);
}
#endif
/* Note that this modifies state. */
kernel_path_indirect(kg, rng, indirect_ray, indirect_throughput, state->num_samples, state, L);
/* For render passes, sum and reset indirect light pass variables
* for the next samples.
*/
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
#endif #endif
ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
@ -427,6 +574,11 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
debug_data_init(&debug_data); debug_data_init(&debug_data);
#endif #endif
#ifdef __SUBSURFACE__
SubsurfaceIndirectRays ss_indirect;
ss_indirect.num_rays = 0;
#endif
/* path iteration */ /* path iteration */
for(;;) { for(;;) {
/* intersect scene */ /* intersect scene */
@ -664,9 +816,18 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
/* bssrdf scatter to a different location on the same object, replacing /* bssrdf scatter to a different location on the same object, replacing
* the closures with a diffuse BSDF */ * the closures with a diffuse BSDF */
if(sd.flag & SD_BSSRDF) { if(sd.flag & SD_BSSRDF) {
if(kernel_path_subsurface_scatter(kg, &sd, &L, &state, rng, &ray, &throughput)) if(kernel_path_subsurface_scatter(kg,
&sd,
&L,
&state,
rng,
&ray,
&throughput,
&ss_indirect))
{
break; break;
} }
}
#endif #endif
/* direct lighting */ /* direct lighting */
@ -677,6 +838,20 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
break; break;
} }
#ifdef __SUBSURFACE__
/* Trace indirect subsurface afterwards to reduce GPU stack size.
* note that this modifies state.
*/
if (ss_indirect.num_rays) {
kernel_path_subsurface_scatter_indirect(kg,
&L,
&state,
rng,
&ray,
&ss_indirect);
}
#endif
float3 L_sum = path_radiance_clamp_and_sum(kg, &L); float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
kernel_write_light_passes(kg, buffer, &L, sample); kernel_write_light_passes(kg, buffer, &L, sample);

@ -91,10 +91,27 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
float3 tp = throughput; float3 tp = throughput;
Ray bsdf_ray; Ray bsdf_ray;
if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray)) if(!kernel_branched_path_surface_bounce(kg,
&bsdf_rng,
sd,
sc,
j,
num_samples,
&tp,
&ps,
L,
&bsdf_ray))
{
continue; continue;
}
kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); kernel_path_indirect(kg,
rng,
&bsdf_ray,
tp*num_samples_inv,
num_samples,
&ps,
L);
/* for render passes, sum and reset indirect light pass variables /* for render passes, sum and reset indirect light pass variables
* for the next samples */ * for the next samples */
@ -316,8 +333,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
(void)result; (void)result;
kernel_assert(result == VOLUME_PATH_SCATTERED); kernel_assert(result == VOLUME_PATH_SCATTERED);
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { if(kernel_path_volume_bounce(kg,
kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); rng,
&volume_sd,
&tp,
&ps,
&L,
&pray))
{
kernel_path_indirect(kg,
rng,
&pray,
tp*num_samples_inv,
num_samples,
&ps,
&L);
/* for render passes, sum and reset indirect light pass variables /* for render passes, sum and reset indirect light pass variables
* for the next samples */ * for the next samples */
@ -360,8 +390,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
* alternatively get decoupled ray marching working on the GPU */ * alternatively get decoupled ray marching working on the GPU */
kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L); kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { if(kernel_path_volume_bounce(kg,
kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L); rng,
&volume_sd,
&tp,
&ps,
&L,
&pray))
{
kernel_path_indirect(kg,
rng,
&pray,
tp,
num_samples,
&ps,
&L);
/* for render passes, sum and reset indirect light pass variables /* for render passes, sum and reset indirect light pass variables
* for the next samples */ * for the next samples */

@ -520,18 +520,6 @@ typedef ccl_addr_space struct Intersection {
#endif #endif
} Intersection; } Intersection;
/* Subsurface Intersection result */
struct SubsurfaceIntersection
{
Ray ray;
float3 weight[BSSRDF_MAX_HITS];
int num_hits;
struct Intersection hits[BSSRDF_MAX_HITS];
float3 Ng[BSSRDF_MAX_HITS];
};
/* Primitives */ /* Primitives */
typedef enum PrimitiveType { typedef enum PrimitiveType {
@ -764,6 +752,30 @@ typedef struct PathState {
#endif #endif
} PathState; } PathState;
/* Subsurface */
/* Struct to gather multiple SSS hits. */
struct SubsurfaceIntersection
{
Ray ray;
float3 weight[BSSRDF_MAX_HITS];
int num_hits;
Intersection hits[BSSRDF_MAX_HITS];
float3 Ng[BSSRDF_MAX_HITS];
};
/* Struct to gather SSS indirect rays and delay tracing them. */
struct SubsurfaceIndirectRays
{
bool need_update_volume_stack;
PathState state;
int num_rays;
Ray rays[BSSRDF_MAX_HITS];
float3 throughputs[BSSRDF_MAX_HITS];
};
/* Constant Kernel Data /* Constant Kernel Data
* *
* These structs are passed from CPU to various devices, and the struct layout * These structs are passed from CPU to various devices, and the struct layout