Cycles: Delay shooting SSS indirect rays

The idea is to delay shooting indirect rays for the SSS sampling and
trace them after the main integration loop was finished.

This reduces GPU stack usage even further and brings it down to around
652MB (compared to 722MB before the change and 946MB with the previous
stable release).

This also solves the speed regression that happened in the previous commit,
and now a simple SSS scene (SSS Suzanne on the floor) renders in 0:50
(compared to 1:16 with the previous commit and 1:03 with the official release).
This commit is contained in:
Sergey Sharybin 2015-11-22 15:48:33 +05:00
parent 8bca34fe32
commit 2a5c1fc9cc
4 changed files with 353 additions and 112 deletions

@ -64,9 +64,20 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
/* sample subsurface scattering */ /* sample subsurface scattering */
if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) { if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
if(kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput)) SubsurfaceIndirectRays ss_indirect;
if(kernel_path_subsurface_scatter(kg,
sd,
&L_sample,
&state,
&rng,
&ray,
&throughput,
&ss_indirect))
{
kernel_path_subsurface_scatter_indirect(kg, &L_sample, &state, &rng, &ray, &ss_indirect);
is_sss_sample = true; is_sss_sample = true;
} }
}
#endif #endif
/* sample light and BSDF */ /* sample light and BSDF */
@ -84,7 +95,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
state.ray_t = 0.0f; state.ray_t = 0.0f;
#endif #endif
/* compute indirect light */ /* compute indirect light */
kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample); kernel_path_indirect(kg, &rng, &ray, throughput, 1, &state, &L_sample);
/* sum and reset indirect light pass variables for the next samples */ /* sum and reset indirect light pass variables for the next samples */
path_radiance_sum_indirect(&L_sample); path_radiance_sum_indirect(&L_sample);

@ -52,47 +52,64 @@
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_device void kernel_path_indirect(KernelGlobals *kg,
float3 throughput, int num_samples, PathState state, PathRadiance *L) RNG *rng,
Ray *ray,
float3 throughput,
int num_samples,
PathState *state,
PathRadiance *L)
{ {
/* path iteration */ /* path iteration */
for(;;) { for(;;) {
/* intersect scene */ /* intersect scene */
Intersection isect; Intersection isect;
uint visibility = path_state_ray_visibility(kg, &state); uint visibility = path_state_ray_visibility(kg, state);
bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); bool hit = scene_intersect(kg,
ray,
visibility,
&isect,
NULL,
0.0f, 0.0f);
#ifdef __LAMP_MIS__ #ifdef __LAMP_MIS__
if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
/* ray starting from previous non-transparent bounce */ /* ray starting from previous non-transparent bounce */
Ray light_ray; Ray light_ray;
light_ray.P = ray.P - state.ray_t*ray.D; light_ray.P = ray->P - state->ray_t*ray->D;
state.ray_t += isect.t; state->ray_t += isect.t;
light_ray.D = ray.D; light_ray.D = ray->D;
light_ray.t = state.ray_t; light_ray.t = state->ray_t;
light_ray.time = ray.time; light_ray.time = ray->time;
light_ray.dD = ray.dD; light_ray.dD = ray->dD;
light_ray.dP = ray.dP; light_ray.dP = ray->dP;
/* intersect with lamp */ /* intersect with lamp */
float3 emission; float3 emission;
if(indirect_lamp_emission(kg, state, &light_ray, &emission)) {
if(indirect_lamp_emission(kg, &state, &light_ray, &emission)) path_radiance_accum_emission(L,
path_radiance_accum_emission(L, throughput, emission, state.bounce); throughput,
emission,
state->bounce);
}
} }
#endif #endif
#ifdef __VOLUME__ #ifdef __VOLUME__
/* volume attenuation, emission, scatter */ /* volume attenuation, emission, scatter */
if(state.volume_stack[0].shader != SHADER_NONE) { if(state->volume_stack[0].shader != SHADER_NONE) {
Ray volume_ray = ray; Ray volume_ray = *ray;
volume_ray.t = (hit)? isect.t: FLT_MAX; volume_ray.t = (hit)? isect.t: FLT_MAX;
bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); bool heterogeneous =
volume_stack_is_heterogeneous(kg,
state->volume_stack);
#ifdef __VOLUME_DECOUPLED__ #ifdef __VOLUME_DECOUPLED__
int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); int sampling_method =
volume_stack_sampling_method(kg,
state->volume_stack);
bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method); bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
if(decoupled) { if(decoupled) {
@ -100,15 +117,27 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
VolumeSegment volume_segment; VolumeSegment volume_segment;
ShaderData volume_sd; ShaderData volume_sd;
shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); shader_setup_from_volume(kg,
kernel_volume_decoupled_record(kg, &state, &volume_sd,
&volume_ray, &volume_sd, &volume_segment, heterogeneous); &volume_ray,
state->bounce,
state->transparent_bounce);
kernel_volume_decoupled_record(kg,
state,
&volume_ray,
&volume_sd,
&volume_segment,
heterogeneous);
volume_segment.sampling_method = sampling_method; volume_segment.sampling_method = sampling_method;
/* emission */ /* emission */
if(volume_segment.closure_flag & SD_EMISSION) if(volume_segment.closure_flag & SD_EMISSION) {
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); path_radiance_accum_emission(L,
throughput,
volume_segment.accum_emission,
state->bounce);
}
/* scattering */ /* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@ -117,29 +146,52 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
bool all = kernel_data.integrator.sample_all_lights_indirect; bool all = kernel_data.integrator.sample_all_lights_indirect;
/* direct light sampling */ /* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, kernel_branched_path_volume_connect_light(kg,
throughput, &state, L, all, &volume_ray, &volume_segment); rng,
&volume_sd,
throughput,
state,
L,
all,
&volume_ray,
&volume_segment);
/* indirect sample. if we use distance sampling and take just /* indirect sample. if we use distance sampling and take just
* one sample for direct and indirect light, we could share * one sample for direct and indirect light, we could share
* this computation, but makes code a bit complex */ * this computation, but makes code a bit complex */
float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
result = kernel_volume_decoupled_scatter(kg, result = kernel_volume_decoupled_scatter(kg,
&state, &volume_ray, &volume_sd, &throughput, state,
rphase, rscatter, &volume_segment, NULL, true); &volume_ray,
&volume_sd,
&throughput,
rphase,
rscatter,
&volume_segment,
NULL,
true);
} }
/* free cached steps */ /* free cached steps */
kernel_volume_decoupled_free(kg, &volume_segment); kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) { if(result == VOLUME_PATH_SCATTERED) {
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) if(kernel_path_volume_bounce(kg,
rng,
&volume_sd,
&throughput,
state,
L,
ray))
{
continue; continue;
else }
else {
break; break;
} }
}
else { else {
throughput *= volume_segment.accum_transmittance; throughput *= volume_segment.accum_transmittance;
} }
@ -150,19 +202,33 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* integrate along volume segment with distance sampling */ /* integrate along volume segment with distance sampling */
ShaderData volume_sd; ShaderData volume_sd;
VolumeIntegrateResult result = kernel_volume_integrate( VolumeIntegrateResult result = kernel_volume_integrate(
kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous); kg, state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__ #ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) { if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */ /* direct lighting */
kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L); kernel_path_volume_connect_light(kg,
rng,
&volume_sd,
throughput,
state,
L);
/* indirect light bounce */ /* indirect light bounce */
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) if(kernel_path_volume_bounce(kg,
rng,
&volume_sd,
&throughput,
state,
L,
ray))
{
continue; continue;
else }
else {
break; break;
} }
}
#endif #endif
} }
} }
@ -171,8 +237,11 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
if(!hit) { if(!hit) {
#ifdef __BACKGROUND__ #ifdef __BACKGROUND__
/* sample background shader */ /* sample background shader */
float3 L_background = indirect_background(kg, &state, &ray); float3 L_background = indirect_background(kg, state, ray);
path_radiance_accum_background(L, throughput, L_background, state.bounce); path_radiance_accum_background(L,
throughput,
L_background,
state->bounce);
#endif #endif
break; break;
@ -180,9 +249,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* setup shading */ /* setup shading */
ShaderData sd; ShaderData sd;
shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); shader_setup_from_ray(kg,
float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); &sd,
shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT); &isect,
ray,
state->bounce,
state->transparent_bounce);
float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
shader_eval_surface(kg, &sd, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
#ifdef __BRANCHED_PATH__ #ifdef __BRANCHED_PATH__
shader_merge_closures(&sd); shader_merge_closures(&sd);
#endif #endif
@ -190,7 +264,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* blurring of bsdf after bounces, for rays that have a small likelihood /* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */ * of following this particular path (diffuse, rough glossy) */
if(kernel_data.integrator.filter_glossy != FLT_MAX) { if(kernel_data.integrator.filter_glossy != FLT_MAX) {
float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf; float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
if(blur_pdf < 1.0f) { if(blur_pdf < 1.0f) {
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@ -201,21 +275,28 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
#ifdef __EMISSION__ #ifdef __EMISSION__
/* emission */ /* emission */
if(sd.flag & SD_EMISSION) { if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); float3 emission = indirect_primitive_emission(kg,
path_radiance_accum_emission(L, throughput, emission, state.bounce); &sd,
isect.t,
state->flag,
state->ray_pdf);
path_radiance_accum_emission(L, throughput, emission, state->bounce);
} }
#endif #endif
/* path termination. this is a strange place to put the termination, it's /* path termination. this is a strange place to put the termination, it's
* mainly due to the mixed in MIS that we use. gives too many unneeded * mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate */ * shader evaluations, only need emission if we are going to terminate */
float probability = path_state_terminate_probability(kg, &state, throughput*num_samples); float probability =
path_state_terminate_probability(kg,
state,
throughput*num_samples);
if(probability == 0.0f) { if(probability == 0.0f) {
break; break;
} }
else if(probability != 1.0f) { else if(probability != 1.0f) {
float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
if(terminate >= probability) if(terminate >= probability)
break; break;
@ -227,7 +308,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* ambient occlusion */ /* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
float bsdf_u, bsdf_v; float bsdf_u, bsdf_v;
path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
float ao_factor = kernel_data.background.ao_factor; float ao_factor = kernel_data.background.ao_factor;
float3 ao_N; float3 ao_N;
@ -251,8 +332,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
light_ray.dP = sd.dP; light_ray.dP = sd.dP;
light_ray.dD = differential3_zero(); light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow)) if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state.bounce); path_radiance_accum_ao(L,
throughput,
ao_alpha,
ao_bsdf,
ao_shadow,
state->bounce);
}
} }
} }
#endif #endif
@ -269,11 +356,21 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* do bssrdf scatter step if we picked a bssrdf closure */ /* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) { if(sc) {
uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb); uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
float bssrdf_u, bssrdf_v; float bssrdf_u, bssrdf_v;
path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); path_state_rng_2D(kg,
subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false); rng,
state,
PRNG_BSDF_U,
&bssrdf_u, &bssrdf_v);
subsurface_scatter_step(kg,
&sd,
state->flag,
sc,
&lcg_state,
bssrdf_u, bssrdf_v,
false);
} }
} }
#endif #endif
@ -281,11 +378,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
if(kernel_data.integrator.use_direct_light) { if(kernel_data.integrator.use_direct_light) {
bool all = kernel_data.integrator.sample_all_lights_indirect; bool all = kernel_data.integrator.sample_all_lights_indirect;
kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all); kernel_branched_path_surface_connect_light(kg,
rng,
&sd,
state,
throughput,
1.0f,
L,
all);
} }
#endif #endif
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, state, L, ray))
break; break;
} }
} }
@ -326,7 +430,15 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput) ccl_device bool kernel_path_subsurface_scatter(
KernelGlobals *kg,
ShaderData *sd,
PathRadiance *L,
PathState *state,
RNG *rng,
Ray *ray,
float3 *throughput,
SubsurfaceIndirectRays *ss_indirect)
{ {
float bssrdf_probability; float bssrdf_probability;
ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
@ -349,8 +461,8 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
bssrdf_u, bssrdf_v, bssrdf_u, bssrdf_v,
false); false);
#ifdef __VOLUME__ #ifdef __VOLUME__
Ray volume_ray = *ray; ss_indirect->need_update_volume_stack =
bool need_update_volume_stack = kernel_data.integrator.use_volumes && kernel_data.integrator.use_volumes &&
ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
#endif #endif
@ -367,47 +479,82 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
sc, sc,
false); false);
float3 tp = *throughput; PathState *hit_state = &ss_indirect->state;
PathState hit_state = *state; Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
Ray hit_ray = *ray; float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
hit_state.rng_offset += PRNG_BOUNCE_NUM; *hit_state = *state;
*hit_ray = *ray;
*hit_tp = *throughput;
kernel_path_surface_connect_light(kg, rng, sd, tp, state, L); hit_state->rng_offset += PRNG_BOUNCE_NUM;
if(kernel_path_surface_bounce(kg, rng, sd, &tp, &hit_state, L, &hit_ray)) { kernel_path_surface_connect_light(kg, rng, sd, *hit_tp, state, L);
if(kernel_path_surface_bounce(kg,
rng,
sd,
hit_tp,
hit_state,
L,
hit_ray))
{
#ifdef __LAMP_MIS__ #ifdef __LAMP_MIS__
hit_state.ray_t = 0.0f; hit_state->ray_t = 0.0f;
#endif #endif
ss_indirect->num_rays++;
#ifdef __VOLUME__
if(need_update_volume_stack) {
/* Setup ray from previous surface point to the new one. */
volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
&volume_ray.t);
kernel_volume_stack_update_for_subsurface(
kg,
&volume_ray,
hit_state.volume_stack);
/* Move volume ray forward. */
volume_ray.P = hit_ray.P;
}
#endif
kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
} }
} }
return true; return true;
} }
return false; return false;
} }
/* Trace subsurface indirect rays separately after the path loop, to reduce
* GPU stack memory usage. this way ShaderData and other data structures
* used during the loop are not needed during kernel_path_indirect.
*/
ccl_device void kernel_path_subsurface_scatter_indirect(
KernelGlobals *kg,
PathRadiance *L,
PathState *state,
RNG *rng,
Ray *ray,
SubsurfaceIndirectRays *ss_indirect)
{
for (int i = 0; i < ss_indirect->num_rays; i++) {
Ray *indirect_ray = &ss_indirect->rays[i];
float3 indirect_throughput = ss_indirect->throughputs[i];
*state = ss_indirect->state;
#ifdef __VOLUME__
if(ss_indirect->need_update_volume_stack) {
/* TODO(sergey): Single assignment per scatter. */
Ray volume_ray = *ray;
/* Setup ray from previous surface point to the new one. */
volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
&volume_ray.t);
kernel_volume_stack_update_for_subsurface(
kg,
&volume_ray,
state->volume_stack);
}
#endif
/* Note that this modifies state. */
kernel_path_indirect(kg, rng, indirect_ray, indirect_throughput, state->num_samples, state, L);
/* For render passes, sum and reset indirect light pass variables
* for the next samples.
*/
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
#endif #endif
ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
@ -427,6 +574,11 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
debug_data_init(&debug_data); debug_data_init(&debug_data);
#endif #endif
#ifdef __SUBSURFACE__
SubsurfaceIndirectRays ss_indirect;
ss_indirect.num_rays = 0;
#endif
/* path iteration */ /* path iteration */
for(;;) { for(;;) {
/* intersect scene */ /* intersect scene */
@ -664,9 +816,18 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
/* bssrdf scatter to a different location on the same object, replacing /* bssrdf scatter to a different location on the same object, replacing
* the closures with a diffuse BSDF */ * the closures with a diffuse BSDF */
if(sd.flag & SD_BSSRDF) { if(sd.flag & SD_BSSRDF) {
if(kernel_path_subsurface_scatter(kg, &sd, &L, &state, rng, &ray, &throughput)) if(kernel_path_subsurface_scatter(kg,
&sd,
&L,
&state,
rng,
&ray,
&throughput,
&ss_indirect))
{
break; break;
} }
}
#endif #endif
/* direct lighting */ /* direct lighting */
@ -677,6 +838,20 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
break; break;
} }
#ifdef __SUBSURFACE__
/* Trace indirect subsurface afterwards to reduce GPU stack size.
* note that this modifies state.
*/
if (ss_indirect.num_rays) {
kernel_path_subsurface_scatter_indirect(kg,
&L,
&state,
rng,
&ray,
&ss_indirect);
}
#endif
float3 L_sum = path_radiance_clamp_and_sum(kg, &L); float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
kernel_write_light_passes(kg, buffer, &L, sample); kernel_write_light_passes(kg, buffer, &L, sample);

@ -91,10 +91,27 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
float3 tp = throughput; float3 tp = throughput;
Ray bsdf_ray; Ray bsdf_ray;
if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray)) if(!kernel_branched_path_surface_bounce(kg,
&bsdf_rng,
sd,
sc,
j,
num_samples,
&tp,
&ps,
L,
&bsdf_ray))
{
continue; continue;
}
kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); kernel_path_indirect(kg,
rng,
&bsdf_ray,
tp*num_samples_inv,
num_samples,
&ps,
L);
/* for render passes, sum and reset indirect light pass variables /* for render passes, sum and reset indirect light pass variables
* for the next samples */ * for the next samples */
@ -316,8 +333,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
(void)result; (void)result;
kernel_assert(result == VOLUME_PATH_SCATTERED); kernel_assert(result == VOLUME_PATH_SCATTERED);
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { if(kernel_path_volume_bounce(kg,
kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); rng,
&volume_sd,
&tp,
&ps,
&L,
&pray))
{
kernel_path_indirect(kg,
rng,
&pray,
tp*num_samples_inv,
num_samples,
&ps,
&L);
/* for render passes, sum and reset indirect light pass variables /* for render passes, sum and reset indirect light pass variables
* for the next samples */ * for the next samples */
@ -360,8 +390,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
* alternatively get decoupled ray marching working on the GPU */ * alternatively get decoupled ray marching working on the GPU */
kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L); kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { if(kernel_path_volume_bounce(kg,
kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L); rng,
&volume_sd,
&tp,
&ps,
&L,
&pray))
{
kernel_path_indirect(kg,
rng,
&pray,
tp,
num_samples,
&ps,
&L);
/* for render passes, sum and reset indirect light pass variables /* for render passes, sum and reset indirect light pass variables
* for the next samples */ * for the next samples */

@ -520,18 +520,6 @@ typedef ccl_addr_space struct Intersection {
#endif #endif
} Intersection; } Intersection;
/* Subsurface Intersection result */
struct SubsurfaceIntersection
{
Ray ray;
float3 weight[BSSRDF_MAX_HITS];
int num_hits;
struct Intersection hits[BSSRDF_MAX_HITS];
float3 Ng[BSSRDF_MAX_HITS];
};
/* Primitives */ /* Primitives */
typedef enum PrimitiveType { typedef enum PrimitiveType {
@ -764,6 +752,30 @@ typedef struct PathState {
#endif #endif
} PathState; } PathState;
/* Subsurface */
/* Struct to gather multiple SSS hits. */
struct SubsurfaceIntersection
{
Ray ray;
float3 weight[BSSRDF_MAX_HITS];
int num_hits;
Intersection hits[BSSRDF_MAX_HITS];
float3 Ng[BSSRDF_MAX_HITS];
};
/* Struct to gather SSS indirect rays and delay tracing them. */
struct SubsurfaceIndirectRays
{
bool need_update_volume_stack;
PathState state;
int num_rays;
Ray rays[BSSRDF_MAX_HITS];
float3 throughputs[BSSRDF_MAX_HITS];
};
/* Constant Kernel Data /* Constant Kernel Data
* *
* These structs are passed from CPU to various devices, and the struct layout * These structs are passed from CPU to various devices, and the struct layout