Cycles: Branched path tracing for the split kernel

This implements branched path tracing for the split kernel.

The general approach is to store the ray state at a branch point, trace the
branched ray as normal, then restore the state as necessary before iterating
to the next part of the path. A state machine is used to advance the indirect
loop state, which avoids the need to add any new kernels. Each iteration, the
state machine recreates as much state as possible from the stored ray to keep
overall storage down.
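
As a rough sketch of that state machine in plain C++ (all names here are
illustrative stand-ins, not the actual kernel API; compare SplitBranchedState
and kernel_split_branched_path_surface_indirect_light_iter in the diff below):

struct PathState { int bounce; /* RNG offsets, flags, ... */ };

struct BranchState {
	PathState saved;       /* copy of the path state from the branch point */
	int next_closure = 0;  /* loop counters so iteration can resume */
	int next_sample = 0;
};

/* One step of the indirect state machine: either set up the next branched
 * ray and record where to resume, or report that the loop is finished so
 * the caller can restore the saved state and continue the main path. */
bool indirect_iter(BranchState &bs, int num_closures, int num_samples)
{
	for(int i = bs.next_closure; i < num_closures; i++) {
		for(int j = bs.next_sample; j < num_samples; j++) {
			bs.next_closure = i;
			bs.next_sample = j + 1;
			return true;  /* trace this branched ray with the normal kernels */
		}
		bs.next_sample = 0;
	}
	return false;  /* done: restore bs.saved, then continue the main path */
}

/* Driver pattern (cf. kernel_next_iteration_setup in the diff): */
void drive(BranchState &bs)
{
	while(indirect_iter(bs, /*num_closures*/ 2, /*num_samples*/ 4)) {
		/* ... trace the branched ray through the regular kernels ... */
	}
	/* restore bs.saved here and continue the main path */
}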

It's tricky to keep all the different integration loops in sync, so this
needs lots of testing to make sure everything works correctly. We should
probably start deduplicating the integration loops now.

Non-branched BMW is ~2% slower, while classroom is ~2% faster; other scenes
still need more testing.

Reviewers: sergey, nirved

Reviewed By: nirved

Subscribers: Blendify, bliblubli

Differential Revision: https://developer.blender.org/D2611
Mai Lavelle 2017-03-20 22:31:54 -04:00
parent 89b1805df6
commit 915766f42d
25 changed files with 1049 additions and 395 deletions

@ -78,7 +78,7 @@ def use_cuda(context):
def use_branched_path(context):
cscene = context.scene.cycles
return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))
return (cscene.progressive == 'BRANCHED_PATH')
def use_sample_all_lights(context):
@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
row = layout.row()
sub = row.row()
sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
sub.prop(cscene, "progressive", text="")
row.prop(cscene, "use_square_samples")

@ -240,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);

@ -235,6 +235,7 @@ set(SRC_UTIL_HEADERS
)
set(SRC_SPLIT_HEADERS
split/kernel_branched.h
split/kernel_buffer_update.h
split/kernel_data_init.h
split/kernel_direct_lighting.h

@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
PathState *state,
ccl_addr_space PathState *state,
RNG *rng,
float3 throughput,
float3 ao_alpha)
@ -98,6 +98,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
}
}
#ifndef __SPLIT_KERNEL__
ccl_device void kernel_path_indirect(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
@ -818,5 +820,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
path_rng_end(kg, rng_state, rng);
}
#endif /* __SPLIT_KERNEL__ */
CCL_NAMESPACE_END

@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
PathState *state,
ccl_addr_space PathState *state,
RNG *rng,
float3 throughput)
{
@ -65,6 +65,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
}
}
#ifndef __SPLIT_KERNEL__
/* bounce off surface and integrate indirect light */
ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
@ -648,6 +649,8 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
path_rng_end(kg, rng_state, rng);
}
#endif /* __SPLIT_KERNEL__ */
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END

@ -155,7 +155,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
Ray *ray)
ccl_addr_space Ray *ray)
{
/* sample BSDF */
float bsdf_pdf;

@ -417,9 +417,8 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
}
#ifndef __SPLIT_KERNEL__
/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state,
int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@ -507,7 +506,6 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
/* setup diffuse bsdf */
subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
}
#endif /* ! __SPLIT_KERNEL__ */
CCL_NAMESPACE_END

@ -71,22 +71,18 @@ CCL_NAMESPACE_BEGIN
# endif
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
# ifndef __SPLIT_KERNEL__
# define __BRANCHED_PATH__
# endif
# define __BRANCHED_PATH__
# ifdef WITH_OSL
# define __OSL__
# endif
# define __SUBSURFACE__
# define __PRINCIPLED__
# define __SUBSURFACE__
# define __CMJ__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
# ifndef __SPLIT_KERNEL__
# define __VOLUME_DECOUPLED__
# define __VOLUME_RECORD_ALL__
# endif
# define __VOLUME_DECOUPLED__
# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
#ifdef __KERNEL_CUDA__
@ -138,6 +134,7 @@ CCL_NAMESPACE_BEGIN
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
# define __CMJ__
# define __BRANCHED_PATH__
# endif /* __KERNEL_OPENCL_AMD__ */
# ifdef __KERNEL_OPENCL_INTEL_CPU__
@ -1300,7 +1297,6 @@ typedef ccl_addr_space struct DebugData {
* Queue 3 - Shadow ray cast kernel - AO
* Queue 4 - Shadow ray cast kernel - direct lighting
*/
#define NUM_QUEUES 4
/* Queue names */
enum QueueNumber {
@ -1313,22 +1309,37 @@ enum QueueNumber {
* 3. Rays to be regenerated
* are enqueued here.
*/
QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contribution for AO are enqueued here.
*/
QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
QUEUE_SHADOW_RAY_CAST_AO_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contributing for direct lighting are enqueued here.
*/
QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
QUEUE_SHADOW_RAY_CAST_DL_RAYS,
#ifdef __BRANCHED_PATH__
/* All rays moving to next iteration of the indirect loop for light */
QUEUE_LIGHT_INDIRECT_ITER,
# ifdef __VOLUME__
/* All rays moving to next iteration of the indirect loop for volumes */
QUEUE_VOLUME_INDIRECT_ITER,
# endif
# ifdef __SUBSURFACE__
/* All rays moving to next iteration of the indirect loop for subsurface */
QUEUE_SUBSURFACE_INDIRECT_ITER,
# endif
#endif /* __BRANCHED_PATH__ */
NUM_QUEUES
};
/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
#define RAY_STATE_MASK 0x007
#define RAY_FLAG_MASK 0x0F8
/* We use RAY_STATE_MASK to get ray_state */
#define RAY_STATE_MASK 0x0F
#define RAY_FLAG_MASK 0xF0
enum RayState {
RAY_INVALID = 0,
/* Denotes ray is actively involved in path-iteration. */
@ -1343,14 +1354,22 @@ enum RayState {
RAY_TO_REGENERATE,
/* Denotes ray has been regenerated */
RAY_REGENERATED,
/* Flag's ray has to execute shadow blocked function in AO part */
RAY_SHADOW_RAY_CAST_AO = 16,
/* Flag's ray has to execute shadow blocked function in direct lighting part. */
RAY_SHADOW_RAY_CAST_DL = 32,
/* Denotes ray is moving to next iteration of the branched indirect loop */
RAY_LIGHT_INDIRECT_NEXT_ITER,
RAY_VOLUME_INDIRECT_NEXT_ITER,
RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
/* Ray flags */
/* Flags to denote that the ray is currently evaluating the branched indirect loop */
RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
};
#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
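
The repack above widens the state from three bits to the low nibble so the new
*_INDIRECT_NEXT_ITER states fit, with flags in the high nibble. A minimal
stand-alone C++ demonstration of the packing follows; the state and flag
values are hypothetical, the real ones come from the RayState enum above:

#include <cassert>

/* Low nibble holds the state, high nibble holds flags, as above. */
#define DEMO_STATE_MASK 0x0F
#define DEMO_FLAG_MASK  0xF0
#define DEMO_STATE      0x06      /* hypothetical state value */
#define DEMO_FLAG       (1 << 4)  /* e.g. RAY_BRANCHED_LIGHT_INDIRECT */

int main()
{
	unsigned char ray_state = 0;
	/* ADD_RAY_FLAG: OR the flag into the high nibble. */
	ray_state |= DEMO_FLAG;
	/* ASSIGN_RAY_STATE: replace the low nibble, keep the flags. */
	ray_state = (ray_state & DEMO_FLAG_MASK) | DEMO_STATE;
	assert((ray_state & DEMO_STATE_MASK) == DEMO_STATE);
	assert(ray_state & DEMO_FLAG);  /* the flag survives state changes */
	return 0;
}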

@ -183,7 +183,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)

@ -110,7 +110,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)

@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_subsurface_scatter(
ccl_global char *kg,
ccl_constant KernelData *data)
{
ccl_local unsigned int local_queue_atomics;
kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
kernel_subsurface_scatter((KernelGlobals*)kg);
}

@ -0,0 +1,150 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
#ifdef __BRANCHED_PATH__
/* sets up the various state needed to do an indirect loop */
ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
{
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
/* save a copy of the state to restore later */
#define BRANCHED_STORE(name) \
branched_state->name = kernel_split_state.name[ray_index];
BRANCHED_STORE(path_state);
BRANCHED_STORE(throughput);
BRANCHED_STORE(ray);
BRANCHED_STORE(sd);
BRANCHED_STORE(isect);
BRANCHED_STORE(ray_state);
#undef BRANCHED_STORE
/* set loop counters to initial position */
branched_state->next_closure = 0;
branched_state->next_sample = 0;
}
/* ends an indirect loop and restores the previous state */
ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
{
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
/* restore state */
#define BRANCHED_RESTORE(name) \
kernel_split_state.name[ray_index] = branched_state->name;
BRANCHED_RESTORE(path_state);
BRANCHED_RESTORE(throughput);
BRANCHED_RESTORE(ray);
BRANCHED_RESTORE(sd);
BRANCHED_RESTORE(isect);
BRANCHED_RESTORE(ray_state);
#undef BRANCHED_RESTORE
/* leave indirect loop */
REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
}
/* bounce off surface and integrate indirect light */
ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
int ray_index,
float num_samples_adjust,
ShaderData *saved_sd,
bool reset_path_state)
{
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
ShaderData *sd = saved_sd;
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
float3 throughput = branched_state->throughput;
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF(sc->type))
continue;
/* transparency is not handled here, but in outer loop */
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
continue;
int num_samples;
if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
num_samples = kernel_data.integrator.diffuse_samples;
else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
num_samples = 1;
else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
num_samples = kernel_data.integrator.glossy_samples;
else
num_samples = kernel_data.integrator.transmission_samples;
num_samples = ceil_to_int(num_samples_adjust*num_samples);
float num_samples_inv = num_samples_adjust/num_samples;
RNG bsdf_rng = cmj_hash(rng, i);
for(int j = branched_state->next_sample; j < num_samples; j++) {
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
if(reset_path_state) {
*ps = branched_state->path_state;
}
ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
*tp = throughput;
ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
if(!kernel_branched_path_surface_bounce(kg,
&bsdf_rng,
sd,
sc,
j,
num_samples,
tp,
ps,
L,
bsdf_ray))
{
continue;
}
/* update state for next iteration */
branched_state->next_closure = i;
branched_state->next_sample = j+1;
branched_state->num_samples = num_samples;
/* start the indirect path */
*tp *= num_samples_inv;
return true;
}
branched_state->next_sample = 0;
}
return false;
}
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END

@ -105,21 +105,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
/* Initialize queue data and queue index. */
if(thread_index < queuesize) {
/* Initialize active ray queue. */
kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
/* Initialize background and buffer update queue. */
kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
/* Initialize shadow ray cast of AO queue. */
kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
/* Initialize shadow ray cast of direct lighting queue. */
kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
for(int i = 0; i < NUM_QUEUES; i++) {
kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
}
}
if(thread_index == 0) {
Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
for(int i = 0; i < NUM_QUEUES; i++) {
Queue_index[i] = 0;
}
/* The scene-intersect kernel should not use the queues the very first time,
* since the queues would be empty.
*/

@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
#ifdef __COMPUTE_DEVICE_GPU__
/* If we are executing on a GPU device, we exit all threads that are not
* required.
*
* If we are executing on a CPU device, then we need to keep all threads
* active since we have barrier() calls later in the kernel. CPU devices,
* expect all threads to execute barrier statement.
*/
if(ray_index == QUEUE_EMPTY_SLOT) {
return;
}
#endif
#ifndef __COMPUTE_DEVICE_GPU__
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
/* direct lighting */
#ifdef __EMISSION__
RNG rng = kernel_split_state.rng[ray_index];
bool flag = (kernel_data.integrator.use_direct_light &&
(sd->flag & SD_BSDF_HAS_EVAL));
# ifdef __BRANCHED_PATH__
if(flag && kernel_data.integrator.branched) {
flag = false;
enqueue_flag = 1;
}
# endif /* __BRANCHED_PATH__ */
# ifdef __SHADOW_TRICKS__
if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
flag = false;
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
float3 throughput = kernel_split_state.throughput[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
kernel_branched_path_surface_connect_light(kg,
&rng,
sd,
emission_sd,
state,
throughput,
1.0f,
L,
1);
enqueue_flag = 1;
}
# endif /* __SHADOW_TRICKS__ */
if(flag) {
/* Sample illumination from lights to find path contribution. */
float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.bsdf_eval[ray_index] = L_light;
kernel_split_state.is_lamp[ray_index] = is_lamp;
/* Mark ray state for next shadow kernel. */
ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
enqueue_flag = 1;
}
}
@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
#endif /* __EMISSION__ */
}
#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
#ifdef __EMISSION__
/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
enqueue_ray_index_local(ray_index,
@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
#endif
#ifdef __BRANCHED_PATH__
/* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays.
* This is the last kernel before next_iteration_setup that uses local atomics, so we do it here.
*/
ccl_barrier(CCL_LOCAL_MEM_FENCE);
if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
*local_queue_atomics = 0;
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
enqueue_ray_index_local(ray_index,
QUEUE_LIGHT_INDIRECT_ITER,
IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END

@ -16,6 +16,81 @@
CCL_NAMESPACE_BEGIN
#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
{
kernel_split_branched_path_indirect_loop_init(kg, ray_index);
ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
}
ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
{
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
/* GPU: no decoupled ray marching, scatter probabilistically */
int num_samples = kernel_data.integrator.volume_samples;
float num_samples_inv = 1.0f/num_samples;
Ray volume_ray = branched_state->ray;
volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
for(int j = branched_state->next_sample; j < num_samples; j++) {
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
*ps = branched_state->path_state;
ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
*pray = branched_state->ray;
ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
*tp = branched_state->throughput * num_samples_inv;
/* branch RNG state */
path_state_branch(ps, j, num_samples);
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
/* indirect light bounce */
if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
continue;
}
/* start the indirect path */
branched_state->next_closure = 0;
branched_state->next_sample = j+1;
branched_state->num_samples = num_samples;
return true;
}
# endif
}
kernel_split_branched_path_indirect_loop_end(kg, ray_index);
/* todo: avoid this calculation using decoupled ray marching */
float3 throughput = kernel_split_state.throughput[ray_index];
kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
kernel_split_state.throughput[ray_index] = throughput;
return false;
}
#endif /* __BRANCHED_PATH__ && __VOLUME__ */
ccl_device void kernel_do_volume(KernelGlobals *kg)
{
@ -23,37 +98,37 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
/* We will empty this queue in this kernel. */
if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
# ifdef __BRANCHED_PATH__
kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
# endif /* __BRANCHED_PATH__ */
}
/* Fetch use_queues_flag. */
char local_use_queues_flag = *kernel_split_params.use_queues_flag;
ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
if(local_use_queues_flag) {
if(*kernel_split_params.use_queues_flag) {
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
if(ray_index == QUEUE_EMPTY_SLOT) {
return;
}
}
if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
ccl_global char *ray_state = kernel_split_state.ray_state;
bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
/* Sanitize volume stack. */
if(!hit) {
@ -64,31 +139,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
Ray volume_ray = *ray;
volume_ray.t = (hit)? isect->t: FLT_MAX;
bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
# ifdef __BRANCHED_PATH__
if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
# endif /* __BRANCHED_PATH__ */
bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
/* indirect light bounce */
if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
else
ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
/* indirect light bounce */
if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
kernel_split_path_end(kg, ray_index);
}
}
# endif /* __VOLUME_SCATTER__ */
}
# endif
# ifdef __BRANCHED_PATH__
}
else {
kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
}
# endif /* __BRANCHED_PATH__ */
}
kernel_split_state.rng[ray_index] = rng;
}
#endif
# ifdef __BRANCHED_PATH__
/* iter loop */
ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
QUEUE_VOLUME_INDIRECT_ITER,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
}
# endif /* __BRANCHED_PATH__ */
#endif /* __VOLUME__ */
}

@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
* flag RAY_SHADOW_RAY_CAST_AO
*/
ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
KernelGlobals *kg,
ccl_local_param BackgroundAOLocals *locals)
@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
#ifdef __AO__
char enqueue_flag = 0;
char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
#endif
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@ -155,8 +157,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
}
if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
kernel_split_path_end(kg, ray_index);
}
}
#endif /* __HOLDOUT__ */
@ -164,18 +165,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
/* Holdout mask objects do not write data passes. */
kernel_write_data_passes(kg,
buffer,
L,
sd,
sample,
state,
throughput);
#ifdef __BRANCHED_PATH__
if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
#endif /* __BRANCHED_PATH__ */
{
/* Holdout mask objects do not write data passes. */
kernel_write_data_passes(kg,
buffer,
L,
sd,
sample,
state,
throughput);
}
/* Blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy).
*/
if(kernel_data.integrator.filter_glossy != FLT_MAX) {
#ifndef __BRANCHED_PATH__
if(kernel_data.integrator.filter_glossy != FLT_MAX)
#else
if(kernel_data.integrator.filter_glossy != FLT_MAX &&
(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
#endif /* __BRANCHED_PATH__ */
{
float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
if(blur_pdf < 1.0f) {
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@ -201,19 +215,32 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
* mainly due to the mixed-in MIS that we use; it gives too many unneeded
* shader evaluations, and we only need emission if we are going to terminate.
*/
#ifndef __BRANCHED_PATH__
float probability = path_state_terminate_probability(kg, state, throughput);
#else
float probability = 1.0f;
if(!kernel_data.integrator.branched) {
probability = path_state_terminate_probability(kg, state, throughput);
}
else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
probability = path_state_terminate_probability(kg, state, throughput*num_samples);
}
else if(state->flag & PATH_RAY_TRANSPARENT) {
probability = path_state_terminate_probability(kg, state, throughput);
}
#endif
if(probability == 0.0f) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
if(probability != 1.0f) {
float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
if(terminate >= probability) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
kernel_split_path_end(kg, ray_index);
}
else {
kernel_split_state.throughput[ray_index] = throughput/probability;
@ -225,61 +252,23 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __AO__
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion ||
(sd->flag & SD_AO))
{
/* todo: solve correlation */
float bsdf_u, bsdf_v;
path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
float ao_factor = kernel_data.background.ao_factor;
float3 ao_N;
kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
float3 ao_D;
float ao_pdf;
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray _ray;
_ray.P = ray_offset(sd->P, sd->Ng);
_ray.D = ao_D;
_ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__
_ray.time = sd->time;
#endif
_ray.dP = sd->dP;
_ray.dD = differential3_zero();
kernel_split_state.ao_light_ray[ray_index] = _ray;
ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
enqueue_flag_AO_SHADOW_RAY_CAST = 1;
}
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
enqueue_flag = 1;
}
}
#endif /* __AO__ */
kernel_split_state.rng[ray_index] = rng;
kernel_split_state.rng[ray_index] = rng;
#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
/* Enqueue RAY_UPDATE_BUFFER rays. */
enqueue_ray_index_local(ray_index,
QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
enqueue_flag,
kernel_split_params.queue_size,
&locals->queue_atomics_bg,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
#ifdef __AO__
/* Enqueue to-shadow-ray-cast rays. */
enqueue_ray_index_local(ray_index,
QUEUE_SHADOW_RAY_CAST_AO_RAYS,
enqueue_flag_AO_SHADOW_RAY_CAST,
enqueue_flag,
kernel_split_params.queue_size,
&locals->queue_atomics_ao,
kernel_split_state.queue_data,

@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
if(state->bounce > kernel_data.integrator.ao_bounces) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
kernel_split_path_end(kg, ray_index);
}
}
}
@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
#ifdef __PASSES__
if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
#endif
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
path_radiance_accum_background(L, state, (*throughput), L_background);
#endif
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
kernel_split_path_end(kg, ray_index);
}
}

@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
kernel_path_subsurface_accum_indirect(ss_indirect, L);
#ifdef __BRANCHED_PATH__
if(!kernel_data.integrator.branched) {
#endif
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
kernel_path_subsurface_accum_indirect(ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
*/
if(ss_indirect->num_rays) {
kernel_path_subsurface_setup_indirect(kg,
ss_indirect,
state,
ray,
L,
throughput);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
/* Trace indirect subsurface rays by restarting the loop. This uses less
* stack memory than invoking kernel_path_indirect.
*/
if(ss_indirect->num_rays) {
kernel_path_subsurface_setup_indirect(kg,
ss_indirect,
state,
ray,
L,
throughput);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
}
#ifdef __BRANCHED_PATH__
}
#endif
#endif /* __SUBSURFACE__ */

@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
* RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
*/
#ifdef __BRANCHED_PATH__
ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
{
kernel_split_branched_path_indirect_loop_init(kg, ray_index);
ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
}
ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
{
kernel_split_branched_path_indirect_loop_end(kg, ray_index);
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
/* continue in case of transparency */
*throughput *= shader_bsdf_transparency(kg, sd);
if(is_zero(*throughput)) {
kernel_split_path_end(kg, ray_index);
}
else {
/* Update Path State */
state->flag |= PATH_RAY_TRANSPARENT;
state->transparent_bounce++;
ray->P = ray_offset(sd->P, -sd->Ng);
ray->t -= sd->ray_length; /* clipping works through transparent */
# ifdef __RAY_DIFFERENTIALS__
ray->dP = sd->dP;
ray->dD.dx = -sd->dI.dx;
ray->dD.dy = -sd->dI.dy;
# endif /* __RAY_DIFFERENTIALS__ */
# ifdef __VOLUME__
/* enter/exit volume */
kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
# endif /* __VOLUME__ */
}
}
#endif /* __BRANCHED_PATH__ */
ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
ccl_local_param unsigned int *local_queue_atomics)
{
@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
}
char enqueue_flag = 0;
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@ -75,102 +120,125 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
#ifdef __COMPUTE_DEVICE_GPU__
/* If we are executing on a GPU device, we exit all threads that are not
* required.
*
* If we are executing on a CPU device, then we need to keep all threads
* active since we have barrier() calls later in the kernel. CPU devices,
* expect all threads to execute barrier statement.
*/
if(ray_index == QUEUE_EMPTY_SLOT) {
return;
}
#endif
#ifndef __COMPUTE_DEVICE_GPU__
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
/* Load ShaderData structure. */
PathRadiance *L = NULL;
ccl_global PathState *state = NULL;
ccl_global char *ray_state = kernel_split_state.ray_state;
/* Path radiance update for AO/Direct_lighting's shadow blocked. */
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
{
state = &kernel_split_state.path_state[ray_index];
L = &kernel_split_state.path_radiance[ray_index];
float3 _throughput = kernel_split_state.throughput[ray_index];
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
// TODO(mai): investigate correctness here
char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
if(update_path_radiance) {
path_radiance_accum_ao(L,
_throughput,
kernel_split_state.ao_alpha[ray_index],
kernel_split_state.ao_bsdf[ray_index],
shadow,
state->bounce);
}
else {
path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
}
REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
}
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
float3 shadow = kernel_split_state.light_ray[ray_index].P;
// TODO(mai): investigate correctness here
char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
if(update_path_radiance) {
path_radiance_accum_light(L,
_throughput,
&L_light,
shadow,
1.0f,
state->bounce,
kernel_split_state.is_lamp[ray_index]);
}
else {
path_radiance_accum_total_light(L, _throughput, &L_light);
}
REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
}
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
if(active) {
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
state = &kernel_split_state.path_state[ray_index];
L = &kernel_split_state.path_radiance[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
/* Compute direct lighting and next bounce. */
if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
#ifdef __BRANCHED_PATH__
if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
#endif
/* Compute direct lighting and next bounce. */
if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
kernel_split_path_end(kg, ray_index);
}
#ifdef __BRANCHED_PATH__
}
else {
kernel_split_branched_indirect_light_init(kg, ray_index);
if(kernel_split_branched_path_surface_indirect_light_iter(kg,
ray_index,
1.0f,
&kernel_split_state.branched_state[ray_index].sd,
true))
{
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
kernel_split_branched_indirect_light_end(kg, ray_index);
}
}
#endif /* __BRANCHED_PATH__ */
kernel_split_state.rng[ray_index] = rng;
}
#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
/* Enqueue RAY_UPDATE_BUFFER rays. */
enqueue_ray_index_local(ray_index,
QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
enqueue_flag,
IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
#ifdef __BRANCHED_PATH__
/* iter loop */
if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
}
ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
QUEUE_LIGHT_INDIRECT_ITER,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
if(kernel_split_branched_path_surface_indirect_light_iter(kg,
ray_index,
1.0f,
&kernel_split_state.branched_state[ray_index].sd,
true))
{
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
kernel_split_branched_indirect_light_end(kg, ray_index);
}
}
# ifdef __VOLUME__
/* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
ccl_barrier(CCL_LOCAL_MEM_FENCE);
if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
*local_queue_atomics = 0;
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
enqueue_ray_index_local(ray_index,
QUEUE_VOLUME_INDIRECT_ITER,
IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
# endif /* __VOLUME__ */
# ifdef __SUBSURFACE__
/* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
ccl_barrier(CCL_LOCAL_MEM_FENCE);
if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
*local_queue_atomics = 0;
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
enqueue_ray_index_local(ray_index,
QUEUE_SUBSURFACE_INDIRECT_ITER,
IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
# endif /* __SUBSURFACE__ */
#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END

@ -38,8 +38,10 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
ccl_global char *ray_state = kernel_split_state.ray_state;
char enqueue_flag = 0;
if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
enqueue_flag = 1;
}
@ -52,7 +54,7 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
kernel_split_params.queue_index);
/* Continue on with shader evaluation. */
if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
Intersection isect = kernel_split_state.isect[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
@ -62,8 +64,27 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
&kernel_split_state.sd[ray_index],
&isect,
&ray);
#ifndef __BRANCHED_PATH__
float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
#else
ShaderContext ctx = SHADER_CONTEXT_MAIN;
float rbsdf = 0.0f;
if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
}
if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
ctx = SHADER_CONTEXT_INDIRECT;
}
shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
shader_merge_closures(&kernel_split_state.sd[ray_index]);
#endif /* __BRANCHED_PATH__ */
kernel_split_state.rng[ray_index] = rng;
}
}

@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
}
if(ray_index == QUEUE_EMPTY_SLOT)
if(ray_index == QUEUE_EMPTY_SLOT) {
return;
/* Flag determining if we need to update L. */
char update_path_radiance = 0;
if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
float3 shadow;
Ray ray = *light_ray_global;
update_path_radiance = !(shadow_blocked(kg,
&kernel_split_state.sd_DL_shadow[ray_index],
state,
&ray,
&shadow));
*light_ray_global = ray;
/* We use light_ray_global's P and t to store shadow and
* update_path_radiance.
*/
light_ray_global->P = shadow;
light_ray_global->t = update_path_radiance;
}
ShaderData *sd = &kernel_split_state.sd[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
float3 throughput = kernel_split_state.throughput[ray_index];
#ifdef __BRANCHED_PATH__
if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
#endif
kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
#ifdef __BRANCHED_PATH__
}
else {
kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
}
#endif
kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END

@ -32,28 +32,71 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
if(ray_index == QUEUE_EMPTY_SLOT)
return;
/* Flag determining if we need to update L. */
char update_path_radiance = 0;
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
Ray ray = kernel_split_state.light_ray[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
float3 throughput = kernel_split_state.throughput[ray_index];
RNG rng = kernel_split_state.rng[ray_index];
if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
bool is_lamp = kernel_split_state.is_lamp[ray_index];
float3 shadow;
Ray ray = *light_ray_global;
update_path_radiance = !(shadow_blocked(kg,
&kernel_split_state.sd_DL_shadow[ray_index],
state,
&ray,
&shadow));
# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
bool use_branched = false;
int all = 0;
*light_ray_global = ray;
/* We use light_ray_global's P and t to store shadow and
* update_path_radiance.
*/
light_ray_global->P = shadow;
light_ray_global->t = update_path_radiance;
if(state->flag & PATH_RAY_SHADOW_CATCHER) {
use_branched = true;
all = 1;
}
# if defined(__BRANCHED_PATH__)
else if(kernel_data.integrator.branched) {
use_branched = true;
if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
all = (kernel_data.integrator.sample_all_lights_indirect);
}
else
{
all = (kernel_data.integrator.sample_all_lights_direct);
}
}
# endif /* __BRANCHED_PATH__ */
if(use_branched) {
kernel_branched_path_surface_connect_light(kg,
&rng,
sd,
emission_sd,
state,
throughput,
1.0f,
L,
all);
}
else
# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
{
/* trace shadow ray */
float3 shadow;
if(!shadow_blocked(kg,
emission_sd,
state,
&ray,
&shadow))
{
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput, &L_light);
}
}
kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END

@ -37,41 +37,42 @@
#include "util/util_atomic.h"
#include "kernel/kernel_random.h"
#include "kernel/kernel_projection.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_differential.h"
#include "kernel/kernel_camera.h"
#include "kernel/geom/geom.h"
#include "kernel/bvh/bvh.h"
#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
#include "kernel/kernel_light.h"
#include "kernel/kernel_passes.h"
#ifdef __SUBSURFACE__
# include "kernel/kernel_subsurface.h"
#endif
#ifdef __VOLUME__
# include "kernel/kernel_volume.h"
#endif
#include "kernel/kernel_path_state.h"
#include "kernel/kernel_shadow.h"
#include "kernel/kernel_emission.h"
#include "kernel/kernel_path_common.h"
#include "kernel/kernel_path_surface.h"
#include "kernel/kernel_path_volume.h"
#include "kernel/kernel_path_subsurface.h"
#ifdef __KERNEL_DEBUG__
# include "kernel/kernel_debug.h"
#include "kernel/kernel_path.h"
#ifdef __BRANCHED_PATH__
# include "kernel/kernel_path_branched.h"
#endif
#include "kernel/kernel_queues.h"
#include "kernel/kernel_work_stealing.h"
#ifdef __BRANCHED_PATH__
# include "kernel/split/kernel_branched.h"
#endif
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
{
ccl_global char *ray_state = kernel_split_state.ray_state;
#ifdef __BRANCHED_PATH__
if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
}
else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
}
else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
}
else {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
}
#else
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
#endif
}
CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_H__ */

@ -62,7 +62,46 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
#else
# define SPLIT_DATA_DEBUG_ENTRIES
#endif
#endif /* DEBUG */
#ifdef __BRANCHED_PATH__
typedef ccl_global struct SplitBranchedState {
/* various state that must be kept and restored after an indirect loop */
PathState path_state;
float3 throughput;
Ray ray;
struct ShaderData sd;
Intersection isect;
char ray_state;
/* indirect loop state */
int next_closure;
int next_sample;
int num_samples;
#ifdef __SUBSURFACE__
int ss_next_closure;
int ss_next_sample;
int next_hit;
int num_hits;
uint lcg_state;
SubsurfaceIntersection ss_isect;
# ifdef __VOLUME__
VolumeStack volume_stack[VOLUME_STACK_SIZE];
# endif /* __VOLUME__ */
#endif /*__SUBSURFACE__ */
} SplitBranchedState;
#define SPLIT_DATA_BRANCHED_ENTRIES \
SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1)
#else
#define SPLIT_DATA_BRANCHED_ENTRIES
#endif /* __BRANCHED_PATH__ */
#define SPLIT_DATA_ENTRIES \
SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
@ -72,9 +111,6 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
@ -82,6 +118,7 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
SPLIT_DATA_BRANCHED_ENTRIES \
SPLIT_DATA_DEBUG_ENTRIES \
/* struct that holds pointers to data in the shared state buffer */
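
SPLIT_DATA_ENTRIES is an X-macro list: each consumer re-defines
SPLIT_DATA_ENTRY to expand the same list differently, e.g. into struct
members, buffer-size sums, or pointer setup. A minimal self-contained C++
sketch of the technique, with a hypothetical two-entry list rather than the
real one:

#include <cstddef>
#include <cstdio>

/* Hypothetical entry list; the real SPLIT_DATA_ENTRIES has many entries
 * and a third "num" argument. */
#define DEMO_ENTRIES \
	DEMO_ENTRY(int,   bounce) \
	DEMO_ENTRY(float, throughput)

/* Expansion 1: declare one member per entry. */
struct DemoState {
#define DEMO_ENTRY(type, name) type name;
	DEMO_ENTRIES
#undef DEMO_ENTRY
};

/* Expansion 2: sum the per-entry sizes (cf. sizing the shared state buffer). */
static const std::size_t demo_state_size = 0
#define DEMO_ENTRY(type, name) + sizeof(type)
	DEMO_ENTRIES
#undef DEMO_ENTRY
;

int main()
{
	std::printf("demo state size: %zu\n", demo_state_size);
	return 0;
}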

@ -16,42 +16,206 @@
CCL_NAMESPACE_BEGIN
#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
ccl_local_param unsigned int* local_queue_atomics)
ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
{
#ifdef __SUBSURFACE__
if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
*local_queue_atomics = 0;
kernel_split_branched_path_indirect_loop_init(kg, ray_index);
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
branched_state->ss_next_closure = 0;
branched_state->ss_next_sample = 0;
branched_state->num_hits = 0;
branched_state->next_hit = 0;
ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
}
ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
{
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
ShaderData *sd = &branched_state->sd;
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSSRDF(sc->type))
continue;
/* set up random number generator */
if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
branched_state->next_closure == 0 && branched_state->next_sample == 0)
{
branched_state->lcg_state = lcg_state_init(&rng,
branched_state->path_state.rng_offset,
branched_state->path_state.sample,
0x68bc21eb);
}
int num_samples = kernel_data.integrator.subsurface_samples;
float num_samples_inv = 1.0f/num_samples;
RNG bssrdf_rng = cmj_hash(rng, i);
/* do subsurface scatter step with copy of shader data, this will
* replace the BSSRDF with a diffuse BSDF closure */
for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
float bssrdf_u, bssrdf_v;
path_branched_rng_2D(kg,
&bssrdf_rng,
&branched_state->path_state,
j,
num_samples,
PRNG_BSDF_U,
&bssrdf_u,
&bssrdf_v);
/* intersection is expensive so avoid doing it multiple times for the same input */
if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
RNG lcg_state = branched_state->lcg_state;
SubsurfaceIntersection ss_isect_private;
branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
&ss_isect_private,
sd,
sc,
&lcg_state,
bssrdf_u, bssrdf_v,
true);
branched_state->lcg_state = lcg_state;
*ss_isect = ss_isect_private;
}
#ifdef __VOLUME__
Ray volume_ray = branched_state->ray;
bool need_update_volume_stack =
kernel_data.integrator.use_volumes &&
sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
#endif /* __VOLUME__ */
/* compute lighting with the BSDF closure */
for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
*bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
* important as the indirect path will write into bssrdf_sd */
SubsurfaceIntersection ss_isect_private = *ss_isect;
subsurface_scatter_multi_setup(kg,
&ss_isect_private,
hit,
bssrdf_sd,
&branched_state->path_state,
branched_state->path_state.flag,
sc,
true);
*ss_isect = ss_isect_private;
ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
*hit_state = branched_state->path_state;
path_state_branch(hit_state, j, num_samples);
#ifdef __VOLUME__
if(need_update_volume_stack) {
/* Setup ray from previous surface point to the new one. */
float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
/* this next part is expensive as it does scene intersection, so only do it once */
if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
branched_state->volume_stack[k] = hit_state->volume_stack[k];
}
kernel_volume_stack_update_for_subsurface(kg,
emission_sd,
&volume_ray,
branched_state->volume_stack);
}
for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
hit_state->volume_stack[k] = branched_state->volume_stack[k];
}
}
#endif /* __VOLUME__ */
#ifdef __EMISSION__
if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
/* direct light */
if(kernel_data.integrator.use_direct_light) {
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg,
&rng,
bssrdf_sd,
emission_sd,
hit_state,
branched_state->throughput,
num_samples_inv,
L,
all);
}
}
#endif /* __EMISSION__ */
/* indirect light */
if(kernel_split_branched_path_surface_indirect_light_iter(kg,
ray_index,
num_samples_inv,
bssrdf_sd,
false))
{
branched_state->ss_next_closure = i;
branched_state->ss_next_sample = j;
branched_state->next_hit = hit;
return true;
}
branched_state->next_closure = 0;
}
branched_state->next_hit = 0;
}
branched_state->ss_next_sample = 0;
}
kernel_split_branched_path_indirect_loop_end(kg, ray_index);
return false;
}
#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
{
int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
if(thread_index == 0) {
/* We will empty both queues in this kernel. */
kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
0);
#ifdef __COMPUTE_DEVICE_GPU__
/* If we are executing on a GPU device, we exit all threads that are not
* required.
*
* If we are executing on a CPU device, then we need to keep all threads
* active since we have barrier() calls later in the kernel. CPU devices,
* expect all threads to execute barrier statement.
*/
if(ray_index == QUEUE_EMPTY_SLOT) {
return;
}
#endif
char enqueue_flag = 0;
#ifndef __COMPUTE_DEVICE_GPU__
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
1);
get_ray_index(kg, thread_index,
QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
#ifdef __SUBSURFACE__
ccl_global char *ray_state = kernel_split_state.ray_state;
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@ -64,34 +228,85 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
if(sd->flag & SD_BSSRDF) {
if(kernel_path_subsurface_scatter(kg,
sd,
emission_sd,
L,
state,
&rng,
ray,
throughput,
ss_indirect)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
#ifdef __BRANCHED_PATH__
if(!kernel_data.integrator.branched) {
#endif
if(kernel_path_subsurface_scatter(kg,
sd,
emission_sd,
L,
state,
&rng,
ray,
throughput,
ss_indirect)) {
kernel_split_path_end(kg, ray_index);
}
#ifdef __BRANCHED_PATH__
}
else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
float bssrdf_probability;
ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
/* modify throughput for picking bssrdf or bsdf */
*throughput *= bssrdf_probability;
/* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) {
uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
float bssrdf_u, bssrdf_v;
path_state_rng_2D(kg,
&rng,
state,
PRNG_BSDF_U,
&bssrdf_u, &bssrdf_v);
subsurface_scatter_step(kg,
sd,
state,
state->flag,
sc,
&lcg_state,
bssrdf_u, bssrdf_v,
false);
}
}
else {
kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
}
#endif
}
kernel_split_state.rng[ray_index] = rng;
}
#ifndef __COMPUTE_DEVICE_GPU__
# ifdef __BRANCHED_PATH__
if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
}
#endif
/* Enqueue RAY_UPDATE_BUFFER rays. */
enqueue_ray_index_local(ray_index,
QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
enqueue_flag,
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
/* iter loop */
ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
QUEUE_SUBSURFACE_INDIRECT_ITER,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
}
# endif /* __BRANCHED_PATH__ */
#endif /* __SUBSURFACE__ */