diff --git a/build_files/build_environment/install_deps.sh b/build_files/build_environment/install_deps.sh
index dd1e7622be1..95cb82ee6ec 100755
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -284,7 +284,7 @@ NO_BUILD=false
 NO_CONFIRM=false
 USE_CXX11=true  # Mandatory in blender2.8
 
-PYTHON_VERSION="3.5.2"
+PYTHON_VERSION="3.5.3"
 PYTHON_VERSION_MIN="3.5"
 PYTHON_FORCE_BUILD=false
 PYTHON_FORCE_REBUILD=false
@@ -361,8 +361,7 @@ ALEMBIC_FORCE_BUILD=false
 ALEMBIC_FORCE_REBUILD=false
 ALEMBIC_SKIP=false
 
-# Version??
-OPENCOLLADA_VERSION="1.3"
+OPENCOLLADA_VERSION="1.6.47"
 OPENCOLLADA_FORCE_BUILD=false
 OPENCOLLADA_FORCE_REBUILD=false
 OPENCOLLADA_SKIP=false
@@ -729,7 +728,10 @@ _boost_version_nodots=`echo "$BOOST_VERSION" | sed -r 's/\./_/g'`
 BOOST_SOURCE=( "http://sourceforge.net/projects/boost/files/boost/$BOOST_VERSION/boost_$_boost_version_nodots.tar.bz2/download" )
 BOOST_BUILD_MODULES="--with-system --with-filesystem --with-thread --with-regex --with-locale --with-date_time --with-wave --with-iostreams --with-python --with-program_options"
 
+OCIO_USE_REPO=true
 OCIO_SOURCE=( "https://github.com/imageworks/OpenColorIO/tarball/v$OCIO_VERSION" )
+OCIO_SOURCE_REPO=( "https://github.com/imageworks/OpenColorIO.git" )
+OCIO_SOURCE_REPO_UID="6de971097c7f552300f669ed69ca0b6cf5a70843"
 
 OPENEXR_USE_REPO=false
 OPENEXR_SOURCE=( "http://download.savannah.nongnu.org/releases/openexr/openexr-$OPENEXR_VERSION.tar.gz" )
@@ -778,7 +780,7 @@ ALEMBIC_SOURCE=( "https://github.com/alembic/alembic/archive/${ALEMBIC_VERSION}.
 # ALEMBIC_SOURCE_REPO_BRANCH="master"
 
 OPENCOLLADA_SOURCE=( "https://github.com/KhronosGroup/OpenCOLLADA.git" )
-OPENCOLLADA_REPO_UID="3335ac164e68b2512a40914b14c74db260e6ff7d"
+OPENCOLLADA_REPO_UID="22b1f4ff026881b4d2804d397730286ab7e3d090"
 OPENCOLLADA_REPO_BRANCH="master"
 
 FFMPEG_SOURCE=( "http://ffmpeg.org/releases/ffmpeg-$FFMPEG_VERSION.tar.bz2" )
@@ -1260,7 +1262,7 @@ compile_OCIO() {
   fi
 
   # To be changed each time we make edits that would modify the compiled result!
-  ocio_magic=1
+  ocio_magic=2
   _init_ocio
 
   # Clean install if needed!
@@ -1277,14 +1279,27 @@ compile_OCIO() {
     if [ ! -d $_src ]; then
       INFO "Downloading OpenColorIO-$OCIO_VERSION"
       mkdir -p $SRC
-      download OCIO_SOURCE[@] $_src.tar.gz
 
-      INFO "Unpacking OpenColorIO-$OCIO_VERSION"
-      tar -C $SRC --transform "s,(.*/?)imageworks-OpenColorIO[^/]*(.*),\1OpenColorIO-$OCIO_VERSION\2,x" \
-          -xf $_src.tar.gz
+      if [ "$OCIO_USE_REPO" = true ]; then
+        git clone ${OCIO_SOURCE_REPO[0]} $_src
+      else
+        download OCIO_SOURCE[@] $_src.tar.gz
+        INFO "Unpacking OpenColorIO-$OCIO_VERSION"
+        tar -C $SRC --transform "s,(.*/?)imageworks-OpenColorIO[^/]*(.*),\1OpenColorIO-$OCIO_VERSION\2,x" \
+            -xf $_src.tar.gz
+      fi
+
     fi
 
     cd $_src
+
+    if [ "$OCIO_USE_REPO" = true ]; then
+      # XXX For now, always update from latest repo...
+      git pull origin master
+      git checkout $OCIO_SOURCE_REPO_UID
+      git reset --hard
+    fi
+
     # Always refresh the whole build!
     if [ -d build ]; then
       rm -rf build
@@ -1490,7 +1505,6 @@ compile_OPENEXR() {
     if [ "$OPENEXR_USE_REPO" = true ]; then
       # XXX For now, always update from latest repo...
       git pull origin master
-      # Stick to same rev as windows' libs...
       git checkout $OPENEXR_SOURCE_REPO_UID
       git reset --hard
       oiio_src_path="../OpenEXR"
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index e219ce3cb8e..fea55049049 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -176,6 +176,7 @@ public:
 
 	KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
 	KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)>               filter_get_feature_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                     filter_detect_outliers_kernel;
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                     filter_combine_halves_kernel;
 
 	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
@@ -210,6 +211,7 @@ public:
 	  REGISTER_KERNEL(shader),
 	  REGISTER_KERNEL(filter_divide_shadow),
 	  REGISTER_KERNEL(filter_get_feature),
+	  REGISTER_KERNEL(filter_detect_outliers),
 	  REGISTER_KERNEL(filter_combine_halves),
 	  REGISTER_KERNEL(filter_nlm_calc_difference),
 	  REGISTER_KERNEL(filter_nlm_blur),
@@ -594,6 +596,26 @@ public:
 		return true;
 	}
 
+	bool denoising_detect_outliers(device_ptr image_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr depth_ptr,
+	                               device_ptr output_ptr,
+	                               DenoisingTask *task)
+	{
+		for(int y = task->rect.y; y < task->rect.w; y++) {
+			for(int x = task->rect.x; x < task->rect.z; x++) {
+				filter_detect_outliers_kernel()(x, y,
+				                                (float*) image_ptr,
+				                                (float*) variance_ptr,
+				                                (float*) depth_ptr,
+				                                (float*) output_ptr,
+				                                &task->rect.x,
+				                                task->buffer.pass_stride);
+			}
+		}
+		return true;
+	}
+
 	void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
 	{
 		float *render_buffer = (float*)tile.buffer;
@@ -632,6 +654,7 @@ public:
 		denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
 		denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
 		denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
 		denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
 
 		denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 9a8537a6722..07fb964b0c6 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -949,7 +949,7 @@ public:
 		cuda_push_context();
 
 		int4 rect = task->rect;
-		int w = rect.z-rect.x;
+		int w = align_up(rect.z-rect.x, 4);
 		int h = rect.w-rect.y;
 		int r = task->nlm_state.r;
 		int f = task->nlm_state.f;
@@ -1248,6 +1248,38 @@ public:
 		return !have_error();
 	}
 
+	bool denoising_detect_outliers(device_ptr image_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr depth_ptr,
+	                               device_ptr output_ptr,
+	                               DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterDetectOutliers;
+		cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&image_ptr,
+		                &variance_ptr,
+		                &depth_ptr,
+		                &output_ptr,
+		                &task->rect,
+		                &task->buffer.pass_stride};
+
+		CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
 	void denoise(RenderTile &rtile, const DeviceTask &task)
 	{
 		DenoisingTask denoising(this);
@@ -1258,6 +1290,7 @@ public:
 		denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
 		denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
 		denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
 		denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
 
 		denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 39c8cf30105..613bd9112cf 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -139,9 +139,9 @@ bool DenoisingTask::run_denoising()
 		nlm_state.temporary_2_ptr = *nlm_temporary_2;
 		nlm_state.temporary_3_ptr = *nlm_temporary_3;
 
-		int mean_from[]     = { 0, 1, 2, 6,  7,  8, 12 };
-		int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
-		int pass_to[]       = { 1, 2, 3, 0,  5,  6,  7 };
+		int mean_from[]     = { 0, 1, 2, 12, 6,  7, 8 };
+		int variance_from[] = { 3, 4, 5, 13, 9, 10, 11};
+		int pass_to[]       = { 1, 2, 3, 0,  5,  6,  7};
 		for(int pass = 0; pass < 7; pass++) {
 			device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
 			/* Get the unfiltered pass and its variance from the RenderBuffers. */
@@ -159,11 +159,25 @@ bool DenoisingTask::run_denoising()
 		int mean_to[]       = { 8,  9, 10};
 		int variance_to[]   = {11, 12, 13};
 		int num_color_passes = 3;
+
+		device_only_memory<float> temp_color;
+		temp_color.resize(3*buffer.pass_stride);
+		device->mem_alloc("Denoising temporary color", temp_color, MEM_READ_WRITE);
+
 		for(int pass = 0; pass < num_color_passes; pass++) {
-			device_sub_ptr color_pass    (device, buffer.mem,     mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+			device_sub_ptr color_pass(device, temp_color, pass*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
 			device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
 			functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
 		}
+
+		{
+			device_sub_ptr depth_pass    (device, buffer.mem,                                 0,   buffer.pass_stride, MEM_READ_WRITE);
+			device_sub_ptr color_var_pass(device, buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+			device_sub_ptr output_pass   (device, buffer.mem,     mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+			functions.detect_outliers(temp_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+		}
+
+		device->mem_free(temp_color);
 	}
 
 	storage.w = filter_area.z;
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 86d8eb64386..25b93c2ad74 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -82,6 +82,11 @@ public:
 		              device_ptr mean_ptr,
 		              device_ptr variance_ptr
 		              )> get_feature;
+		function<bool(device_ptr image_ptr,
+		              device_ptr variance_ptr,
+		              device_ptr depth_ptr,
+		              device_ptr output_ptr
+		              )> detect_outliers;
 		function<bool(device_ptr*)> set_tiles;
 	} functions;
 
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index a458ca6bf64..27e196d1e68 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -411,6 +411,11 @@ protected:
 	                           device_ptr mean_ptr,
 	                           device_ptr variance_ptr,
 	                           DenoisingTask *task);
+	bool denoising_detect_outliers(device_ptr image_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr depth_ptr,
+	                               device_ptr output_ptr,
+	                               DenoisingTask *task);
 	bool denoising_set_tiles(device_ptr *buffers,
 	                         DenoisingTask *task);
 
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index ae1a7b917c3..8f33014a00e 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -216,6 +216,7 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
 	denoising_program.add_kernel(ustring("filter_divide_shadow"));
 	denoising_program.add_kernel(ustring("filter_get_feature"));
+	denoising_program.add_kernel(ustring("filter_detect_outliers"));
 	denoising_program.add_kernel(ustring("filter_combine_halves"));
 	denoising_program.add_kernel(ustring("filter_construct_transform"));
 	denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
@@ -910,6 +911,33 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
 	return true;
 }
 
+bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
+                                                 device_ptr variance_ptr,
+                                                 device_ptr depth_ptr,
+                                                 device_ptr output_ptr,
+                                                 DenoisingTask *task)
+{
+	cl_mem image_mem = CL_MEM_PTR(image_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+	cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
+	cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+	cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
+
+	kernel_set_args(ckFilterDetectOutliers, 0,
+	                image_mem,
+	                variance_mem,
+	                depth_mem,
+	                output_mem,
+	                task->rect,
+	                task->buffer.pass_stride);
+	enqueue_kernel(ckFilterDetectOutliers,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
 bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
                                            DenoisingTask *task)
 {
@@ -942,6 +970,7 @@ void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
 	denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
 	denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
 	denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+	denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
 
 	denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
 	denoising.render_buffer.samples = rtile.sample;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 5043f336858..bef869f34b4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -10,8 +10,23 @@ set(INC_SYS
 
 set(SRC
 	kernels/cpu/kernel.cpp
+	kernels/cpu/kernel_sse2.cpp
+	kernels/cpu/kernel_sse3.cpp
+	kernels/cpu/kernel_sse41.cpp
+	kernels/cpu/kernel_avx.cpp
+	kernels/cpu/kernel_avx2.cpp
 	kernels/cpu/kernel_split.cpp
+	kernels/cpu/kernel_split_sse2.cpp
+	kernels/cpu/kernel_split_sse3.cpp
+	kernels/cpu/kernel_split_sse41.cpp
+	kernels/cpu/kernel_split_avx.cpp
+	kernels/cpu/kernel_split_avx2.cpp
 	kernels/cpu/filter.cpp
+	kernels/cpu/filter_sse2.cpp
+	kernels/cpu/filter_sse3.cpp
+	kernels/cpu/filter_sse41.cpp
+	kernels/cpu/filter_avx.cpp
+	kernels/cpu/filter_avx2.cpp
 	kernels/opencl/kernel.cl
 	kernels/opencl/kernel_state_buffer_size.cl
 	kernels/opencl/kernel_split.cl
@@ -406,48 +421,28 @@ set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${C
 set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
 set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
 
-if(NOT WITH_CYCLES_NATIVE_ONLY)
-	list(APPEND SRC
-		kernels/cpu/kernel_sse2.cpp
-		kernels/cpu/kernel_sse3.cpp
-		kernels/cpu/kernel_sse41.cpp
-		kernels/cpu/kernel_avx.cpp
-		kernels/cpu/kernel_avx2.cpp
-		kernels/cpu/kernel_split_sse2.cpp
-		kernels/cpu/kernel_split_sse3.cpp
-		kernels/cpu/kernel_split_sse41.cpp
-		kernels/cpu/kernel_split_avx.cpp
-		kernels/cpu/kernel_split_avx2.cpp
-		kernels/cpu/filter_sse2.cpp
-		kernels/cpu/filter_sse3.cpp
-		kernels/cpu/filter_sse41.cpp
-		kernels/cpu/filter_avx.cpp
-		kernels/cpu/filter_avx2.cpp
-	)
+if(CXX_HAS_SSE)
+	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
 
-	if(CXX_HAS_SSE)
-		set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
-	endif()
+if(CXX_HAS_AVX)
+	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+endif()
 
-	if(CXX_HAS_AVX)
-		set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
-	endif()
-
-	if(CXX_HAS_AVX2)
-		set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
-		set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
-	endif()
+if(CXX_HAS_AVX2)
+	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 add_library(cycles_kernel
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 30644946840..b07b515c405 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
 	return normalize(make_float3(-slope_x, -slope_y, 1.0f));
 }
 
-/* === Phase functions: Glossy, Diffuse and Glass === */
+/* === Phase functions: Glossy and Glass === */
 
-/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+/* Phase function for reflective materials. */
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm)
 {
-	if(n && k)
-		*weight *= fresnel_conductor(dot(wi, wm), *n, *k);
-
 	return -wi + 2.0f * wm * dot(wi, wm);
 }
 
-ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l
 	else
 		phase *= D_ggx_aniso(wh, alpha);
 
-	if(n && k) {
-		/* Apply conductive fresnel term. */
-		return phase * fresnel_conductor(dotW_WH, *n, *k);
-	}
-
 	return make_float3(phase, phase, phase);
 }
 
-/* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
-{
-	float3 tm, bm;
-	make_orthonormals(wm, &tm, &bm);
-
-	float2 disk = concentric_sample_disk(randu, randv);
-	return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
-}
-
-ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
-{
-	const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
-	return make_float3(v, v, v);
-}
-
 /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
 ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
 {
@@ -282,11 +258,6 @@ ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo,
 	return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
 }
 
-ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
-{
-	return M_1_PI_F * wo.z;
-}
-
 ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
 	float3 wh;
@@ -315,13 +286,6 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons
 #define MF_MULTI_GLASS
 #include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
-/* The diffuse phase function is not implemented as a node yet. */
-#if 0
-#define MF_PHASE_FUNCTION diffuse
-#define MF_MULTI_DIFFUSE
-#include "kernel/closure/bsdf_microfacet_multi_impl.h"
-#endif
-
 #define MF_PHASE_FUNCTION glossy
 #define MF_MULTI_GLOSSY
 #include "kernel/closure/bsdf_microfacet_multi_impl.h"
@@ -428,7 +392,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
 		*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
-	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
+	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 }
 
 ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -442,6 +406,10 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 		*omega_in = 2*dot(Z, I)*Z - I;
 		*pdf = 1e6f;
 		*eval = make_float3(1e6f, 1e6f, 1e6f);
+#ifdef __RAY_DIFFERENTIALS__
+		*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
+		*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
+#endif
 		return LABEL_REFLECT|LABEL_SINGULAR;
 	}
 
@@ -456,7 +424,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;
 
-	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
+	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 	if(is_aniso)
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 16d900088cb..2eb2457c9e5 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -26,24 +26,16 @@
  * the balance heuristic isn't necessarily optimal anymore.
  */
 ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
-        float3 wi,
-        float3 wo,
-        const bool wo_outside,
-        const float3 color,
-        const float alpha_x,
-        const float alpha_y,
-         ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
-        , const float eta
-        , bool use_fresnel
-        , const float3 cspec0
-#elif defined(MF_MULTI_GLOSSY)
-         , float3 *n, float3 *k
-         , const float eta
-         , bool use_fresnel
-         , const float3 cspec0
-#endif
-)
+	float3 wi,
+	float3 wo,
+	const bool wo_outside,
+	const float3 color,
+	const float alpha_x,
+	const float alpha_y,
+	ccl_addr_space uint *lcg_state,
+	const float eta,
+	bool use_fresnel,
+	const float3 cspec0)
 {
 	/* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */
 	bool swapped = false;
@@ -77,44 +69,29 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 	/* Analytically compute single scattering for lower noise. */
 	float3 eval;
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	const float3 wh = normalize(wi+wo);
 #ifdef MF_MULTI_GLASS
 	eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta);
 	if(wo_outside)
 		eval *= -lambda_r / (shadowing_lambda - lambda_r);
 	else
 		eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f);
-
-	float F0 = fresnel_dielectric_cos(1.0f, eta);
-	if(use_fresnel) {
-		throughput = interpolate_fresnel_color(wi, normalize(wi + wo), eta, F0, cspec0);
-
-		eval *= throughput;
-	}
-#elif defined(MF_MULTI_DIFFUSE)
-	/* Diffuse has no special closed form for the single scattering bounce */
-	eval = make_float3(0.0f, 0.0f, 0.0f);
 #else /* MF_MULTI_GLOSSY */
-	const float3 wh = normalize(wi+wo);
 	const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda);
 	float val = G2 * 0.25f / wi.z;
 	if(alpha.x == alpha.y)
 		val *= D_ggx(wh, alpha.x);
 	else
 		val *= D_ggx_aniso(wh, alpha);
-	if(n && k) {
-		eval = fresnel_conductor(dot(wh, wi), *n, *k) * val;
-	}
-	else {
-		eval = make_float3(val, val, val);
-	}
+	eval = make_float3(val, val, val);
+#endif
 
 	float F0 = fresnel_dielectric_cos(1.0f, eta);
 	if(use_fresnel) {
 		throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
 
-		eval = throughput * val;
+		eval *= throughput;
 	}
-#endif
 
 	float3 wr = -wi;
 	float hr = 1.0f;
@@ -129,13 +106,6 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 		float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
 		                                                   lcg_step_float_addrspace(lcg_state)));
 
-#ifdef MF_MULTI_DIFFUSE
-		if(order == 0) {
-			/* Compute single-scattering for diffuse. */
-			const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r);
-			eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm);
-		}
-#endif
 #ifdef MF_MULTI_GLASS
 		if(order == 0 && use_fresnel) {
 			/* Evaluate amount of scattering towards wo on this microfacet. */
@@ -156,10 +126,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 				phase = mf_eval_phase_glass(wr, lambda_r,  wo,  wo_outside, alpha, eta);
 			else
 				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);
-#elif defined(MF_MULTI_DIFFUSE)
-			phase = mf_eval_phase_diffuse(wo, wm);
 #else /* MF_MULTI_GLOSSY */
-			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput;
+			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput;
 #endif
 			eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda);
 		}
@@ -181,25 +149,17 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			else if(use_fresnel && order > 0) {
 				throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
 			}
-#elif defined(MF_MULTI_DIFFUSE)
-			wr = mf_sample_phase_diffuse(wm,
-			                             lcg_step_float_addrspace(lcg_state),
-			                             lcg_step_float_addrspace(lcg_state));
 #else /* MF_MULTI_GLOSSY */
 			if(use_fresnel && order > 0) {
 				throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
 			}
-			wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+			wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
 
 			lambda_r = mf_lambda(wr, alpha);
 
-#if defined(MF_MULTI_GLOSSY) || defined(MF_MULTI_GLASS)
 			if(!use_fresnel)
 				throughput *= color;
-#else
-			throughput *= color;
-#endif
 
 			C1_r = mf_C1(hr);
 			G1_r = mf_G1(wr, C1_r, lambda_r);
@@ -215,18 +175,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
  * escaped the surface in wo. The function returns the throughput between wi and wo.
  * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
  */
-ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
-	, const float eta
-	, bool use_fresnel
-	, const float3 cspec0
-#elif defined(MF_MULTI_GLOSSY)
-	, float3 *n, float3 *k
-	, const float eta
-	, bool use_fresnel
-	, const float3 cspec0
-#endif
-)
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
+	float3 wi,
+	float3 *wo,
+	const float3 color,
+	const float alpha_x,
+	const float alpha_y,
+	ccl_addr_space uint *lcg_state,
+	const float eta,
+	bool use_fresnel,
+	const float3 cspec0)
 {
 	const float2 alpha = make_float2(alpha_x, alpha_y);
 
@@ -237,17 +195,11 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 	float C1_r = 1.0f;
 	float G1_r = 0.0f;
 	bool outside = true;
-#ifdef MF_MULTI_GLASS
+
 	float F0 = fresnel_dielectric_cos(1.0f, eta);
 	if(use_fresnel) {
 		throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0);
 	}
-#elif defined(MF_MULTI_GLOSSY)
-	float F0 = fresnel_dielectric_cos(1.0f, eta);
-	if(use_fresnel) {
-		throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0);
-	}
-#endif
 
 	int order;
 	for(order = 0; order < 10; order++) {
@@ -262,13 +214,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 		                                                   lcg_step_float_addrspace(lcg_state)));
 
 		/* First-bounce color is already accounted for in mix weight. */
-#if defined(MF_MULTI_GLASS) || defined(MF_MULTI_GLOSSY)
 		if(!use_fresnel && order > 0)
 			throughput *= color;
-#else
-		if(order > 0)
-			throughput *= color;
-#endif
 
 		/* Bounce from the microfacet. */
 #ifdef MF_MULTI_GLASS
@@ -294,10 +241,6 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 					throughput *= t_color;
 			}
 		}
-#elif defined(MF_MULTI_DIFFUSE)
-		wr = mf_sample_phase_diffuse(wm,
-		                             lcg_step_float_addrspace(lcg_state),
-		                             lcg_step_float_addrspace(lcg_state));
 #else /* MF_MULTI_GLOSSY */
 		if(use_fresnel) {
 			float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
@@ -307,7 +250,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 			else
 				throughput *= t_color;
 		}
-		wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+		wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
 
 		/* Update random walk parameters. */
@@ -319,6 +262,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 }
 
 #undef MF_MULTI_GLASS
-#undef MF_MULTI_DIFFUSE
 #undef MF_MULTI_GLOSSY
 #undef MF_PHASE_FUNCTION
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index f5a40d49997..41998c792b6 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -16,7 +16,7 @@
 
  CCL_NAMESPACE_BEGIN
 
-#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
+#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
  * pixel_buffer always points to the current pixel in the first pass. */
@@ -32,7 +32,7 @@ ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_rest
 {
 	features[0] = pixel.x;
 	features[1] = pixel.y;
-	features[2] = ccl_get_feature(buffer, 0);
+	features[2] = fabsf(ccl_get_feature(buffer, 0));
 	features[3] = ccl_get_feature(buffer, 1);
 	features[4] = ccl_get_feature(buffer, 2);
 	features[5] = ccl_get_feature(buffer, 3);
@@ -50,7 +50,7 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float cc
 {
 	scales[0] = fabsf(pixel.x - mean[0]);
 	scales[1] = fabsf(pixel.y - mean[1]);
-	scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
+	scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
 	scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
 	                                    ccl_get_feature(buffer, 2) - mean[4],
 	                                    ccl_get_feature(buffer, 3) - mean[5]));
@@ -107,7 +107,7 @@ ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
 	math_vector_zero(design_row+1, rank);
 	design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
 	design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
-	design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
+	design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
 	design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
 	design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
 	design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 303c8f482e3..a242a8ed0a1 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -37,7 +37,7 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active
 {
 	features[0] = x;
 	features[1] = y;
-	features[2] = ccl_get_feature_sse(0);
+	features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
 	features[3] = ccl_get_feature_sse(1);
 	features[4] = ccl_get_feature_sse(2);
 	features[5] = ccl_get_feature_sse(3);
@@ -58,7 +58,7 @@ ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128
 	scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
 	scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
 
-	scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
+	scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
 
 	__m128 diff, scale;
 	diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 54bcf888052..252bcc5e675 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -104,6 +104,57 @@ ccl_device void kernel_filter_get_feature(int sample,
 	}
 }
 
+ccl_device void kernel_filter_detect_outliers(int x, int y,
+                                              ccl_global float *image,
+                                              ccl_global float *variance,
+                                              ccl_global float *depth,
+                                              ccl_global float *out,
+                                              int4 rect,
+                                              int pass_stride)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+
+	int n = 0;
+	float values[25];
+	for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
+		for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+			int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
+			float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+			/* Find the position of L. */
+			int i;
+			for(i = 0; i < n; i++) {
+				if(values[i] > L) break;
+			}
+			/* Make space for L by shifting all following values to the right. */
+			for(int j = n; j > i; j--) {
+				values[j] = values[j-1];
+			}
+			/* Insert L. */
+			values[i] = L;
+			n++;
+		}
+	}
+
+	int idx = (y-rect.y)*buffer_w + (x-rect.x);
+	float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+	float ref = 2.0f*values[(int)(n*0.75f)];
+	float fac = 1.0f;
+	if(L > ref) {
+		/* If the pixel is an outlier, negate the depth value to mark it as one.
+		 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+		depth[idx] = -depth[idx];
+		fac = ref/L;
+		variance[idx              ] *= fac*fac;
+		variance[idx + pass_stride] *= fac*fac;
+		variance[idx+2*pass_stride] *= fac*fac;
+	}
+	out[idx              ] = fac*image[idx];
+	out[idx + pass_stride] = fac*image[idx + pass_stride];
+	out[idx+2*pass_stride] = fac*image[idx+2*pass_stride];
+}
+
 /* Combine A/B buffers.
  * Calculates the combined mean and the buffer variance. */
 ccl_device void kernel_filter_combine_halves(int x, int y,
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 02f3802fa0c..6a7c86e4012 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -54,7 +54,10 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
 	float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
 	float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
 
-	if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
+	/* If the pixel was flagged as an outlier during prefiltering, skip it.
+	 * Otherwise, perform the regular confidence interval test. */
+	if(ccl_get_feature(buffer + q_offset, 0) < 0.0f ||
+	   average(fabs(p_color - q_color)) > 2.0f*(p_std_dev + q_std_dev + 1e-3f)) {
 		return;
 	}
 
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index c694e60ce14..c5206940493 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -356,7 +356,16 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *
 #  endif
 			if(kernel_data.film.pass_denoising_clean) {
 				float3 noisy, clean;
-				path_radiance_split_denoising(kg, L, &noisy, &clean);
+#ifdef __SHADOW_TRICKS__
+				if(is_shadow_catcher) {
+					noisy = L_sum;
+					clean = make_float3(0.0f, 0.0f, 0.0f);
+				}
+				else
+#endif  /* __SHADOW_TRICKS__ */
+				{
+					path_radiance_split_denoising(kg, L, &noisy, &clean);
+				}
 				kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
 				                                  sample, noisy);
 				kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 0d31ae32aa6..fc093ad8319 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -407,7 +407,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		}
 #endif  /* __SUBSURFACE__ */
 
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
+#if defined(__EMISSION__)
 		if(kernel_data.integrator.use_direct_light) {
 			int all = (kernel_data.integrator.sample_all_lights_indirect) ||
 			          (state->flag & PATH_RAY_SHADOW_CATCHER);
@@ -421,7 +421,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                                           L,
 			                                           all);
 		}
-#endif  /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */
+#endif  /* defined(__EMISSION__) */
 
 		if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
 			break;
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index e676ea0f3ae..dcb577e176f 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__)
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__)
 /* branched path tracing: connect path directly to position on one or more lights and add it to L */
 ccl_device_noinline void kernel_branched_path_surface_connect_light(
         KernelGlobals *kg,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index 10007ee2635..9708b4b5b58 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -43,6 +43,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                                    int buffer_denoising_offset,
                                                    bool use_split_variance);
 
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+                                                       ccl_global float *image,
+                                                       ccl_global float *variance,
+                                                       ccl_global float *depth,
+                                                       ccl_global float *output,
+                                                       int *rect,
+                                                       int pass_stride);
+
 void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
                                                       float *mean,
                                                       float *variance,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 3b71e50ca3b..15325abdccd 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -91,6 +91,21 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
 #endif
 }
 
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+                                                       ccl_global float *image,
+                                                       ccl_global float *variance,
+                                                       ccl_global float *depth,
+                                                       ccl_global float *output,
+                                                       int *rect,
+                                                       int pass_stride)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
+#else
+	kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride);
+#endif
+}
+
 void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
                                                       float *mean,
                                                       float *variance,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index b9d82781840..9b85a864153 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -22,45 +22,49 @@
 
 #include "kernel/kernel_compat_cpu.h"
 
-#ifndef __SPLIT_KERNEL__
-#  include "kernel/kernel_math.h"
-#  include "kernel/kernel_types.h"
+#ifndef KERNEL_STUB
+#  ifndef __SPLIT_KERNEL__
+#    include "kernel/kernel_math.h"
+#    include "kernel/kernel_types.h"
 
-#  include "kernel/split/kernel_split_data.h"
-#  include "kernel/kernel_globals.h"
+#    include "kernel/split/kernel_split_data.h"
+#    include "kernel/kernel_globals.h"
 
-#  include "kernel/kernels/cpu/kernel_cpu_image.h"
-#  include "kernel/kernel_film.h"
-#  include "kernel/kernel_path.h"
-#  include "kernel/kernel_path_branched.h"
-#  include "kernel/kernel_bake.h"
+#    include "kernel/kernels/cpu/kernel_cpu_image.h"
+#    include "kernel/kernel_film.h"
+#    include "kernel/kernel_path.h"
+#    include "kernel/kernel_path_branched.h"
+#    include "kernel/kernel_bake.h"
+#  else
+#    include "kernel/split/kernel_split_common.h"
+
+#    include "kernel/split/kernel_data_init.h"
+#    include "kernel/split/kernel_path_init.h"
+#    include "kernel/split/kernel_scene_intersect.h"
+#    include "kernel/split/kernel_lamp_emission.h"
+#    include "kernel/split/kernel_do_volume.h"
+#    include "kernel/split/kernel_queue_enqueue.h"
+#    include "kernel/split/kernel_indirect_background.h"
+#    include "kernel/split/kernel_shader_setup.h"
+#    include "kernel/split/kernel_shader_sort.h"
+#    include "kernel/split/kernel_shader_eval.h"
+#    include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#    include "kernel/split/kernel_subsurface_scatter.h"
+#    include "kernel/split/kernel_direct_lighting.h"
+#    include "kernel/split/kernel_shadow_blocked_ao.h"
+#    include "kernel/split/kernel_shadow_blocked_dl.h"
+#    include "kernel/split/kernel_next_iteration_setup.h"
+#    include "kernel/split/kernel_indirect_subsurface.h"
+#    include "kernel/split/kernel_buffer_update.h"
+#  endif  /* __SPLIT_KERNEL__ */
 #else
-#  include "kernel/split/kernel_split_common.h"
-
-#  include "kernel/split/kernel_data_init.h"
-#  include "kernel/split/kernel_path_init.h"
-#  include "kernel/split/kernel_scene_intersect.h"
-#  include "kernel/split/kernel_lamp_emission.h"
-#  include "kernel/split/kernel_do_volume.h"
-#  include "kernel/split/kernel_queue_enqueue.h"
-#  include "kernel/split/kernel_indirect_background.h"
-#  include "kernel/split/kernel_shader_setup.h"
-#  include "kernel/split/kernel_shader_sort.h"
-#  include "kernel/split/kernel_shader_eval.h"
-#  include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-#  include "kernel/split/kernel_subsurface_scatter.h"
-#  include "kernel/split/kernel_direct_lighting.h"
-#  include "kernel/split/kernel_shadow_blocked_ao.h"
-#  include "kernel/split/kernel_shadow_blocked_dl.h"
-#  include "kernel/split/kernel_next_iteration_setup.h"
-#  include "kernel/split/kernel_indirect_subsurface.h"
-#  include "kernel/split/kernel_buffer_update.h"
-#endif
-
-#ifdef KERNEL_STUB
 #  include "util/util_debug.h"
 #  define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
-#endif
+
+#  ifdef __SPLIT_KERNEL__
+#    include "kernel/split/kernel_data_init.h"
+#  endif  /* __SPLIT_KERNEL__ */
+#endif  /* KERNEL_STUB */
 
 CCL_NAMESPACE_BEGIN
 
@@ -191,20 +195,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 	{ \
 		STUB_ASSERT(KERNEL_ARCH, name); \
 	}
+
+#  define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		STUB_ASSERT(KERNEL_ARCH, name); \
+	}
 #else
 #  define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
 	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
 	{ \
 		kernel_##name(kg); \
 	}
-#endif /* KERNEL_STUB */
 
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+#  define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
 	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
 	{ \
 		ccl_local type locals; \
 		kernel_##name(kg, &locals); \
 	}
+#endif /* KERNEL_STUB */
 
 DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
 DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 50f73f9728d..f812a6601c6 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -86,6 +86,22 @@ kernel_cuda_filter_get_feature(int sample,
 	}
 }
 
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_detect_outliers(float *image,
+                                   float *variance,
+                                   float *depth,
+                                   float *output,
+                                   int4 prefilter_rect,
+                                   int pass_stride)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+	}
+}
+
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 3d82bff9892..fbc3daa62b9 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -78,6 +78,20 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
 	}
 }
 
+__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
+                                                ccl_global float *variance,
+                                                ccl_global float *depth,
+                                                ccl_global float *output,
+                                                int4 prefilter_rect,
+                                                int pass_stride)
+{
+	int x = prefilter_rect.x + get_global_id(0);
+	int y = prefilter_rect.y + get_global_id(1);
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+	}
+}
+
 __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
                                                ccl_global float *variance,
                                                ccl_global float *a,
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 39c6c7e5fd9..57f40789d49 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -34,17 +34,17 @@ shader node_principled_bsdf(
 	float Clearcoat = 0.0,
 	float ClearcoatGloss = 1.0,
 	float IOR = 1.45,
-	float Transparency = 0.0,
-	float RefractionRoughness = 0.0,
+	float Transmission = 0.0,
+	float TransmissionRoughness = 0.0,
 	normal Normal = N,
 	normal ClearcoatNormal = N,
 	normal Tangent = normalize(dPdu),
 	output closure color BSDF = 0)
 {
 	float f = max(IOR, 1e-5);
-	float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transparency, 0.0, 1.0));
-	float transp = clamp(Transparency, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0));
-	float specular_weight = (1.0 - transp);
+	float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0));
+	float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0));
+	float specular_weight = (1.0 - final_transmission);
 
 	vector T = Tangent;
 
@@ -90,7 +90,7 @@ shader node_principled_bsdf(
 		}
 	}
 
-	if (transp > 1e-5) {
+	if (final_transmission > 1e-5) {
 		color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint);
 		float eta = backfacing() ? 1.0 / f : f;
 
@@ -98,18 +98,18 @@ shader node_principled_bsdf(
 			float cosNO = dot(Normal, I);
 			float Fr = fresnel_dielectric_cos(cosNO, eta);
 
-		float refl_roughness = Roughness;
-		if (Roughness <= 1e-2)
-			refl_roughness = 0.0;
+			float refl_roughness = Roughness;
+			if (Roughness <= 1e-2)
+				refl_roughness = 0.0;
 
-			float refraction_roughness = refl_roughness;
+			float transmission_roughness = refl_roughness;
 			if (distribution == "GGX")
-				refraction_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - RefractionRoughness);
+				transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness);
 
-			BSDF = BSDF + transp * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) +
-			       (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, refraction_roughness * refraction_roughness, eta));
+			BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) +
+			       (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta));
 		} else {
-			BSDF = BSDF + transp * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0);
+			BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0);
 		}
 	}
 
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 642ccac8239..e4545d66eff 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -67,6 +67,10 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
         unsigned int num_samples,
         ccl_global float *buffer)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, data_init);
+#else
+
 #ifdef __KERNEL_OPENCL__
 	kg->data = data;
 #endif
@@ -143,6 +147,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 			*(rng_state + index) = hash_int_2d(x, y);
 		}
 	}
+
+#endif  /* KERENL_STUB */
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 7918c640175..0e1aedf2918 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -79,14 +79,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #ifdef __PRINCIPLED__
 		case CLOSURE_BSDF_PRINCIPLED_ID: {
 			uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset,
-				sheen_tint_offset, clearcoat_offset, clearcoat_gloss_offset, eta_offset, transparency_offset,
-				anisotropic_rotation_offset, refraction_roughness_offset;
+				sheen_tint_offset, clearcoat_offset, clearcoat_gloss_offset, eta_offset, transmission_offset,
+				anisotropic_rotation_offset, transmission_roughness_offset;
 			uint4 data_node2 = read_node(kg, offset);
 
 			float3 T = stack_load_float3(stack, data_node.y);
 			decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset);
 			decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_gloss_offset);
-			decode_node_uchar4(data_node2.x, &eta_offset, &transparency_offset, &anisotropic_rotation_offset, &refraction_roughness_offset);
+			decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset);
 
 			// get Disney principled parameters
 			float metallic = param1;
@@ -99,9 +99,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float sheen_tint = stack_load_float(stack, sheen_tint_offset);
 			float clearcoat = stack_load_float(stack, clearcoat_offset);
 			float clearcoat_gloss = stack_load_float(stack, clearcoat_gloss_offset);
-			float transparency = stack_load_float(stack, transparency_offset);
+			float transmission = stack_load_float(stack, transmission_offset);
 			float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset);
-			float refraction_roughness = stack_load_float(stack, refraction_roughness_offset);
+			float transmission_roughness = stack_load_float(stack, transmission_roughness_offset);
 			float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f);
 
 			ClosureType distribution = stack_valid(data_node2.y) ? (ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
@@ -118,10 +118,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float fresnel = fresnel_dielectric_cos(cosNO, ior);
 
 			// calculate weights of the diffuse and specular part
-			float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transparency));
+			float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission));
 			
-			float transp = saturate(transparency) * (1.0f - saturate(metallic));
-			float specular_weight = (1.0f - transp);
+			float final_transmission = saturate(transmission) * (1.0f - saturate(metallic));
+			float specular_weight = (1.0f - final_transmission);
 
 			// get the base color
 			uint4 data_base_color = read_node(kg, offset);
@@ -300,8 +300,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #ifdef __CAUSTICS_TRICKS__
 			if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) {
 #endif
-				if(transp > CLOSURE_WEIGHT_CUTOFF) {
-					float3 glass_weight = weight * transp;
+				if(final_transmission > CLOSURE_WEIGHT_CUTOFF) {
+					float3 glass_weight = weight * final_transmission;
 					float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint);
 
 					if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */
@@ -342,12 +342,12 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 								bsdf->N = N;
 
 								if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID)
-									refraction_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - refraction_roughness);
+									transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness);
 								else
-									refraction_roughness = refl_roughness;
+									transmission_roughness = refl_roughness;
 
-								bsdf->alpha_x = refraction_roughness * refraction_roughness;
-								bsdf->alpha_y = refraction_roughness * refraction_roughness;
+								bsdf->alpha_x = transmission_roughness * transmission_roughness;
+								bsdf->alpha_y = transmission_roughness * transmission_roughness;
 								bsdf->ior = ior;
 
 								/* setup bsdf */
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index b66d694c82a..f4482e0bb25 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -30,6 +30,16 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Some helpers to silence warning in templated function. */
+static bool isfinite(uchar /*value*/)
+{
+	return false;
+}
+static bool isfinite(half /*value*/)
+{
+	return false;
+}
+
 ImageManager::ImageManager(const DeviceInfo& info)
 {
 	need_update = true;
@@ -639,6 +649,37 @@ bool ImageManager::file_load_image(Image *img,
 			}
 		}
 	}
+	/* Make sure we don't have buggy values. */
+	if(FileFormat == TypeDesc::FLOAT) {
+		/* For RGBA buffers we put all channels to 0 if either of them is not
+		 * finite. This way we avoid possible artifacts caused by fully changed
+		 * hue.
+		 */
+		if(is_rgba) {
+			for(size_t i = 0; i < num_pixels; i += 4) {
+				StorageType *pixel = &pixels[i*4];
+				if(!isfinite(pixel[0]) ||
+				   !isfinite(pixel[1]) ||
+				   !isfinite(pixel[2]) ||
+				   !isfinite(pixel[3]))
+				{
+					pixel[0] = 0;
+					pixel[1] = 0;
+					pixel[2] = 0;
+					pixel[3] = 0;
+				}
+			}
+		}
+		else {
+			for(size_t i = 0; i < num_pixels; ++i) {
+				StorageType *pixel = &pixels[i];
+				if(!isfinite(pixel[0])) {
+					pixel[0] = 0;
+				}
+			}
+		}
+	}
+	/* Scale image down if needed. */
 	if(pixels_storage.size() > 0) {
 		float scale_factor = 1.0f;
 		while(max_size * scale_factor > texture_limit) {
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 4886dcd563f..625dd3ded39 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -486,18 +486,10 @@ static void background_cdf(int start,
                            float2 *cond_cdf)
 {
 	/* Conditional CDFs (rows, U direction). */
-	/* NOTE: It is possible to have some NaN pixels on background
-	 * which will ruin CDF causing wrong shading. We replace such
-	 * pixels with black.
-	 */
 	for(int i = start; i < end; i++) {
 		float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
 		float3 env_color = (*pixels)[i * res];
 		float ave_luminance = average(env_color);
-		/* TODO(sergey): Consider adding average_safe(). */
-		if(!isfinite(ave_luminance)) {
-			ave_luminance = 0.0f;
-		}
 
 		cond_cdf[i * cdf_count].x = ave_luminance * sin_theta;
 		cond_cdf[i * cdf_count].y = 0.0f;
@@ -505,9 +497,6 @@ static void background_cdf(int start,
 		for(int j = 1; j < res; j++) {
 			env_color = (*pixels)[i * res + j];
 			ave_luminance = average(env_color);
-			if(!isfinite(ave_luminance)) {
-				ave_luminance = 0.0f;
-			}
 
 			cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta;
 			cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 9b565c3ede1..57b475e5cd0 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -2316,8 +2316,8 @@ NODE_DEFINE(PrincipledBsdfNode)
 	SOCKET_IN_FLOAT(clearcoat, "Clearcoat", 0.0f);
 	SOCKET_IN_FLOAT(clearcoat_gloss, "Clearcoat Gloss", 0.0f);
 	SOCKET_IN_FLOAT(ior, "IOR", 0.0f);
-	SOCKET_IN_FLOAT(transparency, "Transparency", 0.0f);
-	SOCKET_IN_FLOAT(refraction_roughness, "Refraction Roughness", 0.0f);
+	SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f);
+	SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f);
 	SOCKET_IN_FLOAT(anisotropic_rotation, "Anisotropic Rotation", 0.0f);
 	SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
 	SOCKET_IN_NORMAL(clearcoat_normal, "Clearcoat Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
@@ -2352,7 +2352,7 @@ void PrincipledBsdfNode::attributes(Shader *shader, AttributeRequestSet *attribu
 void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius,
 	ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, ShaderInput *p_anisotropic,
 	ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_gloss,
-	ShaderInput *p_ior, ShaderInput *p_transparency, ShaderInput *p_anisotropic_rotation, ShaderInput *p_refraction_roughness)
+	ShaderInput *p_ior, ShaderInput *p_transmission, ShaderInput *p_anisotropic_rotation, ShaderInput *p_transmission_roughness)
 {
 	ShaderInput *base_color_in = input("Base Color");
 	ShaderInput *subsurface_color_in = input("Subsurface Color");
@@ -2376,8 +2376,8 @@ void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic,
 	int clearcoat_offset = compiler.stack_assign(p_clearcoat);
 	int clearcoat_gloss_offset = compiler.stack_assign(p_clearcoat_gloss);
 	int ior_offset = compiler.stack_assign(p_ior);
-	int transparency_offset = compiler.stack_assign(p_transparency);
-	int refraction_roughness_offset = compiler.stack_assign(p_refraction_roughness);
+	int transmission_offset = compiler.stack_assign(p_transmission);
+	int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness);
 	int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation);
 	int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius);
 
@@ -2393,7 +2393,7 @@ void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic,
 		compiler.encode_uchar4(specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset),
 		compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_gloss_offset));
 
-	compiler.add_node(compiler.encode_uchar4(ior_offset, transparency_offset, anisotropic_rotation_offset, refraction_roughness_offset),
+	compiler.add_node(compiler.encode_uchar4(ior_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset),
 		distribution, SVM_STACK_INVALID, SVM_STACK_INVALID);
 
 	float3 bc_default = get_float3(base_color_in->socket_type);
@@ -2419,8 +2419,8 @@ void PrincipledBsdfNode::compile(SVMCompiler& compiler)
 {
 	compile(compiler, input("Metallic"), input("Subsurface"), input("Subsurface Radius"), input("Specular"),
 		input("Roughness"), input("Specular Tint"), input("Anisotropic"), input("Sheen"), input("Sheen Tint"),
-		input("Clearcoat"), input("Clearcoat Gloss"), input("IOR"), input("Transparency"),
-		input("Anisotropic Rotation"), input("Refraction Roughness"));
+		input("Clearcoat"), input("Clearcoat Gloss"), input("IOR"), input("Transmission"),
+		input("Anisotropic Rotation"), input("Transmission Roughness"));
 }
 
 void PrincipledBsdfNode::compile(OSLCompiler& compiler)
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index a56313bd5d4..aac6ce2f375 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -378,13 +378,13 @@ public:
 	void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius,
 		ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, ShaderInput *anisotropic,
 		ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_gloss,
-		ShaderInput *ior, ShaderInput *transparency, ShaderInput *anisotropic_rotation, ShaderInput *refraction_roughness);
+		ShaderInput *ior, ShaderInput *transmission, ShaderInput *anisotropic_rotation, ShaderInput *transmission_roughness);
 
 	float3 base_color;
 	float3 subsurface_color, subsurface_radius;
 	float metallic, subsurface, specular, roughness, specular_tint, anisotropic,
-		sheen, sheen_tint, clearcoat, clearcoat_gloss, ior, transparency,
-		anisotropic_rotation, refraction_roughness;
+		sheen, sheen_tint, clearcoat, clearcoat_gloss, ior, transmission,
+		anisotropic_rotation, transmission_roughness;
 	float3 normal, clearcoat_normal, tangent;
 	float surface_mix_weight;
 	ClosureType distribution, distribution_orig;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 3eaf34c847f..3f080407b1f 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -479,7 +479,7 @@ void Session::release_tile(RenderTile& rtile)
 {
 	thread_scoped_lock tile_lock(tile_mutex);
 
-	progress.add_finished_tile();
+	progress.add_finished_tile(rtile.task == RenderTile::DENOISE);
 
 	bool delete_tile;
 
@@ -912,7 +912,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	int progressive_sample = tile_manager.state.sample;
 	int num_samples = tile_manager.get_num_effective_samples();
 
-	int tile = progress.get_finished_tiles();
+	int tile = progress.get_rendered_tiles();
 	int num_tiles = tile_manager.state.num_tiles;
 
 	/* update status */
@@ -920,11 +920,12 @@ void Session::update_status_time(bool show_pause, bool show_done)
 
 	if(!params.progressive) {
 		const bool is_cpu = params.device.type == DEVICE_CPU;
+		const bool rendering_finished = (tile == num_tiles);
 		const bool is_last_tile = (tile + 1) == num_tiles;
 
 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
 
-		if(device->show_samples() || (is_cpu && is_last_tile)) {
+		if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) {
 			/* Some devices automatically support showing the sample number:
 			 * - CUDADevice
 			 * - OpenCLDevice when using the megakernel (the split kernel renders multiple
@@ -936,6 +937,9 @@ void Session::update_status_time(bool show_pause, bool show_done)
 			 */
 			substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
 		}
+		if(params.use_denoising) {
+			substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
+		}
 	}
 	else if(tile_manager.num_samples == INT_MAX)
 		substatus = string_printf("Path Tracing Sample %d", progressive_sample+1);
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index ef89288f167..a64f227a38c 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -241,6 +241,10 @@ void Shader::set_graph(ShaderGraph *graph_)
 	delete graph_bump;
 	graph = graph_;
 	graph_bump = NULL;
+
+	/* Store info here before graph optimization to make sure that
+	 * nodes that get optimized away still count. */
+	has_volume_connected = (graph->output()->input("Volume")->link != NULL);
 }
 
 void Shader::tag_update(Scene *scene)
@@ -432,15 +436,14 @@ void ShaderManager::device_update_common(Device *device,
 			flag |= SD_HAS_VOLUME;
 			has_volumes = true;
 
-			/* in this case we can assume transparent surface */
-			if(!shader->has_surface)
-				flag |= SD_HAS_ONLY_VOLUME;
-
 			/* todo: this could check more fine grained, to skip useless volumes
 			 * enclosed inside an opaque bsdf.
 			 */
 			flag |= SD_HAS_TRANSPARENT_SHADOW;
 		}
+		/* in this case we can assume transparent surface */
+		if(shader->has_volume_connected && !shader->has_surface)
+			flag |= SD_HAS_ONLY_VOLUME;
 		if(shader->heterogeneous_volume && shader->has_volume_spatial_varying)
 			flag |= SD_HETEROGENEOUS_VOLUME;
 		if(shader->has_bssrdf_bump)
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index a8018231f1a..b6714b13247 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -105,6 +105,15 @@ public:
 	bool need_update;
 	bool need_update_attributes;
 
+	/* If the shader has only volume components, the surface is assumed to
+	 * be transparent.
+	 * However, graph optimization might remove the volume subgraph, but
+	 * since the user connected something to the volume output the surface
+	 * should still be transparent.
+	 * Therefore, has_volume_connected stores whether some volume subtree
+	 * was connected before optimization. */
+	bool has_volume_connected;
+
 	/* information about shader after compiling */
 	bool has_surface;
 	bool has_surface_emission;
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 08624cfc739..f9c3b4bb139 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -786,12 +786,10 @@ static string line_directive(const string& base, const string& path, int line)
 	return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
 }
 
-
 static string path_source_replace_includes_recursive(
         const string& base,
         const string& source,
-        const string& path,
-        const string& source_filename)
+        const string& source_filepath)
 {
 	/* Our own little c preprocessor that replaces #includes with the file
 	 * contents, to work around issue of OpenCL drivers not supporting
@@ -809,23 +807,22 @@ static string path_source_replace_includes_recursive(
 			if(string_startswith(token, "include")) {
 				token = string_strip(token.substr(7, token.size() - 7));
 				if(token[0] == '"') {
-					size_t n_start = 1;
-					size_t n_end = token.find("\"", n_start);
-					string filename = token.substr(n_start, n_end - n_start);
-					string text, filepath = path_join(path, filename);
+					const size_t n_start = 1;
+					const size_t n_end = token.find("\"", n_start);
+					const string filename = token.substr(n_start, n_end - n_start);
+					string filepath = path_join(base, filename);
+					if(!path_exists(filepath)) {
+						filepath = path_join(path_dirname(source_filepath),
+						                     filename);
+					}
+					string text;
 					if(path_read_text(filepath, text)) {
-						/* Replace include directories with both current path
-						 * and path extracted from the include file.
-						 * Not totally robust, but works fine for Cycles kernel
-						 * and avoids having list of include directories.x
-						 */
-						text = path_source_replace_includes(
-						        text, path_dirname(filepath), filename);
-						text = path_source_replace_includes(text, path, filename);
+						text = path_source_replace_includes_recursive(
+						        base, text, filepath);
 						/* Use line directives for better error messages. */
 						line = line_directive(base, filepath, 1)
 						     + token.replace(0, n_end + 1, "\n" + text + "\n")
-						     + line_directive(base, path_join(path, source_filename), i + 1);
+						     + line_directive(base, source_filepath, i + 1);
 					}
 				}
 			}
@@ -840,7 +837,10 @@ string path_source_replace_includes(const string& source,
                                     const string& path,
                                     const string& source_filename)
 {
-	return path_source_replace_includes_recursive(path, source, path, source_filename);
+	return path_source_replace_includes_recursive(
+	        path,
+	        source,
+	        path_join(path, source_filename));
 }
 
 FILE *path_fopen(const string& path, const string& mode)
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 39c1eed04e7..bc672669e1f 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -37,7 +37,8 @@ public:
 		pixel_samples = 0;
 		total_pixel_samples = 0;
 		current_tile_sample = 0;
-		finished_tiles = 0;
+		rendered_tiles = 0;
+		denoised_tiles = 0;
 		start_time = time_dt();
 		render_start_time = time_dt();
 		status = "Initializing";
@@ -75,7 +76,8 @@ public:
 		pixel_samples = 0;
 		total_pixel_samples = 0;
 		current_tile_sample = 0;
-		finished_tiles = 0;
+		rendered_tiles = 0;
+		denoised_tiles = 0;
 		start_time = time_dt();
 		render_start_time = time_dt();
 		status = "Initializing";
@@ -177,7 +179,8 @@ public:
 
 		pixel_samples = 0;
 		current_tile_sample = 0;
-		finished_tiles = 0;
+		rendered_tiles = 0;
+		denoised_tiles = 0;
 	}
 
 	void set_total_pixel_samples(uint64_t total_pixel_samples_)
@@ -209,11 +212,16 @@ public:
 		set_update();
 	}
 
-	void add_finished_tile()
+	void add_finished_tile(bool denoised)
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		finished_tiles++;
+		if(denoised) {
+			denoised_tiles++;
+		}
+		else {
+			rendered_tiles++;
+		}
 	}
 
 	int get_current_sample()
@@ -223,9 +231,14 @@ public:
 		return current_tile_sample;
 	}
 
-	int get_finished_tiles()
+	int get_rendered_tiles()
 	{
-		return finished_tiles;
+		return rendered_tiles;
+	}
+
+	int get_denoised_tiles()
+	{
+		return denoised_tiles;
 	}
 
 	/* status messages */
@@ -318,7 +331,7 @@ protected:
 	int current_tile_sample;
 	/* Stores the number of tiles that's already finished.
 	 * Used to determine whether all but the last tile are finished rendering, in which case the current_tile_sample is displayed. */
-	int finished_tiles;
+	int rendered_tiles, denoised_tiles;
 
 	double start_time, render_start_time;
 
diff --git a/source/blender/blenloader/intern/versioning_270.c b/source/blender/blenloader/intern/versioning_270.c
index a097e0a9e48..d8f1da3b6b4 100644
--- a/source/blender/blenloader/intern/versioning_270.c
+++ b/source/blender/blenloader/intern/versioning_270.c
@@ -1636,6 +1636,12 @@ void blo_do_versions_270(FileData *fd, Library *UNUSED(lib), Main *main)
 	}
 
 	{
+		for (Scene *scene = main->scene.first; scene; scene = scene->id.next) {
+			if (scene->r.im_format.exr_codec == R_IMF_EXR_CODEC_DWAB) {
+				scene->r.im_format.exr_codec = R_IMF_EXR_CODEC_DWAA;
+			}
+		}
+
 		if (!DNA_struct_elem_find(fd->filesdna, "View3DDebug", "char", "background")) {
 			bScreen *screen;
 
diff --git a/source/blender/bmesh/intern/bmesh_opdefines.c b/source/blender/bmesh/intern/bmesh_opdefines.c
index 6b388a75436..d2f0fc1721c 100644
--- a/source/blender/bmesh/intern/bmesh_opdefines.c
+++ b/source/blender/bmesh/intern/bmesh_opdefines.c
@@ -1912,7 +1912,6 @@ static BMOpDefine bmo_wireframe_def = {
 	 {"use_even_offset", BMO_OP_SLOT_BOOL},
 	 {"use_crease", BMO_OP_SLOT_BOOL},
 	 {"crease_weight", BMO_OP_SLOT_FLT},
-	 {"thickness", BMO_OP_SLOT_FLT},
 	 {"use_relative_offset", BMO_OP_SLOT_BOOL},
 	 {"material_offset", BMO_OP_SLOT_INT},
 	 {{'\0'}},
diff --git a/source/blender/bmesh/intern/bmesh_polygon_edgenet.c b/source/blender/bmesh/intern/bmesh_polygon_edgenet.c
index 0cd38f24165..e515f9af63f 100644
--- a/source/blender/bmesh/intern/bmesh_polygon_edgenet.c
+++ b/source/blender/bmesh/intern/bmesh_polygon_edgenet.c
@@ -62,6 +62,7 @@
 #define EDGE_NET   _FLAG_WALK
 /* tag verts we've visit */
 #define VERT_VISIT _FLAG_WALK
+#define VERT_IN_QUEUE _FLAG_WALK_ALT
 
 struct VertOrder {
 	float   angle;
@@ -512,13 +513,21 @@ bool BM_face_split_edgenet(
 	} while ((l_iter = l_iter->next) != l_first);
 #endif
 
+	/* Note: 'VERT_IN_QUEUE' is often not needed at all,
+	 * however in rare cases verts are added multiple times to the queue,
+	 * that on it's own is harmless but in _very_ rare cases,
+	 * the queue will overflow its maximum size,
+	 * so we better be strict about this! see: T51539 */
 
 	for (i = 0; i < edge_net_len; i++) {
 		BM_ELEM_API_FLAG_ENABLE(edge_net[i], EDGE_NET);
+		BM_ELEM_API_FLAG_DISABLE(edge_net[i]->v1, VERT_IN_QUEUE);
+		BM_ELEM_API_FLAG_DISABLE(edge_net[i]->v2, VERT_IN_QUEUE);
 	}
 	l_iter = l_first = BM_FACE_FIRST_LOOP(f);
 	do {
 		BM_ELEM_API_FLAG_ENABLE(l_iter->e, EDGE_NET);
+		BM_ELEM_API_FLAG_DISABLE(l_iter->v, VERT_IN_QUEUE);
 	} while ((l_iter = l_iter->next) != l_first);
 
 	float face_normal_matrix[3][3];
@@ -527,8 +536,10 @@ bool BM_face_split_edgenet(
 
 	/* any vert can be used to begin with */
 	STACK_PUSH(vert_queue, l_first->v);
+	BM_ELEM_API_FLAG_ENABLE(l_first->v, VERT_IN_QUEUE);
 
 	while ((v = STACK_POP(vert_queue))) {
+		BM_ELEM_API_FLAG_DISABLE(v, VERT_IN_QUEUE);
 		if (bm_face_split_edgenet_find_loop(
 		        v, f->no, face_normal_matrix,
 		        edge_order, edge_order_len, face_verts, &face_verts_len))
@@ -558,8 +569,12 @@ bool BM_face_split_edgenet(
 				 * (verts between boundary and manifold edges) */
 				l_iter = l_first = BM_FACE_FIRST_LOOP(f_new);
 				do {
-					if (bm_face_split_edgenet_find_loop_pair_exists(l_iter->v)) {
+					/* Avoid adding to queue multiple times (not common but happens). */
+					if (!BM_ELEM_API_FLAG_TEST(l_iter->v, VERT_IN_QUEUE) &&
+					    bm_face_split_edgenet_find_loop_pair_exists(l_iter->v))
+					{
 						STACK_PUSH(vert_queue, l_iter->v);
+						BM_ELEM_API_FLAG_ENABLE(l_iter->v, VERT_IN_QUEUE);
 					}
 				} while ((l_iter = l_iter->next) != l_first);
 			}
diff --git a/source/blender/bmesh/tools/bmesh_bevel.c b/source/blender/bmesh/tools/bmesh_bevel.c
index 05169a2a976..a05c12a2730 100644
--- a/source/blender/bmesh/tools/bmesh_bevel.c
+++ b/source/blender/bmesh/tools/bmesh_bevel.c
@@ -2295,7 +2295,7 @@ static int interp_range(const float *frac, int n, const float f, float *r_rest)
 /* Interpolate given vmesh to make one with target nseg border vertices on the profiles */
 static VMesh *interp_vmesh(BevelParams *bp, VMesh *vm0, int nseg)
 {
-	int n, ns0, nseg2, odd, i, j, k, j0, k0, k0prev;
+	int n, ns0, nseg2, odd, i, j, k, j0, k0, k0prev, j0inc, k0inc;
 	float *prev_frac, *frac, *new_frac, *prev_new_frac;
 	float f, restj, restk, restkprev;
 	float quad[4][3], co[3], center[3];
@@ -2339,10 +2339,12 @@ static VMesh *interp_vmesh(BevelParams *bp, VMesh *vm0, int nseg)
 					copy_v3_v3(co, mesh_vert_canon(vm0, i, j0, k0)->co);
 				}
 				else {
+					j0inc = (restj < BEVEL_EPSILON || j0 == ns0) ? 0 : 1;
+					k0inc = (restk < BEVEL_EPSILON || k0 == ns0) ? 0 : 1;
 					copy_v3_v3(quad[0], mesh_vert_canon(vm0, i, j0, k0)->co);
-					copy_v3_v3(quad[1], mesh_vert_canon(vm0, i, j0, k0 + 1)->co);
-					copy_v3_v3(quad[2], mesh_vert_canon(vm0, i, j0 + 1, k0 + 1)->co);
-					copy_v3_v3(quad[3], mesh_vert_canon(vm0, i, j0 + 1, k0)->co);
+					copy_v3_v3(quad[1], mesh_vert_canon(vm0, i, j0, k0 + k0inc)->co);
+					copy_v3_v3(quad[2], mesh_vert_canon(vm0, i, j0 + j0inc, k0 + k0inc)->co);
+					copy_v3_v3(quad[3], mesh_vert_canon(vm0, i, j0 + j0inc, k0)->co);
 					interp_bilinear_quad_v3(quad, restk, restj, co);
 				}
 				copy_v3_v3(mesh_vert(vm1, i, j, k)->co, co);
diff --git a/source/blender/editors/mask/mask_draw.c b/source/blender/editors/mask/mask_draw.c
index 228a4db74e1..b3e6a369d90 100644
--- a/source/blender/editors/mask/mask_draw.c
+++ b/source/blender/editors/mask/mask_draw.c
@@ -784,13 +784,12 @@ void ED_mask_draw_region(Mask *mask, ARegion *ar,
 
 	/* apply transformation so mask editing tools will assume drawing from the origin in normalized space */
 	gpuPushMatrix();
-
+	gpuTranslate2f(x + xofs, y + yofs);
+	gpuScale2f(zoomx, zoomy);
 	if (stabmat) {
 		gpuMultMatrix(stabmat);
 	}
-
-	gpuTranslate2f(x + xofs, y + yofs);
-	gpuScale2f(maxdim * zoomx, maxdim * zoomy);
+	gpuScale2f(maxdim, maxdim);
 
 	if (do_draw_cb) {
 		ED_region_draw_cb_draw(C, ar, REGION_DRAW_PRE_VIEW);
diff --git a/source/blender/editors/space_image/image_ops.c b/source/blender/editors/space_image/image_ops.c
index 748293734fe..69597a390d6 100644
--- a/source/blender/editors/space_image/image_ops.c
+++ b/source/blender/editors/space_image/image_ops.c
@@ -1892,7 +1892,6 @@ static bool save_image_doit(bContext *C, SpaceImage *sima, wmOperator *op, SaveI
 			}
 			else {
 				colormanaged_ibuf = IMB_colormanagement_imbuf_for_write(ibuf, save_as_render, true, &imf->view_settings, &imf->display_settings, imf);
-				IMB_metadata_copy(colormanaged_ibuf, ibuf);
 				ok = BKE_imbuf_write_as(colormanaged_ibuf, simopts->filepath, imf, save_copy);
 				save_imbuf_post(ibuf, colormanaged_ibuf);
 			}
diff --git a/source/blender/editors/space_view3d/view3d_iterators.c b/source/blender/editors/space_view3d/view3d_iterators.c
index ce4b7f7deeb..ef7b01f7a21 100644
--- a/source/blender/editors/space_view3d/view3d_iterators.c
+++ b/source/blender/editors/space_view3d/view3d_iterators.c
@@ -119,7 +119,7 @@ void meshobject_foreachScreenVert(
 	data.clip_flag = clip_flag;
 
 	if (clip_flag & V3D_PROJ_TEST_CLIP_BB) {
-		ED_view3d_clipping_local(vc->rv3d, vc->obedit->obmat);  /* for local clipping lookups */
+		ED_view3d_clipping_local(vc->rv3d, vc->obact->obmat);
 	}
 
 	dm->foreachMappedVert(dm, meshobject_foreachScreenVert__mapFunc, &data, DM_FOREACH_NOP);
diff --git a/source/blender/gpu/shaders/gpu_shader_material.glsl b/source/blender/gpu/shaders/gpu_shader_material.glsl
index f179d9da7bf..fc2fc737ae4 100644
--- a/source/blender/gpu/shaders/gpu_shader_material.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_material.glsl
@@ -2655,7 +2655,7 @@ void node_bsdf_toon(vec4 color, float size, float tsmooth, vec3 N, out vec4 resu
 
 void node_bsdf_principled(vec4 base_color, float subsurface, vec3 subsurface_radius, vec4 subsurface_color, float metallic, float specular,
 	float specular_tint, float roughness, float anisotropic, float anisotropic_rotation, float sheen, float sheen_tint, float clearcoat,
-	float clearcoat_gloss, float ior, float transparency, float refraction_roughness, vec3 N, vec3 CN, vec3 T, vec3 I, out vec4 result)
+	float clearcoat_gloss, float ior, float transmission, float transmission_roughness, vec3 N, vec3 CN, vec3 T, vec3 I, out vec4 result)
 {
 	/* ambient light */
 	// TODO: set ambient light to an appropriate value
diff --git a/source/blender/imbuf/intern/colormanagement.c b/source/blender/imbuf/intern/colormanagement.c
index fc382f02b2c..a4d5da9993a 100644
--- a/source/blender/imbuf/intern/colormanagement.c
+++ b/source/blender/imbuf/intern/colormanagement.c
@@ -1759,9 +1759,14 @@ void IMB_colormanagement_transform_from_byte_threaded(float *float_buffer, unsig
 		return;
 	}
 	if (STREQ(from_colorspace, to_colorspace)) {
-		/* If source and destination color spaces are identical, skip
-		 * threading overhead and simply do nothing
+		/* Because this function always takes a byte buffer and returns a float buffer, it must
+		 * always do byte-to-float conversion of some kind. To avoid threading overhead
+		 * IMB_buffer_float_from_byte is used when color spaces are identical. See T51002.
 		 */
+		IMB_buffer_float_from_byte(float_buffer, byte_buffer,
+		                           IB_PROFILE_SRGB, IB_PROFILE_SRGB,
+		                           true,
+		                           width, height, width, width);
 		return;
 	}
 	cm_processor = IMB_colormanagement_colorspace_processor_new(from_colorspace, to_colorspace);
@@ -2060,6 +2065,10 @@ ImBuf *IMB_colormanagement_imbuf_for_write(ImBuf *ibuf, bool save_as_render, boo
 		}
 	}
 
+	if (colormanaged_ibuf != ibuf) {
+		IMB_metadata_copy(colormanaged_ibuf, ibuf);
+	}
+
 	return colormanaged_ibuf;
 }
 
diff --git a/source/blender/imbuf/intern/metadata.c b/source/blender/imbuf/intern/metadata.c
index 134bbe88f15..da39967a4fe 100644
--- a/source/blender/imbuf/intern/metadata.c
+++ b/source/blender/imbuf/intern/metadata.c
@@ -81,7 +81,9 @@ bool IMB_metadata_get_field(struct ImBuf *img, const char *key, char *field, con
 
 void IMB_metadata_copy(struct ImBuf *dimb, struct ImBuf *simb)
 {
+	BLI_assert(dimb != simb);
 	if (simb->metadata) {
+		IMB_metadata_free(dimb);
 		dimb->metadata = IDP_CopyProperty(simb->metadata);
 	}
 }
diff --git a/source/blender/imbuf/intern/oiio/openimageio_api.cpp b/source/blender/imbuf/intern/oiio/openimageio_api.cpp
index 11bf45418d6..224cf6347d2 100644
--- a/source/blender/imbuf/intern/oiio/openimageio_api.cpp
+++ b/source/blender/imbuf/intern/oiio/openimageio_api.cpp
@@ -213,7 +213,7 @@ struct ImBuf *imb_load_photoshop(const char *filename, int flags, char colorspac
 	in = ImageInput::create(filename);
 	if (!in) {
 		std::cerr << __func__ << ": ImageInput::create() failed:" << std::endl
-		          << OpenImageIO::geterror() << std::endl;
+		          << OIIO_NAMESPACE::geterror() << std::endl;
 		return NULL;
 	}
 
diff --git a/source/blender/imbuf/intern/openexr/openexr_api.cpp b/source/blender/imbuf/intern/openexr/openexr_api.cpp
index 89e796fb7ee..29332a5c899 100644
--- a/source/blender/imbuf/intern/openexr/openexr_api.cpp
+++ b/source/blender/imbuf/intern/openexr/openexr_api.cpp
@@ -1026,15 +1026,16 @@ void IMB_exr_set_channel(void *handle, const char *layname, const char *passname
 	ExrChannel *echan;
 	char name[EXR_TOT_MAXNAME + 1];
 
-	if (layname) {
+	if (layname && layname[0] != '\0') {
 		char lay[EXR_LAY_MAXNAME + 1], pass[EXR_PASS_MAXNAME + 1];
 		BLI_strncpy(lay, layname, EXR_LAY_MAXNAME);
 		BLI_strncpy(pass, passname, EXR_PASS_MAXNAME);
 
 		BLI_snprintf(name, sizeof(name), "%s.%s", lay, pass);
 	}
-	else
+	else {
 		BLI_strncpy(name, passname, EXR_TOT_MAXNAME - 1);
+	}
 
 	echan = (ExrChannel *)BLI_findstring(&data->channels, name, offsetof(ExrChannel, name));
 
@@ -1043,8 +1044,9 @@ void IMB_exr_set_channel(void *handle, const char *layname, const char *passname
 		echan->ystride = ystride;
 		echan->rect = rect;
 	}
-	else
+	else {
 		printf("IMB_exr_set_channel error %s\n", name);
+	}
 }
 
 float  *IMB_exr_channel_rect(void *handle, const char *layname, const char *passname, const char *viewname)
diff --git a/source/blender/makesrna/intern/rna_scene.c b/source/blender/makesrna/intern/rna_scene.c
index 2fa2a32f678..d578c17fe2c 100644
--- a/source/blender/makesrna/intern/rna_scene.c
+++ b/source/blender/makesrna/intern/rna_scene.c
@@ -95,7 +95,8 @@ EnumPropertyItem rna_enum_exr_codec_items[] = {
 	{R_IMF_EXR_CODEC_B44, "B44", 0, "B44 (lossy)", ""},
 	{R_IMF_EXR_CODEC_B44A, "B44A", 0, "B44A (lossy)", ""},
 	{R_IMF_EXR_CODEC_DWAA, "DWAA", 0, "DWAA (lossy)", ""},
-	{R_IMF_EXR_CODEC_DWAB, "DWAB", 0, "DWAB (lossy)", ""},
+	/* NOTE: Commented out for until new OpenEXR is released, see T50673. */
+	/* {R_IMF_EXR_CODEC_DWAB, "DWAB", 0, "DWAB (lossy)", ""}, */
 	{0, NULL, 0, NULL, NULL}
 };
 #endif
diff --git a/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c b/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
index 8b0bfa36324..44288db182a 100644
--- a/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
+++ b/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
@@ -45,8 +45,8 @@ static bNodeSocketTemplate sh_node_bsdf_principled_in[] = {
 	{	SOCK_FLOAT, 1, N_("Clearcoat"),				0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
 	{	SOCK_FLOAT, 1, N_("Clearcoat Gloss"),		1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
 	{	SOCK_FLOAT, 1, N_("IOR"),					1.45f, 0.0f, 0.0f, 0.0f, 0.0f, 1000.0f},
-	{	SOCK_FLOAT, 1, N_("Transparency"),			0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
-	{	SOCK_FLOAT, 1, N_("Refraction Roughness"),	0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
+	{	SOCK_FLOAT, 1, N_("Transmission"),			0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
+	{	SOCK_FLOAT, 1, N_("Transmission Roughness"),0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
 	{	SOCK_VECTOR, 1, N_("Normal"),				0.0f, 0.0f, 0.0f, 1.0f, -1.0f, 1.0f, PROP_NONE, SOCK_HIDE_VALUE},
 	{	SOCK_VECTOR, 1, N_("Clearcoat Normal"),		0.0f, 0.0f, 0.0f, 1.0f, -1.0f, 1.0f, PROP_NONE, SOCK_HIDE_VALUE},
 	{	SOCK_VECTOR, 1, N_("Tangent"),				0.0f, 0.0f, 0.0f, 1.0f, -1.0f, 1.0f, PROP_NONE, SOCK_HIDE_VALUE},
@@ -86,7 +86,7 @@ static void node_shader_update_principled(bNodeTree *UNUSED(ntree), bNode *node)
 	int distribution = node->custom1;
 
 	for (sock = node->inputs.first; sock; sock = sock->next) {
-		if (STREQ(sock->name, "Refraction Roughness")) {
+		if (STREQ(sock->name, "Transmission Roughness")) {
 			if (distribution == SHD_GLOSSY_GGX)
 				sock->flag &= ~SOCK_UNAVAIL;
 			else
diff --git a/source/blender/render/intern/source/envmap.c b/source/blender/render/intern/source/envmap.c
index c5dec430c53..7bd8c6decf0 100644
--- a/source/blender/render/intern/source/envmap.c
+++ b/source/blender/render/intern/source/envmap.c
@@ -62,6 +62,7 @@
 #include "renderpipeline.h"
 #include "texture.h"
 #include "zbuf.h"
+#include "render_result.h"
 
 /* ------------------------------------------------------------------------- */
 
@@ -143,8 +144,8 @@ static Render *envmap_render_copy(Render *re, EnvMap *env)
 	/* set up renderdata */
 	render_copy_renderdata(&envre->r, &re->r);
 	envre->r.mode &= ~(R_BORDER | R_PANORAMA | R_ORTHO | R_MBLUR);
-	BLI_listbase_clear(&envre->r.layers);
-	BLI_listbase_clear(&envre->r.views);
+	BLI_freelistN(&envre->r.layers);
+	BLI_freelistN(&envre->r.views);
 	envre->r.filtertype = 0;
 	envre->r.tilex = envre->r.xsch / 2;
 	envre->r.tiley = envre->r.ysch / 2;
@@ -491,11 +492,18 @@ static void render_envmap(Render *re, EnvMap *env)
 		env_rotate_scene(envre, tmat, 0);
 
 		if (re->test_break(re->tbh) == 0) {
-			RenderLayer *rl = envre->result->layers.first;
 			int y;
 			float *alpha;
 			float *rect;
 
+			if (envre->result->do_exr_tile) {
+				BLI_rw_mutex_lock(&envre->resultmutex, THREAD_LOCK_WRITE);
+				render_result_exr_file_end(envre);
+				BLI_rw_mutex_unlock(&envre->resultmutex);
+			}
+
+			RenderLayer *rl = envre->result->layers.first;
+
 			/* envmap is rendered independently of multiview  */
 			rect = RE_RenderLayerGetPass(rl, RE_PASSNAME_COMBINED, "");
 			ibuf = IMB_allocImBuf(envre->rectx, envre->recty, 24, IB_rect | IB_rectfloat);