Cycles: Speedup of Christensen-Burley SSS falloff function

The idea is simply to pre-compute the fitting and parameterization in the bssrdf_setup() function and re-use the values in both sample() and eval(). The only trick is where to store the pre-calculated values, and the answer is inside ShaderClosure->custom{1,2,3}. There is no memory bump here because we simply re-use the padding fields for the pre-calculated values. A similar trick can be done for other BSDFs. This seems to give a nice speedup of up to 7% here on my desktop with a Core i7 CPU and the SSE4.1 kernel.
2016-02-04 15:25:29 +01:00 · 2016-02-04 15:25:29 +01:00 · 3e7389eaf2
commit 3e7389eaf2
parent f250aa9d86
2 changed files with 44 additions and 27 deletions
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@ -19,25 +19,6 @@
 CCL_NAMESPACE_BEGIN
 ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
 {
 	/* Prepare a BSSRDF closure of the given type.
 	 *
 	 * Returns the flag value from bsdf_diffuse_setup() when falling back
 	 * to diffuse, otherwise SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF.
 	 */
 	if(sc->data0 < BSSRDF_MIN_RADIUS) {
 		/* revert to diffuse BSDF if radius too small */
 		sc->data0 = 0.0f;
 		sc->data1 = 0.0f;
 		int flag = bsdf_diffuse_setup(sc);
 		/* Assigned after bsdf_diffuse_setup() returns, so this is the
 		 * type the closure ends up with in the fallback case.
 		 */
 		sc->type = CLOSURE_BSDF_BSSRDF_ID;
 		return flag;
 	}
 	else {
 		/* NOTE(review): saturate() presumably clamps to [0, 1] -- confirm. */
 		sc->data1 = saturate(sc->data1); /* texture blur */
 		sc->T.x = saturate(sc->T.x); /* sharpness */
 		sc->type = type;
 		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
 	}
 }
 /* Planar Truncated Gaussian
 *
 * Note how this is different from the typical gaussian, this one integrates
@ -210,13 +191,24 @@ ccl_device_inline float bssrdf_burley_compatible_mfp(float r)
 	return 0.5f * M_1_PI_F * r;
 }
-ccl_device float bssrdf_burley_eval(ShaderClosure *sc, float r)
+ccl_device void bssrdf_burley_setup(ShaderClosure *sc)
 {
 	/* Pre-compute the Burley profile values derived from the closure
 	 * inputs and cache them in sc->custom{1,2,3}, so that
 	 * bssrdf_burley_eval() and bssrdf_burley_sample() can read them back
 	 * instead of recomputing them on every call.
 	 */
 	/* Mean free path length. */
 	const float l = bssrdf_burley_compatible_mfp(sc->data0);
 	/* Surface albedo. */
 	const float A = sc->data2;
 	/* Fitted value for this albedo, see bssrdf_burley_fitting(). */
 	const float s = bssrdf_burley_fitting(A);
 	/* Mean free path length divided by the fitted value. */
 	const float d = l / s;
 	/* custom1 and custom2 are read back by bssrdf_burley_eval(). */
 	sc->custom1 = l;
 	sc->custom2 = s;
 	/* custom3 is read back by bssrdf_burley_sample(). */
 	sc->custom3 = d;
 }
 ccl_device float bssrdf_burley_eval(ShaderClosure *sc, float r)
 {
 	const float l = sc->custom1,
 	            s = sc->custom2;
 	/* Burley reflectance profile, equation (3).
 	 *
 	 * Note that surface albedo is already included into sc->weight, no need to
@ -277,12 +269,7 @@ ccl_device void bssrdf_burley_sample(ShaderClosure *sc,
                                     float *r,
                                     float *h)
 {
-	/* Mean free path length. */
+	const float d = sc->custom3;
 	const float l = bssrdf_burley_compatible_mfp(sc->data0);
 	/* Surface albedo. */
 	const float A = sc->data2;
 	const float s = bssrdf_burley_fitting(A);
 	const float d = l / s;
 	/* This is a bit arbitrary, just need big enough radius so it matches
 	 * the mean free length, but still not too big so sampling is still
 	 * effective. Might need some further tweaks.
@ -330,6 +317,29 @@ ccl_device void bssrdf_none_sample(ShaderClosure *sc, float xi, float *r, float
 /* Generic */
 ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
 {
 	/* Prepare a BSSRDF closure of the given type.
 	 *
 	 * Returns the flag value from bsdf_diffuse_setup() when falling back
 	 * to diffuse, otherwise SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF.
 	 */
 	if(sc->data0 < BSSRDF_MIN_RADIUS) {
 		/* revert to diffuse BSDF if radius too small */
 		sc->data0 = 0.0f;
 		sc->data1 = 0.0f;
 		int flag = bsdf_diffuse_setup(sc);
 		/* Assigned after bsdf_diffuse_setup() returns, so this is the
 		 * type the closure ends up with in the fallback case.
 		 */
 		sc->type = CLOSURE_BSDF_BSSRDF_ID;
 		return flag;
 	}
 	else {
 		/* NOTE(review): saturate() presumably clamps to [0, 1] -- confirm. */
 		sc->data1 = saturate(sc->data1); /* texture blur */
 		sc->T.x = saturate(sc->T.x); /* sharpness */
 		sc->type = type;
 		/* Only the Burley profile caches pre-computed values in the
 		 * closure; it is not called on the diffuse fallback path above.
 		 */
 		if(type == CLOSURE_BSSRDF_BURLEY_ID) {
 			bssrdf_burley_setup(sc);
 		}
 		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
 	}
 }
 ccl_device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@ -642,7 +642,14 @@ typedef ccl_addr_space struct ShaderClosure {
 	float data0;
 	float data1;
 	float data2;
-	int pad1, pad2, pad3;
+
 	/* Following fields could be used to store pre-calculated
 	 * values by various BSDF closures for more effective sampling
 	 * and evaluation.
 	 */
 	float custom1;
 	float custom2;
 	float custom3;
 #ifdef __OSL__
 	void *prim, *pad4;