Cycles: Speedup of Christensen-Burley SSS falloff function

The idea is simply to pre-compute fitting and parameterization
in the bssrdf_setup() function and re-use the values in both
sample() and eval().

The only trick is where to store the pre-calculated values, and
the answer is in ShaderClosure->custom{1,2,3}. There's no memory
bump here because we simply re-use what used to be padding fields
for the pre-calculated values. We can do a similar trick for other
BSDFs.

This seems to give a nice speedup of up to 7% here on my desktop
with a Core i7 CPU and the SSE4.1 kernel.
This commit is contained in:
Sergey Sharybin 2016-02-04 15:25:29 +01:00
parent f250aa9d86
commit 3e7389eaf2
2 changed files with 44 additions and 27 deletions

@ -19,25 +19,6 @@
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
/* Set up a BSSRDF closure of the requested type.
 *
 * If sc->data0 (the radius, per the comment below) is smaller than
 * BSSRDF_MIN_RADIUS, the closure is configured through bsdf_diffuse_setup()
 * and that call's flag is returned. Otherwise sc->data1 and sc->T.x are
 * passed through saturate(), the requested type is stored, and the
 * SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF flags are returned.
 */
ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
{
if(sc->data0 < BSSRDF_MIN_RADIUS) {
/* revert to diffuse BSDF if radius too small */
sc->data0 = 0.0f;
sc->data1 = 0.0f;
int flag = bsdf_diffuse_setup(sc);
/* Assigned after the diffuse setup call; presumably this overrides the
 * type bsdf_diffuse_setup() stored -- TODO confirm against that function. */
sc->type = CLOSURE_BSDF_BSSRDF_ID;
return flag;
}
else {
sc->data1 = saturate(sc->data1); /* texture blur */
sc->T.x = saturate(sc->T.x); /* sharpness */
sc->type = type;
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
}
}
/* Planar Truncated Gaussian /* Planar Truncated Gaussian
* *
* Note how this is different from the typical gaussian, this one integrates * Note how this is different from the typical gaussian, this one integrates
@ -210,13 +191,24 @@ ccl_device_inline float bssrdf_burley_compatible_mfp(float r)
return 0.5f * M_1_PI_F * r; return 0.5f * M_1_PI_F * r;
} }
/* bssrdf_burley_setup(): pre-compute the Burley profile values for a closure.
 *
 * Derives the mean free path l from sc->data0 and the fitting value s from
 * the albedo in sc->data2, then stores l, s and d = l / s in
 * sc->custom1, sc->custom2 and sc->custom3 respectively.
 */
ccl_device float bssrdf_burley_eval(ShaderClosure *sc, float r) ccl_device void bssrdf_burley_setup(ShaderClosure *sc)
{ {
/* Mean free path length. */ /* Mean free path length. */
const float l = bssrdf_burley_compatible_mfp(sc->data0); const float l = bssrdf_burley_compatible_mfp(sc->data0);
/* Surface albedo. */ /* Surface albedo. */
const float A = sc->data2; const float A = sc->data2;
const float s = bssrdf_burley_fitting(A); const float s = bssrdf_burley_fitting(A);
const float d = l / s;
/* Keep the derived values on the closure so they can be read back
 * instead of being recomputed. */
sc->custom1 = l;
sc->custom2 = s;
sc->custom3 = d;
}
ccl_device float bssrdf_burley_eval(ShaderClosure *sc, float r)
{
const float l = sc->custom1,
s = sc->custom2;
/* Burley reflectance profile, equation (3). /* Burley reflectance profile, equation (3).
* *
* Note that surface albedo is already included into sc->weight, no need to * Note that surface albedo is already included into sc->weight, no need to
@ -277,12 +269,7 @@ ccl_device void bssrdf_burley_sample(ShaderClosure *sc,
float *r, float *r,
float *h) float *h)
{ {
/* Mean free path length. */ const float d = sc->custom3;
const float l = bssrdf_burley_compatible_mfp(sc->data0);
/* Surface albedo. */
const float A = sc->data2;
const float s = bssrdf_burley_fitting(A);
const float d = l / s;
/* This is a bit arbitrary, just need big enough radius so it matches /* This is a bit arbitrary, just need big enough radius so it matches
* the mean free length, but still not too big so sampling is still * the mean free length, but still not too big so sampling is still
* effective. Might need some further tweaks. * effective. Might need some further tweaks.
@ -330,6 +317,29 @@ ccl_device void bssrdf_none_sample(ShaderClosure *sc, float xi, float *r, float
/* Generic */ /* Generic */
/* Set up a BSSRDF closure of the requested type.
 *
 * If sc->data0 (the radius, per the comment below) is smaller than
 * BSSRDF_MIN_RADIUS, the closure is configured through bsdf_diffuse_setup()
 * and that call's flag is returned. Otherwise sc->data1 and sc->T.x are
 * passed through saturate(), the requested type is stored, Burley closures
 * additionally run bssrdf_burley_setup(), and the
 * SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF flags are returned.
 */
ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
{
if(sc->data0 < BSSRDF_MIN_RADIUS) {
/* revert to diffuse BSDF if radius too small */
sc->data0 = 0.0f;
sc->data1 = 0.0f;
int flag = bsdf_diffuse_setup(sc);
/* Assigned after the diffuse setup call; presumably this overrides the
 * type bsdf_diffuse_setup() stored -- TODO confirm against that function. */
sc->type = CLOSURE_BSDF_BSSRDF_ID;
return flag;
}
else {
sc->data1 = saturate(sc->data1); /* texture blur */
sc->T.x = saturate(sc->T.x); /* sharpness */
sc->type = type;
/* Only the Burley profile has a per-closure setup step here; it is not
 * run on the diffuse fallback path above. */
if(type == CLOSURE_BSSRDF_BURLEY_ID) {
bssrdf_burley_setup(sc);
}
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
}
}
ccl_device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h) ccl_device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h)
{ {
if(sc->type == CLOSURE_BSSRDF_CUBIC_ID) if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)

@ -642,7 +642,14 @@ typedef ccl_addr_space struct ShaderClosure {
float data0; float data0;
float data1; float data1;
float data2; float data2;
int pad1, pad2, pad3;
/* Following fields could be used to store pre-calculated
* values by various BSDF closures for more effective sampling
* and evaluation.
*/
float custom1;
float custom2;
float custom3;
#ifdef __OSL__ #ifdef __OSL__
void *prim, *pad4; void *prim, *pad4;