Cycles: Avoid having ShaderData on the stack

This commit introduces an SSS-oriented intersection structure which replaces
the old logic of having separate arrays for just intersections and shader data,
and encapsulates all the data needed for SSS evaluation.

This gives a huge stack memory saving on GPU. In my own experiments it gave 25%
memory usage reduction on GTX560Ti (722MB vs. 946MB).

Unfortunately, this gave some performance loss of 20% which only happens on GPU.
This is perhaps due to different memory access pattern. Will be solved in the
future, hopefully.

Famous saying: won in memory - lost in time (which is also valid the other way
around).
This commit is contained in:
Sergey Sharybin 2015-11-22 15:00:29 +05:00
parent e6fff424db
commit 8bca34fe32
9 changed files with 290 additions and 109 deletions

@ -255,38 +255,81 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, con
} }
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits) ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
const Ray *ray,
SubsurfaceIntersection *ss_isect,
int subsurface_object,
uint *lcg_state,
int max_hits)
{ {
#ifdef __OBJECT_MOTION__ #ifdef __OBJECT_MOTION__
if(kernel_data.bvh.have_motion) { if(kernel_data.bvh.have_motion) {
#ifdef __HAIR__ #ifdef __HAIR__
if(kernel_data.bvh.have_curves) if(kernel_data.bvh.have_curves) {
return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits); return bvh_intersect_subsurface_hair_motion(kg,
ray,
ss_isect,
subsurface_object,
lcg_state,
max_hits);
}
#endif /* __HAIR__ */ #endif /* __HAIR__ */
return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits); return bvh_intersect_subsurface_motion(kg,
ray,
ss_isect,
subsurface_object,
lcg_state,
max_hits);
} }
#endif /* __OBJECT_MOTION__ */ #endif /* __OBJECT_MOTION__ */
#ifdef __HAIR__ #ifdef __HAIR__
if(kernel_data.bvh.have_curves) if(kernel_data.bvh.have_curves) {
return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits); return bvh_intersect_subsurface_hair(kg,
ray,
ss_isect,
subsurface_object,
lcg_state,
max_hits);
}
#endif /* __HAIR__ */ #endif /* __HAIR__ */
#ifdef __KERNEL_CPU__ #ifdef __KERNEL_CPU__
#ifdef __INSTANCING__ #ifdef __INSTANCING__
if(kernel_data.bvh.have_instancing) if(kernel_data.bvh.have_instancing) {
return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits); return bvh_intersect_subsurface_instancing(kg,
ray,
ss_isect,
subsurface_object,
lcg_state,
max_hits);
}
#endif /* __INSTANCING__ */ #endif /* __INSTANCING__ */
return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits); return bvh_intersect_subsurface(kg,
ray,
ss_isect,
subsurface_object,
lcg_state,
max_hits);
#else /* __KERNEL_CPU__ */ #else /* __KERNEL_CPU__ */
#ifdef __INSTANCING__ #ifdef __INSTANCING__
return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits); return bvh_intersect_subsurface_instancing(kg,
ray,
ss_isect,
subsurface_object,
lcg_state,
max_hits);
#else #else
return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits); return bvh_intersect_subsurface(kg,
ray,
ss_isect,
subsurface_object,
lcg_state,
max_hits);
#endif /* __INSTANCING__ */ #endif /* __INSTANCING__ */
#endif /* __KERNEL_CPU__ */ #endif /* __KERNEL_CPU__ */

@ -30,9 +30,9 @@
* *
*/ */
ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
const Ray *ray, const Ray *ray,
Intersection *isect_array, SubsurfaceIntersection *ss_isect,
int subsurface_object, int subsurface_object,
uint *lcg_state, uint *lcg_state,
int max_hits) int max_hits)
@ -60,7 +60,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
int object = OBJECT_NONE; int object = OBJECT_NONE;
float isect_t = ray->t; float isect_t = ray->t;
uint num_hits = 0; ss_isect->num_hits = 0;
#if BVH_FEATURE(BVH_MOTION) #if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm; Transform ob_itfm;
@ -210,7 +210,15 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
if(tri_object != subsurface_object) if(tri_object != subsurface_object)
continue; continue;
triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); triangle_intersect_subsurface(kg,
&isect_precalc,
ss_isect,
P,
object,
primAddr,
isect_t,
lcg_state,
max_hits);
} }
break; break;
} }
@ -223,7 +231,16 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
if(tri_object != subsurface_object) if(tri_object != subsurface_object)
continue; continue;
motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); motion_triangle_intersect_subsurface(kg,
ss_isect,
P,
dir,
ray->time,
object,
primAddr,
isect_t,
lcg_state,
max_hits);
} }
break; break;
} }
@ -301,13 +318,11 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
} }
#endif /* FEATURE(BVH_INSTANCING) */ #endif /* FEATURE(BVH_INSTANCING) */
} while(nodeAddr != ENTRYPOINT_SENTINEL); } while(nodeAddr != ENTRYPOINT_SENTINEL);
return num_hits;
} }
ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
const Ray *ray, const Ray *ray,
Intersection *isect_array, SubsurfaceIntersection *ss_isect,
int subsurface_object, int subsurface_object,
uint *lcg_state, uint *lcg_state,
int max_hits) int max_hits)
@ -316,7 +331,7 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
if(kernel_data.bvh.use_qbvh) { if(kernel_data.bvh.use_qbvh) {
return BVH_FUNCTION_FULL_NAME(QBVH)(kg, return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
ray, ray,
isect_array, ss_isect,
subsurface_object, subsurface_object,
lcg_state, lcg_state,
max_hits); max_hits);
@ -327,7 +342,7 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
kernel_assert(kernel_data.bvh.use_qbvh == false); kernel_assert(kernel_data.bvh.use_qbvh == false);
return BVH_FUNCTION_FULL_NAME(BVH)(kg, return BVH_FUNCTION_FULL_NAME(BVH)(kg,
ray, ray,
isect_array, ss_isect,
subsurface_object, subsurface_object,
lcg_state, lcg_state,
max_hits); max_hits);

@ -358,8 +358,17 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
* multiple hits we pick a single random primitive as the intersection point. */ * multiple hits we pick a single random primitive as the intersection point. */
#ifdef __SUBSURFACE__ #ifdef __SUBSURFACE__
ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array, ccl_device_inline void motion_triangle_intersect_subsurface(
float3 P, float3 dir, float time, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) KernelGlobals *kg,
SubsurfaceIntersection *ss_isect,
float3 P,
float3 dir,
float time,
int object,
int triAddr,
float tmax,
uint *lcg_state,
int max_hits)
{ {
/* primitive index for vertex location lookup */ /* primitive index for vertex location lookup */
int prim = kernel_tex_fetch(__prim_index, triAddr); int prim = kernel_tex_fetch(__prim_index, triAddr);
@ -373,30 +382,34 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I
float t, u, v; float t, u, v;
if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) { if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
(*num_hits)++; ss_isect->num_hits++;
int hit; int hit;
if(*num_hits <= max_hits) { if(ss_isect->num_hits <= max_hits) {
hit = *num_hits - 1; hit = ss_isect->num_hits - 1;
} }
else { else {
/* reservoir sampling: if we are at the maximum number of /* reservoir sampling: if we are at the maximum number of
* hits, randomly replace element or skip it */ * hits, randomly replace element or skip it */
hit = lcg_step_uint(lcg_state) % *num_hits; hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
if(hit >= max_hits) if(hit >= max_hits)
return; return;
} }
/* record intersection */ /* record intersection */
Intersection *isect = &isect_array[hit]; Intersection *isect = &ss_isect->hits[hit];
isect->t = t; isect->t = t;
isect->u = u; isect->u = u;
isect->v = v; isect->v = v;
isect->prim = triAddr; isect->prim = triAddr;
isect->object = object; isect->object = object;
isect->type = PRIMITIVE_MOTION_TRIANGLE; isect->type = PRIMITIVE_MOTION_TRIANGLE;
/* Record geometric normal. */
ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
verts[2] - verts[0]));
} }
} }
#endif #endif

@ -26,9 +26,9 @@
* *
*/ */
ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray, const Ray *ray,
Intersection *isect_array, SubsurfaceIntersection *ss_isect,
int subsurface_object, int subsurface_object,
uint *lcg_state, uint *lcg_state,
int max_hits) int max_hits)
@ -55,7 +55,8 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
float3 idir = bvh_inverse_direction(dir); float3 idir = bvh_inverse_direction(dir);
int object = OBJECT_NONE; int object = OBJECT_NONE;
float isect_t = ray->t; float isect_t = ray->t;
uint num_hits = 0;
ss_isect->num_hits = 0;
#if BVH_FEATURE(BVH_MOTION) #if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm; Transform ob_itfm;
@ -63,7 +64,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#ifndef __KERNEL_SSE41__ #ifndef __KERNEL_SSE41__
if(!isfinite(P.x)) { if(!isfinite(P.x)) {
return 0; return;
} }
#endif #endif
@ -226,7 +227,15 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(tri_object != subsurface_object) { if(tri_object != subsurface_object) {
continue; continue;
} }
triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); triangle_intersect_subsurface(kg,
&isect_precalc,
ss_isect,
P,
object,
primAddr,
isect_t,
lcg_state,
max_hits);
} }
break; break;
} }
@ -240,7 +249,16 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(tri_object != subsurface_object) { if(tri_object != subsurface_object) {
continue; continue;
} }
motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); motion_triangle_intersect_subsurface(kg,
ss_isect,
P,
dir,
ray->time,
object,
primAddr,
isect_t,
lcg_state,
max_hits);
} }
break; break;
} }
@ -321,6 +339,4 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
} }
#endif /* FEATURE(BVH_INSTANCING) */ #endif /* FEATURE(BVH_INSTANCING) */
} while(nodeAddr != ENTRYPOINT_SENTINEL); } while(nodeAddr != ENTRYPOINT_SENTINEL);
return num_hits;
} }

@ -204,12 +204,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
ccl_device_inline void triangle_intersect_subsurface( ccl_device_inline void triangle_intersect_subsurface(
KernelGlobals *kg, KernelGlobals *kg,
const IsectPrecalc *isect_precalc, const IsectPrecalc *isect_precalc,
Intersection *isect_array, SubsurfaceIntersection *ss_isect,
float3 P, float3 P,
int object, int object,
int triAddr, int triAddr,
float tmax, float tmax,
uint *num_hits,
uint *lcg_state, uint *lcg_state,
int max_hits) int max_hits)
{ {
@ -272,29 +271,36 @@ ccl_device_inline void triangle_intersect_subsurface(
/* Normalize U, V, W, and T. */ /* Normalize U, V, W, and T. */
const float inv_det = 1.0f / det; const float inv_det = 1.0f / det;
(*num_hits)++; ss_isect->num_hits++;
int hit; int hit;
if(*num_hits <= max_hits) { if(ss_isect->num_hits <= max_hits) {
hit = *num_hits - 1; hit = ss_isect->num_hits - 1;
} }
else { else {
/* reservoir sampling: if we are at the maximum number of /* reservoir sampling: if we are at the maximum number of
* hits, randomly replace element or skip it */ * hits, randomly replace element or skip it */
hit = lcg_step_uint(lcg_state) % *num_hits; hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
if(hit >= max_hits) if(hit >= max_hits)
return; return;
} }
/* record intersection */ /* record intersection */
Intersection *isect = &isect_array[hit]; Intersection *isect = &ss_isect->hits[hit];
isect->prim = triAddr; isect->prim = triAddr;
isect->object = object; isect->object = object;
isect->type = PRIMITIVE_TRIANGLE; isect->type = PRIMITIVE_TRIANGLE;
isect->u = U * inv_det; isect->u = U * inv_det;
isect->v = V * inv_det; isect->v = V * inv_det;
isect->t = T * inv_det; isect->t = T * inv_det;
/* Record geometric normal. */
/* TODO(sergey): Use float4_to_float3() on just an edges. */
const float3 v0 = float4_to_float3(tri_a);
const float3 v1 = float4_to_float3(tri_b);
const float3 v2 = float4_to_float3(tri_c);
ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0));
} }
#endif #endif

@ -338,10 +338,16 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
if(sc) { if(sc) {
uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; SubsurfaceIntersection ss_isect;
float bssrdf_u, bssrdf_v; float bssrdf_u, bssrdf_v;
path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false); int num_hits = subsurface_scatter_multi_intersect(kg,
&ss_isect,
sd,
sc,
&lcg_state,
bssrdf_u, bssrdf_v,
false);
#ifdef __VOLUME__ #ifdef __VOLUME__
Ray volume_ray = *ray; Ray volume_ray = *ray;
bool need_update_volume_stack = kernel_data.integrator.use_volumes && bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
@ -350,15 +356,26 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
/* compute lighting with the BSDF closure */ /* compute lighting with the BSDF closure */
for(int hit = 0; hit < num_hits; hit++) { for(int hit = 0; hit < num_hits; hit++) {
/* NOTE: We reuse the existing ShaderData, we assume the path
* integration loop stops when this function returns true.
*/
subsurface_scatter_multi_setup(kg,
&ss_isect,
hit,
sd,
state->flag,
sc,
false);
float3 tp = *throughput; float3 tp = *throughput;
PathState hit_state = *state; PathState hit_state = *state;
Ray hit_ray = *ray; Ray hit_ray = *ray;
hit_state.rng_offset += PRNG_BOUNCE_NUM; hit_state.rng_offset += PRNG_BOUNCE_NUM;
kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L); kernel_path_surface_connect_light(kg, rng, sd, tp, state, L);
if(kernel_path_surface_bounce(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) { if(kernel_path_surface_bounce(kg, rng, sd, &tp, &hit_state, L, &hit_ray)) {
#ifdef __LAMP_MIS__ #ifdef __LAMP_MIS__
hit_state.ray_t = 0.0f; hit_state.ray_t = 0.0f;
#endif #endif

@ -128,10 +128,16 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
/* do subsurface scatter step with copy of shader data, this will /* do subsurface scatter step with copy of shader data, this will
* replace the BSSRDF with a diffuse BSDF closure */ * replace the BSSRDF with a diffuse BSDF closure */
for(int j = 0; j < num_samples; j++) { for(int j = 0; j < num_samples; j++) {
ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; SubsurfaceIntersection ss_isect;
float bssrdf_u, bssrdf_v; float bssrdf_u, bssrdf_v;
path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true); int num_hits = subsurface_scatter_multi_intersect(kg,
&ss_isect,
sd,
sc,
&lcg_state,
bssrdf_u, bssrdf_v,
true);
#ifdef __VOLUME__ #ifdef __VOLUME__
Ray volume_ray = *ray; Ray volume_ray = *ray;
bool need_update_volume_stack = kernel_data.integrator.use_volumes && bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
@ -140,6 +146,15 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
/* compute lighting with the BSDF closure */ /* compute lighting with the BSDF closure */
for(int hit = 0; hit < num_hits; hit++) { for(int hit = 0; hit < num_hits; hit++) {
ShaderData bssrdf_sd = *sd;
subsurface_scatter_multi_setup(kg,
&ss_isect,
hit,
&bssrdf_sd,
state->flag,
sc,
true);
PathState hit_state = *state; PathState hit_state = *state;
path_state_branch(&hit_state, j, num_samples); path_state_branch(&hit_state, j, num_samples);
@ -147,7 +162,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
#ifdef __VOLUME__ #ifdef __VOLUME__
if(need_update_volume_stack) { if(need_update_volume_stack) {
/* Setup ray from previous surface point to the new one. */ /* Setup ray from previous surface point to the new one. */
float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng); float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng);
volume_ray.D = normalize_len(P - volume_ray.P, volume_ray.D = normalize_len(P - volume_ray.P,
&volume_ray.t); &volume_ray.t);
@ -165,15 +180,27 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
/* direct light */ /* direct light */
if(kernel_data.integrator.use_direct_light) { if(kernel_data.integrator.use_direct_light) {
bool all = kernel_data.integrator.sample_all_lights_direct; bool all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_surface_connect_light(kg, rng, kernel_branched_path_surface_connect_light(
&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all); kg,
rng,
&bssrdf_sd,
&hit_state,
throughput,
num_samples_inv,
L,
all);
} }
#endif #endif
/* indirect light */ /* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng, kernel_branched_path_surface_indirect_light(
&bssrdf_sd[hit], throughput, num_samples_inv, kg,
&hit_state, L); rng,
&bssrdf_sd,
throughput,
num_samples_inv,
&hit_state,
L);
} }
} }
} }

@ -179,19 +179,23 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent)
return color; return color;
} }
ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *out_sd, ShaderData *in_sd, int state_flag, float3 *eval, float3 *N) ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
ShaderData *sd,
int state_flag,
float3 *eval,
float3 *N)
{ {
/* average color and texture blur at outgoing point */ /* average color and texture blur at outgoing point */
float texture_blur; float texture_blur;
float3 out_color = shader_bssrdf_sum(out_sd, NULL, &texture_blur); float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur);
/* do we have bump mapping? */ /* do we have bump mapping? */
bool bump = (out_sd->flag & SD_HAS_BSSRDF_BUMP) != 0; bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0;
if(bump || texture_blur > 0.0f) { if(bump || texture_blur > 0.0f) {
/* average color and normal at incoming point */ /* average color and normal at incoming point */
shader_eval_surface(kg, in_sd, 0.0f, state_flag, SHADER_CONTEXT_SSS); shader_eval_surface(kg, sd, 0.0f, state_flag, SHADER_CONTEXT_SSS);
float3 in_color = shader_bssrdf_sum(in_sd, (bump)? N: NULL, NULL); float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
/* we simply divide out the average color and multiply with the average /* we simply divide out the average color and multiply with the average
* of the other one. we could try to do this per closure but it's quite * of the other one. we could try to do this per closure but it's quite
@ -206,9 +210,18 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *out_sd
} }
} }
/* subsurface scattering step, from a point on the surface to other nearby points on the same object */ /* Subsurface scattering step, from a point on the surface to other
ccl_device int subsurface_scatter_multi_step(KernelGlobals *kg, ShaderData *sd, ShaderData bssrdf_sd[BSSRDF_MAX_HITS], * nearby points on the same object.
int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) */
ccl_device int subsurface_scatter_multi_intersect(
KernelGlobals *kg,
SubsurfaceIntersection* ss_isect,
ShaderData *sd,
ShaderClosure *sc,
uint *lcg_state,
float disk_u,
float disk_v,
bool all)
{ {
/* pick random axis in local frame and point on disk */ /* pick random axis in local frame and point on disk */
float3 disk_N, disk_T, disk_B; float3 disk_N, disk_T, disk_B;
@ -259,65 +272,84 @@ ccl_device int subsurface_scatter_multi_step(KernelGlobals *kg, ShaderData *sd,
float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B; float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
/* create ray */ /* create ray */
Ray ray; Ray *ray = &ss_isect->ray;
ray.P = sd->P + disk_N*disk_height + disk_P; ray->P = sd->P + disk_N*disk_height + disk_P;
ray.D = -disk_N; ray->D = -disk_N;
ray.t = 2.0f*disk_height; ray->t = 2.0f*disk_height;
ray.dP = sd->dP; ray->dP = sd->dP;
ray.dD = differential3_zero(); ray->dD = differential3_zero();
ray.time = sd->time; ray->time = sd->time;
/* intersect with the same object. if multiple intersections are found it /* intersect with the same object. if multiple intersections are found it
* will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */ * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
Intersection isect[BSSRDF_MAX_HITS]; scene_intersect_subsurface(kg,
uint num_hits = scene_intersect_subsurface(kg, &ray, isect, sd->object, lcg_state, BSSRDF_MAX_HITS); ray,
ss_isect,
/* evaluate bssrdf */ sd->object,
float3 eval = make_float3(0.0f, 0.0f, 0.0f); lcg_state,
int num_eval_hits = min(num_hits, BSSRDF_MAX_HITS); BSSRDF_MAX_HITS);
/* TODO(sergey): Investigate whether scene_intersect_subsurface() could
* indeed return more than BSSRDF_MAX_HITS hits.
*/
int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
for(int hit = 0; hit < num_eval_hits; hit++) { for(int hit = 0; hit < num_eval_hits; hit++) {
ShaderData *bsd = &bssrdf_sd[hit]; /* Quickly retrieve P and Ng without setting up ShaderData. */
float3 hit_P = ray->P + ss_isect->hits[hit].t * ray->D;
/* setup new shading point */ float3 hit_Ng = ss_isect->Ng[hit];
*bsd = *sd; if(ss_isect->hits[hit].object != OBJECT_NONE) {
shader_setup_from_subsurface(kg, bsd, &isect[hit], &ray); object_normal_transform(kg, sd, &hit_Ng);
}
/* probability densities for local frame axes */ /* probability densities for local frame axes */
float pdf_N = pick_pdf_N * fabsf(dot(disk_N, bsd->Ng)); float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
float pdf_T = pick_pdf_T * fabsf(dot(disk_T, bsd->Ng)); float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
float pdf_B = pick_pdf_B * fabsf(dot(disk_B, bsd->Ng)); float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
/* multiple importance sample between 3 axes, power heuristic /* multiple importance sample between 3 axes, power heuristic
* found to be slightly better than balance heuristic */ * found to be slightly better than balance heuristic */
float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B); float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B);
/* real distance to sampled point */ /* real distance to sampled point */
float r = len(bsd->P - sd->P); float r = len(hit_P - sd->P);
/* evaluate */ /* evaluate */
float w = mis_weight / pdf_N; float w = mis_weight / pdf_N;
if(num_hits > BSSRDF_MAX_HITS) if(ss_isect->num_hits > BSSRDF_MAX_HITS)
w *= num_hits/(float)BSSRDF_MAX_HITS; w *= ss_isect->num_hits/(float)BSSRDF_MAX_HITS;
eval = subsurface_scatter_eval(bsd, sc, disk_r, r, all) * w; float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
/* optionally blur colors and bump mapping */ ss_isect->weight[hit] = eval;
float3 N = bsd->N;
subsurface_color_bump_blur(kg, sd, bsd, state_flag, &eval, &N);
/* setup diffuse bsdf */
subsurface_scatter_setup_diffuse_bsdf(bsd, eval, true, N);
} }
return num_eval_hits; return num_eval_hits;
} }
/* Turn one recorded subsurface hit into a shading point with a diffuse BSDF.
 *
 * ss_isect   - result filled in by subsurface_scatter_multi_intersect(); its
 *              ray, hits[] and weight[] entries are read here.
 * hit        - index of the recorded hit to set up.
 * sd         - shader data which is overwritten with the new shading point.
 * state_flag - forwarded to subsurface_color_bump_blur().
 * sc, all    - not referenced by this function's body.
 */
ccl_device void subsurface_scatter_multi_setup(KernelGlobals *kg,
                                               SubsurfaceIntersection* ss_isect,
                                               int hit,
                                               ShaderData *sd,
                                               int state_flag,
                                               ShaderClosure *sc,
                                               bool all)
{
	/* Rebuild the shading point from the stored intersection and the ray
	 * which produced it. This has to happen before sd->N is read below.
	 */
	Intersection *isect = &ss_isect->hits[hit];
	Ray *ray = &ss_isect->ray;
	shader_setup_from_subsurface(kg, sd, isect, ray);

	/* Start from the weight stored for this hit and the freshly set up
	 * shading normal; both are handed by pointer to the color/bump blur step.
	 */
	float3 eval = ss_isect->weight[hit];
	float3 shading_N = sd->N;
	subsurface_color_bump_blur(kg, sd, state_flag, &eval, &shading_N);

	/* Finish by setting up the diffuse BSDF closure on the shading point. */
	subsurface_scatter_setup_diffuse_bsdf(sd, eval, true, shading_N);
}
/* subsurface scattering step, from a point on the surface to another nearby point on the same object */ /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
{ {
float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 eval = make_float3(0.0f, 0.0f, 0.0f);
uint num_hits = 0;
/* pick random axis in local frame and point on disk */ /* pick random axis in local frame and point on disk */
float3 disk_N, disk_T, disk_B; float3 disk_N, disk_T, disk_B;
@ -368,15 +400,15 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
/* intersect with the same object. if multiple intersections are /* intersect with the same object. if multiple intersections are
* found it will randomly pick one of them */ * found it will randomly pick one of them */
Intersection isect; SubsurfaceIntersection ss_isect;
num_hits = scene_intersect_subsurface(kg, &ray, &isect, sd->object, lcg_state, 1); scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1);
/* evaluate bssrdf */ /* evaluate bssrdf */
if(num_hits > 0) { if(ss_isect.num_hits > 0) {
float3 origP = sd->P; float3 origP = sd->P;
/* setup new shading point */ /* setup new shading point */
shader_setup_from_subsurface(kg, sd, &isect, &ray); shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray);
/* probability densities for local frame axes */ /* probability densities for local frame axes */
float pdf_N = pick_pdf_N * fabsf(dot(disk_N, sd->Ng)); float pdf_N = pick_pdf_N * fabsf(dot(disk_N, sd->Ng));
@ -391,16 +423,16 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
float r = len(sd->P - origP); float r = len(sd->P - origP);
/* evaluate */ /* evaluate */
float w = (mis_weight * num_hits) / pdf_N; float w = (mis_weight * ss_isect.num_hits) / pdf_N;
eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w; eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
} }
/* optionally blur colors and bump mapping */ /* optionally blur colors and bump mapping */
float3 N = sd->N; float3 N = sd->N;
subsurface_color_bump_blur(kg, sd, sd, state_flag, &eval, &N); subsurface_color_bump_blur(kg, sd, state_flag, &eval, &N);
/* setup diffuse bsdf */ /* setup diffuse bsdf */
subsurface_scatter_setup_diffuse_bsdf(sd, eval, (num_hits > 0), N); subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N);
} }
CCL_NAMESPACE_END CCL_NAMESPACE_END

@ -520,6 +520,18 @@ typedef ccl_addr_space struct Intersection {
#endif #endif
} Intersection; } Intersection;
/* Subsurface Intersection result.
 *
 * Bundles everything a subsurface scattering query produces, so callers
 * carry a single structure around.
 *
 * Declared with a typedef so the bare name `SubsurfaceIntersection` used
 * throughout the kernel is valid without the `struct` keyword, matching
 * the way `Intersection` is declared.
 */
typedef struct SubsurfaceIntersection
{
	/* Ray which was cast to find the hits. */
	Ray ray;
	/* Per-hit weight, indexed the same way as hits[]. */
	float3 weight[BSSRDF_MAX_HITS];

	/* Total number of hits counted during traversal. It is reset to zero
	 * when traversal starts and incremented on every hit, so it may be
	 * larger than the number of entries stored in hits[]; the stored
	 * entries are then a random subset (reservoir sampling).
	 */
	int num_hits;
	/* Recorded intersections. */
	struct Intersection hits[BSSRDF_MAX_HITS];
	/* Geometric normal of every recorded hit, as computed from the
	 * triangle vertices by the intersection routines.
	 */
	float3 Ng[BSSRDF_MAX_HITS];
} SubsurfaceIntersection;
/* Primitives */ /* Primitives */
typedef enum PrimitiveType { typedef enum PrimitiveType {