Metal: Optimize SSR shader for Apple Silicon
Reduce register spill to global memory in raytrace_resolve function. Results in a 20% uplift for this particular shader on Apple Silicon GPUs. Contributing to 3-5% uplift for scenes which have SSR enabled. This is achieved via reducing memory pressure using a packed data type for the sampling kernel. Authored by Apple: Michael Parkin-White Pull Request: https://projects.blender.org/blender/blender/pulls/106231
This commit is contained in:
parent
dc63f75837
commit
682cb6ecd3
@ -103,6 +103,20 @@ void resolve_reflection_sample(int planar_index,
|
||||
weight_accum += weight;
|
||||
}
|
||||
|
||||
/* NOTE(Metal): For Apple silicon GPUs executing this particular shader, by default, memory read
|
||||
* pressure is high while ALU remains low. Packing the sample data into a smaller format balances
|
||||
* this trade-off by reducing local shader register pressure and expensive memory look-ups into
|
||||
* spilled local shader memory, resulting in an increase in performance of 20% for this shader. */
|
||||
#ifdef GPU_METAL
|
||||
# define SAMPLE_STORAGE_TYPE uchar
|
||||
# define pack_sample(x, y) uchar(((uchar(x + 2)) << uchar(3)) + (uchar(y + 2)))
|
||||
# define unpack_sample(x) vec2((char(x) >> 3) - 2, (char(x) & 7) - 2)
|
||||
#else
|
||||
# define SAMPLE_STORAGE_TYPE vec2
|
||||
# define pack_sample(x, y) SAMPLE_STORAGE_TYPE(x, y)
|
||||
# define unpack_sample(x) x
|
||||
#endif
|
||||
|
||||
void raytrace_resolve(ClosureInputGlossy cl_in,
|
||||
inout ClosureEvalGlossy cl_eval,
|
||||
inout ClosureEvalCommon cl_common,
|
||||
@ -110,55 +124,55 @@ void raytrace_resolve(ClosureInputGlossy cl_in,
|
||||
{
|
||||
/* Note: Reflection samples declared in function scope to avoid per-thread memory pressure on
|
||||
* tile-based GPUs e.g. Apple Silicon. */
|
||||
const vec2 resolve_sample_offsets[36] = vec2[36](
|
||||
const SAMPLE_STORAGE_TYPE resolve_sample_offsets[36] = SAMPLE_STORAGE_TYPE[36](
|
||||
/* Set 1. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(0, 0),
|
||||
pack_sample(0, 0),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(-1, 3),
|
||||
vec2(1, 3),
|
||||
vec2(-1, 1),
|
||||
vec2(3, 1),
|
||||
vec2(-2, 0),
|
||||
vec2(3, 0),
|
||||
vec2(2, -1),
|
||||
vec2(1, -2),
|
||||
pack_sample(-1, 3),
|
||||
pack_sample(1, 3),
|
||||
pack_sample(-1, 1),
|
||||
pack_sample(3, 1),
|
||||
pack_sample(-2, 0),
|
||||
pack_sample(3, 0),
|
||||
pack_sample(2, -1),
|
||||
pack_sample(1, -2),
|
||||
/* Set 2. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(1, 1),
|
||||
pack_sample(1, 1),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(-2, 3),
|
||||
vec2(3, 3),
|
||||
vec2(0, 2),
|
||||
vec2(2, 2),
|
||||
vec2(-2, -1),
|
||||
vec2(1, -1),
|
||||
vec2(0, -2),
|
||||
vec2(3, -2),
|
||||
pack_sample(-2, 3),
|
||||
pack_sample(3, 3),
|
||||
pack_sample(0, 2),
|
||||
pack_sample(2, 2),
|
||||
pack_sample(-2, -1),
|
||||
pack_sample(1, -1),
|
||||
pack_sample(0, -2),
|
||||
pack_sample(3, -2),
|
||||
/* Set 3. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(0, 1),
|
||||
pack_sample(0, 1),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(0, 3),
|
||||
vec2(3, 2),
|
||||
vec2(-2, 1),
|
||||
vec2(2, 1),
|
||||
vec2(-1, 0),
|
||||
vec2(-2, -2),
|
||||
vec2(0, -1),
|
||||
vec2(2, -2),
|
||||
pack_sample(0, 3),
|
||||
pack_sample(3, 2),
|
||||
pack_sample(-2, 1),
|
||||
pack_sample(2, 1),
|
||||
pack_sample(-1, 0),
|
||||
pack_sample(-2, -2),
|
||||
pack_sample(0, -1),
|
||||
pack_sample(2, -2),
|
||||
/* Set 4. */
|
||||
/* First Ring (2x2). */
|
||||
vec2(1, 0),
|
||||
pack_sample(1, 0),
|
||||
/* Second Ring (6x6). */
|
||||
vec2(2, 3),
|
||||
vec2(-2, 2),
|
||||
vec2(-1, 2),
|
||||
vec2(1, 2),
|
||||
vec2(2, 0),
|
||||
vec2(-1, -1),
|
||||
vec2(3, -1),
|
||||
vec2(-1, -2));
|
||||
pack_sample(2, 3),
|
||||
pack_sample(-2, 2),
|
||||
pack_sample(-1, 2),
|
||||
pack_sample(1, 2),
|
||||
pack_sample(2, 0),
|
||||
pack_sample(-1, -1),
|
||||
pack_sample(3, -1),
|
||||
pack_sample(-1, -2));
|
||||
|
||||
float roughness = cl_in.roughness;
|
||||
|
||||
@ -208,7 +222,8 @@ void raytrace_resolve(ClosureInputGlossy cl_in,
|
||||
int sample_id = sample_pool * resolve_samples_count + i;
|
||||
vec2 texture_size = vec2(textureSize(hitBuffer, 0));
|
||||
vec2 sample_texel = texture_size * uvcoordsvar.xy * ssrUvScale;
|
||||
vec2 sample_uv = (sample_texel + resolve_sample_offsets[sample_id]) / texture_size;
|
||||
vec2 sample_uv = (sample_texel + unpack_sample(resolve_sample_offsets[sample_id])) /
|
||||
texture_size;
|
||||
|
||||
resolve_reflection_sample(
|
||||
planar_index, sample_uv, vP, vN, vV, roughness_squared, cone_tan, weight_acc, ssr_accum);
|
||||
|
Loading…
Reference in New Issue
Block a user