blender/intern/cycles/kernel/kernel_random.h
George Kyriazis 7f4479da42 Cycles: OpenCL kernel split
This commit contains all the work related to the AMD megakernel split,
which was mainly done by Varun Sundar, George Kyriazis and Lenny Wang, plus
some help from Sergey Sharybin, Martijn Berger, Thomas Dinges and likely
someone else whom we're forgetting to mention.

Currently only AMD cards are enabled for the new split kernel, but it is
possible to force the split OpenCL kernel to be used by setting the following
environment variable: CYCLES_OPENCL_SPLIT_KERNEL_TEST=1.

Not all features are supported yet: there is no motion blur, camera blur,
SSS or volumetrics for now. Transparent shadows are also disabled on AMD
devices because of a compiler bug.

This kernel also only implements regular path tracing; supporting the
branched one will take a bit longer. Branched path tracing is still exposed
in the interface, which is a bit misleading and will be hidden soon.

More features will be enabled once they're ported to the split kernel and
tested.

Neither regular CPU nor CUDA is affected: they generate exactly the same
code as before, which means no regressions or improvements there.

Based on the research paper:

  https://research.nvidia.com/sites/default/files/publications/laine2013hpg_paper.pdf

Here's the documentation:

  https://docs.google.com/document/d/1LuXW-CV-sVJkQaEGZlMJ86jZ8FmoPfecaMdR-oiWbUY/edit

Design discussion of the patch:

  https://developer.blender.org/T44197

Differential Revision: https://developer.blender.org/D1200
2015-05-09 19:52:40 +05:00

/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "kernel_jitter.h"

CCL_NAMESPACE_BEGIN

#ifdef __SOBOL__

/* skip initial numbers that are not as well distributed, especially the
 * first sequence is just 0 everywhere, which can be problematic for e.g.
 * path termination */
#define SOBOL_SKIP 64

/* High Dimensional Sobol */

/* van der corput radical inverse */
ccl_device uint van_der_corput(uint bits)
{
	bits = (bits << 16) | (bits >> 16);
	bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8);
	bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4);
	bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2);
	bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1);
	return bits;
}
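
/* For example, van_der_corput(1) mirrors the lowest bit into the highest:
 * 0x00000001 -> 0x80000000, i.e. 0.5 when mapped to [0, 1); inputs 2 and 3
 * map to 0x40000000 (0.25) and 0xC0000000 (0.75) respectively. */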

/* sobol radical inverse */
ccl_device uint sobol(uint i)
{
	uint r = 0;

	for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1)
		if(i & 1)
			r ^= v;

	return r;
}

/* inverse of sobol radical inverse */
ccl_device uint sobol_inverse(uint i)
{
	const uint msb = 1U << 31;
	uint r = 0;

	for(uint v = 1; i; i <<= 1, v ^= v << 1)
		if(i & msb)
			r ^= v;

	return r;
}
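
/* Spot checks: sobol(0) == 0 (hence SOBOL_SKIP above), sobol(1) == 0x80000000
 * and sobol(2) == 0xC0000000; sobol_inverse() undoes the mapping, so
 * sobol_inverse(sobol(i)) == i. */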

/* multidimensional sobol with generator matrices
 * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */
ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
{
	uint result = 0;
	uint i = index;

	for(uint j = 0; i; i >>= 1, j++)
		if(i & 1)
			result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j);

	return result;
}
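
/* For example, index 5 (binary 101) XORs direction vectors 0 and 2 of the
 * requested dimension; the table stores 32 direction vectors per dimension. */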

/* lookup index and x/y coordinate, assumes m is a power of two */
ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y)
{
	/* shift is constant per frame */
	const uint shift = frame << (m << 1);
	const uint sobol_shift = sobol(shift);
	/* van der Corput is its own inverse */
	const uint lower = van_der_corput(ex << (32 - m));
	/* need to compensate for ey difference and shift */
	const uint sobol_lower = sobol(lower);
	const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */
	const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask;
	/* only use m upper bits for the index (m is a power of two) */
	const uint sobol_result = delta | (delta >> m);
	const uint upper = sobol_inverse(sobol_result);
	const uint index = shift | upper | lower;

	*x = van_der_corput(index);
	*y = sobol_shift ^ sobol_result ^ sobol_lower;

	return index;
}
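
#if 0
/* Usage sketch (not compiled, values assumed for illustration): with a
 * 16x16 resolution (m = 4), find the index of the sample that lands in
 * pixel (3, 7) on frame 2, plus its full-precision coordinates. */
uint px, py;
uint index = sobol_lookup(4, 2, 3, 7, &px, &py);
#endif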

ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
{
#ifdef __CMJ__
	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
		/* correlated multi-jittered */
		int p = *rng + dimension;
		return cmj_sample_1D(sample, num_samples, p);
	}
#endif

#ifdef __SOBOL_FULL_SCREEN__
	uint result = sobol_dimension(kg, *rng, dimension);
	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
	return r;
#else
	/* compute sobol sequence value using direction vectors */
	uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
	float r = (float)result * (1.0f/(float)0xFFFFFFFF);

	/* Cranley-Patterson rotation using rng seed */
	float shift;

	/* using the same *rng value to offset seems to give correlation issues,
	 * we could hash it with the dimension but this has a performance impact,
	 * we need to find a solution for this */
	if(dimension & 1)
		shift = (*rng >> 16) * (1.0f/(float)0xFFFF);
	else
		shift = (*rng & 0xFFFF) * (1.0f/(float)0xFFFF);
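
	/* Wrap the rotated value back into [0, 1). Example with assumed values:
	 * r = 0.9 and shift = 0.3 give 1.2 - floorf(1.2) = 0.2. */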
	return r + shift - floorf(r + shift);
#endif
}

ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
{
#ifdef __CMJ__
	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
		/* correlated multi-jittered */
		int p = *rng + dimension;
		cmj_sample_2D(sample, num_samples, p, fx, fy);
	}
	else
#endif
	{
		/* sobol */
		*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
		*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
	}
}

ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
{
#ifdef __SOBOL_FULL_SCREEN__
	uint px, py;
	uint bits = 16; /* limits us to 65536x65536 and 65536 samples */
	uint size = 1 << bits;
	uint frame = sample;

	*rng = sobol_lookup(bits, frame, x, y, &px, &py);
	*rng ^= kernel_data.integrator.seed;

	if(sample == 0) {
		*fx = 0.5f;
		*fy = 0.5f;
	}
	else {
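		/* size*(px / 2^32) maps the 32-bit Sobol coordinate back into
		 * pixel space; subtracting the integer pixel coordinate leaves
		 * the sub-pixel filter offset in [0, 1). */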
		*fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x;
		*fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y;
	}
#else
	*rng = *rng_state;
	*rng ^= kernel_data.integrator.seed;

	if(sample == 0) {
		*fx = 0.5f;
		*fy = 0.5f;
	}
	else {
		path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
	}
#endif
}

ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
{
	/* nothing to do */
}

#else

/* Linear Congruential Generator */

ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
{
	/* implicit mod 2^32 */
	rng = (1103515245*(rng) + 12345);
	return (float)rng * (1.0f/(float)0xFFFFFFFF);
}

ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
{
	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
}

ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
{
	/* load state */
	*rng = *rng_state;
	*rng ^= kernel_data.integrator.seed;

	if(sample == 0) {
		*fx = 0.5f;
		*fy = 0.5f;
	}
	else {
		/* dereference: this overload of path_rng_2D() takes RNG& */
		path_rng_2D(kg, *rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
	}
}

ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
{
	/* store state for next sample */
	*rng_state = rng;
}

#endif

/* Linear Congruential Generator */

ccl_device uint lcg_step_uint(uint *rng)
{
	/* implicit mod 2^32 */
	*rng = (1103515245*(*rng) + 12345);
	return *rng;
}

ccl_device float lcg_step_float(uint *rng)
{
	/* implicit mod 2^32 */
	*rng = (1103515245*(*rng) + 12345);
	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
}

ccl_device uint lcg_init(uint seed)
{
	uint rng = seed;
	lcg_step_uint(&rng);
	return rng;
}
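
#if 0
/* Usage sketch (not compiled): seed a small per-sample generator and draw a
 * couple of floats in [0, 1). The seed expression is an assumed example, not
 * how the kernel derives its seeds. */
uint rng = lcg_init(pixel_hash + sample); /* hypothetical seed value */
float u = lcg_step_float(&rng);
float v = lcg_step_float(&rng);
#endif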

/* Path Tracing Utility Functions
 *
 * For each random number in each step of the path we must have a unique
 * dimension to avoid using the same sequence twice.
 *
 * For branches in the path we must be careful not to reuse the same number
 * in a sequence and offset accordingly. */

ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
{
	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
}

ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
{
	/* the rng_offset is not increased for transparent bounces. if we did,
	 * fully transparent objects could become subtly visible due to the
	 * different sampling patterns used where the transparent object is.
	 *
	 * however, for random numbers that determine whether the next bounce
	 * is transparent we do need to increase the offset to avoid always
	 * making the same decision */
	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
}

ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
{
	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
}

ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
{
	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
}

ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
{
	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
}

ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
{
	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
}

ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches)
{
	/* path is splitting into a branch, adjust so that each branch
	 * still gets a unique sample from the same sequence */
	state->rng_offset += PRNG_BOUNCE_NUM;
	state->sample = state->sample*num_branches + branch;
	state->num_samples = state->num_samples*num_branches;
}
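
/* For example, splitting sample 3 of 8 into num_branches = 4 gives the
 * branches samples 12..15 out of 32, so each branch advances through its
 * own unique subsequence. */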

ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
{
	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
}

CCL_NAMESPACE_END