Cycles: add Optix support in the kernel

This adds all the kernel side changes for the Optix backend.

Ref D5363
This commit is contained in:
Patrick Mours 2019-09-12 14:46:47 +02:00 committed by Brecht Van Lommel
parent 7eb293a37b
commit 53932f1f06
10 changed files with 828 additions and 200 deletions

@ -33,49 +33,51 @@ CCL_NAMESPACE_BEGIN
#include "kernel/bvh/bvh_types.h"
#ifndef __KERNEL_OPTIX__
/* Common QBVH functions. */
#ifdef __QBVH__
# ifdef __QBVH__
# include "kernel/bvh/qbvh_nodes.h"
# ifdef __KERNEL_AVX2__
# include "kernel/bvh/obvh_nodes.h"
# endif
#endif
# endif
/* Regular BVH traversal */
#include "kernel/bvh/bvh_nodes.h"
# include "kernel/bvh/bvh_nodes.h"
#define BVH_FUNCTION_NAME bvh_intersect
#define BVH_FUNCTION_FEATURES 0
#include "kernel/bvh/bvh_traversal.h"
# define BVH_FUNCTION_NAME bvh_intersect
# define BVH_FUNCTION_FEATURES 0
# include "kernel/bvh/bvh_traversal.h"
#if defined(__INSTANCING__)
# if defined(__INSTANCING__)
# define BVH_FUNCTION_NAME bvh_intersect_instancing
# define BVH_FUNCTION_FEATURES BVH_INSTANCING
# include "kernel/bvh/bvh_traversal.h"
#endif
# endif
#if defined(__HAIR__)
# if defined(__HAIR__)
# define BVH_FUNCTION_NAME bvh_intersect_hair
# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
# include "kernel/bvh/bvh_traversal.h"
#endif
# endif
#if defined(__OBJECT_MOTION__)
# if defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_motion
# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION
# include "kernel/bvh/bvh_traversal.h"
#endif
# endif
#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
# if defined(__HAIR__) && defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_hair_motion
# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION
# include "kernel/bvh/bvh_traversal.h"
#endif
# endif
/* Subsurface scattering BVH traversal */
#if defined(__BVH_LOCAL__)
# if defined(__BVH_LOCAL__)
# define BVH_FUNCTION_NAME bvh_intersect_local
# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "kernel/bvh/bvh_local.h"
@ -85,11 +87,11 @@ CCL_NAMESPACE_BEGIN
# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
# include "kernel/bvh/bvh_local.h"
# endif
#endif /* __BVH_LOCAL__ */
# endif /* __BVH_LOCAL__ */
/* Volume BVH traversal */
#if defined(__VOLUME__)
# if defined(__VOLUME__)
# define BVH_FUNCTION_NAME bvh_intersect_volume
# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "kernel/bvh/bvh_volume.h"
@ -105,11 +107,11 @@ CCL_NAMESPACE_BEGIN
# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR
# include "kernel/bvh/bvh_volume.h"
# endif
#endif /* __VOLUME__ */
# endif /* __VOLUME__ */
/* Record all intersections - Shadow BVH traversal */
#if defined(__SHADOW_RECORD_ALL__)
# if defined(__SHADOW_RECORD_ALL__)
# define BVH_FUNCTION_NAME bvh_intersect_shadow_all
# define BVH_FUNCTION_FEATURES 0
# include "kernel/bvh/bvh_shadow_all.h"
@ -137,11 +139,11 @@ CCL_NAMESPACE_BEGIN
# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION
# include "kernel/bvh/bvh_shadow_all.h"
# endif
#endif /* __SHADOW_RECORD_ALL__ */
# endif /* __SHADOW_RECORD_ALL__ */
/* Record all intersections - Volume BVH traversal */
#if defined(__VOLUME_RECORD_ALL__)
# if defined(__VOLUME_RECORD_ALL__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_all
# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "kernel/bvh/bvh_volume_all.h"
@ -157,12 +159,14 @@ CCL_NAMESPACE_BEGIN
# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR
# include "kernel/bvh/bvh_volume_all.h"
# endif
#endif /* __VOLUME_RECORD_ALL__ */
# endif /* __VOLUME_RECORD_ALL__ */
#undef BVH_FEATURE
#undef BVH_NAME_JOIN
#undef BVH_NAME_EVAL
#undef BVH_FUNCTION_FULL_NAME
# undef BVH_FEATURE
# undef BVH_NAME_JOIN
# undef BVH_NAME_EVAL
# undef BVH_FUNCTION_FULL_NAME
#endif /* __KERNEL_OPTIX__ */
ccl_device_inline bool scene_intersect_valid(const Ray *ray)
{
@ -173,8 +177,10 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray)
* such cases.
* From production scenes so far it seems it's enough to test first element
* only.
* Scene intersection may also called with empty rays for conditional trace
* calls that evaluate to false, so filter those out.
*/
return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x);
return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
}
ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
@ -184,10 +190,46 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
{
PROFILING_INIT(kg, PROFILING_INTERSECT);
#ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
uint p2 = 0;
uint p3 = 0;
uint p4 = visibility;
uint p5 = PRIMITIVE_NONE;
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
ray->P,
ray->D,
0.0f,
ray->t,
ray->time,
0xFF,
OPTIX_RAY_FLAG_NONE,
0,
0,
0, // SBT offset for PG_HITD
p0,
p1,
p2,
p3,
p4,
p5);
isect->t = __uint_as_float(p0);
isect->u = __uint_as_float(p1);
isect->v = __uint_as_float(p2);
isect->prim = p3;
isect->object = p4;
isect->type = p5;
return p5 != PRIMITIVE_NONE;
#else /* __KERNEL_OPTIX__ */
if (!scene_intersect_valid(ray)) {
return false;
}
#ifdef __EMBREE__
# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
isect->t = ray->t;
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
@ -202,8 +244,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
}
return false;
}
#endif /* __EMBREE__ */
#ifdef __OBJECT_MOTION__
# endif /* __EMBREE__ */
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
@ -213,31 +256,29 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
return bvh_intersect_motion(kg, ray, isect, visibility);
}
#endif /* __OBJECT_MOTION__ */
# endif /* __OBJECT_MOTION__ */
#ifdef __HAIR__
# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
return bvh_intersect_hair(kg, ray, isect, visibility);
}
#endif /* __HAIR__ */
#ifdef __KERNEL_CPU__
# endif /* __HAIR__ */
# ifdef __KERNEL_CPU__
# ifdef __INSTANCING__
if (kernel_data.bvh.have_instancing) {
return bvh_intersect_instancing(kg, ray, isect, visibility);
}
# endif /* __INSTANCING__ */
return bvh_intersect(kg, ray, isect, visibility);
#else /* __KERNEL_CPU__ */
# else /* __KERNEL_CPU__ */
# ifdef __INSTANCING__
return bvh_intersect_instancing(kg, ray, isect, visibility);
# else
return bvh_intersect(kg, ray, isect, visibility);
# endif /* __INSTANCING__ */
#endif /* __KERNEL_CPU__ */
# endif /* __KERNEL_CPU__ */
#endif /* __KERNEL_OPTIX__ */
}
#ifdef __BVH_LOCAL__
@ -250,10 +291,42 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
{
PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF;
uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF;
uint p2 = ((uint64_t)local_isect) & 0xFFFFFFFF;
uint p3 = (((uint64_t)local_isect) >> 32) & 0xFFFFFFFF;
uint p4 = local_object;
// Is set to zero on miss or if ray is aborted, so can be used as return value
uint p5 = max_hits;
local_isect->num_hits = 0; // Initialize hit count to zero
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
ray->P,
ray->D,
0.0f,
ray->t,
ray->time,
// Need to always call into __anyhit__kernel_optix_local_hit
0xFF,
OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
1,
0,
0, // SBT offset for PG_HITL
p0,
p1,
p2,
p3,
p4,
p5);
return p5;
# else /* __KERNEL_OPTIX__ */
if (!scene_intersect_valid(ray)) {
local_isect->num_hits = 0;
return false;
}
# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SSS);
@ -297,12 +370,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
return local_isect->num_hits > 0;
}
# endif /* __EMBREE__ */
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
}
# endif /* __OBJECT_MOTION__ */
return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
# endif /* __KERNEL_OPTIX__ */
}
#endif
@ -316,10 +391,40 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
{
PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)isect) & 0xFFFFFFFF;
uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF;
uint p3 = max_hits;
uint p4 = visibility;
uint p5 = false;
*num_hits = 0; // Initialize hit count to zero
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
ray->P,
ray->D,
0.0f,
ray->t,
ray->time,
// Need to always call into __anyhit__kernel_optix_shadow_all_hit
0xFF,
OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
2,
0,
0, // SBT offset for PG_HITS
p0,
p1,
*num_hits,
p3,
p4,
p5);
return p5;
# else /* __KERNEL_OPTIX__ */
if (!scene_intersect_valid(ray)) {
*num_hits = 0;
return false;
}
# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
@ -337,7 +442,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
*num_hits = ctx.num_hits;
return rtc_ray.tfar == -INFINITY;
}
# endif
# endif /* __EMBREE__ */
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
# ifdef __HAIR__
@ -356,13 +462,21 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
}
# endif /* __HAIR__ */
# ifdef __KERNEL_CPU__
# ifdef __INSTANCING__
if (kernel_data.bvh.have_instancing) {
return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits);
}
# endif /* __INSTANCING__ */
return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits);
# else
# ifdef __INSTANCING__
return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits);
# else
return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits);
# endif /* __INSTANCING__ */
# endif /* __KERNEL_CPU__ */
# endif /* __KERNEL_OPTIX__ */
}
#endif /* __SHADOW_RECORD_ALL__ */
@ -374,6 +488,42 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
{
PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
# ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
uint p2 = 0;
uint p3 = 0;
uint p4 = visibility;
uint p5 = PRIMITIVE_NONE;
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
ray->P,
ray->D,
0.0f,
ray->t,
ray->time,
// Visibility mask set to only intersect objects with volumes
0x02,
OPTIX_RAY_FLAG_NONE,
0,
0,
0, // SBT offset for PG_HITD
p0,
p1,
p2,
p3,
p4,
p5);
isect->t = __uint_as_float(p0);
isect->u = __uint_as_float(p1);
isect->v = __uint_as_float(p2);
isect->prim = p3;
isect->object = p4;
isect->type = p5;
return p5 != PRIMITIVE_NONE;
# else /* __KERNEL_OPTIX__ */
if (!scene_intersect_valid(ray)) {
return false;
}
@ -398,6 +548,7 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
return bvh_intersect_volume(kg, ray, isect, visibility);
# endif /* __INSTANCING__ */
# endif /* __KERNEL_CPU__ */
# endif /* __KERNEL_OPTIX__ */
}
#endif /* __VOLUME__ */
@ -413,6 +564,7 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
if (!scene_intersect_valid(ray)) {
return false;
}
# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);

@ -38,12 +38,14 @@ ccl_device_forceinline bool cardinal_curve_intersect(KernelGlobals *kg,
{
const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
# ifndef __KERNEL_OPTIX__ /* see OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */
if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
if (time < prim_time.x || time > prim_time.y) {
return false;
}
}
# endif
int segment = PRIMITIVE_UNPACK_SEGMENT(type);
float epsilon = 0.0f;
@ -505,12 +507,14 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
# ifndef __KERNEL_OPTIX__ /* see OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */
if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
if (time < prim_time.x || time > prim_time.y) {
return false;
}
}
# endif
int segment = PRIMITIVE_UNPACK_SEGMENT(type);
/* curve Intersection check */

@ -0,0 +1,89 @@
/*
* Copyright 2019, NVIDIA Corporation.
* Copyright 2019, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __KERNEL_COMPAT_OPTIX_H__
#define __KERNEL_COMPAT_OPTIX_H__
#define OPTIX_DONT_INCLUDE_CUDA
#include <optix.h>
#define __KERNEL_GPU__
#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too
#define __KERNEL_OPTIX__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
#ifndef ATTR_FALLTHROUGH
# define ATTR_FALLTHROUGH
#endif
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef unsigned short half;
typedef unsigned long long CUtexObject;
#define FLT_MIN 1.175494350822287507969e-38f
#define FLT_MAX 340282346638528859811704183484516925440.0f
__device__ half __float2half(const float f)
{
half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
return val;
}
/* Selective nodes compilation. */
#ifndef __NODES_MAX_GROUP__
# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
#endif
#ifndef __NODES_FEATURES__
# define __NODES_FEATURES__ NODE_FEATURE_ALL
#endif
#define ccl_device \
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
#define ccl_constant const
#define ccl_local
#define ccl_local_param
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_ref
#define ccl_align(n) __align__(n)
// Zero initialize structs to help the compiler figure out scoping
#define ccl_optional_struct_init = {}
#define kernel_data __params.data // See kernel_globals.h
#define kernel_tex_array(t) __params.t
#define kernel_tex_fetch(t, index) __params.t[(index)]
#define kernel_assert(cond)
/* Types */
#include "util/util_half.h"
#include "util/util_types.h"
#endif /* __KERNEL_COMPAT_OPTIX_H__ */

@ -90,12 +90,43 @@ typedef struct KernelGlobals {
#endif /* __KERNEL_CPU__ */
#ifdef __KERNEL_OPTIX__
typedef struct ShaderParams {
uint4 *input;
float4 *output;
int type;
int filter;
int sx;
int offset;
int sample;
} ShaderParams;
typedef struct KernelParams {
WorkTile tile;
KernelData data;
ShaderParams shader;
# define KERNEL_TEX(type, name) const type *name;
# include "kernel/kernel_textures.h"
} KernelParams;
typedef struct KernelGlobals {
# ifdef __VOLUME__
VolumeState volume_state;
# endif
Intersection hits_stack[64];
} KernelGlobals;
extern "C" __constant__ KernelParams __params;
#else /* __KERNEL_OPTIX__ */
/* For CUDA, constant memory textures must be globals, so we can't put them
* into a struct. As a result we don't actually use this struct and use actual
* globals and simply pass along a NULL pointer everywhere, which we hope gets
* optimized out. */
#ifdef __KERNEL_CUDA__
# ifdef __KERNEL_CUDA__
__constant__ KernelData __data;
typedef struct KernelGlobals {
@ -106,7 +137,9 @@ typedef struct KernelGlobals {
# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
# include "kernel/kernel_textures.h"
#endif /* __KERNEL_CUDA__ */
# endif /* __KERNEL_CUDA__ */
#endif /* __KERNEL_OPTIX__ */
/* OpenCL */

@ -326,7 +326,13 @@ ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
return true;
}
ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
#ifdef __KERNEL_OPTIX__
ccl_device_inline /* inline trace calls */
#else
ccl_device_noinline
#endif
void
kernel_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
@ -655,9 +661,11 @@ ccl_device void kernel_path_trace(
kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
# ifndef __KERNEL_OPTIX__
if (ray.t == 0.0f) {
return;
}
# endif
/* Initialize state. */
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
@ -671,6 +679,13 @@ ccl_device void kernel_path_trace(
PathState state;
path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
# ifdef __KERNEL_OPTIX__
/* Force struct into local memory to avoid costly spilling on trace calls. */
if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */
for (int i = 0; i < sizeof(L); ++i)
reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0;
# endif
/* Integrate. */
kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd);

@ -48,7 +48,13 @@ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd
}
#endif
ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
#ifdef __KERNEL_OPTIX__
ccl_device_inline
#else
ccl_device_noinline
#endif
void
shader_setup_from_ray(KernelGlobals *kg,
ShaderData *sd,
const Intersection *isect,
const Ray *ray)

@ -17,13 +17,6 @@
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__
typedef struct VolumeState {
# ifdef __SPLIT_KERNEL__
# else
PathState ps;
# endif
} VolumeState;
/* Get PathState ready for use for volume stack evaluation. */
# ifdef __SPLIT_KERNEL__
ccl_addr_space
@ -55,12 +48,11 @@ ccl_addr_space
/* Attenuate throughput accordingly to the given intersection event.
* Returns true if the throughput is zero and traversal can be aborted.
*/
ccl_device_forceinline bool shadow_handle_transparent_isect(
KernelGlobals *kg,
ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
#ifdef __VOLUME__
ccl_addr_space struct PathState *volume_state,
ccl_addr_space PathState *volume_state,
#endif
Intersection *isect,
Ray *ray,
@ -163,7 +155,11 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
uint num_hits;
const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits);
# ifdef __VOLUME__
# ifdef __KERNEL_OPTIX__
VolumeState &volume_state = kg->volume_state;
# else
VolumeState volume_state;
# endif
# endif
/* If no opaque surface found but we did find transparent hits,
* shade them.
@ -302,7 +298,11 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg,
float3 *shadow)
{
# ifdef __VOLUME__
# ifdef __KERNEL_OPTIX__
VolumeState &volume_state = kg->volume_state;
# else
VolumeState volume_state;
# endif
# endif
if (blocked && is_transparent_isect) {
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
@ -387,32 +387,38 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
ShaderData *sd,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
Ray *ray_input,
Ray *ray,
float3 *shadow)
{
Ray *ray = ray_input;
Intersection isect;
/* Some common early checks. */
*shadow = make_float3(1.0f, 1.0f, 1.0f);
#if !defined(__KERNEL_OPTIX__)
/* Some common early checks.
* Avoid conditional trace call in OptiX though, since those hurt performance there.
*/
if (ray->t == 0.0f) {
return false;
}
#endif
#ifdef __SHADOW_TRICKS__
const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER :
PATH_RAY_SHADOW;
#else
const uint visibility = PATH_RAY_SHADOW;
#endif
/* Do actual shadow shading. */
/* First of all, we check if integrator requires transparent shadows.
/* Do actual shadow shading.
* First of all, we check if integrator requires transparent shadows.
* if not, we use simplest and fastest ever way to calculate occlusion.
* Do not do this in OptiX to avoid the additional trace call.
*/
#ifdef __TRANSPARENT_SHADOWS__
#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__)
Intersection isect;
# ifdef __TRANSPARENT_SHADOWS__
if (!kernel_data.integrator.transparent_shadows)
#endif
# endif
{
return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow);
}
#endif
#ifdef __TRANSPARENT_SHADOWS__
# ifdef __SHADOW_RECORD_ALL__
/* For the transparent shadows we try to use record-all logic on the
@ -426,7 +432,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
return true;
}
const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
# ifdef __KERNEL_GPU__
# if defined(__KERNEL_GPU__) && !defined(__KERNEL_OPTIX__)
/* On GPU we do tricky with tracing opaque ray first, this avoids speed
* regressions in some files.
*

@ -353,7 +353,13 @@ ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc,
*weight = safe_divide_color(bssrdf->weight, A);
}
ccl_device_noinline bool subsurface_random_walk(KernelGlobals *kg,
#ifdef __KERNEL_OPTIX__
ccl_device_inline /* inline trace calls */
#else
ccl_device_noinline
#endif
bool
subsurface_random_walk(KernelGlobals *kg,
LocalIntersection *ss_isect,
ShaderData *sd,
ccl_addr_space PathState *state,

@ -143,6 +143,13 @@ CCL_NAMESPACE_BEGIN
# endif
#endif /* __KERNEL_CUDA__ */
#ifdef __KERNEL_OPTIX__
# undef __BAKING__
# undef __BRANCHED_PATH__
/* TODO(pmours): Cannot use optixTrace in non-inlined functions */
# undef __SHADER_RAYTRACE__
#endif /* __KERNEL_OPTIX__ */
#ifdef __KERNEL_OPENCL__
#endif /* __KERNEL_OPENCL__ */
@ -1056,6 +1063,15 @@ typedef struct PathState {
#endif
} PathState;
#ifdef __VOLUME__
typedef struct VolumeState {
# ifdef __SPLIT_KERNEL__
# else
PathState ps;
# endif
} VolumeState;
#endif
/* Struct to gather multiple nearby intersections. */
typedef struct LocalIntersection {
Ray ray;
@ -1343,9 +1359,12 @@ typedef enum KernelBVHLayout {
BVH_LAYOUT_BVH2 = (1 << 0),
BVH_LAYOUT_BVH4 = (1 << 1),
BVH_LAYOUT_BVH8 = (1 << 2),
BVH_LAYOUT_EMBREE = (1 << 3),
BVH_LAYOUT_OPTIX = (1 << 4),
BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
BVH_LAYOUT_ALL = (unsigned int)(-1),
BVH_LAYOUT_ALL = (unsigned int)(~0u),
} KernelBVHLayout;
typedef struct KernelBVH {
@ -1357,14 +1376,18 @@ typedef struct KernelBVH {
int bvh_layout;
int use_bvh_steps;
/* Embree */
#ifdef __EMBREE__
/* Custom BVH */
#ifdef __KERNEL_OPTIX__
OptixTraversableHandle scene;
#else
# ifdef __EMBREE__
RTCScene scene;
# ifndef __KERNEL_64_BIT__
int pad1;
int pad2;
# endif
# else
int scene, pad2;
# endif
#else
int pad1, pad2;
#endif
} KernelBVH;
static_assert_align(KernelBVH, 16);

@ -0,0 +1,294 @@
/*
* Copyright 2019, NVIDIA Corporation.
* Copyright 2019, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/kernel_compat_optix.h"
#include "util/util_atomic.h"
#include "kernel/kernel_types.h"
#include "kernel/kernel_globals.h"
#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics
#include "kernel/kernel_path.h"
#include "kernel/kernel_bake.h"
template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
{
return (T *)(((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0());
}
template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
{
return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2());
}
template<bool always = false> ccl_device_forceinline uint get_object_id()
{
#ifdef __OBJECT_MOTION__
// Always get the the instance ID from the TLAS
// There might be a motion transform node between TLAS and BLAS which does not have one
uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
#else
uint object = optixGetInstanceId();
#endif
// Choose between always returning object ID or only for instances
if (always)
// Can just remove the high bit since instace always contains object ID
return object & 0x7FFFFF;
// Set to OBJECT_NONE if this is not an instanced object
else if (object & 0x800000)
object = OBJECT_NONE;
return object;
}
extern "C" __global__ void __raygen__kernel_optix_path_trace()
{
KernelGlobals kg; // Allocate stack storage for common data
const uint3 launch_index = optixGetLaunchIndex();
// Keep threads for same pixel together to improve occupancy of warps
uint pixel_offset = launch_index.x / __params.tile.num_samples;
uint sample_offset = launch_index.x % __params.tile.num_samples;
kernel_path_trace(&kg,
__params.tile.buffer,
__params.tile.start_sample + sample_offset,
__params.tile.x + pixel_offset,
__params.tile.y + launch_index.y,
__params.tile.offset,
__params.tile.stride);
}
#ifdef __BAKING__
extern "C" __global__ void __raygen__kernel_optix_bake()
{
KernelGlobals kg;
const ShaderParams &p = __params.shader;
kernel_bake_evaluate(&kg,
p.input,
p.output,
(ShaderEvalType)p.type,
p.filter,
p.sx + optixGetLaunchIndex().x,
p.offset,
p.sample);
}
#endif
extern "C" __global__ void __raygen__kernel_optix_displace()
{
KernelGlobals kg;
const ShaderParams &p = __params.shader;
kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
}
extern "C" __global__ void __raygen__kernel_optix_background()
{
KernelGlobals kg;
const ShaderParams &p = __params.shader;
kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
}
extern "C" __global__ void __miss__kernel_optix_miss()
{
// 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss
optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
optixSetPayload_5(PRIMITIVE_NONE);
}
extern "C" __global__ void __anyhit__kernel_optix_local_hit()
{
#ifdef __BVH_LOCAL__
const uint object = get_object_id<true>();
if (object != optixGetPayload_4() /* local_object */) {
// Only intersect with matching object
return optixIgnoreIntersection();
}
int hit = 0;
uint *const lcg_state = get_payload_ptr_0<uint>();
LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
if (lcg_state) {
const uint max_hits = optixGetPayload_5();
for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
if (optixGetRayTmax() == local_isect->hits[i].t) {
return optixIgnoreIntersection();
}
}
hit = local_isect->num_hits++;
if (local_isect->num_hits > max_hits) {
hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
if (hit >= max_hits) {
return optixIgnoreIntersection();
}
}
}
else {
if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
// Record closest intersection only (do not terminate ray here, since there is no guarantee
// about distance ordering in anyhit)
return optixIgnoreIntersection();
}
local_isect->num_hits = 1;
}
Intersection *isect = &local_isect->hits[hit];
isect->t = optixGetRayTmax();
isect->prim = optixGetPrimitiveIndex();
isect->object = get_object_id();
isect->type = kernel_tex_fetch(__prim_type, isect->prim);
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
isect->u = 1.0f - barycentrics.y - barycentrics.x;
isect->v = barycentrics.x;
}
else {
isect->u = __uint_as_float(optixGetAttribute_0());
isect->v = __uint_as_float(optixGetAttribute_1());
}
// Record geometric normal
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0));
const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1));
const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2));
local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
// Continue tracing (without this the trace call would return after the first hit)
optixIgnoreIntersection();
#endif
}
extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
{
#ifdef __SHADOW_RECORD_ALL__
const uint prim = optixGetPrimitiveIndex();
# ifdef __VISIBILITY_FLAG__
const uint visibility = optixGetPayload_4();
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
return optixIgnoreIntersection();
}
# endif
// Offset into array with num_hits
Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2();
isect->t = optixGetRayTmax();
isect->prim = prim;
isect->object = get_object_id();
isect->type = kernel_tex_fetch(__prim_type, prim);
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
isect->u = 1.0f - barycentrics.y - barycentrics.x;
isect->v = barycentrics.x;
}
else {
isect->u = __uint_as_float(optixGetAttribute_0());
isect->v = __uint_as_float(optixGetAttribute_1());
}
# ifdef __TRANSPARENT_SHADOWS__
// Detect if this surface has a shader with transparent shadows
if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= optixGetPayload_3()) {
# endif
// This is an opaque hit or the hit limit has been reached, abort traversal
optixSetPayload_5(true);
return optixTerminateRay();
# ifdef __TRANSPARENT_SHADOWS__
}
// TODO(pmours): Do we need REQUIRE_UNIQUE_ANYHIT for this to work?
optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++
// Continue tracing
optixIgnoreIntersection();
# endif
#endif
}
extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
{
uint visibility = optixGetPayload_4();
#ifdef __VISIBILITY_FLAG__
const uint prim = optixGetPrimitiveIndex();
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0)
return optixIgnoreIntersection();
#endif
// Shadow ray early termination
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return optixTerminateRay();
}
extern "C" __global__ void __closesthit__kernel_optix_hit()
{
optixSetPayload_0(__float_as_uint(optixGetRayTmax())); // Intersection distance
optixSetPayload_3(optixGetPrimitiveIndex());
optixSetPayload_4(get_object_id());
// Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index
optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex()));
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
optixSetPayload_2(__float_as_uint(barycentrics.x));
}
else {
optixSetPayload_1(optixGetAttribute_0());
optixSetPayload_2(optixGetAttribute_1());
}
}
#ifdef __HAIR__
extern "C" __global__ void __intersection__curve()
{
const uint prim = optixGetPrimitiveIndex();
const uint object = get_object_id<true>();
const uint type = kernel_tex_fetch(__prim_type, prim);
const uint visibility = optixGetPayload_4();
const float3 P = optixGetObjectRayOrigin();
const float3 dir = optixGetObjectRayDirection();
# ifdef __OBJECT_MOTION__
const float time = optixGetRayTime();
# else
const float time = 0.0f;
# endif
Intersection isect;
isect.t = optixGetRayTmax();
if (!(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) ?
curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type) :
cardinal_curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
optixReportIntersection(isect.t,
type & PRIMITIVE_ALL,
__float_as_int(isect.u), // Attribute_0
__float_as_int(isect.v)); // Attribute_1
}
}
#endif
#ifdef __KERNEL_DEBUG__
extern "C" __global__ void __exception__kernel_optix_exception()
{
printf("Unhandled exception occured: code %d!\n", optixGetExceptionCode());
}
#endif