From 3f8c99510926d5b63de7fb7dddc3d216b3f66966 Mon Sep 17 00:00:00 2001 From: Nikita Sirgienko Date: Thu, 16 Mar 2023 11:56:55 +0100 Subject: [PATCH] Cycles: add hardware raytracing support to oneAPI device Updated Embree 4 library with GPU support is required for it to be compiled - compatiblity with Embree 3 and Embree 4 without GPU support is maintained. Enabling hardware raytracing is an opt-in user setting for now. Pull Request: https://projects.blender.org/blender/blender/pulls/106266 --- intern/cycles/CMakeLists.txt | 3 + intern/cycles/blender/addon/properties.py | 12 ++ intern/cycles/blender/device.cpp | 21 ++- intern/cycles/blender/python.cpp | 8 + intern/cycles/device/device.cpp | 4 +- intern/cycles/device/device.h | 21 +-- intern/cycles/device/kernel.cpp | 8 +- intern/cycles/device/kernel.h | 11 +- intern/cycles/device/metal/device_impl.mm | 4 +- intern/cycles/device/oneapi/device.cpp | 9 +- intern/cycles/device/oneapi/device_impl.cpp | 96 ++++++++++- intern/cycles/device/oneapi/device_impl.h | 21 ++- intern/cycles/kernel/CMakeLists.txt | 58 ++++++- intern/cycles/kernel/device/cpu/bvh.h | 159 +++++++++++++++--- intern/cycles/kernel/device/gpu/kernel.h | 6 + intern/cycles/kernel/device/oneapi/compat.h | 11 +- .../device/oneapi/context_intersect_begin.h | 18 ++ .../device/oneapi/context_intersect_end.h | 7 + intern/cycles/kernel/device/oneapi/globals.h | 2 + intern/cycles/kernel/device/oneapi/kernel.cpp | 82 ++++++++- intern/cycles/kernel/device/oneapi/kernel.h | 6 +- intern/cycles/kernel/types.h | 5 +- intern/cycles/util/vector.h | 1 - 23 files changed, 508 insertions(+), 65 deletions(-) create mode 100644 intern/cycles/kernel/device/oneapi/context_intersect_begin.h create mode 100644 intern/cycles/kernel/device/oneapi/context_intersect_end.h diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index aa9d7f5c53b..f28c14ad4cc 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -281,6 +281,9 @@ endif() if(WITH_CYCLES_EMBREE) add_definitions(-DWITH_EMBREE) + if(WITH_CYCLES_DEVICE_ONEAPI AND EMBREE_SYCL_SUPPORT) + add_definitions(-DWITH_EMBREE_GPU) + endif() add_definitions(-DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION}) include_directories( SYSTEM diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 25079b0c507..79356c1c67e 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1544,6 +1544,13 @@ class CyclesPreferences(bpy.types.AddonPreferences): default=False, ) + use_oneapirt: BoolProperty( + name="Embree on GPU (Experimental)", + description="Embree GPU execution will allow to use hardware ray tracing on Intel GPUs, which will provide better performance. " + "However this support is experimental and some scenes may render incorrectly", + default=False, + ) + kernel_optimization_level: EnumProperty( name="Kernel Optimization", description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. " @@ -1763,6 +1770,11 @@ class CyclesPreferences(bpy.types.AddonPreferences): col.prop(self, "kernel_optimization_level") col.prop(self, "use_metalrt") + if compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu: + row = layout.row() + row.use_property_split = True + row.prop(self, "use_oneapirt") + def draw(self, context): self.draw_impl(self.layout, context) diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp index 9d2fba11c1a..b23549950b1 100644 --- a/intern/cycles/blender/device.cpp +++ b/intern/cycles/blender/device.cpp @@ -112,9 +112,26 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, device.has_peer_memory = false; } - if (get_boolean(cpreferences, "use_metalrt")) { - device.use_metalrt = true; + bool accumulated_use_hardware_raytracing = false; + foreach ( + DeviceInfo &info, + (device.multi_devices.size() != 0 ? device.multi_devices : vector({device}))) { + if (info.type == DEVICE_METAL && !get_boolean(cpreferences, "use_metalrt")) { + info.use_hardware_raytracing = false; + } + + if (info.type == DEVICE_ONEAPI && !get_boolean(cpreferences, "use_oneapirt")) { + info.use_hardware_raytracing = false; + } + + /* There is an accumulative logic here, because Multidevices are support only for + * the same backend + CPU in Blender right now, and both oneAPI and Metal have a + * global boolean backend setting (see above) for enabling/disabling HW RT, + * so all subdevices in the multidevice should enable (or disable) HW RT + * simultaneously (and CPU device are expected to ignore "use_hardware_raytracing" setting) */ + accumulated_use_hardware_raytracing |= info.use_hardware_raytracing; } + device.use_hardware_raytracing = accumulated_use_hardware_raytracing; if (preview) { /* Disable specialization for preview renders. */ diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp index 6ac68035033..682d7075455 100644 --- a/intern/cycles/blender/python.cpp +++ b/intern/cycles/blender/python.cpp @@ -1034,6 +1034,14 @@ void *CCL_python_module_init() Py_INCREF(Py_False); #endif /* WITH_EMBREE */ +#ifdef WITH_EMBREE_GPU + PyModule_AddObject(mod, "with_embree_gpu", Py_True); + Py_INCREF(Py_True); +#else /* WITH_EMBREE_GPU */ + PyModule_AddObject(mod, "with_embree_gpu", Py_False); + Py_INCREF(Py_False); +#endif /* WITH_EMBREE_GPU */ + if (ccl::openimagedenoise_supported()) { PyModule_AddObject(mod, "with_openimagedenoise", Py_True); Py_INCREF(Py_True); diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 0ef080a0c00..0ed03f1e94d 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -354,7 +354,7 @@ DeviceInfo Device::get_multi_device(const vector &subdevices, info.has_guiding = true; info.has_profiling = true; info.has_peer_memory = false; - info.use_metalrt = false; + info.use_hardware_raytracing = false; info.denoisers = DENOISER_ALL; foreach (const DeviceInfo &device, subdevices) { @@ -403,7 +403,7 @@ DeviceInfo Device::get_multi_device(const vector &subdevices, info.has_guiding &= device.has_guiding; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; - info.use_metalrt |= device.use_metalrt; + info.use_hardware_raytracing |= device.use_hardware_raytracing; info.denoisers &= device.denoisers; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 0ff946b3db9..a73dcc4ab1f 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -71,15 +71,16 @@ class DeviceInfo { string description; string id; /* used for user preferences, should stay fixed with changing hardware config */ int num; - bool display_device; /* GPU is used as a display device. */ - bool has_nanovdb; /* Support NanoVDB volumes. */ - bool has_light_tree; /* Support light tree. */ - bool has_osl; /* Support Open Shading Language. */ - bool has_guiding; /* Support path guiding. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ - bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ - bool has_gpu_queue; /* Device supports GPU queue. */ - bool use_metalrt; /* Use MetalRT to accelerate ray queries (Metal only). */ + bool display_device; /* GPU is used as a display device. */ + bool has_nanovdb; /* Support NanoVDB volumes. */ + bool has_light_tree; /* Support light tree. */ + bool has_osl; /* Support Open Shading Language. */ + bool has_guiding; /* Support path guiding. */ + bool has_profiling; /* Supports runtime collection of profiling info. */ + bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ + bool has_gpu_queue; /* Device supports GPU queue. */ + bool use_hardware_raytracing; /* Use hardware ray tracing to accelerate ray queries in a backend. + */ KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing * kernels (Metal only). */ DenoiserTypeMask denoisers; /* Supported denoiser types. */ @@ -101,7 +102,7 @@ class DeviceInfo { has_profiling = false; has_peer_memory = false; has_gpu_queue = false; - use_metalrt = false; + use_hardware_raytracing = false; denoisers = DENOISER_NONE; } diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp index 903ad096586..5e871c00090 100644 --- a/intern/cycles/device/kernel.cpp +++ b/intern/cycles/device/kernel.cpp @@ -3,7 +3,9 @@ #include "device/kernel.h" -#include "util/log.h" +#ifndef __KERNEL_ONEAPI__ +# include "util/log.h" +#endif CCL_NAMESPACE_BEGIN @@ -153,10 +155,13 @@ const char *device_kernel_as_string(DeviceKernel kernel) case DEVICE_KERNEL_NUM: break; }; +#ifndef __KERNEL_ONEAPI__ LOG(FATAL) << "Unhandled kernel " << static_cast(kernel) << ", should never happen."; +#endif return "UNKNOWN"; } +#ifndef __KERNEL_ONEAPI__ std::ostream &operator<<(std::ostream &os, DeviceKernel kernel) { os << device_kernel_as_string(kernel); @@ -178,5 +183,6 @@ string device_kernel_mask_as_string(DeviceKernelMask mask) return str; } +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h index b829a891260..e1cd2b8f092 100644 --- a/intern/cycles/device/kernel.h +++ b/intern/cycles/device/kernel.h @@ -3,11 +3,13 @@ #pragma once -#include "kernel/types.h" +#ifndef __KERNEL_ONEAPI__ +# include "kernel/types.h" -#include "util/string.h" +# include "util/string.h" -#include // NOLINT +# include // NOLINT +#endif CCL_NAMESPACE_BEGIN @@ -15,9 +17,12 @@ bool device_kernel_has_shading(DeviceKernel kernel); bool device_kernel_has_intersection(DeviceKernel kernel); const char *device_kernel_as_string(DeviceKernel kernel); + +#ifndef __KERNEL_ONEAPI__ std::ostream &operator<<(std::ostream &os, DeviceKernel kernel); typedef uint64_t DeviceKernelMask; string device_kernel_mask_as_string(DeviceKernelMask mask); +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 0c805b1ddd8..03a2a2c1f06 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -100,12 +100,12 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile } case METAL_GPU_AMD: { max_threads_per_threadgroup = 128; - use_metalrt = info.use_metalrt; + use_metalrt = info.use_hardware_raytracing; break; } case METAL_GPU_APPLE: { max_threads_per_threadgroup = 512; - use_metalrt = info.use_metalrt; + use_metalrt = info.use_hardware_raytracing; break; } } diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp index aa77cb34482..7fca0fa0cd6 100644 --- a/intern/cycles/device/oneapi/device.cpp +++ b/intern/cycles/device/oneapi/device.cpp @@ -87,7 +87,8 @@ Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &pro } #ifdef WITH_ONEAPI -static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr) +static void device_iterator_cb( + const char *id, const char *name, int num, bool hwrt_support, void *user_ptr) { vector *devices = (vector *)user_ptr; @@ -112,6 +113,12 @@ static void device_iterator_cb(const char *id, const char *name, int num, void * /* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */ info.display_device = false; +# if WITH_EMBREE_GPU + info.use_hardware_raytracing = hwrt_support; +# else + info.use_hardware_raytracing = false; +# endif + devices->push_back(info); VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; } diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp index 0aec8268bd5..3379541a5d8 100644 --- a/intern/cycles/device/oneapi/device_impl.cpp +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -8,7 +8,19 @@ # include "util/debug.h" # include "util/log.h" +# ifdef WITH_EMBREE_GPU +# include "bvh/embree.h" +# endif + # include "kernel/device/oneapi/globals.h" +# include "kernel/device/oneapi/kernel.h" + +# if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION) +/* These declarations are missing from embree headers when compiling from a compiler that doesn't + * support SYCL. */ +extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config); +extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device); +# endif CCL_NAMESPACE_BEGIN @@ -22,16 +34,29 @@ static void queue_error_cb(const char *message, void *user_ptr) OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) : Device(info, stats, profiler), device_queue_(nullptr), +# ifdef WITH_EMBREE_GPU + embree_device(nullptr), + embree_scene(nullptr), +# endif texture_info_(this, "texture_info", MEM_GLOBAL), kg_memory_(nullptr), kg_memory_device_(nullptr), kg_memory_size_(0) { need_texture_info_ = false; + use_hardware_raytracing = info.use_hardware_raytracing; oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_); - bool is_finished_ok = create_queue(device_queue_, info.num); + bool is_finished_ok = create_queue(device_queue_, + info.num, +# ifdef WITH_EMBREE_GPU + use_hardware_raytracing ? &embree_device : nullptr +# else + nullptr +# endif + ); + if (is_finished_ok == false) { set_error("oneAPI queue initialization error: got runtime exception \"" + oneapi_error_string_ + "\""); @@ -42,6 +67,16 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi assert(device_queue_); } +# ifdef WITH_EMBREE_GPU + use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr); +# else + use_hardware_raytracing = false; +# endif + + if (use_hardware_raytracing) { + VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration."; + } + size_t globals_segment_size; is_finished_ok = kernel_globals_size(globals_segment_size); if (is_finished_ok == false) { @@ -64,6 +99,11 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi OneapiDevice::~OneapiDevice() { +# ifdef WITH_EMBREE_GPU + if (embree_device) + rtcReleaseDevice(embree_device); +# endif + texture_info_.free(); usm_free(device_queue_, kg_memory_); usm_free(device_queue_, kg_memory_device_); @@ -82,13 +122,36 @@ bool OneapiDevice::check_peer_access(Device * /*peer_device*/) BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; + return use_hardware_raytracing ? BVH_LAYOUT_EMBREE : BVH_LAYOUT_BVH2; } +# ifdef WITH_EMBREE_GPU +void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ + if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { + BVHEmbree *const bvh_embree = static_cast(bvh); + if (refit) { + bvh_embree->refit(progress); + } + else { + bvh_embree->build(progress, &stats, embree_device); + } + if (bvh->params.top_level) { + embree_scene = bvh_embree->scene; + } + } + else { + Device::build_bvh(bvh, progress, refit); + } +} +# endif + bool OneapiDevice::load_kernels(const uint requested_features) { assert(device_queue_); + kernel_features = requested_features; + bool is_finished_ok = oneapi_run_test_kernel(device_queue_); if (is_finished_ok == false) { set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ + @@ -327,6 +390,16 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size) << string_human_readable_number(size) << " bytes. (" << string_human_readable_size(size) << ")"; +# ifdef WITH_EMBREE_GPU + if (strcmp(name, "data") == 0) { + assert(size <= sizeof(KernelData)); + + /* Update scene handle(since it is different for each device on multi devices) */ + KernelData *const data = (KernelData *)host; + data->device_bvh = embree_scene; + } +# endif + ConstMemMap::iterator i = const_mem_map_.find(name); device_vector *data; @@ -446,7 +519,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_ # endif } -bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index) +bool OneapiDevice::create_queue(SyclQueue *&external_queue, + int device_index, + void *embree_device_pointer) { bool finished_correct = true; try { @@ -457,6 +532,11 @@ bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index) sycl::queue *created_queue = new sycl::queue(devices[device_index], sycl::property::queue::in_order()); external_queue = reinterpret_cast(created_queue); +# ifdef WITH_EMBREE_GPU + if (embree_device_pointer) { + *((RTCDevice *)embree_device_pointer) = rtcNewSYCLDevice(created_queue->get_context(), ""); + } +# endif } catch (sycl::exception const &e) { finished_correct = false; @@ -625,7 +705,8 @@ bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context, size_t global_size, void **args) { - return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args); + return oneapi_enqueue_kernel( + kernel_context, kernel, global_size, kernel_features, use_hardware_raytracing, args); } /* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows @@ -830,12 +911,17 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p std::string name = device.get_info(); # else std::string name = "SYCL Host Task (Debug)"; +# endif +# ifdef WITH_EMBREE_GPU + bool hwrt_support = rtcIsSYCLDeviceSupported(device); +# else + bool hwrt_support = false; # endif std::string id = "ONEAPI_" + platform_name + "_" + name; if (device.has(sycl::aspect::ext_intel_pci_address)) { id.append("_" + device.get_info()); } - (cb)(id.c_str(), name.c_str(), num, user_ptr); + (cb)(id.c_str(), name.c_str(), num, hwrt_support, user_ptr); num++; } } diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h index 197cf03d60d..0a83d199450 100644 --- a/intern/cycles/device/oneapi/device_impl.h +++ b/intern/cycles/device/oneapi/device_impl.h @@ -16,15 +16,16 @@ CCL_NAMESPACE_BEGIN class DeviceQueue; -typedef void (*OneAPIDeviceIteratorCallback)(const char *id, - const char *name, - int num, - void *user_ptr); +typedef void (*OneAPIDeviceIteratorCallback)( + const char *id, const char *name, int num, bool hwrt_support, void *user_ptr); class OneapiDevice : public Device { private: SyclQueue *device_queue_; - +# if WITH_EMBREE_GPU + RTCDevice embree_device; + RTCScene embree_scene; +# endif using ConstMemMap = map *>; ConstMemMap const_mem_map_; device_vector texture_info_; @@ -34,6 +35,8 @@ class OneapiDevice : public Device { size_t kg_memory_size_ = (size_t)0; size_t max_memory_on_device_ = (size_t)0; std::string oneapi_error_string_; + bool use_hardware_raytracing = false; + unsigned int kernel_features = 0; public: virtual BVHLayoutMask get_bvh_layout_mask() const override; @@ -41,10 +44,12 @@ class OneapiDevice : public Device { OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); virtual ~OneapiDevice(); - +# if WITH_EMBREE_GPU + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; +# endif bool check_peer_access(Device *peer_device) override; - bool load_kernels(const uint requested_features) override; + bool load_kernels(const uint kernel_features) override; void load_texture_info(); @@ -114,7 +119,7 @@ class OneapiDevice : public Device { protected: void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host); - bool create_queue(SyclQueue *&external_queue, int device_index); + bool create_queue(SyclQueue *&external_queue, int device_index, void *embree_device); void free_queue(SyclQueue *queue); void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment); void *usm_alloc_device(SyclQueue *queue, size_t memory_size); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index eb3d20f04ad..57ab5beb030 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -96,10 +96,13 @@ set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS device/oneapi/compat.h device/oneapi/context_begin.h device/oneapi/context_end.h + device/oneapi/context_intersect_begin.h + device/oneapi/context_intersect_end.h device/oneapi/globals.h device/oneapi/image.h device/oneapi/kernel.h device/oneapi/kernel_templates.h + device/cpu/bvh.h ) set(SRC_KERNEL_CLOSURE_HEADERS @@ -764,7 +767,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI) # Set defaults for spir64 and spir64_gen options if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64) - set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'") + set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-regular-grf-kernel integrator_intersect -ze-opt-large-grf-kernel shade -ze-opt-no-local-to-generic'") endif() if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen) set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target") @@ -798,6 +801,59 @@ if(WITH_CYCLES_DEVICE_ONEAPI) -I"${NANOVDB_INCLUDE_DIR}") endif() + if(WITH_CYCLES_EMBREE AND EMBREE_SYCL_SUPPORT) + list(APPEND sycl_compiler_flags + -DWITH_EMBREE + -DWITH_EMBREE_GPU + -DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION} + -I"${EMBREE_INCLUDE_DIRS}") + + if(WIN32) + list(APPEND sycl_compiler_flags + -ladvapi32.lib + ) + endif() + + set(next_library_mode "") + foreach(library ${EMBREE_LIBRARIES}) + string(TOLOWER "${library}" library_lower) + if(("${library_lower}" STREQUAL "optimized") OR + ("${library_lower}" STREQUAL "debug")) + set(next_library_mode "${library_lower}") + else() + if(next_library_mode STREQUAL "") + list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library}) + list(APPEND EMBREE_TBB_LIBRARIES_debug ${library}) + else() + list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library}) + endif() + set(next_library_mode "") + endif() + endforeach() + + foreach(library ${TBB_LIBRARIES}) + string(TOLOWER "${library}" library_lower) + if(("${library_lower}" STREQUAL "optimized") OR + ("${library_lower}" STREQUAL "debug")) + set(next_library_mode "${library_lower}") + else() + if(next_library_mode STREQUAL "") + list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library}) + list(APPEND EMBREE_TBB_LIBRARIES_debug ${library}) + else() + list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library}) + endif() + set(next_library_mode "") + endif() + endforeach() + list(APPEND sycl_compiler_flags + "$<$:${EMBREE_TBB_LIBRARIES_optimized}>" + "$<$:${EMBREE_TBB_LIBRARIES_optimized}>" + "$<$:${EMBREE_TBB_LIBRARIES_optimized}>" + "$<$:${EMBREE_TBB_LIBRARIES_debug}>" + ) + endif() + if(WITH_CYCLES_DEBUG) list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG) endif() diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h index a7baec0b718..65aed44dbbe 100644 --- a/intern/cycles/kernel/device/cpu/bvh.h +++ b/intern/cycles/kernel/device/cpu/bvh.h @@ -13,8 +13,13 @@ # include #endif -#include "kernel/device/cpu/compat.h" -#include "kernel/device/cpu/globals.h" +#ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/compat.h" +# include "kernel/device/oneapi/globals.h" +#else +# include "kernel/device/cpu/compat.h" +# include "kernel/device/cpu/globals.h" +#endif #include "kernel/bvh/types.h" #include "kernel/bvh/util.h" @@ -33,11 +38,18 @@ using numhit_t = uint8_t; using numhit_t = uint32_t; #endif -#define CYCLES_EMBREE_USED_FEATURES \ - (RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \ - RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \ - RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \ - RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE) +#ifdef __KERNEL_ONEAPI__ +static constexpr sycl::specialization_id oneapi_embree_features{ + (const RTCFeatureFlags)(0)}; +# define CYCLES_EMBREE_USED_FEATURES \ + (kernel_handler.get_specialization_constant()) +#else +# define CYCLES_EMBREE_USED_FEATURES \ + (RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \ + RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \ + RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \ + RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE) +#endif #define EMBREE_IS_HAIR(x) (x & 1) @@ -252,7 +264,8 @@ ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals kg, * Things like recording subsurface or shadow hits for later evaluation * as well as filtering for volume objects happen here. * Cycles' own BVH does that directly inside the traversal calls. */ -ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args) +ccl_device_forceinline void kernel_embree_filter_intersection_func_impl( + const RTCFilterFunctionNArguments *args) { /* Current implementation in Cycles assumes only single-ray intersection queries. */ assert(args->N == 1); @@ -263,7 +276,11 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA #else CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context); #endif +#ifdef __KERNEL_ONEAPI__ + KernelGlobalsGPU *kg = nullptr; +#else const KernelGlobalsCPU *kg = ctx->kg; +#endif const Ray *cray = ctx->ray; if (kernel_embree_is_self_intersection( @@ -277,7 +294,7 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA * as well as filtering for volume objects happen here. * Cycles' own BVH does that directly inside the traversal calls. */ -ccl_device void kernel_embree_filter_occluded_shadow_all_func( +ccl_device_forceinline void kernel_embree_filter_occluded_shadow_all_func_impl( const RTCFilterFunctionNArguments *args) { /* Current implementation in Cycles assumes only single-ray intersection queries. */ @@ -290,7 +307,11 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func( #else CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context); #endif +#ifdef __KERNEL_ONEAPI__ + KernelGlobalsGPU *kg = nullptr; +#else const KernelGlobalsCPU *kg = ctx->kg; +#endif const Ray *cray = ctx->ray; Intersection current_isect; @@ -326,7 +347,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func( } /* Test if we need to record this transparent intersection. */ - const numhit_t max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + const numhit_t max_record_hits = min(ctx->max_hits, numhit_t(INTEGRATOR_SHADOW_ISECT_SIZE)); if (ctx->num_recorded_hits < max_record_hits) { /* If maximum number of hits was reached, replace the intersection with the * highest distance. We want to find the N closest intersections. */ @@ -363,7 +384,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func( *args->valid = 0; } -ccl_device_forceinline void kernel_embree_filter_occluded_local_func( +ccl_device_forceinline void kernel_embree_filter_occluded_local_func_impl( const RTCFilterFunctionNArguments *args) { /* Current implementation in Cycles assumes only single-ray intersection queries. */ @@ -376,7 +397,11 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func( #else CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context); #endif +#ifdef __KERNEL_ONEAPI__ + KernelGlobalsGPU *kg = nullptr; +#else const KernelGlobalsCPU *kg = ctx->kg; +#endif const Ray *cray = ctx->ray; /* Check if it's hitting the correct object. */ @@ -462,7 +487,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func( *args->valid = 0; } -ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func( +ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func_impl( const RTCFilterFunctionNArguments *args) { /* Current implementation in Cycles assumes only single-ray intersection queries. */ @@ -475,7 +500,11 @@ ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func( #else CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context); #endif +#ifdef __KERNEL_ONEAPI__ + KernelGlobalsGPU *kg = nullptr; +#else const KernelGlobalsCPU *kg = ctx->kg; +#endif const Ray *cray = ctx->ray; /* Append the intersection to the end of the array. */ @@ -513,14 +542,14 @@ ccl_device_forceinline void kernel_embree_filter_occluded_func( switch (ctx->type) { case CCLIntersectContext::RAY_SHADOW_ALL: - kernel_embree_filter_occluded_shadow_all_func(args); + kernel_embree_filter_occluded_shadow_all_func_impl(args); break; case CCLIntersectContext::RAY_LOCAL: case CCLIntersectContext::RAY_SSS: - kernel_embree_filter_occluded_local_func(args); + kernel_embree_filter_occluded_local_func_impl(args); break; case CCLIntersectContext::RAY_VOLUME_ALL: - kernel_embree_filter_occluded_volume_all_func(args); + kernel_embree_filter_occluded_volume_all_func_impl(args); break; case CCLIntersectContext::RAY_REGULAR: @@ -569,7 +598,63 @@ ccl_device void kernel_embree_filter_occluded_func_backface_cull( kernel_embree_filter_occluded_func(args); } +#endif +#ifdef __KERNEL_ONEAPI__ +/* Static wrappers so we can call the callbacks from out side the ONEAPIKernelContext class */ +RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline +kernel_embree_filter_intersection_func_static(const RTCFilterFunctionNArguments *args) +{ + RTCHit *hit = (RTCHit *)args->hit; + CCLFirstHitContext *ctx = (CCLFirstHitContext *)(args->context); + ONEAPIKernelContext *context = static_cast(ctx->kg); + context->kernel_embree_filter_intersection_func_impl(args); +} + +RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline +kernel_embree_filter_occluded_shadow_all_func_static(const RTCFilterFunctionNArguments *args) +{ + RTCHit *hit = (RTCHit *)args->hit; + CCLShadowContext *ctx = (CCLShadowContext *)(args->context); + ONEAPIKernelContext *context = static_cast(ctx->kg); + context->kernel_embree_filter_occluded_shadow_all_func_impl(args); +} + +RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline +kernel_embree_filter_occluded_local_func_static(const RTCFilterFunctionNArguments *args) +{ + RTCHit *hit = (RTCHit *)args->hit; + CCLLocalContext *ctx = (CCLLocalContext *)(args->context); + ONEAPIKernelContext *context = static_cast(ctx->kg); + context->kernel_embree_filter_occluded_local_func_impl(args); +} + +RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline +kernel_embree_filter_occluded_volume_all_func_static(const RTCFilterFunctionNArguments *args) +{ + RTCHit *hit = (RTCHit *)args->hit; + CCLVolumeContext *ctx = (CCLVolumeContext *)(args->context); + ONEAPIKernelContext *context = static_cast(ctx->kg); + context->kernel_embree_filter_occluded_volume_all_func_impl(args); +} + +# define kernel_embree_filter_intersection_func \ + ONEAPIKernelContext::kernel_embree_filter_intersection_func_static +# define kernel_embree_filter_occluded_shadow_all_func \ + ONEAPIKernelContext::kernel_embree_filter_occluded_shadow_all_func_static +# define kernel_embree_filter_occluded_local_func \ + ONEAPIKernelContext::kernel_embree_filter_occluded_local_func_static +# define kernel_embree_filter_occluded_volume_all_func \ + ONEAPIKernelContext::kernel_embree_filter_occluded_volume_all_func_static +#else +# define kernel_embree_filter_intersection_func kernel_embree_filter_intersection_func_impl +# if EMBREE_MAJOR_VERSION >= 4 +# define kernel_embree_filter_occluded_shadow_all_func \ + kernel_embree_filter_occluded_shadow_all_func_impl +# define kernel_embree_filter_occluded_local_func kernel_embree_filter_occluded_local_func_impl +# define kernel_embree_filter_occluded_volume_all_func \ + kernel_embree_filter_occluded_volume_all_func_impl +# endif #endif /* Scene intersection. */ @@ -583,7 +668,15 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg, #if EMBREE_MAJOR_VERSION >= 4 CCLFirstHitContext ctx; rtcInitRayQueryContext(&ctx); +# ifdef __KERNEL_ONEAPI__ + /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and + * uses global device allocation (CUDA, Optix, HIP) or passes all needed data + * as a class context (Metal, oneAPI). So we need to pass this context here + * in order to have an access to it later in Embree filter functions on GPU. */ + ctx.kg = (KernelGlobals)this; +# else ctx.kg = kg; +# endif #else CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR); rtcInitIntersectContext(&ctx); @@ -596,7 +689,7 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg, #if EMBREE_MAJOR_VERSION >= 4 RTCIntersectArguments args; rtcInitIntersectArguments(&args); - args.filter = (RTCFilterFunctionN)kernel_embree_filter_intersection_func; + args.filter = reinterpret_cast(kernel_embree_filter_intersection_func); args.feature_mask = CYCLES_EMBREE_USED_FEATURES; args.context = &ctx; rtcIntersect1(kernel_data.device_bvh, &ray_hit, &args); @@ -625,7 +718,15 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg, # if EMBREE_MAJOR_VERSION >= 4 CCLLocalContext ctx; rtcInitRayQueryContext(&ctx); +# ifdef __KERNEL_ONEAPI__ + /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and + * uses global device allocation (CUDA, Optix, HIP) or passes all needed data + * as a class context (Metal, oneAPI). So we need to pass this context here + * in order to have an access to it later in Embree filter functions on GPU. */ + ctx.kg = (KernelGlobals)this; +# else ctx.kg = kg; +# endif # else CCLIntersectContext ctx(kg, has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL); @@ -646,7 +747,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg, # if EMBREE_MAJOR_VERSION >= 4 RTCOccludedArguments args; rtcInitOccludedArguments(&args); - args.filter = (RTCFilterFunctionN)(kernel_embree_filter_occluded_local_func); + args.filter = reinterpret_cast(kernel_embree_filter_occluded_local_func); args.feature_mask = CYCLES_EMBREE_USED_FEATURES; args.context = &ctx; # endif @@ -692,7 +793,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg, #ifdef __SHADOW_RECORD_ALL__ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg, - IntegratorShadowStateCPU *state, + IntegratorShadowState state, ccl_private const Ray *ray, uint visibility, uint max_hits, @@ -702,7 +803,15 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg, # if EMBREE_MAJOR_VERSION >= 4 CCLShadowContext ctx; rtcInitRayQueryContext(&ctx); +# ifdef __KERNEL_ONEAPI__ + /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and + * uses global device allocation (CUDA, Optix, HIP) or passes all needed data + * as a class context (Metal, oneAPI). So we need to pass this context here + * in order to have an access to it later in Embree filter functions on GPU. */ + ctx.kg = (KernelGlobals)this; +# else ctx.kg = kg; +# endif # else CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); rtcInitIntersectContext(&ctx); @@ -718,7 +827,8 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg, # if EMBREE_MAJOR_VERSION >= 4 RTCOccludedArguments args; rtcInitOccludedArguments(&args); - args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_shadow_all_func; + args.filter = reinterpret_cast( + kernel_embree_filter_occluded_shadow_all_func); args.feature_mask = CYCLES_EMBREE_USED_FEATURES; args.context = &ctx; rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args); @@ -742,7 +852,15 @@ ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg, # if EMBREE_MAJOR_VERSION >= 4 CCLVolumeContext ctx; rtcInitRayQueryContext(&ctx); +# ifdef __KERNEL_ONEAPI__ + /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and + * uses global device allocation (CUDA, Optix, HIP) or passes all needed data + * as a class context (Metal, oneAPI). So we need to pass this context here + * in order to have an access to it later in Embree filter functions on GPU. */ + ctx.kg = (KernelGlobals)this; +# else ctx.kg = kg; +# endif # else CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL); rtcInitIntersectContext(&ctx); @@ -756,7 +874,8 @@ ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg, # if EMBREE_MAJOR_VERSION >= 4 RTCOccludedArguments args; rtcInitOccludedArguments(&args); - args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_volume_all_func; + args.filter = reinterpret_cast( + kernel_embree_filter_occluded_volume_all_func); args.feature_mask = CYCLES_EMBREE_USED_FEATURES; args.context = &ctx; rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args); diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index d31d8c46d4a..97f699cbe05 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -128,6 +128,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel_postfix +/* Intersection kernels need access to the kernel handler for specialization constants to work + * properly. */ +#ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_begin.h" +#endif + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_signature(integrator_intersect_closest, ccl_global const int *path_index_array, diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index b113faf9761..07515eb61f4 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -5,6 +5,11 @@ #define __KERNEL_GPU__ #define __KERNEL_ONEAPI__ +#define __KERNEL_64_BIT__ + +#ifdef WITH_EMBREE_GPU +# define __KERNEL_GPU_RAYTRACING__ +#endif #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END @@ -57,17 +62,19 @@ #define ccl_gpu_kernel_threads(block_num_threads) #ifndef WITH_ONEAPI_SYCL_HOST_TASK -# define ccl_gpu_kernel_signature(name, ...) \ +# define __ccl_gpu_kernel_signature(name, ...) \ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ size_t kernel_global_size, \ size_t kernel_local_size, \ sycl::handler &cgh, \ __VA_ARGS__) { \ (kg); \ - cgh.parallel_for( \ + cgh.parallel_for( \ sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ [=](sycl::nd_item<1> item) { +# define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature + # define ccl_gpu_kernel_postfix \ }); \ } diff --git a/intern/cycles/kernel/device/oneapi/context_intersect_begin.h b/intern/cycles/kernel/device/oneapi/context_intersect_begin.h new file mode 100644 index 00000000000..e2764836215 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_intersect_begin.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2023 Intel Corporation */ + +#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU) +# undef ccl_gpu_kernel_signature +# define ccl_gpu_kernel_signature(name, ...) \ + void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ + size_t kernel_global_size, \ + size_t kernel_local_size, \ + sycl::handler &cgh, \ + __VA_ARGS__) \ + { \ + (kg); \ + cgh.parallel_for( \ + sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ + [=](sycl::nd_item<1> item, sycl::kernel_handler oneapi_kernel_handler) { \ + ((ONEAPIKernelContext*)kg)->kernel_handler = oneapi_kernel_handler; +#endif diff --git a/intern/cycles/kernel/device/oneapi/context_intersect_end.h b/intern/cycles/kernel/device/oneapi/context_intersect_end.h new file mode 100644 index 00000000000..948ee5b36dc --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_intersect_end.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2023 Intel Corporation */ + +#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU) +# undef ccl_gpu_kernel_signature +# define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature +#endif diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h index 87932deb2f0..a1854b8fabe 100644 --- a/intern/cycles/kernel/device/oneapi/globals.h +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -31,6 +31,8 @@ typedef struct KernelGlobalsGPU { size_t nd_item_group_range_0; size_t nd_item_global_id_0; size_t nd_item_global_range_0; +#else + sycl::kernel_handler kernel_handler; #endif } KernelGlobalsGPU; diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index c53088d49ce..e2f4f3963f8 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -16,9 +16,22 @@ # include "kernel/device/gpu/kernel.h" +# include "device/kernel.cpp" + static OneAPIErrorCallback s_error_cb = nullptr; static void *s_error_user_ptr = nullptr; +# ifdef WITH_EMBREE_GPU +static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_BASIC_FEATURES = + (const RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | + RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | + RTC_FEATURE_FLAG_POINT | RTC_FEATURE_FLAG_MOTION_BLUR); +static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_ALL_FEATURES = + (const RTCFeatureFlags)(CYCLES_ONEAPI_EMBREE_BASIC_FEATURES | + RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | + RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE); +# endif + void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) { s_error_cb = cb; @@ -144,13 +157,55 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue, bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) { -# ifdef SYCL_SKIP_KERNELS_PRELOAD - (void)queue_; - (void)requested_features; -# else assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); +# ifdef WITH_EMBREE_GPU + /* Preloading intersection kernels is mandatory with Embree on GPU execution, + * because AoT will be not fully performant. */ + try { + sycl::kernel_bundle all_kernels_bundle = + sycl::get_kernel_bundle(queue->get_context(), + {queue->get_device()}); + for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { + const std::string &kernel_name = kernel_id.get_name(); + + /* NOTE(@nsirgien): Names in this conditions below should match names from + * oneapi_call macro in oneapi_enqueue_kernel below */ + /* Also, here we handle only intersection kernels (and skip the rest) */ + if (kernel_name.find("_intersect_") == std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) && + kernel_name.find("_intersect_volume") != std::string::npos) { + continue; + } + + sycl::kernel_bundle one_kernel_bundle = + sycl::get_kernel_bundle(queue->get_context(), {kernel_id}); + + one_kernel_bundle.set_specialization_constant( + CYCLES_ONEAPI_EMBREE_BASIC_FEATURES); + sycl::build(one_kernel_bundle); + + one_kernel_bundle.set_specialization_constant( + CYCLES_ONEAPI_EMBREE_ALL_FEATURES); + sycl::build(one_kernel_bundle); + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +# endif + +# ifdef SYCL_SKIP_KERNELS_PRELOAD + (void)queue_; + (void)kernel_features; +# else try { sycl::kernel_bundle all_kernels_bundle = sycl::get_kernel_bundle(queue->get_context(), @@ -195,6 +250,8 @@ bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) bool oneapi_enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, + const uint kernel_features, + bool use_hardware_raytracing, void **args) { bool success = true; @@ -246,8 +303,24 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, # pragma GCC diagnostic error "-Wswitch" # endif +# ifdef WITH_EMBREE_GPU + bool is_with_rthw_kernel = device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK; + const RTCFeatureFlags used_embree_features = (is_with_rthw_kernel && with_hwrt && + !with_curve_features) ? + CYCLES_ONEAPI_EMBREE_BASIC_FEATURES : + CYCLES_ONEAPI_EMBREE_ALL_FEATURES; +# endif + try { queue->submit([&](sycl::handler &cgh) { +# ifdef WITH_EMBREE_GPU + if (is_with_rthw_kernel) + cgh.set_specialization_constant( + used_embree_features); +# endif switch (device_kernel) { case DEVICE_KERNEL_INTEGRATOR_RESET: { oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset); @@ -549,4 +622,5 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, # endif return success; } + #endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h index 2bfc0b89c87..cef59c99c79 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.h +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -47,10 +47,14 @@ CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size( CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context, int kernel, size_t global_size, + const unsigned int kernel_features, + bool use_hardware_raytracing, void **args); CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue, - const unsigned int requested_features); + const unsigned int kernel_features, + bool use_hardware_raytracing); # ifdef __cplusplus } + # endif #endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index a11f9f44920..58c3d95deb5 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -3,8 +3,9 @@ #pragma once -#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE) -# if EMBREE_MAJOR_VERSION >= 4 +#if (!defined(__KERNEL_GPU__) || (defined(__KERNEL_ONEAPI__) && defined(WITH_EMBREE_GPU))) && \ + defined(WITH_EMBREE) +# if EMBREE_MAJOR_VERSION == 4 # include # include # else diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h index 9e27997cf2c..6188ab8a57c 100644 --- a/intern/cycles/util/vector.h +++ b/intern/cycles/util/vector.h @@ -4,7 +4,6 @@ #ifndef __UTIL_VECTOR_H__ #define __UTIL_VECTOR_H__ -#include #include #include