From 3f8c99510926d5b63de7fb7dddc3d216b3f66966 Mon Sep 17 00:00:00 2001
From: Nikita Sirgienko <nikita.sirgienko@intel.com>
Date: Thu, 16 Mar 2023 11:56:55 +0100
Subject: [PATCH] Cycles: add hardware raytracing support to oneAPI device

Updated Embree 4 library with GPU support is required for it to be
compiled - compatiblity with Embree 3 and Embree 4 without GPU support
is maintained.
Enabling hardware raytracing is an opt-in user setting for now.

Pull Request: https://projects.blender.org/blender/blender/pulls/106266
---
 intern/cycles/CMakeLists.txt                  |   3 +
 intern/cycles/blender/addon/properties.py     |  12 ++
 intern/cycles/blender/device.cpp              |  21 ++-
 intern/cycles/blender/python.cpp              |   8 +
 intern/cycles/device/device.cpp               |   4 +-
 intern/cycles/device/device.h                 |  21 +--
 intern/cycles/device/kernel.cpp               |   8 +-
 intern/cycles/device/kernel.h                 |  11 +-
 intern/cycles/device/metal/device_impl.mm     |   4 +-
 intern/cycles/device/oneapi/device.cpp        |   9 +-
 intern/cycles/device/oneapi/device_impl.cpp   |  96 ++++++++++-
 intern/cycles/device/oneapi/device_impl.h     |  21 ++-
 intern/cycles/kernel/CMakeLists.txt           |  58 ++++++-
 intern/cycles/kernel/device/cpu/bvh.h         | 159 +++++++++++++++---
 intern/cycles/kernel/device/gpu/kernel.h      |   6 +
 intern/cycles/kernel/device/oneapi/compat.h   |  11 +-
 .../device/oneapi/context_intersect_begin.h   |  18 ++
 .../device/oneapi/context_intersect_end.h     |   7 +
 intern/cycles/kernel/device/oneapi/globals.h  |   2 +
 intern/cycles/kernel/device/oneapi/kernel.cpp |  82 ++++++++-
 intern/cycles/kernel/device/oneapi/kernel.h   |   6 +-
 intern/cycles/kernel/types.h                  |   5 +-
 intern/cycles/util/vector.h                   |   1 -
 23 files changed, 508 insertions(+), 65 deletions(-)
 create mode 100644 intern/cycles/kernel/device/oneapi/context_intersect_begin.h
 create mode 100644 intern/cycles/kernel/device/oneapi/context_intersect_end.h

diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index aa9d7f5c53b..f28c14ad4cc 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -281,6 +281,9 @@ endif()
 
 if(WITH_CYCLES_EMBREE)
   add_definitions(-DWITH_EMBREE)
+  if(WITH_CYCLES_DEVICE_ONEAPI AND EMBREE_SYCL_SUPPORT)
+    add_definitions(-DWITH_EMBREE_GPU)
+  endif()
   add_definitions(-DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION})
   include_directories(
     SYSTEM
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 25079b0c507..79356c1c67e 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1544,6 +1544,13 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         default=False,
     )
 
+    use_oneapirt: BoolProperty(
+        name="Embree on GPU (Experimental)",
+        description="Embree GPU execution will allow to use hardware ray tracing on Intel GPUs, which will provide better performance. "
+                    "However this support is experimental and some scenes may render incorrectly",
+        default=False,
+    )
+
     kernel_optimization_level: EnumProperty(
         name="Kernel Optimization",
         description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. "
@@ -1763,6 +1770,11 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                     col.prop(self, "kernel_optimization_level")
                 col.prop(self, "use_metalrt")
 
+        if compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu:
+            row = layout.row()
+            row.use_property_split = True
+            row.prop(self, "use_oneapirt")
+
     def draw(self, context):
         self.draw_impl(self.layout, context)
 
diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp
index 9d2fba11c1a..b23549950b1 100644
--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@@ -112,9 +112,26 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences,
     device.has_peer_memory = false;
   }
 
-  if (get_boolean(cpreferences, "use_metalrt")) {
-    device.use_metalrt = true;
+  bool accumulated_use_hardware_raytracing = false;
+  foreach (
+      DeviceInfo &info,
+      (device.multi_devices.size() != 0 ? device.multi_devices : vector<DeviceInfo>({device}))) {
+    if (info.type == DEVICE_METAL && !get_boolean(cpreferences, "use_metalrt")) {
+      info.use_hardware_raytracing = false;
+    }
+
+    if (info.type == DEVICE_ONEAPI && !get_boolean(cpreferences, "use_oneapirt")) {
+      info.use_hardware_raytracing = false;
+    }
+
+    /* There is an accumulative logic here, because Multidevices are support only for
+     * the same backend + CPU in Blender right now, and both oneAPI and Metal have a
+     * global boolean backend setting (see above) for enabling/disabling HW RT,
+     * so all subdevices in the multidevice should enable (or disable) HW RT
+     * simultaneously (and CPU device are expected to ignore "use_hardware_raytracing" setting) */
+    accumulated_use_hardware_raytracing |= info.use_hardware_raytracing;
   }
+  device.use_hardware_raytracing = accumulated_use_hardware_raytracing;
 
   if (preview) {
     /* Disable specialization for preview renders. */
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index 6ac68035033..682d7075455 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -1034,6 +1034,14 @@ void *CCL_python_module_init()
   Py_INCREF(Py_False);
 #endif /* WITH_EMBREE */
 
+#ifdef WITH_EMBREE_GPU
+  PyModule_AddObject(mod, "with_embree_gpu", Py_True);
+  Py_INCREF(Py_True);
+#else  /* WITH_EMBREE_GPU */
+  PyModule_AddObject(mod, "with_embree_gpu", Py_False);
+  Py_INCREF(Py_False);
+#endif /* WITH_EMBREE_GPU */
+
   if (ccl::openimagedenoise_supported()) {
     PyModule_AddObject(mod, "with_openimagedenoise", Py_True);
     Py_INCREF(Py_True);
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 0ef080a0c00..0ed03f1e94d 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -354,7 +354,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
   info.has_guiding = true;
   info.has_profiling = true;
   info.has_peer_memory = false;
-  info.use_metalrt = false;
+  info.use_hardware_raytracing = false;
   info.denoisers = DENOISER_ALL;
 
   foreach (const DeviceInfo &device, subdevices) {
@@ -403,7 +403,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
     info.has_guiding &= device.has_guiding;
     info.has_profiling &= device.has_profiling;
     info.has_peer_memory |= device.has_peer_memory;
-    info.use_metalrt |= device.use_metalrt;
+    info.use_hardware_raytracing |= device.use_hardware_raytracing;
     info.denoisers &= device.denoisers;
   }
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 0ff946b3db9..a73dcc4ab1f 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -71,15 +71,16 @@ class DeviceInfo {
   string description;
   string id; /* used for user preferences, should stay fixed with changing hardware config */
   int num;
-  bool display_device;  /* GPU is used as a display device. */
-  bool has_nanovdb;     /* Support NanoVDB volumes. */
-  bool has_light_tree;  /* Support light tree. */
-  bool has_osl;         /* Support Open Shading Language. */
-  bool has_guiding;     /* Support path guiding. */
-  bool has_profiling;   /* Supports runtime collection of profiling info. */
-  bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
-  bool has_gpu_queue;   /* Device supports GPU queue. */
-  bool use_metalrt;     /* Use MetalRT to accelerate ray queries (Metal only). */
+  bool display_device;          /* GPU is used as a display device. */
+  bool has_nanovdb;             /* Support NanoVDB volumes. */
+  bool has_light_tree;          /* Support light tree. */
+  bool has_osl;                 /* Support Open Shading Language. */
+  bool has_guiding;             /* Support path guiding. */
+  bool has_profiling;           /* Supports runtime collection of profiling info. */
+  bool has_peer_memory;         /* GPU has P2P access to memory of another GPU. */
+  bool has_gpu_queue;           /* Device supports GPU queue. */
+  bool use_hardware_raytracing; /* Use hardware ray tracing to accelerate ray queries in a backend.
+                                 */
   KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing
                                                       * kernels (Metal only). */
   DenoiserTypeMask denoisers;                        /* Supported denoiser types. */
@@ -101,7 +102,7 @@ class DeviceInfo {
     has_profiling = false;
     has_peer_memory = false;
     has_gpu_queue = false;
-    use_metalrt = false;
+    use_hardware_raytracing = false;
     denoisers = DENOISER_NONE;
   }
 
diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp
index 903ad096586..5e871c00090 100644
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -3,7 +3,9 @@
 
 #include "device/kernel.h"
 
-#include "util/log.h"
+#ifndef __KERNEL_ONEAPI__
+#  include "util/log.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -153,10 +155,13 @@ const char *device_kernel_as_string(DeviceKernel kernel)
     case DEVICE_KERNEL_NUM:
       break;
   };
+#ifndef __KERNEL_ONEAPI__
   LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
+#endif
   return "UNKNOWN";
 }
 
+#ifndef __KERNEL_ONEAPI__
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
 {
   os << device_kernel_as_string(kernel);
@@ -178,5 +183,6 @@ string device_kernel_mask_as_string(DeviceKernelMask mask)
 
   return str;
 }
+#endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h
index b829a891260..e1cd2b8f092 100644
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -3,11 +3,13 @@
 
 #pragma once
 
-#include "kernel/types.h"
+#ifndef __KERNEL_ONEAPI__
+#  include "kernel/types.h"
 
-#include "util/string.h"
+#  include "util/string.h"
 
-#include <ostream>  // NOLINT
+#  include <ostream>  // NOLINT
+#endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -15,9 +17,12 @@ bool device_kernel_has_shading(DeviceKernel kernel);
 bool device_kernel_has_intersection(DeviceKernel kernel);
 
 const char *device_kernel_as_string(DeviceKernel kernel);
+
+#ifndef __KERNEL_ONEAPI__
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
 
 typedef uint64_t DeviceKernelMask;
 string device_kernel_mask_as_string(DeviceKernelMask mask);
+#endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 0c805b1ddd8..03a2a2c1f06 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -100,12 +100,12 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
     }
     case METAL_GPU_AMD: {
       max_threads_per_threadgroup = 128;
-      use_metalrt = info.use_metalrt;
+      use_metalrt = info.use_hardware_raytracing;
       break;
     }
     case METAL_GPU_APPLE: {
       max_threads_per_threadgroup = 512;
-      use_metalrt = info.use_metalrt;
+      use_metalrt = info.use_hardware_raytracing;
       break;
     }
   }
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index aa77cb34482..7fca0fa0cd6 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -87,7 +87,8 @@ Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &pro
 }
 
 #ifdef WITH_ONEAPI
-static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr)
+static void device_iterator_cb(
+    const char *id, const char *name, int num, bool hwrt_support, void *user_ptr)
 {
   vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
 
@@ -112,6 +113,12 @@ static void device_iterator_cb(const char *id, const char *name, int num, void *
   /* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */
   info.display_device = false;
 
+#  if WITH_EMBREE_GPU
+  info.use_hardware_raytracing = hwrt_support;
+#  else
+  info.use_hardware_raytracing = false;
+#  endif
+
   devices->push_back(info);
   VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
 }
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 0aec8268bd5..3379541a5d8 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -8,7 +8,19 @@
 #  include "util/debug.h"
 #  include "util/log.h"
 
+#  ifdef WITH_EMBREE_GPU
+#    include "bvh/embree.h"
+#  endif
+
 #  include "kernel/device/oneapi/globals.h"
+#  include "kernel/device/oneapi/kernel.h"
+
+#  if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
+/* These declarations are missing from embree headers when compiling from a compiler that doesn't
+ * support SYCL. */
+extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config);
+extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
+#  endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -22,16 +34,29 @@ static void queue_error_cb(const char *message, void *user_ptr)
 OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     : Device(info, stats, profiler),
       device_queue_(nullptr),
+#  ifdef WITH_EMBREE_GPU
+      embree_device(nullptr),
+      embree_scene(nullptr),
+#  endif
       texture_info_(this, "texture_info", MEM_GLOBAL),
       kg_memory_(nullptr),
       kg_memory_device_(nullptr),
       kg_memory_size_(0)
 {
   need_texture_info_ = false;
+  use_hardware_raytracing = info.use_hardware_raytracing;
 
   oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
 
-  bool is_finished_ok = create_queue(device_queue_, info.num);
+  bool is_finished_ok = create_queue(device_queue_,
+                                     info.num,
+#  ifdef WITH_EMBREE_GPU
+                                     use_hardware_raytracing ? &embree_device : nullptr
+#  else
+                                     nullptr
+#  endif
+  );
+
   if (is_finished_ok == false) {
     set_error("oneAPI queue initialization error: got runtime exception \"" +
               oneapi_error_string_ + "\"");
@@ -42,6 +67,16 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
     assert(device_queue_);
   }
 
+#  ifdef WITH_EMBREE_GPU
+  use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr);
+#  else
+  use_hardware_raytracing = false;
+#  endif
+
+  if (use_hardware_raytracing) {
+    VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration.";
+  }
+
   size_t globals_segment_size;
   is_finished_ok = kernel_globals_size(globals_segment_size);
   if (is_finished_ok == false) {
@@ -64,6 +99,11 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
 
 OneapiDevice::~OneapiDevice()
 {
+#  ifdef WITH_EMBREE_GPU
+  if (embree_device)
+    rtcReleaseDevice(embree_device);
+#  endif
+
   texture_info_.free();
   usm_free(device_queue_, kg_memory_);
   usm_free(device_queue_, kg_memory_device_);
@@ -82,13 +122,36 @@ bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
 
 BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
 {
-  return BVH_LAYOUT_BVH2;
+  return use_hardware_raytracing ? BVH_LAYOUT_EMBREE : BVH_LAYOUT_BVH2;
 }
 
+#  ifdef WITH_EMBREE_GPU
+void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+  if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
+    BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+    if (refit) {
+      bvh_embree->refit(progress);
+    }
+    else {
+      bvh_embree->build(progress, &stats, embree_device);
+    }
+    if (bvh->params.top_level) {
+      embree_scene = bvh_embree->scene;
+    }
+  }
+  else {
+    Device::build_bvh(bvh, progress, refit);
+  }
+}
+#  endif
+
 bool OneapiDevice::load_kernels(const uint requested_features)
 {
   assert(device_queue_);
 
+  kernel_features = requested_features;
+
   bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
   if (is_finished_ok == false) {
     set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
@@ -327,6 +390,16 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
              << string_human_readable_number(size) << " bytes. ("
              << string_human_readable_size(size) << ")";
 
+#  ifdef WITH_EMBREE_GPU
+  if (strcmp(name, "data") == 0) {
+    assert(size <= sizeof(KernelData));
+
+    /* Update scene handle(since it is different for each device on multi devices) */
+    KernelData *const data = (KernelData *)host;
+    data->device_bvh = embree_scene;
+  }
+#  endif
+
   ConstMemMap::iterator i = const_mem_map_.find(name);
   device_vector<uchar> *data;
 
@@ -446,7 +519,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
 #  endif
 }
 
-bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
+bool OneapiDevice::create_queue(SyclQueue *&external_queue,
+                                int device_index,
+                                void *embree_device_pointer)
 {
   bool finished_correct = true;
   try {
@@ -457,6 +532,11 @@ bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
     sycl::queue *created_queue = new sycl::queue(devices[device_index],
                                                  sycl::property::queue::in_order());
     external_queue = reinterpret_cast<SyclQueue *>(created_queue);
+#  ifdef WITH_EMBREE_GPU
+    if (embree_device_pointer) {
+      *((RTCDevice *)embree_device_pointer) = rtcNewSYCLDevice(created_queue->get_context(), "");
+    }
+#  endif
   }
   catch (sycl::exception const &e) {
     finished_correct = false;
@@ -625,7 +705,8 @@ bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
                                   size_t global_size,
                                   void **args)
 {
-  return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args);
+  return oneapi_enqueue_kernel(
+      kernel_context, kernel, global_size, kernel_features, use_hardware_raytracing, args);
 }
 
 /* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
@@ -830,12 +911,17 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
     std::string name = device.get_info<sycl::info::device::name>();
 #  else
     std::string name = "SYCL Host Task (Debug)";
+#  endif
+#  ifdef WITH_EMBREE_GPU
+    bool hwrt_support = rtcIsSYCLDeviceSupported(device);
+#  else
+    bool hwrt_support = false;
 #  endif
     std::string id = "ONEAPI_" + platform_name + "_" + name;
     if (device.has(sycl::aspect::ext_intel_pci_address)) {
       id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
     }
-    (cb)(id.c_str(), name.c_str(), num, user_ptr);
+    (cb)(id.c_str(), name.c_str(), num, hwrt_support, user_ptr);
     num++;
   }
 }
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index 197cf03d60d..0a83d199450 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -16,15 +16,16 @@ CCL_NAMESPACE_BEGIN
 
 class DeviceQueue;
 
-typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
-                                             const char *name,
-                                             int num,
-                                             void *user_ptr);
+typedef void (*OneAPIDeviceIteratorCallback)(
+    const char *id, const char *name, int num, bool hwrt_support, void *user_ptr);
 
 class OneapiDevice : public Device {
  private:
   SyclQueue *device_queue_;
-
+#  if WITH_EMBREE_GPU
+  RTCDevice embree_device;
+  RTCScene embree_scene;
+#  endif
   using ConstMemMap = map<string, device_vector<uchar> *>;
   ConstMemMap const_mem_map_;
   device_vector<TextureInfo> texture_info_;
@@ -34,6 +35,8 @@ class OneapiDevice : public Device {
   size_t kg_memory_size_ = (size_t)0;
   size_t max_memory_on_device_ = (size_t)0;
   std::string oneapi_error_string_;
+  bool use_hardware_raytracing = false;
+  unsigned int kernel_features = 0;
 
  public:
   virtual BVHLayoutMask get_bvh_layout_mask() const override;
@@ -41,10 +44,12 @@ class OneapiDevice : public Device {
   OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
 
   virtual ~OneapiDevice();
-
+#  if WITH_EMBREE_GPU
+  void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+#  endif
   bool check_peer_access(Device *peer_device) override;
 
-  bool load_kernels(const uint requested_features) override;
+  bool load_kernels(const uint kernel_features) override;
 
   void load_texture_info();
 
@@ -114,7 +119,7 @@ class OneapiDevice : public Device {
 
  protected:
   void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host);
-  bool create_queue(SyclQueue *&external_queue, int device_index);
+  bool create_queue(SyclQueue *&external_queue, int device_index, void *embree_device);
   void free_queue(SyclQueue *queue);
   void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment);
   void *usm_alloc_device(SyclQueue *queue, size_t memory_size);
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index eb3d20f04ad..57ab5beb030 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -96,10 +96,13 @@ set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS
   device/oneapi/compat.h
   device/oneapi/context_begin.h
   device/oneapi/context_end.h
+  device/oneapi/context_intersect_begin.h
+  device/oneapi/context_intersect_end.h
   device/oneapi/globals.h
   device/oneapi/image.h
   device/oneapi/kernel.h
   device/oneapi/kernel_templates.h
+  device/cpu/bvh.h
 )
 
 set(SRC_KERNEL_CLOSURE_HEADERS
@@ -764,7 +767,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
 
   # Set defaults for spir64 and spir64_gen options
   if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
-    set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
+    set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-regular-grf-kernel integrator_intersect -ze-opt-large-grf-kernel shade -ze-opt-no-local-to-generic'")
   endif()
   if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen)
     set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target")
@@ -798,6 +801,59 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
       -I"${NANOVDB_INCLUDE_DIR}")
   endif()
 
+  if(WITH_CYCLES_EMBREE AND EMBREE_SYCL_SUPPORT)
+    list(APPEND sycl_compiler_flags
+      -DWITH_EMBREE
+      -DWITH_EMBREE_GPU
+      -DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION}
+      -I"${EMBREE_INCLUDE_DIRS}")
+
+    if(WIN32)
+      list(APPEND sycl_compiler_flags
+        -ladvapi32.lib
+      )
+    endif()
+
+    set(next_library_mode "")
+    foreach(library ${EMBREE_LIBRARIES})
+      string(TOLOWER "${library}" library_lower)
+      if(("${library_lower}" STREQUAL "optimized") OR
+         ("${library_lower}" STREQUAL "debug"))
+        set(next_library_mode "${library_lower}")
+      else()
+        if(next_library_mode STREQUAL "")
+          list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library})
+          list(APPEND EMBREE_TBB_LIBRARIES_debug ${library})
+        else()
+          list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library})
+        endif()
+        set(next_library_mode "")
+      endif()
+    endforeach()
+
+    foreach(library ${TBB_LIBRARIES})
+      string(TOLOWER "${library}" library_lower)
+      if(("${library_lower}" STREQUAL "optimized") OR
+         ("${library_lower}" STREQUAL "debug"))
+        set(next_library_mode "${library_lower}")
+      else()
+        if(next_library_mode STREQUAL "")
+          list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library})
+          list(APPEND EMBREE_TBB_LIBRARIES_debug ${library})
+        else()
+          list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library})
+        endif()
+        set(next_library_mode "")
+      endif()
+    endforeach()
+      list(APPEND sycl_compiler_flags
+        "$<$<CONFIG:Release>:${EMBREE_TBB_LIBRARIES_optimized}>"
+        "$<$<CONFIG:RelWithDebInfo>:${EMBREE_TBB_LIBRARIES_optimized}>"
+        "$<$<CONFIG:MinSizeRel>:${EMBREE_TBB_LIBRARIES_optimized}>"
+        "$<$<CONFIG:Debug>:${EMBREE_TBB_LIBRARIES_debug}>"
+      )
+  endif()
+
   if(WITH_CYCLES_DEBUG)
     list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG)
   endif()
diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h
index a7baec0b718..65aed44dbbe 100644
--- a/intern/cycles/kernel/device/cpu/bvh.h
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -13,8 +13,13 @@
 #  include <embree3/rtcore_scene.h>
 #endif
 
-#include "kernel/device/cpu/compat.h"
-#include "kernel/device/cpu/globals.h"
+#ifdef __KERNEL_ONEAPI__
+#  include "kernel/device/oneapi/compat.h"
+#  include "kernel/device/oneapi/globals.h"
+#else
+#  include "kernel/device/cpu/compat.h"
+#  include "kernel/device/cpu/globals.h"
+#endif
 
 #include "kernel/bvh/types.h"
 #include "kernel/bvh/util.h"
@@ -33,11 +38,18 @@ using numhit_t = uint8_t;
 using numhit_t = uint32_t;
 #endif
 
-#define CYCLES_EMBREE_USED_FEATURES \
-  (RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \
-                    RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \
-                    RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \
-                    RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE)
+#ifdef __KERNEL_ONEAPI__
+static constexpr sycl::specialization_id<RTCFeatureFlags> oneapi_embree_features{
+    (const RTCFeatureFlags)(0)};
+#  define CYCLES_EMBREE_USED_FEATURES \
+    (kernel_handler.get_specialization_constant<oneapi_embree_features>())
+#else
+#  define CYCLES_EMBREE_USED_FEATURES \
+    (RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \
+                      RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \
+                      RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \
+                      RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE)
+#endif
 
 #define EMBREE_IS_HAIR(x) (x & 1)
 
@@ -252,7 +264,8 @@ ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals kg,
  * Things like recording subsurface or shadow hits for later evaluation
  * as well as filtering for volume objects happen here.
  * Cycles' own BVH does that directly inside the traversal calls. */
-ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args)
+ccl_device_forceinline void kernel_embree_filter_intersection_func_impl(
+    const RTCFilterFunctionNArguments *args)
 {
   /* Current implementation in Cycles assumes only single-ray intersection queries. */
   assert(args->N == 1);
@@ -263,7 +276,11 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA
 #else
   CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
   const KernelGlobalsCPU *kg = ctx->kg;
+#endif
   const Ray *cray = ctx->ray;
 
   if (kernel_embree_is_self_intersection(
@@ -277,7 +294,7 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA
  * as well as filtering for volume objects happen here.
  * Cycles' own BVH does that directly inside the traversal calls.
  */
-ccl_device void kernel_embree_filter_occluded_shadow_all_func(
+ccl_device_forceinline void kernel_embree_filter_occluded_shadow_all_func_impl(
     const RTCFilterFunctionNArguments *args)
 {
   /* Current implementation in Cycles assumes only single-ray intersection queries. */
@@ -290,7 +307,11 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
 #else
   CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
   const KernelGlobalsCPU *kg = ctx->kg;
+#endif
   const Ray *cray = ctx->ray;
 
   Intersection current_isect;
@@ -326,7 +347,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
   }
 
   /* Test if we need to record this transparent intersection. */
-  const numhit_t max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+  const numhit_t max_record_hits = min(ctx->max_hits, numhit_t(INTEGRATOR_SHADOW_ISECT_SIZE));
   if (ctx->num_recorded_hits < max_record_hits) {
     /* If maximum number of hits was reached, replace the intersection with the
      * highest distance. We want to find the N closest intersections. */
@@ -363,7 +384,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
   *args->valid = 0;
 }
 
-ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
+ccl_device_forceinline void kernel_embree_filter_occluded_local_func_impl(
     const RTCFilterFunctionNArguments *args)
 {
   /* Current implementation in Cycles assumes only single-ray intersection queries. */
@@ -376,7 +397,11 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
 #else
   CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
   const KernelGlobalsCPU *kg = ctx->kg;
+#endif
   const Ray *cray = ctx->ray;
 
   /* Check if it's hitting the correct object. */
@@ -462,7 +487,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
   *args->valid = 0;
 }
 
-ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func(
+ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func_impl(
     const RTCFilterFunctionNArguments *args)
 {
   /* Current implementation in Cycles assumes only single-ray intersection queries. */
@@ -475,7 +500,11 @@ ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func(
 #else
   CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
   const KernelGlobalsCPU *kg = ctx->kg;
+#endif
   const Ray *cray = ctx->ray;
 
   /* Append the intersection to the end of the array. */
@@ -513,14 +542,14 @@ ccl_device_forceinline void kernel_embree_filter_occluded_func(
 
   switch (ctx->type) {
     case CCLIntersectContext::RAY_SHADOW_ALL:
-      kernel_embree_filter_occluded_shadow_all_func(args);
+      kernel_embree_filter_occluded_shadow_all_func_impl(args);
       break;
     case CCLIntersectContext::RAY_LOCAL:
     case CCLIntersectContext::RAY_SSS:
-      kernel_embree_filter_occluded_local_func(args);
+      kernel_embree_filter_occluded_local_func_impl(args);
       break;
     case CCLIntersectContext::RAY_VOLUME_ALL:
-      kernel_embree_filter_occluded_volume_all_func(args);
+      kernel_embree_filter_occluded_volume_all_func_impl(args);
       break;
 
     case CCLIntersectContext::RAY_REGULAR:
@@ -569,7 +598,63 @@ ccl_device void kernel_embree_filter_occluded_func_backface_cull(
 
   kernel_embree_filter_occluded_func(args);
 }
+#endif
 
+#ifdef __KERNEL_ONEAPI__
+/* Static wrappers so we can call the callbacks from out side the ONEAPIKernelContext class */
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_intersection_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLFirstHitContext *ctx = (CCLFirstHitContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_intersection_func_impl(args);
+}
+
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_occluded_shadow_all_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLShadowContext *ctx = (CCLShadowContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_occluded_shadow_all_func_impl(args);
+}
+
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_occluded_local_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLLocalContext *ctx = (CCLLocalContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_occluded_local_func_impl(args);
+}
+
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_occluded_volume_all_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLVolumeContext *ctx = (CCLVolumeContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_occluded_volume_all_func_impl(args);
+}
+
+#  define kernel_embree_filter_intersection_func \
+    ONEAPIKernelContext::kernel_embree_filter_intersection_func_static
+#  define kernel_embree_filter_occluded_shadow_all_func \
+    ONEAPIKernelContext::kernel_embree_filter_occluded_shadow_all_func_static
+#  define kernel_embree_filter_occluded_local_func \
+    ONEAPIKernelContext::kernel_embree_filter_occluded_local_func_static
+#  define kernel_embree_filter_occluded_volume_all_func \
+    ONEAPIKernelContext::kernel_embree_filter_occluded_volume_all_func_static
+#else
+#  define kernel_embree_filter_intersection_func kernel_embree_filter_intersection_func_impl
+#  if EMBREE_MAJOR_VERSION >= 4
+#    define kernel_embree_filter_occluded_shadow_all_func \
+      kernel_embree_filter_occluded_shadow_all_func_impl
+#    define kernel_embree_filter_occluded_local_func kernel_embree_filter_occluded_local_func_impl
+#    define kernel_embree_filter_occluded_volume_all_func \
+      kernel_embree_filter_occluded_volume_all_func_impl
+#  endif
 #endif
 
 /* Scene intersection. */
@@ -583,7 +668,15 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
 #if EMBREE_MAJOR_VERSION >= 4
   CCLFirstHitContext ctx;
   rtcInitRayQueryContext(&ctx);
+#  ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#  else
   ctx.kg = kg;
+#  endif
 #else
   CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
   rtcInitIntersectContext(&ctx);
@@ -596,7 +689,7 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
 #if EMBREE_MAJOR_VERSION >= 4
   RTCIntersectArguments args;
   rtcInitIntersectArguments(&args);
-  args.filter = (RTCFilterFunctionN)kernel_embree_filter_intersection_func;
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(kernel_embree_filter_intersection_func);
   args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
   args.context = &ctx;
   rtcIntersect1(kernel_data.device_bvh, &ray_hit, &args);
@@ -625,7 +718,15 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
   CCLLocalContext ctx;
   rtcInitRayQueryContext(&ctx);
+#    ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#    else
   ctx.kg = kg;
+#    endif
 #  else
   CCLIntersectContext ctx(kg,
                           has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
@@ -646,7 +747,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
   RTCOccludedArguments args;
   rtcInitOccludedArguments(&args);
-  args.filter = (RTCFilterFunctionN)(kernel_embree_filter_occluded_local_func);
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(kernel_embree_filter_occluded_local_func);
   args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
   args.context = &ctx;
 #  endif
@@ -692,7 +793,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
 
 #ifdef __SHADOW_RECORD_ALL__
 ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
-                                                             IntegratorShadowStateCPU *state,
+                                                             IntegratorShadowState state,
                                                              ccl_private const Ray *ray,
                                                              uint visibility,
                                                              uint max_hits,
@@ -702,7 +803,15 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
   CCLShadowContext ctx;
   rtcInitRayQueryContext(&ctx);
+#    ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#    else
   ctx.kg = kg;
+#    endif
 #  else
   CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
   rtcInitIntersectContext(&ctx);
@@ -718,7 +827,8 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
   RTCOccludedArguments args;
   rtcInitOccludedArguments(&args);
-  args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_shadow_all_func;
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(
+      kernel_embree_filter_occluded_shadow_all_func);
   args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
   args.context = &ctx;
   rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args);
@@ -742,7 +852,15 @@ ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
   CCLVolumeContext ctx;
   rtcInitRayQueryContext(&ctx);
+#    ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#    else
   ctx.kg = kg;
+#    endif
 #  else
   CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
   rtcInitIntersectContext(&ctx);
@@ -756,7 +874,8 @@ ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
   RTCOccludedArguments args;
   rtcInitOccludedArguments(&args);
-  args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_volume_all_func;
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(
+      kernel_embree_filter_occluded_volume_all_func);
   args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
   args.context = &ctx;
   rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index d31d8c46d4a..97f699cbe05 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -128,6 +128,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 ccl_gpu_kernel_postfix
 
+/* Intersection kernels need access to the kernel handler for specialization constants to work
+ * properly. */
+#ifdef __KERNEL_ONEAPI__
+#  include "kernel/device/oneapi/context_intersect_begin.h"
+#endif
+
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     ccl_gpu_kernel_signature(integrator_intersect_closest,
                              ccl_global const int *path_index_array,
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index b113faf9761..07515eb61f4 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -5,6 +5,11 @@
 
 #define __KERNEL_GPU__
 #define __KERNEL_ONEAPI__
+#define __KERNEL_64_BIT__
+
+#ifdef WITH_EMBREE_GPU
+#  define __KERNEL_GPU_RAYTRACING__
+#endif
 
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
@@ -57,17 +62,19 @@
 #define ccl_gpu_kernel_threads(block_num_threads)
 
 #ifndef WITH_ONEAPI_SYCL_HOST_TASK
-#  define ccl_gpu_kernel_signature(name, ...) \
+#  define __ccl_gpu_kernel_signature(name, ...) \
 void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                           size_t kernel_global_size, \
                           size_t kernel_local_size, \
                           sycl::handler &cgh, \
                           __VA_ARGS__) { \
       (kg); \
-      cgh.parallel_for<class kernel_##name>( \
+      cgh.parallel_for( \
           sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
           [=](sycl::nd_item<1> item) {
 
+#  define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature
+
 #  define ccl_gpu_kernel_postfix \
           }); \
     }
diff --git a/intern/cycles/kernel/device/oneapi/context_intersect_begin.h b/intern/cycles/kernel/device/oneapi/context_intersect_begin.h
new file mode 100644
index 00000000000..e2764836215
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/context_intersect_begin.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2023 Intel Corporation */
+
+#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU)
+#  undef ccl_gpu_kernel_signature
+#  define ccl_gpu_kernel_signature(name, ...) \
+    void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
+                              size_t kernel_global_size, \
+                              size_t kernel_local_size, \
+                              sycl::handler &cgh, \
+                              __VA_ARGS__) \
+    { \
+      (kg); \
+      cgh.parallel_for( \
+          sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
+          [=](sycl::nd_item<1> item, sycl::kernel_handler oneapi_kernel_handler) { \
+            ((ONEAPIKernelContext*)kg)->kernel_handler = oneapi_kernel_handler;
+#endif
diff --git a/intern/cycles/kernel/device/oneapi/context_intersect_end.h b/intern/cycles/kernel/device/oneapi/context_intersect_end.h
new file mode 100644
index 00000000000..948ee5b36dc
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/context_intersect_end.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2023 Intel Corporation */
+
+#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU)
+#  undef ccl_gpu_kernel_signature
+#  define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature
+#endif
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index 87932deb2f0..a1854b8fabe 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -31,6 +31,8 @@ typedef struct KernelGlobalsGPU {
   size_t nd_item_group_range_0;
   size_t nd_item_global_id_0;
   size_t nd_item_global_range_0;
+#else
+  sycl::kernel_handler kernel_handler;
 #endif
 } KernelGlobalsGPU;
 
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index c53088d49ce..e2f4f3963f8 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -16,9 +16,22 @@
 
 #  include "kernel/device/gpu/kernel.h"
 
+#  include "device/kernel.cpp"
+
 static OneAPIErrorCallback s_error_cb = nullptr;
 static void *s_error_user_ptr = nullptr;
 
+#  ifdef WITH_EMBREE_GPU
+static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_BASIC_FEATURES =
+    (const RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE |
+                            RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS |
+                            RTC_FEATURE_FLAG_POINT | RTC_FEATURE_FLAG_MOTION_BLUR);
+static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_ALL_FEATURES =
+    (const RTCFeatureFlags)(CYCLES_ONEAPI_EMBREE_BASIC_FEATURES |
+                            RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE |
+                            RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE);
+#  endif
+
 void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
 {
   s_error_cb = cb;
@@ -144,13 +157,55 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
 
 bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
 {
-#  ifdef SYCL_SKIP_KERNELS_PRELOAD
-  (void)queue_;
-  (void)requested_features;
-#  else
   assert(queue_);
   sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+#  ifdef WITH_EMBREE_GPU
+  /* Preloading intersection kernels is mandatory with Embree on GPU execution,
+   * because AoT will be not fully performant. */
+  try {
+    sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
+                                                           {queue->get_device()});
 
+    for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
+      const std::string &kernel_name = kernel_id.get_name();
+
+      /* NOTE(@nsirgien): Names in this conditions below should match names from
+       * oneapi_call macro in oneapi_enqueue_kernel below */
+      /* Also, here we handle only intersection kernels (and skip the rest) */
+      if (kernel_name.find("_intersect_") == std::string::npos) {
+        continue;
+      }
+
+      if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
+          kernel_name.find("_intersect_volume") != std::string::npos) {
+        continue;
+      }
+
+      sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
+          sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+
+      one_kernel_bundle.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
+          CYCLES_ONEAPI_EMBREE_BASIC_FEATURES);
+      sycl::build(one_kernel_bundle);
+
+      one_kernel_bundle.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
+          CYCLES_ONEAPI_EMBREE_ALL_FEATURES);
+      sycl::build(one_kernel_bundle);
+    }
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+#  endif
+
+#  ifdef SYCL_SKIP_KERNELS_PRELOAD
+  (void)queue_;
+  (void)kernel_features;
+#  else
   try {
     sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
         sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
@@ -195,6 +250,8 @@ bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
 bool oneapi_enqueue_kernel(KernelContext *kernel_context,
                            int kernel,
                            size_t global_size,
+                           const uint kernel_features,
+                           bool use_hardware_raytracing,
                            void **args)
 {
   bool success = true;
@@ -246,8 +303,24 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
 #    pragma GCC diagnostic error "-Wswitch"
 #  endif
 
+#  ifdef WITH_EMBREE_GPU
+  bool is_with_rthw_kernel = device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+                             device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+                             device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+                             device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK;
+  const RTCFeatureFlags used_embree_features = (is_with_rthw_kernel && with_hwrt &&
+                                                !with_curve_features) ?
+                                                   CYCLES_ONEAPI_EMBREE_BASIC_FEATURES :
+                                                   CYCLES_ONEAPI_EMBREE_ALL_FEATURES;
+#  endif
+
   try {
     queue->submit([&](sycl::handler &cgh) {
+#  ifdef WITH_EMBREE_GPU
+      if (is_with_rthw_kernel)
+        cgh.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
+            used_embree_features);
+#  endif
       switch (device_kernel) {
         case DEVICE_KERNEL_INTEGRATOR_RESET: {
           oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
@@ -549,4 +622,5 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
 #  endif
   return success;
 }
+
 #endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index 2bfc0b89c87..cef59c99c79 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -47,10 +47,14 @@ CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
                                                        int kernel,
                                                        size_t global_size,
+                                                       const unsigned int kernel_features,
+                                                       bool use_hardware_raytracing,
                                                        void **args);
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
-                                                     const unsigned int requested_features);
+                                                     const unsigned int kernel_features,
+                                                     bool use_hardware_raytracing);
 #  ifdef __cplusplus
 }
+
 #  endif
 #endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index a11f9f44920..58c3d95deb5 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -3,8 +3,9 @@
 
 #pragma once
 
-#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
-#  if EMBREE_MAJOR_VERSION >= 4
+#if (!defined(__KERNEL_GPU__) || (defined(__KERNEL_ONEAPI__) && defined(WITH_EMBREE_GPU))) && \
+    defined(WITH_EMBREE)
+#  if EMBREE_MAJOR_VERSION == 4
 #    include <embree4/rtcore.h>
 #    include <embree4/rtcore_scene.h>
 #  else
diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h
index 9e27997cf2c..6188ab8a57c 100644
--- a/intern/cycles/util/vector.h
+++ b/intern/cycles/util/vector.h
@@ -4,7 +4,6 @@
 #ifndef __UTIL_VECTOR_H__
 #define __UTIL_VECTOR_H__
 
-#include <cassert>
 #include <cstring>
 #include <vector>