diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 416d836ca84..063bb915325 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1683,7 +1683,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         elif device_type == 'ONEAPI':
             import sys
             if sys.platform.startswith("win"):
-                driver_version = "XX.X.101.5186"
+                driver_version = "XX.X.101.5518"
                 col.label(text=rpt_("Requires Intel GPU with Xe-HPG architecture"), icon='BLANK1', translate=False)
                 col.label(text=rpt_("and Windows driver version %s or newer") % driver_version,
                           icon='BLANK1', translate=False)
diff --git a/intern/cycles/bvh/embree.cpp b/intern/cycles/bvh/embree.cpp
index aad985d8624..67edaa6b5f9 100644
--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -178,6 +178,51 @@ void BVHEmbree::build(Progress &progress,
   rtcCommitScene(scene);
 }
 
+string BVHEmbree::get_last_error_message()
+{
+  const RTCError error_code = rtcGetDeviceError(rtc_device);
+  switch (error_code) {
+    case RTC_ERROR_NONE:
+      return "no error";
+    case RTC_ERROR_UNKNOWN:
+      return "unknown error";
+    case RTC_ERROR_INVALID_ARGUMENT:
+      return "invalid argument error";
+    case RTC_ERROR_INVALID_OPERATION:
+      return "invalid operation error";
+    case RTC_ERROR_OUT_OF_MEMORY:
+      return "out of memory error";
+    case RTC_ERROR_UNSUPPORTED_CPU:
+      return "unsupported cpu error";
+    case RTC_ERROR_CANCELLED:
+      return "cancelled";
+    default:
+      /* We should never end up here unless the enum of RTC errors changes. */
+      return "unknown error";
+  }
+}
+
+#  if WITH_EMBREE_GPU && RTC_VERSION >= 40302
+bool BVHEmbree::offload_scenes_to_gpu(const vector<RTCScene> &scenes)
+{
+  /* Having BVH on GPU is more performance-critical than texture data.
+   * In order to ensure good performance even when running out of GPU
+   * memory, we force BVH to migrate to GPU before allocating other textures
+   * that may not fit. */
+  for (const RTCScene &embree_scene : scenes) {
+    RTCSceneFlags scene_flags = rtcGetSceneFlags(embree_scene);
+    scene_flags = scene_flags | RTC_SCENE_FLAG_PREFETCH_USM_SHARED_ON_GPU;
+    rtcSetSceneFlags(embree_scene, scene_flags);
+    rtcCommitScene(embree_scene);
+    /* In case of any errors from Embree, we should stop
+     * the execution and propagate the error. */
+    if (rtcGetDeviceError(rtc_device) != RTC_ERROR_NONE)
+      return false;
+  }
+  return true;
+}
+#  endif
+
 void BVHEmbree::add_object(Object *ob, int i)
 {
   Geometry *geom = ob->get_geometry();
diff --git a/intern/cycles/bvh/embree.h b/intern/cycles/bvh/embree.h
index 4be4115a73c..09582491c84 100644
--- a/intern/cycles/bvh/embree.h
+++ b/intern/cycles/bvh/embree.h
@@ -18,6 +18,7 @@
 
 #  include "bvh/bvh.h"
 #  include "bvh/params.h"
+#  include "util/string.h"
 #  include "util/thread.h"
 #  include "util/types.h"
 #  include "util/vector.h"
@@ -36,6 +37,12 @@ class BVHEmbree : public BVH {
             const bool isSyclEmbreeDevice = false);
   void refit(Progress &progress);
 
+#  if WITH_EMBREE_GPU && RTC_VERSION >= 40302
+  bool offload_scenes_to_gpu(const vector<RTCScene> &scenes);
+#  endif
+
+  string get_last_error_message();
+
   RTCScene scene;
 
  protected:
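For reviewers unfamiliar with the Embree 4.3 SYCL flags: below is a minimal standalone sketch (not part of the patch) of the pattern `BVHEmbree::offload_scenes_to_gpu()` uses above — setting `RTC_SCENE_FLAG_PREFETCH_USM_SHARED_ON_GPU`, re-committing the scene so the flag takes effect, and then checking the device error state.

```cpp
/* Illustrative only: mirrors the flag/commit/error-check sequence used by
 * offload_scenes_to_gpu(); error handling and device setup are simplified. */
#include <embree4/rtcore.h>

static bool prefetch_scene_to_gpu(RTCDevice device, RTCScene scene)
{
  /* Request that the committed BVH is migrated to GPU-resident USM memory. */
  const RTCSceneFlags flags = rtcGetSceneFlags(scene);
  rtcSetSceneFlags(scene, flags | RTC_SCENE_FLAG_PREFETCH_USM_SHARED_ON_GPU);

  /* The flag only takes effect on the next commit. */
  rtcCommitScene(scene);

  /* Embree reports failures (e.g. running out of GPU memory) via the device error state. */
  return rtcGetDeviceError(device) == RTC_ERROR_NONE;
}
```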
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index 7d27f457548..87cbd0c56c6 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -257,6 +257,7 @@ class device_memory {
   friend class OptiXDevice;
   friend class HIPDevice;
   friend class MetalDevice;
+  friend class OneapiDevice;
 
   /* Only create through subclasses. */
   device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index f7bffea3b78..59b1b7682b3 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -11,6 +11,7 @@
 #  include "device/oneapi/device_impl.h"
 
 #  include "util/debug.h"
+#  include "util/foreach.h"
 #  include "util/log.h"
 
 #  ifdef WITH_EMBREE_GPU
@@ -47,18 +48,20 @@ static void queue_error_cb(const char *message, void *user_ptr)
 }
 
 OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler),
+    : GPUDevice(info, stats, profiler),
       device_queue_(nullptr),
 #  ifdef WITH_EMBREE_GPU
       embree_device(nullptr),
       embree_scene(nullptr),
 #  endif
-      texture_info_(this, "texture_info", MEM_GLOBAL),
       kg_memory_(nullptr),
       kg_memory_device_(nullptr),
       kg_memory_size_(0)
 {
-  need_texture_info_ = false;
+  /* Verify that base class types can be used with specific backend types. */
+  static_assert(sizeof(texMemObject) == sizeof(void *));
+  static_assert(sizeof(arrayMemObject) == sizeof(void *));
+
   use_hardware_raytracing = info.use_hardware_raytracing;
 
   oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
@@ -110,6 +113,18 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
   kg_memory_size_ = globals_segment_size;
 
   max_memory_on_device_ = get_memcapacity();
+  init_host_memory();
+  move_texture_to_host = false;
+  can_map_host = true;
+
+  const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
+  if (headroom_str != nullptr) {
+    const long long override_headroom = (float)atoll(headroom_str);
+    device_working_headroom = override_headroom;
+    device_texture_headroom = override_headroom;
+  }
+  VLOG_DEBUG << "oneAPI memory headroom size: "
+             << string_human_readable_size(device_working_headroom);
 }
 
 OneapiDevice::~OneapiDevice()
@@ -119,7 +134,7 @@ OneapiDevice::~OneapiDevice()
     rtcReleaseDevice(embree_device);
 #  endif
 
-  texture_info_.free();
+  texture_info.free();
   usm_free(device_queue_, kg_memory_);
   usm_free(device_queue_, kg_memory_device_);
@@ -166,8 +181,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
     else {
       bvh_embree->build(progress, &stats, embree_device, true);
     }
+
+#    if RTC_VERSION >= 40302
+    thread_scoped_lock lock(scene_data_mutex);
+    all_embree_scenes.push_back(bvh_embree->scene);
+#    endif
+
     if (bvh->params.top_level) {
       embree_scene = bvh_embree->scene;
+#    if RTC_VERSION >= 40302
+      if (bvh_embree->offload_scenes_to_gpu(all_embree_scenes) == false) {
+        set_error(
+            string_printf("BVH failed to migrate to the GPU due to Embree library error (%s)",
+                          bvh_embree->get_last_error_message().c_str()));
+      }
+      all_embree_scenes.clear();
+#    endif
     }
   }
   else {
@@ -176,6 +205,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 }
 #  endif
 
+size_t OneapiDevice::get_free_mem() const
+{
+  /* Accurate: Use device info. */
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
+  if (device.has(sycl::aspect::ext_intel_free_memory)) {
+    return device.get_info<sycl::ext::intel::info::device::free_memory>();
+  }
+  /* Estimate: Capacity - in use. */
+  else if (device_mem_in_use < max_memory_on_device_) {
+    return max_memory_on_device_ - device_mem_in_use;
+  }
+  else {
+    return 0;
+  }
+}
+
 bool OneapiDevice::load_kernels(const uint requested_features)
 {
   assert(device_queue_);
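For reference, the free-memory query that `get_free_mem()` performs above can be exercised in a standalone SYCL program; `ext_intel_free_memory` is an Intel extension that is only reported when the driver exposes it, hence the capacity-based fallback.

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main()
{
  sycl::queue q{sycl::gpu_selector_v};
  const sycl::device dev = q.get_device();

  if (dev.has(sycl::aspect::ext_intel_free_memory)) {
    /* Accurate value straight from the driver. */
    const size_t free_bytes = dev.get_info<sycl::ext::intel::info::device::free_memory>();
    std::cout << "free device memory: " << free_bytes << " bytes\n";
  }
  else {
    /* Fallback: only the total capacity is known, as in the estimate branch above. */
    const size_t total_bytes = dev.get_info<sycl::info::device::global_mem_size>();
    std::cout << "free memory not reported, device capacity: " << total_bytes << " bytes\n";
  }
  return 0;
}
```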
@@ -208,63 +253,101 @@ bool OneapiDevice::load_kernels(const uint requested_features)
     VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
   }
 
+  if (is_finished_ok) {
+    reserve_private_memory(requested_features);
+    is_finished_ok = !have_error();
+  }
+
   return is_finished_ok;
 }
 
-void OneapiDevice::load_texture_info()
+void OneapiDevice::reserve_private_memory(const uint kernel_features)
 {
-  if (need_texture_info_) {
-    need_texture_info_ = false;
-    texture_info_.copy_to_device();
+  size_t free_before = get_free_mem();
+
+  /* Use the biggest kernel for estimation. */
+  const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
+                                   (kernel_features & KERNEL_FEATURE_MNEE) ?
+                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
+                                       DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
+
+  {
+    unique_ptr<DeviceQueue> queue = gpu_queue_create();
+
+    device_ptr d_path_index = 0;
+    device_ptr d_render_buffer = 0;
+    int d_work_size = 0;
+    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
+
+    queue->init_execution();
+    /* Launch of the kernel seems to be sufficient to reserve all
+     * needed memory regardless of the execution global size.
+     * So, the smallest possible size is used here. */
+    queue->enqueue(test_kernel, 1, args);
+    queue->synchronize();
   }
+
+  size_t free_after = get_free_mem();
+
+  VLOG_INFO << "Reserved " << string_human_readable_number(free_before - free_after)
+            << " bytes for kernel execution. ("
+            << string_human_readable_size(free_before - free_after) << ")";
 }
 
-void OneapiDevice::generic_alloc(device_memory &mem)
+void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
 {
-  size_t memory_size = mem.memory_size();
-
-  /* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then
-   * we can use USM host memory.
-   * Because of the expected performance impact, implementation of this has had a low priority
-   * and is not implemented yet. */
-
-  assert(device_queue_);
-  /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
-   * and shared. For new project it maybe more beneficial to use USM shared memory, because it
-   * provides automatic migration mechanism in order to allow to use the same pointer on host and
-   * on device, without need to worry about explicit memory transfer operations. But for
-   * Blender/Cycles this type of memory is not very suitable in current application architecture,
-   * because Cycles already uses two different pointer for host activity and device activity, and
-   * also has to perform all needed memory transfer operations. So, USM device memory
-   * type has been used for oneAPI device in order to better fit in Cycles architecture. */
-  void *device_pointer = nullptr;
-  if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
-    device_pointer = usm_alloc_device(device_queue_, memory_size);
-  if (device_pointer == nullptr) {
-    set_error("oneAPI kernel - device memory allocation error for " +
-              string_human_readable_size(mem.memory_size()) +
-              ", possibly caused by lack of available memory space on the device: " +
-              string_human_readable_size(stats.mem_used) + " of " +
-              string_human_readable_size(max_memory_on_device_) + " is already allocated");
-  }
-
-  mem.device_pointer = reinterpret_cast<device_ptr>(device_pointer);
-  mem.device_size = memory_size;
-
-  stats.mem_alloc(memory_size);
+  free = get_free_mem();
+  total = max_memory_on_device_;
 }
 
-void OneapiDevice::generic_copy_to(device_memory &mem)
+bool OneapiDevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.device_pointer) {
-    return;
-  }
-  size_t memory_size = mem.memory_size();
+  bool allocation_success = false;
+  device_pointer = usm_alloc_device(device_queue_, size);
+  if (device_pointer != nullptr) {
+    allocation_success = true;
+    /* Due to lazy memory initialisation in the GPU runtime, we force the memory to
+     * appear in device memory by executing a kernel that uses this memory. */
+    if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
+      set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
+                "\"");
+      usm_free(device_queue_, device_pointer);
 
-  /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
-  assert(mem.host_pointer);
-  assert(device_queue_);
-  usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
+      device_pointer = nullptr;
+      allocation_success = false;
+    }
+  }
+
+  return allocation_success;
+}
+
+void OneapiDevice::free_device(void *device_pointer)
+{
+  usm_free(device_queue_, device_pointer);
+}
+
+bool OneapiDevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
+  return shared_pointer != nullptr;
+}
+
+void OneapiDevice::free_host(void *shared_pointer)
+{
+  usm_free(device_queue_, shared_pointer);
+}
+
+void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  /* Device and host pointer are in the same address space
+   * as we're using Unified Shared Memory. */
+  device_pointer = shared_pointer;
+}
+
+void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  usm_memcpy(device_queue_, device_pointer, host_pointer, size);
 }
 
 /* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
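Since the generic allocation path now lives in the `GPUDevice` base class, here is an illustrative sketch of the order in which a base class like that can drive the hooks overridden above (device memory first, host/USM fallback second). The `BackendHooks` interface and `generic_alloc_sketch()` below are hypothetical names used only for illustration; the real Cycles `GPUDevice` logic (headroom bookkeeping, moving textures to host, statistics) is more involved.

```cpp
#include <cstddef>

/* Hypothetical, reduced version of the backend interface; see the overrides above
 * for the actual oneAPI implementations of each hook. */
struct BackendHooks {
  virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
  virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
  virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
  virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
  virtual ~BackendHooks() = default;
};

static bool generic_alloc_sketch(BackendHooks &backend, void *&device_pointer, size_t size)
{
  size_t total = 0, free = 0;
  backend.get_device_memory_info(total, free);

  /* Prefer device-local memory while the allocation still fits. */
  if (size < free && backend.alloc_device(device_pointer, size)) {
    return true;
  }

  /* Otherwise fall back to host-visible memory the GPU can still address;
   * with USM the "device" pointer is simply the shared pointer. */
  void *shared_pointer = nullptr;
  if (backend.alloc_host(shared_pointer, size)) {
    backend.transform_host_pointer(device_pointer, shared_pointer);
    return true;
  }
  return false;
}
```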
@@ -288,20 +371,6 @@ void *OneapiDevice::kernel_globals_device_pointer()
   return kg_memory_device_;
 }
 
-void OneapiDevice::generic_free(device_memory &mem)
-{
-  if (!mem.device_pointer) {
-    return;
-  }
-
-  stats.mem_free(mem.device_size);
-  mem.device_size = 0;
-
-  assert(device_queue_);
-  usm_free(device_queue_, (void *)mem.device_pointer);
-  mem.device_pointer = 0;
-}
-
 void OneapiDevice::mem_alloc(device_memory &mem)
 {
   if (mem.type == MEM_TEXTURE) {
@@ -344,7 +413,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
   }
   else {
     if (!mem.device_pointer)
-      mem_alloc(mem);
+      generic_alloc(mem);
 
     generic_copy_to(mem);
   }
@@ -515,14 +584,14 @@ void OneapiDevice::tex_alloc(device_texture &mem)
 
   /* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
   const uint slot = mem.slot;
-  if (slot >= texture_info_.size()) {
-    texture_info_.resize(slot + 128);
+  if (slot >= texture_info.size()) {
+    texture_info.resize(slot + 128);
   }
 
-  texture_info_[slot] = mem.info;
-  need_texture_info_ = true;
+  texture_info[slot] = mem.info;
+  need_texture_info = true;
 
-  texture_info_[slot].data = (uint64_t)mem.device_pointer;
+  texture_info[slot].data = (uint64_t)mem.device_pointer;
 }
 
 void OneapiDevice::tex_free(device_texture &mem)
@@ -628,6 +697,16 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
 {
   assert(queue_);
   sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
+   * and shared. For a new project it could be more beneficial to use USM shared memory, because
+   * it provides an automatic migration mechanism that allows using the same pointer on host and
+   * on device, without the need to worry about explicit memory transfer operations, although USM
+   * shared memory implies some documented limitations on memory usage with regard to parallel
+   * access from different threads. But for Blender/Cycles this type of memory is not very
+   * suitable in the current application architecture, because Cycles is a multithreaded
+   * application that already uses two different pointers for host activity and device activity,
+   * and also has to perform all needed memory transfer operations. So, the USM device memory
+   * type has been used for the oneAPI device in order to better fit the Cycles architecture. */
 #  ifndef WITH_ONEAPI_SYCL_HOST_TASK
   return sycl::malloc_device(memory_size, *queue);
 #  else
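As a companion to the NOTE added above, a small standalone SYCL sketch contrasting the two USM flavours it discusses: device allocations with explicit copies (the model Cycles keeps using) versus shared allocations with runtime-managed migration.

```cpp
#include <sycl/sycl.hpp>

int main()
{
  sycl::queue q{sycl::gpu_selector_v};
  constexpr size_t n = 1024;

  /* USM device memory: only dereferenceable on the device; host<->device transfers
   * must be issued explicitly, which matches the existing Cycles memory model. */
  float *device_buf = sycl::malloc_device<float>(n, q);
  float host_buf[n] = {};
  q.memcpy(device_buf, host_buf, n * sizeof(float)).wait();

  /* USM shared memory: one pointer usable on both host and device, with migration
   * handled by the runtime, but with the concurrent-access caveats the NOTE mentions. */
  float *shared_buf = sycl::malloc_shared<float>(n, q);
  shared_buf[0] = 1.0f; /* Directly usable on the host. */

  sycl::free(device_buf, q);
  sycl::free(shared_buf, q);
  return 0;
}
```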
@@ -646,9 +725,26 @@ void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
 bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
 {
   assert(queue_);
+  /* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
+   * runtime errors. It's better to avoid running memory operations in that case.
+   * The render will be canceled and the queue will be destroyed anyway. */
+  if (have_error())
+    return false;
+
   sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
   OneapiDevice::check_usm(queue_, dest, true);
   OneapiDevice::check_usm(queue_, src, true);
+  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
+  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
+  /* Unknown here means that this is not a USM allocation, which implies that this is
+   * some generic C++ allocation, so we can use C++ memcpy directly with USM host memory. */
+  if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
+      (src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
+  {
+    memcpy(dest, src, num_bytes);
+    return true;
+  }
+
   try {
     sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
 #  ifdef WITH_CYCLES_DEBUG
@@ -658,8 +754,6 @@ bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t n
     mem_event.wait_and_throw();
     return true;
 #  else
-    sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
-    sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
     bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
                                src_type == sycl::usm::alloc::device;
     bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
@@ -684,6 +778,12 @@ bool OneapiDevice::usm_memset(SyclQueue *queue_, size_t num_bytes)
 {
   assert(queue_);
+  /* sycl::queue::memset may crash if the queue is in an invalid state due to previous
+   * runtime errors. It's better to avoid running memory operations in that case.
+   * The render will be canceled and the queue will be destroyed anyway. */
+  if (have_error())
+    return false;
+
   sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
   OneapiDevice::check_usm(queue_, usm_ptr, true);
   try {
@@ -735,7 +835,7 @@ void OneapiDevice::set_global_memory(SyclQueue *queue_,
   assert(memory_name);
   assert(memory_device_pointer);
   KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
-  OneapiDevice::check_usm(queue_, memory_device_pointer);
+  OneapiDevice::check_usm(queue_, memory_device_pointer, true);
   OneapiDevice::check_usm(queue_, kernel_globals, true);
 
   std::string matched_name(memory_name);
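The host-to-host shortcut added to `usm_memcpy()` above relies on `sycl::get_pointer_type()`; a standalone sketch of that classification pattern:

```cpp
#include <sycl/sycl.hpp>
#include <cstring>

/* Copy helper (illustrative): route host-only copies through plain memcpy and
 * everything else through the SYCL runtime, as usm_memcpy() now does. */
static void copy_bytes(sycl::queue &q, void *dest, const void *src, size_t num_bytes)
{
  const sycl::usm::alloc dest_type = sycl::get_pointer_type(dest, q.get_context());
  const sycl::usm::alloc src_type = sycl::get_pointer_type(src, q.get_context());

  /* "unknown" means the pointer is not a USM allocation, i.e. an ordinary C++ allocation. */
  const bool dest_on_host = dest_type == sycl::usm::alloc::host ||
                            dest_type == sycl::usm::alloc::unknown;
  const bool src_on_host = src_type == sycl::usm::alloc::host ||
                           src_type == sycl::usm::alloc::unknown;

  if (dest_on_host && src_on_host) {
    memcpy(dest, src, num_bytes);
    return;
  }
  q.memcpy(dest, src, num_bytes).wait();
}
```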
@@ -874,11 +974,11 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
 
 /* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
  * since Windows driver 101.3268. */
-static const int lowest_supported_driver_version_win = 1015186;
+static const int lowest_supported_driver_version_win = 1015518;
 #  ifdef _WIN32
-/* For Windows driver 101.5186, compute-runtime version is 28044.
+/* For Windows driver 101.5518, compute-runtime version is 29283.
  * This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
-static const int lowest_supported_driver_version_neo = 28044;
+static const int lowest_supported_driver_version_neo = 29283;
 #  else
 static const int lowest_supported_driver_version_neo = 27642;
 #  endif
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index 11d5acf6a5d..86408a4a5de 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -21,17 +21,19 @@ typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
                                              bool oidn_support,
                                              void *user_ptr);
 
-class OneapiDevice : public Device {
+class OneapiDevice : public GPUDevice {
  private:
   SyclQueue *device_queue_;
 #  ifdef WITH_EMBREE_GPU
   RTCDevice embree_device;
   RTCScene embree_scene;
+#    if RTC_VERSION >= 40302
+  thread_mutex scene_data_mutex;
+  vector<RTCScene> all_embree_scenes;
+#    endif
 #  endif
   using ConstMemMap = map<string, device_vector<uchar> *>;
   ConstMemMap const_mem_map_;
-  device_vector<TextureInfo> texture_info_;
-  bool need_texture_info_;
   void *kg_memory_;
   void *kg_memory_device_;
   size_t kg_memory_size_ = (size_t)0;
@@ -41,6 +43,8 @@ class OneapiDevice : public GPUDevice {
   unsigned int kernel_features = 0;
   int scene_max_shaders_ = 0;
 
+  size_t get_free_mem() const;
+
  public:
   virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
 
@@ -54,13 +58,15 @@ class OneapiDevice : public GPUDevice {
 
   bool load_kernels(const uint kernel_features) override;
 
-  void load_texture_info();
+  void reserve_private_memory(const uint kernel_features);
 
-  void generic_alloc(device_memory &mem);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
 
   string oneapi_error_message();
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index d27aebd8006..b8dbd82fb2f 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -133,6 +133,26 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
   return is_computation_correct;
 }
 
+bool oneapi_zero_memory_on_device(SyclQueue *queue_, void *device_pointer, size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  try {
+    queue->submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(num_bytes,
+                       [=](sycl::id<1> idx) { ((char *)device_pointer)[idx.get(0)] = (char)0; });
+    });
+    queue->wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+}
+
 bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
                                             const uint kernel_features)
 {
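`oneapi_zero_memory_on_device()` above zeroes the allocation with a real kernel rather than `sycl::queue::memset()`, so that (per the comment in `alloc_device()`) the lazily initialised allocation actually becomes resident in device memory. For comparison, a sketch of the simpler memset-based variant:

```cpp
#include <sycl/sycl.hpp>

/* Illustrative alternative only; the patch deliberately uses a kernel instead. */
static bool zero_with_memset(sycl::queue &q, void *device_pointer, size_t num_bytes)
{
  try {
    q.memset(device_pointer, 0, num_bytes).wait_and_throw();
    return true;
  }
  catch (const sycl::exception &) {
    /* The caller is expected to free the allocation and report the error. */
    return false;
  }
}
```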
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index 49f2dc7d0b1..387b62bffe8 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -44,6 +44,9 @@ extern "C" {
 #  endif
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_zero_memory_on_device(SyclQueue *queue_,
+                                                              void *device_pointer,
+                                                              size_t num_bytes);
 CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
 CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel);
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,