Cycles: oneAPI: Enable host memory migration

This enables scenes whose textures do not all fit in GPU
memory to render at all. For scenes that do fit, no functional
or performance change is expected.
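The migration relies on SYCL Unified Shared Memory (USM): buffers that do not
fit in device memory can be placed in host memory that the GPU is still able
to address, at reduced bandwidth. A minimal sketch of the fallback idea as a
standalone SYCL snippet (alloc_with_host_fallback is a hypothetical helper for
illustration, not Cycles code):

#include <sycl/sycl.hpp>

/* Try device USM first; fall back to host USM that the GPU can still access. */
void *alloc_with_host_fallback(sycl::queue &q, size_t size)
{
  void *ptr = sycl::malloc_device(size, q);
  if (ptr == nullptr) {
    /* Host USM keeps rendering possible when device memory is exhausted. */
    ptr = sycl::malloc_host(size, q);
  }
  return ptr;
}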

Pull Request: https://projects.blender.org/blender/blender/pulls/122385
Nikita Sirgienko 2024-05-28 19:04:19 +02:00 committed by Nikita Sirgienko
parent b210d56857
commit 759bb6c768
8 changed files with 267 additions and 85 deletions

intern/cycles/blender/addon/properties.py
@@ -1683,7 +1683,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif device_type == 'ONEAPI':
import sys
if sys.platform.startswith("win"):
-driver_version = "XX.X.101.5186"
+driver_version = "XX.X.101.5518"
col.label(text=rpt_("Requires Intel GPU with Xe-HPG architecture"), icon='BLANK1', translate=False)
col.label(text=rpt_("and Windows driver version %s or newer") % driver_version,
icon='BLANK1', translate=False)

intern/cycles/bvh/embree.cpp
@@ -178,6 +178,51 @@ void BVHEmbree::build(Progress &progress,
rtcCommitScene(scene);
}
string BVHEmbree::get_last_error_message()
{
const RTCError error_code = rtcGetDeviceError(rtc_device);
switch (error_code) {
case RTC_ERROR_NONE:
return "no error";
case RTC_ERROR_UNKNOWN:
return "unknown error";
case RTC_ERROR_INVALID_ARGUMENT:
return "invalid argument error";
case RTC_ERROR_INVALID_OPERATION:
return "invalid operation error";
case RTC_ERROR_OUT_OF_MEMORY:
return "out of memory error";
case RTC_ERROR_UNSUPPORTED_CPU:
return "unsupported cpu error";
case RTC_ERROR_CANCELLED:
return "cancelled";
default:
/* We should never end up here unless the enum of RTC errors changes. */
return "unknown error";
}
}
# if WITH_EMBREE_GPU && RTC_VERSION >= 40302
bool BVHEmbree::offload_scenes_to_gpu(const vector<RTCScene> &scenes)
{
/* Having the BVH on the GPU is more performance-critical than texture data.
* To ensure good performance even when GPU memory runs low, we force the
* BVH to migrate to the GPU before allocating other textures that may
* not fit. */
for (const RTCScene &embree_scene : scenes) {
RTCSceneFlags scene_flags = rtcGetSceneFlags(embree_scene);
scene_flags = scene_flags | RTC_SCENE_FLAG_PREFETCH_USM_SHARED_ON_GPU;
rtcSetSceneFlags(embree_scene, scene_flags);
rtcCommitScene(embree_scene);
/* In case of any errors from Embree, we should stop
* the execution and propagate the error. */
if (rtcGetDeviceError(rtc_device) != RTC_ERROR_NONE)
return false;
}
return true;
}
# endif
void BVHEmbree::add_object(Object *ob, int i)
{
Geometry *geom = ob->get_geometry();

intern/cycles/bvh/embree.h
@@ -18,6 +18,7 @@
# include "bvh/bvh.h"
# include "bvh/params.h"
# include "util/string.h"
# include "util/thread.h"
# include "util/types.h"
# include "util/vector.h"
@@ -36,6 +37,12 @@ class BVHEmbree : public BVH {
const bool isSyclEmbreeDevice = false);
void refit(Progress &progress);
# if WITH_EMBREE_GPU && RTC_VERSION >= 40302
bool offload_scenes_to_gpu(const vector<RTCScene> &scenes);
# endif
string get_last_error_message();
RTCScene scene;
protected:

intern/cycles/device/memory.h
@@ -257,6 +257,7 @@ class device_memory {
friend class OptiXDevice;
friend class HIPDevice;
friend class MetalDevice;
friend class OneapiDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);

intern/cycles/device/oneapi/device_impl.cpp
@@ -11,6 +11,7 @@
# include "device/oneapi/device_impl.h"
# include "util/debug.h"
# include "util/foreach.h"
# include "util/log.h"
# ifdef WITH_EMBREE_GPU
@@ -47,18 +48,20 @@ static void queue_error_cb(const char *message, void *user_ptr)
}
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-: Device(info, stats, profiler),
+: GPUDevice(info, stats, profiler),
device_queue_(nullptr),
# ifdef WITH_EMBREE_GPU
embree_device(nullptr),
embree_scene(nullptr),
# endif
-texture_info_(this, "texture_info", MEM_GLOBAL),
kg_memory_(nullptr),
kg_memory_device_(nullptr),
kg_memory_size_(0)
{
-need_texture_info_ = false;
+/* Verify that base class types can be used with specific backend types. */
+static_assert(sizeof(texMemObject) == sizeof(void *));
+static_assert(sizeof(arrayMemObject) == sizeof(void *));
use_hardware_raytracing = info.use_hardware_raytracing;
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
@@ -110,6 +113,18 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
kg_memory_size_ = globals_segment_size;
max_memory_on_device_ = get_memcapacity();
init_host_memory();
move_texture_to_host = false;
can_map_host = true;
const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
if (headroom_str != nullptr) {
const long long override_headroom = atoll(headroom_str);
device_working_headroom = override_headroom;
device_texture_headroom = override_headroom;
}
VLOG_DEBUG << "oneAPI memory headroom size: "
<< string_human_readable_size(device_working_headroom);
}
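Note on the override above: the default working and texture headrooms come
from the GPUDevice base class (set up around init_host_memory()), and the
CYCLES_ONEAPI_MEMORY_HEADROOM environment variable applies a single byte count
to both. For example, CYCLES_ONEAPI_MEMORY_HEADROOM=402653184 would reserve
384 MiB (an illustrative value, not a recommendation).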
OneapiDevice::~OneapiDevice()
@@ -119,7 +134,7 @@ OneapiDevice::~OneapiDevice()
rtcReleaseDevice(embree_device);
# endif
-texture_info_.free();
+texture_info.free();
usm_free(device_queue_, kg_memory_);
usm_free(device_queue_, kg_memory_device_);
@@ -166,8 +181,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
else {
bvh_embree->build(progress, &stats, embree_device, true);
}
# if RTC_VERSION >= 40302
thread_scoped_lock lock(scene_data_mutex);
all_embree_scenes.push_back(bvh_embree->scene);
# endif
if (bvh->params.top_level) {
embree_scene = bvh_embree->scene;
# if RTC_VERSION >= 40302
if (!bvh_embree->offload_scenes_to_gpu(all_embree_scenes)) {
set_error(
string_printf("BVH failed to migrate to the GPU due to Embree library error (%s)",
bvh_embree->get_last_error_message().c_str()));
}
all_embree_scenes.clear();
# endif
}
}
else {
@@ -176,6 +205,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
# endif
size_t OneapiDevice::get_free_mem() const
{
/* Accurate: Use device info. */
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_free_memory)) {
return device.get_info<sycl::ext::intel::info::device::free_memory>();
}
/* Estimate: Capacity - in use. */
else if (device_mem_in_use < max_memory_on_device_) {
return max_memory_on_device_ - device_mem_in_use;
}
else {
return 0;
}
}
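The ext_intel_free_memory aspect used by get_free_mem() can be probed outside
Cycles as well; a minimal standalone sketch (assumes an Intel GPU whose driver
exposes the extension; on Linux the query may additionally require
ZES_ENABLE_SYSMAN=1 in the environment):

#include <sycl/sycl.hpp>
#include <iostream>

int main()
{
  sycl::queue q{sycl::gpu_selector_v};
  const sycl::device &dev = q.get_device();
  /* Guard with has(), exactly as get_free_mem() above does. */
  if (dev.has(sycl::aspect::ext_intel_free_memory)) {
    std::cout << dev.get_info<sycl::ext::intel::info::device::free_memory>()
              << " bytes free on " << dev.get_info<sycl::info::device::name>()
              << "\n";
  }
  else {
    std::cout << "free_memory query not supported by this driver\n";
  }
  return 0;
}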
bool OneapiDevice::load_kernels(const uint requested_features)
{
assert(device_queue_);
@@ -208,63 +253,101 @@ bool OneapiDevice::load_kernels(const uint requested_features)
VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
}
+if (is_finished_ok) {
+reserve_private_memory(requested_features);
+is_finished_ok = !have_error();
+}
return is_finished_ok;
}
-void OneapiDevice::load_texture_info()
+void OneapiDevice::reserve_private_memory(const uint kernel_features)
{
-if (need_texture_info_) {
-need_texture_info_ = false;
-texture_info_.copy_to_device();
+size_t free_before = get_free_mem();
+/* Use the biggest kernel for estimation. */
+const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
+(kernel_features & KERNEL_FEATURE_MNEE) ?
+DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
+DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
+{
+unique_ptr<DeviceQueue> queue = gpu_queue_create();
+device_ptr d_path_index = 0;
+device_ptr d_render_buffer = 0;
+int d_work_size = 0;
+DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
+queue->init_execution();
+/* Launching the kernel appears to be sufficient to reserve all needed
+* memory, regardless of the global execution size, so the smallest
+* possible size is used here. */
+queue->enqueue(test_kernel, 1, args);
+queue->synchronize();
+}
+size_t free_after = get_free_mem();
+VLOG_INFO << "Reserved " << string_human_readable_number(free_before - free_after)
+<< " bytes for kernel execution ("
+<< string_human_readable_size(free_before - free_after) << ")";
}
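A note on the measurement above: GPU runtimes typically allocate a kernel's
private and scratch memory at first launch and keep it resident afterwards, so
the free-memory delta around a single one-work-item launch of the largest
kernel approximates the permanent reservation. This follows the commit's own
stated assumption ("seems to be sufficient"); it is not a driver guarantee.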
-void OneapiDevice::generic_alloc(device_memory &mem)
+void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
{
-size_t memory_size = mem.memory_size();
-/* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then
-* we can use USM host memory.
-* Because of the expected performance impact, implementation of this has had a low priority
-* and is not implemented yet. */
-assert(device_queue_);
-/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
-* and shared. For new project it maybe more beneficial to use USM shared memory, because it
-* provides automatic migration mechanism in order to allow to use the same pointer on host and
-* on device, without need to worry about explicit memory transfer operations. But for
-* Blender/Cycles this type of memory is not very suitable in current application architecture,
-* because Cycles already uses two different pointer for host activity and device activity, and
-* also has to perform all needed memory transfer operations. So, USM device memory
-* type has been used for oneAPI device in order to better fit in Cycles architecture. */
-void *device_pointer = nullptr;
-if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
-device_pointer = usm_alloc_device(device_queue_, memory_size);
-if (device_pointer == nullptr) {
-set_error("oneAPI kernel - device memory allocation error for " +
-string_human_readable_size(mem.memory_size()) +
-", possibly caused by lack of available memory space on the device: " +
-string_human_readable_size(stats.mem_used) + " of " +
-string_human_readable_size(max_memory_on_device_) + " is already allocated");
-}
-mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer);
-mem.device_size = memory_size;
-stats.mem_alloc(memory_size);
+free = get_free_mem();
+total = max_memory_on_device_;
}
-void OneapiDevice::generic_copy_to(device_memory &mem)
+bool OneapiDevice::alloc_device(void *&device_pointer, size_t size)
{
-if (!mem.device_pointer) {
-return;
-}
-size_t memory_size = mem.memory_size();
+bool allocation_success = false;
+device_pointer = usm_alloc_device(device_queue_, size);
+if (device_pointer != nullptr) {
+allocation_success = true;
+/* Due to lazy memory initialisation in the GPU runtime, we force the memory
+* to appear in device memory by executing a kernel that uses it. */
+if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
+set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
+"\"");
+usm_free(device_queue_, device_pointer);
-/* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
-assert(mem.host_pointer);
-assert(device_queue_);
-usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
+device_pointer = nullptr;
+allocation_success = false;
+}
+}
+return allocation_success;
}
+void OneapiDevice::free_device(void *device_pointer)
+{
+usm_free(device_queue_, device_pointer);
+}
+bool OneapiDevice::alloc_host(void *&shared_pointer, size_t size)
+{
+shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
+return shared_pointer != nullptr;
+}
+void OneapiDevice::free_host(void *shared_pointer)
+{
+usm_free(device_queue_, shared_pointer);
+}
+void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+/* Device and host pointers are in the same address space,
+* as we're using Unified Shared Memory. */
+device_pointer = shared_pointer;
+}
+void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+usm_memcpy(device_queue_, device_pointer, host_pointer, size);
+}
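With these hooks in place, the shared GPUDevice logic can fall back to host
memory when a device allocation fails. A simplified sketch of that caller side
(BackendHooks and place_buffer are hypothetical names for illustration; the
real logic lives in the GPUDevice base class):

#include <cstddef>

/* Hypothetical subset of the hook interface implemented above. */
struct BackendHooks {
  virtual bool alloc_device(void *&device_ptr, size_t size) = 0;
  virtual bool alloc_host(void *&host_ptr, size_t size) = 0;
  virtual void transform_host_pointer(void *&device_ptr, void *&host_ptr) = 0;
  virtual ~BackendHooks() = default;
};

/* Try device memory first; on failure, fall back to device-visible host memory. */
bool place_buffer(BackendHooks &dev, void *&device_ptr, size_t size)
{
  if (dev.alloc_device(device_ptr, size)) {
    return true; /* Buffer fits in device memory. */
  }
  void *host_ptr = nullptr;
  if (!dev.alloc_host(host_ptr, size)) {
    return false; /* Out of both device and host memory. */
  }
  /* With USM, the host pointer is directly usable on the device. */
  dev.transform_host_pointer(device_ptr, host_ptr);
  return true;
}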
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
@@ -288,20 +371,6 @@ void *OneapiDevice::kernel_globals_device_pointer()
return kg_memory_device_;
}
-void OneapiDevice::generic_free(device_memory &mem)
-{
-if (!mem.device_pointer) {
-return;
-}
-stats.mem_free(mem.device_size);
-mem.device_size = 0;
-assert(device_queue_);
-usm_free(device_queue_, (void *)mem.device_pointer);
-mem.device_pointer = 0;
-}
void OneapiDevice::mem_alloc(device_memory &mem)
{
if (mem.type == MEM_TEXTURE) {
@@ -344,7 +413,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
}
else {
if (!mem.device_pointer)
-mem_alloc(mem);
+generic_alloc(mem);
generic_copy_to(mem);
}
@@ -515,14 +584,14 @@ void OneapiDevice::tex_alloc(device_texture &mem)
/* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
const uint slot = mem.slot;
-if (slot >= texture_info_.size()) {
-texture_info_.resize(slot + 128);
+if (slot >= texture_info.size()) {
+texture_info.resize(slot + 128);
}
-texture_info_[slot] = mem.info;
-need_texture_info_ = true;
+texture_info[slot] = mem.info;
+need_texture_info = true;
-texture_info_[slot].data = (uint64_t)mem.device_pointer;
+texture_info[slot].data = (uint64_t)mem.device_pointer;
}
void OneapiDevice::tex_free(device_texture &mem)
@@ -628,6 +697,16 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
* and shared. For a new project it could be more beneficial to use USM shared memory, because it
* provides an automatic migration mechanism that allows using the same pointer on the host and
* on the device, without the need to worry about explicit memory transfer operations, although
* usage of USM shared implies some documented limitations on memory usage with regard to
* parallel access from different threads. But for Blender/Cycles this type of memory is not very
* suitable in the current application architecture, because Cycles is a multithreaded
* application and already uses two different pointers for host activity and device activity, and
* also has to perform all needed memory transfer operations. So, the USM device memory type has
* been used for the oneAPI device in order to better fit the Cycles architecture. */
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
return sycl::malloc_device(memory_size, *queue);
# else
@@ -646,9 +725,26 @@ void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, dest, true);
OneapiDevice::check_usm(queue_, src, true);
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
/* `unknown` here means this is not a USM allocation, which implies it is a
* generic C++ allocation, so plain memcpy can be used together with USM host memory. */
if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
(src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
{
memcpy(dest, src, num_bytes);
return true;
}
try {
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
# ifdef WITH_CYCLES_DEBUG
@@ -658,8 +754,6 @@ bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t n
mem_event.wait_and_throw();
return true;
# else
-sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
-sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
src_type == sycl::usm::alloc::device;
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
@@ -684,6 +778,12 @@ bool OneapiDevice::usm_memset(SyclQueue *queue_,
size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memset may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, usm_ptr, true);
try {
@@ -735,7 +835,7 @@ void OneapiDevice::set_global_memory(SyclQueue *queue_,
assert(memory_name);
assert(memory_device_pointer);
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
-OneapiDevice::check_usm(queue_, memory_device_pointer);
+OneapiDevice::check_usm(queue_, memory_device_pointer, true);
OneapiDevice::check_usm(queue_, kernel_globals, true);
std::string matched_name(memory_name);
@@ -874,11 +974,11 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
/* Compute-runtime (i.e. NEO) version is what gets returned by sycl/L0 on Windows
* since Windows driver 101.3268. */
-static const int lowest_supported_driver_version_win = 1015186;
+static const int lowest_supported_driver_version_win = 1015518;
# ifdef _WIN32
-/* For Windows driver 101.5186, compute-runtime version is 28044.
+/* For Windows driver 101.5518, compute-runtime version is 29283.
* This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
-static const int lowest_supported_driver_version_neo = 28044;
+static const int lowest_supported_driver_version_neo = 29283;
# else
static const int lowest_supported_driver_version_neo = 27642;
# endif
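As a standalone illustration of the USM pointer kinds that the NOTE in
usm_alloc_device and the memcpy shortcut above rely on (a sketch assuming an
Intel GPU with host, device and shared USM support):

#include <sycl/sycl.hpp>
#include <cassert>

int main()
{
  sycl::queue q{sycl::gpu_selector_v};
  void *d = sycl::malloc_device(64, q); /* Device-only; fastest for kernels. */
  void *h = sycl::malloc_host(64, q);   /* Host memory, device-accessible. */
  void *s = sycl::malloc_shared(64, q); /* Migrates between host and device. */
  assert(sycl::get_pointer_type(d, q.get_context()) == sycl::usm::alloc::device);
  assert(sycl::get_pointer_type(h, q.get_context()) == sycl::usm::alloc::host);
  assert(sycl::get_pointer_type(s, q.get_context()) == sycl::usm::alloc::shared);
  /* A plain C++ pointer reports usm::alloc::unknown, the case the
   * usm_memcpy shortcut handles with a plain memcpy. */
  int on_stack = 0;
  assert(sycl::get_pointer_type(&on_stack, q.get_context()) == sycl::usm::alloc::unknown);
  sycl::free(d, q);
  sycl::free(h, q);
  sycl::free(s, q);
  return 0;
}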

intern/cycles/device/oneapi/device_impl.h
@@ -21,17 +21,19 @@ typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
bool oidn_support,
void *user_ptr);
-class OneapiDevice : public Device {
+class OneapiDevice : public GPUDevice {
private:
SyclQueue *device_queue_;
# ifdef WITH_EMBREE_GPU
RTCDevice embree_device;
RTCScene embree_scene;
# if RTC_VERSION >= 40302
thread_mutex scene_data_mutex;
vector<RTCScene> all_embree_scenes;
# endif
# endif
using ConstMemMap = map<string, device_vector<uchar> *>;
ConstMemMap const_mem_map_;
-device_vector<TextureInfo> texture_info_;
-bool need_texture_info_;
void *kg_memory_;
void *kg_memory_device_;
size_t kg_memory_size_ = (size_t)0;
@@ -41,6 +43,8 @@ class OneapiDevice : public Device {
unsigned int kernel_features = 0;
int scene_max_shaders_ = 0;
size_t get_free_mem() const;
public:
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
@@ -54,13 +58,15 @@ class OneapiDevice : public Device {
bool load_kernels(const uint kernel_features) override;
-void load_texture_info();
+void reserve_private_memory(const uint kernel_features);
-void generic_alloc(device_memory &mem);
-void generic_copy_to(device_memory &mem);
-void generic_free(device_memory &mem);
+virtual void get_device_memory_info(size_t &total, size_t &free) override;
+virtual bool alloc_device(void *&device_pointer, size_t size) override;
+virtual void free_device(void *device_pointer) override;
+virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+virtual void free_host(void *shared_pointer) override;
+virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
string oneapi_error_message();

intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -133,6 +133,26 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
return is_computation_correct;
}
bool oneapi_zero_memory_on_device(SyclQueue *queue_, void *device_pointer, size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
try {
queue->submit([&](sycl::handler &cgh) {
cgh.parallel_for(num_bytes,
[=](sycl::id<1> idx) { ((char *)device_pointer)[idx.get(0)] = (char)0; });
});
queue->wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
}
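Zeroing through a kernel rather than through sycl::queue::memset serves a
double purpose in alloc_device above: it forces the lazily-backed USM device
allocation to actually be committed on the device, and it surfaces an
out-of-memory condition at allocation time, where the caller can still fall
back to host memory, instead of mid-render. This reading follows the "lazy
memory initialisation" comment in alloc_device; the exact behaviour is
driver-dependent.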
bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
const uint kernel_features)
{

intern/cycles/kernel/device/oneapi/kernel.h
@@ -44,6 +44,9 @@ extern "C" {
# endif
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_zero_memory_on_device(SyclQueue *queue_,
void *device_pointer,
size_t num_bytes);
CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,