Cycles: Add support for P2P memory distribution (e.g. via NVLink)

This change modifies the multi-device implementation to support memory distribution
across devices, reducing the overall memory footprint of large scenes and allowing
scenes that previously had to fall back to host memory to fit entirely into the
combined GPU memory.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D7426
Author: Patrick Mours
Date: 2020-06-08 17:16:10 +02:00
parent 0a907657d4
commit 9f7d84b656
11 changed files with 371 additions and 97 deletions
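
For background, the mechanism this patch builds on is CUDA peer-to-peer (P2P)
access: when two GPUs are linked (e.g. via NVLink), a kernel running on one
device can directly address memory that is resident on the other, so a buffer
shared by both only needs to exist once. Below is a minimal standalone sketch
of that mechanism using the CUDA driver API; it is not part of the patch, and
the device ordinals and missing error handling are illustrative assumptions.

/* Sketch: probe and enable bidirectional peer access between two GPUs. */
#include <cuda.h>
#include <cstdio>

int main()
{
  cuInit(0);

  CUdevice dev_a, dev_b;
  cuDeviceGet(&dev_a, 0); /* assumes at least two CUDA devices */
  cuDeviceGet(&dev_b, 1);

  int can_access = 0;
  cuDeviceCanAccessPeer(&can_access, dev_a, dev_b);
  if (!can_access) {
    printf("No P2P link between devices 0 and 1\n");
    return 0;
  }

  CUcontext ctx_a, ctx_b;
  cuCtxCreate(&ctx_a, 0, dev_a);
  cuCtxCreate(&ctx_b, 0, dev_b);

  /* Enable access in both directions, as CUDADevice::check_peer_access()
   * below does for Cycles devices. */
  cuCtxSetCurrent(ctx_a);
  cuCtxEnablePeerAccess(ctx_b, 0);
  cuCtxSetCurrent(ctx_b);
  cuCtxEnablePeerAccess(ctx_a, 0);

  /* Memory allocated in ctx_a is now directly addressable by kernels
   * launched in ctx_b, and vice versa. */
  return 0;
}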

@@ -1535,6 +1535,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
devices: bpy.props.CollectionProperty(type=CyclesDeviceSettings)
peer_memory: BoolProperty(
name="Distribute memory across devices",
description="Make more room for large scenes to fit by distributing memory across interconnected devices (e.g. via NVLink) rather than duplicating it",
default=False,
)
def find_existing_device_entry(self, device):
for device_entry in self.devices:
if device_entry.id == device[2] and device_entry.type == device[1]:
@@ -1632,14 +1638,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
row = layout.row()
row.prop(self, "compute_device_type", expand=True)
devices = self.get_devices_for_type(self.compute_device_type)
if self.compute_device_type == 'NONE':
return
row = layout.row()
if self.compute_device_type == 'CUDA':
self._draw_devices(row, 'CUDA', devices)
elif self.compute_device_type == 'OPTIX':
self._draw_devices(row, 'OPTIX', devices)
elif self.compute_device_type == 'OPENCL':
self._draw_devices(row, 'OPENCL', devices)
devices = self.get_devices_for_type(self.compute_device_type)
self._draw_devices(row, self.compute_device_type, devices)
import _cycles
has_peer_memory = 0
for device in _cycles.available_devices(self.compute_device_type):
if device[3] and self.find_existing_device_entry(device).use:
has_peer_memory += 1
if has_peer_memory > 1:
row = layout.row()
row.use_property_split = True
row.prop(self, "peer_memory")
def draw(self, context):
self.draw_impl(self.layout, context)

@@ -113,6 +113,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
device = Device::get_multi_device(used_devices, threads, background);
}
/* Else keep using the CPU device that was set before. */
if (!get_boolean(cpreferences, "peer_memory")) {
device.has_peer_memory = false;
}
}
}

@@ -416,10 +416,11 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
for (size_t i = 0; i < devices.size(); i++) {
DeviceInfo &device = devices[i];
string type_name = Device::string_from_type(device.type);
PyObject *device_tuple = PyTuple_New(3);
PyObject *device_tuple = PyTuple_New(4);
PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
PyTuple_SET_ITEM(device_tuple, 3, PyBool_FromLong(device.has_peer_memory));
PyTuple_SET_ITEM(ret, i, device_tuple);
}

@@ -51,6 +51,7 @@ class CUDADevice : public Device {
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;
@@ -111,6 +112,8 @@ class CUDADevice : public Device {
bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
bool check_peer_access(Device *peer_device);
bool use_adaptive_compilation();
bool use_split_kernel();

@@ -207,6 +207,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
functions.loaded = false;
@@ -224,6 +225,9 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
cuda_assert(
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
cuda_assert(cuDeviceGetAttribute(
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
if (can_map_host) {
ctx_flags |= CU_CTX_MAP_HOST;
@@ -286,6 +290,49 @@ bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_feat
return true;
}
bool CUDADevice::check_peer_access(Device *peer_device)
{
if (peer_device == this) {
return false;
}
if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
return false;
}
CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
int can_access = 0;
cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
if (can_access == 0) {
return false;
}
// Ensure array access over the link is possible as well (for 3D textures)
cuda_assert(cuDeviceGetP2PAttribute(&can_access,
CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
cuDevice,
peer_device_cuda->cuDevice));
if (can_access == 0) {
return false;
}
// Enable peer access in both directions
{
const CUDAContextScope scope(this);
if (cuda_error(cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0))) {
return false;
}
}
{
const CUDAContextScope scope(peer_device_cuda);
if (cuda_error(cuCtxEnablePeerAccess(cuContext, 0))) {
return false;
}
}
return true;
}
bool CUDADevice::use_adaptive_compilation()
{
return DebugFlags().cuda.adaptive_compile;
@@ -674,6 +721,12 @@ void CUDADevice::load_texture_info()
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
@@ -687,6 +740,12 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
device_memory &mem = *pair.first;
CUDAMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
@@ -696,11 +755,6 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
continue;
}
/* Already in host memory. */
if (cmem->use_mapped_host) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
@@ -723,26 +777,30 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
/* Preserve the original device pointer, in case of multi device
* we can't change it because the pointer mapping would break. */
device_ptr prev_pointer = max_mem->device_pointer;
size_t prev_size = max_mem->device_size;
any_device_moving_textures_to_host = true;
mem_copy_to(*max_mem);
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
max_mem->device_pointer = prev_pointer;
max_mem->device_size = prev_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
move_texture_to_host = false;
}
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
@@ -808,9 +866,6 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
map_host_used += size;
status = " in host memory";
}
else {
status = " failed, out of host memory";
}
}
if (mem_alloc_result != CUDA_SUCCESS) {
@@ -906,7 +961,7 @@ void CUDADevice::generic_free(device_memory &mem)
}
else {
/* Free device memory. */
cuMemFree(mem.device_pointer);
cuda_assert(cuMemFree(mem.device_pointer));
}
stats.mem_free(mem.device_size);
@@ -1032,18 +1087,17 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
void CUDADevice::global_alloc(device_memory &mem)
{
CUDAContextScope scope(this);
generic_alloc(mem);
generic_copy_to(mem);
if (mem.is_resident(this)) {
generic_alloc(mem);
generic_copy_to(mem);
}
const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}
void CUDADevice::global_free(device_memory &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
if (mem.is_resident(this) && mem.device_pointer) {
generic_free(mem);
}
}
@@ -1112,7 +1166,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (mem.data_depth > 1) {
if (!mem.is_resident(this)) {
cmem = &cuda_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (CUarray)mem.device_pointer;
cmem->array = array_3d;
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
}
}
else if (mem.data_depth > 1) {
/* 3D texture using array, there is no API for linear memory. */
CUDA_ARRAY3D_DESCRIPTOR desc;
@@ -1156,10 +1222,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
int alignment = 0;
cuda_assert(
cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
dst_pitch = align_up(src_pitch, alignment);
dst_pitch = align_up(src_pitch, pitch_alignment);
size_t dst_size = dst_pitch * mem.data_height;
cmem = generic_alloc(mem, dst_size - mem.memory_size());
@@ -1251,7 +1314,11 @@ void CUDADevice::tex_free(device_texture &mem)
cuTexObjectDestroy(cmem.texobject);
}
if (cmem.array) {
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(cmem.array);
stats.mem_free(mem.device_size);

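A note on the pitch_alignment caching above: when a 2D texture is resident on
a peer device, tex_alloc() still has to derive the same pitch-aligned row size
(dst_pitch) that the owning device used, which is why the attribute is now
queried once in the constructor instead of per allocation. Below is a minimal
sketch of the rounding involved; this align_up is a stand-in for the Cycles
utility of the same name, assuming a power-of-two alignment.

#include <cstddef>

/* Round size up to the next multiple of a power-of-two alignment. */
static size_t align_up(size_t size, size_t alignment)
{
  return (size + alignment - 1) & ~(alignment - 1);
}

/* E.g. a 1000-byte row with a 32-byte texture pitch alignment is stored with
 * a 1024-byte pitch, and every device binding the texture must use that same
 * pitch to address it correctly. */
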
@@ -602,6 +602,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
info.has_peer_memory = false;
foreach (const DeviceInfo &device, subdevices) {
/* Ensure CPU device does not slow down GPU. */
@@ -645,6 +646,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
}
return info;

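A side note on the aggregation above: most capability flags are AND-ed because
every subdevice must support the feature, while has_peer_memory is OR-ed
because a single P2P-capable pair is already enough to enable distribution. A
hypothetical reduction illustrating the rule; Caps and combine() are
illustrative stand-ins for DeviceInfo and Device::get_multi_device().

#include <vector>

struct Caps {
  bool has_osl;
  bool has_peer_memory;
};

static Caps combine(const std::vector<Caps> &subdevices)
{
  Caps info = {true, false};
  for (const Caps &device : subdevices) {
    info.has_osl &= device.has_osl;                 /* all must support it */
    info.has_peer_memory |= device.has_peer_memory; /* any pair suffices */
  }
  return info;
}
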
@@ -82,6 +82,7 @@ class DeviceInfo {
bool has_osl; /* Support Open Shading Language. */
bool use_split_kernel; /* Use split or mega kernel. */
bool has_profiling; /* Supports runtime collection of profiling info. */
bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
vector<DeviceInfo> denoising_devices;
@@ -99,6 +100,7 @@ class DeviceInfo {
has_osl = false;
use_split_kernel = false;
has_profiling = false;
has_peer_memory = false;
}
bool operator==(const DeviceInfo &info)
@@ -435,6 +437,17 @@ class Device {
{
}
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
/* Memory is always resident if this is not a multi device, regardless of whether the pointer
* is valid or not (since it may not have been allocated yet). */
return sub_device == this;
}
virtual bool check_peer_access(Device * /*peer_device*/)
{
return false;
}
/* static */
static Device *create(DeviceInfo &info,
Stats &stats,

@@ -131,6 +131,15 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_volume_decoupled = false;
info.has_adaptive_stop_per_sample = false;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
if (num != peer_num) {
int can_access = 0;
cuDeviceCanAccessPeer(&can_access, num, peer_num);
info.has_peer_memory = (can_access != 0);
}
}
int pci_location[3] = {0, 0, 0};
cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);

@@ -125,6 +125,11 @@ void device_memory::restore_device()
device_pointer = original_device_ptr;
}
bool device_memory::is_resident(Device *sub_device) const
{
return device->is_resident(device_pointer, sub_device);
}
/* Device Sub Ptr */
device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)

@@ -230,6 +230,8 @@ class device_memory {
void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
void restore_device();
bool is_resident(Device *sub_device) const;
protected:
friend class CUDADevice;
friend class OptiXDevice;

@@ -34,37 +34,66 @@ CCL_NAMESPACE_BEGIN
class MultiDevice : public Device {
public:
struct SubDevice {
explicit SubDevice(Device *device_) : device(device_)
{
}
Stats stats;
Device *device;
map<device_ptr, device_ptr> ptr_map;
int peer_island_index = -1;
};
list<SubDevice> devices, denoising_devices;
device_ptr unique_key;
vector<vector<SubDevice *>> peer_islands;
MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
: Device(info, stats, profiler, background_), unique_key(1)
{
foreach (DeviceInfo &subinfo, info.multi_devices) {
Device *device = Device::create(subinfo, sub_stats_, profiler, background);
/* Always add CPU devices at the back since GPU devices can change
* host memory pointers, which CPU uses as device pointer. */
SubDevice *sub;
if (subinfo.type == DEVICE_CPU) {
devices.push_back(SubDevice(device));
devices.emplace_back();
sub = &devices.back();
}
else {
devices.push_front(SubDevice(device));
devices.emplace_front();
sub = &devices.front();
}
/* The pointer to 'sub->stats' will stay valid even after new devices
* are added, since 'devices' is a linked list. */
sub->device = Device::create(subinfo, sub->stats, profiler, background);
}
foreach (DeviceInfo &subinfo, info.denoising_devices) {
Device *device = Device::create(subinfo, sub_stats_, profiler, background);
denoising_devices.emplace_back();
SubDevice *sub = &denoising_devices.back();
denoising_devices.push_back(SubDevice(device));
sub->device = Device::create(subinfo, sub->stats, profiler, background);
}
/* Build a list of peer islands for the available render devices */
foreach (SubDevice &sub, devices) {
/* First ensure that every device is in at least one peer island */
if (sub.peer_island_index < 0) {
peer_islands.emplace_back();
sub.peer_island_index = (int)peer_islands.size() - 1;
peer_islands[sub.peer_island_index].push_back(&sub);
}
if (!info.has_peer_memory) {
continue;
}
/* Second, check peer access between devices and fill up the islands accordingly */
foreach (SubDevice &peer_sub, devices) {
if (peer_sub.peer_island_index < 0 &&
peer_sub.device->info.type == sub.device->info.type &&
peer_sub.device->check_peer_access(sub.device)) {
peer_sub.peer_island_index = sub.peer_island_index;
peer_islands[sub.peer_island_index].push_back(&peer_sub);
}
}
}
#ifdef WITH_NETWORK
@@ -175,11 +204,11 @@ class MultiDevice : public Device {
bool build_optix_bvh(BVH *bvh)
{
// Broadcast acceleration structure build to all render devices
foreach (SubDevice &sub, devices)
/* Broadcast acceleration structure build to all render devices */
foreach (SubDevice &sub, devices) {
if (!sub.device->build_optix_bvh(bvh))
return false;
}
return true;
}
@@ -191,17 +220,82 @@ class MultiDevice : public Device {
return devices.front().device->osl_memory();
}
bool is_resident(device_ptr key, Device *sub_device) override
{
foreach (SubDevice &sub, devices) {
if (sub.device == sub_device) {
return find_matching_mem_device(key, sub)->device == sub_device;
}
}
return false;
}
SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
{
assert(sub.peer_island_index >= 0 && key != 0);
/* Get the memory owner of this key (first try current device, then peer devices) */
SubDevice *owner_sub = &sub;
if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
if (island_sub != owner_sub &&
island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
owner_sub = island_sub;
}
}
}
return owner_sub;
}
SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
{
assert(!island.empty());
/* Get the memory owner of this key or the device with the lowest memory usage when new */
SubDevice *owner_sub = island.front();
foreach (SubDevice *island_sub, island) {
if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
(island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
owner_sub = island_sub;
}
}
return owner_sub;
}
inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
{
return find_matching_mem_device(key, sub)->ptr_map[key];
}
void mem_alloc(device_memory &mem)
{
device_ptr key = unique_key++;
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = 0;
mem.device_size = 0;
if (mem.type == MEM_PIXELS) {
/* Always allocate pixels memory on all devices
* This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = 0;
mem.device_size = 0;
sub.device->mem_alloc(mem);
sub.ptr_map[key] = mem.device_pointer;
}
}
else {
assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
mem.type == MEM_DEVICE_ONLY);
/* The remaining memory types can be distributed across devices */
foreach (const vector<SubDevice *> &island, peer_islands) {
SubDevice *owner_sub = find_suitable_mem_device(key, island);
mem.device = owner_sub->device;
mem.device_pointer = 0;
mem.device_size = 0;
owner_sub->device->mem_alloc(mem);
owner_sub->ptr_map[key] = mem.device_pointer;
}
}
mem.device = this;
@@ -215,13 +309,36 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
mem.device_size = existing_size;
/* The tile buffers are allocated on each device (see below), so copy to all of them */
if (strcmp(mem.name, "RenderBuffers") == 0) {
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
mem.device_size = existing_size;
sub.device->mem_copy_to(mem);
sub.ptr_map[key] = mem.device_pointer;
}
}
else {
foreach (const vector<SubDevice *> &island, peer_islands) {
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
mem.device = owner_sub->device;
mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
mem.device_size = existing_size;
owner_sub->device->mem_copy_to(mem);
owner_sub->ptr_map[key] = mem.device_pointer;
if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
/* Need to create texture objects and update pointer in kernel globals on all devices */
foreach (SubDevice *island_sub, island) {
if (island_sub != owner_sub) {
island_sub->device->mem_copy_to(mem);
}
}
}
}
}
mem.device = this;
@@ -238,10 +355,11 @@ class MultiDevice : public Device {
int sy = y + i * sub_h;
int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
mem.device = sub.device;
mem.device_pointer = sub.ptr_map[key];
SubDevice *owner_sub = find_matching_mem_device(key, sub);
mem.device = owner_sub->device;
mem.device_pointer = owner_sub->ptr_map[key];
sub.device->mem_copy_from(mem, sy, w, sh, elem);
owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
i++;
}
@@ -255,16 +373,18 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
mem.device_size = existing_size;
sub.device->mem_zero(mem);
sub.ptr_map[key] = mem.device_pointer;
}
/* This is a hack to only allocate the tile buffers on denoising devices
* Similarly the tile buffers also need to be allocated separately on all devices so any
* overlap rendered for denoising does not interfere with each other */
if (strcmp(mem.name, "RenderBuffers") == 0) {
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
mem.device_size = existing_size;
sub.device->mem_zero(mem);
sub.ptr_map[key] = mem.device_pointer;
}
foreach (SubDevice &sub, denoising_devices) {
mem.device = sub.device;
mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
@@ -274,6 +394,17 @@ class MultiDevice : public Device {
sub.ptr_map[key] = mem.device_pointer;
}
}
else {
foreach (const vector<SubDevice *> &island, peer_islands) {
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
mem.device = owner_sub->device;
mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
mem.device_size = existing_size;
owner_sub->device->mem_zero(mem);
owner_sub->ptr_map[key] = mem.device_pointer;
}
}
mem.device = this;
mem.device_pointer = key;
@@ -285,16 +416,16 @@ class MultiDevice : public Device {
device_ptr key = mem.device_pointer;
size_t existing_size = mem.device_size;
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = sub.ptr_map[key];
mem.device_size = existing_size;
/* Free memory that was allocated for all devices (see above) on each device */
if (strcmp(mem.name, "RenderBuffers") == 0 || mem.type == MEM_PIXELS) {
foreach (SubDevice &sub, devices) {
mem.device = sub.device;
mem.device_pointer = sub.ptr_map[key];
mem.device_size = existing_size;
sub.device->mem_free(mem);
sub.ptr_map.erase(sub.ptr_map.find(key));
}
if (strcmp(mem.name, "RenderBuffers") == 0) {
sub.device->mem_free(mem);
sub.ptr_map.erase(sub.ptr_map.find(key));
}
foreach (SubDevice &sub, denoising_devices) {
mem.device = sub.device;
mem.device_pointer = sub.ptr_map[key];
@@ -304,6 +435,26 @@ class MultiDevice : public Device {
sub.ptr_map.erase(sub.ptr_map.find(key));
}
}
else {
foreach (const vector<SubDevice *> &island, peer_islands) {
SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
mem.device = owner_sub->device;
mem.device_pointer = owner_sub->ptr_map[key];
mem.device_size = existing_size;
owner_sub->device->mem_free(mem);
owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
if (mem.type == MEM_TEXTURE) {
/* Free texture objects on all devices */
foreach (SubDevice *island_sub, island) {
if (island_sub != owner_sub) {
island_sub->device->mem_free(mem);
}
}
}
}
}
mem.device = this;
mem.device_pointer = 0;
@@ -330,6 +481,8 @@ class MultiDevice : public Device {
bool transparent,
const DeviceDrawParams &draw_params)
{
assert(rgba.type == MEM_PIXELS);
device_ptr key = rgba.device_pointer;
int i = 0, sub_h = h / devices.size();
int sub_height = height / devices.size();
@@ -358,7 +511,7 @@ class MultiDevice : public Device {
foreach (SubDevice &sub, devices) {
if (sub.device == sub_device) {
tile.buffer = sub.ptr_map[tile.buffer];
tile.buffer = find_matching_mem(tile.buffer, sub);
return;
}
}
@@ -517,16 +670,21 @@ class MultiDevice : public Device {
DeviceTask subtask = tasks.front();
tasks.pop_front();
if (task.buffer)
if (task.type == DeviceTask::DENOISE_BUFFER && !denoising_devices.empty()) {
subtask.buffer = sub.ptr_map[task.buffer];
if (task.rgba_byte)
subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
if (task.rgba_half)
subtask.rgba_half = sub.ptr_map[task.rgba_half];
if (task.shader_input)
subtask.shader_input = sub.ptr_map[task.shader_input];
if (task.shader_output)
subtask.shader_output = sub.ptr_map[task.shader_output];
}
else {
if (task.buffer)
subtask.buffer = find_matching_mem(task.buffer, sub);
if (task.rgba_byte)
subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
if (task.rgba_half)
subtask.rgba_half = sub.ptr_map[task.rgba_half];
if (task.shader_input)
subtask.shader_input = find_matching_mem(task.shader_input, sub);
if (task.shader_output)
subtask.shader_output = find_matching_mem(task.shader_output, sub);
}
sub.device->task_add(subtask);
}
@@ -548,9 +706,6 @@ class MultiDevice : public Device {
foreach (SubDevice &sub, denoising_devices)
sub.device->task_cancel();
}
protected:
Stats sub_stats_;
};
Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
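
As a closing illustration of the peer-island pass in the MultiDevice
constructor above: every device ends up in exactly one island, and a device
that is still unassigned joins an existing island when peer access to one of
its members succeeds. Below is a standalone sketch of that grouping logic;
Dev, build_islands() and the can_access predicate are hypothetical stand-ins
for SubDevice, the constructor loop and check_peer_access(), and the
device-type and has_peer_memory checks of the real code are omitted for
brevity.

#include <functional>
#include <vector>

struct Dev {
  int id;
  int island = -1; /* mirrors SubDevice::peer_island_index */
};

static std::vector<std::vector<Dev *>> build_islands(
    std::vector<Dev> &devs, const std::function<bool(Dev &, Dev &)> &can_access)
{
  std::vector<std::vector<Dev *>> islands;
  for (Dev &d : devs) {
    /* First ensure that every device is in at least one island. */
    if (d.island < 0) {
      d.island = (int)islands.size();
      islands.push_back({&d});
    }
    /* Then let unassigned devices that can access this one join its island. */
    for (Dev &peer : devs) {
      if (peer.island < 0 && can_access(peer, d)) {
        peer.island = d.island;
        islands[d.island].push_back(&peer);
      }
    }
  }
  return islands;
}

/* Two NVLink-ed GPUs plus a CPU would yield the islands {GPU0, GPU1} and
 * {CPU}, so distributable buffers are stored once per island rather than
 * once per device. */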