Cycles: add HIP device support for AMD GPUs

NOTE: this feature is not ready for user testing, and not yet enabled in daily
builds. It is being merged now for easier collaboration on development.

HIP is a heterogeneous compute interface that allows C++ code to be executed on
GPUs, similar to CUDA. It is intended to bring back AMD GPU rendering support
on Windows and Linux.

https://github.com/ROCm-Developer-Tools/HIP

At the time of writing, the backend compiles and runs on Linux with existing
HIP compilers and driver runtimes. Publicly available compilers and drivers
for Windows will come later.

See task T91571 for more details on the current status and work remaining
to be done.

Credits:

Sayak Biswas (AMD)
Arya Rafii (AMD)
Brian Savery (AMD)

Differential Revision: https://developer.blender.org/D12578
Commit 044a77352f (parent 262b211856)
Authored by Brian Savery on 2021-09-28 16:51:14 +02:00; committed by Brecht Van Lommel.
45 changed files with 4854 additions and 19 deletions.

@ -419,6 +419,8 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF)
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
option(WITH_CUDA_DYNLOAD "Dynamically load CUDA libraries at runtime" ON)
@ -821,6 +823,11 @@ if(NOT WITH_CUDA_DYNLOAD)
endif()
endif()
if(WITH_CYCLES_DEVICE_HIP)
# Currently HIP must be dynamically loaded; this may change in future toolkits.
set(WITH_HIP_DYNLOAD ON)
endif()
#-----------------------------------------------------------------------------
# Check if submodules are cloned
@ -1850,6 +1857,9 @@ elseif(WITH_CYCLES_STANDALONE)
if(WITH_CUDA_DYNLOAD)
add_subdirectory(extern/cuew)
endif()
if(WITH_HIP_DYNLOAD)
add_subdirectory(extern/hipew)
endif()
if(NOT WITH_SYSTEM_GLEW)
add_subdirectory(extern/glew)
endif()

@ -70,6 +70,9 @@ if(WITH_CYCLES OR WITH_COMPOSITOR OR WITH_OPENSUBDIV)
if(WITH_CUDA_DYNLOAD)
add_subdirectory(cuew)
endif()
if(WITH_HIP_DYNLOAD)
add_subdirectory(hipew)
endif()
endif()
if(WITH_GHOST_X11 AND WITH_GHOST_XDND)

extern/hipew/CMakeLists.txt (new file, 39 lines)

@ -0,0 +1,39 @@
# ***** BEGIN GPL LICENSE BLOCK *****
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# The Original Code is Copyright (C) 2021, Blender Foundation
# All rights reserved.
# ***** END GPL LICENSE BLOCK *****
set(INC
.
include
)
set(INC_SYS
)
set(SRC
src/hipew.c
include/hipew.h
)
set(LIB
)
blender_add_lib(extern_hipew "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")

extern/hipew/include/hipew.h (new file, 1207 lines)

File diff suppressed because it is too large.

extern/hipew/src/hipew.c (new file, 533 lines)

@ -0,0 +1,533 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef _MSC_VER
# if _MSC_VER < 1900
# define snprintf _snprintf
# endif
# define popen _popen
# define pclose _pclose
# define _CRT_SECURE_NO_WARNINGS
#endif
#include <hipew.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# define VC_EXTRALEAN
# include <windows.h>
/* Utility macros. */
typedef HMODULE DynamicLibrary;
# define dynamic_library_open(path) LoadLibraryA(path)
# define dynamic_library_close(lib) FreeLibrary(lib)
# define dynamic_library_find(lib, symbol) GetProcAddress(lib, symbol)
#else
# include <dlfcn.h>
typedef void* DynamicLibrary;
# define dynamic_library_open(path) dlopen(path, RTLD_NOW)
# define dynamic_library_close(lib) dlclose(lib)
# define dynamic_library_find(lib, symbol) dlsym(lib, symbol)
#endif
#define _LIBRARY_FIND_CHECKED(lib, name) \
name = (t##name *)dynamic_library_find(lib, #name); \
assert(name);
#define _LIBRARY_FIND(lib, name) \
name = (t##name *)dynamic_library_find(lib, #name);
#define HIP_LIBRARY_FIND_CHECKED(name) \
_LIBRARY_FIND_CHECKED(hip_lib, name)
#define HIP_LIBRARY_FIND(name) _LIBRARY_FIND(hip_lib, name)
static DynamicLibrary hip_lib;
/* Function definitions. */
thipGetErrorName *hipGetErrorName;
thipInit *hipInit;
thipDriverGetVersion *hipDriverGetVersion;
thipGetDevice *hipGetDevice;
thipGetDeviceCount *hipGetDeviceCount;
thipDeviceGetName *hipDeviceGetName;
thipDeviceGetAttribute *hipDeviceGetAttribute;
thipDeviceComputeCapability *hipDeviceComputeCapability;
thipDevicePrimaryCtxRetain *hipDevicePrimaryCtxRetain;
thipDevicePrimaryCtxRelease *hipDevicePrimaryCtxRelease;
thipDevicePrimaryCtxSetFlags *hipDevicePrimaryCtxSetFlags;
thipDevicePrimaryCtxGetState *hipDevicePrimaryCtxGetState;
thipDevicePrimaryCtxReset *hipDevicePrimaryCtxReset;
thipCtxCreate *hipCtxCreate;
thipCtxDestroy *hipCtxDestroy;
thipCtxPushCurrent *hipCtxPushCurrent;
thipCtxPopCurrent *hipCtxPopCurrent;
thipCtxSetCurrent *hipCtxSetCurrent;
thipCtxGetCurrent *hipCtxGetCurrent;
thipCtxGetDevice *hipCtxGetDevice;
thipCtxGetFlags *hipCtxGetFlags;
thipCtxSynchronize *hipCtxSynchronize;
thipDeviceSynchronize *hipDeviceSynchronize;
thipCtxGetCacheConfig *hipCtxGetCacheConfig;
thipCtxSetCacheConfig *hipCtxSetCacheConfig;
thipCtxGetSharedMemConfig *hipCtxGetSharedMemConfig;
thipCtxSetSharedMemConfig *hipCtxSetSharedMemConfig;
thipCtxGetApiVersion *hipCtxGetApiVersion;
thipModuleLoad *hipModuleLoad;
thipModuleLoadData *hipModuleLoadData;
thipModuleLoadDataEx *hipModuleLoadDataEx;
thipModuleUnload *hipModuleUnload;
thipModuleGetFunction *hipModuleGetFunction;
thipModuleGetGlobal *hipModuleGetGlobal;
thipModuleGetTexRef *hipModuleGetTexRef;
thipMemGetInfo *hipMemGetInfo;
thipMalloc *hipMalloc;
thipMemAllocPitch *hipMemAllocPitch;
thipFree *hipFree;
thipMemGetAddressRange *hipMemGetAddressRange;
thipHostMalloc *hipHostMalloc;
thipHostFree *hipHostFree;
thipHostGetDevicePointer *hipHostGetDevicePointer;
thipHostGetFlags *hipHostGetFlags;
thipMallocManaged *hipMallocManaged;
thipDeviceGetByPCIBusId *hipDeviceGetByPCIBusId;
thipDeviceGetPCIBusId *hipDeviceGetPCIBusId;
thipMemcpyPeer *hipMemcpyPeer;
thipMemcpyHtoD *hipMemcpyHtoD;
thipMemcpyDtoH *hipMemcpyDtoH;
thipMemcpyDtoD *hipMemcpyDtoD;
thipDrvMemcpy2DUnaligned *hipDrvMemcpy2DUnaligned;
thipMemcpyParam2D *hipMemcpyParam2D;
thipDrvMemcpy3D *hipDrvMemcpy3D;
thipMemcpyHtoDAsync *hipMemcpyHtoDAsync;
thipMemcpyDtoHAsync *hipMemcpyDtoHAsync;
thipMemcpyParam2DAsync *hipMemcpyParam2DAsync;
thipDrvMemcpy3DAsync *hipDrvMemcpy3DAsync;
thipMemsetD8 *hipMemsetD8;
thipMemsetD16 *hipMemsetD16;
thipMemsetD32 *hipMemsetD32;
thipMemsetD8Async *hipMemsetD8Async;
thipMemsetD16Async *hipMemsetD16Async;
thipMemsetD32Async *hipMemsetD32Async;
thipArrayCreate *hipArrayCreate;
thipArrayDestroy *hipArrayDestroy;
thipArray3DCreate *hipArray3DCreate;
thipStreamCreateWithFlags *hipStreamCreateWithFlags;
thipStreamCreateWithPriority *hipStreamCreateWithPriority;
thipStreamGetPriority *hipStreamGetPriority;
thipStreamGetFlags *hipStreamGetFlags;
thipStreamWaitEvent *hipStreamWaitEvent;
thipStreamAddCallback *hipStreamAddCallback;
thipStreamQuery *hipStreamQuery;
thipStreamSynchronize *hipStreamSynchronize;
thipStreamDestroy *hipStreamDestroy;
thipEventCreateWithFlags *hipEventCreateWithFlags;
thipEventRecord *hipEventRecord;
thipEventQuery *hipEventQuery;
thipEventSynchronize *hipEventSynchronize;
thipEventDestroy *hipEventDestroy;
thipEventElapsedTime *hipEventElapsedTime;
thipFuncGetAttribute *hipFuncGetAttribute;
thipFuncSetCacheConfig *hipFuncSetCacheConfig;
thipModuleLaunchKernel *hipModuleLaunchKernel;
thipDrvOccupancyMaxActiveBlocksPerMultiprocessor *hipDrvOccupancyMaxActiveBlocksPerMultiprocessor;
thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
thipModuleOccupancyMaxPotentialBlockSize *hipModuleOccupancyMaxPotentialBlockSize;
thipTexRefSetArray *hipTexRefSetArray;
thipTexRefSetAddress *hipTexRefSetAddress;
thipTexRefSetAddress2D *hipTexRefSetAddress2D;
thipTexRefSetFormat *hipTexRefSetFormat;
thipTexRefSetAddressMode *hipTexRefSetAddressMode;
thipTexRefSetFilterMode *hipTexRefSetFilterMode;
thipTexRefSetFlags *hipTexRefSetFlags;
thipTexRefGetAddress *hipTexRefGetAddress;
thipTexRefGetArray *hipTexRefGetArray;
thipTexRefGetAddressMode *hipTexRefGetAddressMode;
thipTexObjectCreate *hipTexObjectCreate;
thipTexObjectDestroy *hipTexObjectDestroy;
thipDeviceCanAccessPeer *hipDeviceCanAccessPeer;
thipCtxEnablePeerAccess *hipCtxEnablePeerAccess;
thipCtxDisablePeerAccess *hipCtxDisablePeerAccess;
thipDeviceGetP2PAttribute *hipDeviceGetP2PAttribute;
thipGraphicsUnregisterResource *hipGraphicsUnregisterResource;
thipGraphicsMapResources *hipGraphicsMapResources;
thipGraphicsUnmapResources *hipGraphicsUnmapResources;
thipGraphicsResourceGetMappedPointer *hipGraphicsResourceGetMappedPointer;
thipGraphicsGLRegisterBuffer *hipGraphicsGLRegisterBuffer;
thipGLGetDevices *hipGLGetDevices;
static DynamicLibrary dynamic_library_open_find(const char **paths) {
int i = 0;
while (paths[i] != NULL) {
DynamicLibrary lib = dynamic_library_open(paths[i]);
if (lib != NULL) {
return lib;
}
++i;
}
return NULL;
}
/* Implementation function. */
static void hipewHipExit(void) {
if (hip_lib != NULL) {
/* Ignore errors. */
dynamic_library_close(hip_lib);
hip_lib = NULL;
}
}
static int hipewHipInit(void) {
/* Library paths. */
#ifdef _WIN32
/* Expected in c:/windows/system or similar, no path needed. */
const char *hip_paths[] = {"amdhip64.dll", NULL};
#elif defined(__APPLE__)
/* Default installation path. */
const char *hip_paths[] = {"", NULL};
#else
const char *hip_paths[] = {"/opt/rocm/hip/lib/libamdhip64.so", NULL};
#endif
static int initialized = 0;
static int result = 0;
int error, driver_version;
if (initialized) {
return result;
}
initialized = 1;
error = atexit(hipewHipExit);
if (error) {
result = HIPEW_ERROR_ATEXIT_FAILED;
return result;
}
/* Load library. */
hip_lib = dynamic_library_open_find(hip_paths);
if (hip_lib == NULL) {
result = HIPEW_ERROR_OPEN_FAILED;
return result;
}
/* Fetch all function pointers. */
HIP_LIBRARY_FIND_CHECKED(hipGetErrorName);
HIP_LIBRARY_FIND_CHECKED(hipInit);
HIP_LIBRARY_FIND_CHECKED(hipDriverGetVersion);
HIP_LIBRARY_FIND_CHECKED(hipGetDevice);
HIP_LIBRARY_FIND_CHECKED(hipGetDeviceCount);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetName);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetAttribute);
HIP_LIBRARY_FIND_CHECKED(hipDeviceComputeCapability);
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRetain);
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRelease);
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxSetFlags);
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxGetState);
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxReset);
HIP_LIBRARY_FIND_CHECKED(hipCtxCreate);
HIP_LIBRARY_FIND_CHECKED(hipCtxDestroy);
HIP_LIBRARY_FIND_CHECKED(hipCtxPushCurrent);
HIP_LIBRARY_FIND_CHECKED(hipCtxPopCurrent);
HIP_LIBRARY_FIND_CHECKED(hipCtxSetCurrent);
HIP_LIBRARY_FIND_CHECKED(hipCtxGetCurrent);
HIP_LIBRARY_FIND_CHECKED(hipCtxGetDevice);
HIP_LIBRARY_FIND_CHECKED(hipCtxGetFlags);
HIP_LIBRARY_FIND_CHECKED(hipCtxSynchronize);
HIP_LIBRARY_FIND_CHECKED(hipDeviceSynchronize);
HIP_LIBRARY_FIND_CHECKED(hipCtxGetCacheConfig);
HIP_LIBRARY_FIND_CHECKED(hipCtxSetCacheConfig);
HIP_LIBRARY_FIND_CHECKED(hipCtxGetSharedMemConfig);
HIP_LIBRARY_FIND_CHECKED(hipCtxSetSharedMemConfig);
HIP_LIBRARY_FIND_CHECKED(hipCtxGetApiVersion);
HIP_LIBRARY_FIND_CHECKED(hipModuleLoad);
HIP_LIBRARY_FIND_CHECKED(hipModuleLoadData);
HIP_LIBRARY_FIND_CHECKED(hipModuleLoadDataEx);
HIP_LIBRARY_FIND_CHECKED(hipModuleUnload);
HIP_LIBRARY_FIND_CHECKED(hipModuleGetFunction);
HIP_LIBRARY_FIND_CHECKED(hipModuleGetGlobal);
HIP_LIBRARY_FIND_CHECKED(hipModuleGetTexRef);
HIP_LIBRARY_FIND_CHECKED(hipMemGetInfo);
HIP_LIBRARY_FIND_CHECKED(hipMalloc);
HIP_LIBRARY_FIND_CHECKED(hipMemAllocPitch);
HIP_LIBRARY_FIND_CHECKED(hipFree);
HIP_LIBRARY_FIND_CHECKED(hipMemGetAddressRange);
HIP_LIBRARY_FIND_CHECKED(hipHostMalloc);
HIP_LIBRARY_FIND_CHECKED(hipHostFree);
HIP_LIBRARY_FIND_CHECKED(hipHostGetDevicePointer);
HIP_LIBRARY_FIND_CHECKED(hipHostGetFlags);
HIP_LIBRARY_FIND_CHECKED(hipMallocManaged);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetByPCIBusId);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetPCIBusId);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyPeer);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoD);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoH);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoD);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2D);
HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3D);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoDAsync);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoHAsync);
HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy2DUnaligned);
HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2DAsync);
HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3DAsync);
HIP_LIBRARY_FIND_CHECKED(hipMemsetD8);
HIP_LIBRARY_FIND_CHECKED(hipMemsetD16);
HIP_LIBRARY_FIND_CHECKED(hipMemsetD32);
HIP_LIBRARY_FIND_CHECKED(hipMemsetD8Async);
HIP_LIBRARY_FIND_CHECKED(hipMemsetD16Async);
HIP_LIBRARY_FIND_CHECKED(hipMemsetD32Async);
HIP_LIBRARY_FIND_CHECKED(hipArrayCreate);
HIP_LIBRARY_FIND_CHECKED(hipArrayDestroy);
HIP_LIBRARY_FIND_CHECKED(hipArray3DCreate);
HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithFlags);
HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithPriority);
HIP_LIBRARY_FIND_CHECKED(hipStreamGetPriority);
HIP_LIBRARY_FIND_CHECKED(hipStreamGetFlags);
HIP_LIBRARY_FIND_CHECKED(hipStreamWaitEvent);
HIP_LIBRARY_FIND_CHECKED(hipStreamAddCallback);
HIP_LIBRARY_FIND_CHECKED(hipStreamQuery);
HIP_LIBRARY_FIND_CHECKED(hipStreamSynchronize);
HIP_LIBRARY_FIND_CHECKED(hipStreamDestroy);
HIP_LIBRARY_FIND_CHECKED(hipEventCreateWithFlags);
HIP_LIBRARY_FIND_CHECKED(hipEventRecord);
HIP_LIBRARY_FIND_CHECKED(hipEventQuery);
HIP_LIBRARY_FIND_CHECKED(hipEventSynchronize);
HIP_LIBRARY_FIND_CHECKED(hipEventDestroy);
HIP_LIBRARY_FIND_CHECKED(hipEventElapsedTime);
HIP_LIBRARY_FIND_CHECKED(hipFuncGetAttribute);
HIP_LIBRARY_FIND_CHECKED(hipFuncSetCacheConfig);
HIP_LIBRARY_FIND_CHECKED(hipModuleLaunchKernel);
HIP_LIBRARY_FIND_CHECKED(hipModuleOccupancyMaxPotentialBlockSize);
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetArray);
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress);
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress2D);
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFormat);
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddressMode);
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFilterMode);
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFlags);
HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddress);
HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddressMode);
HIP_LIBRARY_FIND_CHECKED(hipTexObjectCreate);
HIP_LIBRARY_FIND_CHECKED(hipTexObjectDestroy);
HIP_LIBRARY_FIND_CHECKED(hipDeviceCanAccessPeer);
HIP_LIBRARY_FIND_CHECKED(hipCtxEnablePeerAccess);
HIP_LIBRARY_FIND_CHECKED(hipCtxDisablePeerAccess);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetP2PAttribute);
#ifdef _WIN32
HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnregisterResource);
HIP_LIBRARY_FIND_CHECKED(hipGraphicsMapResources);
HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnmapResources);
HIP_LIBRARY_FIND_CHECKED(hipGraphicsResourceGetMappedPointer);
HIP_LIBRARY_FIND_CHECKED(hipGraphicsGLRegisterBuffer);
HIP_LIBRARY_FIND_CHECKED(hipGLGetDevices);
#endif
result = HIPEW_SUCCESS;
return result;
}
int hipewInit(hipuint32_t flags) {
int result = HIPEW_SUCCESS;
if (flags & HIPEW_INIT_HIP) {
result = hipewHipInit();
if (result != HIPEW_SUCCESS) {
return result;
}
}
return result;
}
const char *hipewErrorString(hipError_t result) {
switch (result) {
case hipSuccess: return "No errors";
case hipErrorInvalidValue: return "Invalid value";
case hipErrorOutOfMemory: return "Out of memory";
case hipErrorNotInitialized: return "Driver not initialized";
case hipErrorDeinitialized: return "Driver deinitialized";
case hipErrorProfilerDisabled: return "Profiler disabled";
case hipErrorProfilerNotInitialized: return "Profiler not initialized";
case hipErrorProfilerAlreadyStarted: return "Profiler already started";
case hipErrorProfilerAlreadyStopped: return "Profiler already stopped";
case hipErrorNoDevice: return "No HIP-capable device available";
case hipErrorInvalidDevice: return "Invalid device";
case hipErrorInvalidImage: return "Invalid kernel image";
case hipErrorInvalidContext: return "Invalid context";
case hipErrorContextAlreadyCurrent: return "Context already current";
case hipErrorMapFailed: return "Map failed";
case hipErrorUnmapFailed: return "Unmap failed";
case hipErrorArrayIsMapped: return "Array is mapped";
case hipErrorAlreadyMapped: return "Already mapped";
case hipErrorNoBinaryForGpu: return "No binary for GPU";
case hipErrorAlreadyAcquired: return "Already acquired";
case hipErrorNotMapped: return "Not mapped";
case hipErrorNotMappedAsArray: return "Mapped resource not available for access as an array";
case hipErrorNotMappedAsPointer: return "Mapped resource not available for access as a pointer";
case hipErrorECCNotCorrectable: return "Uncorrectable ECC error detected";
case hipErrorUnsupportedLimit: return "hipLimit_t not supported by device";
case hipErrorContextAlreadyInUse: return "Context already in use";
case hipErrorPeerAccessUnsupported: return "Peer access unsupported";
case hipErrorInvalidKernelFile: return "Invalid ptx";
case hipErrorInvalidGraphicsContext: return "Invalid graphics context";
case hipErrorInvalidSource: return "Invalid source";
case hipErrorFileNotFound: return "File not found";
case hipErrorSharedObjectSymbolNotFound: return "Link to a shared object failed to resolve";
case hipErrorSharedObjectInitFailed: return "Shared object initialization failed";
case hipErrorOperatingSystem: return "Operating system";
case hipErrorInvalidHandle: return "Invalid handle";
case hipErrorNotFound: return "Not found";
case hipErrorNotReady: return "HIP not ready";
case hipErrorIllegalAddress: return "Illegal address";
case hipErrorLaunchOutOfResources: return "Launch exceeded resources";
case hipErrorLaunchTimeOut: return "Launch exceeded timeout";
case hipErrorPeerAccessAlreadyEnabled: return "Peer access already enabled";
case hipErrorPeerAccessNotEnabled: return "Peer access not enabled";
case hipErrorSetOnActiveProcess: return "Primary context active";
case hipErrorAssert: return "Assert";
case hipErrorHostMemoryAlreadyRegistered: return "Host memory already registered";
case hipErrorHostMemoryNotRegistered: return "Host memory not registered";
case hipErrorLaunchFailure: return "Launch failed";
case hipErrorCooperativeLaunchTooLarge: return "Cooperative launch too large";
case hipErrorNotSupported: return "Not supported";
case hipErrorUnknown: return "Unknown error";
default: return "Unknown HIP error value";
}
}
static void path_join(const char *path1,
const char *path2,
int maxlen,
char *result) {
#if defined(WIN32) || defined(_WIN32)
const char separator = '\\';
#else
const char separator = '/';
#endif
int n = snprintf(result, maxlen, "%s%c%s", path1, separator, path2);
if (n != -1 && n < maxlen) {
result[n] = '\0';
}
else {
result[maxlen - 1] = '\0';
}
}
static int path_exists(const char *path) {
struct stat st;
if (stat(path, &st)) {
return 0;
}
return 1;
}
const char *hipewCompilerPath(void) {
#ifdef _WIN32
const char *hipPath = getenv("HIP_ROCCLR_HOME");
const char *windowsCommand = "perl ";
const char *executable = "bin/hipcc";
static char hipcc[65536];
static char finalCommand[65536];
if(hipPath) {
path_join(hipPath, executable, sizeof(hipcc), hipcc);
if(path_exists(hipcc)) {
snprintf(finalCommand, sizeof(finalCommand), "%s %s", windowsCommand, hipcc);
return finalCommand;
} else {
printf("Could not find hipcc. Make sure HIP_ROCCLR_HOME points to the directory holding /bin/hipcc");
}
}
#else
const char *hipPath = "opt/rocm/hip/bin";
const char *executable = "hipcc";
static char hipcc[65536];
if(hipPath) {
path_join(hipPath, executable, sizeof(hipcc), hipcc);
if(path_exists(hipcc)){
return hipcc;
}
}
#endif
{
#ifdef _WIN32
FILE *handle = popen("where hipcc", "r");
#else
FILE *handle = popen("which hipcc", "r");
#endif
if (handle) {
char buffer[4096] = {0};
int len = fread(buffer, 1, sizeof(buffer) - 1, handle);
buffer[len] = '\0';
pclose(handle);
if (buffer[0]) {
return "hipcc";
}
}
}
return NULL;
}
int hipewCompilerVersion(void) {
const char *path = hipewCompilerPath();
const char *marker = "Hip compilation tools, release ";
FILE *pipe;
int major, minor;
char *versionstr;
char buf[128];
char output[65536] = "\0";
char command[65536] = "\0";
if (path == NULL) {
return 0;
}
/* get --version output */
strcat(command, "\"");
strncat(command, path, sizeof(command) - 1);
strncat(command, "\" --version", sizeof(command) - strlen(path) - 1);
pipe = popen(command, "r");
if (!pipe) {
fprintf(stderr, "HIP: failed to run compiler to retrieve version");
return 0;
}
while (!feof(pipe)) {
if (fgets(buf, sizeof(buf), pipe) != NULL) {
strncat(output, buf, sizeof(output) - strlen(output) - 1);
}
}
pclose(pipe);
return 40;
}
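To illustrate how the wrapper above is meant to be consumed: hipewInit() resolves the HIP library once and binds the function pointers, after which they are called like the regular HIP driver API. A minimal sketch (not part of the commit), assuming hipew.h is on the include path and a HIP runtime is installed.

#include <cstdio>
#include "hipew.h"

int main()
{
  /* Resolve amdhip64.dll / libamdhip64.so and bind the function pointers. */
  if (hipewInit(HIPEW_INIT_HIP) != HIPEW_SUCCESS) {
    fprintf(stderr, "HIP runtime not found\n");
    return 1;
  }

  hipError_t result = hipInit(0);
  if (result != hipSuccess) {
    fprintf(stderr, "hipInit failed: %s\n", hipewErrorString(result));
    return 1;
  }

  int count = 0;
  if (hipGetDeviceCount(&count) == hipSuccess) {
    for (int num = 0; num < count; num++) {
      char name[256];
      if (hipDeviceGetName(name, 256, num) == hipSuccess) {
        printf("HIP device %d: %s\n", num, name);
      }
    }
  }
  return 0;
}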

@ -297,6 +297,7 @@ endif()
if(WITH_CYCLES_STANDALONE)
set(WITH_CYCLES_DEVICE_CUDA TRUE)
set(WITH_CYCLES_DEVICE_HIP TRUE)
endif()
# TODO(sergey): Consider removing it, only causes confusion in interface.
set(WITH_CYCLES_DEVICE_MULTI TRUE)

@ -95,6 +95,9 @@ set(ADDON_FILES
add_definitions(${GL_DEFINITIONS})
if(WITH_CYCLES_DEVICE_HIP)
add_definitions(-DWITH_HIP)
endif()
if(WITH_MOD_FLUID)
add_definitions(-DWITH_FLUID)
endif()

@ -28,7 +28,7 @@ def _configure_argument_parser():
action='store_true')
parser.add_argument("--cycles-device",
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
"Valid options are 'CPU', 'CUDA' or 'OPTIX'."
"Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'"
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
default=None)
return parser

@ -111,6 +111,7 @@ enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
('OPTIX', "OptiX", "OptiX", 3),
("HIP", "HIP", "HIP", 4)
)
enum_texture_limit = (
@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
has_cuda, has_optix = _cycles.get_device_types()
has_cuda, has_optix, has_hip = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
if has_optix:
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
if has_hip:
list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
return list
compute_device_type: EnumProperty(
@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
if compute_device_type != 'CPU':
if compute_device_type != 'CPU' and compute_device_type != 'HIP':
devices.extend(cpu_devices)
return devices
@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
import _cycles
# Ensure `self.devices` is not re-allocated when the second call to
# get_devices_for_type is made, freeing items from the first list.
for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
for device_type in ('CUDA', 'OPTIX', 'HIP'):
self.update_device_entries(_cycles.available_devices(device_type))
# Deprecated: use refresh_devices instead.

@ -99,6 +99,11 @@ def use_cuda(context):
return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')
def use_hip(context):
cscene = context.scene.cycles
return (get_device_type(context) == 'HIP' and cscene.device == 'GPU')
def use_optix(context):
cscene = context.scene.cycles

@ -26,6 +26,7 @@ enum ComputeDevice {
COMPUTE_DEVICE_CPU = 0,
COMPUTE_DEVICE_CUDA = 1,
COMPUTE_DEVICE_OPTIX = 3,
COMPUTE_DEVICE_HIP = 4,
COMPUTE_DEVICE_NUM
};
@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
else if (compute_device == COMPUTE_DEVICE_OPTIX) {
mask |= DEVICE_MASK_OPTIX;
}
else if (compute_device == COMPUTE_DEVICE_HIP) {
mask |= DEVICE_MASK_HIP;
}
vector<DeviceInfo> devices = Device::available_devices(mask);
/* Match device preferences and available devices. */

@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
bool has_cuda = false, has_optix = false;
bool has_cuda = false, has_optix = false, has_hip = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX);
has_hip |= (device_type == DEVICE_HIP);
}
PyObject *list = PyTuple_New(2);
PyObject *list = PyTuple_New(3);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
return list;
}
@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
else if (override == "OPTIX") {
BlenderSession::device_override = DEVICE_MASK_OPTIX;
}
else if (override == "HIP") {
BlenderSession::device_override = DEVICE_MASK_HIP;
}
else {
printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
Py_RETURN_FALSE;

@ -531,5 +531,9 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD)
endif()
endif()
endif()
if(NOT WITH_HIP_DYNLOAD)
message(STATUS "Setting up HIP Dynamic Load")
set(WITH_HIP_DYNLOAD ON)
endif()
unset(_cycles_lib_dir)

@ -162,6 +162,10 @@ macro(cycles_target_link_libraries target)
target_link_libraries(${target} ${CUDA_CUDA_LIBRARY})
endif()
if(WITH_HIP_DYNLOAD)
target_link_libraries(${target} extern_hipew)
endif()
if(CYCLES_STANDALONE_REPOSITORY)
target_link_libraries(${target} extern_numaapi)
else()

@ -34,6 +34,13 @@ else()
add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
endif()
if(WITH_HIP_DYNLOAD)
list(APPEND INC
../../../extern/hipew/include
)
add_definitions(-DWITH_HIP_DYNLOAD)
endif()
set(SRC
device.cpp
device_denoise.cpp
@ -70,6 +77,21 @@ set(SRC_CUDA
cuda/util.h
)
set(SRC_HIP
hip/device.cpp
hip/device.h
hip/device_impl.cpp
hip/device_impl.h
hip/graphics_interop.cpp
hip/graphics_interop.h
hip/kernel.cpp
hip/kernel.h
hip/queue.cpp
hip/queue.h
hip/util.cpp
hip/util.h
)
set(SRC_DUMMY
dummy/device.cpp
dummy/device.h
@ -115,11 +137,20 @@ else()
)
endif()
if(WITH_HIP_DYNLOAD)
list(APPEND LIB
extern_hipew
)
endif()
add_definitions(${GL_DEFINITIONS})
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_HIP)
add_definitions(-DWITH_HIP)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
@ -140,6 +171,7 @@ cycles_add_library(cycles_device "${LIB}"
${SRC}
${SRC_CPU}
${SRC_CUDA}
${SRC_HIP}
${SRC_DUMMY}
${SRC_MULTI}
${SRC_OPTIX}

@ -25,6 +25,7 @@
#include "device/cpu/device.h"
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"
#include "device/multi/device.h"
#include "device/optix/device.h"
@ -46,6 +47,7 @@ thread_mutex Device::device_mutex;
vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
vector<DeviceInfo> Device::hip_devices;
uint Device::devices_initialized_mask = 0;
/* Device */
@ -96,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
device = device_optix_create(info, stats, profiler);
break;
#endif
#ifdef WITH_HIP
case DEVICE_HIP:
if (device_hip_init())
device = device_hip_create(info, stats, profiler);
break;
#endif
default:
break;
}
@ -117,6 +127,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_OPTIX;
else if (strcmp(name, "MULTI") == 0)
return DEVICE_MULTI;
else if (strcmp(name, "HIP") == 0)
return DEVICE_HIP;
return DEVICE_NONE;
}
@ -131,6 +143,8 @@ string Device::string_from_type(DeviceType type)
return "OPTIX";
else if (type == DEVICE_MULTI)
return "MULTI";
else if (type == DEVICE_HIP)
return "HIP";
return "";
}
@ -145,6 +159,10 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_OPTIX
types.push_back(DEVICE_OPTIX);
#endif
#ifdef WITH_HIP
types.push_back(DEVICE_HIP);
#endif
return types;
}
@ -186,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
#endif
#ifdef WITH_HIP
if (mask & DEVICE_MASK_HIP) {
if (!(devices_initialized_mask & DEVICE_MASK_HIP)) {
if (device_hip_init()) {
device_hip_info(hip_devices);
}
devices_initialized_mask |= DEVICE_MASK_HIP;
}
foreach (DeviceInfo &info, hip_devices) {
devices.push_back(info);
}
}
#endif
if (mask & DEVICE_MASK_CPU) {
if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
device_cpu_info(cpu_devices);
@ -226,6 +258,15 @@ string Device::device_capabilities(uint mask)
}
#endif
#ifdef WITH_HIP
if (mask & DEVICE_MASK_HIP) {
if (device_hip_init()) {
capabilities += "\nHIP device capabilities:\n";
capabilities += device_hip_capabilities();
}
}
#endif
return capabilities;
}
@ -314,6 +355,7 @@ void Device::free_memory()
devices_initialized_mask = 0;
cuda_devices.free_memory();
optix_devices.free_memory();
hip_devices.free_memory();
cpu_devices.free_memory();
}
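With the hooks above, HIP devices flow through the same enumeration path as CUDA and OptiX. A minimal sketch (not part of the commit) of a caller filtering on the new DEVICE_MASK_HIP bit, assuming the ccl namespace and the headers used in this file.

#include "device/device.h"
#include "util/util_logging.h"

CCL_NAMESPACE_BEGIN

void log_hip_devices()
{
  /* First use triggers device_hip_init() / device_hip_info(), see above. */
  for (const DeviceInfo &info : Device::available_devices(DEVICE_MASK_HIP)) {
    VLOG(1) << Device::string_from_type(info.type) << " device: " << info.description
            << " (id " << info.id << ")";
  }
}

CCL_NAMESPACE_END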

@ -51,6 +51,7 @@ enum DeviceType {
DEVICE_CUDA,
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_HIP,
DEVICE_DUMMY,
};
@ -58,6 +59,7 @@ enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
DEVICE_MASK_HIP = (1 << DEVICE_HIP),
DEVICE_MASK_ALL = ~0
};
@ -284,6 +286,7 @@ class Device {
static vector<DeviceInfo> cuda_devices;
static vector<DeviceInfo> optix_devices;
static vector<DeviceInfo> cpu_devices;
static vector<DeviceInfo> hip_devices;
static uint devices_initialized_mask;
};

@ -277,6 +277,7 @@ class device_memory {
protected:
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);

@ -0,0 +1,276 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "device/hip/device.h"
#include "util/util_logging.h"
#ifdef WITH_HIP
# include "device/device.h"
# include "device/hip/device_impl.h"
# include "util/util_string.h"
# include "util/util_windows.h"
#endif /* WITH_HIP */
CCL_NAMESPACE_BEGIN
bool device_hip_init()
{
#if !defined(WITH_HIP)
return false;
#elif defined(WITH_HIP_DYNLOAD)
static bool initialized = false;
static bool result = false;
if (initialized)
return result;
initialized = true;
int hipew_result = hipewInit(HIPEW_INIT_HIP);
if (hipew_result == HIPEW_SUCCESS) {
VLOG(1) << "HIPEW initialization succeeded";
if (HIPDevice::have_precompiled_kernels()) {
VLOG(1) << "Found precompiled kernels";
result = true;
}
else if (hipewCompilerPath() != NULL) {
VLOG(1) << "Found HIPCC " << hipewCompilerPath();
result = true;
}
else {
VLOG(1) << "Neither precompiled kernels nor HIPCC was found,"
<< " unable to use HIP";
}
}
else {
VLOG(1) << "HIPEW initialization failed: "
<< ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
"Error opening the library");
}
return result;
#else /* WITH_HIP_DYNLOAD */
return true;
#endif /* WITH_HIP_DYNLOAD */
}
Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_HIP
return new HIPDevice(info, stats, profiler);
#else
(void)info;
(void)stats;
(void)profiler;
LOG(FATAL) << "Request to create HIP device without compiled-in support. Should never happen.";
return nullptr;
#endif
}
#ifdef WITH_HIP
static hipError_t device_hip_safe_init()
{
# ifdef _WIN32
__try {
return hipInit(0);
}
__except (EXCEPTION_EXECUTE_HANDLER) {
/* Ignore crashes inside the HIP driver and hope we can
* survive even with corrupted HIP installs. */
fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n");
}
return hipErrorNoDevice;
# else
return hipInit(0);
# endif
}
#endif /* WITH_HIP */
void device_hip_info(vector<DeviceInfo> &devices)
{
#ifdef WITH_HIP
hipError_t result = device_hip_safe_init();
if (result != hipSuccess) {
if (result != hipErrorNoDevice)
fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result));
return;
}
int count = 0;
result = hipGetDeviceCount(&count);
if (result != hipSuccess) {
fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result));
return;
}
vector<DeviceInfo> display_devices;
for (int num = 0; num < count; num++) {
char name[256];
result = hipDeviceGetName(name, 256, num);
if (result != hipSuccess) {
fprintf(stderr, "HIP :hipDeviceGetName: %s\n", hipewErrorString(result));
continue;
}
int major;
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num);
// TODO : (Arya) What is the last major version we are supporting?
DeviceInfo info;
info.type = DEVICE_HIP;
info.description = string(name);
info.num = num;
info.has_half_images = (major >= 3);
info.has_nanovdb = true;
info.denoisers = 0;
info.has_gpu_queue = true;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
if (num != peer_num) {
int can_access = 0;
hipDeviceCanAccessPeer(&can_access, num, peer_num);
info.has_peer_memory = (can_access != 0);
}
}
int pci_location[3] = {0, 0, 0};
hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num);
hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num);
hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num);
info.id = string_printf("HIP_%s_%04x:%02x:%02x",
name,
(unsigned int)pci_location[0],
(unsigned int)pci_location[1],
(unsigned int)pci_location[2]);
/* If device has a kernel timeout and no compute preemption, we assume
* it is connected to a display and will freeze the display while doing
* computations. */
int timeout_attr = 0, preempt_attr = 0;
hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num);
if (timeout_attr && !preempt_attr) {
VLOG(1) << "Device is recognized as display.";
info.description += " (Display)";
info.display_device = true;
display_devices.push_back(info);
}
else {
VLOG(1) << "Device has compute preemption or is not used for display.";
devices.push_back(info);
}
VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
}
if (!display_devices.empty())
devices.insert(devices.end(), display_devices.begin(), display_devices.end());
#else /* WITH_HIP */
(void)devices;
#endif /* WITH_HIP */
}
string device_hip_capabilities()
{
#ifdef WITH_HIP
hipError_t result = device_hip_safe_init();
if (result != hipSuccess) {
if (result != hipErrorNoDevice) {
return string("Error initializing HIP: ") + hipewErrorString(result);
}
return "No HIP device found\n";
}
int count;
result = hipGetDeviceCount(&count);
if (result != hipSuccess) {
return string("Error getting devices: ") + hipewErrorString(result);
}
string capabilities = "";
for (int num = 0; num < count; num++) {
char name[256];
if (hipDeviceGetName(name, 256, num) != hipSuccess) {
continue;
}
capabilities += string("\t") + name + "\n";
int value;
# define GET_ATTR(attr) \
{ \
if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \
capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \
} \
} \
(void)0
/* TODO(sergey): Strip all attributes which are not useful for us
 * or do not depend on the driver.
 */
GET_ATTR(MaxThreadsPerBlock);
GET_ATTR(MaxBlockDimX);
GET_ATTR(MaxBlockDimY);
GET_ATTR(MaxBlockDimZ);
GET_ATTR(MaxGridDimX);
GET_ATTR(MaxGridDimY);
GET_ATTR(MaxGridDimZ);
GET_ATTR(MaxSharedMemoryPerBlock);
GET_ATTR(TotalConstantMemory);
GET_ATTR(WarpSize);
GET_ATTR(MaxPitch);
GET_ATTR(MaxRegistersPerBlock);
GET_ATTR(ClockRate);
GET_ATTR(TextureAlignment);
GET_ATTR(MultiprocessorCount);
GET_ATTR(KernelExecTimeout);
GET_ATTR(Integrated);
GET_ATTR(CanMapHostMemory);
GET_ATTR(ComputeMode);
GET_ATTR(MaxTexture1DWidth);
GET_ATTR(MaxTexture2DWidth);
GET_ATTR(MaxTexture2DHeight);
GET_ATTR(MaxTexture3DWidth);
GET_ATTR(MaxTexture3DHeight);
GET_ATTR(MaxTexture3DDepth);
GET_ATTR(ConcurrentKernels);
GET_ATTR(EccEnabled);
GET_ATTR(MemoryClockRate);
GET_ATTR(MemoryBusWidth);
GET_ATTR(L2CacheSize);
GET_ATTR(MaxThreadsPerMultiProcessor);
GET_ATTR(ComputeCapabilityMajor);
GET_ATTR(ComputeCapabilityMinor);
GET_ATTR(MaxSharedMemoryPerMultiprocessor);
GET_ATTR(ManagedMemory);
GET_ATTR(IsMultiGpuBoard);
# undef GET_ATTR
capabilities += "\n";
}
return capabilities;
#else /* WITH_HIP */
return "";
#endif /* WITH_HIP */
}
CCL_NAMESPACE_END

@ -0,0 +1,37 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "util/util_string.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class Device;
class DeviceInfo;
class Profiler;
class Stats;
bool device_hip_init();
Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
void device_hip_info(vector<DeviceInfo> &devices);
string device_hip_capabilities();
CCL_NAMESPACE_END

File diff suppressed because it is too large.

@ -0,0 +1,153 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_HIP
# include "device/device.h"
# include "device/hip/kernel.h"
# include "device/hip/queue.h"
# include "device/hip/util.h"
# include "util/util_map.h"
# ifdef WITH_HIP_DYNLOAD
# include "hipew.h"
# else
# include "util/util_opengl.h"
# endif
CCL_NAMESPACE_BEGIN
class DeviceQueue;
class HIPDevice : public Device {
friend class HIPContextScope;
public:
hipDevice_t hipDevice;
hipCtx_t hipContext;
hipModule_t hipModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int hipDevId;
int hipDevArchitecture;
bool first_error;
struct HIPMem {
HIPMem() : texobject(0), array(0), use_mapped_host(false)
{
}
hipTextureObject_t texobject;
hArray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, HIPMem> HIPMemMap;
HIPMemMap hip_mem_map;
thread_mutex hip_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
HIPDeviceKernels kernels;
static bool have_precompiled_kernels();
virtual bool show_samples() const override;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
void set_error(const string &error) override;
HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
virtual ~HIPDevice();
bool support_device(const uint /*kernel_features*/);
bool check_peer_access(Device *peer_device) override;
bool use_adaptive_compilation();
virtual string compile_kernel_get_common_cflags(const uint kernel_features);
string compile_kernel(const uint kernel_features,
const char *name,
const char *base = "hip",
bool force_ptx = false);
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
void mem_alloc(device_memory &mem) override;
void mem_copy_to(device_memory &mem) override;
void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
void mem_zero(device_memory &mem) override;
void mem_free(device_memory &mem) override;
device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
virtual void const_copy_to(const char *name, void *host, size_t size) override;
void global_alloc(device_memory &mem);
void global_free(device_memory &mem);
void tex_alloc(device_texture &mem);
void tex_free(device_texture &mem);
/* Graphics resources interoperability. */
virtual bool should_use_graphics_interop() override;
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
int get_num_multiprocessors();
int get_max_num_threads_per_multiprocessor();
protected:
bool get_device_attribute(hipDeviceAttribute_t attribute, int *value);
int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value);
};
CCL_NAMESPACE_END
#endif

@ -0,0 +1,93 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_HIP
# include "device/hip/graphics_interop.h"
# include "device/hip/device_impl.h"
# include "device/hip/util.h"
CCL_NAMESPACE_BEGIN
HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue)
: queue_(queue), device_(static_cast<HIPDevice *>(queue->device))
{
}
HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop()
{
HIPContextScope scope(device_);
if (hip_graphics_resource_) {
hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
}
}
void HIPDeviceGraphicsInterop::set_destination(const DeviceGraphicsInteropDestination &destination)
{
const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
return;
}
HIPContextScope scope(device_);
if (hip_graphics_resource_) {
hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
}
const hipError_t result = hipGraphicsGLRegisterBuffer(
&hip_graphics_resource_, destination.opengl_pbo_id, hipGraphicsRegisterFlagsNone);
if (result != hipSuccess) {
LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result);
}
opengl_pbo_id_ = destination.opengl_pbo_id;
buffer_area_ = new_buffer_area;
}
device_ptr HIPDeviceGraphicsInterop::map()
{
if (!hip_graphics_resource_) {
return 0;
}
HIPContextScope scope(device_);
hipDeviceptr_t hip_buffer;
size_t bytes;
hip_device_assert(device_,
hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream()));
hip_device_assert(
device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_));
return static_cast<device_ptr>(hip_buffer);
}
void HIPDeviceGraphicsInterop::unmap()
{
HIPContextScope scope(device_);
hip_device_assert(device_,
hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream()));
}
CCL_NAMESPACE_END
#endif
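The interop object mirrors the CUDA implementation: register the OpenGL PBO once, then map/unmap around each update. A short usage sketch (not part of the commit), assuming an existing HIPDeviceQueue, a filled DeviceGraphicsInteropDestination and the headers included above.

/* Sketch: write the display buffer through the registered PBO. */
void update_display_buffer(HIPDeviceQueue *queue, const DeviceGraphicsInteropDestination &dest)
{
  HIPDeviceGraphicsInterop interop(queue);
  interop.set_destination(dest); /* hipGraphicsGLRegisterBuffer on first use. */

  const device_ptr buffer = interop.map(); /* hipGraphicsMapResources + mapped pointer. */
  if (buffer) {
    /* ... enqueue a kernel on `queue` that writes into `buffer` ... */
    interop.unmap(); /* hipGraphicsUnmapResources. */
  }
}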

@ -0,0 +1,61 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_HIP
# include "device/device_graphics_interop.h"
# ifdef WITH_HIP_DYNLOAD
# include "hipew.h"
# endif
CCL_NAMESPACE_BEGIN
class HIPDevice;
class HIPDeviceQueue;
class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
public:
explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue);
HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete;
HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete;
~HIPDeviceGraphicsInterop();
HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete;
HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete;
virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
virtual device_ptr map() override;
virtual void unmap() override;
protected:
HIPDeviceQueue *queue_ = nullptr;
HIPDevice *device_ = nullptr;
/* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
uint opengl_pbo_id_ = 0;
/* Buffer area in pixels of the corresponding PBO. */
int64_t buffer_area_ = 0;
hipGraphicsResource hip_graphics_resource_ = nullptr;
};
CCL_NAMESPACE_END
#endif

@ -0,0 +1,69 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_HIP
# include "device/hip/kernel.h"
# include "device/hip/device_impl.h"
CCL_NAMESPACE_BEGIN
void HIPDeviceKernels::load(HIPDevice *device)
{
hipModule_t hipModule = device->hipModule;
for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
HIPDeviceKernel &kernel = kernels_[i];
/* No megakernel used for GPU. */
if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
continue;
}
const std::string function_name = std::string("kernel_gpu_") +
device_kernel_as_string((DeviceKernel)i);
hip_device_assert(device,
hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str()));
if (kernel.function) {
hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1));
hip_device_assert(
device,
hipModuleOccupancyMaxPotentialBlockSize(
&kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0));
}
else {
LOG(ERROR) << "Unable to load kernel " << function_name;
}
}
loaded = true;
}
const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const
{
return kernels_[(int)kernel];
}
bool HIPDeviceKernels::available(DeviceKernel kernel) const
{
return kernels_[(int)kernel].function != nullptr;
}
CCL_NAMESPACE_END
#endif /* WITH_HIP*/
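Each DeviceKernel is resolved from the module by name ("kernel_gpu_" + device_kernel_as_string()), so a missing symbol simply leaves its table entry null. A small sketch (not part of the commit) of the intended lookup pattern, assuming a HIPDevice whose kernels have already been loaded.

/* Sketch: query the kernel table before launching. */
bool can_run(const HIPDevice *device, DeviceKernel which)
{
  if (!device->kernels.available(which)) {
    return false; /* Module did not export kernel_gpu_<name>. */
  }
  const HIPDeviceKernel &kernel = device->kernels.get(which);
  /* function, num_threads_per_block and min_blocks feed hipModuleLaunchKernel(),
   * see HIPDeviceQueue::enqueue(). */
  return kernel.function != nullptr;
}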

@ -0,0 +1,54 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_HIP
# include "device/device_kernel.h"
# ifdef WITH_HIP_DYNLOAD
# include "hipew.h"
# endif
CCL_NAMESPACE_BEGIN
class HIPDevice;
/* HIP kernel and associated occupancy information. */
class HIPDeviceKernel {
public:
hipFunction_t function = nullptr;
int num_threads_per_block = 0;
int min_blocks = 0;
};
/* Cache of HIP kernels for each DeviceKernel. */
class HIPDeviceKernels {
public:
void load(HIPDevice *device);
const HIPDeviceKernel &get(DeviceKernel kernel) const;
bool available(DeviceKernel kernel) const;
protected:
HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM];
bool loaded = false;
};
CCL_NAMESPACE_END
#endif /* WITH_HIP */

@ -0,0 +1,209 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_HIP
# include "device/hip/queue.h"
# include "device/hip/device_impl.h"
# include "device/hip/graphics_interop.h"
# include "device/hip/kernel.h"
CCL_NAMESPACE_BEGIN
/* HIPDeviceQueue */
HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
: DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
{
const HIPContextScope scope(hip_device_);
hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
}
HIPDeviceQueue::~HIPDeviceQueue()
{
const HIPContextScope scope(hip_device_);
hipStreamDestroy(hip_stream_);
}
int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
{
/* TODO: compute automatically. */
/* TODO: must have at least num_threads_per_block. */
return 14416128;
}
int HIPDeviceQueue::num_concurrent_busy_states() const
{
const int max_num_threads = hip_device_->get_num_multiprocessors() *
hip_device_->get_max_num_threads_per_multiprocessor();
if (max_num_threads == 0) {
return 65536;
}
return 4 * max_num_threads;
}
void HIPDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
HIPContextScope scope(hip_device_);
hip_device_->load_texture_info();
hip_device_assert(hip_device_, hipDeviceSynchronize());
debug_init_execution();
}
bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
{
return hip_device_->kernels.available(kernel);
}
bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
{
if (hip_device_->have_error()) {
return false;
}
debug_enqueue(kernel, work_size);
const HIPContextScope scope(hip_device_);
const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);
/* Compute kernel launch parameters. */
const int num_threads_per_block = hip_kernel.num_threads_per_block;
const int num_blocks = divide_up(work_size, num_threads_per_block);
int shared_mem_bytes = 0;
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
/* See parallel_active_index.h for why this amount of shared memory is needed. */
shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
break;
default:
break;
}
/* Launch kernel. */
hip_device_assert(hip_device_,
hipModuleLaunchKernel(hip_kernel.function,
num_blocks,
1,
1,
num_threads_per_block,
1,
1,
shared_mem_bytes,
hip_stream_,
args,
0));
return !(hip_device_->have_error());
}
bool HIPDeviceQueue::synchronize()
{
if (hip_device_->have_error()) {
return false;
}
const HIPContextScope scope(hip_device_);
hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
debug_synchronize();
return !(hip_device_->have_error());
}
void HIPDeviceQueue::zero_to_device(device_memory &mem)
{
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {
return;
}
/* Allocate on demand. */
if (mem.device_pointer == 0) {
hip_device_->mem_alloc(mem);
}
/* Zero memory on device. */
assert(mem.device_pointer != 0);
const HIPContextScope scope(hip_device_);
hip_device_assert(
hip_device_,
hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
}
void HIPDeviceQueue::copy_to_device(device_memory &mem)
{
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {
return;
}
/* Allocate on demand. */
if (mem.device_pointer == 0) {
hip_device_->mem_alloc(mem);
}
assert(mem.device_pointer != 0);
assert(mem.host_pointer != nullptr);
/* Copy memory to device. */
const HIPContextScope scope(hip_device_);
hip_device_assert(
hip_device_,
hipMemcpyHtoDAsync(
(hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
}
void HIPDeviceQueue::copy_from_device(device_memory &mem)
{
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {
return;
}
assert(mem.device_pointer != 0);
assert(mem.host_pointer != nullptr);
/* Copy memory from device. */
const HIPContextScope scope(hip_device_);
hip_device_assert(
hip_device_,
hipMemcpyDtoHAsync(
mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
}
// TODO : (Arya) Enable this after stabilizing dev branch
unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
return make_unique<HIPDeviceGraphicsInterop>(this);
}
CCL_NAMESPACE_END
#endif /* WITH_HIP */
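enqueue() receives kernel arguments as an array of pointers that is forwarded directly to hipModuleLaunchKernel(); the launch configuration comes from the kernel table. A minimal caller sketch (not part of the commit), assuming the existing Cycles DEVICE_KERNEL_INTEGRATOR_RESET kernel, which takes a single int.

/* Sketch: enqueue a kernel and wait for it to finish. */
bool reset_integrator_states(HIPDevice *device, int num_states)
{
  unique_ptr<DeviceQueue> queue = device->gpu_queue_create();

  /* Arguments are passed by address, matching hipModuleLaunchKernel's kernelParams. */
  void *args[] = {&num_states};
  if (!queue->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, num_states, args)) {
    return false;
  }
  return queue->synchronize();
}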

@ -0,0 +1,68 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_HIP
# include "device/device_kernel.h"
# include "device/device_memory.h"
# include "device/device_queue.h"
# include "device/hip/util.h"
CCL_NAMESPACE_BEGIN
class HIPDevice;
class device_memory;
/* Base class for HIP queues. */
class HIPDeviceQueue : public DeviceQueue {
public:
HIPDeviceQueue(HIPDevice *device);
~HIPDeviceQueue();
virtual int num_concurrent_states(const size_t state_size) const override;
virtual int num_concurrent_busy_states() const override;
virtual void init_execution() override;
virtual bool kernel_available(DeviceKernel kernel) const override;
virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
virtual bool synchronize() override;
virtual void zero_to_device(device_memory &mem) override;
virtual void copy_to_device(device_memory &mem) override;
virtual void copy_from_device(device_memory &mem) override;
virtual hipStream_t stream()
{
return hip_stream_;
}
// TODO: (Arya) Enable this after stabilizing the dev branch
virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
protected:
HIPDevice *hip_device_;
hipStream_t hip_stream_;
};
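/* Illustrative usage sketch (not part of the API surface; `hip_device`, `buffer`,
 * `kernel` and `work_size` are hypothetical names). Higher-level device code
 * normally uploads data, enqueues a kernel and then waits on the queue:
 *
 *   HIPDeviceQueue queue(hip_device);
 *   queue.init_execution();
 *   queue.copy_to_device(buffer);
 *   void *args[] = {&buffer.device_pointer};
 *   queue.enqueue(kernel, work_size, args);
 *   queue.synchronize();
 */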
CCL_NAMESPACE_END
#endif /* WITH_HIP */

@ -0,0 +1,61 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_HIP
# include "device/hip/util.h"
# include "device/hip/device_impl.h"
CCL_NAMESPACE_BEGIN
HIPContextScope::HIPContextScope(HIPDevice *device) : device(device)
{
hip_device_assert(device, hipCtxPushCurrent(device->hipContext));
}
HIPContextScope::~HIPContextScope()
{
hip_device_assert(device, hipCtxPopCurrent(NULL));
}
# ifndef WITH_HIP_DYNLOAD
const char *hipewErrorString(hipError_t result)
{
/* We can only give the error code here without major code duplication; that
* should be enough, since dynamic loading is only disabled by folks who know
* what they're doing anyway.
*
* NOTE: Avoid calling this from multiple threads.
*/
static string error;
error = string_printf("%d", result);
return error.c_str();
}
const char *hipewCompilerPath()
{
return CYCLES_HIP_HIPCC_EXECUTABLE;
}
int hipewCompilerVersion()
{
return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10);
}
# endif
CCL_NAMESPACE_END
#endif /* WITH_HIP */

@ -0,0 +1,63 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_HIP
# ifdef WITH_HIP_DYNLOAD
# include "hipew.h"
# endif
CCL_NAMESPACE_BEGIN
class HIPDevice;
/* Utility to push/pop HIP context. */
class HIPContextScope {
public:
HIPContextScope(HIPDevice *device);
~HIPContextScope();
private:
HIPDevice *device;
};
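/* Sketch of intended use (illustrative only; `device` and `stream` are placeholder
 * names): the constructor pushes the device's HIP context and the destructor pops
 * it, so HIP runtime calls inside the block target the right context:
 *
 *   {
 *     const HIPContextScope scope(device);
 *     hip_device_assert(device, hipStreamSynchronize(stream));
 *   }
 */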
/* Utility for checking return values of HIP function calls. */
# define hip_device_assert(hip_device, stmt) \
{ \
hipError_t result = stmt; \
if (result != hipSuccess) { \
const char *name = hipewErrorString(result); \
hip_device->set_error( \
string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
} \
} \
(void)0
# define hip_assert(stmt) hip_device_assert(this, stmt)
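/* Example (sketch, with `stream` as a placeholder): inside a HIPDevice member
 * function the short form passes `this` implicitly, e.g.
 * `hip_assert(hipStreamSynchronize(stream));`, and any failure is routed to
 * HIPDevice::set_error() together with the statement, file and line. */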
# ifndef WITH_HIP_DYNLOAD
/* Transparently implement some functions, so the majority of the file does not need
* to worry about the difference between dynamically loaded and linked HIP at all. */
const char *hipewErrorString(hipError_t result);
const char *hipewCompilerPath();
int hipewCompilerVersion();
# endif /* WITH_HIP_DYNLOAD */
CCL_NAMESPACE_END
#endif /* WITH_HIP */

@ -1035,6 +1035,8 @@ static const char *device_type_for_description(const DeviceType type)
return "CUDA";
case DEVICE_OPTIX:
return "OptiX";
case DEVICE_HIP:
return "HIP";
case DEVICE_DUMMY:
return "Dummy";
case DEVICE_MULTI:

@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA
device/cuda/kernel.cu
)
set(SRC_DEVICE_HIP
device/hip/kernel.cpp
)
set(SRC_DEVICE_OPTIX
device/optix/kernel.cu
device/optix/kernel_shader_raytrace.cu
@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS
device/cuda/globals.h
)
set(SRC_DEVICE_HIP_HEADERS
device/hip/compat.h
device/hip/config.h
device/hip/globals.h
)
set(SRC_DEVICE_OPTIX_HEADERS
device/optix/compat.h
device/optix/globals.h
@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES)
cycles_set_solution_folder(cycles_kernel_cuda)
endif()
####################################################### START
# HIP module
if(WITH_CYCLES_HIP_BINARIES)
# 64 bit only
set(HIP_BITS 64)
# HIP version
execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT)
string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}")
string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}")
set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}")
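# NOTE: the regular expressions above assume hipcc reports a string of the form
# "release <major>.<minor>" in its --version output; a toolchain that formats its
# version differently would leave HIP_VERSION_MAJOR/MINOR unparsed.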
message(WARNING
"HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected")
# build for each arch
set(hip_sources device/hip/kernel.cpp
${SRC_HEADERS}
${SRC_DEVICE_HIP_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
set(hip_fatbins)
macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental)
if(${arch} MATCHES "compute_.*")
set(format "ptx")
else()
set(format "fatbin")
endif()
set(hip_file ${name}_${arch}.${format})
set(kernel_sources ${sources})
if(NOT ${prev_arch} STREQUAL "none")
if(${prev_arch} MATCHES "compute_.*")
set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx)
else()
set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin)
endif()
endif()
set(hip_kernel_src "/device/hip/${name}.cpp")
set(hip_flags ${flags}
-D CCL_NAMESPACE_BEGIN=
-D CCL_NAMESPACE_END=
-D HIPCC
-m ${HIP_BITS}
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
-I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip
--use_fast_math
-o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})
if(${experimental})
set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
set(name ${name}_experimental)
endif()
if(WITH_CYCLES_DEBUG)
set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
endif()
if(WITH_NANOVDB)
set(hip_flags ${hip_flags}
-D WITH_NANOVDB
-I "${NANOVDB_INCLUDE_DIR}")
endif()
set(prev_arch "none")
foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE})
set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR})
if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir)
# Compile regular kernel
CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE)
if(WITH_CYCLES_HIP_BUILD_SERIAL)
set(prev_arch ${arch})
endif()
unset(hip_hipcc_executable)
unset(hip_toolkit_root_dir)
endif()
endforeach()
add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins})
cycles_set_solution_folder(cycles_kernel_hip)
endif()
####################################################### END
# OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
@ -602,11 +710,13 @@ endif()
cycles_add_library(cycles_kernel "${LIB}"
${SRC_DEVICE_CPU}
${SRC_DEVICE_CUDA}
${SRC_DEVICE_HIP}
${SRC_DEVICE_OPTIX}
${SRC_HEADERS}
${SRC_DEVICE_CPU_HEADERS}
${SRC_DEVICE_GPU_HEADERS}
${SRC_DEVICE_CUDA_HEADERS}
${SRC_DEVICE_HIP_HEADERS}
${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS})
source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_HEADERS})
source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS})
source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
@ -632,14 +743,19 @@ endif()
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
add_dependencies(cycles_kernel cycles_kernel_optix)
endif()
if(WITH_CYCLES_HIP_BINARIES)
add_dependencies(cycles_kernel cycles_kernel_hip)
endif()
# Install kernel source for runtime compilation
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)

@ -25,7 +25,11 @@ CCL_NAMESPACE_BEGIN
#include "util/util_atomic.h"
#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#ifdef __HIP__
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
#else
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,

@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN
#include "util/util_atomic.h"
#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#ifdef __HIP__
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
#else
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
{

@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
* the overall cost of the algorithm while keeping the work complexity O(n) and
* the step complexity O(log n). (Brent's Theorem optimization) */
#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
#ifdef __HIP__
# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024
#else
# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
#endif
template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
__device__ void gpu_parallel_sum(

@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
#include "util/util_atomic.h"
#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#ifdef __HIP__
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
#else
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
template<uint blocksize, typename GetKeyOp>

@ -0,0 +1,121 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#define __KERNEL_GPU__
#define __KERNEL_HIP__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
#ifndef ATTR_FALLTHROUGH
# define ATTR_FALLTHROUGH
#endif
#ifdef __HIPCC_RTC__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
#ifdef CYCLES_HIPBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
# define FLT_MAX 340282346638528859811704183484516925440.0f
# define FLT_EPSILON 1.192092896e-07F
#endif
/* Qualifiers */
#define ccl_device __device__ __inline__
#define ccl_device_inline __device__ __inline__
#define ccl_device_forceinline __device__ __forceinline__
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
#define ccl_optional_struct_init
#define kernel_assert(cond)
/* Types */
#ifdef __HIP__
# include "hip/hip_fp16.h"
# include "hip/hip_runtime.h"
#endif
#ifdef _MSC_VER
# include <immintrin.h>
#endif
#define ccl_gpu_thread_idx_x (threadIdx.x)
#define ccl_gpu_block_dim_x (blockDim.x)
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
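/* For example (illustrative numbers only): with a block dimension of 256, thread 5
 * of block 2 gets global id 2 * 256 + 5 = 517. */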
/* GPU warp synchronization */
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot(predicate)
#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
typedef hipTextureObject_t ccl_gpu_tex_object;
template<typename T>
ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
const float x,
const float y)
{
return tex2D<T>(texobj, x, y);
}
template<typename T>
ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
const float x,
const float y,
const float z)
{
return tex3D<T>(texobj, x, y, z);
}
/* Use fast math functions */
#define cosf(x) __cosf(((float)(x)))
#define sinf(x) __sinf(((float)(x)))
#define powf(x, y) __powf(((float)(x)), ((float)(y)))
#define tanf(x) __tanf(((float)(x)))
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))
/* Types */
#include "util/util_half.h"
#include "util/util_types.h"

@ -0,0 +1,57 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Device data taken from HIP occupancy calculator.
*
* Terminology
* - HIP GPUs have multiple streaming multiprocessors
* - Each multiprocessor executes multiple thread blocks
* - Each thread block contains a number of threads, also known as the block size
* - Multiprocessors have a fixed number of registers, and the number of registers
*   used by each thread limits the number of threads per block.
*/
/* Launch Bound Definitions */
#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
#define GPU_BLOCK_MAX_THREADS 1024
#define GPU_THREAD_MAX_REGISTERS 255
#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
#define GPU_KERNEL_MAX_REGISTERS 64
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))
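/* Worked example (illustrative, using the default values defined above): a kernel
 * declared with ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 * expands to __launch_bounds__(1024, 65536 / (1024 * 64)), i.e. at least one block
 * resident per multiprocessor, with each thread limited to roughly 64 registers. */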
/* sanity checks */
#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
# error "Maximum number of threads per block exceeded"
#endif
#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
GPU_MULTIPROCESSOR_MAX_BLOCKS
# error "Maximum number of blocks per multiprocessor exceeded"
#endif
#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif

@ -0,0 +1,49 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Constant Globals */
#pragma once
#include "kernel/kernel_profiling.h"
#include "kernel/kernel_types.h"
#include "kernel/integrator/integrator_state.h"
CCL_NAMESPACE_BEGIN
/* Not actually used, just a NULL pointer that gets passed everywhere, which we
* hope gets optimized out by the compiler. */
struct KernelGlobals {
/* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
int unused[1];
};
/* Global scene data and textures */
__constant__ KernelData __data;
#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
#include "kernel/kernel_textures.h"
/* Integrator state */
__constant__ IntegratorStateGPU __integrator_state;
/* Abstraction macros */
#define kernel_data __data
#define kernel_tex_fetch(t, index) t[(index)]
#define kernel_tex_array(t) (t)
#define kernel_integrator_state __integrator_state
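/* Illustrative expansion (hypothetical texture name): an entry in
 * kernel/kernel_textures.h such as KERNEL_TEX(float4, __example_tex) becomes a
 * __constant__ device pointer named __example_tex, and
 * kernel_tex_fetch(__example_tex, i) then reads element i straight from that array. */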
CCL_NAMESPACE_END

@ -0,0 +1,28 @@
/*
* Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* HIP kernel entry points */
#ifdef __HIP_DEVICE_COMPILE__
# include "kernel/device/hip/compat.h"
# include "kernel/device/hip/config.h"
# include "kernel/device/hip/globals.h"
# include "kernel/device/gpu/image.h"
# include "kernel/device/gpu/kernel.h"
#endif

@ -34,7 +34,7 @@
#else /* __KERNEL_GPU__ */
# ifdef __KERNEL_CUDA__
# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))

@ -59,12 +59,23 @@ DebugFlags::CUDA::CUDA() : adaptive_compile(false)
reset();
}
DebugFlags::HIP::HIP() : adaptive_compile(false)
{
reset();
}
void DebugFlags::CUDA::reset()
{
if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
}
void DebugFlags::HIP::reset()
{
if (getenv("CYCLES_HIP_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
}
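/* Usage note (illustrative): adaptive HIP kernel compilation is opted into through
 * the environment, e.g. by launching Blender with CYCLES_HIP_ADAPTIVE_COMPILE set
 * to any value, mirroring CYCLES_CUDA_ADAPTIVE_COMPILE above. */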
DebugFlags::OptiX::OptiX()
{
reset();
@ -103,6 +114,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
os << "OptiX flags:\n"
<< " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n";
os << "HIP flags:\n"
<< " HIP streams : " << string_from_bool(debug_flags.hip.adaptive_compile) << "\n";
return os;
}

@ -93,6 +93,17 @@ class DebugFlags {
bool adaptive_compile;
};
/* Descriptor of HIP feature-set to be used. */
struct HIP {
HIP();
/* Reset flags to their defaults. */
void reset();
/* Whether adaptive feature-based runtime compilation is enabled or not. */
bool adaptive_compile;
};
/* Descriptor of OptiX feature-set to be used. */
struct OptiX {
OptiX();
@ -124,6 +135,9 @@ class DebugFlags {
/* Requested OptiX flags. */
OptiX optix;
/* Requested HIP flags. */
HIP hip;
private:
DebugFlags();

@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
/* Half Floats */
/* CUDA and HIP have their own half data type, no need to define it here. */
#ifndef __KERNEL_CUDA__
#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
* unsigned shorts. */
class half {
@ -59,7 +59,7 @@ struct half4 {
half x, y, z, w;
};
#ifdef __KERNEL_CUDA__
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
ccl_device_inline void float4_store_half(half *h, float4 f)
{
@ -73,6 +73,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
ccl_device_inline void float4_store_half(half *h, float4 f)
{
# ifndef __KERNEL_SSE2__
for (int i = 0; i < 4; i++) {
/* optimized float to half for pixels:
@ -109,6 +110,8 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
# endif
}
# ifndef __KERNEL_HIP__
ccl_device_inline float half_to_float(half h)
{
float f;
@ -117,6 +120,23 @@ ccl_device_inline float half_to_float(half h)
return f;
}
# else
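/* The HIP path below converts half to float with integer bit manipulation: the half
 * bits are shifted into float field positions and rescaled by 2^112 and 2^-112
 * (the constants commented as 0x1.0p+112f / 0x1.0p-112f) to account for the
 * difference in exponent bias, with the (a & 0x7fff) test keeping zero inputs on
 * the unscaled path. */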
ccl_device_inline float half_to_float(std::uint32_t a) noexcept
{
std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
std::uint32_t v = __float_as_uint(__uint_as_float(u) *
__uint_as_float(0x77800000U) /*0x1.0p+112f*/) +
0x38000000U;
u = (a & 0x7fff) != 0 ? v : u;
return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/;
}
# endif /* __KERNEL_HIP__ */
ccl_device_inline float4 half4_to_float4(half4 h)
{

@ -26,6 +26,10 @@
# include <cmath>
#endif
#ifdef __HIP__
# include <hip/hip_vector_types.h>
#endif
#include <float.h>
#include <math.h>
#include <stdio.h>
@ -83,7 +87,8 @@ CCL_NAMESPACE_BEGIN
/* Scalar */
#ifdef _WIN32
#ifndef __HIP__
# ifdef _WIN32
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b) ? a : b;
@ -93,7 +98,9 @@ ccl_device_inline float fminf(float a, float b)
{
return (a < b) ? a : b;
}
#endif /* _WIN32 */
# endif /* _WIN32 */
#endif /* __HIP__ */
#ifndef __KERNEL_GPU__
using std::isfinite;
@ -199,6 +206,7 @@ ccl_device_inline uint as_uint(float f)
return u.i;
}
#ifndef __HIP__
ccl_device_inline int __float_as_int(float f)
{
union {
@ -238,6 +246,7 @@ ccl_device_inline float __uint_as_float(uint i)
u.i = i;
return u.f;
}
#endif
ccl_device_inline int4 __float4_as_int4(float4 f)
{
@ -669,7 +678,7 @@ ccl_device float bits_to_01(uint bits)
ccl_device_inline uint count_leading_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __clz(x);
#else
assert(x != 0);
@ -685,7 +694,7 @@ ccl_device_inline uint count_leading_zeros(uint x)
ccl_device_inline uint count_trailing_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return (__ffs(x) - 1);
#else
assert(x != 0);
@ -701,7 +710,7 @@ ccl_device_inline uint count_trailing_zeros(uint x)
ccl_device_inline uint find_first_set(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __ffs(x);
#else
# ifdef _MSC_VER