Add CudaAllocator to encapsulate runtime managed memory logic.

Unified memory is now used when we detect that the hardware supports it.
This commit is contained in:
David C. Lonie 2017-07-14 10:36:15 -04:00 committed by Allison Vacanti
parent 938db09ad5
commit bd042ec567
7 changed files with 268 additions and 66 deletions

@ -108,18 +108,6 @@ option(VTKm_USE_DOUBLE_PRECISION
)
option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON)
if (VTKm_ENABLE_CUDA)
option(VTKm_USE_UNIFIED_MEMORY
"Use CUDA unified memory"
OFF
)
endif (VTKm_ENABLE_CUDA)
if (VTKm_USE_UNIFIED_MEMORY)
set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}")
endif()
option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON)
set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})

@ -30,6 +30,7 @@
#include <vtkm/cont/CellSetExplicit.h>
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
namespace
{
@ -92,8 +93,8 @@ vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f };
vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(0, 1, vdims[0] * vdims[1] *
vdims[2]);
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(
0, 1, vdims[0] * vdims[1] * vdims[2]);
vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(
TangleField(vdims, mins, maxs));
tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
@ -153,37 +154,39 @@ int main(int argc, char* argv[])
typedef vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> DeviceAlgorithms;
vtkm::worklet::SineWorklet sineWorklet;
#ifdef VTKM_USE_UNIFIED_MEMORY
std::cout << "Testing with unified memory" << std::endl;
bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
if (usingManagedMemory)
{
std::cout << "Testing with unified memory" << std::endl;
vtkm::cont::Timer<> timer;
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::cont::Timer<> timer;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
#else
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
}
else
{
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
vtkm::cont::Timer<> timer;
vtkm::cont::Timer<> timer;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
#endif
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
}
int dim = 128;
if (argc > 2)

@ -21,6 +21,9 @@
#define vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_cu
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
using vtkm::cont::cuda::internal::CudaAllocator;
namespace vtkm
{
@ -64,18 +67,8 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::Allocate(TypelessExecut
// Attempt to allocate:
try
{
char* tmp;
#ifdef VTKM_USE_UNIFIED_MEMORY
int dev;
VTKM_CUDA_CALL(cudaGetDevice(&dev));
VTKM_CUDA_CALL(cudaMallocManaged(&tmp, static_cast<std::size_t>(numBytes)));
VTKM_CUDA_CALL(cudaMemAdvise(tmp, numBytes, cudaMemAdviseSetPreferredLocation, dev));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, numBytes, dev, 0));
VTKM_CUDA_CALL(cudaStreamSynchronize(0));
#else
VTKM_CUDA_CALL(cudaMalloc(&tmp, static_cast<std::size_t>(numBytes)));
#endif
// Cast to char* so that the pointer math below will work.
char* tmp = static_cast<char*>(CudaAllocator::Allocate(static_cast<size_t>(numBytes)));
execArray.Array = tmp;
execArray.ArrayEnd = tmp + numBytes;
execArray.ArrayCapacity = tmp + numBytes;
@ -93,7 +86,7 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::Free(
{
if (execArray.Array != nullptr)
{
VTKM_CUDA_CALL(cudaFree(execArray.Array));
CudaAllocator::Free(execArray.Array);
execArray.Array = nullptr;
execArray.ArrayEnd = nullptr;
execArray.ArrayCapacity = nullptr;

@ -35,6 +35,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
VTKM_THIRDPARTY_POST_INCLUDE
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
@ -172,17 +173,8 @@ public:
// Attempt to allocate:
try
{
ValueType* tmp;
#ifdef VTKM_USE_UNIFIED_MEMORY
int dev;
VTKM_CUDA_CALL(cudaGetDevice(&dev));
VTKM_CUDA_CALL(cudaMallocManaged(&tmp, bufferSize));
VTKM_CUDA_CALL(cudaMemAdvise(tmp, bufferSize, cudaMemAdviseSetPreferredLocation, dev));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, bufferSize, dev, 0));
VTKM_CUDA_CALL(cudaStreamSynchronize(0));
#else
VTKM_CUDA_CALL(cudaMalloc(&tmp, bufferSize));
#endif
ValueType* tmp =
static_cast<ValueType*>(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize));
this->Begin = PointerType(tmp);
}
catch (const std::exception& error)
@ -214,9 +206,6 @@ public:
storage->Allocate(this->GetNumberOfValues());
try
{
#ifdef VTKM_USE_UNIFIED_MEMORY
cudaDeviceSynchronize();
#endif
::thrust::copy(
this->Begin, this->End, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
}
@ -254,7 +243,7 @@ public:
{
if (this->Begin.get() != nullptr)
{
VTKM_CUDA_CALL(cudaFree(this->Begin.get()));
vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin.get());
this->Begin = PointerType(static_cast<ValueType*>(nullptr));
this->End = PointerType(static_cast<ValueType*>(nullptr));
this->Capacity = PointerType(static_cast<ValueType*>(nullptr));

@ -21,6 +21,7 @@
set(headers
ArrayManagerExecutionCuda.h
ArrayManagerExecutionThrustDevice.h
CudaAllocator.h
DeviceAdapterAlgorithmCuda.h
DeviceAdapterAlgorithmThrust.h
DeviceAdapterTagCuda.h
@ -30,6 +31,11 @@ set(headers
VirtualObjectTransferCuda.h
)
set(sources
ArrayManagerExecutionCuda.cu
CudaAllocator.cu
)
vtkm_declare_headers(CUDA ${headers} TESTABLE ${VTKm_ENABLE_CUDA})
#-----------------------------------------------------------------------------
@ -49,7 +55,7 @@ if (VTKm_ENABLE_CUDA)
list(APPEND compile_options -Xcompiler=-fPIC)
endif()
cuda_compile(vtkm_cont_cuda_object_files ArrayManagerExecutionCuda.cu
cuda_compile(vtkm_cont_cuda_object_files ${sources}
OPTIONS "${compile_options}")
#Setup the dependency chain for the custom object build so that

@ -0,0 +1,164 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2017 Sandia Corporation.
// Copyright 2017 UT-Battelle, LLC.
// Copyright 2017 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
#include <cuda_runtime.h>
// These static vars are in an anon namespace to work around MSVC linker issues.
namespace
{
// Set once CudaAllocator::Initialize has run; guards against re-detection.
static bool IsInitialized = false;
// True if all devices support concurrent pageable managed memory.
static bool ManagedMemorySupported = false;
}
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
// Reports whether allocations from Allocate() use CUDA managed (unified)
// memory. True only when every visible device supports concurrent managed
// access (see Initialize).
bool CudaAllocator::UsingManagedMemory()
{
  // Capability detection is lazy; ensure it has run before reading the flag.
  Initialize();
  return ManagedMemorySupported;
}
// Allocates numBytes of device-accessible memory. Managed (unified) memory is
// used when all devices support it; otherwise plain device memory. Release the
// returned pointer with CudaAllocator::Free.
void* CudaAllocator::Allocate(std::size_t numBytes)
{
  Initialize();

  void* buffer = nullptr;
  if (!ManagedMemorySupported)
  {
    // Device-only allocation: the pointer must not be dereferenced on the host.
    VTKM_CUDA_CALL(cudaMalloc(&buffer, numBytes));
  }
  else
  {
    // Unified memory: the pointer is usable from both host and device.
    VTKM_CUDA_CALL(cudaMallocManaged(&buffer, numBytes));
  }
  return buffer;
}
// Releases memory previously obtained from CudaAllocator::Allocate.
// cudaFree handles both managed and device allocations.
void CudaAllocator::Free(void* ptr)
{
  Initialize();
  VTKM_CUDA_CALL(cudaFree(ptr));
}
// Hints the driver that [ptr, ptr + numBytes) is about to be accessed from the
// control (host) environment: prefer CPU residency and start migrating the
// pages there. No-op for non-managed allocations.
void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  // TODO these hints need to be benchmarked and adjusted once we start
  // sharing the pointers between cont/exec
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
  // Asynchronous on the default stream; no sync here, migration overlaps work.
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
}
// Hints the driver that [ptr, ptr + numBytes) will be read (not written) on
// the current device: prefer device residency, mark read-mostly, and start
// migrating the pages. No-op for non-managed allocations.
void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  int device;
  VTKM_CUDA_CALL(cudaGetDevice(&device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, device));
  // Read-mostly allows the driver to replicate read-only pages across processors.
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, device));
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, device, 0));
}
// Hints the driver that [ptr, ptr + numBytes) will be written on the current
// device: prefer device residency, clear any read-mostly duplication, and
// start migrating the pages. No-op for non-managed allocations.
void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  int device;
  VTKM_CUDA_CALL(cudaGetDevice(&device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, device));
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, device, 0));
}
// Hints the driver that [ptr, ptr + numBytes) will be both read and written on
// the current device. Same advice as PrepareForOutput: device-preferred,
// writable (not read-mostly), prefetched. No-op for non-managed allocations.
void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  int device;
  VTKM_CUDA_CALL(cudaGetDevice(&device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, device));
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, device, 0));
}
// One-time capability detection: decides whether managed (unified) memory may
// be used by checking every visible CUDA device. Subsequent calls return
// immediately.
//
// NOTE(review): the IsInitialized/ManagedMemorySupported statics are not
// synchronized — this assumes the first call completes before any concurrent
// use. TODO confirm callers, or guard with std::call_once.
void CudaAllocator::Initialize()
{
  if (IsInitialized)
  {
    return;
  }

  int deviceCount;
  VTKM_CUDA_CALL(cudaGetDeviceCount(&deviceCount));

  // No devices: managed memory is trivially unavailable.
  if (deviceCount == 0)
  {
    ManagedMemorySupported = false;
    IsInitialized = true;
    return;
  }

  // Check all devices, use the feature set supported by all.
  // We require concurrentManagedAccess (not just managedAccess), as devices
  // with only the latter have extra synchronization requirements.
  bool allDevicesManaged = true;
  for (int device = 0; device < deviceCount && allDevicesManaged; ++device)
  {
    cudaDeviceProp properties;
    VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, device));
    allDevicesManaged = allDevicesManaged && properties.concurrentManagedAccess;
  }

  ManagedMemorySupported = allDevicesManaged;
  IsInitialized = true;
}
}
}
}
} // end namespace vtkm::cont::cuda::internal

@ -0,0 +1,59 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2017 Sandia Corporation.
// Copyright 2017 UT-Battelle, LLC.
// Copyright 2017 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_CudaAllocator_h
#define vtk_m_cont_cuda_internal_CudaAllocator_h
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/internal/ExportMacros.h>
#include <cstddef>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
// Encapsulates CUDA runtime memory management. Allocations use CUDA managed
// (unified) memory when every visible device supports concurrent managed
// access, and plain device memory otherwise.
struct VTKM_CONT_EXPORT CudaAllocator
{
  // True when Allocate() returns managed (unified) memory, i.e. all devices
  // report concurrent managed access.
  static VTKM_CONT bool UsingManagedMemory();

  // Allocate numBytes (managed when supported, device-only otherwise).
  // Release the result with Free(). Device-only pointers must not be
  // dereferenced on the host.
  static VTKM_CONT void* Allocate(std::size_t numBytes);

  // Release memory obtained from Allocate().
  static VTKM_CONT void Free(void* ptr);

  // Hint that [ptr, ptr + numBytes) is about to be used from the control
  // (host) environment. No-op for non-managed memory.
  static VTKM_CONT void PrepareForControl(const void* ptr, std::size_t numBytes);

  // Hint that [ptr, ptr + numBytes) is about to be used on the device as
  // read-only input, write-only output, or read-write in-place data,
  // respectively. No-ops for non-managed memory.
  static VTKM_CONT void PrepareForInput(const void* ptr, std::size_t numBytes);
  static VTKM_CONT void PrepareForOutput(const void* ptr, std::size_t numBytes);
  static VTKM_CONT void PrepareForInPlace(const void* ptr, std::size_t numBytes);

private:
  // Detects device capabilities on first call; later calls are no-ops.
  static VTKM_CONT void Initialize();
};
}
}
}
} // end namespace vtkm::cont::cuda::internal
#endif // vtk_m_cont_cuda_internal_CudaAllocator_h