From baac7e1fb9dd8c38e87385b9122dd326964833a1 Mon Sep 17 00:00:00 2001 From: Dave Pugmire Date: Mon, 15 Aug 2022 14:30:13 -0400 Subject: [PATCH] Put sync memory into RuntimeDeviceTracker --- .../threaded_multiblock_filter/MultiBlock.cxx | 5 +- vtkm/cont/RuntimeDeviceTracker.cxx | 54 ++++-------------- vtkm/cont/RuntimeDeviceTracker.h | 4 +- vtkm/cont/cuda/internal/CudaAllocator.cu | 57 +++++++------------ vtkm/filter/NewFilter.cxx | 22 +++---- vtkm/filter/NewFilter.h | 5 -- .../testing/UnitTestMultiBlockFilter.cxx | 16 +----- 7 files changed, 47 insertions(+), 116 deletions(-) diff --git a/examples/threaded_multiblock_filter/MultiBlock.cxx b/examples/threaded_multiblock_filter/MultiBlock.cxx index fca9f9a8f..de14c21c1 100644 --- a/examples/threaded_multiblock_filter/MultiBlock.cxx +++ b/examples/threaded_multiblock_filter/MultiBlock.cxx @@ -9,7 +9,6 @@ //============================================================================ #include -#include #include #include #include @@ -211,12 +210,12 @@ int main(int argc, char** argv) { if (opts.SyncMemAlloc) { - vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator(); + //vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator(); std::cout << " Task: Sync memory alloc = ON" << std::endl; } else { - vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator(); + //vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator(); std::cout << " Task: Sync memory alloc = OFF" << std::endl; } } diff --git a/vtkm/cont/RuntimeDeviceTracker.cxx b/vtkm/cont/RuntimeDeviceTracker.cxx index 384851190..63d333b27 100644 --- a/vtkm/cont/RuntimeDeviceTracker.cxx +++ b/vtkm/cont/RuntimeDeviceTracker.cxx @@ -41,30 +41,15 @@ struct RuntimeDeviceTrackerInternals bool GetRuntimeAllowed(std::size_t deviceId) const { return this->RuntimeAllowed[deviceId]; } void SetRuntimeAllowed(std::size_t deviceId, bool flag) { this->RuntimeAllowed[deviceId] = flag; } - bool GetThreadFriendlyMemAlloc(std::size_t deviceId) const - { - return this->ThreadFriendlyMemAlloc[deviceId]; - } - void SetThreadFriendlyMemAlloc(std::size_t deviceId, bool flag) - { - this->ThreadFriendlyMemAlloc[deviceId] = flag; - } + bool GetThreadFriendlyMemAlloc() const { return this->ThreadFriendlyMemAlloc; } + void SetThreadFriendlyMemAlloc(bool flag) { this->ThreadFriendlyMemAlloc = flag; } void ResetRuntimeAllowed() { std::fill_n(this->RuntimeAllowed, VTKM_MAX_DEVICE_ADAPTER_ID, false); } - void ResetThreadFriendlyMemAlloc() - { - std::fill_n(this->ThreadFriendlyMemAlloc, VTKM_MAX_DEVICE_ADAPTER_ID, false); - } - - void Reset() - { - this->ResetRuntimeAllowed(); - this->ResetThreadFriendlyMemAlloc(); - } + void Reset() { this->ResetRuntimeAllowed(); } private: void CopyFrom(const RuntimeDeviceTrackerInternals* v) @@ -72,14 +57,13 @@ private: std::copy(std::cbegin(v->RuntimeAllowed), std::cend(v->RuntimeAllowed), std::begin(this->RuntimeAllowed)); - std::copy(std::cbegin(v->ThreadFriendlyMemAlloc), - std::cend(v->ThreadFriendlyMemAlloc), - std::begin(this->ThreadFriendlyMemAlloc)); + this->SetThreadFriendlyMemAlloc(v->GetThreadFriendlyMemAlloc()); } bool RuntimeAllowed[VTKM_MAX_DEVICE_ADAPTER_ID]; - bool ThreadFriendlyMemAlloc[VTKM_MAX_DEVICE_ADAPTER_ID]; + bool ThreadFriendlyMemAlloc = false; }; + } VTKM_CONT @@ -130,24 +114,9 @@ bool RuntimeDeviceTracker::CanRunOn(vtkm::cont::DeviceAdapterId deviceId) const } VTKM_CONT -bool RuntimeDeviceTracker::GetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId) const +bool 
RuntimeDeviceTracker::GetThreadFriendlyMemAlloc() const { - if (deviceId == vtkm::cont::DeviceAdapterTagAny{}) - { //If at least a single device is enabled, than any device is enabled - for (vtkm::Int8 i = 1; i < VTKM_MAX_DEVICE_ADAPTER_ID; ++i) - { - if (this->Internals->GetThreadFriendlyMemAlloc(static_cast(i))) - { - return true; - } - } - return false; - } - else - { - this->CheckDevice(deviceId); - return this->Internals->GetThreadFriendlyMemAlloc(deviceId.GetValue()); - } + return this->Internals->GetThreadFriendlyMemAlloc(); } VTKM_CONT @@ -159,12 +128,9 @@ void RuntimeDeviceTracker::SetDeviceState(vtkm::cont::DeviceAdapterId deviceId, } VTKM_CONT -void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId, - bool state) +void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(bool state) { - this->CheckDevice(deviceId); - - this->Internals->SetThreadFriendlyMemAlloc(deviceId.GetValue(), state); + this->Internals->SetThreadFriendlyMemAlloc(state); } VTKM_CONT void RuntimeDeviceTracker::ResetDevice(vtkm::cont::DeviceAdapterId deviceId) diff --git a/vtkm/cont/RuntimeDeviceTracker.h b/vtkm/cont/RuntimeDeviceTracker.h index 8dc5189db..5de7c22bc 100644 --- a/vtkm/cont/RuntimeDeviceTracker.h +++ b/vtkm/cont/RuntimeDeviceTracker.h @@ -113,8 +113,8 @@ public: /// \brief Get/Set use of thread-friendly memory allocation for a device. /// /// - VTKM_CONT bool GetThreadFriendlyMemAlloc(DeviceAdapterId deviceId) const; - VTKM_CONT void SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId, bool state); + VTKM_CONT bool GetThreadFriendlyMemAlloc() const; + VTKM_CONT void SetThreadFriendlyMemAlloc(bool state); /// \brief Copies the state from the given device. /// diff --git a/vtkm/cont/cuda/internal/CudaAllocator.cu b/vtkm/cont/cuda/internal/CudaAllocator.cu index 83cf6b1fd..c786844aa 100644 --- a/vtkm/cont/cuda/internal/CudaAllocator.cu +++ b/vtkm/cont/cuda/internal/CudaAllocator.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -47,9 +48,6 @@ static bool ManagedMemoryEnabled = false; // True if concurrent pagable managed memory is supported by the machines hardware. static bool HardwareSupportsManagedMemory = false; -// True if using syncronous memory allocator. Managed memory must be off to use this. -static thread_local bool UseSyncMemoryAlloc = true; - // Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers. // This value should be > 0 or else these functions will error out. 
static std::size_t Threshold = 1 << 20; @@ -106,29 +104,6 @@ void CudaAllocator::ForceManagedMemoryOn() } } -void CudaAllocator::ForceSyncMemoryAllocator() -{ - UseSyncMemoryAlloc = true; - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using synchronous memory allocator."); -} - -void CudaAllocator::ForceAsyncMemoryAllocator() -{ -#if CUDART_VERSION >= 11030 - if (ManagedMemoryEnabled) - { - CudaAllocator::ForceManagedMemoryOff(); - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, - "CudaAllocator turning off managed memory for asynchronous memory allocation"); - } - UseSyncMemoryAlloc = false; - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using asynchronous memory allocator."); -#else - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, - "Asynchronous memory allocator not supported for cuda version < 11.3"); -#endif -} - bool CudaAllocator::IsDevicePointer(const void* ptr) { CudaAllocator::Initialize(); @@ -186,11 +161,11 @@ void* CudaAllocator::Allocate(std::size_t numBytes) void* ptr = nullptr; #if CUDART_VERSION >= 11030 - if (!UseSyncMemoryAlloc) + const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); + if (tracker.GetThreadFriendlyMemAlloc()) { VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread)); } - else #endif if (ManagedMemoryEnabled) @@ -215,7 +190,18 @@ void* CudaAllocator::Allocate(std::size_t numBytes) void* CudaAllocator::AllocateUnManaged(std::size_t numBytes) { void* ptr = nullptr; - VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes)); +#if CUDART_VERSION >= 11030 + const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); + if (tracker.GetThreadFriendlyMemAlloc()) + { + VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread)); + } + else +#endif + { + VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes)); + } + { VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Allocated CUDA array of %s at %p.", @@ -237,17 +223,16 @@ void CudaAllocator::Free(void* ptr) VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr); - if (UseSyncMemoryAlloc) +#if CUDART_VERSION >= 11030 + const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); + if (tracker.GetThreadFriendlyMemAlloc()) { - VTKM_CUDA_CALL(cudaFree(ptr)); + VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread)); } else - { -#if CUDART_VERSION >= 11030 - VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread)); -#else - VTKM_CUDA_CALL(cudaFree(ptr)); #endif + { + VTKM_CUDA_CALL(cudaFree(ptr)); } } diff --git a/vtkm/filter/NewFilter.cxx b/vtkm/filter/NewFilter.cxx index 7365910e9..847b591dd 100644 --- a/vtkm/filter/NewFilter.cxx +++ b/vtkm/filter/NewFilter.cxx @@ -21,14 +21,14 @@ namespace vtkm namespace filter { -NewFilter::~NewFilter() = default; - -void NewFilter::RunFilter(NewFilter* self, - vtkm::filter::DataSetQueue& input, - vtkm::filter::DataSetQueue& output) +namespace +{ +void RunFilter(NewFilter* self, + vtkm::filter::DataSetQueue& input, + vtkm::filter::DataSetQueue& output) { auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); - tracker.SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterTagAny{}, true); + tracker.SetThreadFriendlyMemAlloc(true); std::pair task; while (input.GetTask(task)) @@ -40,6 +40,9 @@ void NewFilter::RunFilter(NewFilter* self, vtkm::cont::Algorithm::Synchronize(); } +} + +NewFilter::~NewFilter() = default; bool NewFilter::CanThread() const { @@ -63,11 +66,8 @@ vtkm::cont::PartitionedDataSet NewFilter::DoExecutePartitions( std::vector> futures(static_cast(numThreads)); for (std::size_t i = 0; i < static_cast(numThreads); i++) { - auto f = 
std::async(std::launch::async, - vtkm::filter::NewFilter::RunFilter, - this, - std::ref(inputQueue), - std::ref(outputQueue)); + auto f = std::async( + std::launch::async, RunFilter, this, std::ref(inputQueue), std::ref(outputQueue)); futures[i] = std::move(f); } diff --git a/vtkm/filter/NewFilter.h b/vtkm/filter/NewFilter.h index dc21c259b..b312d8da2 100644 --- a/vtkm/filter/NewFilter.h +++ b/vtkm/filter/NewFilter.h @@ -446,11 +446,6 @@ private: } } - VTKM_CONT - static void RunFilter(NewFilter* self, - vtkm::filter::DataSetQueue& input, - vtkm::filter::DataSetQueue& output); - VTKM_CONT virtual vtkm::Id DetermineNumberOfThreads(const vtkm::cont::PartitionedDataSet& input); diff --git a/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx b/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx index 8353b3a4d..6ee313c1b 100644 --- a/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx +++ b/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx @@ -10,7 +10,6 @@ #include #include -#include #include #include @@ -87,7 +86,7 @@ void ValidateResults(const vtkm::cont::PartitionedDataSet& truth, } //namespace -void RunMBTests() +void TestMultiBlockFilter() { vtkm::cont::PartitionedDataSet pds; @@ -160,19 +159,6 @@ void RunMBTests() ValidateResults(results[0], results[1], "gradient", false); } -void TestMultiBlockFilter() -{ - //For cuda, run it with and without async memory allocation. -#ifdef VTKM_CUDA - vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator(); - RunMBTests(); - vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator(); - RunMBTests(); -#else - RunMBTests(); -#endif -} - int UnitTestMultiBlockFilter(int argc, char* argv[]) { return vtkm::cont::testing::Testing::Run(TestMultiBlockFilter, argc, argv);
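
With this change, the choice between synchronous and asynchronous (stream-ordered) CUDA allocation is driven by the per-thread RuntimeDeviceTracker instead of the removed CudaAllocator::ForceSyncMemoryAllocator()/ForceAsyncMemoryAllocator() switches. Below is a minimal sketch of how calling code (for example, the threaded multiblock example program) might toggle the new flag; the ConfigureAllocation helper and its syncMemAlloc parameter are illustrative names and not part of this patch.

#include <vtkm/cont/RuntimeDeviceTracker.h>

#include <iostream>

// Hypothetical helper (not from this patch): enable or disable thread-friendly
// (asynchronous, per-thread-stream) allocation for the calling thread.
void ConfigureAllocation(bool syncMemAlloc)
{
  // GetRuntimeDeviceTracker() returns the calling thread's tracker, which is
  // why RunFilter sets this flag inside each worker thread.
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();

  // After this patch the setting is a single bool on the tracker rather than a
  // per-device flag or a CudaAllocator-level switch.
  tracker.SetThreadFriendlyMemAlloc(!syncMemAlloc);

  if (tracker.GetThreadFriendlyMemAlloc())
  {
    // For CUDA >= 11.3 builds, CudaAllocator::Allocate, AllocateUnManaged, and
    // Free will use cudaMallocAsync / cudaFreeAsync on cudaStreamPerThread.
    std::cout << " Task: Sync memory alloc = OFF" << std::endl;
  }
  else
  {
    std::cout << " Task: Sync memory alloc = ON" << std::endl;
  }
}

Note that the tracker flag only changes behavior in the allocator paths guarded by CUDART_VERSION >= 11030; on older CUDA toolkits the allocator falls back to cudaMalloc / cudaFree (or the managed-memory path) regardless of the setting.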