Address reviewer comments and suggestions.

This commit is contained in:
Dave Pugmire 2022-08-11 13:43:49 -04:00
parent 7e945bb145
commit 3920806a66
6 changed files with 44 additions and 41 deletions

@ -18,10 +18,6 @@
#include <cstdlib>
#include <iostream>
int rank = 0;
int numRanks = 1;
struct Options
{
public:
@ -38,8 +34,6 @@ public:
std::string MapField = "";
int IsoLevels = 1;
std::vector<double> IsoValues;
int Rank = 0;
int NumRanks = 1;
std::string ThreadMode = "serial";
bool SyncMemAlloc = true;
int NumTasks = 0;
@ -172,8 +166,7 @@ public:
if ((!this->Tangle && (this->DataFile == "" || this->ThreadMode == "")) || this->Field == "")
{
if (this->Rank == 0)
std::cerr << "Error in options" << std::endl;
std::cerr << "Error in options" << std::endl;
return false;
}
@ -218,12 +211,12 @@ int main(int argc, char** argv)
{
if (opts.SyncMemAlloc)
{
CudaAllocator::ForceSyncMemoryAllocator();
vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
std::cout << " Task: Sync memory alloc = ON" << std::endl;
}
else
{
CudaAllocator::ForceAsyncMemoryAllocator();
vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator();
std::cout << " Task: Sync memory alloc = OFF" << std::endl;
}
}

@ -48,7 +48,7 @@ static bool ManagedMemoryEnabled = false;
static bool HardwareSupportsManagedMemory = false;
// True if using synchronous memory allocator. Managed memory must be off to use this.
static bool UseSyncMemoryAlloc = true;
static thread_local bool UseSyncMemoryAlloc = true;
// Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
// This value should be > 0 or else these functions will error out.
@ -106,11 +106,6 @@ void CudaAllocator::ForceManagedMemoryOn()
}
}
bool CudaAllocator::UsingSyncMemoryAllocator()
{
return UseSyncMemoryAlloc;
}
void CudaAllocator::ForceSyncMemoryAllocator()
{
UseSyncMemoryAlloc = true;
@ -190,22 +185,21 @@ void* CudaAllocator::Allocate(std::size_t numBytes)
}
void* ptr = nullptr;
if (ManagedMemoryEnabled)
#if CUDART_VERSION >= 11030
if (!UseSyncMemoryAlloc)
{
VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread));
}
else
#endif
if (ManagedMemoryEnabled)
{
VTKM_CUDA_CALL(cudaMallocManaged(&ptr, numBytes));
}
else
{
if (UseSyncMemoryAlloc)
{
VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
}
else
{
#if CUDART_VERSION >= 11030
VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread));
#endif
}
VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
}
{
@ -251,6 +245,8 @@ void CudaAllocator::Free(void* ptr)
{
#if CUDART_VERSION >= 11030
VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread));
#else
VTKM_CUDA_CALL(cudaFree(ptr));
#endif
}
}

@ -40,11 +40,6 @@ struct VTKM_CONT_EXPORT CudaAllocator
/// VTK-m will ignore the request and continue to use unmanaged memory (aka cudaMalloc).
static VTKM_CONT void ForceManagedMemoryOn();
static VTKM_CONT bool UsingSyncMemoryAllocator();
static VTKM_CONT bool UsingAsyncMemoryAllocator()
{
return !CudaAllocator::UsingSyncMemoryAllocator();
}
static VTKM_CONT void ForceSyncMemoryAllocator();
static VTKM_CONT void ForceAsyncMemoryAllocator();

@ -20,12 +20,17 @@ namespace vtkm
{
namespace filter
{
namespace
{
void RunFilter(NewFilter* self,
vtkm::filter::DataSetQueue& input,
vtkm::filter::DataSetQueue& output)
NewFilter::~NewFilter() = default;
void NewFilter::RunFilter(NewFilter* self,
vtkm::filter::DataSetQueue& input,
vtkm::filter::DataSetQueue& output)
{
#ifdef VTKM_CUDA
vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
#endif
std::pair<vtkm::Id, vtkm::cont::DataSet> task;
while (input.GetTask(task))
{
@ -35,9 +40,7 @@ void RunFilter(NewFilter* self,
vtkm::cont::Algorithm::Synchronize();
}
} // anonymous namespace
NewFilter::~NewFilter() = default;
bool NewFilter::CanThread() const
{
@ -61,8 +64,11 @@ vtkm::cont::PartitionedDataSet NewFilter::DoExecutePartitions(
std::vector<std::future<void>> futures(static_cast<std::size_t>(numThreads));
for (std::size_t i = 0; i < static_cast<std::size_t>(numThreads); i++)
{
auto f = std::async(
std::launch::async, RunFilter, this, std::ref(inputQueue), std::ref(outputQueue));
auto f = std::async(std::launch::async,
vtkm::filter::NewFilter::RunFilter,
this,
std::ref(inputQueue),
std::ref(outputQueue));
futures[i] = std::move(f);
}

@ -17,6 +17,7 @@
#include <vtkm/cont/PartitionedDataSet.h>
#include <vtkm/filter/FieldSelection.h>
#include <vtkm/filter/TaskQueue.h>
#include <vtkm/filter/vtkm_filter_core_export.h>
namespace vtkm
@ -228,6 +229,11 @@ public:
VTKM_CONT
void SetThreadsPerGPU(vtkm::Id numThreads) { this->NumThreadsPerGPU = numThreads; }
VTKM_CONT
vtkm::Id SetThreadsPerCPU() const { return this->NumThreadsPerCPU; }
VTKM_CONT
vtkm::Id SetThreadsPerGPU() const { return this->NumThreadsPerGPU; }
VTKM_CONT
bool GetRunMultiThreadedFilter() const
{
@ -440,6 +446,11 @@ private:
}
}
VTKM_CONT
static void RunFilter(NewFilter* self,
vtkm::filter::DataSetQueue& input,
vtkm::filter::DataSetQueue& output);
VTKM_CONT
virtual vtkm::Id DetermineNumberOfThreads(const vtkm::cont::PartitionedDataSet& input);

@ -103,7 +103,9 @@ public:
//Insert them back in the same order.
std::pair<vtkm::Id, vtkm::cont::DataSet> task;
while (this->GetTask(task))
{
dataSets[static_cast<std::size_t>(task.first)] = std::move(task.second);
}
pds.AppendPartitions(dataSets);
}