mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-16 17:22:55 +00:00
Minimize cuda synchronizations
1. Have a per-thread pinned array for cuda errors 2. Check for errors before scheduling new tasks and at explicit sync points 3. Remove explicit synchronizations from most places Addresses part 2 of #168
This commit is contained in:
parent
a8305bad51
commit
06dee259f7
15
docs/changelog/cuda-asynchronous-errors.md
Normal file
15
docs/changelog/cuda-asynchronous-errors.md
Normal file
@ -0,0 +1,15 @@
|
||||
# Worklets are now asynchronous in Cuda
|
||||
|
||||
Worklets are now fully asynchronous in the cuda backend. This means that
|
||||
worklet errors are reported asynchronously. Existing errors are checked for
|
||||
before invocation of a new worklet and at explicit synchronization points like
|
||||
`DeviceAdapterAlgorithm<>::Synchronize()`.
|
||||
|
||||
An important effect of this change is that functions that are synchronization
|
||||
points, like `ArrayHandle::GetPortalControl()` and
|
||||
`ArrayHandle::GetPortalConstControl()`, may now throw exception for errors from
|
||||
previously executed worklets.
|
||||
|
||||
Worklet invocations, synchronization and error reporting happen independently
|
||||
on different threads. Therefore, synchronization on one thread does not affect
|
||||
any other threads.
|
@ -364,10 +364,14 @@ public:
|
||||
VTKM_CONT const StorageType& GetStorage() const;
|
||||
|
||||
/// Get the array portal of the control array.
|
||||
/// Since worklet invocations are asynchronous and this routine is a synchronization point,
|
||||
/// exceptions may be thrown for errors from previously executed worklets.
|
||||
///
|
||||
VTKM_CONT PortalControl GetPortalControl();
|
||||
|
||||
/// Get the array portal of the control array.
|
||||
/// Since worklet invocations are asynchronous and this routine is a synchronization point,
|
||||
/// exceptions may be thrown for errors from previously executed worklets.
|
||||
///
|
||||
VTKM_CONT PortalConstControl GetPortalConstControl() const;
|
||||
|
||||
|
@ -70,45 +70,41 @@ VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
|
||||
|
||||
// we use cuda pinned memory to reduce the amount of synchronization
|
||||
// and mem copies between the host and device.
|
||||
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
|
||||
vtkm::Id& arraySize,
|
||||
char** hostPointer)
|
||||
auto DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray()
|
||||
-> const PinnedErrorArray&
|
||||
{
|
||||
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
|
||||
static bool errorArrayInit = false;
|
||||
static char* hostPtr = nullptr;
|
||||
static char* devicePtr = nullptr;
|
||||
if (!errorArrayInit)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
|
||||
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
|
||||
errorArrayInit = true;
|
||||
}
|
||||
//set the size of the array
|
||||
arraySize = ERROR_ARRAY_SIZE;
|
||||
constexpr vtkm::Id ERROR_ARRAY_SIZE = 1024;
|
||||
static thread_local PinnedErrorArray local;
|
||||
|
||||
//specify the host pointer to the memory
|
||||
*hostPointer = hostPtr;
|
||||
(void)hostPointer;
|
||||
return devicePtr;
|
||||
if (!local.HostPtr)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMallocHost((void**)&local.HostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
|
||||
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&local.DevicePtr, local.HostPtr, 0));
|
||||
local.HostPtr[0] = '\0'; // clear
|
||||
local.Size = ERROR_ARRAY_SIZE;
|
||||
}
|
||||
|
||||
return local;
|
||||
}
|
||||
|
||||
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
|
||||
vtkm::exec::cuda::internal::TaskStrided& functor)
|
||||
{
|
||||
//since the memory is pinned we can access it safely on the host
|
||||
//without a memcpy
|
||||
vtkm::Id errorArraySize = 0;
|
||||
char* hostErrorPtr = nullptr;
|
||||
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
||||
|
||||
//clear the first character which means that we don't contain an error
|
||||
hostErrorPtr[0] = '\0';
|
||||
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
auto pinnedArray = GetPinnedErrorArray();
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(pinnedArray.DevicePtr, pinnedArray.Size);
|
||||
functor.SetErrorMessageBuffer(errorMessage);
|
||||
}
|
||||
|
||||
return hostErrorPtr;
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
|
||||
{
|
||||
auto pinnedArray = GetPinnedErrorArray();
|
||||
if (pinnedArray.HostPtr[0] != '\0')
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
auto excep = vtkm::cont::ErrorExecution(pinnedArray.HostPtr);
|
||||
pinnedArray.HostPtr[0] = '\0'; // clear
|
||||
throw excep;
|
||||
}
|
||||
}
|
||||
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
||||
|
@ -432,7 +432,6 @@ private:
|
||||
{
|
||||
cuda::internal::throwAsVTKmException();
|
||||
}
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
return sum[2];
|
||||
}
|
||||
|
||||
@ -1101,13 +1100,24 @@ public:
|
||||
::thrust::equal_to<T>(),
|
||||
binary_functor);
|
||||
}
|
||||
|
||||
// we use cuda pinned memory to reduce the amount of synchronization
|
||||
// and mem copies between the host and device.
|
||||
VTKM_CONT_EXPORT
|
||||
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer);
|
||||
struct VTKM_CONT_EXPORT PinnedErrorArray
|
||||
{
|
||||
char* HostPtr = nullptr;
|
||||
char* DevicePtr = nullptr;
|
||||
vtkm::Id Size = 0;
|
||||
};
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
||||
static const PinnedErrorArray& GetPinnedErrorArray();
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void CheckForErrors(); // throws vtkm::cont::ErrorExecution
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size);
|
||||
@ -1126,7 +1136,9 @@ public:
|
||||
// No instances means nothing to run. Just return.
|
||||
return;
|
||||
}
|
||||
char* hostErrorPtr = SetupErrorBuffer(functor);
|
||||
|
||||
CheckForErrors();
|
||||
SetupErrorBuffer(functor);
|
||||
|
||||
vtkm::UInt32 grids, blocks;
|
||||
GetGridsAndBlocks(grids, blocks, numInstances);
|
||||
@ -1134,18 +1146,6 @@ public:
|
||||
cuda::internal::TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor,
|
||||
numInstances);
|
||||
|
||||
//sync so that we can check the results of the call.
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
//an exception if the previous schedule wrote an error. This would help
|
||||
//cuda to run longer before we hard sync.
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
//check what the value is
|
||||
if (hostErrorPtr[0] != '\0')
|
||||
{
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
||||
parameter_sweep_1d_schedule(functor, numInstances);
|
||||
#endif
|
||||
@ -1161,7 +1161,9 @@ public:
|
||||
// No instances means nothing to run. Just return.
|
||||
return;
|
||||
}
|
||||
char* hostErrorPtr = SetupErrorBuffer(functor);
|
||||
|
||||
CheckForErrors();
|
||||
SetupErrorBuffer(functor);
|
||||
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
@ -1173,18 +1175,6 @@ public:
|
||||
|
||||
cuda::internal::TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
|
||||
|
||||
//sync so that we can check the results of the call.
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
//an exception if the previous schedule wrote an error. This would help
|
||||
//cuda to run longer before we hard sync.
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
//check what the value is
|
||||
if (hostErrorPtr[0] != '\0')
|
||||
{
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
||||
parameter_sweep_3d_schedule(functor, rangeMax);
|
||||
#endif
|
||||
@ -1288,6 +1278,7 @@ public:
|
||||
VTKM_CONT static void Synchronize()
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
CheckForErrors();
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -184,7 +184,7 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
|
||||
//our stream. We need to block on the copy back to control since
|
||||
//we don't wanting it accessing memory that hasn't finished
|
||||
//being used by the GPU
|
||||
cudaStreamSynchronize(cudaStreamPerThread);
|
||||
vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapterTagCuda>::Synchronize();
|
||||
}
|
||||
|
||||
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::UsingForRead(
|
||||
|
@ -45,9 +45,11 @@
|
||||
#include <vtkm/exec/AtomicArray.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@ -1919,6 +1921,7 @@ private:
|
||||
try
|
||||
{
|
||||
Algorithm::Schedule(OneErrorKernel(), ARRAY_SIZE);
|
||||
Algorithm::Synchronize();
|
||||
}
|
||||
catch (vtkm::cont::ErrorExecution& error)
|
||||
{
|
||||
@ -1932,6 +1935,7 @@ private:
|
||||
try
|
||||
{
|
||||
Algorithm::Schedule(AllErrorKernel(), ARRAY_SIZE);
|
||||
Algorithm::Synchronize();
|
||||
}
|
||||
catch (vtkm::cont::ErrorExecution& error)
|
||||
{
|
||||
@ -1939,6 +1943,41 @@ private:
|
||||
message = error.GetMessage();
|
||||
}
|
||||
VTKM_TEST_ASSERT(message == ERROR_MESSAGE, "Did not get expected error message.");
|
||||
|
||||
// This is specifically to test the cuda-backend but should pass for all backends
|
||||
std::cout << "Testing if execution errors are eventually propogated to the host "
|
||||
<< "without explicit synchronization\n";
|
||||
message = "";
|
||||
int nkernels = 0;
|
||||
try
|
||||
{
|
||||
IdArrayHandle idArray;
|
||||
idArray.Allocate(ARRAY_SIZE);
|
||||
auto portal = idArray.PrepareForInPlace(DeviceAdapterTag{});
|
||||
|
||||
Algorithm::Schedule(OneErrorKernel(), ARRAY_SIZE);
|
||||
for (; nkernels < 100; ++nkernels)
|
||||
{
|
||||
Algorithm::Schedule(AddArrayKernel(portal), ARRAY_SIZE);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(20));
|
||||
}
|
||||
Algorithm::Synchronize();
|
||||
}
|
||||
catch (vtkm::cont::ErrorExecution& error)
|
||||
{
|
||||
std::cout << "Got expected error: \"" << error.GetMessage() << "\" ";
|
||||
if (nkernels < 100)
|
||||
{
|
||||
std::cout << "after " << nkernels << " invocations of other kernel" << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "only after explicit synchronization" << std::endl;
|
||||
}
|
||||
message = error.GetMessage();
|
||||
}
|
||||
std::cout << "\n";
|
||||
VTKM_TEST_ASSERT(message == ERROR_MESSAGE, "Did not get expected error message.");
|
||||
}
|
||||
|
||||
template <typename T, int N = 0>
|
||||
|
@ -345,19 +345,15 @@ void TestErrorFunctorInvoke()
|
||||
|
||||
auto task = TaskTypes::MakeTask(worklet, invocation, vtkm::Id());
|
||||
|
||||
vtkm::Id errorArraySize = 0;
|
||||
char* hostErrorPtr = nullptr;
|
||||
char* deviceErrorPtr = Algorithm::GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
||||
|
||||
hostErrorPtr[0] = '\0';
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
auto errorArray = Algorithm::GetPinnedErrorArray();
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(errorArray.DevicePtr, errorArray.Size);
|
||||
task.SetErrorMessageBuffer(errorMessage);
|
||||
|
||||
ScheduleTaskStrided<decltype(task)><<<32, 256>>>(task, 1, 2);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
VTKM_TEST_ASSERT(errorMessage.IsErrorRaised(), "Error not raised correctly.");
|
||||
VTKM_TEST_ASSERT(hostErrorPtr == std::string(ERROR_MESSAGE), "Got wrong error message.");
|
||||
VTKM_TEST_ASSERT(errorArray.HostPtr == std::string(ERROR_MESSAGE), "Got wrong error message.");
|
||||
}
|
||||
|
||||
template <typename DeviceAdapter>
|
||||
|
Loading…
Reference in New Issue
Block a user