mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-16 17:22:55 +00:00
Minimize cuda synchronizations
1. Have a per-thread pinned array for cuda errors 2. Check for errors before scheduling new tasks and at explicit sync points 3. Remove explicit synchronizations from most places Addresses part 2 of #168
This commit is contained in:
parent
a8305bad51
commit
06dee259f7
15
docs/changelog/cuda-asynchronous-errors.md
Normal file
15
docs/changelog/cuda-asynchronous-errors.md
Normal file
@ -0,0 +1,15 @@
|
||||
# Worklets are now asynchronous in Cuda
|
||||
|
||||
Worklets are now fully asynchronous in the cuda backend. This means that
|
||||
worklet errors are reported asynchronously. Existing errors are checked for
|
||||
before invocation of a new worklet and at explicit synchronization points like
|
||||
`DeviceAdapterAlgorithm<>::Synchronize()`.
|
||||
|
||||
An important effect of this change is that functions that are synchronization
|
||||
points, like `ArrayHandle::GetPortalControl()` and
|
||||
`ArrayHandle::GetPortalConstControl()`, may now throw exception for errors from
|
||||
previously executed worklets.
|
||||
|
||||
Worklet invocations, synchronization and error reporting happen independently
|
||||
on different threads. Therefore, synchronization on one thread does not affect
|
||||
any other threads.
|
@ -364,10 +364,14 @@ public:
|
||||
VTKM_CONT const StorageType& GetStorage() const;
|
||||
|
||||
/// Get the array portal of the control array.
|
||||
/// Since worklet invocations are asynchronous and this routine is a synchronization point,
|
||||
/// exceptions may be thrown for errors from previously executed worklets.
|
||||
///
|
||||
VTKM_CONT PortalControl GetPortalControl();
|
||||
|
||||
/// Get the array portal of the control array.
|
||||
/// Since worklet invocations are asynchronous and this routine is a synchronization point,
|
||||
/// exceptions may be thrown for errors from previously executed worklets.
|
||||
///
|
||||
VTKM_CONT PortalConstControl GetPortalConstControl() const;
|
||||
|
||||
|
@ -70,45 +70,41 @@ VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
|
||||
|
||||
// we use cuda pinned memory to reduce the amount of synchronization
|
||||
// and mem copies between the host and device.
|
||||
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
|
||||
vtkm::Id& arraySize,
|
||||
char** hostPointer)
|
||||
auto DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray()
|
||||
-> const PinnedErrorArray&
|
||||
{
|
||||
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
|
||||
static bool errorArrayInit = false;
|
||||
static char* hostPtr = nullptr;
|
||||
static char* devicePtr = nullptr;
|
||||
if (!errorArrayInit)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
|
||||
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
|
||||
errorArrayInit = true;
|
||||
}
|
||||
//set the size of the array
|
||||
arraySize = ERROR_ARRAY_SIZE;
|
||||
constexpr vtkm::Id ERROR_ARRAY_SIZE = 1024;
|
||||
static thread_local PinnedErrorArray local;
|
||||
|
||||
//specify the host pointer to the memory
|
||||
*hostPointer = hostPtr;
|
||||
(void)hostPointer;
|
||||
return devicePtr;
|
||||
if (!local.HostPtr)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMallocHost((void**)&local.HostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
|
||||
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&local.DevicePtr, local.HostPtr, 0));
|
||||
local.HostPtr[0] = '\0'; // clear
|
||||
local.Size = ERROR_ARRAY_SIZE;
|
||||
}
|
||||
|
||||
return local;
|
||||
}
|
||||
|
||||
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
|
||||
vtkm::exec::cuda::internal::TaskStrided& functor)
|
||||
{
|
||||
//since the memory is pinned we can access it safely on the host
|
||||
//without a memcpy
|
||||
vtkm::Id errorArraySize = 0;
|
||||
char* hostErrorPtr = nullptr;
|
||||
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
||||
|
||||
//clear the first character which means that we don't contain an error
|
||||
hostErrorPtr[0] = '\0';
|
||||
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
auto pinnedArray = GetPinnedErrorArray();
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(pinnedArray.DevicePtr, pinnedArray.Size);
|
||||
functor.SetErrorMessageBuffer(errorMessage);
|
||||
}
|
||||
|
||||
return hostErrorPtr;
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
|
||||
{
|
||||
auto pinnedArray = GetPinnedErrorArray();
|
||||
if (pinnedArray.HostPtr[0] != '\0')
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
auto excep = vtkm::cont::ErrorExecution(pinnedArray.HostPtr);
|
||||
pinnedArray.HostPtr[0] = '\0'; // clear
|
||||
throw excep;
|
||||
}
|
||||
}
|
||||
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
||||
|
@ -432,7 +432,6 @@ private:
|
||||
{
|
||||
cuda::internal::throwAsVTKmException();
|
||||
}
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
return sum[2];
|
||||
}
|
||||
|
||||
@ -1101,13 +1100,24 @@ public:
|
||||
::thrust::equal_to<T>(),
|
||||
binary_functor);
|
||||
}
|
||||
|
||||
// we use cuda pinned memory to reduce the amount of synchronization
|
||||
// and mem copies between the host and device.
|
||||
VTKM_CONT_EXPORT
|
||||
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer);
|
||||
struct VTKM_CONT_EXPORT PinnedErrorArray
|
||||
{
|
||||
char* HostPtr = nullptr;
|
||||
char* DevicePtr = nullptr;
|
||||
vtkm::Id Size = 0;
|
||||
};
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
||||
static const PinnedErrorArray& GetPinnedErrorArray();
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void CheckForErrors(); // throws vtkm::cont::ErrorExecution
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size);
|
||||
@ -1126,7 +1136,9 @@ public:
|
||||
// No instances means nothing to run. Just return.
|
||||
return;
|
||||
}
|
||||
char* hostErrorPtr = SetupErrorBuffer(functor);
|
||||
|
||||
CheckForErrors();
|
||||
SetupErrorBuffer(functor);
|
||||
|
||||
vtkm::UInt32 grids, blocks;
|
||||
GetGridsAndBlocks(grids, blocks, numInstances);
|
||||
@ -1134,18 +1146,6 @@ public:
|
||||
cuda::internal::TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor,
|
||||
numInstances);
|
||||
|
||||
//sync so that we can check the results of the call.
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
//an exception if the previous schedule wrote an error. This would help
|
||||
//cuda to run longer before we hard sync.
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
//check what the value is
|
||||
if (hostErrorPtr[0] != '\0')
|
||||
{
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
||||
parameter_sweep_1d_schedule(functor, numInstances);
|
||||
#endif
|
||||
@ -1161,7 +1161,9 @@ public:
|
||||
// No instances means nothing to run. Just return.
|
||||
return;
|
||||
}
|
||||
char* hostErrorPtr = SetupErrorBuffer(functor);
|
||||
|
||||
CheckForErrors();
|
||||
SetupErrorBuffer(functor);
|
||||
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
@ -1173,18 +1175,6 @@ public:
|
||||
|
||||
cuda::internal::TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
|
||||
|
||||
//sync so that we can check the results of the call.
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
//an exception if the previous schedule wrote an error. This would help
|
||||
//cuda to run longer before we hard sync.
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
//check what the value is
|
||||
if (hostErrorPtr[0] != '\0')
|
||||
{
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
||||
parameter_sweep_3d_schedule(functor, rangeMax);
|
||||
#endif
|
||||
@ -1288,6 +1278,7 @@ public:
|
||||
VTKM_CONT static void Synchronize()
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
CheckForErrors();
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -184,7 +184,7 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
|
||||
//our stream. We need to block on the copy back to control since
|
||||
//we don't wanting it accessing memory that hasn't finished
|
||||
//being used by the GPU
|
||||
cudaStreamSynchronize(cudaStreamPerThread);
|
||||
vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapterTagCuda>::Synchronize();
|
||||
}
|
||||
|
||||
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::UsingForRead(
|
||||
|
@ -45,9 +45,11 @@
|
||||
#include <vtkm/exec/AtomicArray.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@ -1919,6 +1921,7 @@ private:
|
||||
try
|
||||
{
|
||||
Algorithm::Schedule(OneErrorKernel(), ARRAY_SIZE);
|
||||
Algorithm::Synchronize();
|
||||
}
|
||||
catch (vtkm::cont::ErrorExecution& error)
|
||||
{
|
||||
@ -1932,6 +1935,7 @@ private:
|
||||
try
|
||||
{
|
||||
Algorithm::Schedule(AllErrorKernel(), ARRAY_SIZE);
|
||||
Algorithm::Synchronize();
|
||||
}
|
||||
catch (vtkm::cont::ErrorExecution& error)
|
||||
{
|
||||
@ -1939,6 +1943,41 @@ private:
|
||||
message = error.GetMessage();
|
||||
}
|
||||
VTKM_TEST_ASSERT(message == ERROR_MESSAGE, "Did not get expected error message.");
|
||||
|
||||
// This is specifically to test the cuda-backend but should pass for all backends
|
||||
std::cout << "Testing if execution errors are eventually propogated to the host "
|
||||
<< "without explicit synchronization\n";
|
||||
message = "";
|
||||
int nkernels = 0;
|
||||
try
|
||||
{
|
||||
IdArrayHandle idArray;
|
||||
idArray.Allocate(ARRAY_SIZE);
|
||||
auto portal = idArray.PrepareForInPlace(DeviceAdapterTag{});
|
||||
|
||||
Algorithm::Schedule(OneErrorKernel(), ARRAY_SIZE);
|
||||
for (; nkernels < 100; ++nkernels)
|
||||
{
|
||||
Algorithm::Schedule(AddArrayKernel(portal), ARRAY_SIZE);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(20));
|
||||
}
|
||||
Algorithm::Synchronize();
|
||||
}
|
||||
catch (vtkm::cont::ErrorExecution& error)
|
||||
{
|
||||
std::cout << "Got expected error: \"" << error.GetMessage() << "\" ";
|
||||
if (nkernels < 100)
|
||||
{
|
||||
std::cout << "after " << nkernels << " invocations of other kernel" << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "only after explicit synchronization" << std::endl;
|
||||
}
|
||||
message = error.GetMessage();
|
||||
}
|
||||
std::cout << "\n";
|
||||
VTKM_TEST_ASSERT(message == ERROR_MESSAGE, "Did not get expected error message.");
|
||||
}
|
||||
|
||||
template <typename T, int N = 0>
|
||||
|
@ -345,19 +345,15 @@ void TestErrorFunctorInvoke()
|
||||
|
||||
auto task = TaskTypes::MakeTask(worklet, invocation, vtkm::Id());
|
||||
|
||||
vtkm::Id errorArraySize = 0;
|
||||
char* hostErrorPtr = nullptr;
|
||||
char* deviceErrorPtr = Algorithm::GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
||||
|
||||
hostErrorPtr[0] = '\0';
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
|
||||
auto errorArray = Algorithm::GetPinnedErrorArray();
|
||||
vtkm::exec::internal::ErrorMessageBuffer errorMessage(errorArray.DevicePtr, errorArray.Size);
|
||||
task.SetErrorMessageBuffer(errorMessage);
|
||||
|
||||
ScheduleTaskStrided<decltype(task)><<<32, 256>>>(task, 1, 2);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
VTKM_TEST_ASSERT(errorMessage.IsErrorRaised(), "Error not raised correctly.");
|
||||
VTKM_TEST_ASSERT(hostErrorPtr == std::string(ERROR_MESSAGE), "Got wrong error message.");
|
||||
VTKM_TEST_ASSERT(errorArray.HostPtr == std::string(ERROR_MESSAGE), "Got wrong error message.");
|
||||
}
|
||||
|
||||
template <typename DeviceAdapter>
|
||||
|
Loading…
Reference in New Issue
Block a user