Merge topic 'cuda-per-thread-streams-2'

06dee259f Minimize cuda synchronizations

Acked-by: Kitware Robot <kwrobot@kitware.com>
Acked-by: Kenneth Moreland <kmorel@sandia.gov>
Merge-request: !1288
This commit is contained in:
Sujin Philip 2018-07-25 19:07:30 +00:00 committed by Kitware Robot
commit 259d670ab5
7 changed files with 110 additions and 69 deletions

@ -0,0 +1,15 @@
# Worklets are now asynchronous in Cuda
Worklets are now fully asynchronous in the cuda backend. This means that
worklet errors are reported asynchronously. Existing errors are checked for
before invocation of a new worklet and at explicit synchronization points like
`DeviceAdapterAlgorithm<>::Synchronize()`.
An important effect of this change is that functions that are synchronization
points, like `ArrayHandle::GetPortalControl()` and
`ArrayHandle::GetPortalConstControl()`, may now throw exception for errors from
previously executed worklets.
Worklet invocations, synchronization and error reporting happen independently
on different threads. Therefore, synchronization on one thread does not affect
any other threads.

@ -365,10 +365,14 @@ public:
VTKM_CONT const StorageType& GetStorage() const;
/// Get the array portal of the control array.
/// Since worklet invocations are asynchronous and this routine is a synchronization point,
/// exceptions may be thrown for errors from previously executed worklets.
///
VTKM_CONT PortalControl GetPortalControl();
/// Get the array portal of the control array.
/// Since worklet invocations are asynchronous and this routine is a synchronization point,
/// exceptions may be thrown for errors from previously executed worklets.
///
VTKM_CONT PortalConstControl GetPortalConstControl() const;

@ -70,45 +70,41 @@ VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
vtkm::Id& arraySize,
char** hostPointer)
auto DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray()
-> const PinnedErrorArray&
{
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
static bool errorArrayInit = false;
static char* hostPtr = nullptr;
static char* devicePtr = nullptr;
if (!errorArrayInit)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
errorArrayInit = true;
}
//set the size of the array
arraySize = ERROR_ARRAY_SIZE;
constexpr vtkm::Id ERROR_ARRAY_SIZE = 1024;
static thread_local PinnedErrorArray local;
//specify the host pointer to the memory
*hostPointer = hostPtr;
(void)hostPointer;
return devicePtr;
if (!local.HostPtr)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&local.HostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&local.DevicePtr, local.HostPtr, 0));
local.HostPtr[0] = '\0'; // clear
local.Size = ERROR_ARRAY_SIZE;
}
return local;
}
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
vtkm::exec::cuda::internal::TaskStrided& functor)
{
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
//clear the first character which means that we don't contain an error
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
auto pinnedArray = GetPinnedErrorArray();
vtkm::exec::internal::ErrorMessageBuffer errorMessage(pinnedArray.DevicePtr, pinnedArray.Size);
functor.SetErrorMessageBuffer(errorMessage);
}
return hostErrorPtr;
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
{
auto pinnedArray = GetPinnedErrorArray();
if (pinnedArray.HostPtr[0] != '\0')
{
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
auto excep = vtkm::cont::ErrorExecution(pinnedArray.HostPtr);
pinnedArray.HostPtr[0] = '\0'; // clear
throw excep;
}
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(

@ -432,7 +432,6 @@ private:
{
cuda::internal::throwAsVTKmException();
}
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
return sum[2];
}
@ -1102,13 +1101,24 @@ public:
::thrust::equal_to<T>(),
binary_functor);
}
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
VTKM_CONT_EXPORT
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer);
struct VTKM_CONT_EXPORT PinnedErrorArray
{
char* HostPtr = nullptr;
char* DevicePtr = nullptr;
vtkm::Id Size = 0;
};
VTKM_CONT_EXPORT
static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
static const PinnedErrorArray& GetPinnedErrorArray();
VTKM_CONT_EXPORT
static void CheckForErrors(); // throws vtkm::cont::ErrorExecution
VTKM_CONT_EXPORT
static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
VTKM_CONT_EXPORT
static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size);
@ -1127,7 +1137,9 @@ public:
// No instances means nothing to run. Just return.
return;
}
char* hostErrorPtr = SetupErrorBuffer(functor);
CheckForErrors();
SetupErrorBuffer(functor);
vtkm::UInt32 grids, blocks;
GetGridsAndBlocks(grids, blocks, numInstances);
@ -1135,18 +1147,6 @@ public:
cuda::internal::TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor,
numInstances);
//sync so that we can check the results of the call.
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
//check what the value is
if (hostErrorPtr[0] != '\0')
{
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
parameter_sweep_1d_schedule(functor, numInstances);
#endif
@ -1162,7 +1162,9 @@ public:
// No instances means nothing to run. Just return.
return;
}
char* hostErrorPtr = SetupErrorBuffer(functor);
CheckForErrors();
SetupErrorBuffer(functor);
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
@ -1174,18 +1176,6 @@ public:
cuda::internal::TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
//sync so that we can check the results of the call.
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
//check what the value is
if (hostErrorPtr[0] != '\0')
{
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
parameter_sweep_3d_schedule(functor, rangeMax);
#endif
@ -1289,6 +1279,7 @@ public:
VTKM_CONT static void Synchronize()
{
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
CheckForErrors();
}
};

@ -178,7 +178,7 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
//our stream. We need to block on the copy back to control since
//we don't want it to access memory that hasn't finished
//being used by the GPU
cudaStreamSynchronize(cudaStreamPerThread);
vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapterTagCuda>::Synchronize();
}
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::UsingForRead(

@ -45,9 +45,11 @@
#include <vtkm/cont/AtomicArray.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <ctime>
#include <random>
#include <thread>
#include <utility>
#include <vector>
@ -1919,6 +1921,7 @@ private:
try
{
Algorithm::Schedule(OneErrorKernel(), ARRAY_SIZE);
Algorithm::Synchronize();
}
catch (vtkm::cont::ErrorExecution& error)
{
@ -1932,6 +1935,7 @@ private:
try
{
Algorithm::Schedule(AllErrorKernel(), ARRAY_SIZE);
Algorithm::Synchronize();
}
catch (vtkm::cont::ErrorExecution& error)
{
@ -1939,6 +1943,41 @@ private:
message = error.GetMessage();
}
VTKM_TEST_ASSERT(message == ERROR_MESSAGE, "Did not get expected error message.");
// This is specifically to test the cuda-backend but should pass for all backends
std::cout << "Testing if execution errors are eventually propagated to the host "
<< "without explicit synchronization\n";
message = "";
int nkernels = 0;
try
{
IdArrayHandle idArray;
idArray.Allocate(ARRAY_SIZE);
auto portal = idArray.PrepareForInPlace(DeviceAdapterTag{});
Algorithm::Schedule(OneErrorKernel(), ARRAY_SIZE);
for (; nkernels < 100; ++nkernels)
{
Algorithm::Schedule(AddArrayKernel(portal), ARRAY_SIZE);
std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
Algorithm::Synchronize();
}
catch (vtkm::cont::ErrorExecution& error)
{
std::cout << "Got expected error: \"" << error.GetMessage() << "\" ";
if (nkernels < 100)
{
std::cout << "after " << nkernels << " invocations of other kernel" << std::endl;
}
else
{
std::cout << "only after explicit synchronization" << std::endl;
}
message = error.GetMessage();
}
std::cout << "\n";
VTKM_TEST_ASSERT(message == ERROR_MESSAGE, "Did not get expected error message.");
}
template <typename T, int N = 0>

@ -345,19 +345,15 @@ void TestErrorFunctorInvoke()
auto task = TaskTypes::MakeTask(worklet, invocation, vtkm::Id());
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = Algorithm::GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
auto errorArray = Algorithm::GetPinnedErrorArray();
vtkm::exec::internal::ErrorMessageBuffer errorMessage(errorArray.DevicePtr, errorArray.Size);
task.SetErrorMessageBuffer(errorMessage);
ScheduleTaskStrided<decltype(task)><<<32, 256>>>(task, 1, 2);
cudaDeviceSynchronize();
VTKM_TEST_ASSERT(errorMessage.IsErrorRaised(), "Error not raised correctly.");
VTKM_TEST_ASSERT(hostErrorPtr == std::string(ERROR_MESSAGE), "Got wrong error message.");
VTKM_TEST_ASSERT(errorArray.HostPtr == std::string(ERROR_MESSAGE), "Got wrong error message.");
}
template <typename DeviceAdapter>