Copying CPU memory to Pascal managed memory now works consistently.

When copying small arrays from CPU memory to Pascal managed memory, we
would see subsequent kernels fail because the memory transfer had not
finished. This is a bug, since each stream should act like a FIFO queue.
For now, when we encounter this use case, we explicitly synchronize the
stream after the memcpy.
This commit is contained in:
Robert Maynard 2018-05-16 14:35:08 -04:00
parent a4b16c4b4e
commit e0b6e69878
2 changed files with 10 additions and 0 deletions

@ -150,6 +150,15 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(
static_cast<std::size_t>(numBytes),
cudaMemcpyHostToDevice,
cudaStreamPerThread));
if (CudaAllocator::IsManagedPointer(executionPtr))
{
//If we are copying from unmanaged host memory into managed memory,
//it is possible that the memcpy will not have finished before the
//memory is first used. Each stream should behave like a FIFO queue,
//so this is a bug; to work around it we explicitly synchronize the
//stream for this one use case.
cudaStreamSynchronize(cudaStreamPerThread);
}
}
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,

@ -205,6 +205,7 @@ public:
vtkm::cont::cuda::internal::IteratorBegin(portal),
vtkm::cont::cuda::internal::IteratorEnd(portal),
thrust::cuda::pointer<ValueType>(beginPointer));
cudaStreamSynchronize(cudaStreamPerThread);
//unmap the resource
this->Resource->UnMap();