Copying CPU memory to Pascal managed memory now works consistently.

When copying small arrays from CPU memory to Pascal managed memory, we would see subsequent kernels fail because the memory transfer hadn't finished. This is a bug, as each stream should act like a FIFO queue. So for now, when encountering this use case, we explicitly synchronize the stream after the memcpy.
2024-09-16 17:22:55 +00:00 · 2018-05-16 14:35:08 -04:00 · 2018-05-16 14:35:08 -04:00 · e0b6e69878
commit e0b6e69878
parent a4b16c4b4e
2 changed files with 10 additions and 0 deletions
--- a/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu
+++ b/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu
@ -150,6 +150,15 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(
                                 static_cast<std::size_t>(numBytes),
                                 cudaMemcpyHostToDevice,
                                 cudaStreamPerThread));
+  if (CudaAllocator::IsManagedPointer(executionPtr))
+  {
+    //If we are moving memory from unmanaged host memory
+    //to managed host memory we have the possibility that
+    //the memcpy will not finish before the first usage is finished
+    //to work around this bug we explicitly synchronize for this
+    //one use case
+    cudaStreamSynchronize(cudaStreamPerThread);
+  }
 }

 void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
--- a/vtkm/interop/cuda/internal/TransferToOpenGL.h
+++ b/vtkm/interop/cuda/internal/TransferToOpenGL.h
@ -205,6 +205,7 @@ public:
                   vtkm::cont::cuda::internal::IteratorBegin(portal),
                   vtkm::cont::cuda::internal::IteratorEnd(portal),
                   thrust::cuda::pointer<ValueType>(beginPointer));
+    cudaStreamSynchronize(cudaStreamPerThread);

    //unmap the resource
    this->Resource->UnMap();