From 82cdae002574eebb388fbb2e48241555e552c0e1 Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Tue, 29 May 2018 16:21:59 -0400
Subject: [PATCH] VTK-m waits for cuda streams to finish before host access

Previously it was possible for VTK-m to access memory from
the host before the computations in a stream finished.
---
 .../internal/DeviceAdapterAlgorithmCuda.h     |  1 +
 .../ExecutionArrayInterfaceBasicCuda.cu       | 28 +++++++++----------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
index 3664c71c1..9ed178164 100644
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@@ -432,6 +432,7 @@ private:
     {
       cuda::internal::throwAsVTKmException();
     }
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     return sum[2];
   }
 
diff --git a/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu b/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu
index 730bc1a65..f53e6afe0 100644
--- a/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu
+++ b/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu
@@ -150,15 +150,6 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(
                                  static_cast<std::size_t>(numBytes),
                                  cudaMemcpyHostToDevice,
                                  cudaStreamPerThread));
-  if (CudaAllocator::IsManagedPointer(executionPtr))
-  {
-    //If we are moving memory from unmanaged host memory
-    //to managed host memory we have the possibility that
-    //the memcpy will not finish before the first usage is finished
-    //to work around this bug we explicitly synchronize for this
-    //one use case
-    cudaStreamSynchronize(cudaStreamPerThread);
-  }
 }
 
 void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
@@ -179,14 +170,21 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
 
     // If it is managed, just return and let CUDA handle the migration for us.
     CudaAllocator::PrepareForControl(controlPtr, numBytes);
-    return;
+  }
+  else
+  {
+    VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
+                                   executionPtr,
+                                   static_cast<std::size_t>(numBytes),
+                                   cudaMemcpyDeviceToHost,
+                                   cudaStreamPerThread));
   }
 
-  VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
-                                 executionPtr,
-                                 static_cast<std::size_t>(numBytes),
-                                 cudaMemcpyDeviceToHost,
-                                 cudaStreamPerThread));
+  //In all cases we have possibly multiple async calls queued up in
+  //our stream. We need to block on the copy back to control since
+  //we don't wanting it accessing memory that hasn't finished
+  //being used by the GPU
+  cudaStreamSynchronize(cudaStreamPerThread);
 }
 
 void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::UsingForRead(