Merge topic 'wait_for_cuda_streams_to_finish_before_host_access'

82cdae002 VTK-m waits for cuda streams to finish before host access Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Allison Vacanti <allison.vacanti@kitware.com> Merge-request: !1244
2018-06-01 16:53:15 +00:00 · 2018-06-01 16:53:15 +00:00 · b8468761c7
commit b8468761c7
parent 84fc7520c6 82cdae0025
2 changed files with 14 additions and 15 deletions
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -432,6 +432,7 @@ private:
    {
      cuda::internal::throwAsVTKmException();
    }
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
    return sum[2];
  }

--- a/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu
+++ b/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu
@ -150,15 +150,6 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(
                                 static_cast<std::size_t>(numBytes),
                                 cudaMemcpyHostToDevice,
                                 cudaStreamPerThread));
-  if (CudaAllocator::IsManagedPointer(executionPtr))
-  {
-    //If we are moving memory from unmanaged host memory
-    //to managed host memory we have the possibility that
-    //the memcpy will not finish before the first usage is finished
-    //to work around this bug we explicitly synchronize for this
-    //one use case
-    cudaStreamSynchronize(cudaStreamPerThread);
-  }
 }

 void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
@ -179,14 +170,21 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi

    // If it is managed, just return and let CUDA handle the migration for us.
    CudaAllocator::PrepareForControl(controlPtr, numBytes);
-    return;
+  }
+  else
+  {
+    VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
+                                   executionPtr,
+                                   static_cast<std::size_t>(numBytes),
+                                   cudaMemcpyDeviceToHost,
+                                   cudaStreamPerThread));
  }

-  VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
-                                 executionPtr,
-                                 static_cast<std::size_t>(numBytes),
-                                 cudaMemcpyDeviceToHost,
-                                 cudaStreamPerThread));
+  //In all cases we may have multiple async calls queued up in
+  //our stream. We need to block on the copy back to control since
+  //we don't want the control side accessing memory that the GPU
+  //has not finished using
+  cudaStreamSynchronize(cudaStreamPerThread);
 }

 void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::UsingForRead(