From 82cdae002574eebb388fbb2e48241555e552c0e1 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 29 May 2018 16:21:59 -0400 Subject: [PATCH] VTK-m waits for cuda streams to finish before host access Previously it was possible for VTK-m to access memory from the host before the computations in a stream finished. --- .../internal/DeviceAdapterAlgorithmCuda.h | 1 + .../ExecutionArrayInterfaceBasicCuda.cu | 28 +++++++++---------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h index 3664c71c1..9ed178164 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h @@ -432,6 +432,7 @@ private: { cuda::internal::throwAsVTKmException(); } + VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread)); return sum[2]; } diff --git a/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu b/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu index 730bc1a65..f53e6afe0 100644 --- a/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu +++ b/vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.cu @@ -150,15 +150,6 @@ void ExecutionArrayInterfaceBasic::CopyFromControl( static_cast(numBytes), cudaMemcpyHostToDevice, cudaStreamPerThread)); - if (CudaAllocator::IsManagedPointer(executionPtr)) - { - //If we are moving memory from unmanaged host memory - //to managed host memory we have the possibility that - //the memcpy will not finish before the first usage is finished - //to work around this bug we explicitly synchronize for this - //one use case - cudaStreamSynchronize(cudaStreamPerThread); - } } void ExecutionArrayInterfaceBasic::CopyToControl(const void* executionPtr, @@ -179,14 +170,21 @@ void ExecutionArrayInterfaceBasic::CopyToControl(const voi // If it is managed, just return and let CUDA handle the migration for us. CudaAllocator::PrepareForControl(controlPtr, numBytes); - return; + } + else + { + VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr, + executionPtr, + static_cast(numBytes), + cudaMemcpyDeviceToHost, + cudaStreamPerThread)); } - VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr, - executionPtr, - static_cast(numBytes), - cudaMemcpyDeviceToHost, - cudaStreamPerThread)); + //In all cases we have possibly multiple async calls queued up in + //our stream. We need to block on the copy back to control since + //we don't wanting it accessing memory that hasn't finished + //being used by the GPU + cudaStreamSynchronize(cudaStreamPerThread); } void ExecutionArrayInterfaceBasic::UsingForRead(