Merge topic 'cuda-streams'

72a6cf4a Change cuda calls to use the per-thread stream. Acked-by: Kitware Robot <kwrobot@kitware.com> Merge-request: !887
2017-08-17 19:27:20 +00:00 · 2017-08-17 19:27:20 +00:00 · 11fce76732
commit 11fce76732
parent 0c435d39f4 72a6cf4a21
8 changed files with 120 additions and 82 deletions
--- a/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu
+++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.cu
@ -134,8 +134,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(const v
    return;
  }

-  VTKM_CUDA_CALL(cudaMemcpy(
-    executionPtr, controlPtr, static_cast<std::size_t>(numBytes), cudaMemcpyHostToDevice));
+  VTKM_CUDA_CALL(cudaMemcpyAsync(executionPtr,
+                                 controlPtr,
+                                 static_cast<std::size_t>(numBytes),
+                                 cudaMemcpyHostToDevice,
+                                 cudaStreamPerThread));
 }

 void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
@ -158,8 +161,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
    return;
  }

-  VTKM_CUDA_CALL(cudaMemcpy(
-    controlPtr, executionPtr, static_cast<std::size_t>(numBytes), cudaMemcpyDeviceToHost));
+  VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
+                                 executionPtr,
+                                 static_cast<std::size_t>(numBytes),
+                                 cudaMemcpyDeviceToHost,
+                                 cudaStreamPerThread));
 }

 } // end namespace internal
--- a/vtkm/cont/cuda/internal/CudaAllocator.cu
+++ b/vtkm/cont/cuda/internal/CudaAllocator.cu
@ -123,7 +123,7 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
    VTKM_CUDA_CALL(
      cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
  }
 }

@ -137,7 +137,7 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
    VTKM_CUDA_CALL(cudaGetDevice(&dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
  }
 }

@ -151,7 +151,7 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
    VTKM_CUDA_CALL(cudaGetDevice(&dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
  }
 }

@ -165,7 +165,7 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
    VTKM_CUDA_CALL(cudaGetDevice(&dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
  }
 }

--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -63,7 +63,10 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
      vtkm::cont::DeviceAdapterTagCuda>
 {

-  VTKM_CONT static void Synchronize() { VTKM_CUDA_CALL(cudaDeviceSynchronize()); }
+  VTKM_CONT static void Synchronize()
+  {
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
+  }
 };

 /// CUDA contains its own high resolution timer.
@ -86,13 +89,13 @@ public:

  VTKM_CONT void Reset()
  {
-    VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
    VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
  }

  VTKM_CONT vtkm::Float64 GetElapsedTime()
  {
-    VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
    VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
    float elapsedTimeMilliseconds;
    VTKM_CUDA_CALL(
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
@ -63,6 +63,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
 #include <thrust/system/cuda/execution_policy.h>
 VTKM_THIRDPARTY_POST_INCLUDE

+#include <atomic>
+
 namespace vtkm
 {
 namespace cont
@ -201,7 +203,7 @@ private:
    try
    {
      ::thrust::copy(
-        thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
+        ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
    }
    catch (...)
    {
@ -226,7 +228,7 @@ private:
    try
    {
      auto newLast = ::thrust::copy_if(
-        thrust::cuda::par, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
+        ThrustCudaPolicyPerThread, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
      return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
    }
    catch (...)
@ -255,7 +257,7 @@ private:
  {
    try
    {
-      ::thrust::copy_n(thrust::cuda::par,
+      ::thrust::copy_n(ThrustCudaPolicyPerThread,
                       IteratorBegin(input) + inputOffset,
                       static_cast<std::size_t>(size),
                       IteratorBegin(output) + outputOffset);
@ -295,7 +297,7 @@ private:

    try
    {
-      ::thrust::lower_bound(thrust::cuda::par,
+      ::thrust::lower_bound(ThrustCudaPolicyPerThread,
                            IteratorBegin(input),
                            IteratorEnd(input),
                            IteratorBegin(values),
@ -337,7 +339,7 @@ private:
    try
    {
      return ::thrust::reduce(
-        thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
+        ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
    }
    catch (...)
    {
@ -364,8 +366,11 @@ private:

    try
    {
-      return ::thrust::reduce(
-        thrust::cuda::par, IteratorBegin(castPortal), IteratorEnd(castPortal), initialValue, bop);
+      return ::thrust::reduce(ThrustCudaPolicyPerThread,
+                              IteratorBegin(castPortal),
+                              IteratorEnd(castPortal),
+                              initialValue,
+                              bop);
    }
    catch (...)
    {
@ -447,11 +452,11 @@ private:
      //store the current value of the last position array in a separate cuda
      //memory location since the exclusive_scan will overwrite that value
      //once run
-      ::thrust::copy_n(thrust::cuda::par, IteratorEnd(input) - 1, 1, sum.begin());
+      ::thrust::copy_n(ThrustCudaPolicyPerThread, IteratorEnd(input) - 1, 1, sum.begin());

      vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);

-      auto end = ::thrust::exclusive_scan(thrust::cuda::par,
+      auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
                                          IteratorBegin(input),
                                          IteratorEnd(input),
                                          IteratorBegin(output),
@ -461,10 +466,10 @@ private:
      //Store the new value for the end of the array. This is done because
      //with items such as the transpose array it is unsafe to pass the
      //portal to the SumExclusiveScan
-      ::thrust::copy_n(thrust::cuda::par, (end - 1), 1, sum.begin() + 1);
+      ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);

      //execute the binaryOp one last time on the device.
-      SumExclusiveScan<<<1, 1>>>(sum[0], sum[1], sum[2], bop);
+      SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(sum[0], sum[1], sum[2], bop);
    }
    catch (...)
    {
@ -491,8 +496,11 @@ private:

    try
    {
-      auto end = ::thrust::inclusive_scan(
-        thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output), bop);
+      auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
+                                          IteratorBegin(input),
+                                          IteratorEnd(input),
+                                          IteratorBegin(output),
+                                          bop);
      return *(end - 1);
    }
    catch (...)
@ -535,7 +543,7 @@ private:

    try
    {
-      ::thrust::inclusive_scan_by_key(thrust::cuda::par,
+      ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
                                      IteratorBegin(keys),
                                      IteratorEnd(keys),
                                      IteratorBegin(values),
@ -585,7 +593,7 @@ private:
      binary_operator);
    try
    {
-      ::thrust::exclusive_scan_by_key(thrust::cuda::par,
+      ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
                                      IteratorBegin(keys),
                                      IteratorEnd(keys),
                                      IteratorBegin(values),
@ -655,7 +663,7 @@ private:
    try
    {
      auto begin = IteratorBegin(values);
-      auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values));
+      auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values));
      return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
    }
    catch (...)
@ -674,7 +682,7 @@ private:
    try
    {
      auto begin = IteratorBegin(values);
-      auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values), bop);
+      auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values), bop);
      return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
    }
    catch (...)
@ -691,7 +699,7 @@ private:
  {
    try
    {
-      ::thrust::upper_bound(thrust::cuda::par,
+      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                            IteratorBegin(input),
                            IteratorEnd(input),
                            IteratorBegin(values),
@ -716,7 +724,7 @@ private:
      binary_compare);
    try
    {
-      ::thrust::upper_bound(thrust::cuda::par,
+      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                            IteratorBegin(input),
                            IteratorEnd(input),
                            IteratorBegin(values),
@ -736,7 +744,7 @@ private:
  {
    try
    {
-      ::thrust::upper_bound(thrust::cuda::par,
+      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                            IteratorBegin(input),
                            IteratorEnd(input),
                            IteratorBegin(values_output),
@ -1147,11 +1155,12 @@ private:
  VTKM_CONT
  static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
  {
-    static bool gridQueryInit = false;
+    static std::atomic<bool> gridQueryInit(false);
    static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
+    // NOTE: The following code may still be executed by multiple threads
+    // but it should not cause any correctness issues.
    if (!gridQueryInit)
    {
-      gridQueryInit = true;
      int currDevice;
      VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda

@ -1171,10 +1180,14 @@ private:

      vtkm::UInt32* dev_actual_size;
      VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
-      DetermineProperXGridSize<<<1, 1>>>(maxGridSize[0], dev_actual_size);
-      VTKM_CUDA_CALL(cudaDeviceSynchronize());
-      VTKM_CUDA_CALL(
-        cudaMemcpy(&maxGridSize[0], dev_actual_size, sizeof(vtkm::UInt32), cudaMemcpyDeviceToHost));
+      DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
+      VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
+                                     dev_actual_size,
+                                     sizeof(vtkm::UInt32),
+                                     cudaMemcpyDeviceToHost,
+                                     cudaStreamPerThread));
+      VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
+      gridQueryInit = true;
      VTKM_CUDA_CALL(cudaFree(dev_actual_size));
    }
    return maxGridSize;
@ -1223,7 +1236,7 @@ public:
    //handle datasets larger than 2B, we need to execute multiple kernels
    if (totalBlocks < maxblocksPerLaunch)
    {
-      Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize>>>(
+      Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
        functor, vtkm::Id(0), numInstances);
    }
    else
@ -1233,7 +1246,7 @@ public:
      for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
           numberOfKernelsInvoked += numberOfKernelsToRun)
      {
-        Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize>>>(
+        Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
          functor, numberOfKernelsInvoked, numInstances);
      }
    }
@ -1242,7 +1255,7 @@ public:
    //In the future I want move this before the schedule call, and throwing
    //an exception if the previous schedule wrote an error. This would help
    //cuda to run longer before we hard sync.
-    VTKM_CUDA_CALL(cudaDeviceSynchronize());
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));

    //check what the value is
    if (hostErrorPtr[0] != '\0')
@ -1302,13 +1315,14 @@ public:
    dim3 gridSize3d;
    compute_block_size(ranges, blockSize3d, gridSize3d);

-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                        ranges);

    //sync so that we can check the results of the call.
    //In the future I want move this before the schedule call, and throwing
    //an exception if the previous schedule wrote an error. This would help
    //cuda to run longer before we hard sync.
-    VTKM_CUDA_CALL(cudaDeviceSynchronize());
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));

    //check what the value is
    if (hostErrorPtr[0] != '\0')
--- a/vtkm/cont/cuda/internal/TaskTuner.h
+++ b/vtkm/cont/cuda/internal/TaskTuner.h
@ -128,10 +128,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
    VTKM_CUDA_CALL(cudaEventCreate(&start));
    VTKM_CUDA_CALL(cudaEventCreate(&stop));

-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
+    VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+    Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
      functor, vtkm::Id(0), size);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));

    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
    float elapsedTimeMilliseconds;
@ -157,9 +157,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
    VTKM_CUDA_CALL(cudaEventCreate(&stop));


-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+    Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
+      functor, vtkm::Id(0), size);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));

    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
    float elapsedTimeMilliseconds;
@ -188,9 +189,10 @@ static void compare_3d_dynamic_block_picker(Functor functor,
    VTKM_CUDA_CALL(cudaEventCreate(&start));
    VTKM_CUDA_CALL(cudaEventCreate(&stop));

-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                        ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));

    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
    float elapsedTimeMilliseconds;
@ -269,9 +271,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
        }

        compute_block_size(ranges, blockSize3d, gridSize3d);
-        VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-        Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-        VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+        VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+        Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                            ranges);
+        VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));

        VTKM_CUDA_CALL(cudaEventSynchronize(stop));
        float elapsedTimeMilliseconds;
@ -309,9 +312,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
    dim3 gridSize3d;

    compute_block_size(ranges, blockSize3d, gridSize3d);
-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                        ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));

    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
    float elapsedTimeMilliseconds;
--- a/vtkm/cont/cuda/internal/VirtualObjectTransferCuda.h
+++ b/vtkm/cont/cuda/internal/VirtualObjectTransferCuda.h
@ -49,13 +49,15 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
  {
    TargetClass* cutarget;
    VTKM_CUDA_CALL(cudaMalloc(&cutarget, sizeof(TargetClass)));
-    VTKM_CUDA_CALL(cudaMemcpy(cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
+    VTKM_CUDA_CALL(cudaMemcpyAsync(
+      cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));

    VirtualObject* cuobject;
    VTKM_CUDA_CALL(cudaMalloc(&cuobject, sizeof(VirtualObject)));
-    detail::CreateKernel<<<1, 1>>>(cuobject, cutarget);
+    detail::CreateKernel<<<1, 1, 0, cudaStreamPerThread>>>(cuobject, cutarget);
    VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR();
-    VTKM_CUDA_CALL(cudaMemcpy(&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost));
+    VTKM_CUDA_CALL(cudaMemcpyAsync(
+      &object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost, cudaStreamPerThread));
    VTKM_CUDA_CALL(cudaFree(cuobject));

    return cutarget;
@ -63,7 +65,8 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt

  static void Update(void* deviceState, const void* target)
  {
-    VTKM_CUDA_CALL(cudaMemcpy(deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
+    VTKM_CUDA_CALL(cudaMemcpyAsync(
+      deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
  }

  static void Cleanup(void* deviceState) { VTKM_CUDA_CALL(cudaFree(deviceState)); }
--- a/vtkm/exec/cuda/internal/ExecutionPolicy.h
+++ b/vtkm/exec/cuda/internal/ExecutionPolicy.h
@ -31,6 +31,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
 #include <thrust/system/cuda/memory.h>
 VTKM_THIRDPARTY_POST_INCLUDE

+#define ThrustCudaPolicyPerThread ::thrust::cuda::par.on(cudaStreamPerThread)
+
 struct vtkm_cuda_policy : thrust::device_execution_policy<vtkm_cuda_policy>
 {
 };
@ -47,7 +49,7 @@ __host__ __device__ void sort(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
 { //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
 }

 template <typename T, typename RandomAccessIterator>
@ -59,7 +61,8 @@ __host__ __device__ void sort_by_key(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
 { //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
 }

 template <typename T>
@ -70,7 +73,7 @@ __host__ __device__ void sort(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
 { //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
 }

 template <typename T, typename RandomAccessIterator>
@ -82,7 +85,8 @@ __host__ __device__ void sort_by_key(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
 { //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
 }

 template <typename T>
@ -93,7 +97,7 @@ __host__ __device__ void sort(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
 { //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
 }

 template <typename T, typename RandomAccessIterator>
@ -105,7 +109,8 @@ __host__ __device__ void sort_by_key(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
 { //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
 }

 template <typename T>
@ -116,7 +121,7 @@ __host__ __device__ void sort(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
 { //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
 }

 template <typename T, typename RandomAccessIterator>
@ -128,7 +133,8 @@ __host__ __device__ void sort_by_key(
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
 { //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
 }

 template <typename RandomAccessIterator, typename StrictWeakOrdering>
@ -141,7 +147,7 @@ __host__ __device__ void sort(const vtkm_cuda_policy& exec,
  //the operator is not an approved less/greater operator.
  //This most likely will cause thrust to internally determine that
  //the best sort implementation is merge sort.
-  return thrust::sort(thrust::cuda::par, first, last, comp);
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, comp);
 }

 template <typename RandomAccessIteratorKeys,
@ -157,7 +163,7 @@ __host__ __device__ void sort_by_key(const vtkm_cuda_policy& exec,
  //the operator is not an approved less/greater operator.
  //This most likely will cause thrust to internally determine that
  //the best sort implementation is merge sort.
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, comp);
+  return thrust::sort_by_key(ThrustCudaPolicyPerThread, first, last, values_first, comp);
 }

 template <typename T,
@ -178,23 +184,24 @@ __host__ __device__::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_ke

 {
 #if defined(__CUDACC_VER__) && (__CUDACC_VER__ >= 70500) && (__CUDACC_VER__ < 80000)
-  ::thrust::pair<OutputIterator1, OutputIterator2> result = thrust::reduce_by_key(thrust::cuda::par,
-                                                                                  keys_first.get(),
-                                                                                  keys_last.get(),
-                                                                                  values_first,
-                                                                                  keys_output,
-                                                                                  values_output,
-                                                                                  binary_pred,
-                                                                                  binary_op);
+  ::thrust::pair<OutputIterator1, OutputIterator2> result =
+    thrust::reduce_by_key(ThrustCudaPolicyPerThread,
+                          keys_first.get(),
+                          keys_last.get(),
+                          values_first,
+                          keys_output,
+                          values_output,
+                          binary_pred,
+                          binary_op);

 //only sync if we are being invoked from the host
 #ifndef __CUDA_ARCH__
-  VTKM_CUDA_CALL(cudaDeviceSynchronize());
+  VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 #endif

  return result;
 #else
-  return thrust::reduce_by_key(thrust::cuda::par,
+  return thrust::reduce_by_key(ThrustCudaPolicyPerThread,
                               keys_first,
                               keys_last,
                               values_first,
--- a/vtkm/interop/cuda/internal/TransferToOpenGL.h
+++ b/vtkm/interop/cuda/internal/TransferToOpenGL.h
@ -33,6 +33,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
 #include <thrust/system/cuda/execution_policy.h>
+#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
 VTKM_THIRDPARTY_POST_INCLUDE

 namespace vtkm
@ -200,7 +201,7 @@ public:

    //Perhaps a direct call to thrust copy should be wrapped in a vtkm
    //compatble function
-    ::thrust::copy(thrust::cuda::par,
+    ::thrust::copy(ThrustCudaPolicyPerThread,
                   vtkm::cont::cuda::internal::IteratorBegin(portal),
                   vtkm::cont::cuda::internal::IteratorEnd(portal),
                   thrust::cuda::pointer<ValueType>(beginPointer));