Change CUDA calls to use the per-thread stream.

Author: Sujin Philip    Date: 2017-08-16 16:11:43 -04:00
parent 0c435d39f4
commit 72a6cf4a21
8 changed files with 120 additions and 82 deletions
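
Summary of the pattern: every hunk in this commit redirects device work from the legacy default stream (stream 0) to CUDA's per-thread default stream, cudaStreamPerThread, and replaces device-wide cudaDeviceSynchronize() waits with cudaStreamSynchronize(cudaStreamPerThread). With the legacy stream, kernels and copies issued from different host threads all serialize on one implicit stream; with the per-thread stream, each host thread gets its own implicit stream and can overlap with the others. The following is a minimal standalone sketch of that pattern, not VTK-m code; the AddOne kernel and the buffer sizes are invented for illustration.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void AddOne(int* data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
  {
    data[i] += 1;
  }
}

int main()
{
  const int n = 1024;
  int host[n] = { 0 };
  int* device = nullptr;
  cudaMalloc(&device, n * sizeof(int));

  // All work goes to this thread's private default stream, so other host
  // threads using their own per-thread streams are not serialized behind it.
  cudaMemcpyAsync(device, host, n * sizeof(int), cudaMemcpyHostToDevice, cudaStreamPerThread);
  AddOne<<<(n + 255) / 256, 256, 0, cudaStreamPerThread>>>(device, n);
  cudaMemcpyAsync(host, device, n * sizeof(int), cudaMemcpyDeviceToHost, cudaStreamPerThread);

  // Waits only for this thread's stream, unlike cudaDeviceSynchronize(),
  // which waits for all streams on the device.
  cudaStreamSynchronize(cudaStreamPerThread);

  std::printf("host[0] = %d\n", host[0]); // expected: 1
  cudaFree(device);
  return 0;
}

Compiling with nvcc's --default-stream per-thread option would route null-stream calls the same way; this commit instead names cudaStreamPerThread explicitly at every call site.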

@ -134,8 +134,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(const v
return;
}
VTKM_CUDA_CALL(cudaMemcpy(
executionPtr, controlPtr, static_cast<std::size_t>(numBytes), cudaMemcpyHostToDevice));
VTKM_CUDA_CALL(cudaMemcpyAsync(executionPtr,
controlPtr,
static_cast<std::size_t>(numBytes),
cudaMemcpyHostToDevice,
cudaStreamPerThread));
}
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
@ -158,8 +161,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
return;
}
VTKM_CUDA_CALL(cudaMemcpy(
controlPtr, executionPtr, static_cast<std::size_t>(numBytes), cudaMemcpyDeviceToHost));
VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
executionPtr,
static_cast<std::size_t>(numBytes),
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
}
} // end namespace internal

@ -123,7 +123,7 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
VTKM_CUDA_CALL(
cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
}
}
@ -137,7 +137,7 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
VTKM_CUDA_CALL(cudaGetDevice(&dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
}
}
@ -151,7 +151,7 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
VTKM_CUDA_CALL(cudaGetDevice(&dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
}
}
@ -165,7 +165,7 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
VTKM_CUDA_CALL(cudaGetDevice(&dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
}
}

@ -63,7 +63,10 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
vtkm::cont::DeviceAdapterTagCuda>
{
VTKM_CONT static void Synchronize() { VTKM_CUDA_CALL(cudaDeviceSynchronize()); }
VTKM_CONT static void Synchronize()
{
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
}
};
/// CUDA contains its own high resolution timer.
@ -86,13 +89,13 @@ public:
VTKM_CONT void Reset()
{
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, 0));
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
}
VTKM_CONT vtkm::Float64 GetElapsedTime()
{
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, 0));
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(
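
Note on the Synchronize() and timer changes above: cudaDeviceSynchronize() blocks until work on every stream of the device has finished, while cudaStreamSynchronize(cudaStreamPerThread) waits only for the calling thread's own stream, and the timer events are now recorded on that stream as well. Below is a minimal sketch of the same event-timing pattern in plain CUDA; it is not VTK-m code, the function name is illustrative, and error checking is omitted.

#include <cuda_runtime.h>

inline float ElapsedMillisecondsOnPerThreadStream()
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, cudaStreamPerThread);
  // ... enqueue kernels and async copies on cudaStreamPerThread here ...
  cudaEventRecord(stop, cudaStreamPerThread);
  cudaEventSynchronize(stop); // blocks until everything recorded before 'stop' is done

  float milliseconds = 0.0f;
  cudaEventElapsedTime(&milliseconds, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return milliseconds;
}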

@ -63,6 +63,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/system/cuda/execution_policy.h>
VTKM_THIRDPARTY_POST_INCLUDE
#include <atomic>
namespace vtkm
{
namespace cont
@ -201,7 +203,7 @@ private:
try
{
::thrust::copy(
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
}
catch (...)
{
@ -226,7 +228,7 @@ private:
try
{
auto newLast = ::thrust::copy_if(
thrust::cuda::par, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
ThrustCudaPolicyPerThread, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
}
catch (...)
@ -255,7 +257,7 @@ private:
{
try
{
::thrust::copy_n(thrust::cuda::par,
::thrust::copy_n(ThrustCudaPolicyPerThread,
IteratorBegin(input) + inputOffset,
static_cast<std::size_t>(size),
IteratorBegin(output) + outputOffset);
@ -295,7 +297,7 @@ private:
try
{
::thrust::lower_bound(thrust::cuda::par,
::thrust::lower_bound(ThrustCudaPolicyPerThread,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values),
@ -337,7 +339,7 @@ private:
try
{
return ::thrust::reduce(
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
}
catch (...)
{
@ -364,8 +366,11 @@ private:
try
{
return ::thrust::reduce(
thrust::cuda::par, IteratorBegin(castPortal), IteratorEnd(castPortal), initialValue, bop);
return ::thrust::reduce(ThrustCudaPolicyPerThread,
IteratorBegin(castPortal),
IteratorEnd(castPortal),
initialValue,
bop);
}
catch (...)
{
@ -447,11 +452,11 @@ private:
//store the current value of the last position array in a separate cuda
//memory location since the exclusive_scan will overwrite that value
//once run
::thrust::copy_n(thrust::cuda::par, IteratorEnd(input) - 1, 1, sum.begin());
::thrust::copy_n(ThrustCudaPolicyPerThread, IteratorEnd(input) - 1, 1, sum.begin());
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
auto end = ::thrust::exclusive_scan(thrust::cuda::par,
auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(output),
@ -461,10 +466,10 @@ private:
//Store the new value for the end of the array. This is done because
//with items such as the transpose array it is unsafe to pass the
//portal to the SumExclusiveScan
::thrust::copy_n(thrust::cuda::par, (end - 1), 1, sum.begin() + 1);
::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
//execute the binaryOp one last time on the device.
SumExclusiveScan<<<1, 1>>>(sum[0], sum[1], sum[2], bop);
SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(sum[0], sum[1], sum[2], bop);
}
catch (...)
{
@ -491,8 +496,11 @@ private:
try
{
auto end = ::thrust::inclusive_scan(
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output), bop);
auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(output),
bop);
return *(end - 1);
}
catch (...)
@ -535,7 +543,7 @@ private:
try
{
::thrust::inclusive_scan_by_key(thrust::cuda::par,
::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
IteratorBegin(keys),
IteratorEnd(keys),
IteratorBegin(values),
@ -585,7 +593,7 @@ private:
binary_operator);
try
{
::thrust::exclusive_scan_by_key(thrust::cuda::par,
::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
IteratorBegin(keys),
IteratorEnd(keys),
IteratorBegin(values),
@ -655,7 +663,7 @@ private:
try
{
auto begin = IteratorBegin(values);
auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values));
auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values));
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
}
catch (...)
@ -674,7 +682,7 @@ private:
try
{
auto begin = IteratorBegin(values);
auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values), bop);
auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values), bop);
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
}
catch (...)
@ -691,7 +699,7 @@ private:
{
try
{
::thrust::upper_bound(thrust::cuda::par,
::thrust::upper_bound(ThrustCudaPolicyPerThread,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values),
@ -716,7 +724,7 @@ private:
binary_compare);
try
{
::thrust::upper_bound(thrust::cuda::par,
::thrust::upper_bound(ThrustCudaPolicyPerThread,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values),
@ -736,7 +744,7 @@ private:
{
try
{
::thrust::upper_bound(thrust::cuda::par,
::thrust::upper_bound(ThrustCudaPolicyPerThread,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values_output),
@ -1147,11 +1155,12 @@ private:
VTKM_CONT
static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
{
static bool gridQueryInit = false;
static std::atomic<bool> gridQueryInit(false);
static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
// NOTE: The following code may still be executed by multiple threads
// but it should not cause any correctness issues.
if (!gridQueryInit)
{
gridQueryInit = true;
int currDevice;
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
@ -1171,10 +1180,14 @@ private:
vtkm::UInt32* dev_actual_size;
VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
DetermineProperXGridSize<<<1, 1>>>(maxGridSize[0], dev_actual_size);
VTKM_CUDA_CALL(cudaDeviceSynchronize());
VTKM_CUDA_CALL(
cudaMemcpy(&maxGridSize[0], dev_actual_size, sizeof(vtkm::UInt32), cudaMemcpyDeviceToHost));
DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
gridQueryInit = true;
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
}
return maxGridSize;
@ -1223,7 +1236,7 @@ public:
//handle datasets larger than 2B, we need to execute multiple kernels
if (totalBlocks < maxblocksPerLaunch)
{
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize>>>(
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), numInstances);
}
else
@ -1233,7 +1246,7 @@ public:
for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
numberOfKernelsInvoked += numberOfKernelsToRun)
{
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize>>>(
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
functor, numberOfKernelsInvoked, numInstances);
}
}
@ -1242,7 +1255,7 @@ public:
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
VTKM_CUDA_CALL(cudaDeviceSynchronize());
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
//check what the value is
if (hostErrorPtr[0] != '\0')
@ -1302,13 +1315,14 @@ public:
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
//sync so that we can check the results of the call.
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
VTKM_CUDA_CALL(cudaDeviceSynchronize());
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
//check what the value is
if (hostErrorPtr[0] != '\0')

@ -128,10 +128,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), size);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
@ -157,9 +157,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), size);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
@ -188,9 +189,10 @@ static void compare_3d_dynamic_block_picker(Functor functor,
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
@ -269,9 +271,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
}
compute_block_size(ranges, blockSize3d, gridSize3d);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
@ -309,9 +312,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;

@ -49,13 +49,15 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
{
TargetClass* cutarget;
VTKM_CUDA_CALL(cudaMalloc(&cutarget, sizeof(TargetClass)));
VTKM_CUDA_CALL(cudaMemcpy(cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
VTKM_CUDA_CALL(cudaMemcpyAsync(
cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
VirtualObject* cuobject;
VTKM_CUDA_CALL(cudaMalloc(&cuobject, sizeof(VirtualObject)));
detail::CreateKernel<<<1, 1>>>(cuobject, cutarget);
detail::CreateKernel<<<1, 1, 0, cudaStreamPerThread>>>(cuobject, cutarget);
VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR();
VTKM_CUDA_CALL(cudaMemcpy(&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost));
VTKM_CUDA_CALL(cudaMemcpyAsync(
&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaFree(cuobject));
return cutarget;
@ -63,7 +65,8 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
static void Update(void* deviceState, const void* target)
{
VTKM_CUDA_CALL(cudaMemcpy(deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
VTKM_CUDA_CALL(cudaMemcpyAsync(
deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
}
static void Cleanup(void* deviceState) { VTKM_CUDA_CALL(cudaFree(deviceState)); }

@ -31,6 +31,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/system/cuda/memory.h>
VTKM_THIRDPARTY_POST_INCLUDE
#define ThrustCudaPolicyPerThread ::thrust::cuda::par.on(cudaStreamPerThread)
struct vtkm_cuda_policy : thrust::device_execution_policy<vtkm_cuda_policy>
{
};
@ -47,7 +49,7 @@ __host__ __device__ void sort(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
{ //sort for concrete pointers and less than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
}
template <typename T, typename RandomAccessIterator>
@ -59,7 +61,8 @@ __host__ __device__ void sort_by_key(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
{ //sort for concrete pointers and less than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
return thrust::sort_by_key(
ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
}
template <typename T>
@ -70,7 +73,7 @@ __host__ __device__ void sort(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
{ //sort for concrete pointers and less than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
}
template <typename T, typename RandomAccessIterator>
@ -82,7 +85,8 @@ __host__ __device__ void sort_by_key(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
{ //sort for concrete pointers and less than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
return thrust::sort_by_key(
ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
}
template <typename T>
@ -93,7 +97,7 @@ __host__ __device__ void sort(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
{ //sort for concrete pointers and greater than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
}
template <typename T, typename RandomAccessIterator>
@ -105,7 +109,8 @@ __host__ __device__ void sort_by_key(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
{ //sort for concrete pointers and greater than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
return thrust::sort_by_key(
ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
}
template <typename T>
@ -116,7 +121,7 @@ __host__ __device__ void sort(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
{ //sort for concrete pointers and greater than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
}
template <typename T, typename RandomAccessIterator>
@ -128,7 +133,8 @@ __host__ __device__ void sort_by_key(
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
{ //sort for concrete pointers and greater than op
//this makes sure that we invoke the thrust radix sort and not merge sort
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
return thrust::sort_by_key(
ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
}
template <typename RandomAccessIterator, typename StrictWeakOrdering>
@ -141,7 +147,7 @@ __host__ __device__ void sort(const vtkm_cuda_policy& exec,
//the operator is not an approved less/greater operator.
//This most likely will cause thrust to internally determine that
//the best sort implementation is merge sort.
return thrust::sort(thrust::cuda::par, first, last, comp);
return thrust::sort(ThrustCudaPolicyPerThread, first, last, comp);
}
template <typename RandomAccessIteratorKeys,
@ -157,7 +163,7 @@ __host__ __device__ void sort_by_key(const vtkm_cuda_policy& exec,
//the operator is not an approved less/greater operator.
//This most likely will cause thrust to internally determine that
//the best sort implementation is merge sort.
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, comp);
return thrust::sort_by_key(ThrustCudaPolicyPerThread, first, last, values_first, comp);
}
template <typename T,
@ -178,23 +184,24 @@ __host__ __device__::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_ke
{
#if defined(__CUDACC_VER__) && (__CUDACC_VER__ >= 70500) && (__CUDACC_VER__ < 80000)
::thrust::pair<OutputIterator1, OutputIterator2> result = thrust::reduce_by_key(thrust::cuda::par,
keys_first.get(),
keys_last.get(),
values_first,
keys_output,
values_output,
binary_pred,
binary_op);
::thrust::pair<OutputIterator1, OutputIterator2> result =
thrust::reduce_by_key(ThrustCudaPolicyPerThread,
keys_first.get(),
keys_last.get(),
values_first,
keys_output,
values_output,
binary_pred,
binary_op);
//only sync if we are being invoked from the host
#ifndef __CUDA_ARCH__
VTKM_CUDA_CALL(cudaDeviceSynchronize());
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
#endif
return result;
#else
return thrust::reduce_by_key(thrust::cuda::par,
return thrust::reduce_by_key(ThrustCudaPolicyPerThread,
keys_first,
keys_last,
values_first,
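
Note on ThrustCudaPolicyPerThread, defined above as ::thrust::cuda::par.on(cudaStreamPerThread): passing this policy makes Thrust launch an algorithm's kernels on the per-thread default stream instead of the legacy stream 0, which is why the thrust::cuda::par call sites throughout this commit are rewritten. Below is a minimal standalone sketch of the policy in use; it is not VTK-m code, and the function name and vector are illustrative.

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <cuda_runtime.h>

void SortOnPerThreadStream(thrust::device_vector<int>& values)
{
  // Same policy as the ThrustCudaPolicyPerThread macro: run the algorithm's
  // kernels on this thread's default stream instead of the legacy stream 0.
  auto policy = thrust::cuda::par.on(cudaStreamPerThread);
  thrust::sort(policy, values.begin(), values.end());

  // Synchronizing the per-thread stream makes the result safe to use from
  // the host without waiting on work issued by other threads.
  cudaStreamSynchronize(cudaStreamPerThread);
}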

@ -33,6 +33,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/system/cuda/execution_policy.h>
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
VTKM_THIRDPARTY_POST_INCLUDE
namespace vtkm
@ -200,7 +201,7 @@ public:
//Perhaps a direct call to thrust copy should be wrapped in a vtkm
//compatble function
::thrust::copy(thrust::cuda::par,
::thrust::copy(ThrustCudaPolicyPerThread,
vtkm::cont::cuda::internal::IteratorBegin(portal),
vtkm::cont::cuda::internal::IteratorEnd(portal),
thrust::cuda::pointer<ValueType>(beginPointer));