Merge topic 'cuda-streams'

72a6cf4a Change cuda calls to use the per-thread stream.

Acked-by: Kitware Robot <kwrobot@kitware.com>
Merge-request: !887
Author: Sujin Philip, 2017-08-17 19:27:20 +00:00 (committed by Kitware Robot)
Commit: 11fce76732
8 changed files with 120 additions and 82 deletions
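
For context, every change below follows one pattern: CUDA work that used to go to the legacy default stream (stream 0) is now issued on cudaStreamPerThread, and device-wide synchronization is replaced by synchronizing only that stream. A minimal sketch of the pattern (plain CUDA, not VTK-m code; the function and argument names are invented for illustration):

  #include <cuda_runtime.h>

  // Copy a host buffer to the device on the calling thread's default stream,
  // then wait only for the work this thread enqueued. With the legacy default
  // stream, a cudaMemcpy + cudaDeviceSynchronize pair would instead serialize
  // against work submitted by every other host thread.
  void copyToDeviceAndWait(void* devDst, const void* hostSrc, size_t numBytes)
  {
    cudaMemcpyAsync(devDst, hostSrc, numBytes, cudaMemcpyHostToDevice, cudaStreamPerThread);
    cudaStreamSynchronize(cudaStreamPerThread);
  }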

@@ -134,8 +134,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(const v
     return;
   }
-  VTKM_CUDA_CALL(cudaMemcpy(
-    executionPtr, controlPtr, static_cast<std::size_t>(numBytes), cudaMemcpyHostToDevice));
+  VTKM_CUDA_CALL(cudaMemcpyAsync(executionPtr,
+                                 controlPtr,
+                                 static_cast<std::size_t>(numBytes),
+                                 cudaMemcpyHostToDevice,
+                                 cudaStreamPerThread));
 }
 
 void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
@@ -158,8 +161,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
     return;
   }
-  VTKM_CUDA_CALL(cudaMemcpy(
-    controlPtr, executionPtr, static_cast<std::size_t>(numBytes), cudaMemcpyDeviceToHost));
+  VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
+                                 executionPtr,
+                                 static_cast<std::size_t>(numBytes),
+                                 cudaMemcpyDeviceToHost,
+                                 cudaStreamPerThread));
 }
 
 } // end namespace internal

@@ -123,7 +123,7 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
     VTKM_CUDA_CALL(
       cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
   }
 }
@@ -137,7 +137,7 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
     VTKM_CUDA_CALL(cudaGetDevice(&dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
   }
 }
@@ -151,7 +151,7 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
     VTKM_CUDA_CALL(cudaGetDevice(&dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
   }
 }
@@ -165,7 +165,7 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
     VTKM_CUDA_CALL(cudaGetDevice(&dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
-    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
+    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
   }
 }

@@ -63,7 +63,10 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
                                    vtkm::cont::DeviceAdapterTagCuda>
 {
-  VTKM_CONT static void Synchronize() { VTKM_CUDA_CALL(cudaDeviceSynchronize()); }
+  VTKM_CONT static void Synchronize()
+  {
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
+  }
 };
 
 /// CUDA contains its own high resolution timer.
@@ -86,13 +89,13 @@ public:
   VTKM_CONT void Reset()
   {
-    VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
     VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
   }
 
   VTKM_CONT vtkm::Float64 GetElapsedTime()
   {
-    VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
     VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
     float elapsedTimeMilliseconds;
     VTKM_CUDA_CALL(
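
The Synchronize() change above is the key behavioral difference: cudaStreamSynchronize(cudaStreamPerThread) waits only for work the calling host thread has enqueued on its own default stream, whereas cudaDeviceSynchronize() blocks until every stream on the device is idle. A hedged sketch of what that enables (plain CUDA, invented kernel and thread setup, not VTK-m code):

  #include <cuda_runtime.h>
  #include <thread>

  __global__ void busyKernel()
  {
    // placeholder work
  }

  void worker()
  {
    // Each host thread gets its own per-thread default stream, so these
    // launches can overlap across threads.
    busyKernel<<<64, 256, 0, cudaStreamPerThread>>>();

    // Wait only for this thread's work; the other worker's kernels keep running.
    cudaStreamSynchronize(cudaStreamPerThread);
  }

  int main()
  {
    std::thread a(worker);
    std::thread b(worker);
    a.join();
    b.join();
    return 0;
  }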

@@ -63,6 +63,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
 #include <thrust/system/cuda/execution_policy.h>
 VTKM_THIRDPARTY_POST_INCLUDE
 
+#include <atomic>
+
 namespace vtkm
 {
 namespace cont
@@ -201,7 +203,7 @@ private:
     try
     {
       ::thrust::copy(
-        thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
+        ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
     }
     catch (...)
     {
@@ -226,7 +228,7 @@ private:
     try
     {
       auto newLast = ::thrust::copy_if(
-        thrust::cuda::par, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
+        ThrustCudaPolicyPerThread, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
       return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
     }
     catch (...)
@@ -255,7 +257,7 @@ private:
   {
     try
     {
-      ::thrust::copy_n(thrust::cuda::par,
+      ::thrust::copy_n(ThrustCudaPolicyPerThread,
                        IteratorBegin(input) + inputOffset,
                        static_cast<std::size_t>(size),
                        IteratorBegin(output) + outputOffset);
@@ -295,7 +297,7 @@ private:
     try
     {
-      ::thrust::lower_bound(thrust::cuda::par,
+      ::thrust::lower_bound(ThrustCudaPolicyPerThread,
                             IteratorBegin(input),
                             IteratorEnd(input),
                             IteratorBegin(values),
@@ -337,7 +339,7 @@ private:
     try
     {
       return ::thrust::reduce(
-        thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
+        ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
     }
     catch (...)
     {
@@ -364,8 +366,11 @@ private:
     try
     {
-      return ::thrust::reduce(
-        thrust::cuda::par, IteratorBegin(castPortal), IteratorEnd(castPortal), initialValue, bop);
+      return ::thrust::reduce(ThrustCudaPolicyPerThread,
+                              IteratorBegin(castPortal),
+                              IteratorEnd(castPortal),
+                              initialValue,
+                              bop);
     }
     catch (...)
     {
@@ -447,11 +452,11 @@ private:
       //store the current value of the last position array in a separate cuda
       //memory location since the exclusive_scan will overwrite that value
       //once run
-      ::thrust::copy_n(thrust::cuda::par, IteratorEnd(input) - 1, 1, sum.begin());
+      ::thrust::copy_n(ThrustCudaPolicyPerThread, IteratorEnd(input) - 1, 1, sum.begin());
 
       vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
-      auto end = ::thrust::exclusive_scan(thrust::cuda::par,
+      auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
                                           IteratorBegin(input),
                                           IteratorEnd(input),
                                           IteratorBegin(output),
@@ -461,10 +466,10 @@ private:
       //Store the new value for the end of the array. This is done because
       //with items such as the transpose array it is unsafe to pass the
       //portal to the SumExclusiveScan
-      ::thrust::copy_n(thrust::cuda::par, (end - 1), 1, sum.begin() + 1);
+      ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
 
       //execute the binaryOp one last time on the device.
-      SumExclusiveScan<<<1, 1>>>(sum[0], sum[1], sum[2], bop);
+      SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(sum[0], sum[1], sum[2], bop);
     }
     catch (...)
     {
@@ -491,8 +496,11 @@ private:
     try
     {
-      auto end = ::thrust::inclusive_scan(
-        thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output), bop);
+      auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
+                                          IteratorBegin(input),
+                                          IteratorEnd(input),
+                                          IteratorBegin(output),
+                                          bop);
       return *(end - 1);
     }
     catch (...)
@@ -535,7 +543,7 @@ private:
     try
     {
-      ::thrust::inclusive_scan_by_key(thrust::cuda::par,
+      ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
                                       IteratorBegin(keys),
                                       IteratorEnd(keys),
                                       IteratorBegin(values),
@@ -585,7 +593,7 @@ private:
                                       binary_operator);
     try
     {
-      ::thrust::exclusive_scan_by_key(thrust::cuda::par,
+      ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
                                       IteratorBegin(keys),
                                       IteratorEnd(keys),
                                       IteratorBegin(values),
@@ -655,7 +663,7 @@ private:
     try
     {
       auto begin = IteratorBegin(values);
-      auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values));
+      auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values));
       return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
     }
     catch (...)
@@ -674,7 +682,7 @@ private:
     try
     {
       auto begin = IteratorBegin(values);
-      auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values), bop);
+      auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values), bop);
       return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
     }
     catch (...)
@@ -691,7 +699,7 @@ private:
   {
     try
     {
-      ::thrust::upper_bound(thrust::cuda::par,
+      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                             IteratorBegin(input),
                             IteratorEnd(input),
                             IteratorBegin(values),
@@ -716,7 +724,7 @@ private:
                             binary_compare);
     try
     {
-      ::thrust::upper_bound(thrust::cuda::par,
+      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                             IteratorBegin(input),
                             IteratorEnd(input),
                             IteratorBegin(values),
@@ -736,7 +744,7 @@ private:
   {
     try
     {
-      ::thrust::upper_bound(thrust::cuda::par,
+      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                             IteratorBegin(input),
                             IteratorEnd(input),
                             IteratorBegin(values_output),
@@ -1147,11 +1155,12 @@ private:
   VTKM_CONT
   static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
   {
-    static bool gridQueryInit = false;
+    static std::atomic<bool> gridQueryInit(false);
     static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
+    // NOTE: The following code may still be executed by multiple threads
+    // but it should not cause any correctness issues.
     if (!gridQueryInit)
     {
-      gridQueryInit = true;
       int currDevice;
       VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
@@ -1171,10 +1180,14 @@ private:
       vtkm::UInt32* dev_actual_size;
       VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
-      DetermineProperXGridSize<<<1, 1>>>(maxGridSize[0], dev_actual_size);
-      VTKM_CUDA_CALL(cudaDeviceSynchronize());
-      VTKM_CUDA_CALL(
-        cudaMemcpy(&maxGridSize[0], dev_actual_size, sizeof(vtkm::UInt32), cudaMemcpyDeviceToHost));
+      DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
+      VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
+                                     dev_actual_size,
+                                     sizeof(vtkm::UInt32),
+                                     cudaMemcpyDeviceToHost,
+                                     cudaStreamPerThread));
+      VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
+      gridQueryInit = true;
       VTKM_CUDA_CALL(cudaFree(dev_actual_size));
     }
     return maxGridSize;
@@ -1223,7 +1236,7 @@ public:
     //handle datasets larger than 2B, we need to execute multiple kernels
     if (totalBlocks < maxblocksPerLaunch)
     {
-      Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize>>>(
+      Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
         functor, vtkm::Id(0), numInstances);
     }
     else
@@ -1233,7 +1246,7 @@ public:
       for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
           numberOfKernelsInvoked += numberOfKernelsToRun)
      {
-        Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize>>>(
+        Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
          functor, numberOfKernelsInvoked, numInstances);
      }
    }
@@ -1242,7 +1255,7 @@ public:
     //In the future I want move this before the schedule call, and throwing
     //an exception if the previous schedule wrote an error. This would help
     //cuda to run longer before we hard sync.
-    VTKM_CUDA_CALL(cudaDeviceSynchronize());
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 
     //check what the value is
     if (hostErrorPtr[0] != '\0')
@@ -1302,13 +1315,14 @@ public:
     dim3 gridSize3d;
     compute_block_size(ranges, blockSize3d, gridSize3d);
 
-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                        ranges);
 
     //sync so that we can check the results of the call.
     //In the future I want move this before the schedule call, and throwing
     //an exception if the previous schedule wrote an error. This would help
     //cuda to run longer before we hard sync.
-    VTKM_CUDA_CALL(cudaDeviceSynchronize());
+    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 
     //check what the value is
     if (hostErrorPtr[0] != '\0')

@@ -128,10 +128,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
   VTKM_CUDA_CALL(cudaEventCreate(&start));
   VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
-  VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-  Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
+  VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+  Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
     functor, vtkm::Id(0), size);
-  VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+  VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
   VTKM_CUDA_CALL(cudaEventSynchronize(stop));
 
   float elapsedTimeMilliseconds;
@@ -157,9 +157,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
   VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
-  VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-  Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
-  VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+  VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+  Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
+    functor, vtkm::Id(0), size);
+  VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
   VTKM_CUDA_CALL(cudaEventSynchronize(stop));
 
   float elapsedTimeMilliseconds;
@@ -188,9 +189,10 @@ static void compare_3d_dynamic_block_picker(Functor functor,
   VTKM_CUDA_CALL(cudaEventCreate(&start));
   VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
-  VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-  Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-  VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+  VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+  Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                      ranges);
+  VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
   VTKM_CUDA_CALL(cudaEventSynchronize(stop));
 
   float elapsedTimeMilliseconds;
@@ -269,9 +271,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
     }
     compute_block_size(ranges, blockSize3d, gridSize3d);
 
-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                        ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
     VTKM_CUDA_CALL(cudaEventSynchronize(stop));
 
     float elapsedTimeMilliseconds;
@@ -309,9 +312,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
     dim3 gridSize3d;
     compute_block_size(ranges, blockSize3d, gridSize3d);
 
-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+    VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
+                                                                                        ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
     VTKM_CUDA_CALL(cudaEventSynchronize(stop));
 
     float elapsedTimeMilliseconds;

@@ -49,13 +49,15 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
   {
     TargetClass* cutarget;
     VTKM_CUDA_CALL(cudaMalloc(&cutarget, sizeof(TargetClass)));
-    VTKM_CUDA_CALL(cudaMemcpy(cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
+    VTKM_CUDA_CALL(cudaMemcpyAsync(
+      cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
 
     VirtualObject* cuobject;
     VTKM_CUDA_CALL(cudaMalloc(&cuobject, sizeof(VirtualObject)));
-    detail::CreateKernel<<<1, 1>>>(cuobject, cutarget);
+    detail::CreateKernel<<<1, 1, 0, cudaStreamPerThread>>>(cuobject, cutarget);
     VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR();
-    VTKM_CUDA_CALL(cudaMemcpy(&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost));
+    VTKM_CUDA_CALL(cudaMemcpyAsync(
+      &object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost, cudaStreamPerThread));
     VTKM_CUDA_CALL(cudaFree(cuobject));
 
     return cutarget;
@@ -63,7 +65,8 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
   static void Update(void* deviceState, const void* target)
   {
-    VTKM_CUDA_CALL(cudaMemcpy(deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
+    VTKM_CUDA_CALL(cudaMemcpyAsync(
+      deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
   }
 
   static void Cleanup(void* deviceState) { VTKM_CUDA_CALL(cudaFree(deviceState)); }

@@ -31,6 +31,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
 #include <thrust/system/cuda/memory.h>
 VTKM_THIRDPARTY_POST_INCLUDE
 
+#define ThrustCudaPolicyPerThread ::thrust::cuda::par.on(cudaStreamPerThread)
+
 struct vtkm_cuda_policy : thrust::device_execution_policy<vtkm_cuda_policy>
 {
 };
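
For reference, the new ThrustCudaPolicyPerThread macro relies on Thrust's ability to bind its CUDA execution policy to an explicit stream via par.on(stream); an algorithm invoked with that policy launches its kernels on cudaStreamPerThread instead of the legacy stream 0. A small standalone sketch of the same mechanism (invented data, not VTK-m code):

  #include <cuda_runtime.h>
  #include <thrust/device_vector.h>
  #include <thrust/sort.h>
  #include <thrust/system/cuda/execution_policy.h>

  int main()
  {
    thrust::device_vector<int> values(1024, 7);

    // par.on(stream) yields a policy whose work is issued on the given stream;
    // here that is the calling thread's default stream.
    thrust::sort(thrust::cuda::par.on(cudaStreamPerThread), values.begin(), values.end());

    // Make completion explicit before the host touches the results.
    cudaStreamSynchronize(cudaStreamPerThread);
    return 0;
  }
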
@@ -47,7 +49,7 @@ __host__ __device__ void sort(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
 { //sort for concrete pointers and less than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
 }
 
 template <typename T, typename RandomAccessIterator>
@@ -59,7 +61,8 @@ __host__ __device__ void sort_by_key(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
 { //sort for concrete pointers and less than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
 }
 
 template <typename T>
@@ -70,7 +73,7 @@ __host__ __device__ void sort(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
 { //sort for concrete pointers and less than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
 }
 
 template <typename T, typename RandomAccessIterator>
@@ -82,7 +85,8 @@ __host__ __device__ void sort_by_key(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
 { //sort for concrete pointers and less than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
 }
 
 template <typename T>
@@ -93,7 +97,7 @@ __host__ __device__ void sort(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
 { //sort for concrete pointers and greater than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
 }
 
 template <typename T, typename RandomAccessIterator>
@@ -105,7 +109,8 @@ __host__ __device__ void sort_by_key(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
 { //sort for concrete pointers and greater than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
 }
 
 template <typename T>
@@ -116,7 +121,7 @@ __host__ __device__ void sort(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
 { //sort for concrete pointers and greater than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
 }
 
 template <typename T, typename RandomAccessIterator>
@@ -128,7 +133,8 @@ __host__ __device__ void sort_by_key(
   vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
 { //sort for concrete pointers and greater than op
   //this makes sure that we invoke the thrust radix sort and not merge sort
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
+  return thrust::sort_by_key(
+    ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
 }
 
 template <typename RandomAccessIterator, typename StrictWeakOrdering>
@@ -141,7 +147,7 @@ __host__ __device__ void sort(const vtkm_cuda_policy& exec,
   //the operator is not an approved less/greater operator.
   //This most likely will cause thrust to internally determine that
   //the best sort implementation is merge sort.
-  return thrust::sort(thrust::cuda::par, first, last, comp);
+  return thrust::sort(ThrustCudaPolicyPerThread, first, last, comp);
 }
 
 template <typename RandomAccessIteratorKeys,
@@ -157,7 +163,7 @@ __host__ __device__ void sort_by_key(const vtkm_cuda_policy& exec,
   //the operator is not an approved less/greater operator.
   //This most likely will cause thrust to internally determine that
   //the best sort implementation is merge sort.
-  return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, comp);
+  return thrust::sort_by_key(ThrustCudaPolicyPerThread, first, last, values_first, comp);
 }
 
 template <typename T,
@@ -178,23 +184,24 @@ __host__ __device__::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_ke
 {
 #if defined(__CUDACC_VER__) && (__CUDACC_VER__ >= 70500) && (__CUDACC_VER__ < 80000)
-  ::thrust::pair<OutputIterator1, OutputIterator2> result = thrust::reduce_by_key(thrust::cuda::par,
-                                                                                  keys_first.get(),
-                                                                                  keys_last.get(),
-                                                                                  values_first,
-                                                                                  keys_output,
-                                                                                  values_output,
-                                                                                  binary_pred,
-                                                                                  binary_op);
+  ::thrust::pair<OutputIterator1, OutputIterator2> result =
+    thrust::reduce_by_key(ThrustCudaPolicyPerThread,
+                          keys_first.get(),
+                          keys_last.get(),
+                          values_first,
+                          keys_output,
+                          values_output,
+                          binary_pred,
+                          binary_op);
 
 //only sync if we are being invoked from the host
 #ifndef __CUDA_ARCH__
-  VTKM_CUDA_CALL(cudaDeviceSynchronize());
+  VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 #endif
   return result;
 #else
-  return thrust::reduce_by_key(thrust::cuda::par,
+  return thrust::reduce_by_key(ThrustCudaPolicyPerThread,
                                keys_first,
                                keys_last,
                                values_first,

@@ -33,6 +33,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
 #include <thrust/system/cuda/execution_policy.h>
+#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
 VTKM_THIRDPARTY_POST_INCLUDE
 
 namespace vtkm
@@ -200,7 +201,7 @@ public:
     //Perhaps a direct call to thrust copy should be wrapped in a vtkm
     //compatble function
-    ::thrust::copy(thrust::cuda::par,
+    ::thrust::copy(ThrustCudaPolicyPerThread,
                    vtkm::cont::cuda::internal::IteratorBegin(portal),
                    vtkm::cont::cuda::internal::IteratorEnd(portal),
                    thrust::cuda::pointer<ValueType>(beginPointer));