Merge topic 'cuda-streams'
72a6cf4a Change cuda calls to use the per-thread stream. Acked-by: Kitware Robot <kwrobot@kitware.com> Merge-request: !887
This commit is contained in:
commit
11fce76732
@ -134,8 +134,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(const v
|
||||
return;
|
||||
}
|
||||
|
||||
VTKM_CUDA_CALL(cudaMemcpy(
|
||||
executionPtr, controlPtr, static_cast<std::size_t>(numBytes), cudaMemcpyHostToDevice));
|
||||
VTKM_CUDA_CALL(cudaMemcpyAsync(executionPtr,
|
||||
controlPtr,
|
||||
static_cast<std::size_t>(numBytes),
|
||||
cudaMemcpyHostToDevice,
|
||||
cudaStreamPerThread));
|
||||
}
|
||||
|
||||
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
|
||||
@ -158,8 +161,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
|
||||
return;
|
||||
}
|
||||
|
||||
VTKM_CUDA_CALL(cudaMemcpy(
|
||||
controlPtr, executionPtr, static_cast<std::size_t>(numBytes), cudaMemcpyDeviceToHost));
|
||||
VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
|
||||
executionPtr,
|
||||
static_cast<std::size_t>(numBytes),
|
||||
cudaMemcpyDeviceToHost,
|
||||
cudaStreamPerThread));
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
@ -123,7 +123,7 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
|
||||
VTKM_CUDA_CALL(
|
||||
cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
|
||||
}
|
||||
}
|
||||
|
||||
@ -137,7 +137,7 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||
}
|
||||
}
|
||||
|
||||
@ -151,7 +151,7 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||
}
|
||||
}
|
||||
|
||||
@ -165,7 +165,7 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,10 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
|
||||
vtkm::cont::DeviceAdapterTagCuda>
|
||||
{
|
||||
|
||||
VTKM_CONT static void Synchronize() { VTKM_CUDA_CALL(cudaDeviceSynchronize()); }
|
||||
VTKM_CONT static void Synchronize()
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
};
|
||||
|
||||
/// CUDA contains its own high resolution timer.
|
||||
@ -86,13 +89,13 @@ public:
|
||||
|
||||
VTKM_CONT void Reset()
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, 0));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
|
||||
}
|
||||
|
||||
VTKM_CONT vtkm::Float64 GetElapsedTime()
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, 0));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(
|
||||
|
@ -63,6 +63,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
||||
#include <thrust/system/cuda/execution_policy.h>
|
||||
VTKM_THIRDPARTY_POST_INCLUDE
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
@ -201,7 +203,7 @@ private:
|
||||
try
|
||||
{
|
||||
::thrust::copy(
|
||||
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
|
||||
ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
@ -226,7 +228,7 @@ private:
|
||||
try
|
||||
{
|
||||
auto newLast = ::thrust::copy_if(
|
||||
thrust::cuda::par, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
|
||||
ThrustCudaPolicyPerThread, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
|
||||
return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
|
||||
}
|
||||
catch (...)
|
||||
@ -255,7 +257,7 @@ private:
|
||||
{
|
||||
try
|
||||
{
|
||||
::thrust::copy_n(thrust::cuda::par,
|
||||
::thrust::copy_n(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(input) + inputOffset,
|
||||
static_cast<std::size_t>(size),
|
||||
IteratorBegin(output) + outputOffset);
|
||||
@ -295,7 +297,7 @@ private:
|
||||
|
||||
try
|
||||
{
|
||||
::thrust::lower_bound(thrust::cuda::par,
|
||||
::thrust::lower_bound(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(input),
|
||||
IteratorEnd(input),
|
||||
IteratorBegin(values),
|
||||
@ -337,7 +339,7 @@ private:
|
||||
try
|
||||
{
|
||||
return ::thrust::reduce(
|
||||
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
|
||||
ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
@ -364,8 +366,11 @@ private:
|
||||
|
||||
try
|
||||
{
|
||||
return ::thrust::reduce(
|
||||
thrust::cuda::par, IteratorBegin(castPortal), IteratorEnd(castPortal), initialValue, bop);
|
||||
return ::thrust::reduce(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(castPortal),
|
||||
IteratorEnd(castPortal),
|
||||
initialValue,
|
||||
bop);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
@ -447,11 +452,11 @@ private:
|
||||
//store the current value of the last position array in a separate cuda
|
||||
//memory location since the exclusive_scan will overwrite that value
|
||||
//once run
|
||||
::thrust::copy_n(thrust::cuda::par, IteratorEnd(input) - 1, 1, sum.begin());
|
||||
::thrust::copy_n(ThrustCudaPolicyPerThread, IteratorEnd(input) - 1, 1, sum.begin());
|
||||
|
||||
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
|
||||
|
||||
auto end = ::thrust::exclusive_scan(thrust::cuda::par,
|
||||
auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(input),
|
||||
IteratorEnd(input),
|
||||
IteratorBegin(output),
|
||||
@ -461,10 +466,10 @@ private:
|
||||
//Store the new value for the end of the array. This is done because
|
||||
//with items such as the transpose array it is unsafe to pass the
|
||||
//portal to the SumExclusiveScan
|
||||
::thrust::copy_n(thrust::cuda::par, (end - 1), 1, sum.begin() + 1);
|
||||
::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
|
||||
|
||||
//execute the binaryOp one last time on the device.
|
||||
SumExclusiveScan<<<1, 1>>>(sum[0], sum[1], sum[2], bop);
|
||||
SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(sum[0], sum[1], sum[2], bop);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
@ -491,8 +496,11 @@ private:
|
||||
|
||||
try
|
||||
{
|
||||
auto end = ::thrust::inclusive_scan(
|
||||
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output), bop);
|
||||
auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(input),
|
||||
IteratorEnd(input),
|
||||
IteratorBegin(output),
|
||||
bop);
|
||||
return *(end - 1);
|
||||
}
|
||||
catch (...)
|
||||
@ -535,7 +543,7 @@ private:
|
||||
|
||||
try
|
||||
{
|
||||
::thrust::inclusive_scan_by_key(thrust::cuda::par,
|
||||
::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(keys),
|
||||
IteratorEnd(keys),
|
||||
IteratorBegin(values),
|
||||
@ -585,7 +593,7 @@ private:
|
||||
binary_operator);
|
||||
try
|
||||
{
|
||||
::thrust::exclusive_scan_by_key(thrust::cuda::par,
|
||||
::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(keys),
|
||||
IteratorEnd(keys),
|
||||
IteratorBegin(values),
|
||||
@ -655,7 +663,7 @@ private:
|
||||
try
|
||||
{
|
||||
auto begin = IteratorBegin(values);
|
||||
auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values));
|
||||
auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values));
|
||||
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
|
||||
}
|
||||
catch (...)
|
||||
@ -674,7 +682,7 @@ private:
|
||||
try
|
||||
{
|
||||
auto begin = IteratorBegin(values);
|
||||
auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values), bop);
|
||||
auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values), bop);
|
||||
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
|
||||
}
|
||||
catch (...)
|
||||
@ -691,7 +699,7 @@ private:
|
||||
{
|
||||
try
|
||||
{
|
||||
::thrust::upper_bound(thrust::cuda::par,
|
||||
::thrust::upper_bound(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(input),
|
||||
IteratorEnd(input),
|
||||
IteratorBegin(values),
|
||||
@ -716,7 +724,7 @@ private:
|
||||
binary_compare);
|
||||
try
|
||||
{
|
||||
::thrust::upper_bound(thrust::cuda::par,
|
||||
::thrust::upper_bound(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(input),
|
||||
IteratorEnd(input),
|
||||
IteratorBegin(values),
|
||||
@ -736,7 +744,7 @@ private:
|
||||
{
|
||||
try
|
||||
{
|
||||
::thrust::upper_bound(thrust::cuda::par,
|
||||
::thrust::upper_bound(ThrustCudaPolicyPerThread,
|
||||
IteratorBegin(input),
|
||||
IteratorEnd(input),
|
||||
IteratorBegin(values_output),
|
||||
@ -1147,11 +1155,12 @@ private:
|
||||
VTKM_CONT
|
||||
static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
|
||||
{
|
||||
static bool gridQueryInit = false;
|
||||
static std::atomic<bool> gridQueryInit(false);
|
||||
static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
|
||||
// NOTE: The following code may still be executed by multiple threads
|
||||
// but it should not cause any correctness issues.
|
||||
if (!gridQueryInit)
|
||||
{
|
||||
gridQueryInit = true;
|
||||
int currDevice;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
|
||||
|
||||
@ -1171,10 +1180,14 @@ private:
|
||||
|
||||
vtkm::UInt32* dev_actual_size;
|
||||
VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
|
||||
DetermineProperXGridSize<<<1, 1>>>(maxGridSize[0], dev_actual_size);
|
||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
||||
VTKM_CUDA_CALL(
|
||||
cudaMemcpy(&maxGridSize[0], dev_actual_size, sizeof(vtkm::UInt32), cudaMemcpyDeviceToHost));
|
||||
DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
|
||||
VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
|
||||
dev_actual_size,
|
||||
sizeof(vtkm::UInt32),
|
||||
cudaMemcpyDeviceToHost,
|
||||
cudaStreamPerThread));
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
gridQueryInit = true;
|
||||
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
|
||||
}
|
||||
return maxGridSize;
|
||||
@ -1223,7 +1236,7 @@ public:
|
||||
//handle datasets larger than 2B, we need to execute multiple kernels
|
||||
if (totalBlocks < maxblocksPerLaunch)
|
||||
{
|
||||
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize>>>(
|
||||
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
|
||||
functor, vtkm::Id(0), numInstances);
|
||||
}
|
||||
else
|
||||
@ -1233,7 +1246,7 @@ public:
|
||||
for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
|
||||
numberOfKernelsInvoked += numberOfKernelsToRun)
|
||||
{
|
||||
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize>>>(
|
||||
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
|
||||
functor, numberOfKernelsInvoked, numInstances);
|
||||
}
|
||||
}
|
||||
@ -1242,7 +1255,7 @@ public:
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
//an exception if the previous schedule wrote an error. This would help
|
||||
//cuda to run longer before we hard sync.
|
||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
//check what the value is
|
||||
if (hostErrorPtr[0] != '\0')
|
||||
@ -1302,13 +1315,14 @@ public:
|
||||
dim3 gridSize3d;
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
|
||||
//sync so that we can check the results of the call.
|
||||
//In the future I want move this before the schedule call, and throwing
|
||||
//an exception if the previous schedule wrote an error. This would help
|
||||
//cuda to run longer before we hard sync.
|
||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
//check what the value is
|
||||
if (hostErrorPtr[0] != '\0')
|
||||
|
@ -128,10 +128,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
|
||||
functor, vtkm::Id(0), size);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
@ -157,9 +157,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
|
||||
functor, vtkm::Id(0), size);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
@ -188,9 +189,10 @@ static void compare_3d_dynamic_block_picker(Functor functor,
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
@ -269,9 +271,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
|
||||
}
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
@ -309,9 +312,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
|
||||
dim3 gridSize3d;
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||
ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
|
@ -49,13 +49,15 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
|
||||
{
|
||||
TargetClass* cutarget;
|
||||
VTKM_CUDA_CALL(cudaMalloc(&cutarget, sizeof(TargetClass)));
|
||||
VTKM_CUDA_CALL(cudaMemcpy(cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
|
||||
VTKM_CUDA_CALL(cudaMemcpyAsync(
|
||||
cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||
|
||||
VirtualObject* cuobject;
|
||||
VTKM_CUDA_CALL(cudaMalloc(&cuobject, sizeof(VirtualObject)));
|
||||
detail::CreateKernel<<<1, 1>>>(cuobject, cutarget);
|
||||
detail::CreateKernel<<<1, 1, 0, cudaStreamPerThread>>>(cuobject, cutarget);
|
||||
VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR();
|
||||
VTKM_CUDA_CALL(cudaMemcpy(&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost));
|
||||
VTKM_CUDA_CALL(cudaMemcpyAsync(
|
||||
&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
||||
VTKM_CUDA_CALL(cudaFree(cuobject));
|
||||
|
||||
return cutarget;
|
||||
@ -63,7 +65,8 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
|
||||
|
||||
static void Update(void* deviceState, const void* target)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMemcpy(deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
|
||||
VTKM_CUDA_CALL(cudaMemcpyAsync(
|
||||
deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||
}
|
||||
|
||||
static void Cleanup(void* deviceState) { VTKM_CUDA_CALL(cudaFree(deviceState)); }
|
||||
|
@ -31,6 +31,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
||||
#include <thrust/system/cuda/memory.h>
|
||||
VTKM_THIRDPARTY_POST_INCLUDE
|
||||
|
||||
#define ThrustCudaPolicyPerThread ::thrust::cuda::par.on(cudaStreamPerThread)
|
||||
|
||||
struct vtkm_cuda_policy : thrust::device_execution_policy<vtkm_cuda_policy>
|
||||
{
|
||||
};
|
||||
@ -47,7 +49,7 @@ __host__ __device__ void sort(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
|
||||
{ //sort for concrete pointers and less than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
|
||||
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
|
||||
}
|
||||
|
||||
template <typename T, typename RandomAccessIterator>
|
||||
@ -59,7 +61,8 @@ __host__ __device__ void sort_by_key(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
|
||||
{ //sort for concrete pointers and less than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
|
||||
return thrust::sort_by_key(
|
||||
ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -70,7 +73,7 @@ __host__ __device__ void sort(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
|
||||
{ //sort for concrete pointers and less than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
|
||||
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
|
||||
}
|
||||
|
||||
template <typename T, typename RandomAccessIterator>
|
||||
@ -82,7 +85,8 @@ __host__ __device__ void sort_by_key(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
|
||||
{ //sort for concrete pointers and less than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
|
||||
return thrust::sort_by_key(
|
||||
ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -93,7 +97,7 @@ __host__ __device__ void sort(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
|
||||
{ //sort for concrete pointers and greater than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
|
||||
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
|
||||
}
|
||||
|
||||
template <typename T, typename RandomAccessIterator>
|
||||
@ -105,7 +109,8 @@ __host__ __device__ void sort_by_key(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
|
||||
{ //sort for concrete pointers and greater than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
|
||||
return thrust::sort_by_key(
|
||||
ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -116,7 +121,7 @@ __host__ __device__ void sort(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
|
||||
{ //sort for concrete pointers and greater than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
|
||||
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
|
||||
}
|
||||
|
||||
template <typename T, typename RandomAccessIterator>
|
||||
@ -128,7 +133,8 @@ __host__ __device__ void sort_by_key(
|
||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
|
||||
{ //sort for concrete pointers and greater than op
|
||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
|
||||
return thrust::sort_by_key(
|
||||
ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
|
||||
}
|
||||
|
||||
template <typename RandomAccessIterator, typename StrictWeakOrdering>
|
||||
@ -141,7 +147,7 @@ __host__ __device__ void sort(const vtkm_cuda_policy& exec,
|
||||
//the operator is not an approved less/greater operator.
|
||||
//This most likely will cause thrust to internally determine that
|
||||
//the best sort implementation is merge sort.
|
||||
return thrust::sort(thrust::cuda::par, first, last, comp);
|
||||
return thrust::sort(ThrustCudaPolicyPerThread, first, last, comp);
|
||||
}
|
||||
|
||||
template <typename RandomAccessIteratorKeys,
|
||||
@ -157,7 +163,7 @@ __host__ __device__ void sort_by_key(const vtkm_cuda_policy& exec,
|
||||
//the operator is not an approved less/greater operator.
|
||||
//This most likely will cause thrust to internally determine that
|
||||
//the best sort implementation is merge sort.
|
||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, comp);
|
||||
return thrust::sort_by_key(ThrustCudaPolicyPerThread, first, last, values_first, comp);
|
||||
}
|
||||
|
||||
template <typename T,
|
||||
@ -178,23 +184,24 @@ __host__ __device__::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_ke
|
||||
|
||||
{
|
||||
#if defined(__CUDACC_VER__) && (__CUDACC_VER__ >= 70500) && (__CUDACC_VER__ < 80000)
|
||||
::thrust::pair<OutputIterator1, OutputIterator2> result = thrust::reduce_by_key(thrust::cuda::par,
|
||||
keys_first.get(),
|
||||
keys_last.get(),
|
||||
values_first,
|
||||
keys_output,
|
||||
values_output,
|
||||
binary_pred,
|
||||
binary_op);
|
||||
::thrust::pair<OutputIterator1, OutputIterator2> result =
|
||||
thrust::reduce_by_key(ThrustCudaPolicyPerThread,
|
||||
keys_first.get(),
|
||||
keys_last.get(),
|
||||
values_first,
|
||||
keys_output,
|
||||
values_output,
|
||||
binary_pred,
|
||||
binary_op);
|
||||
|
||||
//only sync if we are being invoked from the host
|
||||
#ifndef __CUDA_ARCH__
|
||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
#endif
|
||||
|
||||
return result;
|
||||
#else
|
||||
return thrust::reduce_by_key(thrust::cuda::par,
|
||||
return thrust::reduce_by_key(ThrustCudaPolicyPerThread,
|
||||
keys_first,
|
||||
keys_last,
|
||||
values_first,
|
||||
|
@ -33,6 +33,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/system/cuda/execution_policy.h>
|
||||
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
|
||||
VTKM_THIRDPARTY_POST_INCLUDE
|
||||
|
||||
namespace vtkm
|
||||
@ -200,7 +201,7 @@ public:
|
||||
|
||||
//Perhaps a direct call to thrust copy should be wrapped in a vtkm
|
||||
//compatble function
|
||||
::thrust::copy(thrust::cuda::par,
|
||||
::thrust::copy(ThrustCudaPolicyPerThread,
|
||||
vtkm::cont::cuda::internal::IteratorBegin(portal),
|
||||
vtkm::cont::cuda::internal::IteratorEnd(portal),
|
||||
thrust::cuda::pointer<ValueType>(beginPointer));
|
||||
|
Loading…
Reference in New Issue
Block a user