Merge topic 'cuda-streams'
72a6cf4a Change cuda calls to use the per-thread stream. Acked-by: Kitware Robot <kwrobot@kitware.com> Merge-request: !887
This commit is contained in:
commit
11fce76732
@ -134,8 +134,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyFromControl(const v
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaMemcpy(
|
VTKM_CUDA_CALL(cudaMemcpyAsync(executionPtr,
|
||||||
executionPtr, controlPtr, static_cast<std::size_t>(numBytes), cudaMemcpyHostToDevice));
|
controlPtr,
|
||||||
|
static_cast<std::size_t>(numBytes),
|
||||||
|
cudaMemcpyHostToDevice,
|
||||||
|
cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
|
|
||||||
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
|
void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const void* executionPtr,
|
||||||
@ -158,8 +161,11 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::CopyToControl(const voi
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaMemcpy(
|
VTKM_CUDA_CALL(cudaMemcpyAsync(controlPtr,
|
||||||
controlPtr, executionPtr, static_cast<std::size_t>(numBytes), cudaMemcpyDeviceToHost));
|
executionPtr,
|
||||||
|
static_cast<std::size_t>(numBytes),
|
||||||
|
cudaMemcpyDeviceToHost,
|
||||||
|
cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
@ -123,7 +123,7 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
|
|||||||
VTKM_CUDA_CALL(
|
VTKM_CUDA_CALL(
|
||||||
cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
|
cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,7 +137,7 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
|
|||||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -151,7 +151,7 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
|
|||||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,7 +165,7 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
|
|||||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -63,7 +63,10 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
|
|||||||
vtkm::cont::DeviceAdapterTagCuda>
|
vtkm::cont::DeviceAdapterTagCuda>
|
||||||
{
|
{
|
||||||
|
|
||||||
VTKM_CONT static void Synchronize() { VTKM_CUDA_CALL(cudaDeviceSynchronize()); }
|
VTKM_CONT static void Synchronize()
|
||||||
|
{
|
||||||
|
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/// CUDA contains its own high resolution timer.
|
/// CUDA contains its own high resolution timer.
|
||||||
@ -86,13 +89,13 @@ public:
|
|||||||
|
|
||||||
VTKM_CONT void Reset()
|
VTKM_CONT void Reset()
|
||||||
{
|
{
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
|
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
|
||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CONT vtkm::Float64 GetElapsedTime()
|
VTKM_CONT vtkm::Float64 GetElapsedTime()
|
||||||
{
|
{
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
|
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
|
||||||
float elapsedTimeMilliseconds;
|
float elapsedTimeMilliseconds;
|
||||||
VTKM_CUDA_CALL(
|
VTKM_CUDA_CALL(
|
||||||
|
@ -63,6 +63,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
|||||||
#include <thrust/system/cuda/execution_policy.h>
|
#include <thrust/system/cuda/execution_policy.h>
|
||||||
VTKM_THIRDPARTY_POST_INCLUDE
|
VTKM_THIRDPARTY_POST_INCLUDE
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
namespace vtkm
|
namespace vtkm
|
||||||
{
|
{
|
||||||
namespace cont
|
namespace cont
|
||||||
@ -201,7 +203,7 @@ private:
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::copy(
|
::thrust::copy(
|
||||||
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
|
ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output));
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
@ -226,7 +228,7 @@ private:
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
auto newLast = ::thrust::copy_if(
|
auto newLast = ::thrust::copy_if(
|
||||||
thrust::cuda::par, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
|
ThrustCudaPolicyPerThread, valuesBegin, valuesEnd, IteratorBegin(stencil), outputBegin, up);
|
||||||
return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
|
return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
@ -255,7 +257,7 @@ private:
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::copy_n(thrust::cuda::par,
|
::thrust::copy_n(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(input) + inputOffset,
|
IteratorBegin(input) + inputOffset,
|
||||||
static_cast<std::size_t>(size),
|
static_cast<std::size_t>(size),
|
||||||
IteratorBegin(output) + outputOffset);
|
IteratorBegin(output) + outputOffset);
|
||||||
@ -295,7 +297,7 @@ private:
|
|||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::lower_bound(thrust::cuda::par,
|
::thrust::lower_bound(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(input),
|
IteratorBegin(input),
|
||||||
IteratorEnd(input),
|
IteratorEnd(input),
|
||||||
IteratorBegin(values),
|
IteratorBegin(values),
|
||||||
@ -337,7 +339,7 @@ private:
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
return ::thrust::reduce(
|
return ::thrust::reduce(
|
||||||
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
|
ThrustCudaPolicyPerThread, IteratorBegin(input), IteratorEnd(input), initialValue, bop);
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
@ -364,8 +366,11 @@ private:
|
|||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
return ::thrust::reduce(
|
return ::thrust::reduce(ThrustCudaPolicyPerThread,
|
||||||
thrust::cuda::par, IteratorBegin(castPortal), IteratorEnd(castPortal), initialValue, bop);
|
IteratorBegin(castPortal),
|
||||||
|
IteratorEnd(castPortal),
|
||||||
|
initialValue,
|
||||||
|
bop);
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
@ -447,11 +452,11 @@ private:
|
|||||||
//store the current value of the last position array in a separate cuda
|
//store the current value of the last position array in a separate cuda
|
||||||
//memory location since the exclusive_scan will overwrite that value
|
//memory location since the exclusive_scan will overwrite that value
|
||||||
//once run
|
//once run
|
||||||
::thrust::copy_n(thrust::cuda::par, IteratorEnd(input) - 1, 1, sum.begin());
|
::thrust::copy_n(ThrustCudaPolicyPerThread, IteratorEnd(input) - 1, 1, sum.begin());
|
||||||
|
|
||||||
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
|
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);
|
||||||
|
|
||||||
auto end = ::thrust::exclusive_scan(thrust::cuda::par,
|
auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(input),
|
IteratorBegin(input),
|
||||||
IteratorEnd(input),
|
IteratorEnd(input),
|
||||||
IteratorBegin(output),
|
IteratorBegin(output),
|
||||||
@ -461,10 +466,10 @@ private:
|
|||||||
//Store the new value for the end of the array. This is done because
|
//Store the new value for the end of the array. This is done because
|
||||||
//with items such as the transpose array it is unsafe to pass the
|
//with items such as the transpose array it is unsafe to pass the
|
||||||
//portal to the SumExclusiveScan
|
//portal to the SumExclusiveScan
|
||||||
::thrust::copy_n(thrust::cuda::par, (end - 1), 1, sum.begin() + 1);
|
::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);
|
||||||
|
|
||||||
//execute the binaryOp one last time on the device.
|
//execute the binaryOp one last time on the device.
|
||||||
SumExclusiveScan<<<1, 1>>>(sum[0], sum[1], sum[2], bop);
|
SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(sum[0], sum[1], sum[2], bop);
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
@ -491,8 +496,11 @@ private:
|
|||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
auto end = ::thrust::inclusive_scan(
|
auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
|
||||||
thrust::cuda::par, IteratorBegin(input), IteratorEnd(input), IteratorBegin(output), bop);
|
IteratorBegin(input),
|
||||||
|
IteratorEnd(input),
|
||||||
|
IteratorBegin(output),
|
||||||
|
bop);
|
||||||
return *(end - 1);
|
return *(end - 1);
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
@ -535,7 +543,7 @@ private:
|
|||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::inclusive_scan_by_key(thrust::cuda::par,
|
::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(keys),
|
IteratorBegin(keys),
|
||||||
IteratorEnd(keys),
|
IteratorEnd(keys),
|
||||||
IteratorBegin(values),
|
IteratorBegin(values),
|
||||||
@ -585,7 +593,7 @@ private:
|
|||||||
binary_operator);
|
binary_operator);
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::exclusive_scan_by_key(thrust::cuda::par,
|
::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(keys),
|
IteratorBegin(keys),
|
||||||
IteratorEnd(keys),
|
IteratorEnd(keys),
|
||||||
IteratorBegin(values),
|
IteratorBegin(values),
|
||||||
@ -655,7 +663,7 @@ private:
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
auto begin = IteratorBegin(values);
|
auto begin = IteratorBegin(values);
|
||||||
auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values));
|
auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values));
|
||||||
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
|
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
@ -674,7 +682,7 @@ private:
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
auto begin = IteratorBegin(values);
|
auto begin = IteratorBegin(values);
|
||||||
auto newLast = ::thrust::unique(thrust::cuda::par, begin, IteratorEnd(values), bop);
|
auto newLast = ::thrust::unique(ThrustCudaPolicyPerThread, begin, IteratorEnd(values), bop);
|
||||||
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
|
return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
@ -691,7 +699,7 @@ private:
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::upper_bound(thrust::cuda::par,
|
::thrust::upper_bound(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(input),
|
IteratorBegin(input),
|
||||||
IteratorEnd(input),
|
IteratorEnd(input),
|
||||||
IteratorBegin(values),
|
IteratorBegin(values),
|
||||||
@ -716,7 +724,7 @@ private:
|
|||||||
binary_compare);
|
binary_compare);
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::upper_bound(thrust::cuda::par,
|
::thrust::upper_bound(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(input),
|
IteratorBegin(input),
|
||||||
IteratorEnd(input),
|
IteratorEnd(input),
|
||||||
IteratorBegin(values),
|
IteratorBegin(values),
|
||||||
@ -736,7 +744,7 @@ private:
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
::thrust::upper_bound(thrust::cuda::par,
|
::thrust::upper_bound(ThrustCudaPolicyPerThread,
|
||||||
IteratorBegin(input),
|
IteratorBegin(input),
|
||||||
IteratorEnd(input),
|
IteratorEnd(input),
|
||||||
IteratorBegin(values_output),
|
IteratorBegin(values_output),
|
||||||
@ -1147,11 +1155,12 @@ private:
|
|||||||
VTKM_CONT
|
VTKM_CONT
|
||||||
static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
|
static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
|
||||||
{
|
{
|
||||||
static bool gridQueryInit = false;
|
static std::atomic<bool> gridQueryInit(false);
|
||||||
static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
|
static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
|
||||||
|
// NOTE: The following code may still be executed by multiple threads
|
||||||
|
// but it should not cause any correctness issues.
|
||||||
if (!gridQueryInit)
|
if (!gridQueryInit)
|
||||||
{
|
{
|
||||||
gridQueryInit = true;
|
|
||||||
int currDevice;
|
int currDevice;
|
||||||
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
|
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
|
||||||
|
|
||||||
@ -1171,10 +1180,14 @@ private:
|
|||||||
|
|
||||||
vtkm::UInt32* dev_actual_size;
|
vtkm::UInt32* dev_actual_size;
|
||||||
VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
|
VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
|
||||||
DetermineProperXGridSize<<<1, 1>>>(maxGridSize[0], dev_actual_size);
|
DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
|
||||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
|
||||||
VTKM_CUDA_CALL(
|
dev_actual_size,
|
||||||
cudaMemcpy(&maxGridSize[0], dev_actual_size, sizeof(vtkm::UInt32), cudaMemcpyDeviceToHost));
|
sizeof(vtkm::UInt32),
|
||||||
|
cudaMemcpyDeviceToHost,
|
||||||
|
cudaStreamPerThread));
|
||||||
|
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||||
|
gridQueryInit = true;
|
||||||
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
|
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
|
||||||
}
|
}
|
||||||
return maxGridSize;
|
return maxGridSize;
|
||||||
@ -1223,7 +1236,7 @@ public:
|
|||||||
//handle datasets larger than 2B, we need to execute multiple kernels
|
//handle datasets larger than 2B, we need to execute multiple kernels
|
||||||
if (totalBlocks < maxblocksPerLaunch)
|
if (totalBlocks < maxblocksPerLaunch)
|
||||||
{
|
{
|
||||||
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize>>>(
|
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
|
||||||
functor, vtkm::Id(0), numInstances);
|
functor, vtkm::Id(0), numInstances);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -1233,7 +1246,7 @@ public:
|
|||||||
for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
|
for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
|
||||||
numberOfKernelsInvoked += numberOfKernelsToRun)
|
numberOfKernelsInvoked += numberOfKernelsToRun)
|
||||||
{
|
{
|
||||||
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize>>>(
|
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
|
||||||
functor, numberOfKernelsInvoked, numInstances);
|
functor, numberOfKernelsInvoked, numInstances);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1242,7 +1255,7 @@ public:
|
|||||||
//In the future I want move this before the schedule call, and throwing
|
//In the future I want move this before the schedule call, and throwing
|
||||||
//an exception if the previous schedule wrote an error. This would help
|
//an exception if the previous schedule wrote an error. This would help
|
||||||
//cuda to run longer before we hard sync.
|
//cuda to run longer before we hard sync.
|
||||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||||
|
|
||||||
//check what the value is
|
//check what the value is
|
||||||
if (hostErrorPtr[0] != '\0')
|
if (hostErrorPtr[0] != '\0')
|
||||||
@ -1302,13 +1315,14 @@ public:
|
|||||||
dim3 gridSize3d;
|
dim3 gridSize3d;
|
||||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||||
|
|
||||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||||
|
ranges);
|
||||||
|
|
||||||
//sync so that we can check the results of the call.
|
//sync so that we can check the results of the call.
|
||||||
//In the future I want move this before the schedule call, and throwing
|
//In the future I want move this before the schedule call, and throwing
|
||||||
//an exception if the previous schedule wrote an error. This would help
|
//an exception if the previous schedule wrote an error. This would help
|
||||||
//cuda to run longer before we hard sync.
|
//cuda to run longer before we hard sync.
|
||||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||||
|
|
||||||
//check what the value is
|
//check what the value is
|
||||||
if (hostErrorPtr[0] != '\0')
|
if (hostErrorPtr[0] != '\0')
|
||||||
|
@ -128,10 +128,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
|
|||||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||||
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
|
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
|
||||||
functor, vtkm::Id(0), size);
|
functor, vtkm::Id(0), size);
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||||
float elapsedTimeMilliseconds;
|
float elapsedTimeMilliseconds;
|
||||||
@ -157,9 +157,10 @@ static void compare_1d_dynamic_block_picker(Functor functor,
|
|||||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||||
|
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||||
Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
|
Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
functor, vtkm::Id(0), size);
|
||||||
|
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||||
float elapsedTimeMilliseconds;
|
float elapsedTimeMilliseconds;
|
||||||
@ -188,9 +189,10 @@ static void compare_3d_dynamic_block_picker(Functor functor,
|
|||||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
ranges);
|
||||||
|
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||||
float elapsedTimeMilliseconds;
|
float elapsedTimeMilliseconds;
|
||||||
@ -269,9 +271,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
|
|||||||
}
|
}
|
||||||
|
|
||||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
ranges);
|
||||||
|
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||||
float elapsedTimeMilliseconds;
|
float elapsedTimeMilliseconds;
|
||||||
@ -309,9 +312,10 @@ static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeM
|
|||||||
dim3 gridSize3d;
|
dim3 gridSize3d;
|
||||||
|
|
||||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
ranges);
|
||||||
|
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||||
float elapsedTimeMilliseconds;
|
float elapsedTimeMilliseconds;
|
||||||
|
@ -49,13 +49,15 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
|
|||||||
{
|
{
|
||||||
TargetClass* cutarget;
|
TargetClass* cutarget;
|
||||||
VTKM_CUDA_CALL(cudaMalloc(&cutarget, sizeof(TargetClass)));
|
VTKM_CUDA_CALL(cudaMalloc(&cutarget, sizeof(TargetClass)));
|
||||||
VTKM_CUDA_CALL(cudaMemcpy(cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
|
VTKM_CUDA_CALL(cudaMemcpyAsync(
|
||||||
|
cutarget, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||||
|
|
||||||
VirtualObject* cuobject;
|
VirtualObject* cuobject;
|
||||||
VTKM_CUDA_CALL(cudaMalloc(&cuobject, sizeof(VirtualObject)));
|
VTKM_CUDA_CALL(cudaMalloc(&cuobject, sizeof(VirtualObject)));
|
||||||
detail::CreateKernel<<<1, 1>>>(cuobject, cutarget);
|
detail::CreateKernel<<<1, 1, 0, cudaStreamPerThread>>>(cuobject, cutarget);
|
||||||
VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR();
|
VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR();
|
||||||
VTKM_CUDA_CALL(cudaMemcpy(&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost));
|
VTKM_CUDA_CALL(cudaMemcpyAsync(
|
||||||
|
&object, cuobject, sizeof(VirtualObject), cudaMemcpyDeviceToHost, cudaStreamPerThread));
|
||||||
VTKM_CUDA_CALL(cudaFree(cuobject));
|
VTKM_CUDA_CALL(cudaFree(cuobject));
|
||||||
|
|
||||||
return cutarget;
|
return cutarget;
|
||||||
@ -63,7 +65,8 @@ struct VirtualObjectTransfer<VirtualObject, TargetClass, vtkm::cont::DeviceAdapt
|
|||||||
|
|
||||||
static void Update(void* deviceState, const void* target)
|
static void Update(void* deviceState, const void* target)
|
||||||
{
|
{
|
||||||
VTKM_CUDA_CALL(cudaMemcpy(deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice));
|
VTKM_CUDA_CALL(cudaMemcpyAsync(
|
||||||
|
deviceState, target, sizeof(TargetClass), cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void Cleanup(void* deviceState) { VTKM_CUDA_CALL(cudaFree(deviceState)); }
|
static void Cleanup(void* deviceState) { VTKM_CUDA_CALL(cudaFree(deviceState)); }
|
||||||
|
@ -31,6 +31,8 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
|||||||
#include <thrust/system/cuda/memory.h>
|
#include <thrust/system/cuda/memory.h>
|
||||||
VTKM_THIRDPARTY_POST_INCLUDE
|
VTKM_THIRDPARTY_POST_INCLUDE
|
||||||
|
|
||||||
|
#define ThrustCudaPolicyPerThread ::thrust::cuda::par.on(cudaStreamPerThread)
|
||||||
|
|
||||||
struct vtkm_cuda_policy : thrust::device_execution_policy<vtkm_cuda_policy>
|
struct vtkm_cuda_policy : thrust::device_execution_policy<vtkm_cuda_policy>
|
||||||
{
|
{
|
||||||
};
|
};
|
||||||
@ -47,7 +49,7 @@ __host__ __device__ void sort(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
|
||||||
{ //sort for concrete pointers and less than op
|
{ //sort for concrete pointers and less than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
|
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename RandomAccessIterator>
|
template <typename T, typename RandomAccessIterator>
|
||||||
@ -59,7 +61,8 @@ __host__ __device__ void sort_by_key(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
|
||||||
{ //sort for concrete pointers and less than op
|
{ //sort for concrete pointers and less than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
|
return thrust::sort_by_key(
|
||||||
|
ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
@ -70,7 +73,7 @@ __host__ __device__ void sort(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
|
||||||
{ //sort for concrete pointers and less than op
|
{ //sort for concrete pointers and less than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort(thrust::cuda::par, first, last, thrust::less<T>());
|
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename RandomAccessIterator>
|
template <typename T, typename RandomAccessIterator>
|
||||||
@ -82,7 +85,8 @@ __host__ __device__ void sort_by_key(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
|
||||||
{ //sort for concrete pointers and less than op
|
{ //sort for concrete pointers and less than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::less<T>());
|
return thrust::sort_by_key(
|
||||||
|
ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
@ -93,7 +97,7 @@ __host__ __device__ void sort(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
|
||||||
{ //sort for concrete pointers and greater than op
|
{ //sort for concrete pointers and greater than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
|
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename RandomAccessIterator>
|
template <typename T, typename RandomAccessIterator>
|
||||||
@ -105,7 +109,8 @@ __host__ __device__ void sort_by_key(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
|
||||||
{ //sort for concrete pointers and greater than op
|
{ //sort for concrete pointers and greater than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
|
return thrust::sort_by_key(
|
||||||
|
ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
@ -116,7 +121,7 @@ __host__ __device__ void sort(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
|
||||||
{ //sort for concrete pointers and greater than op
|
{ //sort for concrete pointers and greater than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort(thrust::cuda::par, first, last, thrust::greater<T>());
|
return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename RandomAccessIterator>
|
template <typename T, typename RandomAccessIterator>
|
||||||
@ -128,7 +133,8 @@ __host__ __device__ void sort_by_key(
|
|||||||
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
|
||||||
{ //sort for concrete pointers and greater than op
|
{ //sort for concrete pointers and greater than op
|
||||||
//this makes sure that we invoke the thrust radix sort and not merge sort
|
//this makes sure that we invoke the thrust radix sort and not merge sort
|
||||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, thrust::greater<T>());
|
return thrust::sort_by_key(
|
||||||
|
ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename RandomAccessIterator, typename StrictWeakOrdering>
|
template <typename RandomAccessIterator, typename StrictWeakOrdering>
|
||||||
@ -141,7 +147,7 @@ __host__ __device__ void sort(const vtkm_cuda_policy& exec,
|
|||||||
//the operator is not an approved less/greater operator.
|
//the operator is not an approved less/greater operator.
|
||||||
//This most likely will cause thrust to internally determine that
|
//This most likely will cause thrust to internally determine that
|
||||||
//the best sort implementation is merge sort.
|
//the best sort implementation is merge sort.
|
||||||
return thrust::sort(thrust::cuda::par, first, last, comp);
|
return thrust::sort(ThrustCudaPolicyPerThread, first, last, comp);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename RandomAccessIteratorKeys,
|
template <typename RandomAccessIteratorKeys,
|
||||||
@ -157,7 +163,7 @@ __host__ __device__ void sort_by_key(const vtkm_cuda_policy& exec,
|
|||||||
//the operator is not an approved less/greater operator.
|
//the operator is not an approved less/greater operator.
|
||||||
//This most likely will cause thrust to internally determine that
|
//This most likely will cause thrust to internally determine that
|
||||||
//the best sort implementation is merge sort.
|
//the best sort implementation is merge sort.
|
||||||
return thrust::sort_by_key(thrust::cuda::par, first, last, values_first, comp);
|
return thrust::sort_by_key(ThrustCudaPolicyPerThread, first, last, values_first, comp);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T,
|
template <typename T,
|
||||||
@ -178,23 +184,24 @@ __host__ __device__::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_ke
|
|||||||
|
|
||||||
{
|
{
|
||||||
#if defined(__CUDACC_VER__) && (__CUDACC_VER__ >= 70500) && (__CUDACC_VER__ < 80000)
|
#if defined(__CUDACC_VER__) && (__CUDACC_VER__ >= 70500) && (__CUDACC_VER__ < 80000)
|
||||||
::thrust::pair<OutputIterator1, OutputIterator2> result = thrust::reduce_by_key(thrust::cuda::par,
|
::thrust::pair<OutputIterator1, OutputIterator2> result =
|
||||||
keys_first.get(),
|
thrust::reduce_by_key(ThrustCudaPolicyPerThread,
|
||||||
keys_last.get(),
|
keys_first.get(),
|
||||||
values_first,
|
keys_last.get(),
|
||||||
keys_output,
|
values_first,
|
||||||
values_output,
|
keys_output,
|
||||||
binary_pred,
|
values_output,
|
||||||
binary_op);
|
binary_pred,
|
||||||
|
binary_op);
|
||||||
|
|
||||||
//only sync if we are being invoked from the host
|
//only sync if we are being invoked from the host
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef __CUDA_ARCH__
|
||||||
VTKM_CUDA_CALL(cudaDeviceSynchronize());
|
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
return thrust::reduce_by_key(thrust::cuda::par,
|
return thrust::reduce_by_key(ThrustCudaPolicyPerThread,
|
||||||
keys_first,
|
keys_first,
|
||||||
keys_last,
|
keys_last,
|
||||||
values_first,
|
values_first,
|
||||||
|
@ -33,6 +33,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
|||||||
#include <thrust/copy.h>
|
#include <thrust/copy.h>
|
||||||
#include <thrust/device_ptr.h>
|
#include <thrust/device_ptr.h>
|
||||||
#include <thrust/system/cuda/execution_policy.h>
|
#include <thrust/system/cuda/execution_policy.h>
|
||||||
|
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
|
||||||
VTKM_THIRDPARTY_POST_INCLUDE
|
VTKM_THIRDPARTY_POST_INCLUDE
|
||||||
|
|
||||||
namespace vtkm
|
namespace vtkm
|
||||||
@ -200,7 +201,7 @@ public:
|
|||||||
|
|
||||||
//Perhaps a direct call to thrust copy should be wrapped in a vtkm
|
//Perhaps a direct call to thrust copy should be wrapped in a vtkm
|
||||||
//compatble function
|
//compatble function
|
||||||
::thrust::copy(thrust::cuda::par,
|
::thrust::copy(ThrustCudaPolicyPerThread,
|
||||||
vtkm::cont::cuda::internal::IteratorBegin(portal),
|
vtkm::cont::cuda::internal::IteratorBegin(portal),
|
||||||
vtkm::cont::cuda::internal::IteratorEnd(portal),
|
vtkm::cont::cuda::internal::IteratorEnd(portal),
|
||||||
thrust::cuda::pointer<ValueType>(beginPointer));
|
thrust::cuda::pointer<ValueType>(beginPointer));
|
||||||
|
Loading…
Reference in New Issue
Block a user