mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-20 02:55:47 +00:00
1300 lines
46 KiB
C++
1300 lines
46 KiB
C++
|
|
//============================================================================
|
|
// Copyright (c) Kitware, Inc.
|
|
// All rights reserved.
|
|
// See LICENSE.txt for details.
|
|
// This software is distributed WITHOUT ANY WARRANTY; without even
|
|
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
// PURPOSE. See the above copyright notice for more information.
|
|
//
|
|
// Copyright 2014 Sandia Corporation.
|
|
// Copyright 2014 UT-Battelle, LLC.
|
|
// Copyright 2014 Los Alamos National Security.
|
|
//
|
|
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
|
// the U.S. Government retains certain rights in this software.
|
|
//
|
|
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
|
// Laboratory (LANL), the U.S. Government retains certain rights in
|
|
// this software.
|
|
//============================================================================
|
|
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
|
|
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
|
|
|
|
#include <vtkm/cont/ArrayHandle.h>
|
|
#include <vtkm/cont/ErrorExecution.h>
|
|
#include <vtkm/Types.h>
|
|
#include <vtkm/TypeTraits.h>
|
|
#include <vtkm/UnaryPredicates.h>
|
|
|
|
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
|
|
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
|
|
|
|
#include <vtkm/exec/cuda/internal/WrappedOperators.h>
|
|
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
|
|
#include <vtkm/exec/internal/WorkletInvokeFunctor.h>
|
|
|
|
// Disable warnings we check vtkm for but Thrust does not.
|
|
VTKM_THIRDPARTY_PRE_INCLUDE
|
|
//our own custom thrust execution policy
|
|
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
|
|
#include <thrust/advance.h>
|
|
#include <thrust/binary_search.h>
|
|
#include <thrust/copy.h>
|
|
#include <thrust/count.h>
|
|
#include <thrust/scan.h>
|
|
#include <thrust/sort.h>
|
|
#include <thrust/unique.h>
|
|
#include <thrust/system/cuda/vector.h>
|
|
|
|
#include <thrust/iterator/counting_iterator.h>
|
|
#include <thrust/system/cuda/execution_policy.h>
|
|
VTKM_THIRDPARTY_POST_INCLUDE
|
|
|
|
namespace vtkm {
|
|
namespace cont {
|
|
namespace cuda {
|
|
namespace internal {
|
|
|
|
static
|
|
__global__
|
|
void DetermineProperXGridSize(vtkm::UInt32 desired_size,
|
|
vtkm::UInt32* actual_size)
|
|
{
|
|
//used only to see if we can launch kernels with a x grid size that
|
|
//matches the max of the graphics card, or are we having to fall back
|
|
//to SM_2 grid sizes
|
|
if(blockIdx.x != 0)
|
|
{
|
|
return;
|
|
}
|
|
#if __CUDA_ARCH__ <= 200
|
|
const vtkm::UInt32 maxXGridSizeForSM2 = 65535;
|
|
*actual_size = maxXGridSizeForSM2;
|
|
#else
|
|
*actual_size = desired_size;
|
|
#endif
|
|
}
|
|
|
|
template<class FunctorType>
|
|
__global__
|
|
void Schedule1DIndexKernel(FunctorType functor,
|
|
vtkm::Id numberOfKernelsInvoked,
|
|
vtkm::Id length)
|
|
{
|
|
//Note a cuda launch can only handle at most 2B iterations of a kernel
|
|
//because it holds all of the indexes inside UInt32, so for use to
|
|
//handle datasets larger than 2B, we need to execute multiple kernels
|
|
const vtkm::Id index = numberOfKernelsInvoked +
|
|
static_cast<vtkm::Id>(blockDim.x * blockIdx.x + threadIdx.x);
|
|
if(index < length)
|
|
{
|
|
functor(index);
|
|
}
|
|
}
|
|
|
|
template<class FunctorType>
|
|
__global__
|
|
void Schedule3DIndexKernel(FunctorType functor, dim3 size)
|
|
{
|
|
const vtkm::Id3 index(
|
|
blockIdx.x*blockDim.x + threadIdx.x,
|
|
blockIdx.y*blockDim.y + threadIdx.y,
|
|
blockIdx.z*blockDim.z + threadIdx.z
|
|
);
|
|
if (index[0] >= size.x || index[1] >= size.y || index[2] >= size.z)
|
|
{
|
|
return;
|
|
}
|
|
functor( index );
|
|
}
|
|
|
|
template<typename T, typename BinaryOperationType >
|
|
__global__
|
|
void SumExclusiveScan(T a, T b, T result,
|
|
BinaryOperationType binary_op)
|
|
{
|
|
result = binary_op(a,b);
|
|
}
|
|
|
|
inline
|
|
void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d)
|
|
{
|
|
gridSize3d.x = (rangeMax.x % blockSize3d.x != 0) ? (rangeMax.x / blockSize3d.x + 1) : (rangeMax.x / blockSize3d.x);
|
|
gridSize3d.y = (rangeMax.y % blockSize3d.y != 0) ? (rangeMax.y / blockSize3d.y + 1) : (rangeMax.y / blockSize3d.y);
|
|
gridSize3d.z = (rangeMax.z % blockSize3d.z != 0) ? (rangeMax.z / blockSize3d.z + 1) : (rangeMax.z / blockSize3d.z);
|
|
}
|
|
|
|
#ifdef ANALYZE_VTKM_SCHEDULER
|
|
class PerfRecord
|
|
{
|
|
public:
|
|
|
|
PerfRecord(float elapsedT, dim3 block ):
|
|
elapsedTime(elapsedT),
|
|
blockSize(block)
|
|
{
|
|
|
|
}
|
|
|
|
bool operator<(const PerfRecord& other) const
|
|
{ return elapsedTime < other.elapsedTime; }
|
|
|
|
float elapsedTime;
|
|
dim3 blockSize;
|
|
};
|
|
|
|
template<class Functor>
|
|
static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& rangeMax)
|
|
{
|
|
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
|
static_cast<vtkm::UInt32>(rangeMax[1]),
|
|
static_cast<vtkm::UInt32>(rangeMax[2]) );
|
|
std::vector< PerfRecord > results;
|
|
vtkm::UInt32 indexTable[16] = {1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024};
|
|
|
|
for(vtkm::UInt32 i=0; i < 16; i++)
|
|
{
|
|
for(vtkm::UInt32 j=0; j < 16; j++)
|
|
{
|
|
for(vtkm::UInt32 k=0; k < 16; k++)
|
|
{
|
|
cudaEvent_t start, stop;
|
|
cudaEventCreate(&start);
|
|
cudaEventCreate(&stop);
|
|
|
|
dim3 blockSize3d(indexTable[i],indexTable[j],indexTable[k]);
|
|
dim3 gridSize3d;
|
|
|
|
if( (blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
|
|
(blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 ||
|
|
blockSize3d.z >= 64)
|
|
{
|
|
//cuda can't handle more than 1024 threads per block
|
|
//so don't try if we compute higher than that
|
|
|
|
//also don't try stupidly low numbers
|
|
|
|
//cuda can't handle more than 64 threads in the z direction
|
|
continue;
|
|
}
|
|
|
|
compute_block_size(ranges, blockSize3d, gridSize3d);
|
|
cudaEventRecord(start, 0);
|
|
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
|
|
cudaEventRecord(stop, 0);
|
|
|
|
cudaEventSynchronize(stop);
|
|
float elapsedTimeMilliseconds;
|
|
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
|
|
|
|
cudaEventDestroy(start);
|
|
cudaEventDestroy(stop);
|
|
|
|
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
|
|
results.push_back( record );
|
|
}
|
|
}
|
|
}
|
|
|
|
std::sort(results.begin(), results.end());
|
|
const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
|
|
for(vtkm::Int64 i=1; i <= size; i++)
|
|
{
|
|
vtkm::UInt64 index = static_cast<vtkm::UInt64>(size-i);
|
|
vtkm::UInt32 x = results[index].blockSize.x;
|
|
vtkm::UInt32 y = results[index].blockSize.y;
|
|
vtkm::UInt32 z = results[index].blockSize.z;
|
|
float t = results[index].elapsedTime;
|
|
|
|
std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
|
|
}
|
|
|
|
std::cout << "flat array performance " << std::endl;
|
|
{
|
|
cudaEvent_t start, stop;
|
|
cudaEventCreate(&start);
|
|
cudaEventCreate(&stop);
|
|
|
|
cudaEventRecord(start, 0);
|
|
typedef
|
|
vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
|
|
vtkm::cont::DeviceAdapterTagCuda > Algorithm;
|
|
Algorithm::Schedule(functor, numInstances);
|
|
cudaEventRecord(stop, 0);
|
|
|
|
cudaEventSynchronize(stop);
|
|
float elapsedTimeMilliseconds;
|
|
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
|
|
|
|
cudaEventDestroy(start);
|
|
cudaEventDestroy(stop);
|
|
|
|
std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
|
|
}
|
|
|
|
std::cout << "fixed 3d block size performance " << std::endl;
|
|
{
|
|
cudaEvent_t start, stop;
|
|
cudaEventCreate(&start);
|
|
cudaEventCreate(&stop);
|
|
|
|
dim3 blockSize3d(64,2,1);
|
|
dim3 gridSize3d;
|
|
|
|
compute_block_size(ranges, blockSize3d, gridSize3d);
|
|
cudaEventRecord(start, 0);
|
|
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
|
|
cudaEventRecord(stop, 0);
|
|
|
|
cudaEventSynchronize(stop);
|
|
float elapsedTimeMilliseconds;
|
|
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
|
|
|
|
cudaEventDestroy(start);
|
|
cudaEventDestroy(stop);
|
|
|
|
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
|
|
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
/// This class can be subclassed to implement the DeviceAdapterAlgorithm for a
|
|
/// device that uses thrust as its implementation. The subclass should pass in
|
|
/// the correct device adapter tag as the template parameter.
|
|
///
|
|
template<class DeviceAdapterTag>
|
|
struct DeviceAdapterAlgorithmThrust
|
|
{
|
|
// Because of some funny code conversions in nvcc, kernels for devices have to
|
|
// be public.
|
|
#ifndef VTKM_CUDA
|
|
private:
|
|
#endif
|
|
template<class InputPortal, class OutputPortal>
|
|
VTKM_CONT_EXPORT static void CopyPortal(const InputPortal &input,
|
|
const OutputPortal &output)
|
|
{
|
|
try
|
|
{
|
|
::thrust::copy(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
IteratorBegin(output));
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
}
|
|
|
|
template<class InputPortal, class ValuesPortal, class OutputPortal>
|
|
VTKM_CONT_EXPORT static void LowerBoundsPortal(const InputPortal &input,
|
|
const ValuesPortal &values,
|
|
const OutputPortal &output)
|
|
{
|
|
typedef typename ValuesPortal::ValueType ValueType;
|
|
LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>() );
|
|
}
|
|
|
|
template<class InputPortal, class OutputPortal>
|
|
VTKM_CONT_EXPORT static
|
|
void LowerBoundsPortal(const InputPortal &input,
|
|
const OutputPortal &values_output)
|
|
{
|
|
typedef typename InputPortal::ValueType ValueType;
|
|
LowerBoundsPortal(input, values_output, values_output,
|
|
::thrust::less<ValueType>() );
|
|
}
|
|
|
|
template<class InputPortal, class ValuesPortal, class OutputPortal,
|
|
class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void LowerBoundsPortal(const InputPortal &input,
|
|
const ValuesPortal &values,
|
|
const OutputPortal &output,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
typedef typename InputPortal::ValueType ValueType;
|
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
|
|
BinaryCompare> bop(binary_compare);
|
|
|
|
try
|
|
{
|
|
::thrust::lower_bound(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
IteratorBegin(values),
|
|
IteratorEnd(values),
|
|
IteratorBegin(output),
|
|
bop);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
}
|
|
|
|
template<class InputPortal>
|
|
VTKM_CONT_EXPORT static
|
|
typename InputPortal::ValueType ReducePortal(const InputPortal &input,
|
|
typename InputPortal::ValueType initialValue)
|
|
{
|
|
typedef typename InputPortal::ValueType ValueType;
|
|
return ReducePortal(input,
|
|
initialValue,
|
|
::thrust::plus<ValueType>());
|
|
}
|
|
|
|
template<class InputPortal, class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static
|
|
typename InputPortal::ValueType ReducePortal(const InputPortal &input,
|
|
typename InputPortal::ValueType initialValue,
|
|
BinaryFunctor binary_functor)
|
|
{
|
|
typedef typename InputPortal::ValueType ValueType;
|
|
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
|
|
BinaryFunctor> bop(binary_functor);
|
|
|
|
try
|
|
{
|
|
return ::thrust::reduce(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
initialValue,
|
|
bop);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
|
|
return typename InputPortal::ValueType();
|
|
}
|
|
|
|
template<class KeysPortal, class ValuesPortal,
|
|
class KeysOutputPortal, class ValueOutputPortal,
|
|
class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static
|
|
vtkm::Id ReduceByKeyPortal(const KeysPortal &keys,
|
|
const ValuesPortal& values,
|
|
const KeysOutputPortal &keys_output,
|
|
const ValueOutputPortal &values_output,
|
|
BinaryFunctor binary_functor)
|
|
{
|
|
typedef typename detail::IteratorTraits<KeysOutputPortal>::IteratorType
|
|
KeysIteratorType;
|
|
typedef typename detail::IteratorTraits<ValueOutputPortal>::IteratorType
|
|
ValuesIteratorType;
|
|
|
|
KeysIteratorType keys_out_begin = IteratorBegin(keys_output);
|
|
ValuesIteratorType values_out_begin = IteratorBegin(values_output);
|
|
|
|
::thrust::pair< KeysIteratorType, ValuesIteratorType > result_iterators;
|
|
|
|
::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
|
|
|
|
typedef typename ValuesPortal::ValueType ValueType;
|
|
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
|
|
BinaryFunctor> bop(binary_functor);
|
|
|
|
try
|
|
{
|
|
result_iterators = ::thrust::reduce_by_key(thrust::cuda::par,
|
|
IteratorBegin(keys),
|
|
IteratorEnd(keys),
|
|
IteratorBegin(values),
|
|
keys_out_begin,
|
|
values_out_begin,
|
|
binaryPredicate,
|
|
bop);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
|
|
return static_cast<vtkm::Id>( ::thrust::distance(keys_out_begin,
|
|
result_iterators.first) );
|
|
}
|
|
|
|
template<class InputPortal, class OutputPortal>
|
|
VTKM_CONT_EXPORT static
|
|
typename InputPortal::ValueType ScanExclusivePortal(const InputPortal &input,
|
|
const OutputPortal &output)
|
|
{
|
|
typedef typename OutputPortal::ValueType ValueType;
|
|
|
|
return ScanExclusivePortal(input,
|
|
output,
|
|
(::thrust::plus<ValueType>()),
|
|
vtkm::TypeTraits<ValueType>::ZeroInitialization());
|
|
|
|
}
|
|
|
|
template<class InputPortal, class OutputPortal, class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static
|
|
typename InputPortal::ValueType ScanExclusivePortal(const InputPortal &input,
|
|
const OutputPortal &output,
|
|
BinaryFunctor binaryOp,
|
|
typename InputPortal::ValueType initialValue)
|
|
{
|
|
// Use iterator to get value so that thrust device_ptr has chance to handle
|
|
// data on device.
|
|
typedef typename OutputPortal::ValueType ValueType;
|
|
|
|
//we have size three so that we can store the origin end value, the
|
|
//new end value, and the sum of those two
|
|
::thrust::system::cuda::vector< ValueType > sum(3);
|
|
try
|
|
{
|
|
|
|
//store the current value of the last position array in a separate cuda
|
|
//memory location since the exclusive_scan will overwrite that value
|
|
//once run
|
|
::thrust::copy_n(thrust::cuda::par,
|
|
IteratorEnd(input) - 1, 1, sum.begin());
|
|
|
|
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
|
|
BinaryFunctor> bop(binaryOp);
|
|
|
|
typedef typename detail::IteratorTraits<OutputPortal>::IteratorType
|
|
IteratorType;
|
|
IteratorType end = ::thrust::exclusive_scan(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
IteratorBegin(output),
|
|
initialValue,
|
|
bop);
|
|
|
|
//Store the new value for the end of the array. This is done because
|
|
//with items such as the transpose array it is unsafe to pass the
|
|
//portal to the SumExclusiveScan
|
|
::thrust::copy_n(thrust::cuda::par,
|
|
(end-1), 1, sum.begin()+1);
|
|
|
|
|
|
//execute the binaryOp one last time on the device.
|
|
SumExclusiveScan <<<1,1>>> (sum[0], sum[1], sum[2], bop);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
return sum[2];
|
|
}
|
|
|
|
template<class InputPortal, class OutputPortal>
|
|
VTKM_CONT_EXPORT static
|
|
typename InputPortal::ValueType ScanInclusivePortal(const InputPortal &input,
|
|
const OutputPortal &output)
|
|
{
|
|
typedef typename OutputPortal::ValueType ValueType;
|
|
return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>() );
|
|
}
|
|
|
|
template<class InputPortal, class OutputPortal, class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static
|
|
typename InputPortal::ValueType ScanInclusivePortal(const InputPortal &input,
|
|
const OutputPortal &output,
|
|
BinaryFunctor binary_functor)
|
|
{
|
|
typedef typename OutputPortal::ValueType ValueType;
|
|
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
|
|
BinaryFunctor> bop(binary_functor);
|
|
|
|
typedef typename detail::IteratorTraits<OutputPortal>::IteratorType
|
|
IteratorType;
|
|
|
|
try
|
|
{
|
|
IteratorType end = ::thrust::inclusive_scan(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
IteratorBegin(output),
|
|
bop);
|
|
return *(end-1);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
return typename InputPortal::ValueType();
|
|
}
|
|
|
|
//return the value at the last index in the array, as that is the sum
|
|
|
|
}
|
|
|
|
template<class ValuesPortal>
|
|
VTKM_CONT_EXPORT static void SortPortal(const ValuesPortal &values)
|
|
{
|
|
typedef typename ValuesPortal::ValueType ValueType;
|
|
SortPortal(values, ::thrust::less<ValueType>());
|
|
}
|
|
|
|
template<class ValuesPortal, class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void SortPortal(const ValuesPortal &values,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
typedef typename ValuesPortal::ValueType ValueType;
|
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
|
|
BinaryCompare> bop(binary_compare);
|
|
try
|
|
{
|
|
::thrust::sort(vtkm_cuda_policy(),
|
|
IteratorBegin(values),
|
|
IteratorEnd(values),
|
|
bop);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
}
|
|
|
|
|
|
template<class KeysPortal, class ValuesPortal>
|
|
VTKM_CONT_EXPORT static void SortByKeyPortal(const KeysPortal &keys,
|
|
const ValuesPortal &values)
|
|
{
|
|
typedef typename KeysPortal::ValueType ValueType;
|
|
SortByKeyPortal(keys,values,::thrust::less<ValueType>());
|
|
}
|
|
|
|
template<class KeysPortal, class ValuesPortal, class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void SortByKeyPortal(const KeysPortal &keys,
|
|
const ValuesPortal &values,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
typedef typename KeysPortal::ValueType ValueType;
|
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
|
|
BinaryCompare> bop(binary_compare);
|
|
try
|
|
{
|
|
::thrust::sort_by_key(thrust::cuda::par,
|
|
IteratorBegin(keys),
|
|
IteratorEnd(keys),
|
|
IteratorBegin(values),
|
|
bop);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
}
|
|
|
|
template<class ValueIterator,
|
|
class StencilPortal,
|
|
class OutputPortal,
|
|
class UnaryPredicate>
|
|
VTKM_CONT_EXPORT static
|
|
vtkm::Id CopyIfPortal(ValueIterator valuesBegin,
|
|
ValueIterator valuesEnd,
|
|
StencilPortal stencil,
|
|
OutputPortal output,
|
|
UnaryPredicate unary_predicate)
|
|
{
|
|
typedef typename detail::IteratorTraits<OutputPortal>::IteratorType
|
|
IteratorType;
|
|
|
|
IteratorType outputBegin = IteratorBegin(output);
|
|
|
|
typedef typename StencilPortal::ValueType ValueType;
|
|
|
|
vtkm::exec::cuda::internal::WrappedUnaryPredicate<ValueType,
|
|
UnaryPredicate> up(unary_predicate);
|
|
|
|
try
|
|
{
|
|
IteratorType newLast = ::thrust::copy_if(thrust::cuda::par,
|
|
valuesBegin,
|
|
valuesEnd,
|
|
IteratorBegin(stencil),
|
|
outputBegin,
|
|
up);
|
|
return static_cast<vtkm::Id>( ::thrust::distance(outputBegin, newLast) );
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
return vtkm::Id(0);
|
|
}
|
|
|
|
|
|
}
|
|
|
|
template<class ValuePortal,
|
|
class StencilPortal,
|
|
class OutputPortal,
|
|
class UnaryPredicate>
|
|
VTKM_CONT_EXPORT static
|
|
vtkm::Id CopyIfPortal(ValuePortal values,
|
|
StencilPortal stencil,
|
|
OutputPortal output,
|
|
UnaryPredicate unary_predicate)
|
|
{
|
|
return CopyIfPortal(IteratorBegin(values),
|
|
IteratorEnd(values),
|
|
stencil,
|
|
output,
|
|
unary_predicate);
|
|
}
|
|
|
|
template<class ValuesPortal>
|
|
VTKM_CONT_EXPORT static
|
|
vtkm::Id UniquePortal(const ValuesPortal values)
|
|
{
|
|
typedef typename detail::IteratorTraits<ValuesPortal>::IteratorType
|
|
IteratorType;
|
|
try
|
|
{
|
|
IteratorType begin = IteratorBegin(values);
|
|
IteratorType newLast = ::thrust::unique(thrust::cuda::par,
|
|
begin,
|
|
IteratorEnd(values));
|
|
return static_cast<vtkm::Id>( ::thrust::distance(begin, newLast) );
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
return vtkm::Id(0);
|
|
}
|
|
}
|
|
|
|
template<class ValuesPortal, class BinaryCompare>
|
|
VTKM_CONT_EXPORT static
|
|
vtkm::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
|
|
{
|
|
typedef typename detail::IteratorTraits<ValuesPortal>::IteratorType
|
|
IteratorType;
|
|
typedef typename ValuesPortal::ValueType ValueType;
|
|
|
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
|
|
BinaryCompare> bop(binary_compare);
|
|
try
|
|
{
|
|
IteratorType begin = IteratorBegin(values);
|
|
IteratorType newLast = ::thrust::unique(thrust::cuda::par,
|
|
begin,
|
|
IteratorEnd(values),
|
|
bop);
|
|
return static_cast<vtkm::Id>( ::thrust::distance(begin, newLast) );
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
return vtkm::Id(0);
|
|
}
|
|
}
|
|
|
|
template<class InputPortal, class ValuesPortal, class OutputPortal>
|
|
VTKM_CONT_EXPORT static
|
|
void UpperBoundsPortal(const InputPortal &input,
|
|
const ValuesPortal &values,
|
|
const OutputPortal &output)
|
|
{
|
|
try
|
|
{
|
|
::thrust::upper_bound(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
IteratorBegin(values),
|
|
IteratorEnd(values),
|
|
IteratorBegin(output));
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
}
|
|
|
|
template<class InputPortal, class ValuesPortal, class OutputPortal,
|
|
class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void UpperBoundsPortal(const InputPortal &input,
|
|
const ValuesPortal &values,
|
|
const OutputPortal &output,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
typedef typename OutputPortal::ValueType ValueType;
|
|
|
|
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
|
|
BinaryCompare> bop(binary_compare);
|
|
try
|
|
{
|
|
::thrust::upper_bound(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
IteratorBegin(values),
|
|
IteratorEnd(values),
|
|
IteratorBegin(output),
|
|
bop);
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
}
|
|
|
|
template<class InputPortal, class OutputPortal>
|
|
VTKM_CONT_EXPORT static
|
|
void UpperBoundsPortal(const InputPortal &input,
|
|
const OutputPortal &values_output)
|
|
{
|
|
try
|
|
{
|
|
::thrust::upper_bound(thrust::cuda::par,
|
|
IteratorBegin(input),
|
|
IteratorEnd(input),
|
|
IteratorBegin(values_output),
|
|
IteratorEnd(values_output),
|
|
IteratorBegin(values_output));
|
|
}
|
|
catch(...)
|
|
{
|
|
throwAsVTKmException();
|
|
}
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
public:
|
|
template<typename T, typename U, class SIn, class SOut>
|
|
VTKM_CONT_EXPORT static void Copy(
|
|
const vtkm::cont::ArrayHandle<T,SIn> &input,
|
|
vtkm::cont::ArrayHandle<U,SOut> &output)
|
|
{
|
|
const vtkm::Id numberOfValues = input.GetNumberOfValues();
|
|
|
|
//We need call PrepareForInput on the input argument before invoking a
|
|
//function. The order of execution of parameters of a function is undefined
|
|
//so we need to make sure input is called before output, or else in-place
|
|
//use case breaks.
|
|
input.PrepareForInput(DeviceAdapterTag());
|
|
CopyPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, class SIn, class SVal, class SOut>
|
|
VTKM_CONT_EXPORT static void LowerBounds(
|
|
const vtkm::cont::ArrayHandle<T,SIn>& input,
|
|
const vtkm::cont::ArrayHandle<T,SVal>& values,
|
|
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output)
|
|
{
|
|
vtkm::Id numberOfValues = values.GetNumberOfValues();
|
|
LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
values.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, class SIn, class SVal, class SOut, class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void LowerBounds(
|
|
const vtkm::cont::ArrayHandle<T,SIn>& input,
|
|
const vtkm::cont::ArrayHandle<T,SVal>& values,
|
|
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
vtkm::Id numberOfValues = values.GetNumberOfValues();
|
|
LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
values.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
|
|
binary_compare);
|
|
}
|
|
|
|
template<class SIn, class SOut>
|
|
VTKM_CONT_EXPORT static void LowerBounds(
|
|
const vtkm::cont::ArrayHandle<vtkm::Id,SIn> &input,
|
|
vtkm::cont::ArrayHandle<vtkm::Id,SOut> &values_output)
|
|
{
|
|
LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
values_output.PrepareForInPlace(DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, class SIn>
|
|
VTKM_CONT_EXPORT static T Reduce(
|
|
const vtkm::cont::ArrayHandle<T,SIn> &input,
|
|
T initialValue)
|
|
{
|
|
const vtkm::Id numberOfValues = input.GetNumberOfValues();
|
|
if (numberOfValues <= 0)
|
|
{
|
|
return initialValue;
|
|
}
|
|
return ReducePortal(input.PrepareForInput( DeviceAdapterTag() ),
|
|
initialValue);
|
|
}
|
|
|
|
template<typename T, class SIn, class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static T Reduce(
|
|
const vtkm::cont::ArrayHandle<T,SIn> &input,
|
|
T initialValue,
|
|
BinaryFunctor binary_functor)
|
|
{
|
|
const vtkm::Id numberOfValues = input.GetNumberOfValues();
|
|
if (numberOfValues <= 0)
|
|
{
|
|
return initialValue;
|
|
}
|
|
return ReducePortal(input.PrepareForInput( DeviceAdapterTag() ),
|
|
initialValue,
|
|
binary_functor);
|
|
}
|
|
|
|
template<typename T, typename U, class KIn, class VIn, class KOut, class VOut,
|
|
class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static void ReduceByKey(
|
|
const vtkm::cont::ArrayHandle<T,KIn> &keys,
|
|
const vtkm::cont::ArrayHandle<U,VIn> &values,
|
|
vtkm::cont::ArrayHandle<T,KOut> &keys_output,
|
|
vtkm::cont::ArrayHandle<U,VOut> &values_output,
|
|
BinaryFunctor binary_functor)
|
|
{
|
|
//there is a concern that by default we will allocate too much
|
|
//space for the keys/values output. 1 option is to
|
|
const vtkm::Id numberOfValues = keys.GetNumberOfValues();
|
|
if (numberOfValues <= 0)
|
|
{
|
|
return;
|
|
}
|
|
vtkm::Id reduced_size =
|
|
ReduceByKeyPortal(keys.PrepareForInput( DeviceAdapterTag() ),
|
|
values.PrepareForInput( DeviceAdapterTag() ),
|
|
keys_output.PrepareForOutput( numberOfValues, DeviceAdapterTag() ),
|
|
values_output.PrepareForOutput( numberOfValues, DeviceAdapterTag() ),
|
|
binary_functor);
|
|
|
|
keys_output.Shrink( reduced_size );
|
|
values_output.Shrink( reduced_size );
|
|
}
|
|
|
|
template<typename T, class SIn, class SOut>
|
|
VTKM_CONT_EXPORT static T ScanExclusive(
|
|
const vtkm::cont::ArrayHandle<T,SIn> &input,
|
|
vtkm::cont::ArrayHandle<T,SOut>& output)
|
|
{
|
|
const vtkm::Id numberOfValues = input.GetNumberOfValues();
|
|
if (numberOfValues <= 0)
|
|
{
|
|
output.PrepareForOutput(0, DeviceAdapterTag());
|
|
return vtkm::TypeTraits<T>::ZeroInitialization();
|
|
}
|
|
|
|
//We need call PrepareForInput on the input argument before invoking a
|
|
//function. The order of execution of parameters of a function is undefined
|
|
//so we need to make sure input is called before output, or else in-place
|
|
//use case breaks.
|
|
input.PrepareForInput(DeviceAdapterTag());
|
|
return ScanExclusivePortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, class SIn, class SOut, class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static T ScanExclusive(
|
|
const vtkm::cont::ArrayHandle<T,SIn> &input,
|
|
vtkm::cont::ArrayHandle<T,SOut>& output,
|
|
BinaryFunctor binary_functor,
|
|
const T& initialValue)
|
|
{
|
|
const vtkm::Id numberOfValues = input.GetNumberOfValues();
|
|
if (numberOfValues <= 0)
|
|
{
|
|
output.PrepareForOutput(0, DeviceAdapterTag());
|
|
return vtkm::TypeTraits<T>::ZeroInitialization();
|
|
}
|
|
|
|
//We need call PrepareForInput on the input argument before invoking a
|
|
//function. The order of execution of parameters of a function is undefined
|
|
//so we need to make sure input is called before output, or else in-place
|
|
//use case breaks.
|
|
input.PrepareForInput(DeviceAdapterTag());
|
|
return ScanExclusivePortal(
|
|
input.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
|
|
binary_functor,
|
|
initialValue);
|
|
}
|
|
|
|
template<typename T, class SIn, class SOut>
|
|
VTKM_CONT_EXPORT static T ScanInclusive(
|
|
const vtkm::cont::ArrayHandle<T,SIn> &input,
|
|
vtkm::cont::ArrayHandle<T,SOut>& output)
|
|
{
|
|
const vtkm::Id numberOfValues = input.GetNumberOfValues();
|
|
if (numberOfValues <= 0)
|
|
{
|
|
output.PrepareForOutput(0, DeviceAdapterTag());
|
|
return vtkm::TypeTraits<T>::ZeroInitialization();
|
|
}
|
|
|
|
//We need call PrepareForInput on the input argument before invoking a
|
|
//function. The order of execution of parameters of a function is undefined
|
|
//so we need to make sure input is called before output, or else in-place
|
|
//use case breaks.
|
|
input.PrepareForInput(DeviceAdapterTag());
|
|
return ScanInclusivePortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, class SIn, class SOut, class BinaryFunctor>
|
|
VTKM_CONT_EXPORT static T ScanInclusive(
|
|
const vtkm::cont::ArrayHandle<T,SIn> &input,
|
|
vtkm::cont::ArrayHandle<T,SOut>& output,
|
|
BinaryFunctor binary_functor)
|
|
{
|
|
const vtkm::Id numberOfValues = input.GetNumberOfValues();
|
|
if (numberOfValues <= 0)
|
|
{
|
|
output.PrepareForOutput(0, DeviceAdapterTag());
|
|
return vtkm::TypeTraits<T>::ZeroInitialization();
|
|
}
|
|
|
|
//We need call PrepareForInput on the input argument before invoking a
|
|
//function. The order of execution of parameters of a function is undefined
|
|
//so we need to make sure input is called before output, or else in-place
|
|
//use case breaks.
|
|
input.PrepareForInput(DeviceAdapterTag());
|
|
return ScanInclusivePortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
|
|
binary_functor);
|
|
}
|
|
|
|
// Because of some funny code conversions in nvcc, kernels for devices have to
|
|
// be public.
|
|
#ifndef VTKM_CUDA
|
|
private:
|
|
#endif
|
|
// we use cuda pinned memory to reduce the amount of synchronization
|
|
// and mem copies between the host and device.
|
|
VTKM_CONT_EXPORT
|
|
static char* GetPinnedErrorArray(vtkm::Id &arraySize, char** hostPointer)
|
|
{
|
|
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
|
|
static bool errorArrayInit = false;
|
|
static char* hostPtr = NULL;
|
|
static char* devicePtr = NULL;
|
|
if( !errorArrayInit )
|
|
{
|
|
cudaMallocHost( (void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped );
|
|
cudaHostGetDevicePointer(&devicePtr, hostPtr, 0);
|
|
errorArrayInit = true;
|
|
}
|
|
//set the size of the array
|
|
arraySize = ERROR_ARRAY_SIZE;
|
|
|
|
//specify the host pointer to the memory
|
|
*hostPointer = hostPtr;
|
|
(void) hostPointer;
|
|
return devicePtr;
|
|
}
|
|
|
|
// we query cuda for the max blocks per grid for 1D scheduling
|
|
// and cache the values in static variables
|
|
VTKM_CONT_EXPORT
|
|
static vtkm::Vec<vtkm::UInt32,3> GetMaxGridOfThreadBlocks()
|
|
{
|
|
static bool gridQueryInit = false;
|
|
static vtkm::Vec< vtkm::UInt32, 3> maxGridSize;
|
|
if( !gridQueryInit )
|
|
{
|
|
gridQueryInit = true;
|
|
int currDevice; cudaGetDevice(&currDevice); //get deviceid from cuda
|
|
|
|
cudaDeviceProp properties;
|
|
cudaGetDeviceProperties(&properties, currDevice);
|
|
maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
|
|
maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
|
|
maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
|
|
|
|
//Note: While in practice SM_3+ devices can schedule up to (2^31-1) grids
|
|
//in the X direction, it is dependent on the code being compiled for SM3+.
|
|
//If not, it falls back to SM_2 limitation of 65535 being the largest grid
|
|
//size.
|
|
//Now since SM architecture is only available inside kernels we have to
|
|
//invoke one to see what the actual limit is for our device. So that is
|
|
//what we are going to do next, and than we will store that result
|
|
|
|
vtkm::UInt32 *dev_actual_size;
|
|
cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) );
|
|
DetermineProperXGridSize <<<1,1>>> (maxGridSize[0], dev_actual_size);
|
|
cudaDeviceSynchronize();
|
|
cudaMemcpy( &maxGridSize[0],
|
|
dev_actual_size,
|
|
sizeof(vtkm::UInt32),
|
|
cudaMemcpyDeviceToHost );
|
|
cudaFree(dev_actual_size);
|
|
}
|
|
return maxGridSize;
|
|
}
|
|
|
|
public:
|
|
template<class Functor>
|
|
VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id numInstances)
|
|
{
|
|
//since the memory is pinned we can access it safely on the host
|
|
//without a memcpy
|
|
vtkm::Id errorArraySize = 0;
|
|
char* hostErrorPtr = NULL;
|
|
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
|
|
|
//clear the first character which means that we don't contain an error
|
|
hostErrorPtr[0] = '\0';
|
|
|
|
vtkm::exec::internal::ErrorMessageBuffer errorMessage( deviceErrorPtr,
|
|
errorArraySize);
|
|
|
|
functor.SetErrorMessageBuffer(errorMessage);
|
|
|
|
const vtkm::Id blockSizeAsId = 128;
|
|
const vtkm::UInt32 blockSize = 128;
|
|
const vtkm::UInt32 maxblocksPerLaunch = GetMaxGridOfThreadBlocks()[0];
|
|
const vtkm::UInt32 totalBlocks = static_cast<vtkm::UInt32>(
|
|
(numInstances + blockSizeAsId - 1) / blockSizeAsId);
|
|
|
|
//Note a cuda launch can only handle at most 2B iterations of a kernel
|
|
//because it holds all of the indexes inside UInt32, so for use to
|
|
//handle datasets larger than 2B, we need to execute multiple kernels
|
|
if(totalBlocks < maxblocksPerLaunch)
|
|
{
|
|
Schedule1DIndexKernel<Functor> <<<totalBlocks, blockSize>>> (functor,
|
|
vtkm::Id(0),
|
|
numInstances);
|
|
}
|
|
else
|
|
{
|
|
const vtkm::Id numberOfKernelsToRun = blockSizeAsId * static_cast<vtkm::Id>(maxblocksPerLaunch);
|
|
for(vtkm::Id numberOfKernelsInvoked = 0;
|
|
numberOfKernelsInvoked < numInstances;
|
|
numberOfKernelsInvoked += numberOfKernelsToRun)
|
|
{
|
|
Schedule1DIndexKernel<Functor> <<<maxblocksPerLaunch, blockSize>>> (functor,
|
|
numberOfKernelsInvoked,
|
|
numInstances);
|
|
}
|
|
}
|
|
|
|
//sync so that we can check the results of the call.
|
|
//In the future I want move this before the schedule call, and throwing
|
|
//an exception if the previous schedule wrote an error. This would help
|
|
//cuda to run longer before we hard sync.
|
|
cudaDeviceSynchronize();
|
|
|
|
//check what the value is
|
|
if (hostErrorPtr[0] != '\0')
|
|
{
|
|
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
|
}
|
|
}
|
|
|
|
template<class Functor>
|
|
VTKM_CONT_EXPORT
|
|
static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
|
|
{
|
|
//since the memory is pinned we can access it safely on the host
|
|
//without a memcpy
|
|
vtkm::Id errorArraySize = 0;
|
|
char* hostErrorPtr = NULL;
|
|
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
|
|
|
|
//clear the first character which means that we don't contain an error
|
|
hostErrorPtr[0] = '\0';
|
|
|
|
vtkm::exec::internal::ErrorMessageBuffer errorMessage( deviceErrorPtr,
|
|
errorArraySize);
|
|
|
|
functor.SetErrorMessageBuffer(errorMessage);
|
|
|
|
#ifdef ANALYZE_VTKM_SCHEDULER
|
|
//requires the errormessage buffer be set
|
|
compare_3d_schedule_patterns(functor,rangeMax);
|
|
#endif
|
|
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
|
static_cast<vtkm::UInt32>(rangeMax[1]),
|
|
static_cast<vtkm::UInt32>(rangeMax[2]) );
|
|
|
|
|
|
//currently we presume that 3d scheduling access patterns prefer accessing
|
|
//memory in the X direction. Also should be good for thin in the Z axis
|
|
//algorithms.
|
|
dim3 blockSize3d(64,2,1);
|
|
|
|
//handle the simple use case of 'bad' datasets which are thin in X
|
|
//but larger in the other directions, allowing us decent performance with
|
|
//that use case.
|
|
if(rangeMax[0] <= 128 &&
|
|
(rangeMax[0] < rangeMax[1] || rangeMax[0] < rangeMax[2]) )
|
|
{
|
|
blockSize3d = dim3(16,4,4);
|
|
}
|
|
|
|
dim3 gridSize3d;
|
|
compute_block_size(ranges, blockSize3d, gridSize3d);
|
|
|
|
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
|
|
|
|
//sync so that we can check the results of the call.
|
|
//In the future I want move this before the schedule call, and throwing
|
|
//an exception if the previous schedule wrote an error. This would help
|
|
//cuda to run longer before we hard sync.
|
|
cudaDeviceSynchronize();
|
|
|
|
//check what the value is
|
|
if (hostErrorPtr[0] != '\0')
|
|
{
|
|
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
|
}
|
|
}
|
|
|
|
template<typename T, class Storage>
|
|
VTKM_CONT_EXPORT static void Sort(
|
|
vtkm::cont::ArrayHandle<T,Storage>& values)
|
|
{
|
|
SortPortal(values.PrepareForInPlace(DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, class Storage, class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void Sort(
|
|
vtkm::cont::ArrayHandle<T,Storage>& values,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
SortPortal(values.PrepareForInPlace(DeviceAdapterTag()),binary_compare);
|
|
}
|
|
|
|
template<typename T, typename U,
|
|
class StorageT, class StorageU>
|
|
VTKM_CONT_EXPORT static void SortByKey(
|
|
vtkm::cont::ArrayHandle<T,StorageT>& keys,
|
|
vtkm::cont::ArrayHandle<U,StorageU>& values)
|
|
{
|
|
SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTag()),
|
|
values.PrepareForInPlace(DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, typename U,
|
|
class StorageT, class StorageU,
|
|
class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void SortByKey(
|
|
vtkm::cont::ArrayHandle<T,StorageT>& keys,
|
|
vtkm::cont::ArrayHandle<U,StorageU>& values,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTag()),
|
|
values.PrepareForInPlace(DeviceAdapterTag()),
|
|
binary_compare);
|
|
}
|
|
|
|
|
|
template<typename T, class SStencil, class SOut>
|
|
VTKM_CONT_EXPORT static void StreamCompact(
|
|
const vtkm::cont::ArrayHandle<T,SStencil>& stencil,
|
|
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output)
|
|
{
|
|
vtkm::Id size = stencil.GetNumberOfValues();
|
|
vtkm::Id newSize = CopyIfPortal(::thrust::make_counting_iterator<vtkm::Id>(0),
|
|
::thrust::make_counting_iterator<vtkm::Id>(size),
|
|
stencil.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(size, DeviceAdapterTag()),
|
|
::vtkm::NotZeroInitialized());
|
|
output.Shrink(newSize);
|
|
}
|
|
|
|
template<typename T,
|
|
typename U,
|
|
class SIn,
|
|
class SStencil,
|
|
class SOut>
|
|
VTKM_CONT_EXPORT static void StreamCompact(
|
|
const vtkm::cont::ArrayHandle<U,SIn>& input,
|
|
const vtkm::cont::ArrayHandle<T,SStencil>& stencil,
|
|
vtkm::cont::ArrayHandle<U,SOut>& output)
|
|
{
|
|
vtkm::Id size = stencil.GetNumberOfValues();
|
|
vtkm::Id newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
stencil.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(size, DeviceAdapterTag()),
|
|
::vtkm::NotZeroInitialized()); //yes on the stencil
|
|
output.Shrink(newSize);
|
|
}
|
|
|
|
template<typename T,
|
|
typename U,
|
|
class SIn,
|
|
class SStencil,
|
|
class SOut,
|
|
class UnaryPredicate>
|
|
VTKM_CONT_EXPORT static void StreamCompact(
|
|
const vtkm::cont::ArrayHandle<U,SIn>& input,
|
|
const vtkm::cont::ArrayHandle<T,SStencil>& stencil,
|
|
vtkm::cont::ArrayHandle<U,SOut>& output,
|
|
UnaryPredicate unary_predicate)
|
|
{
|
|
vtkm::Id size = stencil.GetNumberOfValues();
|
|
vtkm::Id newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
stencil.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(size, DeviceAdapterTag()),
|
|
unary_predicate);
|
|
output.Shrink(newSize);
|
|
}
|
|
|
|
template<typename T, class Storage>
|
|
VTKM_CONT_EXPORT static void Unique(
|
|
vtkm::cont::ArrayHandle<T,Storage> &values)
|
|
{
|
|
vtkm::Id newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTag()));
|
|
|
|
values.Shrink(newSize);
|
|
}
|
|
|
|
template<typename T, class Storage, class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void Unique(
|
|
vtkm::cont::ArrayHandle<T,Storage> &values,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
vtkm::Id newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTag()),binary_compare);
|
|
|
|
values.Shrink(newSize);
|
|
}
|
|
|
|
template<typename T, class SIn, class SVal, class SOut>
|
|
VTKM_CONT_EXPORT static void UpperBounds(
|
|
const vtkm::cont::ArrayHandle<T,SIn>& input,
|
|
const vtkm::cont::ArrayHandle<T,SVal>& values,
|
|
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output)
|
|
{
|
|
vtkm::Id numberOfValues = values.GetNumberOfValues();
|
|
UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
values.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
|
|
}
|
|
|
|
template<typename T, class SIn, class SVal, class SOut, class BinaryCompare>
|
|
VTKM_CONT_EXPORT static void UpperBounds(
|
|
const vtkm::cont::ArrayHandle<T,SIn>& input,
|
|
const vtkm::cont::ArrayHandle<T,SVal>& values,
|
|
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output,
|
|
BinaryCompare binary_compare)
|
|
{
|
|
vtkm::Id numberOfValues = values.GetNumberOfValues();
|
|
UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
values.PrepareForInput(DeviceAdapterTag()),
|
|
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
|
|
binary_compare);
|
|
}
|
|
|
|
template<class SIn, class SOut>
|
|
VTKM_CONT_EXPORT static void UpperBounds(
|
|
const vtkm::cont::ArrayHandle<vtkm::Id,SIn> &input,
|
|
vtkm::cont::ArrayHandle<vtkm::Id,SOut> &values_output)
|
|
{
|
|
UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
|
|
values_output.PrepareForInPlace(DeviceAdapterTag()));
|
|
}
|
|
};
|
|
|
|
}
|
|
}
|
|
}
|
|
} // namespace vtkm::cont::cuda::internal
|
|
|
|
#endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
|