vtk-m/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h

1300 lines
46 KiB
C++

//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 Sandia Corporation.
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/Types.h>
#include <vtkm/TypeTraits.h>
#include <vtkm/UnaryPredicates.h>
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
#include <vtkm/exec/cuda/internal/WrappedOperators.h>
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
#include <vtkm/exec/internal/WorkletInvokeFunctor.h>
// Disable warnings we check vtkm for but Thrust does not.
VTKM_THIRDPARTY_PRE_INCLUDE
//our own custom thrust execution policy
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
#include <thrust/advance.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/system/cuda/vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system/cuda/execution_policy.h>
VTKM_THIRDPARTY_POST_INCLUDE
namespace vtkm {
namespace cont {
namespace cuda {
namespace internal {
static
__global__
void DetermineProperXGridSize(vtkm::UInt32 desired_size,
vtkm::UInt32* actual_size)
{
//used only to see if we can launch kernels with a x grid size that
//matches the max of the graphics card, or are we having to fall back
//to SM_2 grid sizes
if(blockIdx.x != 0)
{
return;
}
#if __CUDA_ARCH__ <= 200
const vtkm::UInt32 maxXGridSizeForSM2 = 65535;
*actual_size = maxXGridSizeForSM2;
#else
*actual_size = desired_size;
#endif
}
template<class FunctorType>
__global__
void Schedule1DIndexKernel(FunctorType functor,
vtkm::Id numberOfKernelsInvoked,
vtkm::Id length)
{
//Note a cuda launch can only handle at most 2B iterations of a kernel
//because it holds all of the indexes inside UInt32, so for use to
//handle datasets larger than 2B, we need to execute multiple kernels
const vtkm::Id index = numberOfKernelsInvoked +
static_cast<vtkm::Id>(blockDim.x * blockIdx.x + threadIdx.x);
if(index < length)
{
functor(index);
}
}
template<class FunctorType>
__global__
void Schedule3DIndexKernel(FunctorType functor, dim3 size)
{
const vtkm::Id3 index(
blockIdx.x*blockDim.x + threadIdx.x,
blockIdx.y*blockDim.y + threadIdx.y,
blockIdx.z*blockDim.z + threadIdx.z
);
if (index[0] >= size.x || index[1] >= size.y || index[2] >= size.z)
{
return;
}
functor( index );
}
template<typename T, typename BinaryOperationType >
__global__
void SumExclusiveScan(T a, T b, T result,
BinaryOperationType binary_op)
{
result = binary_op(a,b);
}
inline
void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d)
{
gridSize3d.x = (rangeMax.x % blockSize3d.x != 0) ? (rangeMax.x / blockSize3d.x + 1) : (rangeMax.x / blockSize3d.x);
gridSize3d.y = (rangeMax.y % blockSize3d.y != 0) ? (rangeMax.y / blockSize3d.y + 1) : (rangeMax.y / blockSize3d.y);
gridSize3d.z = (rangeMax.z % blockSize3d.z != 0) ? (rangeMax.z / blockSize3d.z + 1) : (rangeMax.z / blockSize3d.z);
}
#ifdef ANALYZE_VTKM_SCHEDULER
class PerfRecord
{
public:
PerfRecord(float elapsedT, dim3 block ):
elapsedTime(elapsedT),
blockSize(block)
{
}
bool operator<(const PerfRecord& other) const
{ return elapsedTime < other.elapsedTime; }
float elapsedTime;
dim3 blockSize;
};
template<class Functor>
static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& rangeMax)
{
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]) );
std::vector< PerfRecord > results;
vtkm::UInt32 indexTable[16] = {1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024};
for(vtkm::UInt32 i=0; i < 16; i++)
{
for(vtkm::UInt32 j=0; j < 16; j++)
{
for(vtkm::UInt32 k=0; k < 16; k++)
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 blockSize3d(indexTable[i],indexTable[j],indexTable[k]);
dim3 gridSize3d;
if( (blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
(blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 ||
blockSize3d.z >= 64)
{
//cuda can't handle more than 1024 threads per block
//so don't try if we compute higher than that
//also don't try stupidly low numbers
//cuda can't handle more than 64 threads in the z direction
continue;
}
compute_block_size(ranges, blockSize3d, gridSize3d);
cudaEventRecord(start, 0);
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
results.push_back( record );
}
}
}
std::sort(results.begin(), results.end());
const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
for(vtkm::Int64 i=1; i <= size; i++)
{
vtkm::UInt64 index = static_cast<vtkm::UInt64>(size-i);
vtkm::UInt32 x = results[index].blockSize.x;
vtkm::UInt32 y = results[index].blockSize.y;
vtkm::UInt32 z = results[index].blockSize.z;
float t = results[index].elapsedTime;
std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
}
std::cout << "flat array performance " << std::endl;
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
typedef
vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
vtkm::cont::DeviceAdapterTagCuda > Algorithm;
Algorithm::Schedule(functor, numInstances);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
}
std::cout << "fixed 3d block size performance " << std::endl;
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 blockSize3d(64,2,1);
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
cudaEventRecord(start, 0);
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
}
}
#endif
/// This class can be subclassed to implement the DeviceAdapterAlgorithm for a
/// device that uses thrust as its implementation. The subclass should pass in
/// the correct device adapter tag as the template parameter.
///
template<class DeviceAdapterTag>
struct DeviceAdapterAlgorithmThrust
{
// Because of some funny code conversions in nvcc, kernels for devices have to
// be public.
#ifndef VTKM_CUDA
private:
#endif
template<class InputPortal, class OutputPortal>
VTKM_CONT_EXPORT static void CopyPortal(const InputPortal &input,
const OutputPortal &output)
{
try
{
::thrust::copy(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(output));
}
catch(...)
{
throwAsVTKmException();
}
}
template<class InputPortal, class ValuesPortal, class OutputPortal>
VTKM_CONT_EXPORT static void LowerBoundsPortal(const InputPortal &input,
const ValuesPortal &values,
const OutputPortal &output)
{
typedef typename ValuesPortal::ValueType ValueType;
LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>() );
}
template<class InputPortal, class OutputPortal>
VTKM_CONT_EXPORT static
void LowerBoundsPortal(const InputPortal &input,
const OutputPortal &values_output)
{
typedef typename InputPortal::ValueType ValueType;
LowerBoundsPortal(input, values_output, values_output,
::thrust::less<ValueType>() );
}
template<class InputPortal, class ValuesPortal, class OutputPortal,
class BinaryCompare>
VTKM_CONT_EXPORT static void LowerBoundsPortal(const InputPortal &input,
const ValuesPortal &values,
const OutputPortal &output,
BinaryCompare binary_compare)
{
typedef typename InputPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
BinaryCompare> bop(binary_compare);
try
{
::thrust::lower_bound(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values),
IteratorEnd(values),
IteratorBegin(output),
bop);
}
catch(...)
{
throwAsVTKmException();
}
}
template<class InputPortal>
VTKM_CONT_EXPORT static
typename InputPortal::ValueType ReducePortal(const InputPortal &input,
typename InputPortal::ValueType initialValue)
{
typedef typename InputPortal::ValueType ValueType;
return ReducePortal(input,
initialValue,
::thrust::plus<ValueType>());
}
template<class InputPortal, class BinaryFunctor>
VTKM_CONT_EXPORT static
typename InputPortal::ValueType ReducePortal(const InputPortal &input,
typename InputPortal::ValueType initialValue,
BinaryFunctor binary_functor)
{
typedef typename InputPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
BinaryFunctor> bop(binary_functor);
try
{
return ::thrust::reduce(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
initialValue,
bop);
}
catch(...)
{
throwAsVTKmException();
}
return typename InputPortal::ValueType();
}
template<class KeysPortal, class ValuesPortal,
class KeysOutputPortal, class ValueOutputPortal,
class BinaryFunctor>
VTKM_CONT_EXPORT static
vtkm::Id ReduceByKeyPortal(const KeysPortal &keys,
const ValuesPortal& values,
const KeysOutputPortal &keys_output,
const ValueOutputPortal &values_output,
BinaryFunctor binary_functor)
{
typedef typename detail::IteratorTraits<KeysOutputPortal>::IteratorType
KeysIteratorType;
typedef typename detail::IteratorTraits<ValueOutputPortal>::IteratorType
ValuesIteratorType;
KeysIteratorType keys_out_begin = IteratorBegin(keys_output);
ValuesIteratorType values_out_begin = IteratorBegin(values_output);
::thrust::pair< KeysIteratorType, ValuesIteratorType > result_iterators;
::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;
typedef typename ValuesPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
BinaryFunctor> bop(binary_functor);
try
{
result_iterators = ::thrust::reduce_by_key(thrust::cuda::par,
IteratorBegin(keys),
IteratorEnd(keys),
IteratorBegin(values),
keys_out_begin,
values_out_begin,
binaryPredicate,
bop);
}
catch(...)
{
throwAsVTKmException();
}
return static_cast<vtkm::Id>( ::thrust::distance(keys_out_begin,
result_iterators.first) );
}
template<class InputPortal, class OutputPortal>
VTKM_CONT_EXPORT static
typename InputPortal::ValueType ScanExclusivePortal(const InputPortal &input,
const OutputPortal &output)
{
typedef typename OutputPortal::ValueType ValueType;
return ScanExclusivePortal(input,
output,
(::thrust::plus<ValueType>()),
vtkm::TypeTraits<ValueType>::ZeroInitialization());
}
template<class InputPortal, class OutputPortal, class BinaryFunctor>
VTKM_CONT_EXPORT static
typename InputPortal::ValueType ScanExclusivePortal(const InputPortal &input,
const OutputPortal &output,
BinaryFunctor binaryOp,
typename InputPortal::ValueType initialValue)
{
// Use iterator to get value so that thrust device_ptr has chance to handle
// data on device.
typedef typename OutputPortal::ValueType ValueType;
//we have size three so that we can store the origin end value, the
//new end value, and the sum of those two
::thrust::system::cuda::vector< ValueType > sum(3);
try
{
//store the current value of the last position array in a separate cuda
//memory location since the exclusive_scan will overwrite that value
//once run
::thrust::copy_n(thrust::cuda::par,
IteratorEnd(input) - 1, 1, sum.begin());
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
BinaryFunctor> bop(binaryOp);
typedef typename detail::IteratorTraits<OutputPortal>::IteratorType
IteratorType;
IteratorType end = ::thrust::exclusive_scan(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(output),
initialValue,
bop);
//Store the new value for the end of the array. This is done because
//with items such as the transpose array it is unsafe to pass the
//portal to the SumExclusiveScan
::thrust::copy_n(thrust::cuda::par,
(end-1), 1, sum.begin()+1);
//execute the binaryOp one last time on the device.
SumExclusiveScan <<<1,1>>> (sum[0], sum[1], sum[2], bop);
}
catch(...)
{
throwAsVTKmException();
}
return sum[2];
}
template<class InputPortal, class OutputPortal>
VTKM_CONT_EXPORT static
typename InputPortal::ValueType ScanInclusivePortal(const InputPortal &input,
const OutputPortal &output)
{
typedef typename OutputPortal::ValueType ValueType;
return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>() );
}
template<class InputPortal, class OutputPortal, class BinaryFunctor>
VTKM_CONT_EXPORT static
typename InputPortal::ValueType ScanInclusivePortal(const InputPortal &input,
const OutputPortal &output,
BinaryFunctor binary_functor)
{
typedef typename OutputPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType,
BinaryFunctor> bop(binary_functor);
typedef typename detail::IteratorTraits<OutputPortal>::IteratorType
IteratorType;
try
{
IteratorType end = ::thrust::inclusive_scan(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(output),
bop);
return *(end-1);
}
catch(...)
{
throwAsVTKmException();
return typename InputPortal::ValueType();
}
//return the value at the last index in the array, as that is the sum
}
template<class ValuesPortal>
VTKM_CONT_EXPORT static void SortPortal(const ValuesPortal &values)
{
typedef typename ValuesPortal::ValueType ValueType;
SortPortal(values, ::thrust::less<ValueType>());
}
template<class ValuesPortal, class BinaryCompare>
VTKM_CONT_EXPORT static void SortPortal(const ValuesPortal &values,
BinaryCompare binary_compare)
{
typedef typename ValuesPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
BinaryCompare> bop(binary_compare);
try
{
::thrust::sort(vtkm_cuda_policy(),
IteratorBegin(values),
IteratorEnd(values),
bop);
}
catch(...)
{
throwAsVTKmException();
}
}
template<class KeysPortal, class ValuesPortal>
VTKM_CONT_EXPORT static void SortByKeyPortal(const KeysPortal &keys,
const ValuesPortal &values)
{
typedef typename KeysPortal::ValueType ValueType;
SortByKeyPortal(keys,values,::thrust::less<ValueType>());
}
template<class KeysPortal, class ValuesPortal, class BinaryCompare>
VTKM_CONT_EXPORT static void SortByKeyPortal(const KeysPortal &keys,
const ValuesPortal &values,
BinaryCompare binary_compare)
{
typedef typename KeysPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
BinaryCompare> bop(binary_compare);
try
{
::thrust::sort_by_key(thrust::cuda::par,
IteratorBegin(keys),
IteratorEnd(keys),
IteratorBegin(values),
bop);
}
catch(...)
{
throwAsVTKmException();
}
}
template<class ValueIterator,
class StencilPortal,
class OutputPortal,
class UnaryPredicate>
VTKM_CONT_EXPORT static
vtkm::Id CopyIfPortal(ValueIterator valuesBegin,
ValueIterator valuesEnd,
StencilPortal stencil,
OutputPortal output,
UnaryPredicate unary_predicate)
{
typedef typename detail::IteratorTraits<OutputPortal>::IteratorType
IteratorType;
IteratorType outputBegin = IteratorBegin(output);
typedef typename StencilPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedUnaryPredicate<ValueType,
UnaryPredicate> up(unary_predicate);
try
{
IteratorType newLast = ::thrust::copy_if(thrust::cuda::par,
valuesBegin,
valuesEnd,
IteratorBegin(stencil),
outputBegin,
up);
return static_cast<vtkm::Id>( ::thrust::distance(outputBegin, newLast) );
}
catch(...)
{
throwAsVTKmException();
return vtkm::Id(0);
}
}
template<class ValuePortal,
class StencilPortal,
class OutputPortal,
class UnaryPredicate>
VTKM_CONT_EXPORT static
vtkm::Id CopyIfPortal(ValuePortal values,
StencilPortal stencil,
OutputPortal output,
UnaryPredicate unary_predicate)
{
return CopyIfPortal(IteratorBegin(values),
IteratorEnd(values),
stencil,
output,
unary_predicate);
}
template<class ValuesPortal>
VTKM_CONT_EXPORT static
vtkm::Id UniquePortal(const ValuesPortal values)
{
typedef typename detail::IteratorTraits<ValuesPortal>::IteratorType
IteratorType;
try
{
IteratorType begin = IteratorBegin(values);
IteratorType newLast = ::thrust::unique(thrust::cuda::par,
begin,
IteratorEnd(values));
return static_cast<vtkm::Id>( ::thrust::distance(begin, newLast) );
}
catch(...)
{
throwAsVTKmException();
return vtkm::Id(0);
}
}
template<class ValuesPortal, class BinaryCompare>
VTKM_CONT_EXPORT static
vtkm::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
{
typedef typename detail::IteratorTraits<ValuesPortal>::IteratorType
IteratorType;
typedef typename ValuesPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
BinaryCompare> bop(binary_compare);
try
{
IteratorType begin = IteratorBegin(values);
IteratorType newLast = ::thrust::unique(thrust::cuda::par,
begin,
IteratorEnd(values),
bop);
return static_cast<vtkm::Id>( ::thrust::distance(begin, newLast) );
}
catch(...)
{
throwAsVTKmException();
return vtkm::Id(0);
}
}
template<class InputPortal, class ValuesPortal, class OutputPortal>
VTKM_CONT_EXPORT static
void UpperBoundsPortal(const InputPortal &input,
const ValuesPortal &values,
const OutputPortal &output)
{
try
{
::thrust::upper_bound(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values),
IteratorEnd(values),
IteratorBegin(output));
}
catch(...)
{
throwAsVTKmException();
}
}
template<class InputPortal, class ValuesPortal, class OutputPortal,
class BinaryCompare>
VTKM_CONT_EXPORT static void UpperBoundsPortal(const InputPortal &input,
const ValuesPortal &values,
const OutputPortal &output,
BinaryCompare binary_compare)
{
typedef typename OutputPortal::ValueType ValueType;
vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType,
BinaryCompare> bop(binary_compare);
try
{
::thrust::upper_bound(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values),
IteratorEnd(values),
IteratorBegin(output),
bop);
}
catch(...)
{
throwAsVTKmException();
}
}
template<class InputPortal, class OutputPortal>
VTKM_CONT_EXPORT static
void UpperBoundsPortal(const InputPortal &input,
const OutputPortal &values_output)
{
try
{
::thrust::upper_bound(thrust::cuda::par,
IteratorBegin(input),
IteratorEnd(input),
IteratorBegin(values_output),
IteratorEnd(values_output),
IteratorBegin(values_output));
}
catch(...)
{
throwAsVTKmException();
}
}
//-----------------------------------------------------------------------------
public:
template<typename T, typename U, class SIn, class SOut>
VTKM_CONT_EXPORT static void Copy(
const vtkm::cont::ArrayHandle<T,SIn> &input,
vtkm::cont::ArrayHandle<U,SOut> &output)
{
const vtkm::Id numberOfValues = input.GetNumberOfValues();
//We need call PrepareForInput on the input argument before invoking a
//function. The order of execution of parameters of a function is undefined
//so we need to make sure input is called before output, or else in-place
//use case breaks.
input.PrepareForInput(DeviceAdapterTag());
CopyPortal(input.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
}
template<typename T, class SIn, class SVal, class SOut>
VTKM_CONT_EXPORT static void LowerBounds(
const vtkm::cont::ArrayHandle<T,SIn>& input,
const vtkm::cont::ArrayHandle<T,SVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output)
{
vtkm::Id numberOfValues = values.GetNumberOfValues();
LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
values.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
}
template<typename T, class SIn, class SVal, class SOut, class BinaryCompare>
VTKM_CONT_EXPORT static void LowerBounds(
const vtkm::cont::ArrayHandle<T,SIn>& input,
const vtkm::cont::ArrayHandle<T,SVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output,
BinaryCompare binary_compare)
{
vtkm::Id numberOfValues = values.GetNumberOfValues();
LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
values.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
binary_compare);
}
template<class SIn, class SOut>
VTKM_CONT_EXPORT static void LowerBounds(
const vtkm::cont::ArrayHandle<vtkm::Id,SIn> &input,
vtkm::cont::ArrayHandle<vtkm::Id,SOut> &values_output)
{
LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
values_output.PrepareForInPlace(DeviceAdapterTag()));
}
template<typename T, class SIn>
VTKM_CONT_EXPORT static T Reduce(
const vtkm::cont::ArrayHandle<T,SIn> &input,
T initialValue)
{
const vtkm::Id numberOfValues = input.GetNumberOfValues();
if (numberOfValues <= 0)
{
return initialValue;
}
return ReducePortal(input.PrepareForInput( DeviceAdapterTag() ),
initialValue);
}
template<typename T, class SIn, class BinaryFunctor>
VTKM_CONT_EXPORT static T Reduce(
const vtkm::cont::ArrayHandle<T,SIn> &input,
T initialValue,
BinaryFunctor binary_functor)
{
const vtkm::Id numberOfValues = input.GetNumberOfValues();
if (numberOfValues <= 0)
{
return initialValue;
}
return ReducePortal(input.PrepareForInput( DeviceAdapterTag() ),
initialValue,
binary_functor);
}
template<typename T, typename U, class KIn, class VIn, class KOut, class VOut,
class BinaryFunctor>
VTKM_CONT_EXPORT static void ReduceByKey(
const vtkm::cont::ArrayHandle<T,KIn> &keys,
const vtkm::cont::ArrayHandle<U,VIn> &values,
vtkm::cont::ArrayHandle<T,KOut> &keys_output,
vtkm::cont::ArrayHandle<U,VOut> &values_output,
BinaryFunctor binary_functor)
{
//there is a concern that by default we will allocate too much
//space for the keys/values output. 1 option is to
const vtkm::Id numberOfValues = keys.GetNumberOfValues();
if (numberOfValues <= 0)
{
return;
}
vtkm::Id reduced_size =
ReduceByKeyPortal(keys.PrepareForInput( DeviceAdapterTag() ),
values.PrepareForInput( DeviceAdapterTag() ),
keys_output.PrepareForOutput( numberOfValues, DeviceAdapterTag() ),
values_output.PrepareForOutput( numberOfValues, DeviceAdapterTag() ),
binary_functor);
keys_output.Shrink( reduced_size );
values_output.Shrink( reduced_size );
}
template<typename T, class SIn, class SOut>
VTKM_CONT_EXPORT static T ScanExclusive(
const vtkm::cont::ArrayHandle<T,SIn> &input,
vtkm::cont::ArrayHandle<T,SOut>& output)
{
const vtkm::Id numberOfValues = input.GetNumberOfValues();
if (numberOfValues <= 0)
{
output.PrepareForOutput(0, DeviceAdapterTag());
return vtkm::TypeTraits<T>::ZeroInitialization();
}
//We need call PrepareForInput on the input argument before invoking a
//function. The order of execution of parameters of a function is undefined
//so we need to make sure input is called before output, or else in-place
//use case breaks.
input.PrepareForInput(DeviceAdapterTag());
return ScanExclusivePortal(input.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
}
template<typename T, class SIn, class SOut, class BinaryFunctor>
VTKM_CONT_EXPORT static T ScanExclusive(
const vtkm::cont::ArrayHandle<T,SIn> &input,
vtkm::cont::ArrayHandle<T,SOut>& output,
BinaryFunctor binary_functor,
const T& initialValue)
{
const vtkm::Id numberOfValues = input.GetNumberOfValues();
if (numberOfValues <= 0)
{
output.PrepareForOutput(0, DeviceAdapterTag());
return vtkm::TypeTraits<T>::ZeroInitialization();
}
//We need call PrepareForInput on the input argument before invoking a
//function. The order of execution of parameters of a function is undefined
//so we need to make sure input is called before output, or else in-place
//use case breaks.
input.PrepareForInput(DeviceAdapterTag());
return ScanExclusivePortal(
input.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
binary_functor,
initialValue);
}
template<typename T, class SIn, class SOut>
VTKM_CONT_EXPORT static T ScanInclusive(
const vtkm::cont::ArrayHandle<T,SIn> &input,
vtkm::cont::ArrayHandle<T,SOut>& output)
{
const vtkm::Id numberOfValues = input.GetNumberOfValues();
if (numberOfValues <= 0)
{
output.PrepareForOutput(0, DeviceAdapterTag());
return vtkm::TypeTraits<T>::ZeroInitialization();
}
//We need call PrepareForInput on the input argument before invoking a
//function. The order of execution of parameters of a function is undefined
//so we need to make sure input is called before output, or else in-place
//use case breaks.
input.PrepareForInput(DeviceAdapterTag());
return ScanInclusivePortal(input.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
}
template<typename T, class SIn, class SOut, class BinaryFunctor>
VTKM_CONT_EXPORT static T ScanInclusive(
const vtkm::cont::ArrayHandle<T,SIn> &input,
vtkm::cont::ArrayHandle<T,SOut>& output,
BinaryFunctor binary_functor)
{
const vtkm::Id numberOfValues = input.GetNumberOfValues();
if (numberOfValues <= 0)
{
output.PrepareForOutput(0, DeviceAdapterTag());
return vtkm::TypeTraits<T>::ZeroInitialization();
}
//We need call PrepareForInput on the input argument before invoking a
//function. The order of execution of parameters of a function is undefined
//so we need to make sure input is called before output, or else in-place
//use case breaks.
input.PrepareForInput(DeviceAdapterTag());
return ScanInclusivePortal(input.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
binary_functor);
}
// Because of some funny code conversions in nvcc, kernels for devices have to
// be public.
#ifndef VTKM_CUDA
private:
#endif
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
VTKM_CONT_EXPORT
static char* GetPinnedErrorArray(vtkm::Id &arraySize, char** hostPointer)
{
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
static bool errorArrayInit = false;
static char* hostPtr = NULL;
static char* devicePtr = NULL;
if( !errorArrayInit )
{
cudaMallocHost( (void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped );
cudaHostGetDevicePointer(&devicePtr, hostPtr, 0);
errorArrayInit = true;
}
//set the size of the array
arraySize = ERROR_ARRAY_SIZE;
//specify the host pointer to the memory
*hostPointer = hostPtr;
(void) hostPointer;
return devicePtr;
}
// we query cuda for the max blocks per grid for 1D scheduling
// and cache the values in static variables
VTKM_CONT_EXPORT
static vtkm::Vec<vtkm::UInt32,3> GetMaxGridOfThreadBlocks()
{
static bool gridQueryInit = false;
static vtkm::Vec< vtkm::UInt32, 3> maxGridSize;
if( !gridQueryInit )
{
gridQueryInit = true;
int currDevice; cudaGetDevice(&currDevice); //get deviceid from cuda
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, currDevice);
maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
//Note: While in practice SM_3+ devices can schedule up to (2^31-1) grids
//in the X direction, it is dependent on the code being compiled for SM3+.
//If not, it falls back to SM_2 limitation of 65535 being the largest grid
//size.
//Now since SM architecture is only available inside kernels we have to
//invoke one to see what the actual limit is for our device. So that is
//what we are going to do next, and than we will store that result
vtkm::UInt32 *dev_actual_size;
cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) );
DetermineProperXGridSize <<<1,1>>> (maxGridSize[0], dev_actual_size);
cudaDeviceSynchronize();
cudaMemcpy( &maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost );
cudaFree(dev_actual_size);
}
return maxGridSize;
}
public:
template<class Functor>
VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id numInstances)
{
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = NULL;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
//clear the first character which means that we don't contain an error
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage( deviceErrorPtr,
errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
const vtkm::Id blockSizeAsId = 128;
const vtkm::UInt32 blockSize = 128;
const vtkm::UInt32 maxblocksPerLaunch = GetMaxGridOfThreadBlocks()[0];
const vtkm::UInt32 totalBlocks = static_cast<vtkm::UInt32>(
(numInstances + blockSizeAsId - 1) / blockSizeAsId);
//Note a cuda launch can only handle at most 2B iterations of a kernel
//because it holds all of the indexes inside UInt32, so for use to
//handle datasets larger than 2B, we need to execute multiple kernels
if(totalBlocks < maxblocksPerLaunch)
{
Schedule1DIndexKernel<Functor> <<<totalBlocks, blockSize>>> (functor,
vtkm::Id(0),
numInstances);
}
else
{
const vtkm::Id numberOfKernelsToRun = blockSizeAsId * static_cast<vtkm::Id>(maxblocksPerLaunch);
for(vtkm::Id numberOfKernelsInvoked = 0;
numberOfKernelsInvoked < numInstances;
numberOfKernelsInvoked += numberOfKernelsToRun)
{
Schedule1DIndexKernel<Functor> <<<maxblocksPerLaunch, blockSize>>> (functor,
numberOfKernelsInvoked,
numInstances);
}
}
//sync so that we can check the results of the call.
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
cudaDeviceSynchronize();
//check what the value is
if (hostErrorPtr[0] != '\0')
{
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
}
template<class Functor>
VTKM_CONT_EXPORT
static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
{
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = NULL;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
//clear the first character which means that we don't contain an error
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage( deviceErrorPtr,
errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
#ifdef ANALYZE_VTKM_SCHEDULER
//requires the errormessage buffer be set
compare_3d_schedule_patterns(functor,rangeMax);
#endif
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]) );
//currently we presume that 3d scheduling access patterns prefer accessing
//memory in the X direction. Also should be good for thin in the Z axis
//algorithms.
dim3 blockSize3d(64,2,1);
//handle the simple use case of 'bad' datasets which are thin in X
//but larger in the other directions, allowing us decent performance with
//that use case.
if(rangeMax[0] <= 128 &&
(rangeMax[0] < rangeMax[1] || rangeMax[0] < rangeMax[2]) )
{
blockSize3d = dim3(16,4,4);
}
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
//sync so that we can check the results of the call.
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
cudaDeviceSynchronize();
//check what the value is
if (hostErrorPtr[0] != '\0')
{
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
}
template<typename T, class Storage>
VTKM_CONT_EXPORT static void Sort(
vtkm::cont::ArrayHandle<T,Storage>& values)
{
SortPortal(values.PrepareForInPlace(DeviceAdapterTag()));
}
template<typename T, class Storage, class BinaryCompare>
VTKM_CONT_EXPORT static void Sort(
vtkm::cont::ArrayHandle<T,Storage>& values,
BinaryCompare binary_compare)
{
SortPortal(values.PrepareForInPlace(DeviceAdapterTag()),binary_compare);
}
template<typename T, typename U,
class StorageT, class StorageU>
VTKM_CONT_EXPORT static void SortByKey(
vtkm::cont::ArrayHandle<T,StorageT>& keys,
vtkm::cont::ArrayHandle<U,StorageU>& values)
{
SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTag()),
values.PrepareForInPlace(DeviceAdapterTag()));
}
template<typename T, typename U,
class StorageT, class StorageU,
class BinaryCompare>
VTKM_CONT_EXPORT static void SortByKey(
vtkm::cont::ArrayHandle<T,StorageT>& keys,
vtkm::cont::ArrayHandle<U,StorageU>& values,
BinaryCompare binary_compare)
{
SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTag()),
values.PrepareForInPlace(DeviceAdapterTag()),
binary_compare);
}
template<typename T, class SStencil, class SOut>
VTKM_CONT_EXPORT static void StreamCompact(
const vtkm::cont::ArrayHandle<T,SStencil>& stencil,
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output)
{
vtkm::Id size = stencil.GetNumberOfValues();
vtkm::Id newSize = CopyIfPortal(::thrust::make_counting_iterator<vtkm::Id>(0),
::thrust::make_counting_iterator<vtkm::Id>(size),
stencil.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(size, DeviceAdapterTag()),
::vtkm::NotZeroInitialized());
output.Shrink(newSize);
}
template<typename T,
typename U,
class SIn,
class SStencil,
class SOut>
VTKM_CONT_EXPORT static void StreamCompact(
const vtkm::cont::ArrayHandle<U,SIn>& input,
const vtkm::cont::ArrayHandle<T,SStencil>& stencil,
vtkm::cont::ArrayHandle<U,SOut>& output)
{
vtkm::Id size = stencil.GetNumberOfValues();
vtkm::Id newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTag()),
stencil.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(size, DeviceAdapterTag()),
::vtkm::NotZeroInitialized()); //yes on the stencil
output.Shrink(newSize);
}
template<typename T,
typename U,
class SIn,
class SStencil,
class SOut,
class UnaryPredicate>
VTKM_CONT_EXPORT static void StreamCompact(
const vtkm::cont::ArrayHandle<U,SIn>& input,
const vtkm::cont::ArrayHandle<T,SStencil>& stencil,
vtkm::cont::ArrayHandle<U,SOut>& output,
UnaryPredicate unary_predicate)
{
vtkm::Id size = stencil.GetNumberOfValues();
vtkm::Id newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTag()),
stencil.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(size, DeviceAdapterTag()),
unary_predicate);
output.Shrink(newSize);
}
template<typename T, class Storage>
VTKM_CONT_EXPORT static void Unique(
vtkm::cont::ArrayHandle<T,Storage> &values)
{
vtkm::Id newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTag()));
values.Shrink(newSize);
}
template<typename T, class Storage, class BinaryCompare>
VTKM_CONT_EXPORT static void Unique(
vtkm::cont::ArrayHandle<T,Storage> &values,
BinaryCompare binary_compare)
{
vtkm::Id newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTag()),binary_compare);
values.Shrink(newSize);
}
template<typename T, class SIn, class SVal, class SOut>
VTKM_CONT_EXPORT static void UpperBounds(
const vtkm::cont::ArrayHandle<T,SIn>& input,
const vtkm::cont::ArrayHandle<T,SVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output)
{
vtkm::Id numberOfValues = values.GetNumberOfValues();
UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
values.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()));
}
template<typename T, class SIn, class SVal, class SOut, class BinaryCompare>
VTKM_CONT_EXPORT static void UpperBounds(
const vtkm::cont::ArrayHandle<T,SIn>& input,
const vtkm::cont::ArrayHandle<T,SVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id,SOut>& output,
BinaryCompare binary_compare)
{
vtkm::Id numberOfValues = values.GetNumberOfValues();
UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
values.PrepareForInput(DeviceAdapterTag()),
output.PrepareForOutput(numberOfValues, DeviceAdapterTag()),
binary_compare);
}
template<class SIn, class SOut>
VTKM_CONT_EXPORT static void UpperBounds(
const vtkm::cont::ArrayHandle<vtkm::Id,SIn> &input,
vtkm::cont::ArrayHandle<vtkm::Id,SOut> &values_output)
{
UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTag()),
values_output.PrepareForInPlace(DeviceAdapterTag()));
}
};
}
}
}
} // namespace vtkm::cont::cuda::internal
#endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h