Merge topic 'kokkos-opt-scan'

e57f5a175 Fix DeviceAdapterAlgorithmGeneral Reduce
1b7fc3d3e Use `Kokkos::parallel_scan` for Scan functionality

Acked-by: Kitware Robot <kwrobot@kitware.com>
Merge-request: !2360
This commit is contained in:
Sujin Philip 2021-01-08 15:32:58 +00:00 committed by Kitware Robot
commit 4fac642273
3 changed files with 381 additions and 54 deletions

@ -12,6 +12,7 @@
#include <vtkm/Types.h>
#include <vtkm/cont/BitField.h>
#include <vtkm/cont/DeviceAdapter.h>
#include <vtkm/cont/ExecutionObjectBase.h>
#include <vtkm/cont/Token.h>

@ -12,8 +12,8 @@
#define vtk_m_cont_internal_DeviceAdapterAlgorithmGeneral_h
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ArrayHandleDecorator.h>
#include <vtkm/cont/ArrayHandleDiscard.h>
#include <vtkm/cont/ArrayHandleImplicit.h>
#include <vtkm/cont/ArrayHandleIndex.h>
#include <vtkm/cont/ArrayHandleView.h>
#include <vtkm/cont/ArrayHandleZip.h>
@ -529,6 +529,33 @@ public:
//--------------------------------------------------------------------------
// Reduce
private:
// Decorator implementation handed to `make_ArrayHandleDecorator` by `Reduce`.
// It stores the initial value and the binary operator of the reduction and,
// for a given input portal, builds the `ReduceKernel` that serves as the
// first-level reduction functor.
template <typename T, typename BinaryFunctor>
class ReduceDecoratorImpl
{
public:
  VTKM_CONT ReduceDecoratorImpl() = default;

  // Captures copies of the initial value and the reduction operator.
  VTKM_CONT
  ReduceDecoratorImpl(const T& initialValue, const BinaryFunctor& binaryFunctor)
    : InitialValue(initialValue)
    , ReduceOperator(binaryFunctor)
  {
  }

  // Builds the kernel that reads from `portal`, forwarding the stored initial
  // value and operator unchanged.
  template <typename Portal>
  VTKM_CONT ReduceKernel<Portal, T, BinaryFunctor> CreateFunctor(const Portal& portal) const
  {
    using KernelType = ReduceKernel<Portal, T, BinaryFunctor>;
    return KernelType(portal, this->InitialValue, this->ReduceOperator);
  }

private:
  T InitialValue;
  BinaryFunctor ReduceOperator;
};
public:
template <typename T, typename U, class CIn>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input, U initialValue)
{
@ -544,24 +571,16 @@ public:
{
VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
vtkm::cont::Token token;
//Crazy Idea:
//We create a implicit array handle that wraps the input
//array handle. The implicit functor is passed the input array handle, and
//the number of elements it needs to sum. This way the implicit handle
//acts as the first level reduction. Say for example reducing 16 values
//at a time.
//
//Now that we have an implicit array that is 1/16 the length of full array
//we can use scan inclusive to compute the final sum
auto inputPortal = input.PrepareForInput(DeviceAdapterTag(), token);
ReduceKernel<decltype(inputPortal), U, BinaryFunctor> kernel(
inputPortal, initialValue, binary_functor);
//We perform the reduction in two levels. The first level is performed by
//an `ArrayHandleDecorator` which reduces 16 input values and maps them to
//one value. The decorator array is then 1/16 the length of the input array,
//and we can use inclusive scan as the second level to compute the final
//result.
vtkm::Id length = (input.GetNumberOfValues() / 16);
length += (input.GetNumberOfValues() % 16 == 0) ? 0 : 1;
auto reduced = vtkm::cont::make_ArrayHandleImplicit(kernel, length);
auto reduced = vtkm::cont::make_ArrayHandleDecorator(
length, ReduceDecoratorImpl<U, BinaryFunctor>(initialValue, binary_functor), input);
vtkm::cont::ArrayHandle<U, vtkm::cont::StorageTagBasic> inclusiveScanStorage;
const U scanResult =

@ -10,10 +10,9 @@
#ifndef vtk_m_cont_kokkos_internal_DeviceAdapterAlgorithmKokkos_h
#define vtk_m_cont_kokkos_internal_DeviceAdapterAlgorithmKokkos_h
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/ArrayHandleImplicit.h>
#include <vtkm/cont/ArrayHandleIndex.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h>
#include <vtkm/cont/vtkm_cont_export.h>
@ -164,6 +163,7 @@ struct ReductionIdentity<vtkm::BitwiseOr, ResultType>
}
} // kokkos::internal
//=============================================================================
template <>
struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagKokkos>
: vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
@ -208,6 +208,7 @@ public:
vtkm::Id{ 0 });
}
//----------------------------------------------------------------------------
using Superclass::Copy;
template <typename T>
@ -227,66 +228,104 @@ public:
Kokkos::deep_copy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), viewOut, viewIn);
}
//----------------------------------------------------------------------------
private:
template <typename ArrayHandle, typename BinaryFunctor, typename ResultType>
template <typename ArrayHandle, typename BinaryOperator, typename ResultType>
VTKM_CONT static ResultType ReduceImpl(const ArrayHandle& input,
BinaryFunctor binary_functor,
BinaryOperator binaryOperator,
ResultType initialValue,
std::false_type)
{
return Superclass::Reduce(input, initialValue, binary_functor);
return Superclass::Reduce(input, initialValue, binaryOperator);
}
template <typename ArrayPortal, typename BinaryFunctor, typename ResultType>
class ReduceFunctor
template <typename BinaryOperator, typename FunctorOperator, typename ResultType>
class KokkosReduceFunctor
{
public:
using size_type = vtkm::Id;
using value_type = ResultType;
KOKKOS_INLINE_FUNCTION
ReduceFunctor() {}
KokkosReduceFunctor() {}
KOKKOS_INLINE_FUNCTION
explicit ReduceFunctor(const ArrayPortal& portal, const BinaryFunctor& op)
: Portal(portal)
, Operator(op)
template <typename... Args>
KOKKOS_INLINE_FUNCTION explicit KokkosReduceFunctor(const BinaryOperator& op, Args... args)
: Operator(op)
, Functor(std::forward<Args>(args)...)
{
}
KOKKOS_INLINE_FUNCTION
void operator()(const size_type i, value_type& update) const
{
update = this->Operator(update, this->Portal.Get(i));
}
KOKKOS_INLINE_FUNCTION
void join(volatile value_type& dst, const volatile value_type& src) const
{
dst = this->Operator(dst, src);
}
KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
KOKKOS_INLINE_FUNCTION
void init(value_type& dst) const
{
dst = kokkos::internal::ReductionIdentity<BinaryFunctor, value_type>::value;
dst = kokkos::internal::ReductionIdentity<BinaryOperator, value_type>::value;
}
// Reduce operator
KOKKOS_INLINE_FUNCTION
void operator()(vtkm::Id i, ResultType& update) const
{
this->Functor(this->Operator, i, update);
}
// Scan operator
KOKKOS_INLINE_FUNCTION
void operator()(vtkm::Id i, ResultType& update, const bool final) const
{
this->Functor(this->Operator, i, update, final);
}
private:
BinaryOperator Operator;
FunctorOperator Functor;
};
template <typename ArrayPortal, typename BinaryOperator, typename ResultType>
class ReduceOperator
{
public:
KOKKOS_INLINE_FUNCTION
ReduceOperator() {}
KOKKOS_INLINE_FUNCTION
explicit ReduceOperator(const ArrayPortal& portal)
: Portal(portal)
{
}
KOKKOS_INLINE_FUNCTION
void operator()(const BinaryOperator& op, vtkm::Id i, ResultType& update) const
{
update = op(update, this->Portal.Get(i));
}
private:
ArrayPortal Portal;
BinaryFunctor Operator;
};
template <typename ArrayHandle, typename BinaryFunctor, typename ResultType>
template <typename BinaryOperator, typename ArrayPortal, typename ResultType>
using ReduceFunctor = KokkosReduceFunctor<BinaryOperator,
ReduceOperator<ArrayPortal, BinaryOperator, ResultType>,
ResultType>;
template <typename ArrayHandle, typename BinaryOperator, typename ResultType>
VTKM_CONT static ResultType ReduceImpl(const ArrayHandle& input,
BinaryFunctor binary_functor,
BinaryOperator binaryOperator,
ResultType initialValue,
std::true_type)
{
vtkm::cont::Token token;
auto inputPortal = input.PrepareForInput(vtkm::cont::DeviceAdapterTagKokkos{}, token);
ReduceFunctor<decltype(inputPortal), BinaryFunctor, ResultType> functor(inputPortal,
binary_functor);
ReduceFunctor<BinaryOperator, decltype(inputPortal), ResultType> functor(binaryOperator,
inputPortal);
ResultType result;
@ -294,36 +333,47 @@ private:
vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, input.GetNumberOfValues());
Kokkos::parallel_reduce(policy, functor, result);
return binary_functor(initialValue, result);
return binaryOperator(initialValue, result);
}
template <bool P1, typename BinaryFunctor, typename ResultType>
template <bool P1, typename BinaryOperator, typename ResultType>
struct UseKokkosReduceP1 : std::false_type
{
};
template <typename BinaryFunctor, typename ResultType>
struct UseKokkosReduceP1<true, BinaryFunctor, ResultType>
template <typename BinaryOperator, typename ResultType>
struct UseKokkosReduceP1<true, BinaryOperator, ResultType>
: vtkm::internal::is_type_complete<
kokkos::internal::ReductionIdentity<BinaryFunctor, ResultType>>
kokkos::internal::ReductionIdentity<BinaryOperator, ResultType>>
{
};
template <typename BinaryFunctor, typename ResultType>
template <typename BinaryOperator, typename ResultType>
struct UseKokkosReduce
: UseKokkosReduceP1<
vtkm::internal::is_type_complete<Kokkos::reduction_identity<ResultType>>::value,
BinaryFunctor,
BinaryOperator,
ResultType>
{
};
public:
template <typename T, typename U, class CIn, class BinaryFunctor>
template <typename T, typename U, class CIn, class BinaryOperator>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input,
U initialValue,
BinaryFunctor binary_functor)
BinaryOperator binaryOperator)
{
VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
if (input.GetNumberOfValues() == 0)
{
return initialValue;
}
if (input.GetNumberOfValues() == 1)
{
return binaryOperator(initialValue, input.ReadPortal().Get(0));
}
#if defined(VTKM_KOKKOS_CUDA)
// Kokkos reduce is having some issues with the cuda backend. Please refer to issue #586.
// Following is a work around where we use the Superclass reduce implementation when using
@ -331,21 +381,276 @@ public:
std::integral_constant<
bool,
!std::is_same<vtkm::cont::kokkos::internal::ExecutionSpace, Kokkos::Cuda>::value &&
UseKokkosReduce<BinaryFunctor, U>::value>
UseKokkosReduce<BinaryOperator, U>::value>
use_kokkos_reduce;
#else
typename UseKokkosReduce<BinaryFunctor, U>::type use_kokkos_reduce;
typename UseKokkosReduce<BinaryOperator, U>::type use_kokkos_reduce;
#endif
return ReduceImpl(input, binary_functor, initialValue, use_kokkos_reduce);
return ReduceImpl(input, binaryOperator, initialValue, use_kokkos_reduce);
}
// Convenience overload: reduces `input` with `vtkm::Add`, starting from
// `initialValue`, by delegating to the operator-taking overload.
template <typename T, typename U, class CIn>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input, U initialValue)
{
  VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

  const vtkm::Add addition{};
  return Reduce(input, initialValue, addition);
}
//----------------------------------------------------------------------------
private:
// The Kokkos scan path is enabled under exactly the same conditions as the
// Kokkos reduce path, so the reduce trait is reused instead of duplicated.
template <typename Operator, typename Result>
using UseKokkosScan = UseKokkosReduce<Operator, Result>;
// Fallback selected by the `std::false_type` tag: the Kokkos scan is not used
// for this operator/type combination, so the call is forwarded to the
// superclass implementation.
template <typename T, typename StorageIn, typename StorageOut, typename BinaryOperator>
VTKM_CONT static T ScanExclusiveImpl(const vtkm::cont::ArrayHandle<T, StorageIn>& input,
                                     vtkm::cont::ArrayHandle<T, StorageOut>& output,
                                     BinaryOperator binaryOperator,
                                     const T& initialValue,
                                     std::false_type /* use Kokkos scan */)
{
  using General = Superclass;
  return General::ScanExclusive(input, output, binaryOperator, initialValue);
}
template <typename T, typename StorageIn, typename StorageOut, typename BinaryOperator>
class ScanExclusiveOperator
{
private:
using ArrayPortalIn =
typename ArrayHandle<T,
StorageIn>::template ExecutionTypes<DeviceAdapterTagKokkos>::PortalConst;
using ArrayPortalOut =
typename ArrayHandle<T, StorageOut>::template ExecutionTypes<DeviceAdapterTagKokkos>::Portal;
public:
KOKKOS_INLINE_FUNCTION
ScanExclusiveOperator() {}
KOKKOS_INLINE_FUNCTION
explicit ScanExclusiveOperator(const ArrayPortalIn& portalIn,
const ArrayPortalOut& portalOut,
const T& initialValue)
: PortalIn(portalIn)
, PortalOut(portalOut)
, InitialValue(initialValue)
{
}
KOKKOS_INLINE_FUNCTION
void operator()(const BinaryOperator& op, const vtkm::Id i, T& update, const bool final) const
{
auto val = this->PortalIn.Get(i);
if (i == 0)
{
update = InitialValue;
}
if (final)
{
this->PortalOut.Set(i, update);
}
update = op(update, val);
}
private:
ArrayPortalIn PortalIn;
ArrayPortalOut PortalOut;
T InitialValue;
};
// Functor type passed to Kokkos for the exclusive scan: the shared
// `KokkosReduceFunctor` wrapper parameterized with `ScanExclusiveOperator`.
template <typename Operator, typename ValueType, typename InStorage, typename OutStorage>
using ScanExclusiveFunctor =
  KokkosReduceFunctor<Operator,
                      ScanExclusiveOperator<ValueType, InStorage, OutStorage, Operator>,
                      ValueType>;
// Kokkos path selected by the `std::true_type` tag: runs the exclusive scan
// with `Kokkos::parallel_scan` over [0, number of input values) and returns
// the value the scan reports back.
template <typename T, typename StorageIn, typename StorageOut, typename BinaryOperator>
VTKM_CONT static T ScanExclusiveImpl(const vtkm::cont::ArrayHandle<T, StorageIn>& input,
                                     vtkm::cont::ArrayHandle<T, StorageOut>& output,
                                     BinaryOperator binaryOperator,
                                     const T& initialValue,
                                     std::true_type /* use Kokkos scan */)
{
  using Device = vtkm::cont::DeviceAdapterTagKokkos;
  using Functor = ScanExclusiveFunctor<BinaryOperator, T, StorageIn, StorageOut>;
  using Policy = Kokkos::RangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace, vtkm::Id>;

  const vtkm::Id numValues = input.GetNumberOfValues();

  // The token keeps both portals valid for the duration of the scan. The input
  // is prepared before the output, which is sized to match the input.
  vtkm::cont::Token token;
  auto inputPortal = input.PrepareForInput(Device{}, token);
  auto outputPortal = output.PrepareForOutput(numValues, Device{}, token);

  Functor functor(binaryOperator, inputPortal, outputPortal, initialValue);
  Policy policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numValues);

  T total = vtkm::TypeTraits<T>::ZeroInitialization();
  Kokkos::parallel_scan(policy, functor, total);
  return total;
}
public:
// Exclusive scan of `input` into `output` using `binaryOperator`, seeded with
// `initialValue`. Inputs of length 0 and 1 are answered directly; everything
// else is dispatched to `ScanExclusiveImpl`, with a compile-time tag choosing
// between the Kokkos scan and the superclass implementation.
template <typename T, class CIn, class COut, class BinaryOperator>
VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                 vtkm::cont::ArrayHandle<T, COut>& output,
                                 BinaryOperator binaryOperator,
                                 const T& initialValue)
{
  VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

  const vtkm::Id numValues = input.GetNumberOfValues();

  // Empty input: the output is emptied and the seed is the result.
  if (numValues == 0)
  {
    output.Shrink(0);
    return initialValue;
  }

  // Single value: the output holds only the seed. The input value is read
  // before the output is filled.
  if (numValues == 1)
  {
    auto onlyValue = input.ReadPortal().Get(0);
    Fill(output, initialValue, 1);
    return binaryOperator(initialValue, onlyValue);
  }

#if defined(VTKM_KOKKOS_CUDA)
  // Kokkos scan for the cuda backend is not working correctly for int/uint types of 8 and 16 bits.
  constexpr bool isNarrowInteger = std::is_integral<T>::value && (sizeof(T) < 4);
  using UseKokkosTag =
    std::integral_constant<bool, !isNarrowInteger && UseKokkosScan<BinaryOperator, T>::value>;
#else
  using UseKokkosTag = typename UseKokkosScan<BinaryOperator, T>::type;
#endif

  return ScanExclusiveImpl(input, output, binaryOperator, initialValue, UseKokkosTag{});
}
// Convenience overload: exclusive scan with `vtkm::Sum`, seeded with the
// zero-initialized value of `T`.
template <typename T, class CIn, class COut>
VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                 vtkm::cont::ArrayHandle<T, COut>& output)
{
  VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

  const T zero = vtkm::TypeTraits<T>::ZeroInitialization();
  return ScanExclusive(input, output, vtkm::Sum(), zero);
}
//----------------------------------------------------------------------------
private:
// Fallback selected by the `std::false_type` tag: the Kokkos scan is not used
// for this operator/type combination, so the call is forwarded to the
// superclass implementation.
template <typename T, typename StorageIn, typename StorageOut, typename BinaryOperator>
VTKM_CONT static T ScanInclusiveImpl(const vtkm::cont::ArrayHandle<T, StorageIn>& input,
                                     vtkm::cont::ArrayHandle<T, StorageOut>& output,
                                     BinaryOperator binaryOperator,
                                     std::false_type /* use Kokkos scan */)
{
  using General = Superclass;
  return General::ScanInclusive(input, output, binaryOperator);
}
template <typename T, typename StorageIn, typename StorageOut, typename BinaryOperator>
class ScanInclusiveOperator
{
private:
using ArrayPortalIn =
typename ArrayHandle<T,
StorageIn>::template ExecutionTypes<DeviceAdapterTagKokkos>::PortalConst;
using ArrayPortalOut =
typename ArrayHandle<T, StorageOut>::template ExecutionTypes<DeviceAdapterTagKokkos>::Portal;
public:
KOKKOS_INLINE_FUNCTION
ScanInclusiveOperator() {}
KOKKOS_INLINE_FUNCTION
explicit ScanInclusiveOperator(const ArrayPortalIn& portalIn, const ArrayPortalOut& portalOut)
: PortalIn(portalIn)
, PortalOut(portalOut)
{
}
KOKKOS_INLINE_FUNCTION
void operator()(const BinaryOperator& op, const vtkm::Id i, T& update, const bool final) const
{
update = op(update, this->PortalIn.Get(i));
if (final)
{
this->PortalOut.Set(i, update);
}
}
private:
ArrayPortalIn PortalIn;
ArrayPortalOut PortalOut;
};
// Functor type passed to Kokkos for the inclusive scan: the shared
// `KokkosReduceFunctor` wrapper parameterized with `ScanInclusiveOperator`.
template <typename Operator, typename ValueType, typename InStorage, typename OutStorage>
using ScanInclusiveFunctor =
  KokkosReduceFunctor<Operator,
                      ScanInclusiveOperator<ValueType, InStorage, OutStorage, Operator>,
                      ValueType>;
// Kokkos path selected by the `std::true_type` tag: runs the inclusive scan
// with `Kokkos::parallel_scan` over [0, number of input values) and returns
// the value the scan reports back.
template <typename T, typename StorageIn, typename StorageOut, typename BinaryOperator>
VTKM_CONT static T ScanInclusiveImpl(const vtkm::cont::ArrayHandle<T, StorageIn>& input,
                                     vtkm::cont::ArrayHandle<T, StorageOut>& output,
                                     BinaryOperator binaryOperator,
                                     std::true_type /* use Kokkos scan */)
{
  using Device = vtkm::cont::DeviceAdapterTagKokkos;
  using Functor = ScanInclusiveFunctor<BinaryOperator, T, StorageIn, StorageOut>;
  using Policy = Kokkos::RangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace, vtkm::Id>;

  const vtkm::Id numValues = input.GetNumberOfValues();

  // The token keeps both portals valid for the duration of the scan. The input
  // is prepared before the output, which is sized to match the input.
  vtkm::cont::Token token;
  auto inputPortal = input.PrepareForInput(Device{}, token);
  auto outputPortal = output.PrepareForOutput(numValues, Device{}, token);

  Functor functor(binaryOperator, inputPortal, outputPortal);
  Policy policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numValues);

  T total = vtkm::TypeTraits<T>::ZeroInitialization();
  Kokkos::parallel_scan(policy, functor, total);
  return total;
}
public:
// Inclusive scan of `input` into `output` using `binaryOperator`.
//
// Inputs of length 0 and 1 are answered directly; everything else is
// dispatched to `ScanInclusiveImpl`, with a compile-time tag choosing between
// the Kokkos scan and the superclass implementation.
//
// Returns the zero-initialized value of `T` for an empty input, the single
// input value for an input of length 1, and otherwise whatever the selected
// implementation returns.
template <typename T, class CIn, class COut, class BinaryOperator>
VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                 vtkm::cont::ArrayHandle<T, COut>& output,
                                 BinaryOperator binaryOperator)
{
  VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

  vtkm::Id length = input.GetNumberOfValues();
  if (length == 0)
  {
    // Empty the output as ScanExclusive does, so that a previously populated
    // output array does not keep stale values after scanning an empty input.
    output.Shrink(0);
    return vtkm::TypeTraits<T>::ZeroInitialization();
  }
  if (length == 1)
  {
    // The input value is read before the output is filled.
    auto result = input.ReadPortal().Get(0);
    Fill(output, result, 1);
    return result;
  }

#if defined(VTKM_KOKKOS_CUDA)
  // Kokkos scan for the cuda backend is not working correctly for int/uint types of 8 and 16 bits.
  std::integral_constant<bool,
                         !(std::is_integral<T>::value && sizeof(T) < 4) &&
                           UseKokkosScan<BinaryOperator, T>::value>
    use_kokkos_scan;
#else
  typename UseKokkosScan<BinaryOperator, T>::type use_kokkos_scan;
#endif

  return ScanInclusiveImpl(input, output, binaryOperator, use_kokkos_scan);
}
// Convenience overload: inclusive scan with `vtkm::Add`, delegating to the
// operator-taking overload.
template <typename T, class CIn, class COut>
VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                 vtkm::cont::ArrayHandle<T, COut>& output)
{
  VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

  const vtkm::Add addition{};
  return ScanInclusive(input, output, addition);
}
//----------------------------------------------------------------------------
template <typename WType, typename IType>
VTKM_CONT static void ScheduleTask(
vtkm::exec::kokkos::internal::TaskBasic1D<WType, IType>& functor,
@ -415,6 +720,7 @@ public:
ScheduleTask(kernel, rangeMax);
}
//----------------------------------------------------------------------------
private:
template <typename T>
VTKM_CONT static void SortImpl(vtkm::cont::ArrayHandle<T>& values, vtkm::SortLess, std::true_type)
@ -456,6 +762,7 @@ public:
}
};
//=============================================================================
template <>
class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagKokkos>
{