//============================================================================
//  Copyright (c) Kitware, Inc.
//  All rights reserved.
//  See LICENSE.txt for details.
//
//  This software is distributed WITHOUT ANY WARRANTY; without even
//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
//  PURPOSE.  See the above copyright notice for more information.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h

#include <vtkm/Math.h>
#include <vtkm/TypeTraits.h>
#include <vtkm/Types.h>
#include <vtkm/UnaryPredicates.h>

#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ArrayHandleMultiplexer.h>
#include <vtkm/cont/BitField.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/Logging.h>
#include <vtkm/cont/Token.h>
#include <vtkm/cont/vtkm_cont_export.h>

#include <vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h>
#include <vtkm/cont/internal/Hints.h>

#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.h>
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>

#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <vtkm/exec/cuda/internal/WrappedOperators.h>
#include <vtkm/exec/internal/ErrorMessageBuffer.h>

// Disable warnings we check vtkm for but Thrust does not.
#include <vtkm/exec/cuda/internal/ThrustPatches.h>
VTKM_THIRDPARTY_PRE_INCLUDE
//needs to be first
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>

#include <cooperative_groups.h>
#include <cuda.h>
#include <thrust/advance.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/vector.h>
#include <thrust/unique.h>
VTKM_THIRDPARTY_POST_INCLUDE

#include <limits>
#include <memory>

namespace vtkm
{
namespace cont
{
namespace cuda
{

/// \brief Represents how to schedule 1D, 2D, and 3D CUDA kernels
///
/// \c ScheduleParameters represents how VTK-m should schedule different
/// CUDA kernel types. By default VTK-m uses a preset table based on the
/// GPUs found at runtime.
///
/// When these defaults are insufficient for certain projects it is possible
/// to override the defaults by using \c InitScheduleParameters.
///
///
struct VTKM_CONT_EXPORT ScheduleParameters
{
  int one_d_blocks;
  int one_d_threads_per_block;

  int two_d_blocks;
  dim3 two_d_threads_per_block;

  int three_d_blocks;
  dim3 three_d_threads_per_block;
};

/// \brief Specify the custom scheduling to use for VTK-m CUDA kernel launches
///
/// By default VTK-m uses a preset table based on the GPUs found at runtime to
/// determine the best scheduling parameters for a worklet. When these defaults
/// are insufficient for certain projects it is possible to override the defaults
/// by binding a custom function to \c InitScheduleParameters.
///
/// Note: This function must be called before any invocation of any worklets
/// by VTK-m.
///
/// Note: This function will be called for each GPU on a machine.
///
/// \code{.cpp}
///
///  ScheduleParameters CustomScheduleValues(char const* name,
///                                          int major,
///                                          int minor,
///                                          int multiProcessorCount,
///                                          int maxThreadsPerMultiProcessor,
///                                          int maxThreadsPerBlock)
///  {
///
///    ScheduleParameters params  {
///        64 * multiProcessorCount,  //1d blocks
///        64,                        //1d threads per block
///        64 * multiProcessorCount,  //2d blocks
///        { 8, 8, 1 },               //2d threads per block
///        64 * multiProcessorCount,  //3d blocks
///        { 4, 4, 4 } };             //3d threads per block
///    return params;
///  }
/// \endcode
///
VTKM_CONT_EXPORT void InitScheduleParameters(
  vtkm::cont::cuda::ScheduleParameters (*)(char const* name,
                                           int major,
                                           int minor,
                                           int multiProcessorCount,
                                           int maxThreadsPerMultiProcessor,
                                           int maxThreadsPerBlock));

namespace internal
{

#if (defined(VTKM_GCC) || defined(VTKM_CLANG))
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif

template <typename TaskType>
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size)
{
  //see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
  //for why our inc is grid-stride
  const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
  const vtkm::Id inc = blockDim.x * gridDim.x;
  task(start, size, inc);
}

template <typename TaskType>
__global__ void TaskStrided3DLaunch(TaskType task, vtkm::Id3 size)
{
  //This is the 3D version of executing in a grid-stride manner
  const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
                   blockIdx.y * blockDim.y + threadIdx.y,
                   blockIdx.z * blockDim.z + threadIdx.z);
  const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);

  for (vtkm::Id k = start.z; k < size[2]; k += inc.z)
  {
    for (vtkm::Id j = start.y; j < size[1]; j += inc.y)
    {
      task(size, start.x, size[0], inc.x, j, k);
    }
  }
}

template <typename T, typename BinaryOperationType>
__global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
{
  result = binary_op(a, b);
}

#if (defined(VTKM_GCC) || defined(VTKM_CLANG))
#pragma GCC diagnostic pop
#endif

template <typename FunctorType, typename ArgType>
struct FunctorSupportsUnaryImpl
{
  template <typename F, typename A, typename = decltype(std::declval<F>()(std::declval<A>()))>
  static std::true_type has(int);

  template <typename F, typename A>
  static std::false_type has(...);

  using type = decltype(has<FunctorType, ArgType>(0));
};
template <typename FunctorType, typename ArgType>
using FunctorSupportsUnary = typename FunctorSupportsUnaryImpl<FunctorType, ArgType>::type;

template <typename PortalType,
          typename BinaryAndUnaryFunctor,
          typename = FunctorSupportsUnary<BinaryAndUnaryFunctor, typename PortalType::ValueType>>
struct CastPortal;

template <typename PortalType, typename BinaryAndUnaryFunctor>
struct CastPortal<PortalType, BinaryAndUnaryFunctor, std::true_type>
{
  using InputType = typename PortalType::ValueType;
  using ValueType = decltype(std::declval<BinaryAndUnaryFunctor>()(std::declval<InputType>()));

  PortalType Portal;
  BinaryAndUnaryFunctor Functor;

  VTKM_CONT
  CastPortal(const PortalType& portal, const BinaryAndUnaryFunctor& functor)
    : Portal(portal)
    , Functor(functor)
  {
  }

  VTKM_EXEC
  vtkm::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }

  VTKM_EXEC
  ValueType Get(vtkm::Id index) const { return this->Functor(this->Portal.Get(index)); }
};

template <typename PortalType, typename BinaryFunctor>
struct CastPortal<PortalType, BinaryFunctor, std::false_type>
{
  using InputType = typename PortalType::ValueType;
  using ValueType =
    decltype(std::declval<BinaryFunctor>()(std::declval<InputType>(), std::declval<InputType>()));

  PortalType Portal;

  VTKM_CONT
  CastPortal(const PortalType& portal, const BinaryFunctor&)
    : Portal(portal)
  {
  }

  VTKM_EXEC
  vtkm::Id GetNumberOfValues() const { return this->Portal.GetNumberOfValues(); }

  VTKM_EXEC
  ValueType Get(vtkm::Id index) const { return static_cast<ValueType>(this->Portal.Get(index)); }
};

struct CudaFreeFunctor
{
  void operator()(void* ptr) const { VTKM_CUDA_CALL(cudaFree(ptr)); }
};

template <typename T>
using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;

template <typename T>
CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
{
  T* ptr;
  VTKM_CUDA_CALL(cudaMalloc(&ptr, sizeof(T) * numElements));
  return CudaUniquePtr<T>(ptr);
}
}
} // end namespace cuda::internal
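// For reference, wiring up the custom scheduling function from the
// InitScheduleParameters documentation above is a single call that an
// application would make before invoking any worklets (illustrative sketch;
// CustomScheduleValues is the hypothetical function from the docs):
//
//   vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);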
template <>
struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
  : vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
      vtkm::cont::DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>,
      vtkm::cont::DeviceAdapterTagCuda>
{
// Because of some funny code conversions in nvcc, kernels for devices have to
// be public.
#ifndef VTKM_CUDA
private:
#endif

  using Superclass = vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
    vtkm::cont::DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>,
    vtkm::cont::DeviceAdapterTagCuda>;

  template <typename BitsPortal, typename IndicesPortal, typename GlobalPopCountType>
  struct BitFieldToUnorderedSetFunctor : public vtkm::exec::FunctorBase
  {
    VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
                                            std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
                                            std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
                           "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");

    //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
    //which is the GCC required compiler for CUDA 9.2 on summit/power9
    using Word = vtkm::AtomicTypePreferred;

    VTKM_STATIC_ASSERT(
      VTKM_PASS_COMMAS(std::is_same<typename IndicesPortal::ValueType, vtkm::Id>::value));

    VTKM_CONT
    BitFieldToUnorderedSetFunctor(const BitsPortal& input,
                                  const IndicesPortal& output,
                                  GlobalPopCountType* globalPopCount)
      : Input{ input }
      , Output{ output }
      , GlobalPopCount{ globalPopCount }
      , FinalWordIndex{ input.GetNumberOfWords() - 1 }
      , FinalWordMask(input.GetFinalWordMask())
    {
    }

    ~BitFieldToUnorderedSetFunctor() {}

    VTKM_CONT void Initialize()
    {
      assert(this->GlobalPopCount != nullptr);
      VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
    }

    VTKM_SUPPRESS_EXEC_WARNINGS
    __device__ void operator()(vtkm::Id wordIdx) const
    {
      Word word = this->Input.GetWord(wordIdx);

      // The last word may be partial -- mask out trailing bits if needed.
      const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };

      word &= mask;

      if (word != 0)
      {
        this->LocalPopCount = vtkm::CountSetBits(word);
        this->ReduceAllocate();

        vtkm::Id firstBitIdx = wordIdx * sizeof(Word) * CHAR_BIT;
        do
        {
          // Find next bit. FindFirstSetBit's result is indexed starting at 1.
          vtkm::Int32 bit = vtkm::FindFirstSetBit(word) - 1;
          vtkm::Id outIdx = this->GetNextOutputIndex();
          // Write index of bit
          this->Output.Set(outIdx, firstBitIdx + bit);
          word ^= (1 << bit); // clear bit
        } while (word != 0);  // have bits
      }
    }

    VTKM_CONT vtkm::Id Finalize() const
    {
      assert(this->GlobalPopCount != nullptr);
      GlobalPopCountType result;
      VTKM_CUDA_CALL(cudaMemcpy(
        &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
      return static_cast<vtkm::Id>(result);
    }

  private:
    // Every thread with a non-zero local popcount calls this function, which
    // computes the total popcount for the coalesced threads and allocates
    // a contiguous block in the output by atomically increasing the global
    // popcount.
    VTKM_SUPPRESS_EXEC_WARNINGS
    __device__ void ReduceAllocate() const
    {
      const auto activeLanes = cooperative_groups::coalesced_threads();
      const int activeRank = activeLanes.thread_rank();
      const int activeSize = activeLanes.size();

      // Reduction value:
      vtkm::Int32 rVal = this->LocalPopCount;
      for (int delta = 1; delta < activeSize; delta *= 2)
      {
        const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
        if (activeRank + delta < activeSize)
        {
          rVal += shflVal;
        }
      }

      if (activeRank == 0)
      {
        this->AllocationHead =
          atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
      }

      this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
    }
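    // Worked example of the allocation above (illustrative, assuming a
    // coalesced group of three active lanes with LocalPopCounts {3, 1, 2}):
    // the shuffle-down loop reduces these to rVal == 6 on lane 0, lane 0
    // atomically advances GlobalPopCount by 6 (receiving, say, head H), and
    // the final shfl broadcasts H so all three lanes share the same
    // contiguous output block [H, H+6).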
    // The global output allocation is written to by striding the writes across
    // the warp lanes, allowing the writes to global memory to be coalesced.
    VTKM_SUPPRESS_EXEC_WARNINGS
    __device__ vtkm::Id GetNextOutputIndex() const
    {
      // Only lanes with unwritten output indices left will call this method,
      // so just check the coalesced threads:
      const auto activeLanes = cooperative_groups::coalesced_threads();
      const int activeRank = activeLanes.thread_rank();
      const int activeSize = activeLanes.size();

      vtkm::Id nextIdx = static_cast<vtkm::Id>(this->AllocationHead + activeRank);
      this->AllocationHead += activeSize;

      return nextIdx;
    }

    const BitsPortal Input;
    const IndicesPortal Output;
    GlobalPopCountType* GlobalPopCount;
    mutable vtkm::UInt64 AllocationHead{ 0 };
    mutable vtkm::Int32 LocalPopCount{ 0 };
    // Used to mask trailing bits in the last word.
    vtkm::Id FinalWordIndex{ 0 };
    Word FinalWordMask{ 0 };
  };

  template <class InputPortal, class OutputPortal>
  VTKM_CONT static void CopyPortal(const InputPortal& input, const OutputPortal& output)
  {
    try
    {
      ::thrust::copy(ThrustCudaPolicyPerThread,
                     cuda::internal::IteratorBegin(input),
                     cuda::internal::IteratorEnd(input),
                     cuda::internal::IteratorBegin(output));
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }

  template <class ValueIterator, class StencilPortal, class OutputPortal, class UnaryPredicate>
  VTKM_CONT static vtkm::Id CopyIfPortal(ValueIterator valuesBegin,
                                         ValueIterator valuesEnd,
                                         StencilPortal stencil,
                                         OutputPortal output,
                                         UnaryPredicate unary_predicate)
  {
    auto outputBegin = cuda::internal::IteratorBegin(output);

    using ValueType = typename StencilPortal::ValueType;

    vtkm::exec::cuda::internal::WrappedUnaryPredicate<ValueType, UnaryPredicate> up(
      unary_predicate);

    try
    {
      auto newLast = ::thrust::copy_if(ThrustCudaPolicyPerThread,
                                       valuesBegin,
                                       valuesEnd,
                                       cuda::internal::IteratorBegin(stencil),
                                       outputBegin,
                                       up);
      return static_cast<vtkm::Id>(::thrust::distance(outputBegin, newLast));
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
      return vtkm::Id(0);
    }
  }

  template <class ValuePortal, class StencilPortal, class OutputPortal, class UnaryPredicate>
  VTKM_CONT static vtkm::Id CopyIfPortal(ValuePortal values,
                                         StencilPortal stencil,
                                         OutputPortal output,
                                         UnaryPredicate unary_predicate)
  {
    return CopyIfPortal(cuda::internal::IteratorBegin(values),
                        cuda::internal::IteratorEnd(values),
                        stencil,
                        output,
                        unary_predicate);
  }

  template <class InputPortal, class OutputPortal>
  VTKM_CONT static void CopySubRangePortal(const InputPortal& input,
                                           vtkm::Id inputOffset,
                                           vtkm::Id size,
                                           const OutputPortal& output,
                                           vtkm::Id outputOffset)
  {
    try
    {
      ::thrust::copy_n(ThrustCudaPolicyPerThread,
                       cuda::internal::IteratorBegin(input) + inputOffset,
                       static_cast<std::size_t>(size),
                       cuda::internal::IteratorBegin(output) + outputOffset);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }
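  // Note on CopyIfPortal semantics (illustrative): thrust::copy_if with a
  // stencil performs stream compaction. For values {10, 20, 30, 40} and
  // stencil {0, 1, 0, 1} with the NotZeroInitialized predicate used by the
  // public CopyIf below, the output receives {20, 40} and CopyIfPortal
  // returns 2.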
  template <typename BitsPortal, typename GlobalPopCountType>
  struct CountSetBitsFunctor : public vtkm::exec::FunctorBase
  {
    VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
                                            std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
                                            std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
                           "Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");

    //Using typename BitsPortal::WordTypePreferred causes dependent type errors using GCC 4.8.5
    //which is the GCC required compiler for CUDA 9.2 on summit/power9
    using Word = vtkm::AtomicTypePreferred;

    VTKM_CONT
    CountSetBitsFunctor(const BitsPortal& portal, GlobalPopCountType* globalPopCount)
      : Portal{ portal }
      , GlobalPopCount{ globalPopCount }
      , FinalWordIndex{ portal.GetNumberOfWords() - 1 }
      , FinalWordMask{ portal.GetFinalWordMask() }
    {
    }

    ~CountSetBitsFunctor() {}

    VTKM_CONT void Initialize()
    {
      assert(this->GlobalPopCount != nullptr);
      VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
    }

    VTKM_SUPPRESS_EXEC_WARNINGS
    __device__ void operator()(vtkm::Id wordIdx) const
    {
      Word word = this->Portal.GetWord(wordIdx);

      // The last word may be partial -- mask out trailing bits if needed.
      const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };

      word &= mask;

      if (word != 0)
      {
        this->LocalPopCount = vtkm::CountSetBits(word);
        this->Reduce();
      }
    }

    VTKM_CONT vtkm::Id Finalize() const
    {
      assert(this->GlobalPopCount != nullptr);
      GlobalPopCountType result;
      VTKM_CUDA_CALL(cudaMemcpy(
        &result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
      return static_cast<vtkm::Id>(result);
    }

  private:
    // Every thread with a non-zero local popcount calls this function, which
    // computes the total popcount for the coalesced threads and atomically
    // increases the global popcount.
    VTKM_SUPPRESS_EXEC_WARNINGS
    __device__ void Reduce() const
    {
      const auto activeLanes = cooperative_groups::coalesced_threads();
      const int activeRank = activeLanes.thread_rank();
      const int activeSize = activeLanes.size();

      // Reduction value:
      vtkm::Int32 rVal = this->LocalPopCount;
      for (int delta = 1; delta < activeSize; delta *= 2)
      {
        const vtkm::Int32 shflVal = activeLanes.shfl_down(rVal, delta);
        if (activeRank + delta < activeSize)
        {
          rVal += shflVal;
        }
      }

      if (activeRank == 0)
      {
        atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
      }
    }

    const BitsPortal Portal;
    GlobalPopCountType* GlobalPopCount;
    mutable vtkm::Int32 LocalPopCount{ 0 };
    // Used to mask trailing bits in the last word.
    vtkm::Id FinalWordIndex{ 0 };
    Word FinalWordMask{ 0 };
  };

  template <class InputPortal, class ValuesPortal, class OutputPortal>
  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
                                          const ValuesPortal& values,
                                          const OutputPortal& output)
  {
    using ValueType = typename ValuesPortal::ValueType;
    LowerBoundsPortal(input, values, output, ::thrust::less<ValueType>());
  }

  template <class InputPortal, class OutputPortal>
  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
                                          const OutputPortal& values_output)
  {
    using ValueType = typename InputPortal::ValueType;
    LowerBoundsPortal(input, values_output, values_output, ::thrust::less<ValueType>());
  }

  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
  VTKM_CONT static void LowerBoundsPortal(const InputPortal& input,
                                          const ValuesPortal& values,
                                          const OutputPortal& output,
                                          BinaryCompare binary_compare)
  {
    using ValueType = typename InputPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
      binary_compare);

    try
    {
      ::thrust::lower_bound(ThrustCudaPolicyPerThread,
                            cuda::internal::IteratorBegin(input),
                            cuda::internal::IteratorEnd(input),
                            cuda::internal::IteratorBegin(values),
                            cuda::internal::IteratorEnd(values),
                            cuda::internal::IteratorBegin(output),
                            bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }
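  // LowerBoundsPortal is a vectorized binary search (illustrative): for a
  // sorted input {1, 3, 5, 7} and values {0, 4, 7}, thrust::lower_bound
  // writes the insertion indices {0, 2, 3} to the output portal.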
  template <class InputPortal, typename T>
  VTKM_CONT static T ReducePortal(const InputPortal& input, T initialValue)
  {
    return ReducePortal(input, initialValue, ::thrust::plus<T>());
  }

  template <class InputPortal, typename T, class BinaryFunctor>
  VTKM_CONT static T ReducePortal(const InputPortal& input,
                                  T initialValue,
                                  BinaryFunctor binary_functor)
  {
    using fast_path = std::is_same<typename InputPortal::ValueType, T>;
    return ReducePortalImpl(input, initialValue, binary_functor, fast_path());
  }

  template <class InputPortal, typename T, class BinaryFunctor>
  VTKM_CONT static T ReducePortalImpl(const InputPortal& input,
                                      T initialValue,
                                      BinaryFunctor binary_functor,
                                      std::true_type)
  {
    //The portal type and the initial value are the same so we can use
    //the thrust reduction algorithm
    vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);

    try
    {
      return ::thrust::reduce(ThrustCudaPolicyPerThread,
                              cuda::internal::IteratorBegin(input),
                              cuda::internal::IteratorEnd(input),
                              initialValue,
                              bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
    return initialValue;
  }

  template <class InputPortal, typename T, class BinaryFunctor>
  VTKM_CONT static T ReducePortalImpl(const InputPortal& input,
                                      T initialValue,
                                      BinaryFunctor binary_functor,
                                      std::false_type)
  {
    //The portal type and the initial value AREN'T the same type so we have
    //to use a slower approach, where we wrap the input portal inside a cast
    //portal
    vtkm::cont::cuda::internal::CastPortal<InputPortal, BinaryFunctor> castPortal(input,
                                                                                  binary_functor);

    vtkm::exec::cuda::internal::WrappedBinaryOperator<T, BinaryFunctor> bop(binary_functor);

    try
    {
      return ::thrust::reduce(ThrustCudaPolicyPerThread,
                              cuda::internal::IteratorBegin(castPortal),
                              cuda::internal::IteratorEnd(castPortal),
                              initialValue,
                              bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
    return initialValue;
  }

  template <class KeysPortal,
            class ValuesPortal,
            class KeysOutputPortal,
            class ValueOutputPortal,
            class BinaryFunctor>
  VTKM_CONT static vtkm::Id ReduceByKeyPortal(const KeysPortal& keys,
                                              const ValuesPortal& values,
                                              const KeysOutputPortal& keys_output,
                                              const ValueOutputPortal& values_output,
                                              BinaryFunctor binary_functor)
  {
    auto keys_out_begin = cuda::internal::IteratorBegin(keys_output);
    auto values_out_begin = cuda::internal::IteratorBegin(values_output);

    ::thrust::pair<decltype(keys_out_begin), decltype(values_out_begin)> result_iterators;

    ::thrust::equal_to<typename KeysPortal::ValueType> binaryPredicate;

    using ValueType = typename ValuesPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);

    try
    {
      result_iterators = ::thrust::reduce_by_key(vtkm_cuda_policy(),
                                                 cuda::internal::IteratorBegin(keys),
                                                 cuda::internal::IteratorEnd(keys),
                                                 cuda::internal::IteratorBegin(values),
                                                 keys_out_begin,
                                                 values_out_begin,
                                                 binaryPredicate,
                                                 bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }

    return static_cast<vtkm::Id>(::thrust::distance(keys_out_begin, result_iterators.first));
  }

  template <class InputPortal, class OutputPortal>
  VTKM_CONT static typename InputPortal::ValueType ScanExclusivePortal(const InputPortal& input,
                                                                       const OutputPortal& output)
  {
    using ValueType = typename OutputPortal::ValueType;

    return ScanExclusivePortal(input,
                               output,
                               (::thrust::plus<ValueType>()),
                               vtkm::TypeTraits<ValueType>::ZeroInitialization());
  }

  template <class InputPortal, class OutputPortal, class BinaryFunctor>
  VTKM_CONT static typename InputPortal::ValueType ScanExclusivePortal(
    const InputPortal& input,
    const OutputPortal& output,
    BinaryFunctor binaryOp,
    typename InputPortal::ValueType initialValue)
  {
    // Use iterator to get value so that thrust device_ptr has chance to handle
    // data on device.
    using ValueType = typename OutputPortal::ValueType;

    //we have size three so that we can store the original end value, the
    //new end value, and the sum of those two
    ::thrust::system::cuda::vector<ValueType> sum(3);

    try
    {
      //store the current value of the last position of the array in a
      //separate cuda memory location since the exclusive_scan will overwrite
      //that value once run
      ::thrust::copy_n(
        ThrustCudaPolicyPerThread, cuda::internal::IteratorEnd(input) - 1, 1, sum.begin());

      vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binaryOp);

      auto end = ::thrust::exclusive_scan(ThrustCudaPolicyPerThread,
                                          cuda::internal::IteratorBegin(input),
                                          cuda::internal::IteratorEnd(input),
                                          cuda::internal::IteratorBegin(output),
                                          initialValue,
                                          bop);

      //Store the new value for the end of the array. This is done because
      //with items such as the transpose array it is unsafe to pass the
      //portal to the SumExclusiveScan
      ::thrust::copy_n(ThrustCudaPolicyPerThread, (end - 1), 1, sum.begin() + 1);

      //execute the binaryOp one last time on the device.
      cuda::internal::SumExclusiveScan<<<1, 1, 0, cudaStreamPerThread>>>(
        sum[0], sum[1], sum[2], bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
    return sum[2];
  }
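  // ScanExclusivePortal semantics (illustrative): with a plus operator and
  // input {1, 2, 3, 4}, the output becomes {0, 1, 3, 6} and the returned
  // total is 10 -- the device-side SumExclusiveScan combines the saved
  // original last element (4) with the new last output element (6).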
  template <class InputPortal, class OutputPortal>
  VTKM_CONT static typename InputPortal::ValueType ScanInclusivePortal(const InputPortal& input,
                                                                       const OutputPortal& output)
  {
    using ValueType = typename OutputPortal::ValueType;
    return ScanInclusivePortal(input, output, ::thrust::plus<ValueType>());
  }

  template <class InputPortal, class OutputPortal, class BinaryFunctor>
  VTKM_CONT static typename InputPortal::ValueType ScanInclusivePortal(const InputPortal& input,
                                                                       const OutputPortal& output,
                                                                       BinaryFunctor binary_functor)
  {
    using ValueType = typename OutputPortal::ValueType;

    vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, BinaryFunctor> bop(binary_functor);

    try
    {
      ::thrust::system::cuda::vector<ValueType> result(1);
      auto end = ::thrust::inclusive_scan(ThrustCudaPolicyPerThread,
                                          cuda::internal::IteratorBegin(input),
                                          cuda::internal::IteratorEnd(input),
                                          cuda::internal::IteratorBegin(output),
                                          bop);

      ::thrust::copy_n(ThrustCudaPolicyPerThread, end - 1, 1, result.begin());
      return result[0];
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
      return typename InputPortal::ValueType();
    }

    //return the value at the last index in the array, as that is the sum
  }

  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
  VTKM_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
                                                 const ValuesPortal& values,
                                                 const OutputPortal& output)
  {
    using KeyType = typename KeysPortal::ValueType;
    using ValueType = typename OutputPortal::ValueType;
    ScanInclusiveByKeyPortal(
      keys, values, output, ::thrust::equal_to<KeyType>(), ::thrust::plus<ValueType>());
  }

  template <typename KeysPortal,
            typename ValuesPortal,
            typename OutputPortal,
            typename BinaryPredicate,
            typename AssociativeOperator>
  VTKM_CONT static void ScanInclusiveByKeyPortal(const KeysPortal& keys,
                                                 const ValuesPortal& values,
                                                 const OutputPortal& output,
                                                 BinaryPredicate binary_predicate,
                                                 AssociativeOperator binary_operator)
  {
    using KeyType = typename KeysPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
      binary_predicate);
    using ValueType = typename OutputPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
      binary_operator);

    try
    {
      ::thrust::inclusive_scan_by_key(ThrustCudaPolicyPerThread,
                                      cuda::internal::IteratorBegin(keys),
                                      cuda::internal::IteratorEnd(keys),
                                      cuda::internal::IteratorBegin(values),
                                      cuda::internal::IteratorBegin(output),
                                      bpred,
                                      bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }

  template <typename KeysPortal, typename ValuesPortal, typename OutputPortal>
  VTKM_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
                                                 const ValuesPortal& values,
                                                 const OutputPortal& output)
  {
    using KeyType = typename KeysPortal::ValueType;
    using ValueType = typename OutputPortal::ValueType;
    ScanExclusiveByKeyPortal(keys,
                             values,
                             output,
                             vtkm::TypeTraits<ValueType>::ZeroInitialization(),
                             ::thrust::equal_to<KeyType>(),
                             ::thrust::plus<ValueType>());
  }

  template <typename KeysPortal,
            typename ValuesPortal,
            typename OutputPortal,
            typename T,
            typename BinaryPredicate,
            typename AssociativeOperator>
  VTKM_CONT static void ScanExclusiveByKeyPortal(const KeysPortal& keys,
                                                 const ValuesPortal& values,
                                                 const OutputPortal& output,
                                                 T initValue,
                                                 BinaryPredicate binary_predicate,
                                                 AssociativeOperator binary_operator)
  {
    using KeyType = typename KeysPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryOperator<KeyType, BinaryPredicate> bpred(
      binary_predicate);
    using ValueType = typename OutputPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryOperator<ValueType, AssociativeOperator> bop(
      binary_operator);
    try
    {
      ::thrust::exclusive_scan_by_key(ThrustCudaPolicyPerThread,
                                      cuda::internal::IteratorBegin(keys),
                                      cuda::internal::IteratorEnd(keys),
                                      cuda::internal::IteratorBegin(values),
                                      cuda::internal::IteratorBegin(output),
                                      initValue,
                                      bpred,
                                      bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }
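  // By-key scans restart at each key-segment boundary (illustrative): for
  // keys {0, 0, 1, 1, 1} and values {1, 1, 1, 1, 1}, an inclusive scan by
  // key with a plus operator produces {1, 2, 1, 2, 3}.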
  template <class ValuesPortal>
  VTKM_CONT static void SortPortal(const ValuesPortal& values)
  {
    using ValueType = typename ValuesPortal::ValueType;
    SortPortal(values, ::thrust::less<ValueType>());
  }

  template <class ValuesPortal, class BinaryCompare>
  VTKM_CONT static void SortPortal(const ValuesPortal& values, BinaryCompare binary_compare)
  {
    using ValueType = typename ValuesPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
      binary_compare);
    try
    {
      ::thrust::sort(vtkm_cuda_policy(),
                     cuda::internal::IteratorBegin(values),
                     cuda::internal::IteratorEnd(values),
                     bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }

  template <class KeysPortal, class ValuesPortal>
  VTKM_CONT static void SortByKeyPortal(const KeysPortal& keys, const ValuesPortal& values)
  {
    using ValueType = typename KeysPortal::ValueType;
    SortByKeyPortal(keys, values, ::thrust::less<ValueType>());
  }

  template <class KeysPortal, class ValuesPortal, class BinaryCompare>
  VTKM_CONT static void SortByKeyPortal(const KeysPortal& keys,
                                        const ValuesPortal& values,
                                        BinaryCompare binary_compare)
  {
    using ValueType = typename KeysPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
      binary_compare);
    try
    {
      ::thrust::sort_by_key(vtkm_cuda_policy(),
                            cuda::internal::IteratorBegin(keys),
                            cuda::internal::IteratorEnd(keys),
                            cuda::internal::IteratorBegin(values),
                            bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }

  template <class ValuesPortal>
  VTKM_CONT static vtkm::Id UniquePortal(const ValuesPortal values)
  {
    try
    {
      auto begin = cuda::internal::IteratorBegin(values);
      auto newLast =
        ::thrust::unique(ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values));
      return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
      return vtkm::Id(0);
    }
  }

  template <class ValuesPortal, class BinaryCompare>
  VTKM_CONT static vtkm::Id UniquePortal(const ValuesPortal values, BinaryCompare binary_compare)
  {
    using ValueType = typename ValuesPortal::ValueType;
    vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
      binary_compare);
    try
    {
      auto begin = cuda::internal::IteratorBegin(values);
      auto newLast = ::thrust::unique(
        ThrustCudaPolicyPerThread, begin, cuda::internal::IteratorEnd(values), bop);
      return static_cast<vtkm::Id>(::thrust::distance(begin, newLast));
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
      return vtkm::Id(0);
    }
  }
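  // UniquePortal, like thrust::unique, only collapses *consecutive*
  // duplicates, so callers typically sort first (illustrative): SortPortal
  // on {3, 1, 3, 2} yields {1, 2, 3, 3}, and UniquePortal then shrinks it
  // to {1, 2, 3}, returning the new size 3.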
  template <class InputPortal, class ValuesPortal, class OutputPortal>
  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
                                          const ValuesPortal& values,
                                          const OutputPortal& output)
  {
    try
    {
      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                            cuda::internal::IteratorBegin(input),
                            cuda::internal::IteratorEnd(input),
                            cuda::internal::IteratorBegin(values),
                            cuda::internal::IteratorEnd(values),
                            cuda::internal::IteratorBegin(output));
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }

  template <class InputPortal, class ValuesPortal, class OutputPortal, class BinaryCompare>
  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
                                          const ValuesPortal& values,
                                          const OutputPortal& output,
                                          BinaryCompare binary_compare)
  {
    using ValueType = typename OutputPortal::ValueType;

    vtkm::exec::cuda::internal::WrappedBinaryPredicate<ValueType, BinaryCompare> bop(
      binary_compare);
    try
    {
      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                            cuda::internal::IteratorBegin(input),
                            cuda::internal::IteratorEnd(input),
                            cuda::internal::IteratorBegin(values),
                            cuda::internal::IteratorEnd(values),
                            cuda::internal::IteratorBegin(output),
                            bop);
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }

  template <class InputPortal, class OutputPortal>
  VTKM_CONT static void UpperBoundsPortal(const InputPortal& input,
                                          const OutputPortal& values_output)
  {
    try
    {
      ::thrust::upper_bound(ThrustCudaPolicyPerThread,
                            cuda::internal::IteratorBegin(input),
                            cuda::internal::IteratorEnd(input),
                            cuda::internal::IteratorBegin(values_output),
                            cuda::internal::IteratorEnd(values_output),
                            cuda::internal::IteratorBegin(values_output));
    }
    catch (...)
    {
      cuda::internal::throwAsVTKmException();
    }
  }

  template <typename GlobalPopCountType, typename BitsPortal, typename IndicesPortal>
  VTKM_CONT static vtkm::Id BitFieldToUnorderedSetPortal(const BitsPortal& bits,
                                                         const IndicesPortal& indices)
  {
    using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;

    // RAII for the global atomic counter.
    auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
    Functor functor{ bits, indices, globalCount.get() };

    functor.Initialize();
    Schedule(functor, bits.GetNumberOfWords());
    Synchronize(); // Ensure kernel is done before checking final atomic count
    return functor.Finalize();
  }

  template <typename GlobalPopCountType, typename BitsPortal>
  VTKM_CONT static vtkm::Id CountSetBitsPortal(const BitsPortal& bits)
  {
    using Functor = CountSetBitsFunctor<BitsPortal, GlobalPopCountType>;

    // RAII for the global atomic counter.
    auto globalCount = cuda::internal::make_CudaUniquePtr<GlobalPopCountType>(1);
    Functor functor{ bits, globalCount.get() };

    functor.Initialize();
    Schedule(functor, bits.GetNumberOfWords());
    Synchronize(); // Ensure kernel is done before checking final atomic count
    return functor.Finalize();
  }

  //-----------------------------------------------------------------------------

public:
  template <typename IndicesStorage>
  VTKM_CONT static vtkm::Id BitFieldToUnorderedSet(
    const vtkm::cont::BitField& bits,
    vtkm::cont::ArrayHandle<Id, IndicesStorage>& indices)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id numBits = bits.GetNumberOfBits();

    {
      vtkm::cont::Token token;
      auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
      auto indicesPortal = indices.PrepareForOutput(numBits, DeviceAdapterTagCuda{}, token);

      // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
      numBits = BitFieldToUnorderedSetPortal<vtkm::UInt64>(bitsPortal, indicesPortal);
    }

    indices.Allocate(numBits, vtkm::CopyFlag::On);
    return numBits;
  }

  template <typename T, typename U, class CIn, class COut>
  VTKM_CONT static void Copy(const vtkm::cont::ArrayHandle<T, CIn>& input,
                             vtkm::cont::ArrayHandle<U, COut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id inSize = input.GetNumberOfValues();
    if (inSize <= 0)
    {
      output.Allocate(inSize, vtkm::CopyFlag::On);
      return;
    }
    vtkm::cont::Token token;
    CopyPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
               output.PrepareForOutput(inSize, DeviceAdapterTagCuda(), token));
  }

  template <typename T, typename U, class CIn, class CStencil, class COut>
  VTKM_CONT static void CopyIf(const vtkm::cont::ArrayHandle<U, CIn>& input,
                               const vtkm::cont::ArrayHandle<T, CStencil>& stencil,
                               vtkm::cont::ArrayHandle<U, COut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id size = stencil.GetNumberOfValues();
    if (size <= 0)
    {
      output.Allocate(size, vtkm::CopyFlag::On);
      return;
    }

    vtkm::Id newSize;

    {
      vtkm::cont::Token token;

      newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                             stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
                             output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
                             ::vtkm::NotZeroInitialized()); //yes on the stencil
    }

    output.Allocate(newSize, vtkm::CopyFlag::On);
  }

  template <typename T, typename U, class CIn, class CStencil, class COut, class UnaryPredicate>
  VTKM_CONT static void CopyIf(const vtkm::cont::ArrayHandle<U, CIn>& input,
                               const vtkm::cont::ArrayHandle<T, CStencil>& stencil,
                               vtkm::cont::ArrayHandle<U, COut>& output,
                               UnaryPredicate unary_predicate)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id size = stencil.GetNumberOfValues();
    if (size <= 0)
    {
      output.Allocate(size, vtkm::CopyFlag::On);
      return;
    }

    vtkm::Id newSize;

    {
      vtkm::cont::Token token;

      newSize = CopyIfPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                             stencil.PrepareForInput(DeviceAdapterTagCuda(), token),
                             output.PrepareForOutput(size, DeviceAdapterTagCuda(), token),
                             unary_predicate);
    }

    output.Allocate(newSize, vtkm::CopyFlag::On);
  }
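  // CopyIf usage sketch (illustrative; assumes the ArrayHandles are already
  // filled):
  //
  //   vtkm::cont::ArrayHandle<vtkm::Float32> data;    // e.g. {1.f, 0.f, 2.f}
  //   vtkm::cont::ArrayHandle<vtkm::UInt8> stencil;   // e.g. {1, 0, 1}
  //   vtkm::cont::ArrayHandle<vtkm::Float32> out;
  //   DeviceAdapterAlgorithm<DeviceAdapterTagCuda>::CopyIf(data, stencil, out);
  //   // out now holds {1.f, 2.f} and has been shrunk to size 2.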
  template <typename T, typename U, class CIn, class COut>
  VTKM_CONT static bool CopySubRange(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                     vtkm::Id inputStartIndex,
                                     vtkm::Id numberOfElementsToCopy,
                                     vtkm::cont::ArrayHandle<U, COut>& output,
                                     vtkm::Id outputIndex = 0)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id inSize = input.GetNumberOfValues();

    // Check if the ranges overlap and fail if they do.
    if (input == output &&
        ((outputIndex >= inputStartIndex &&
          outputIndex < inputStartIndex + numberOfElementsToCopy) ||
         (inputStartIndex >= outputIndex &&
          inputStartIndex < outputIndex + numberOfElementsToCopy)))
    {
      return false;
    }

    if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
        inputStartIndex >= inSize)
    { //invalid parameters
      return false;
    }

    //determine if the numberOfElementsToCopy needs to be reduced
    if (inSize < (inputStartIndex + numberOfElementsToCopy))
    { //adjust the size
      numberOfElementsToCopy = (inSize - inputStartIndex);
    }

    const vtkm::Id outSize = output.GetNumberOfValues();
    const vtkm::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
    if (outSize < copyOutEnd)
    { //output is not large enough
      if (outSize == 0)
      { //since output has nothing, just need to allocate to correct length
        output.Allocate(copyOutEnd);
      }
      else
      { //we currently have data in this array, so preserve it in the new
        //resized array
        vtkm::cont::ArrayHandle<U, COut> temp;
        temp.Allocate(copyOutEnd);
        CopySubRange(output, 0, outSize, temp);
        output = temp;
      }
    }

    vtkm::cont::Token token;
    CopySubRangePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                       inputStartIndex,
                       numberOfElementsToCopy,
                       output.PrepareForInPlace(DeviceAdapterTagCuda(), token),
                       outputIndex);
    return true;
  }

  VTKM_CONT static vtkm::Id CountSetBits(const vtkm::cont::BitField& bits)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
    vtkm::cont::Token token;
    auto bitsPortal = bits.PrepareForInput(DeviceAdapterTagCuda{}, token);
    // Use a uint64 for accumulator, as atomicAdd does not support signed int64.
    return CountSetBitsPortal<vtkm::UInt64>(bitsPortal);
  }
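  // CopySubRange behavior sketch (illustrative): copying 3 elements starting
  // at index 1 of {0, 1, 2, 3} into an empty output at outputIndex 0 first
  // allocates the output to length 3, then copies {1, 2, 3}; if the input
  // and output handles alias and the ranges overlap, the call returns false
  // without copying.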
  template <typename T, class CIn, class CVal, class COut>
  VTKM_CONT static void LowerBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                    const vtkm::cont::ArrayHandle<T, CVal>& values,
                                    vtkm::cont::ArrayHandle<vtkm::Id, COut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id numberOfValues = values.GetNumberOfValues();
    vtkm::cont::Token token;
    LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                      values.PrepareForInput(DeviceAdapterTagCuda(), token),
                      output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
  }

  template <typename T, class CIn, class CVal, class COut, class BinaryCompare>
  VTKM_CONT static void LowerBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                    const vtkm::cont::ArrayHandle<T, CVal>& values,
                                    vtkm::cont::ArrayHandle<vtkm::Id, COut>& output,
                                    BinaryCompare binary_compare)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id numberOfValues = values.GetNumberOfValues();
    vtkm::cont::Token token;
    LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                      values.PrepareForInput(DeviceAdapterTagCuda(), token),
                      output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
                      binary_compare);
  }

  template <class CIn, class COut>
  VTKM_CONT static void LowerBounds(const vtkm::cont::ArrayHandle<vtkm::Id, CIn>& input,
                                    vtkm::cont::ArrayHandle<vtkm::Id, COut>& values_output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::cont::Token token;
    LowerBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                      values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
  }

  template <typename T, typename U, class CIn>
  VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input, U initialValue)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = input.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      return initialValue;
    }
    vtkm::cont::Token token;
    return ReducePortal(input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue);
  }

  template <typename T, typename U, class CIn, class BinaryFunctor>
  VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input,
                            U initialValue,
                            BinaryFunctor binary_functor)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = input.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      return initialValue;
    }
    vtkm::cont::Token token;
    return ReducePortal(
      input.PrepareForInput(DeviceAdapterTagCuda(), token), initialValue, binary_functor);
  }

  // At least some versions of Thrust/nvcc result in compile errors when calling Thrust's
  // reduce with sufficiently complex iterators, which can happen with some versions of
  // ArrayHandleMultiplexer. Thus, don't use the Thrust version for ArrayHandleMultiplexer.
  template <typename T, typename U, typename... SIns>
  VTKM_CONT static U Reduce(
    const vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagMultiplexer<SIns...>>& input,
    U initialValue)
  {
    return Superclass::Reduce(input, initialValue);
  }
  template <typename T, typename U, typename BinaryFunctor, typename... SIns>
  VTKM_CONT static U Reduce(
    const vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagMultiplexer<SIns...>>& input,
    U initialValue,
    BinaryFunctor binary_functor)
  {
    return Superclass::Reduce(input, initialValue, binary_functor);
  }

  template <typename T,
            typename U,
            class KIn,
            class VIn,
            class KOut,
            class VOut,
            class BinaryFunctor>
  VTKM_CONT static void ReduceByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
                                    const vtkm::cont::ArrayHandle<U, VIn>& values,
                                    vtkm::cont::ArrayHandle<T, KOut>& keys_output,
                                    vtkm::cont::ArrayHandle<U, VOut>& values_output,
                                    BinaryFunctor binary_functor)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    //there is a concern that by default we will allocate too much
    //space for the keys/values output. 1 option is to
    const vtkm::Id numberOfValues = keys.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      return;
    }

    vtkm::Id reduced_size;
    {
      vtkm::cont::Token token;
      reduced_size = ReduceByKeyPortal(
        keys.PrepareForInput(DeviceAdapterTagCuda(), token),
        values.PrepareForInput(DeviceAdapterTagCuda(), token),
        keys_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
        values_output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
        binary_functor);
    }

    keys_output.Allocate(reduced_size, vtkm::CopyFlag::On);
    values_output.Allocate(reduced_size, vtkm::CopyFlag::On);
  }
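  // ReduceByKey collapses runs of equal keys (illustrative): for keys
  // {0, 0, 1} and values {1, 2, 3} with vtkm::Add, keys_output becomes
  // {0, 1}, values_output becomes {3, 3}, and both output arrays are
  // shrunk to the reduced size of 2.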
  template <typename T, class CIn, class COut>
  VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                   vtkm::cont::ArrayHandle<T, COut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = input.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return vtkm::TypeTraits<T>::ZeroInitialization();
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
    return ScanExclusivePortal(
      inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
  }

  template <typename T, class CIn, class COut, class BinaryFunctor>
  VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                   vtkm::cont::ArrayHandle<T, COut>& output,
                                   BinaryFunctor binary_functor,
                                   const T& initialValue)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = input.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return vtkm::TypeTraits<T>::ZeroInitialization();
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
    return ScanExclusivePortal(
      inputPortal,
      output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
      binary_functor,
      initialValue);
  }

  template <typename T, class CIn, class COut>
  VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                   vtkm::cont::ArrayHandle<T, COut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = input.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return vtkm::TypeTraits<T>::ZeroInitialization();
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
    return ScanInclusivePortal(
      inputPortal, output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
  }

  template <typename T, class CIn, class COut, class BinaryFunctor>
  VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                   vtkm::cont::ArrayHandle<T, COut>& output,
                                   BinaryFunctor binary_functor)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = input.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return vtkm::TypeTraits<T>::ZeroInitialization();
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto inputPortal = input.PrepareForInput(DeviceAdapterTagCuda(), token);
    return ScanInclusivePortal(
      inputPortal,
      output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
      binary_functor);
  }
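  // Because the input portal is prepared before the output portal in the
  // scans above, in-place operation is safe (illustrative sketch):
  //
  //   vtkm::cont::ArrayHandle<vtkm::Id> data; // e.g. {1, 2, 3, 4}
  //   auto total =
  //     DeviceAdapterAlgorithm<DeviceAdapterTagCuda>::ScanInclusive(data, data);
  //   // data is now {1, 3, 6, 10}; total == 10.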
  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
  VTKM_CONT static void ScanInclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
                                           const vtkm::cont::ArrayHandle<U, VIn>& values,
                                           vtkm::cont::ArrayHandle<U, VOut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = keys.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return;
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
    auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
    ScanInclusiveByKeyPortal(
      keysPortal,
      valuesPortal,
      output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
  }

  template <typename T, typename U, typename KIn, typename VIn, typename VOut, typename BinaryFunctor>
  VTKM_CONT static void ScanInclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
                                           const vtkm::cont::ArrayHandle<U, VIn>& values,
                                           vtkm::cont::ArrayHandle<U, VOut>& output,
                                           BinaryFunctor binary_functor)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = keys.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return;
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
    auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
    ScanInclusiveByKeyPortal(keysPortal,
                             valuesPortal,
                             output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
                             ::thrust::equal_to<T>(),
                             binary_functor);
  }

  template <typename T, typename U, typename KIn, typename VIn, typename VOut>
  VTKM_CONT static void ScanExclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
                                           const vtkm::cont::ArrayHandle<U, VIn>& values,
                                           vtkm::cont::ArrayHandle<U, VOut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = keys.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return;
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
    auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
    ScanExclusiveByKeyPortal(keysPortal,
                             valuesPortal,
                             output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
                             vtkm::TypeTraits<U>::ZeroInitialization(),
                             ::thrust::equal_to<T>(),
                             vtkm::Add());
  }

  template <typename T, typename U, typename KIn, typename VIn, typename VOut, typename BinaryFunctor>
  VTKM_CONT static void ScanExclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
                                           const vtkm::cont::ArrayHandle<U, VIn>& values,
                                           vtkm::cont::ArrayHandle<U, VOut>& output,
                                           const U& initialValue,
                                           BinaryFunctor binary_functor)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    const vtkm::Id numberOfValues = keys.GetNumberOfValues();
    if (numberOfValues <= 0)
    {
      output.Allocate(0);
      return;
    }

    //We need to call PrepareForInput on the input argument before invoking a
    //function. The order of execution of parameters of a function is undefined
    //so we need to make sure input is called before output, or else the
    //in-place use case breaks.
    vtkm::cont::Token token;
    auto keysPortal = keys.PrepareForInput(DeviceAdapterTagCuda(), token);
    auto valuesPortal = values.PrepareForInput(DeviceAdapterTagCuda(), token);
    ScanExclusiveByKeyPortal(keysPortal,
                             valuesPortal,
                             output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
                             initialValue,
                             ::thrust::equal_to<T>(),
                             binary_functor);
  }

  // We use CUDA pinned memory to reduce the amount of synchronization
  // and memory copies between the host and device.
  struct VTKM_CONT_EXPORT PinnedErrorArray
  {
    char* HostPtr = nullptr;
    char* DevicePtr = nullptr;
    vtkm::Id Size = 0;
  };

  VTKM_CONT_EXPORT
  static const PinnedErrorArray& GetPinnedErrorArray();

  VTKM_CONT_EXPORT
  static void CheckForErrors(); // throws vtkm::cont::ErrorExecution

  VTKM_CONT_EXPORT
  static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);

  VTKM_CONT_EXPORT
  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
                                  vtkm::UInt32& threadsPerBlock,
                                  vtkm::Id size,
                                  vtkm::IdComponent maxThreadsPerBlock);

  VTKM_CONT_EXPORT
  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
                                  dim3& threadsPerBlock,
                                  const dim3& size,
                                  vtkm::IdComponent maxThreadsPerBlock);

  template <typename... Hints, typename... Args>
  static void GetBlocksAndThreads(vtkm::cont::internal::HintList<Hints...>, Args&&... args)
  {
    using ThreadsPerBlock =
      vtkm::cont::internal::HintFind<vtkm::cont::internal::HintList<Hints...>,
                                     vtkm::cont::internal::HintThreadsPerBlock<0>,
                                     vtkm::cont::DeviceAdapterTagCuda>;
    GetBlocksAndThreads(std::forward<Args>(args)..., ThreadsPerBlock::MaxThreads);
  }
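  // Hint resolution sketch (illustrative): a worklet scheduled with
  // vtkm::cont::internal::HintList<vtkm::cont::internal::HintThreadsPerBlock<128>>
  // would resolve ThreadsPerBlock::MaxThreads to 128 here, capping the
  // threads-per-block chosen by GetBlocksAndThreads, while the default
  // HintThreadsPerBlock<0> leaves the preset scheduling table in charge.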
  VTKM_CONT_EXPORT
  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
                              const std::type_info& worklet_info,
                              vtkm::UInt32 blocks,
                              vtkm::UInt32 threadsPerBlock,
                              vtkm::Id size);

  VTKM_CONT_EXPORT
  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
                              const std::type_info& worklet_info,
                              vtkm::UInt32 blocks,
                              dim3 threadsPerBlock,
                              const dim3& size);

public:
  template <typename WType, typename IType, typename Hints>
  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType, Hints>& functor,
                           vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    VTKM_ASSERT(numInstances >= 0);
    if (numInstances < 1)
    {
      // No instances means nothing to run. Just return.
      return;
    }
    CheckForErrors();
    SetupErrorBuffer(functor);

    vtkm::UInt32 blocks, threadsPerBlock;
    GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, numInstances);

#ifdef VTKM_ENABLE_LOGGING
    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
    {
      using FunctorType = std::decay_t<decltype(functor)>;
      cudaFuncAttributes empty_kernel_attrs;
      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
                                           cuda::internal::TaskStrided1DLaunch<FunctorType>));
      LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, numInstances);
    }
#endif

    cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
      functor, numInstances);
  }

  template <typename WType, typename IType, typename Hints>
  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType, Hints>& functor,
                           vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
    if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
    {
      // No instances means nothing to run. Just return.
      return;
    }

    CheckForErrors();
    SetupErrorBuffer(functor);

    const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
                      static_cast<vtkm::UInt32>(rangeMax[1]),
                      static_cast<vtkm::UInt32>(rangeMax[2]));

    vtkm::UInt32 blocks;
    dim3 threadsPerBlock;
    GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, ranges);

#ifdef VTKM_ENABLE_LOGGING
    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
    {
      using FunctorType = std::decay_t<decltype(functor)>;
      cudaFuncAttributes empty_kernel_attrs;
      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
                                           cuda::internal::TaskStrided3DLaunch<FunctorType>));
      LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, ranges);
    }
#endif

    cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
      functor, rangeMax);
  }
  template <typename Hints, typename Functor>
  VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType, Hints> kernel(
      functor);

    ScheduleTask(kernel, numInstances);
  }

  template <typename FunctorType>
  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
  }

  template <typename Hints, typename Functor>
  VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType, Hints> kernel(
      functor);

    ScheduleTask(kernel, rangeMax);
  }

  template <typename FunctorType>
  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
  }

  template <typename T, class Storage>
  VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::cont::Token token;
    SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
  }

  template <typename T, class Storage, class BinaryCompare>
  VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values,
                             BinaryCompare binary_compare)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::cont::Token token;
    SortPortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
  }

  template <typename T, typename U, class StorageT, class StorageU>
  VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
                                  vtkm::cont::ArrayHandle<U, StorageU>& values)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::cont::Token token;
    SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
                    values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
  }

  template <typename T, typename U, class StorageT, class StorageU, class BinaryCompare>
  VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
                                  vtkm::cont::ArrayHandle<U, StorageU>& values,
                                  BinaryCompare binary_compare)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::cont::Token token;
    SortByKeyPortal(keys.PrepareForInPlace(DeviceAdapterTagCuda(), token),
                    values.PrepareForInPlace(DeviceAdapterTagCuda(), token),
                    binary_compare);
  }

  template <typename T, class Storage>
  VTKM_CONT static void Unique(vtkm::cont::ArrayHandle<T, Storage>& values)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id newSize;

    {
      vtkm::cont::Token token;
      newSize = UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token));
    }

    values.Allocate(newSize, vtkm::CopyFlag::On);
  }

  template <typename T, class Storage, class BinaryCompare>
  VTKM_CONT static void Unique(vtkm::cont::ArrayHandle<T, Storage>& values,
                               BinaryCompare binary_compare)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id newSize;
    {
      vtkm::cont::Token token;
      newSize =
        UniquePortal(values.PrepareForInPlace(DeviceAdapterTagCuda(), token), binary_compare);
    }

    values.Allocate(newSize, vtkm::CopyFlag::On);
  }
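  // SortByKey reorders both arrays by the keys (illustrative): for keys
  // {3, 1, 2} and values {30, 10, 20}, SortByKey leaves keys as {1, 2, 3}
  // and values as {10, 20, 30}.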
  template <typename T, class CIn, class CVal, class COut>
  VTKM_CONT static void UpperBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                    const vtkm::cont::ArrayHandle<T, CVal>& values,
                                    vtkm::cont::ArrayHandle<vtkm::Id, COut>& output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id numberOfValues = values.GetNumberOfValues();
    vtkm::cont::Token token;
    UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                      values.PrepareForInput(DeviceAdapterTagCuda(), token),
                      output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token));
  }

  template <typename T, class CIn, class CVal, class COut, class BinaryCompare>
  VTKM_CONT static void UpperBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
                                    const vtkm::cont::ArrayHandle<T, CVal>& values,
                                    vtkm::cont::ArrayHandle<vtkm::Id, COut>& output,
                                    BinaryCompare binary_compare)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::Id numberOfValues = values.GetNumberOfValues();
    vtkm::cont::Token token;
    UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                      values.PrepareForInput(DeviceAdapterTagCuda(), token),
                      output.PrepareForOutput(numberOfValues, DeviceAdapterTagCuda(), token),
                      binary_compare);
  }

  template <class CIn, class COut>
  VTKM_CONT static void UpperBounds(const vtkm::cont::ArrayHandle<vtkm::Id, CIn>& input,
                                    vtkm::cont::ArrayHandle<vtkm::Id, COut>& values_output)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::cont::Token token;
    UpperBoundsPortal(input.PrepareForInput(DeviceAdapterTagCuda(), token),
                      values_output.PrepareForInPlace(DeviceAdapterTagCuda(), token));
  }

  VTKM_CONT static void Synchronize()
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
    CheckForErrors();
  }
};

template <>
class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
{
public:
  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType, Hints>
  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{})
  {
    return { worklet, invocation };
  }

  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType, Hints>
  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = Hints{})
  {
    return { worklet, invocation };
  }

  template <typename WorkletType, typename InvocationType, typename RangeType>
  VTKM_CONT static auto MakeTask(WorkletType& worklet,
                                 InvocationType& invocation,
                                 const RangeType& range)
  {
    return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
  }
};
}
} // namespace vtkm::cont

#endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h