diff --git a/docs/changelog/hints.md b/docs/changelog/hints.md new file mode 100644 index 000000000..10f9bf47a --- /dev/null +++ b/docs/changelog/hints.md @@ -0,0 +1,32 @@ +# Add hints to device adapter scheduler + +The `DeviceAdapter` provides an abstract interface to the accelerator +devices worklets and other algorithms run on. As such, the programmer has +less control over how the device launches each worklet. Each device +adapter has its own configuration parameters and other ways to attempt to +optimize how things are run, but these are always a universal set of +options that are applied to everything run on the device. There is no way +to specify launch parameters for a particular worklet. + +To provide this information, VTK-m now supports `Hint`s to the device +adapter. The `DeviceAdapterAlgorithm::Schedule` method takes a templated +argument that is of the type `HintList`. This object contains a template +list of `Hint` types that provide suggestions on how to launch the parallel +execution. The device adapter will pick out hints that pertain to it and +adjust its launching accordingly. + +These are called hints rather than, say, directives, because they don't +force the device adapter to do anything. The device adapter is free to +ignore any (and all) hints. The point is that the device adapter can take +into account the information to try to optimize for itself. + +A provided hint can be tied to specific device adapters. In this way, a +worklet can further optimize itself. If multiple hints match a device +adapter, the last one in the list will be selected. + +The `Worklet` base now has an internal type named `Hints` that points to a +`HintList` that is applied when the worklet is scheduled. Derived worklet +classes can provide hints by simply defining their own `Hints` type. + +This feature is experimental and consequently hidden in an `internal` +namespace. 
diff --git a/vtkm/cont/Algorithm.h b/vtkm/cont/Algorithm.h index e8002abfc..d197b8c89 100644 --- a/vtkm/cont/Algorithm.h +++ b/vtkm/cont/Algorithm.h @@ -17,6 +17,7 @@ #include #include #include +#include namespace vtkm @@ -932,29 +933,43 @@ struct Algorithm ScanExtended(vtkm::cont::DeviceAdapterTagAny(), input, output, binaryFunctor, initialValue); } - - template + // Should this be deprecated in favor of `RuntimeDeviceTracker`? + template VTKM_CONT static void Schedule(vtkm::cont::DeviceAdapterId devId, Functor functor, vtkm::Id numInstances) { - vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor(), functor, numInstances); + vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor{}, functor, numInstances); } - template + template + VTKM_CONT static void Schedule(vtkm::cont::internal::HintList hints, + Functor functor, + vtkm::Id numInstances) + { + vtkm::cont::TryExecute(detail::ScheduleFunctor{}, hints, functor, numInstances); + } + template VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances) { - Schedule(vtkm::cont::DeviceAdapterTagAny(), functor, numInstances); + Schedule(vtkm::cont::DeviceAdapterTagAny{}, functor, numInstances); } - template + template VTKM_CONT static void Schedule(vtkm::cont::DeviceAdapterId devId, Functor functor, vtkm::Id3 rangeMax) { vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor(), functor, rangeMax); } - template + template + VTKM_CONT static void Schedule(vtkm::cont::internal::HintList hints, + Functor functor, + vtkm::Id3 rangeMax) + { + vtkm::cont::TryExecute(detail::ScheduleFunctor{}, hints, functor, rangeMax); + } + template VTKM_CONT static void Schedule(Functor functor, vtkm::Id3 rangeMax) { Schedule(vtkm::cont::DeviceAdapterTagAny(), functor, rangeMax); diff --git a/vtkm/cont/CMakeLists.txt b/vtkm/cont/CMakeLists.txt index 5ff482a1f..0c19f40a6 100644 --- a/vtkm/cont/CMakeLists.txt +++ b/vtkm/cont/CMakeLists.txt @@ -283,6 +283,11 @@ vtkm_library( NAME vtkm_cont DEVICE_SOURCES 
${device_sources} ) +target_sources(vtkm_cont + PRIVATE + internal/Hints.h +) + add_subdirectory(internal) add_subdirectory(arg) diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu index beeeef528..f8915f162 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu @@ -203,7 +203,8 @@ void DeviceAdapterAlgorithm::CheckForErrors() void DeviceAdapterAlgorithm::GetBlocksAndThreads( vtkm::UInt32& blocks, vtkm::UInt32& threadsPerBlock, - vtkm::Id size) + vtkm::Id size, + vtkm::IdComponent maxThreadsPerBlock) { (void)size; vtkm::cont::cuda::internal::SetupKernelSchedulingParameters(); @@ -215,12 +216,18 @@ void DeviceAdapterAlgorithm::GetBlocksAndThrea const auto& params = cuda::internal::scheduling_1d_parameters[static_cast(deviceId)]; blocks = static_cast(params.first); threadsPerBlock = static_cast(params.second); + // The hint is an upper bound: only shrink the block size, matching the dim3 overload. + if ((maxThreadsPerBlock > 0) && (threadsPerBlock > static_cast(maxThreadsPerBlock))) + { + threadsPerBlock = static_cast(maxThreadsPerBlock); + } } void DeviceAdapterAlgorithm::GetBlocksAndThreads( vtkm::UInt32& blocks, dim3& threadsPerBlock, - const dim3& size) + const dim3& size, + vtkm::IdComponent maxThreadsPerBlock) { vtkm::cont::cuda::internal::SetupKernelSchedulingParameters(); @@ -240,6 +247,27 @@ void DeviceAdapterAlgorithm::GetBlocksAndThrea blocks = static_cast(params.first); threadsPerBlock = params.second; } + + if (maxThreadsPerBlock > 0) + { + while ((threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z) > + static_cast(maxThreadsPerBlock)) + { + // Reduce largest element until threads are small enough. 
+ if (threadsPerBlock.x > threadsPerBlock.y) + { + threadsPerBlock.x /= 2; + } + else if (threadsPerBlock.y > threadsPerBlock.z) + { + threadsPerBlock.y /= 2; + } + else + { + threadsPerBlock.z /= 2; + } + } + } } void DeviceAdapterAlgorithm::LogKernelLaunch( diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h index 21773c441..859a4be0e 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h @@ -1654,10 +1654,24 @@ public: VTKM_CONT_EXPORT static void GetBlocksAndThreads(vtkm::UInt32& blocks, vtkm::UInt32& threadsPerBlock, - vtkm::Id size); + vtkm::Id size, + vtkm::IdComponent maxThreadsPerBlock); VTKM_CONT_EXPORT - static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size); + static void GetBlocksAndThreads(vtkm::UInt32& blocks, + dim3& threadsPerBlock, + const dim3& size, + vtkm::IdComponent maxThreadsPerBlock); + + template + static void GetBlocksAndThreads(vtkm::cont::internal::HintList, Args&&... 
args) + { + using ThreadsPerBlock = + vtkm::cont::internal::HintFind, + vtkm::cont::internal::HintThreadsPerBlock<0>, + vtkm::cont::DeviceAdapterTagCuda>; + GetBlocksAndThreads(std::forward(args)..., ThreadsPerBlock::MaxThreads); + } VTKM_CONT_EXPORT static void LogKernelLaunch(const cudaFuncAttributes& func_attrs, @@ -1674,8 +1688,8 @@ public: const dim3& size); public: - template - static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D& functor, + template + static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D& functor, vtkm::Id numInstances) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -1691,12 +1705,12 @@ public: SetupErrorBuffer(functor); vtkm::UInt32 blocks, threadsPerBlock; - GetBlocksAndThreads(blocks, threadsPerBlock, numInstances); + GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, numInstances); #ifdef VTKM_ENABLE_LOGGING if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches) { - using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D; + using FunctorType = std::decay_t; cudaFuncAttributes empty_kernel_attrs; VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs, cuda::internal::TaskStrided1DLaunch)); @@ -1708,8 +1722,8 @@ public: functor, numInstances); } - template - static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D& functor, + template + static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D& functor, vtkm::Id3 rangeMax) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -1730,12 +1744,12 @@ public: vtkm::UInt32 blocks; dim3 threadsPerBlock; - GetBlocksAndThreads(blocks, threadsPerBlock, ranges); + GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, ranges); #ifdef VTKM_ENABLE_LOGGING if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches) { - using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D; + using FunctorType = std::decay_t; cudaFuncAttributes empty_kernel_attrs; VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs, 
cuda::internal::TaskStrided3DLaunch)); @@ -1747,25 +1761,39 @@ public: functor, rangeMax); } - template - VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances) + template + VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); - vtkm::exec::cuda::internal::TaskStrided1D kernel(functor); + vtkm::exec::cuda::internal::TaskStrided1D kernel( + functor); ScheduleTask(kernel, numInstances); } - template - VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax) + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances); + } + + template + VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); - vtkm::exec::cuda::internal::TaskStrided3D kernel(functor); + vtkm::exec::cuda::internal::TaskStrided3D kernel( + functor); ScheduleTask(kernel, rangeMax); } + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax); + } + template VTKM_CONT static void Sort(vtkm::cont::ArrayHandle& values) { @@ -1894,20 +1922,26 @@ template <> class DeviceTaskTypes { public: - template - static vtkm::exec::cuda::internal::TaskStrided1D - MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id) + template + static vtkm::exec::cuda::internal::TaskStrided1D + MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{}) { - using Task = vtkm::exec::cuda::internal::TaskStrided1D; - return Task(worklet, invocation); + return { worklet, invocation }; } - template - static vtkm::exec::cuda::internal::TaskStrided3D - MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3) + template + static vtkm::exec::cuda::internal::TaskStrided3D + 
MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = Hints{}) { - using Task = vtkm::exec::cuda::internal::TaskStrided3D; - return Task(worklet, invocation); + return { worklet, invocation }; + } + + template + VTKM_CONT static auto MakeTask(WorkletType& worklet, + InvocationType& invocation, + const RangeType& range) + { + return MakeTask>(worklet, invocation, range); } }; } diff --git a/vtkm/cont/internal/CMakeLists.txt b/vtkm/cont/internal/CMakeLists.txt index 9146f36ce..74c4459ce 100644 --- a/vtkm/cont/internal/CMakeLists.txt +++ b/vtkm/cont/internal/CMakeLists.txt @@ -25,6 +25,7 @@ set(headers DeviceAdapterListHelpers.h FieldCollection.h FunctorsGeneral.h + Hints.h IteratorFromArrayPortal.h KXSort.h MapArrayPermutation.h diff --git a/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h b/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h index 9fc50061a..8f9cb794c 100644 --- a/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h +++ b/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -58,20 +59,30 @@ namespace internal /// : DeviceAdapterAlgorithmGeneral, /// DeviceAdapterTagFoo> /// { -/// template -/// VTKM_CONT static void Schedule(Functor functor, -/// vtkm::Id numInstances) +/// template +/// VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances) /// { /// ... /// } /// -/// template -/// VTKM_CONT static void Schedule(Functor functor, -/// vtkm::Id3 maxRange) +/// template +/// VTKM_CONT static void Schedule(Functor&& functor, vtkm::Id numInstances) +/// { +/// Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances); +/// } +/// +/// template +/// VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id3 maxRange) /// { /// ... 
/// } /// +/// template +/// VTKM_CONT static void Schedule(Functor&& functor, vtkm::Id3 maxRange) +/// { +/// Schedule(vtkm::cont::internal::HintList<>{}, functor, maxRange); +/// } +/// /// VTKM_CONT static void Synchronize() /// { /// ... diff --git a/vtkm/cont/internal/Hints.h b/vtkm/cont/internal/Hints.h new file mode 100644 index 000000000..acd35a2f9 --- /dev/null +++ b/vtkm/cont/internal/Hints.h @@ -0,0 +1,124 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +//============================================================================ +#ifndef vtk_m_cont_internal_Hints_h +#define vtk_m_cont_internal_Hints_h + +#include +#include + +#include + +namespace vtkm +{ +namespace cont +{ +namespace internal +{ + +/// @brief Representation of a hint for execution. +/// +/// A hint is a (potentially) device independent parameter that can be used when +/// scheduling parallel execution on a device. Control-side code can provide hints +/// when scheduling parallel device execution to provide some context about what +/// is being run and potentially optimize the algorithm. An implementation for +/// a device adapter can choose to use or ignore hints. Likewise, a hint can be +/// attached to a specific list of devices. +/// +/// This base class is not intended to be used directly. Use one of the +/// derived hint structures to specify a hint. +template +struct HintBase +{ + using Derived = Derived_; + using Tag = Tag_; + using DeviceList = DeviceList_; +}; + +struct HintTagThreadsPerBlock +{ +}; + +/// @brief Suggest the number of threads to use when scheduling blocks of threads. 
+/// +/// Many accelerator devices, particularly GPUs, schedule threads in blocks. This +/// hint suggests the size of block to use during the scheduling. +template +struct HintThreadsPerBlock + : HintBase, HintTagThreadsPerBlock, DeviceList_> +{ + static constexpr vtkm::IdComponent MaxThreads = MaxThreads_; +}; + +/// @brief Container for hints. +/// +/// When scheduling or invoking a parallel routine, the caller can provide a list +/// of hints to suggest the best way to execute the routine. This list is provided +/// as arguments to a `HintList` template and passed as an argument. +template +struct HintList : vtkm::List +{ + using List = vtkm::List; +}; + +template +struct IsHintList : std::false_type +{ +}; +template +struct IsHintList> : std::true_type +{ +}; + +/// @brief Performs a static assert that the given object is a hint list. +/// +/// If the provided type is a `vtkm::cont::internal::HintList`, then this macro +/// does nothing. If the type is anything else, a compile error will occur. This +/// macro is useful for checking that template arguments are an expected hint +/// list. This helps diagnose improper template use more easily. +#define VTKM_IS_HINT_LIST(T) VTKM_STATIC_ASSERT(::vtkm::cont::internal::IsHintList::value) + +namespace detail +{ + +template +struct FindHintOperators +{ + VTKM_IS_DEVICE_ADAPTER_TAG(Device); + + template + using HintMatches = vtkm::internal::meta::And, + vtkm::ListHas>; + template + using ReduceOperator = typename std::conditional::value, Next, Found>::type; +}; + +} // namespace detail + +/// @brief Find a hint of a particular type. +/// +/// The `HintFind` template can be used to find a hint of a particular type. +/// `HintFind` is provided a default value to use for a hint, and it returns +/// a hint in the hint list that matches the type of the provided default and +/// applies to the provided device tag. +/// +/// If multiple hints match the type and device, the _last_ one in the list +/// is returned. 
Thus, when constructing hint lists, put the more general hints +/// first and more specific ones last. +template +using HintFind = vtkm::ListReduce< + typename HList::List, + detail::FindHintOperators::template ReduceOperator, + DefaultHint>; + +} +} +} // namespace vtkm::cont::internal + +#endif // vtk_m_cont_internal_Hints_h diff --git a/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h b/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h index 1a16ef492..73ff14f70 100644 --- a/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h +++ b/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h @@ -670,9 +670,9 @@ public: } //---------------------------------------------------------------------------- - template + template VTKM_CONT static void ScheduleTask( - vtkm::exec::kokkos::internal::TaskBasic1D& functor, + vtkm::exec::kokkos::internal::TaskBasic1D& functor, vtkm::Id numInstances) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -685,15 +685,22 @@ public: functor.SetErrorMessageBuffer(GetErrorMessageBufferInstance()); - Kokkos::RangePolicy policy( - vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numInstances); + constexpr vtkm::IdComponent maxThreadsPerBlock = + vtkm::cont::internal::HintFind, + vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads; + + Kokkos::RangePolicy, + Kokkos::IndexType> + policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numInstances); Kokkos::parallel_for(policy, functor); CheckForErrors(); // synchronizes } - template + template VTKM_CONT static void ScheduleTask( - vtkm::exec::kokkos::internal::TaskBasic3D& functor, + vtkm::exec::kokkos::internal::TaskBasic3D& functor, vtkm::Id3 rangeMax) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -706,7 +713,13 @@ public: functor.SetErrorMessageBuffer(GetErrorMessageBufferInstance()); + constexpr vtkm::IdComponent maxThreadsPerBlock = + vtkm::cont::internal::HintFind, + vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads; + 
Kokkos::MDRangePolicy, Kokkos::Rank<3>, Kokkos::IndexType> policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), @@ -729,24 +742,38 @@ public: CheckForErrors(); // synchronizes } - template - VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances) + template + VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); - vtkm::exec::kokkos::internal::TaskBasic1D kernel(functor); + vtkm::exec::kokkos::internal::TaskBasic1D kernel( + functor); ScheduleTask(kernel, numInstances); } - template - VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax) + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances); + } + + template + VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); - vtkm::exec::kokkos::internal::TaskBasic3D kernel(functor); + vtkm::exec::kokkos::internal::TaskBasic3D kernel( + functor); ScheduleTask(kernel, rangeMax); } + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax); + } + //---------------------------------------------------------------------------- private: template @@ -1020,20 +1047,28 @@ template <> class DeviceTaskTypes { public: - template - VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic1D - MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id) + template + VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic1D + MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{}) { - return vtkm::exec::kokkos::internal::TaskBasic1D(worklet, - invocation); + return vtkm::exec::kokkos::internal::TaskBasic1D( + worklet, invocation); } - template - VTKM_CONT static 
vtkm::exec::kokkos::internal::TaskBasic3D - MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3) + template + VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic3D + MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = {}) { - return vtkm::exec::kokkos::internal::TaskBasic3D(worklet, - invocation); + return vtkm::exec::kokkos::internal::TaskBasic3D( + worklet, invocation); + } + + template + VTKM_CONT static auto MakeTask(WorkletType& worklet, + InvocationType& invocation, + const RangeType& range) + { + return MakeTask>(worklet, invocation, range); } }; } diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 7b2079a93..beddb9f75 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -359,8 +359,8 @@ public: VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::openmp::internal::TaskTiling3D& functor, vtkm::Id3 size); - template - VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id numInstances) + template + VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id numInstances) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -368,8 +368,14 @@ public: ScheduleTask(kernel, numInstances); } - template - VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 rangeMax) + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances); + } + + template + VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 rangeMax) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -377,6 +383,12 @@ public: ScheduleTask(kernel, rangeMax); } + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, 
rangeMax); + } + VTKM_CONT static void Synchronize() { // Nothing to do. This device schedules all of its operations using a @@ -390,21 +402,33 @@ template <> class DeviceTaskTypes { public: - template + template static vtkm::exec::openmp::internal::TaskTiling1D MakeTask(const WorkletType& worklet, const InvocationType& invocation, - vtkm::Id) + vtkm::Id, + Hints = Hints{}) { + // Currently ignoring hints. return vtkm::exec::openmp::internal::TaskTiling1D(worklet, invocation); } - template + template static vtkm::exec::openmp::internal::TaskTiling3D MakeTask(const WorkletType& worklet, const InvocationType& invocation, - vtkm::Id3) + vtkm::Id3, + Hints = Hints{}) { + // Currently ignoring hints. return vtkm::exec::openmp::internal::TaskTiling3D(worklet, invocation); } + + template + VTKM_CONT static auto MakeTask(WorkletType& worklet, + InvocationType& invocation, + const RangeType& range) + { + return MakeTask>(worklet, invocation, range); + } }; } } // namespace vtkm::cont diff --git a/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h b/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h index cc187c68b..463b67cc6 100644 --- a/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h +++ b/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h @@ -400,8 +400,8 @@ public: VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::serial::internal::TaskTiling3D& functor, vtkm::Id3 size); - template - VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id size) + template + VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id size) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -409,8 +409,14 @@ public: ScheduleTask(kernel, size); } - template - VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 size) + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id size) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, size); + } + + template + VTKM_CONT static 
inline void Schedule(Hints, FunctorType functor, vtkm::Id3 size) { VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf); @@ -418,6 +424,12 @@ public: ScheduleTask(kernel, size); } + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 size) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, size); + } + private: template class DeviceTaskTypes { public: - template + template static vtkm::exec::serial::internal::TaskTiling1D MakeTask(WorkletType& worklet, InvocationType& invocation, - vtkm::Id) + vtkm::Id, + Hints = Hints{}) { + // Currently ignoring hints. return vtkm::exec::serial::internal::TaskTiling1D(worklet, invocation); } - template + template static vtkm::exec::serial::internal::TaskTiling3D MakeTask(WorkletType& worklet, InvocationType& invocation, - vtkm::Id3) + vtkm::Id3, + Hints = Hints{}) { + // Currently ignoring hints. return vtkm::exec::serial::internal::TaskTiling3D(worklet, invocation); } + + template + VTKM_CONT static auto MakeTask(WorkletType& worklet, + InvocationType& invocation, + const RangeType& range) + { + return MakeTask>(worklet, invocation, range); + } }; } } // namespace vtkm::cont diff --git a/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h b/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h index 687d84a2b..43c00d925 100644 --- a/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h +++ b/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h @@ -259,8 +259,8 @@ public: VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::tbb::internal::TaskTiling3D& functor, vtkm::Id3 size); - template - VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id numInstances) + template + VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id numInstances) { VTKM_LOG_SCOPE(vtkm::cont::LogLevel::Perf, "Schedule TBB 1D: '%s'", @@ -270,8 +270,14 @@ public: ScheduleTask(kernel, numInstances); } - template - VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 rangeMax) + 
template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances); + } + + template + VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 rangeMax) { VTKM_LOG_SCOPE(vtkm::cont::LogLevel::Perf, "Schedule TBB 3D: '%s'", @@ -281,6 +287,12 @@ public: ScheduleTask(kernel, rangeMax); } + template + VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax) + { + Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax); + } + //1. We need functions for each of the following @@ -421,21 +433,33 @@ template <> class DeviceTaskTypes { public: - template + template static vtkm::exec::tbb::internal::TaskTiling1D MakeTask(WorkletType& worklet, InvocationType& invocation, - vtkm::Id) + vtkm::Id, + Hints = Hints{}) { + // Currently ignoring hints. return vtkm::exec::tbb::internal::TaskTiling1D(worklet, invocation); } - template + template static vtkm::exec::tbb::internal::TaskTiling3D MakeTask(WorkletType& worklet, InvocationType& invocation, - vtkm::Id3) + vtkm::Id3, + Hints = Hints{}) { + // Currently ignoring hints. 
return vtkm::exec::tbb::internal::TaskTiling3D(worklet, invocation); } + + template + VTKM_CONT static auto MakeTask(WorkletType& worklet, + InvocationType& invocation, + const RangeType& range) + { + return MakeTask>(worklet, invocation, range); + } }; } } // namespace vtkm::cont diff --git a/vtkm/cont/testing/CMakeLists.txt b/vtkm/cont/testing/CMakeLists.txt index 98ff167a1..850ecd497 100644 --- a/vtkm/cont/testing/CMakeLists.txt +++ b/vtkm/cont/testing/CMakeLists.txt @@ -107,6 +107,7 @@ set(unit_tests_device UnitTestDataSetPermutation.cxx UnitTestDataSetSingleType.cxx UnitTestDeviceAdapterAlgorithmDependency.cxx + UnitTestHints.cxx UnitTestImplicitFunction.cxx UnitTestParticleArrayCopy.cxx UnitTestPointLocatorSparseGrid.cxx @@ -131,6 +132,11 @@ endif() vtkm_unit_tests(SOURCES ${unit_tests} DEVICE_SOURCES ${unit_tests_device}) +target_sources(UnitTests_vtkm_cont_testing + PRIVATE + UnitTestHints.cxx +) + #add distributed tests i.e.test to run with MPI #if MPI is enabled. set(mpi_unit_tests diff --git a/vtkm/cont/testing/UnitTestHints.cxx b/vtkm/cont/testing/UnitTestHints.cxx new file mode 100644 index 000000000..073005cce --- /dev/null +++ b/vtkm/cont/testing/UnitTestHints.cxx @@ -0,0 +1,108 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. 
+//============================================================================ + +#include + +#include +#include + +#include + +#include + +namespace UnitTestHintNamespace +{ + +void CheckFind() +{ + std::cout << "Empty list returns default.\n"; + VTKM_TEST_ASSERT(vtkm::cont::internal::HintFind, + vtkm::cont::internal::HintThreadsPerBlock<128>, + vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == + 128); + + std::cout << "Find a hint that matches.\n"; + VTKM_TEST_ASSERT(vtkm::cont::internal::HintFind< + vtkm::cont::internal::HintList>, + vtkm::cont::internal::HintThreadsPerBlock<0>, + vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == 128); + VTKM_TEST_ASSERT( + vtkm::cont::internal::HintFind< + vtkm::cont::internal::HintList< + vtkm::cont::internal::HintThreadsPerBlock<128, + vtkm::List>>, + vtkm::cont::internal::HintThreadsPerBlock<0>, + vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == 128); + + std::cout << "Skip a hint that does not match.\n"; + VTKM_TEST_ASSERT( + (vtkm::cont::internal::HintFind< + vtkm::cont::internal::HintList< + vtkm::cont::internal::HintThreadsPerBlock<128, + vtkm::List>>, + vtkm::cont::internal::HintThreadsPerBlock<0>, + vtkm::cont::DeviceAdapterTagSerial>::MaxThreads == 0)); + + std::cout << "Given a list of hints, pick the last one that matches\n"; + { + using HList = vtkm::cont::internal::HintList< + vtkm::cont::internal::HintThreadsPerBlock<64>, + vtkm::cont::internal::HintThreadsPerBlock<128, vtkm::List>, + vtkm::cont::internal::HintThreadsPerBlock<256, + vtkm::List>>; + using HInit = vtkm::cont::internal::HintThreadsPerBlock<0>; + VTKM_TEST_ASSERT((vtkm::cont::internal:: + HintFind::MaxThreads == + 64)); + VTKM_TEST_ASSERT( + (vtkm::cont::internal::HintFind::MaxThreads == + 128)); + VTKM_TEST_ASSERT((vtkm::cont::internal:: + HintFind::MaxThreads == + 256)); + } +} + +struct MyFunctor : vtkm::exec::FunctorBase +{ + VTKM_EXEC void operator()(vtkm::Id vtkmNotUsed(index)) const + { + // NOP + } + + VTKM_EXEC void operator()(vtkm::Id3 
vtkmNotUsed(index)) const + { + // NOP + } +}; + +void CheckSchedule() +{ + std::cout << "Schedule a functor using hints.\n"; + // There is no good way to see if the device adapter got or used the hints + // as device adapters are free to ignore hints. This just tests that the + // hints can be passed. + using Hints = vtkm::cont::internal::HintList>; + vtkm::cont::Algorithm::Schedule(Hints{}, MyFunctor{}, 10); + vtkm::cont::Algorithm::Schedule(Hints{}, MyFunctor{}, vtkm::Id3{ 2 }); +} + +void Run() +{ + CheckFind(); + CheckSchedule(); +} + +} // namespace UnitTestHintNamespace + +int UnitTestHints(int argc, char* argv[]) +{ + return vtkm::cont::testing::Testing::Run(UnitTestHintNamespace::Run, argc, argv); +} diff --git a/vtkm/exec/TaskBase.h b/vtkm/exec/TaskBase.h index 18bd97339..9de749f4d 100644 --- a/vtkm/exec/TaskBase.h +++ b/vtkm/exec/TaskBase.h @@ -12,6 +12,8 @@ #include +#include + #include namespace vtkm diff --git a/vtkm/exec/cuda/internal/TaskStrided.h b/vtkm/exec/cuda/internal/TaskStrided.h index ee9b5818e..98f55e292 100644 --- a/vtkm/exec/cuda/internal/TaskStrided.h +++ b/vtkm/exec/cuda/internal/TaskStrided.h @@ -50,9 +50,11 @@ protected: SetErrorBufferSignature SetErrorBufferFunction = nullptr; }; -template +template class TaskStrided1D : public TaskStrided { + VTKM_IS_HINT_LIST(Hints); + public: TaskStrided1D(const WType& worklet, const IType& invocation) : TaskStrided() @@ -90,9 +92,11 @@ private: const IType Invocation; }; -template -class TaskStrided1D : public TaskStrided +template +class TaskStrided1D : public TaskStrided { + VTKM_IS_HINT_LIST(Hints); + public: TaskStrided1D(WType& worklet) : TaskStrided() @@ -116,9 +120,11 @@ private: typename std::remove_const::type Worklet; }; -template +template class TaskStrided3D : public TaskStrided { + VTKM_IS_HINT_LIST(Hints); + public: TaskStrided3D(const WType& worklet, const IType& invocation) : TaskStrided() @@ -165,9 +171,11 @@ private: const IType Invocation; }; -template -class TaskStrided3D : 
public TaskStrided +template +class TaskStrided3D : public TaskStrided { + VTKM_IS_HINT_LIST(Hints); + public: TaskStrided3D(WType& worklet) : TaskStrided() diff --git a/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu b/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu index c79ac2924..76d960a3d 100644 --- a/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu +++ b/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu @@ -342,8 +342,8 @@ void TestErrorFunctorInvoke() TestExecObject(input.PrepareForInPlace(DeviceAdapter(), token)), TestExecObject(output.PrepareForInPlace(DeviceAdapter(), token))); - using TaskStrided1 = - vtkm::exec::cuda::internal::TaskStrided1D; + using TaskStrided1 = vtkm::exec::cuda::internal:: + TaskStrided1D>; TestWorkletErrorProxy worklet; InvocationType1 invocation(execObjects); diff --git a/vtkm/exec/kokkos/internal/TaskBasic.h b/vtkm/exec/kokkos/internal/TaskBasic.h index 8ce8e6fdb..48ca6d86d 100644 --- a/vtkm/exec/kokkos/internal/TaskBasic.h +++ b/vtkm/exec/kokkos/internal/TaskBasic.h @@ -24,9 +24,11 @@ namespace kokkos namespace internal { -template +template class TaskBasic1D : public vtkm::exec::TaskBase { + VTKM_IS_HINT_LIST(Hints); + public: TaskBasic1D(const WType& worklet, const IType& invocation) : Worklet(worklet) @@ -57,9 +59,11 @@ private: IType Invocation; }; -template -class TaskBasic1D : public vtkm::exec::TaskBase +template +class TaskBasic1D : public vtkm::exec::TaskBase { + VTKM_IS_HINT_LIST(Hints); + public: explicit TaskBasic1D(const WType& worklet) : Worklet(worklet) @@ -78,9 +82,11 @@ private: typename std::remove_const::type Worklet; }; -template +template class TaskBasic3D : public vtkm::exec::TaskBase { + VTKM_IS_HINT_LIST(Hints); + public: TaskBasic3D(const WType& worklet, const IType& invocation) : Worklet(worklet) @@ -112,9 +118,11 @@ private: IType Invocation; }; -template -class TaskBasic3D : public vtkm::exec::TaskBase +template +class TaskBasic3D : public vtkm::exec::TaskBase { + VTKM_IS_HINT_LIST(Hints); + public: explicit 
TaskBasic3D(const WType& worklet) : Worklet(worklet) diff --git a/vtkm/worklet/internal/DispatcherBase.h b/vtkm/worklet/internal/DispatcherBase.h index 03fbe1bc2..92e1b2fa1 100644 --- a/vtkm/worklet/internal/DispatcherBase.h +++ b/vtkm/worklet/internal/DispatcherBase.h @@ -792,7 +792,8 @@ private: // vtkm::exec::internal::TaskSingular // vtkm::exec::internal::TaskTiling1D // vtkm::exec::internal::TaskTiling3D - auto task = TaskTypes::MakeTask(this->Worklet, invocation, range); + auto task = + TaskTypes::MakeTask(this->Worklet, invocation, range, typename WorkletType::Hints{}); Algorithm::ScheduleTask(task, range); } }; diff --git a/vtkm/worklet/internal/WorkletBase.h b/vtkm/worklet/internal/WorkletBase.h index cdf669cbf..2a619719a 100644 --- a/vtkm/worklet/internal/WorkletBase.h +++ b/vtkm/worklet/internal/WorkletBase.h @@ -40,6 +40,8 @@ #include #include +#include + #include #include #include @@ -136,6 +138,11 @@ public: /// everything in the output domain. using MaskType = vtkm::worklet::MaskNone; + /// Worklets can provide hints to the scheduler by defining a `Hints` type that + /// resolves to a `vtkm::cont::internal::HintList`. The default hint list is empty + /// so that scheduling uses all defaults. + using Hints = vtkm::cont::internal::HintList<>; + /// @brief `ControlSignature` tag for whole input arrays. /// /// The `WholeArrayIn` control signature tag specifies a `vtkm::cont::ArrayHandle`