Merge topic 'hints'

c44f68649 Add hints to device adapter scheduler Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Dave Pugmire <dpugmire@gmail.com> Merge-request: !3189
2024-10-05 01:49:02 +00:00 · 2024-02-17 12:47:10 +00:00 · 2024-02-17 12:47:10 +00:00 · af28ec2766
commit af28ec2766
parent 66e27bf683 c44f686496
20 changed files with 598 additions and 102 deletions
--- a/docs/changelog/hints.md
+++ b/docs/changelog/hints.md
@ -0,0 +1,32 @@
 # Add hints to device adapter scheduler
 The `DeviceAdapter` provides an abstract interface to the accelerator
 devices worklets and other algorithms run on. As such, the programmer has
 less control over how the device launches each worklet. Each device
 adapter has its own configuration parameters and other ways to attempt to
 optimize how things are run, but these are always a universal set of
 options that are applied to everything run on the device. There is no way
 to specify launch parameters for a particular worklet.
 To provide this information, VTK-m now supports `Hint`s to the device
 adapter. The `DeviceAdapterAlgorithm::Schedule` method takes a templated
 argument that is of the type `HintList`. This object contains a template
 list of `Hint` types that provide suggestions on how to launch the parallel
 execution. The device adapter will pick out hints that pertain to it and
 adjust its launching accordingly.
 These are called hints rather than, say, directives, because they don't
 force the device adapter to do anything. The device adapter is free to
 ignore any (and all) hints. The point is that the device adapter can take
 into account the information to try to optimize for itself.
 A provided hint can be tied to specific device adapters. In this way, a
 worklet can further optimize itself. If multiple hints match a device
 adapter, the last one in the list will be selected.
 The `Worklet` base now has an internal type named `Hints` that points to a
 `HintList` that is applied when the worklet is scheduled. Derived worklet
 classes can provide hints by simply defining their own `Hints` type.
 This feature is experimental and consequently hidden in an `internal`
 namespace.
--- a/vtkm/cont/Algorithm.h
+++ b/vtkm/cont/Algorithm.h
@ -17,6 +17,7 @@
 #include <vtkm/cont/ExecutionObjectBase.h>
 #include <vtkm/cont/Token.h>
 #include <vtkm/cont/TryExecute.h>
 #include <vtkm/cont/internal/Hints.h>
 namespace vtkm
@ -932,29 +933,43 @@ struct Algorithm
    ScanExtended(vtkm::cont::DeviceAdapterTagAny(), input, output, binaryFunctor, initialValue);
  }
-
+  // Should this be deprecated in favor of `RuntimeDeviceTracker`?
-  template <class Functor>
+  template <typename Functor>
  VTKM_CONT static void Schedule(vtkm::cont::DeviceAdapterId devId,
                                 Functor functor,
                                 vtkm::Id numInstances)
  {
-    vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor(), functor, numInstances);
+    vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor{}, functor, numInstances);
  }
-  template <class Functor>
+  template <typename... Hints, typename Functor>
  VTKM_CONT static void Schedule(vtkm::cont::internal::HintList<Hints...> hints,
                                 Functor functor,
                                 vtkm::Id numInstances)
  {
    vtkm::cont::TryExecute(detail::ScheduleFunctor{}, hints, functor, numInstances);
  }
  template <typename Functor>
  VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
  {
-    Schedule(vtkm::cont::DeviceAdapterTagAny(), functor, numInstances);
+    Schedule(vtkm::cont::DeviceAdapterTagAny{}, functor, numInstances);
  }
-  template <class Functor>
+  template <typename Functor>
  VTKM_CONT static void Schedule(vtkm::cont::DeviceAdapterId devId,
                                 Functor functor,
                                 vtkm::Id3 rangeMax)
  {
    vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor(), functor, rangeMax);
  }
-  template <class Functor>
+  template <typename... Hints, typename Functor>
  VTKM_CONT static void Schedule(vtkm::cont::internal::HintList<Hints...> hints,
                                 Functor functor,
                                 vtkm::Id3 rangeMax)
  {
    vtkm::cont::TryExecute(detail::ScheduleFunctor{}, hints, functor, rangeMax);
  }
  template <typename Functor>
  VTKM_CONT static void Schedule(Functor functor, vtkm::Id3 rangeMax)
  {
    Schedule(vtkm::cont::DeviceAdapterTagAny(), functor, rangeMax);
--- a/vtkm/cont/CMakeLists.txt
+++ b/vtkm/cont/CMakeLists.txt
@ -283,6 +283,11 @@ vtkm_library( NAME vtkm_cont
              DEVICE_SOURCES ${device_sources}
            )
 target_sources(vtkm_cont
  PRIVATE
    internal/Hints.h
 )
 add_subdirectory(internal)
 add_subdirectory(arg)
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
@ -203,7 +203,8 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
 void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  vtkm::UInt32& threadsPerBlock,
-  vtkm::Id size)
+  vtkm::Id size,
  vtkm::IdComponent maxThreadsPerBlock)
 {
  (void)size;
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
@ -215,12 +216,17 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
  const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
  blocks = static_cast<vtkm::UInt32>(params.first);
  threadsPerBlock = static_cast<vtkm::UInt32>(params.second);
  if ((maxThreadsPerBlock > 0) && (threadsPerBlock < static_cast<vtkm::UInt32>(maxThreadsPerBlock)))
  {
    threadsPerBlock = static_cast<vtkm::UInt32>(maxThreadsPerBlock);
  }
 }
 void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  dim3& threadsPerBlock,
-  const dim3& size)
+  const dim3& size,
  vtkm::IdComponent maxThreadsPerBlock)
 {
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
@ -240,6 +246,27 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
    blocks = static_cast<vtkm::UInt32>(params.first);
    threadsPerBlock = params.second;
  }
  if (maxThreadsPerBlock > 0)
  {
    while ((threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z) >
           static_cast<vtkm::UInt32>(maxThreadsPerBlock))
    {
      // Reduce largest element until threads are small enough.
      if (threadsPerBlock.x > threadsPerBlock.y)
      {
        threadsPerBlock.x /= 2;
      }
      else if (threadsPerBlock.y > threadsPerBlock.z)
      {
        threadsPerBlock.y /= 2;
      }
      else
      {
        threadsPerBlock.z /= 2;
      }
    }
  }
 }
 void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -1654,10 +1654,24 @@ public:
  VTKM_CONT_EXPORT
  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
                                  vtkm::UInt32& threadsPerBlock,
-                                  vtkm::Id size);
+                                  vtkm::Id size,
                                  vtkm::IdComponent maxThreadsPerBlock);
  VTKM_CONT_EXPORT
-  static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
+  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
                                  dim3& threadsPerBlock,
                                  const dim3& size,
                                  vtkm::IdComponent maxThreadsPerBlock);
  template <typename... Hints, typename... Args>
  static void GetBlocksAndThreads(vtkm::cont::internal::HintList<Hints...>, Args&&... args)
  {
    using ThreadsPerBlock =
      vtkm::cont::internal::HintFind<vtkm::cont::internal::HintList<Hints...>,
                                     vtkm::cont::internal::HintThreadsPerBlock<0>,
                                     vtkm::cont::DeviceAdapterTagCuda>;
    GetBlocksAndThreads(std::forward<Args>(args)..., ThreadsPerBlock::MaxThreads);
  }
  VTKM_CONT_EXPORT
  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
@ -1674,8 +1688,8 @@ public:
                              const dim3& size);
 public:
-  template <typename WType, typename IType>
+  template <typename WType, typename IType, typename Hints>
-  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
+  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType, Hints>& functor,
                           vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -1691,12 +1705,12 @@ public:
    SetupErrorBuffer(functor);
    vtkm::UInt32 blocks, threadsPerBlock;
-    GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
+    GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, numInstances);
 #ifdef VTKM_ENABLE_LOGGING
    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
    {
-      using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>;
+      using FunctorType = std::decay_t<decltype(functor)>;
      cudaFuncAttributes empty_kernel_attrs;
      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
                                           cuda::internal::TaskStrided1DLaunch<FunctorType>));
@ -1708,8 +1722,8 @@ public:
      functor, numInstances);
  }
-  template <typename WType, typename IType>
+  template <typename WType, typename IType, typename Hints>
-  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
+  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType, Hints>& functor,
                           vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -1730,12 +1744,12 @@ public:
    vtkm::UInt32 blocks;
    dim3 threadsPerBlock;
-    GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
+    GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, ranges);
 #ifdef VTKM_ENABLE_LOGGING
    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
    {
-      using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>;
+      using FunctorType = std::decay_t<decltype(functor)>;
      cudaFuncAttributes empty_kernel_attrs;
      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
                                           cuda::internal::TaskStrided3DLaunch<FunctorType>));
@ -1747,25 +1761,39 @@ public:
      functor, rangeMax);
  }
-  template <class Functor>
+  template <typename Hints, typename Functor>
-  VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
+  VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
-    vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType, Hints> kernel(
      functor);
    ScheduleTask(kernel, numInstances);
  }
-  template <class Functor>
+  template <typename FunctorType>
-  VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
  }
  template <typename Hints, typename Functor>
  VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
-    vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType, Hints> kernel(
      functor);
    ScheduleTask(kernel, rangeMax);
  }
  template <typename FunctorType>
  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
  }
  template <typename T, class Storage>
  VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
  {
@ -1894,20 +1922,26 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
-  static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>
+  static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType, Hints>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id)
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{})
  {
-    using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
+    return { worklet, invocation };
    return Task(worklet, invocation);
  }
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
-  static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>
+  static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType, Hints>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3)
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = Hints{})
  {
-    using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
+    return { worklet, invocation };
-    return Task(worklet, invocation);
+  }
  template <typename WorkletType, typename InvocationType, typename RangeType>
  VTKM_CONT static auto MakeTask(WorkletType& worklet,
                                 InvocationType& invocation,
                                 const RangeType& range)
  {
    return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
  }
 };
 }
--- a/vtkm/cont/internal/CMakeLists.txt
+++ b/vtkm/cont/internal/CMakeLists.txt
@ -25,6 +25,7 @@ set(headers
  DeviceAdapterListHelpers.h
  FieldCollection.h
  FunctorsGeneral.h
  Hints.h
  IteratorFromArrayPortal.h
  KXSort.h
  MapArrayPermutation.h
--- a/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
+++ b/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
@ -20,6 +20,7 @@
 #include <vtkm/cont/BitField.h>
 #include <vtkm/cont/Logging.h>
 #include <vtkm/cont/internal/FunctorsGeneral.h>
 #include <vtkm/cont/internal/Hints.h>
 #include <vtkm/exec/internal/ErrorMessageBuffer.h>
 #include <vtkm/exec/internal/TaskSingular.h>
@ -58,20 +59,30 @@ namespace internal
 ///    : DeviceAdapterAlgorithmGeneral<DeviceAdapterAlgorithm<DeviceAdapterTagFoo>,
 ///                                    DeviceAdapterTagFoo>
 /// {
-///   template<class Functor>
+///   template<typename Hints, typename Functor>
-///   VTKM_CONT static void Schedule(Functor functor,
+///   VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
 ///                                        vtkm::Id numInstances)
 ///   {
 ///     ...
 ///   }
 ///
-///   template<class Functor>
+///   template<typename Functor>
-///   VTKM_CONT static void Schedule(Functor functor,
+///   VTKM_CONT static void Schedule(Functor&& functor, vtkm::Id numInstances)
-///                                        vtkm::Id3 maxRange)
+///   {
 ///     Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
 ///   }
 ///
 ///   template<typename Hints, typename Functor>
 ///   VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id3 maxRange)
 ///   {
 ///     ...
 ///   }
 ///
 ///   template<typename Functor>
 ///   VTKM_CONT static void Schedule(Functor&& functor, vtkm::Id3 maxRange)
 ///   {
 ///     Schedule(vtkm::cont::internal::HintList<>{}, functor, maxRange);
 ///   }
 ///
 ///   VTKM_CONT static void Synchronize()
 ///   {
 ///     ...
--- a/vtkm/cont/internal/Hints.h
+++ b/vtkm/cont/internal/Hints.h
@ -0,0 +1,124 @@
 //============================================================================
 //  Copyright (c) Kitware, Inc.
 //  All rights reserved.
 //  See LICENSE.txt for details.
 //
 //  This software is distributed WITHOUT ANY WARRANTY; without even
 //  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 //  PURPOSE.  See the above copyright notice for more information.
 //============================================================================
 #ifndef vtk_m_cont_internal_Hints_h
 #define vtk_m_cont_internal_Hints_h
 #include <vtkm/Assert.h>
 #include <vtkm/List.h>
 #include <vtkm/cont/DeviceAdapterTag.h>
 namespace vtkm
 {
 namespace cont
 {
 namespace internal
 {
 /// @brief Representation of a hint for execution.
 ///
 /// A hint is a (potentially) device independent parameter that can be used when
 /// scheduling parallel execution on a device. Control-side code can provide hints
 /// when scheduling parallel device execution to provide some context about what
 /// is being run and potentially optimize the algorithm. An implementation for
 /// a device adapter can choose to use or ignore hints. Likewise, a hint can be
 /// attached to a specific list of devices.
 ///
 /// This base class is not intended to be used directly. Use one of the
 /// derived hint structures to specify a hint.
 template <typename Derived_, typename Tag_, typename DeviceList_>
 struct HintBase
 {
  /// The concrete hint type deriving from this base (derived hints pass themselves here).
  using Derived = Derived_;
  /// Tag type naming the kind of hint. `HintFind` compares this against the tag of the
  /// default hint it is given to decide whether a hint is of the requested kind.
  using Tag = Tag_;
  /// List of device adapter tags the hint applies to. `HintFind` only selects a hint
  /// whose `DeviceList` contains the requested device.
  using DeviceList = DeviceList_;
 };
 /// @brief Tag identifying the "threads per block" kind of hint.
 ///
 /// This empty type is passed as the `Tag` of `HintBase` by `HintThreadsPerBlock`
 /// so that hint lookups can recognize hints of this kind.
 struct HintTagThreadsPerBlock
 {
 };
 /// @brief Suggest the number of threads to use when scheduling blocks of threads.
 ///
 /// Many accelerator devices, particularly GPUs, schedule threads in blocks. This
 /// hint suggests the size of block to use during the scheduling.
 // NOTE(review): the default `vtkm::ListUniversal` presumably makes the hint apply to
 // every device -- confirm against `vtkm::ListHas` semantics.
 template <vtkm::IdComponent MaxThreads_, typename DeviceList_ = vtkm::ListUniversal>
 struct HintThreadsPerBlock
  : HintBase<HintThreadsPerBlock<MaxThreads_, DeviceList_>, HintTagThreadsPerBlock, DeviceList_>
 {
  /// The suggested thread count, exposed as a compile-time constant so that code
  /// holding only the hint type can read the value.
  static constexpr vtkm::IdComponent MaxThreads = MaxThreads_;
 };
 /// @brief Container for hints.
 ///
 /// When scheduling or invoking a parallel routine, the caller can provide a list
 /// of hints to suggest the best way to execute the routine. This list is provided
 /// as arguments to a `HintList` template and passed as an argument.
 template <typename... Hints>
 struct HintList : vtkm::List<Hints...>
 {
  /// The plain `vtkm::List` of the contained hint types. `HintFind` iterates over
  /// this alias rather than over the `HintList` type itself.
  using List = vtkm::List<Hints...>;
 };
 /// @brief Type trait that detects `HintList` instantiations.
 ///
 /// The primary template derives from `std::false_type`, so `IsHintList<T>::value`
 /// is `false` for an arbitrary type `T`.
 template <typename T>
 struct IsHintList : std::false_type
 {
 };
 /// Partial specialization selected for any `HintList<Hints...>`; yields `true`.
 template <typename... Hints>
 struct IsHintList<HintList<Hints...>> : std::true_type
 {
 };
 /// @brief Performs a static assert that the given object is a hint list.
 ///
 /// If the provided type is a `vtkm::cont::internal::HintList`, then this macro
 /// does nothing. If the type is anything else, a compile error will occur. This
 /// macro is useful for checking that template arguments are an expected hint
 /// list. This helps diagnose improper template use more easily.
 #define VTKM_IS_HINT_LIST(T) VTKM_STATIC_ASSERT(::vtkm::cont::internal::IsHintList<T>::value)
 namespace detail
 {
 /// Helper metafunctions used by `HintFind` to search a hint list for a hint whose
 /// tag is `HintTag` and whose device list contains `Device`.
 template <typename Device, typename HintTag>
 struct FindHintOperators
 {
  // Compile-time check that `Device` is a device adapter tag.
  VTKM_IS_DEVICE_ADAPTER_TAG(Device);
  /// True when `Hint` has the requested tag AND lists `Device` among its devices.
  template <typename Hint>
  using HintMatches = vtkm::internal::meta::And<std::is_same<typename Hint::Tag, HintTag>,
                                                vtkm::ListHas<typename Hint::DeviceList, Device>>;
  /// Reduction step: evaluates to `Next` if it matches, otherwise keeps `Found`.
  /// A matching hint therefore replaces whatever result was carried so far.
  template <typename Found, typename Next>
  using ReduceOperator = typename std::conditional<HintMatches<Next>::value, Next, Found>::type;
 };
 } // namespace detail
 /// @brief Find a hint of a particular type.
 ///
 /// The `HintFind` template can be used to find a hint of a particular type.
 /// `HintFind` is provided a default value to use for a hint, and it returns
 /// a hint in the hint list that matches the type of the provided default and
 /// applies to the provided device tag.
 ///
 /// If multiple hints match the type and device, the _last_ one in the list
 /// is returned. Thus, when constructing hint lists, put the more general hints
 /// first and more specific ones last.
 template <typename HList, typename DefaultHint, typename Device>
 using HintFind = vtkm::ListReduce<
  // Reduce over the plain `vtkm::List` stored in the hint list.
  typename HList::List,
  // Keep the carried result unless the next hint has `DefaultHint`'s tag and applies to `Device`.
  detail::FindHintOperators<Device, typename DefaultHint::Tag>::template ReduceOperator,
  // Starting value of the reduction, i.e. the hint used when nothing in the list matches.
  DefaultHint>;
 }
 }
 } // namespace vtkm::cont::internal
 #endif // vtk_m_cont_internal_Hints_h
--- a/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h
+++ b/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h
@ -670,9 +670,9 @@ public:
  }
  //----------------------------------------------------------------------------
-  template <typename WType, typename IType>
+  template <typename WType, typename IType, typename Hints>
  VTKM_CONT static void ScheduleTask(
-    vtkm::exec::kokkos::internal::TaskBasic1D<WType, IType>& functor,
+    vtkm::exec::kokkos::internal::TaskBasic1D<WType, IType, Hints>& functor,
    vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -685,15 +685,22 @@ public:
    functor.SetErrorMessageBuffer(GetErrorMessageBufferInstance());
-    Kokkos::RangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace, vtkm::Id> policy(
+    constexpr vtkm::IdComponent maxThreadsPerBlock =
-      vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numInstances);
+      vtkm::cont::internal::HintFind<Hints,
                                     vtkm::cont::internal::HintThreadsPerBlock<0>,
                                     vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads;
    Kokkos::RangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace,
                        Kokkos::LaunchBounds<maxThreadsPerBlock, 0>,
                        Kokkos::IndexType<vtkm::Id>>
      policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numInstances);
    Kokkos::parallel_for(policy, functor);
    CheckForErrors(); // synchronizes
  }
-  template <typename WType, typename IType>
+  template <typename WType, typename IType, typename Hints>
  VTKM_CONT static void ScheduleTask(
-    vtkm::exec::kokkos::internal::TaskBasic3D<WType, IType>& functor,
+    vtkm::exec::kokkos::internal::TaskBasic3D<WType, IType, Hints>& functor,
    vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -706,7 +713,13 @@ public:
    functor.SetErrorMessageBuffer(GetErrorMessageBufferInstance());
    constexpr vtkm::IdComponent maxThreadsPerBlock =
      vtkm::cont::internal::HintFind<Hints,
                                     vtkm::cont::internal::HintThreadsPerBlock<0>,
                                     vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads;
    Kokkos::MDRangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace,
                          Kokkos::LaunchBounds<maxThreadsPerBlock, 0>,
                          Kokkos::Rank<3>,
                          Kokkos::IndexType<vtkm::Id>>
      policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(),
@ -729,24 +742,38 @@ public:
    CheckForErrors(); // synchronizes
  }
-  template <class Functor>
+  template <typename Hints, typename Functor>
-  VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
+  VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
-    vtkm::exec::kokkos::internal::TaskBasic1D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::kokkos::internal::TaskBasic1D<Functor, vtkm::internal::NullType, Hints> kernel(
      functor);
    ScheduleTask(kernel, numInstances);
  }
-  template <class Functor>
+  template <typename FunctorType>
-  VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
  }
  template <typename Hints, typename Functor>
  VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
-    vtkm::exec::kokkos::internal::TaskBasic3D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::kokkos::internal::TaskBasic3D<Functor, vtkm::internal::NullType, Hints> kernel(
      functor);
    ScheduleTask(kernel, rangeMax);
  }
  template <typename FunctorType>
  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
  }
  //----------------------------------------------------------------------------
 private:
  template <typename T>
@ -1020,20 +1047,28 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagKokkos>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
-  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType>
+  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType, Hints>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id)
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{})
  {
-    return vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType>(worklet,
+    return vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType, Hints>(
-                                                                                  invocation);
+      worklet, invocation);
  }
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
-  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType>
+  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType, Hints>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3)
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = {})
  {
-    return vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType>(worklet,
+    return vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType, Hints>(
-                                                                                  invocation);
+      worklet, invocation);
  }
  template <typename WorkletType, typename InvocationType, typename RangeType>
  VTKM_CONT static auto MakeTask(WorkletType& worklet,
                                 InvocationType& invocation,
                                 const RangeType& range)
  {
    return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
  }
 };
 }
--- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h
+++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h
@ -359,8 +359,8 @@ public:
  VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::openmp::internal::TaskTiling3D& functor,
                                            vtkm::Id3 size);
-  template <class FunctorType>
+  template <typename Hints, typename FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id numInstances)
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -368,8 +368,14 @@ public:
    ScheduleTask(kernel, numInstances);
  }
-  template <class FunctorType>
+  template <typename FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 rangeMax)
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
  }
  template <typename Hints, typename FunctorType>
  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -377,6 +383,12 @@ public:
    ScheduleTask(kernel, rangeMax);
  }
  template <typename FunctorType>
  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
  }
  VTKM_CONT static void Synchronize()
  {
    // Nothing to do. This device schedules all of its operations using a
@ -390,21 +402,33 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagOpenMP>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::openmp::internal::TaskTiling1D MakeTask(const WorkletType& worklet,
                                                             const InvocationType& invocation,
-                                                             vtkm::Id)
+                                                             vtkm::Id,
                                                             Hints = Hints{})
  {
    // Currently ignoring hints.
    return vtkm::exec::openmp::internal::TaskTiling1D(worklet, invocation);
  }
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::openmp::internal::TaskTiling3D MakeTask(const WorkletType& worklet,
                                                             const InvocationType& invocation,
-                                                             vtkm::Id3)
+                                                             vtkm::Id3,
                                                             Hints = Hints{})
  {
    // Currently ignoring hints.
    return vtkm::exec::openmp::internal::TaskTiling3D(worklet, invocation);
  }
  template <typename WorkletType, typename InvocationType, typename RangeType>
  VTKM_CONT static auto MakeTask(WorkletType& worklet,
                                 InvocationType& invocation,
                                 const RangeType& range)
  {
    return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
  }
 };
 }
 } // namespace vtkm::cont
--- a/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h
+++ b/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h
@ -400,8 +400,8 @@ public:
  VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::serial::internal::TaskTiling3D& functor,
                                            vtkm::Id3 size);
-  template <class FunctorType>
+  template <typename Hints, typename FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id size)
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id size)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -409,8 +409,14 @@ public:
    ScheduleTask(kernel, size);
  }
-  template <class FunctorType>
+  template <typename FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 size)
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id size)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, size);
  }
  template <typename Hints, typename FunctorType>
  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 size)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -418,6 +424,12 @@ public:
    ScheduleTask(kernel, size);
  }
  template <typename FunctorType>
  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 size)
  {
    Schedule(vtkm::cont::internal::HintList<>{}, functor, size);
  }
 private:
  template <typename Vin,
            typename I,
@ -557,21 +569,33 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagSerial>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::serial::internal::TaskTiling1D MakeTask(WorkletType& worklet,
                                                             InvocationType& invocation,
-                                                             vtkm::Id)
+                                                             vtkm::Id,
                                                             Hints = Hints{})
  {
    // Builds the serial 1D tiling task from the worklet and its invocation.
    // The `vtkm::Id` range and the `Hints` object are unnamed parameters: they
    // take part in overload selection only and are never read here.
    // Currently ignoring hints.
    return vtkm::exec::serial::internal::TaskTiling1D(worklet, invocation);
  }
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::serial::internal::TaskTiling3D MakeTask(WorkletType& worklet,
                                                             InvocationType& invocation,
-                                                             vtkm::Id3)
+                                                             vtkm::Id3,
                                                             Hints = Hints{})
  {
    // Builds the serial 3D tiling task from the worklet and its invocation.
    // The `vtkm::Id3` range and the `Hints` object are unnamed parameters: they
    // take part in overload selection only and are never read here.
    // Currently ignoring hints.
    return vtkm::exec::serial::internal::TaskTiling3D(worklet, invocation);
  }
  /// Hint-free convenience overload. Builds the task for `range` as though an
  /// empty `HintList` had been requested by explicitly selecting
  /// `HintList<>` as the leading template argument of `MakeTask`.
  template <typename WorkletType, typename InvocationType, typename RangeType>
  VTKM_CONT static auto MakeTask(WorkletType& worklet,
                                 InvocationType& invocation,
                                 const RangeType& range)
  {
    using NoHints = vtkm::cont::internal::HintList<>;
    return MakeTask<NoHints>(worklet, invocation, range);
  }
 };
 }
 } // namespace vtkm::cont
--- a/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h
+++ b/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h
@ -259,8 +259,8 @@ public:
  VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::tbb::internal::TaskTiling3D& functor,
                                            vtkm::Id3 size);
-  template <class FunctorType>
+  template <typename Hints, typename FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id numInstances)
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE(vtkm::cont::LogLevel::Perf,
                   "Schedule TBB 1D: '%s'",
@ -270,8 +270,14 @@ public:
    ScheduleTask(kernel, numInstances);
  }
-  template <class FunctorType>
+  template <typename FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 rangeMax)
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
  {
    // Hint-free 1D convenience overload: schedules `functor` over
    // `numInstances` instances by delegating to the hint-taking overload with
    // an empty hint list. `functor` is passed on as a named (lvalue) argument.
    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
  }
  template <typename Hints, typename FunctorType>
  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE(vtkm::cont::LogLevel::Perf,
                   "Schedule TBB 3D: '%s'",
@ -281,6 +287,12 @@ public:
    ScheduleTask(kernel, rangeMax);
  }
  /// Hint-free 3D convenience overload. Delegates to the hint-taking
  /// `Schedule` overload, supplying an empty hint list.
  template <typename FunctorType>
  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
  {
    using NoHints = vtkm::cont::internal::HintList<>;
    Schedule(NoHints{}, functor, rangeMax);
  }
  //1. We need functions for each of the following
@ -421,21 +433,33 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagTBB>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::tbb::internal::TaskTiling1D MakeTask(WorkletType& worklet,
                                                          InvocationType& invocation,
-                                                          vtkm::Id)
+                                                          vtkm::Id,
                                                          Hints = Hints{})
  {
    // Builds the TBB 1D tiling task from the worklet and its invocation.
    // The `vtkm::Id` range and the `Hints` object are unnamed parameters: they
    // take part in overload selection only and are never read here.
    // Currently ignoring hints.
    return vtkm::exec::tbb::internal::TaskTiling1D(worklet, invocation);
  }
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::tbb::internal::TaskTiling3D MakeTask(WorkletType& worklet,
                                                          InvocationType& invocation,
-                                                          vtkm::Id3)
+                                                          vtkm::Id3,
                                                          Hints = Hints{})
  {
    // Builds the TBB 3D tiling task from the worklet and its invocation.
    // The `vtkm::Id3` range and the `Hints` object are unnamed parameters: they
    // take part in overload selection only and are never read here.
    // Currently ignoring hints.
    return vtkm::exec::tbb::internal::TaskTiling3D(worklet, invocation);
  }
  /// Hint-free convenience overload. Builds the task for `range` as though an
  /// empty `HintList` had been requested by explicitly selecting
  /// `HintList<>` as the leading template argument of `MakeTask`.
  template <typename WorkletType, typename InvocationType, typename RangeType>
  VTKM_CONT static auto MakeTask(WorkletType& worklet,
                                 InvocationType& invocation,
                                 const RangeType& range)
  {
    using NoHints = vtkm::cont::internal::HintList<>;
    return MakeTask<NoHints>(worklet, invocation, range);
  }
 };
 }
 } // namespace vtkm::cont
--- a/vtkm/cont/testing/CMakeLists.txt
+++ b/vtkm/cont/testing/CMakeLists.txt
@ -107,6 +107,7 @@ set(unit_tests_device
  UnitTestDataSetPermutation.cxx
  UnitTestDataSetSingleType.cxx
  UnitTestDeviceAdapterAlgorithmDependency.cxx
  UnitTestHints.cxx
  UnitTestImplicitFunction.cxx
  UnitTestParticleArrayCopy.cxx
  UnitTestPointLocatorSparseGrid.cxx
@ -131,6 +132,11 @@ endif()
 vtkm_unit_tests(SOURCES ${unit_tests} DEVICE_SOURCES ${unit_tests_device})
 target_sources(UnitTests_vtkm_cont_testing
  PRIVATE
    UnitTestHints.cxx
 )
 #add distributed tests i.e.test to run with MPI
 #if MPI is enabled.
 set(mpi_unit_tests
--- a/vtkm/cont/testing/UnitTestHints.cxx
+++ b/vtkm/cont/testing/UnitTestHints.cxx
@ -0,0 +1,108 @@
 //============================================================================
 //  Copyright (c) Kitware, Inc.
 //  All rights reserved.
 //  See LICENSE.txt for details.
 //
 //  This software is distributed WITHOUT ANY WARRANTY; without even
 //  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 //  PURPOSE.  See the above copyright notice for more information.
 //============================================================================
 #include <vtkm/cont/internal/Hints.h>
 #include <vtkm/cont/Algorithm.h>
 #include <vtkm/cont/DeviceAdapter.h>
 #include <vtkm/exec/FunctorBase.h>
 #include <vtkm/cont/testing/Testing.h>
 namespace UnitTestHintNamespace
 {
 void CheckFind()
 {
  std::cout << "Empty list returns default.\n";
  VTKM_TEST_ASSERT(vtkm::cont::internal::HintFind<vtkm::cont::internal::HintList<>,
                                                  vtkm::cont::internal::HintThreadsPerBlock<128>,
                                                  vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads ==
                   128);
  std::cout << "Find a hint that matches.\n";
  VTKM_TEST_ASSERT(vtkm::cont::internal::HintFind<
                     vtkm::cont::internal::HintList<vtkm::cont::internal::HintThreadsPerBlock<128>>,
                     vtkm::cont::internal::HintThreadsPerBlock<0>,
                     vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == 128);
  VTKM_TEST_ASSERT(
    vtkm::cont::internal::HintFind<
      vtkm::cont::internal::HintList<
        vtkm::cont::internal::HintThreadsPerBlock<128,
                                                  vtkm::List<vtkm::cont::DeviceAdapterTagKokkos>>>,
      vtkm::cont::internal::HintThreadsPerBlock<0>,
      vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == 128);
  std::cout << "Skip a hint that does not match.\n";
  VTKM_TEST_ASSERT(
    (vtkm::cont::internal::HintFind<
       vtkm::cont::internal::HintList<
         vtkm::cont::internal::HintThreadsPerBlock<128,
                                                   vtkm::List<vtkm::cont::DeviceAdapterTagKokkos>>>,
       vtkm::cont::internal::HintThreadsPerBlock<0>,
       vtkm::cont::DeviceAdapterTagSerial>::MaxThreads == 0));
  std::cout << "Given a list of hints, pick the last one that matches\n";
  {
    using HList = vtkm::cont::internal::HintList<
      vtkm::cont::internal::HintThreadsPerBlock<64>,
      vtkm::cont::internal::HintThreadsPerBlock<128, vtkm::List<vtkm::cont::DeviceAdapterTagCuda>>,
      vtkm::cont::internal::HintThreadsPerBlock<256,
                                                vtkm::List<vtkm::cont::DeviceAdapterTagKokkos>>>;
    using HInit = vtkm::cont::internal::HintThreadsPerBlock<0>;
    VTKM_TEST_ASSERT((vtkm::cont::internal::
                        HintFind<HList, HInit, vtkm::cont::DeviceAdapterTagSerial>::MaxThreads ==
                      64));
    VTKM_TEST_ASSERT(
      (vtkm::cont::internal::HintFind<HList, HInit, vtkm::cont::DeviceAdapterTagCuda>::MaxThreads ==
       128));
    VTKM_TEST_ASSERT((vtkm::cont::internal::
                        HintFind<HList, HInit, vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads ==
                      256));
  }
 }
 /// Do-nothing functor used to exercise scheduling. It offers call operators
 /// for both a flat `vtkm::Id` index and a `vtkm::Id3` index; neither does
 /// any work.
 struct MyFunctor : vtkm::exec::FunctorBase
 {
  // 1D invocation: intentionally empty.
  VTKM_EXEC void operator()(vtkm::Id vtkmNotUsed(flatIndex)) const {}

  // 3D invocation: intentionally empty.
  VTKM_EXEC void operator()(vtkm::Id3 vtkmNotUsed(ijk)) const {}
 };
 void CheckSchedule()
 {
  std::cout << "Schedule a functor using hints.\n";
  // There is no good way to see if the device adapter got or used the hints
  // as device adapters are free to ignore hints. This just tests that the
  // hints can be passed.
  using Hints = vtkm::cont::internal::HintList<vtkm::cont::internal::HintThreadsPerBlock<128>>;
  vtkm::cont::Algorithm::Schedule(Hints{}, MyFunctor{}, 10);
  vtkm::cont::Algorithm::Schedule(Hints{}, MyFunctor{}, vtkm::Id3{ 2 });
 }
 // Test body handed to the testing harness: runs the compile-time hint lookup
 // checks first, then the scheduling smoke test.
 void Run()
 {
  CheckFind();
  CheckSchedule();
 }
 } // namespace UnitTestHintNamespace
 // Test entry point: hands `UnitTestHintNamespace::Run` and the command-line
 // arguments to `vtkm::cont::testing::Testing::Run` and returns its result.
 int UnitTestHints(int argc, char* argv[])
 {
  return vtkm::cont::testing::Testing::Run(UnitTestHintNamespace::Run, argc, argv);
 }
--- a/vtkm/exec/TaskBase.h
+++ b/vtkm/exec/TaskBase.h
@ -12,6 +12,8 @@
 #include <vtkm/Types.h>
 #include <vtkm/cont/internal/Hints.h>
 #include <vtkm/exec/internal/ErrorMessageBuffer.h>
 namespace vtkm
--- a/vtkm/exec/cuda/internal/TaskStrided.h
+++ b/vtkm/exec/cuda/internal/TaskStrided.h
@ -50,9 +50,11 @@ protected:
  SetErrorBufferSignature SetErrorBufferFunction = nullptr;
 };
-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskStrided1D : public TaskStrided
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  TaskStrided1D(const WType& worklet, const IType& invocation)
    : TaskStrided()
@ -90,9 +92,11 @@ private:
  const IType Invocation;
 };
-template <typename WType>
+template <typename WType, typename Hints>
-class TaskStrided1D<WType, vtkm::internal::NullType> : public TaskStrided
+class TaskStrided1D<WType, vtkm::internal::NullType, Hints> : public TaskStrided
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  TaskStrided1D(WType& worklet)
    : TaskStrided()
@ -116,9 +120,11 @@ private:
  typename std::remove_const<WType>::type Worklet;
 };
-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskStrided3D : public TaskStrided
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  TaskStrided3D(const WType& worklet, const IType& invocation)
    : TaskStrided()
@ -165,9 +171,11 @@ private:
  const IType Invocation;
 };
-template <typename WType>
+template <typename WType, typename Hints>
-class TaskStrided3D<WType, vtkm::internal::NullType> : public TaskStrided
+class TaskStrided3D<WType, vtkm::internal::NullType, Hints> : public TaskStrided
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  TaskStrided3D(WType& worklet)
    : TaskStrided()
--- a/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu
+++ b/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu
@ -342,8 +342,8 @@ void TestErrorFunctorInvoke()
      TestExecObject(input.PrepareForInPlace(DeviceAdapter(), token)),
      TestExecObject(output.PrepareForInPlace(DeviceAdapter(), token)));
-  using TaskStrided1 =
+  using TaskStrided1 = vtkm::exec::cuda::internal::
-    vtkm::exec::cuda::internal::TaskStrided1D<TestWorkletErrorProxy, InvocationType1>;
+    TaskStrided1D<TestWorkletErrorProxy, InvocationType1, vtkm::cont::internal::HintList<>>;
  TestWorkletErrorProxy worklet;
  InvocationType1 invocation(execObjects);
--- a/vtkm/exec/kokkos/internal/TaskBasic.h
+++ b/vtkm/exec/kokkos/internal/TaskBasic.h
@ -24,9 +24,11 @@ namespace kokkos
 namespace internal
 {
-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskBasic1D : public vtkm::exec::TaskBase
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  TaskBasic1D(const WType& worklet, const IType& invocation)
    : Worklet(worklet)
@ -57,9 +59,11 @@ private:
  IType Invocation;
 };
-template <typename WType>
+template <typename WType, typename Hints>
-class TaskBasic1D<WType, vtkm::internal::NullType> : public vtkm::exec::TaskBase
+class TaskBasic1D<WType, vtkm::internal::NullType, Hints> : public vtkm::exec::TaskBase
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  explicit TaskBasic1D(const WType& worklet)
    : Worklet(worklet)
@ -78,9 +82,11 @@ private:
  typename std::remove_const<WType>::type Worklet;
 };
-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskBasic3D : public vtkm::exec::TaskBase
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  TaskBasic3D(const WType& worklet, const IType& invocation)
    : Worklet(worklet)
@ -112,9 +118,11 @@ private:
  IType Invocation;
 };
-template <typename WType>
+template <typename WType, typename Hints>
-class TaskBasic3D<WType, vtkm::internal::NullType> : public vtkm::exec::TaskBase
+class TaskBasic3D<WType, vtkm::internal::NullType, Hints> : public vtkm::exec::TaskBase
 {
  VTKM_IS_HINT_LIST(Hints);
 public:
  explicit TaskBasic3D(const WType& worklet)
    : Worklet(worklet)
--- a/vtkm/worklet/internal/DispatcherBase.h
+++ b/vtkm/worklet/internal/DispatcherBase.h
@ -792,7 +792,8 @@ private:
    // vtkm::exec::internal::TaskSingular
    // vtkm::exec::internal::TaskTiling1D
    // vtkm::exec::internal::TaskTiling3D
-    auto task = TaskTypes::MakeTask(this->Worklet, invocation, range);
+    auto task =
      TaskTypes::MakeTask(this->Worklet, invocation, range, typename WorkletType::Hints{});
    Algorithm::ScheduleTask(task, range);
  }
 };
--- a/vtkm/worklet/internal/WorkletBase.h
+++ b/vtkm/worklet/internal/WorkletBase.h
@ -40,6 +40,8 @@
 #include <vtkm/cont/arg/TypeCheckTagCellSet.h>
 #include <vtkm/cont/arg/TypeCheckTagExecObject.h>
 #include <vtkm/cont/internal/Hints.h>
 #include <vtkm/worklet/MaskNone.h>
 #include <vtkm/worklet/ScatterIdentity.h>
 #include <vtkm/worklet/internal/Placeholders.h>
@ -136,6 +138,11 @@ public:
  /// everything in the output domain.
  using MaskType = vtkm::worklet::MaskNone;
  /// Worklets can provide hints to the scheduler by defining a `Hints` type that
  /// resolves to a `vtkm::cont::internal::HintList`. The default hint list is empty
  /// so that scheduling uses all defaults.
  using Hints = vtkm::cont::internal::HintList<>;
  /// @brief `ControlSignature` tag for whole input arrays.
  ///
  /// The `WholeArrayIn` control signature tag specifies a `vtkm::cont::ArrayHandle`