Merge topic 'hints'

c44f68649 Add hints to device adapter scheduler Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Dave Pugmire <dpugmire@gmail.com> Merge-request: !3189
2024-07-06 07:17:25 +00:00 · 2024-02-17 12:47:10 +00:00 · 2024-02-17 12:47:10 +00:00 · af28ec2766
commit af28ec2766
parent 66e27bf683 c44f686496
20 changed files with 598 additions and 102 deletions
--- a/docs/changelog/hints.md
+++ b/docs/changelog/hints.md
@ -0,0 +1,32 @@
+# Add hints to device adapter scheduler
+
+The `DeviceAdapter` provides an abstract interface to the accelerator
+devices worklets and other algorithms run on. As such, the programmer has
+less control over how the device launches each worklet. Each device
+adapter has its own configuration parameters and other ways to attempt to
+optimize how things are run, but these are always a universal set of
+options that are applied to everything run on the device. There is no way
+to specify launch parameters for a particular worklet.
+
+To provide this information, VTK-m now supports `Hint`s to the device
+adapter. The `DeviceAdapterAlgorithm::Schedule` method takes a templated
+argument that is of the type `HintList`. This object contains a template
+list of `Hint` types that provide suggestions on how to launch the parallel
+execution. The device adapter will pick out hints that pertain to it and
+adjust its launching accordingly.
+
+These are called hints rather than, say, directives, because they don't
+force the device adapter to do anything. The device adapter is free to
+ignore any (and all) hints. The point is that the device adapter can take
+into account the information to try to optimize for itself.
+
+A provided hint can be tied to specific device adapters. In this way, a
+worklet can further optimize itself. If multiple hints match a device
+adapter, the last one in the list will be selected.
+
+The `Worklet` base now has an internal type named `Hints` that points to a
+`HintList` that is applied when the worklet is scheduled. Derived worklet
+classes can provide hints by simply defining their own `Hints` type.
+
+This feature is experimental and consequently hidden in an `internal`
+namespace.
--- a/vtkm/cont/Algorithm.h
+++ b/vtkm/cont/Algorithm.h
@ -17,6 +17,7 @@
 #include <vtkm/cont/ExecutionObjectBase.h>
 #include <vtkm/cont/Token.h>
 #include <vtkm/cont/TryExecute.h>
+#include <vtkm/cont/internal/Hints.h>


 namespace vtkm
@ -932,29 +933,43 @@ struct Algorithm
    ScanExtended(vtkm::cont::DeviceAdapterTagAny(), input, output, binaryFunctor, initialValue);
  }

-
-  template <class Functor>
+  // Should this be deprecated in favor of `RuntimeDeviceTracker`?
+  template <typename Functor>
  VTKM_CONT static void Schedule(vtkm::cont::DeviceAdapterId devId,
                                 Functor functor,
                                 vtkm::Id numInstances)
  {
-    vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor(), functor, numInstances);
+    vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor{}, functor, numInstances);
  }
-  template <class Functor>
+  template <typename... Hints, typename Functor>
+  VTKM_CONT static void Schedule(vtkm::cont::internal::HintList<Hints...> hints,
+                                 Functor functor,
+                                 vtkm::Id numInstances)
+  {
+    vtkm::cont::TryExecute(detail::ScheduleFunctor{}, hints, functor, numInstances);
+  }
+  template <typename Functor>
  VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
  {
-    Schedule(vtkm::cont::DeviceAdapterTagAny(), functor, numInstances);
+    Schedule(vtkm::cont::DeviceAdapterTagAny{}, functor, numInstances);
  }


-  template <class Functor>
+  template <typename Functor>
  VTKM_CONT static void Schedule(vtkm::cont::DeviceAdapterId devId,
                                 Functor functor,
                                 vtkm::Id3 rangeMax)
  {
    vtkm::cont::TryExecuteOnDevice(devId, detail::ScheduleFunctor(), functor, rangeMax);
  }
-  template <class Functor>
+  template <typename... Hints, typename Functor>
+  VTKM_CONT static void Schedule(vtkm::cont::internal::HintList<Hints...> hints,
+                                 Functor functor,
+                                 vtkm::Id3 rangeMax)
+  {
+    vtkm::cont::TryExecute(detail::ScheduleFunctor{}, hints, functor, rangeMax);
+  }
+  template <typename Functor>
  VTKM_CONT static void Schedule(Functor functor, vtkm::Id3 rangeMax)
  {
    Schedule(vtkm::cont::DeviceAdapterTagAny(), functor, rangeMax);
--- a/vtkm/cont/CMakeLists.txt
+++ b/vtkm/cont/CMakeLists.txt
@ -283,6 +283,11 @@ vtkm_library( NAME vtkm_cont
              DEVICE_SOURCES ${device_sources}
            )

+target_sources(vtkm_cont
+  PRIVATE
+    internal/Hints.h
+)
+
 add_subdirectory(internal)
 add_subdirectory(arg)

--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
@ -203,7 +203,8 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
 void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  vtkm::UInt32& threadsPerBlock,
-  vtkm::Id size)
+  vtkm::Id size,
+  vtkm::IdComponent maxThreadsPerBlock)
 {
  (void)size;
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
@ -215,12 +216,17 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
  const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
  blocks = static_cast<vtkm::UInt32>(params.first);
  threadsPerBlock = static_cast<vtkm::UInt32>(params.second);
+  if ((maxThreadsPerBlock > 0) && (threadsPerBlock > static_cast<vtkm::UInt32>(maxThreadsPerBlock)))
+  { // The hint is an upper bound: only ever shrink the configured block size to fit it.
+    threadsPerBlock = static_cast<vtkm::UInt32>(maxThreadsPerBlock);
+  }
 }

 void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  dim3& threadsPerBlock,
-  const dim3& size)
+  const dim3& size,
+  vtkm::IdComponent maxThreadsPerBlock)
 {
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();

@ -240,6 +246,27 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
    blocks = static_cast<vtkm::UInt32>(params.first);
    threadsPerBlock = params.second;
  }
+
+  if (maxThreadsPerBlock > 0)
+  {
+    while ((threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z) >
+           static_cast<vtkm::UInt32>(maxThreadsPerBlock))
+    {
+      // Reduce largest element until threads are small enough.
+      if (threadsPerBlock.x > threadsPerBlock.y)
+      {
+        threadsPerBlock.x /= 2;
+      }
+      else if (threadsPerBlock.y > threadsPerBlock.z)
+      {
+        threadsPerBlock.y /= 2;
+      }
+      else
+      {
+        threadsPerBlock.z /= 2;
+      }
+    }
+  }
 }

 void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -1654,10 +1654,24 @@ public:
  VTKM_CONT_EXPORT
  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
                                  vtkm::UInt32& threadsPerBlock,
-                                  vtkm::Id size);
+                                  vtkm::Id size,
+                                  vtkm::IdComponent maxThreadsPerBlock);

  VTKM_CONT_EXPORT
-  static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
+  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
+                                  dim3& threadsPerBlock,
+                                  const dim3& size,
+                                  vtkm::IdComponent maxThreadsPerBlock);
+
+  template <typename... Hints, typename... Args>
+  static void GetBlocksAndThreads(vtkm::cont::internal::HintList<Hints...>, Args&&... args)
+  {
+    using ThreadsPerBlock =
+      vtkm::cont::internal::HintFind<vtkm::cont::internal::HintList<Hints...>,
+                                     vtkm::cont::internal::HintThreadsPerBlock<0>,
+                                     vtkm::cont::DeviceAdapterTagCuda>;
+    GetBlocksAndThreads(std::forward<Args>(args)..., ThreadsPerBlock::MaxThreads);
+  }

  VTKM_CONT_EXPORT
  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
@ -1674,8 +1688,8 @@ public:
                              const dim3& size);

 public:
-  template <typename WType, typename IType>
-  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
+  template <typename WType, typename IType, typename Hints>
+  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType, Hints>& functor,
                           vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -1691,12 +1705,12 @@ public:
    SetupErrorBuffer(functor);

    vtkm::UInt32 blocks, threadsPerBlock;
-    GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
+    GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, numInstances);

 #ifdef VTKM_ENABLE_LOGGING
    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
    {
-      using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>;
+      using FunctorType = std::decay_t<decltype(functor)>;
      cudaFuncAttributes empty_kernel_attrs;
      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
                                           cuda::internal::TaskStrided1DLaunch<FunctorType>));
@ -1708,8 +1722,8 @@ public:
      functor, numInstances);
  }

-  template <typename WType, typename IType>
-  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
+  template <typename WType, typename IType, typename Hints>
+  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType, Hints>& functor,
                           vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -1730,12 +1744,12 @@ public:

    vtkm::UInt32 blocks;
    dim3 threadsPerBlock;
-    GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
+    GetBlocksAndThreads(Hints{}, blocks, threadsPerBlock, ranges);

 #ifdef VTKM_ENABLE_LOGGING
    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
    {
-      using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>;
+      using FunctorType = std::decay_t<decltype(functor)>;
      cudaFuncAttributes empty_kernel_attrs;
      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
                                           cuda::internal::TaskStrided3DLaunch<FunctorType>));
@ -1747,25 +1761,39 @@ public:
      functor, rangeMax);
  }

-  template <class Functor>
-  VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
+  template <typename Hints, typename Functor>
+  VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

-    vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType, Hints> kernel(
+      functor);

    ScheduleTask(kernel, numInstances);
  }

-  template <class Functor>
-  VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
+  }
+
+  template <typename Hints, typename Functor>
+  VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

-    vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType, Hints> kernel(
+      functor);
    ScheduleTask(kernel, rangeMax);
  }

+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
+  }
+
  template <typename T, class Storage>
  VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
  {
@ -1894,20 +1922,26 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
-  static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id)
+  template <typename Hints, typename WorkletType, typename InvocationType>
+  static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType, Hints>
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{})
  {
-    using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
-    return Task(worklet, invocation);
+    return { worklet, invocation };
  }

-  template <typename WorkletType, typename InvocationType>
-  static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3)
+  template <typename Hints, typename WorkletType, typename InvocationType>
+  static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType, Hints>
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = Hints{})
  {
-    using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
-    return Task(worklet, invocation);
+    return { worklet, invocation };
+  }
+
+  template <typename WorkletType, typename InvocationType, typename RangeType>
+  VTKM_CONT static auto MakeTask(WorkletType& worklet,
+                                 InvocationType& invocation,
+                                 const RangeType& range)
+  {
+    return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
  }
 };
 }
--- a/vtkm/cont/internal/CMakeLists.txt
+++ b/vtkm/cont/internal/CMakeLists.txt
@ -25,6 +25,7 @@ set(headers
  DeviceAdapterListHelpers.h
  FieldCollection.h
  FunctorsGeneral.h
+  Hints.h
  IteratorFromArrayPortal.h
  KXSort.h
  MapArrayPermutation.h
--- a/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
+++ b/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
@ -20,6 +20,7 @@
 #include <vtkm/cont/BitField.h>
 #include <vtkm/cont/Logging.h>
 #include <vtkm/cont/internal/FunctorsGeneral.h>
+#include <vtkm/cont/internal/Hints.h>

 #include <vtkm/exec/internal/ErrorMessageBuffer.h>
 #include <vtkm/exec/internal/TaskSingular.h>
@ -58,20 +59,30 @@ namespace internal
 ///    : DeviceAdapterAlgorithmGeneral<DeviceAdapterAlgorithm<DeviceAdapterTagFoo>,
 ///                                    DeviceAdapterTagFoo>
 /// {
-///   template<class Functor>
-///   VTKM_CONT static void Schedule(Functor functor,
-///                                        vtkm::Id numInstances)
+///   template<typename Hints, typename Functor>
+///   VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
 ///   {
 ///     ...
 ///   }
 ///
-///   template<class Functor>
-///   VTKM_CONT static void Schedule(Functor functor,
-///                                        vtkm::Id3 maxRange)
+///   template<typename Functor>
+///   VTKM_CONT static void Schedule(Functor&& functor, vtkm::Id numInstances)
+///   {
+///     Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
+///   }
+///
+///   template<typename Hints, typename Functor>
+///   VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id3 maxRange)
 ///   {
 ///     ...
 ///   }
 ///
+///   template<typename Functor>
+///   VTKM_CONT static void Schedule(Functor&& functor, vtkm::Id3 maxRange)
+///   {
+///     Schedule(vtkm::cont::internal::HintList<>{}, functor, maxRange);
+///   }
+///
 ///   VTKM_CONT static void Synchronize()
 ///   {
 ///     ...
--- a/vtkm/cont/internal/Hints.h
+++ b/vtkm/cont/internal/Hints.h
@ -0,0 +1,124 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+#ifndef vtk_m_cont_internal_Hints_h
+#define vtk_m_cont_internal_Hints_h
+
+#include <vtkm/Assert.h>
+#include <vtkm/List.h>
+
+#include <vtkm/cont/DeviceAdapterTag.h>
+
+namespace vtkm
+{
+namespace cont
+{
+namespace internal
+{
+
+/// @brief Representation of a hint for execution.
+///
+/// A hint is a (potentially) device independent parameter that can be used when
+/// scheduling parallel execution on a device. Control-side code can provide hints
+/// when scheduling parallel device execution to provide some context about what
+/// is being run and potentially optimize the algorithm. An implementation for
+/// a device adapter can choose to use or ignore hints. Likewise, a hint can be
+/// attached to a specific list of devices.
+///
+/// This base class is not intended to be used directly. Use one of the
+/// derived hint structures to specify a hint.
+template <typename Derived_, typename Tag_, typename DeviceList_>
+struct HintBase
+{
+  using Derived = Derived_;
+  using Tag = Tag_;
+  using DeviceList = DeviceList_;
+};
+
+struct HintTagThreadsPerBlock
+{
+};
+
+/// @brief Suggest the number of threads to use when scheduling blocks of threads.
+///
+/// Many accelerator devices, particularly GPUs, schedule threads in blocks. This
+/// hint suggests the size of block to use during the scheduling.
+template <vtkm::IdComponent MaxThreads_, typename DeviceList_ = vtkm::ListUniversal>
+struct HintThreadsPerBlock
+  : HintBase<HintThreadsPerBlock<MaxThreads_, DeviceList_>, HintTagThreadsPerBlock, DeviceList_>
+{
+  static constexpr vtkm::IdComponent MaxThreads = MaxThreads_;
+};
+
+/// @brief Container for hints.
+///
+/// When scheduling or invoking a parallel routine, the caller can provide a list
+/// of hints to suggest the best way to execute the routine. This list is provided
+/// as arguments to a `HintList` template and passed as an argument.
+template <typename... Hints>
+struct HintList : vtkm::List<Hints...>
+{
+  using List = vtkm::List<Hints...>;
+};
+
+template <typename T>
+struct IsHintList : std::false_type
+{
+};
+template <typename... Hints>
+struct IsHintList<HintList<Hints...>> : std::true_type
+{
+};
+
+/// @brief Performs a static assert that the given object is a hint list.
+///
+/// If the provided type is a `vtkm::cont::internal::HintList`, then this macro
+/// does nothing. If the type is anything else, a compile error will occur. This
+/// macro is useful for checking that template arguments are an expected hint
+/// list. This helps diagnose improper template use more easily.
+#define VTKM_IS_HINT_LIST(T) VTKM_STATIC_ASSERT(::vtkm::cont::internal::IsHintList<T>::value)
+
+namespace detail
+{
+
+template <typename Device, typename HintTag>
+struct FindHintOperators
+{
+  VTKM_IS_DEVICE_ADAPTER_TAG(Device);
+
+  template <typename Hint>
+  using HintMatches = vtkm::internal::meta::And<std::is_same<typename Hint::Tag, HintTag>,
+                                                vtkm::ListHas<typename Hint::DeviceList, Device>>;
+  template <typename Found, typename Next>
+  using ReduceOperator = typename std::conditional<HintMatches<Next>::value, Next, Found>::type;
+};
+
+} // namespace detail
+
+/// @brief Find a hint of a particular type.
+///
+/// The `HintFind` template can be used to find a hint of a particular type.
+/// `HintFind` is provided a default value to use for a hint, and it returns
+/// a hint in the hint list that matches the type of the provided default and
+/// applies to the provided device tag.
+///
+/// If multiple hints match the type and device, the _last_ one in the list
+/// is returned. Thus, when constructing hint lists, put the more general hints
+/// first and more specific ones last.
+template <typename HList, typename DefaultHint, typename Device>
+using HintFind = vtkm::ListReduce<
+  typename HList::List,
+  detail::FindHintOperators<Device, typename DefaultHint::Tag>::template ReduceOperator,
+  DefaultHint>;
+
+}
+}
+} // namespace vtkm::cont::internal
+
+#endif // vtk_m_cont_internal_Hints_h
--- a/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h
+++ b/vtkm/cont/kokkos/internal/DeviceAdapterAlgorithmKokkos.h
@ -670,9 +670,9 @@ public:
  }

  //----------------------------------------------------------------------------
-  template <typename WType, typename IType>
+  template <typename WType, typename IType, typename Hints>
  VTKM_CONT static void ScheduleTask(
-    vtkm::exec::kokkos::internal::TaskBasic1D<WType, IType>& functor,
+    vtkm::exec::kokkos::internal::TaskBasic1D<WType, IType, Hints>& functor,
    vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -685,15 +685,22 @@ public:

    functor.SetErrorMessageBuffer(GetErrorMessageBufferInstance());

-    Kokkos::RangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace, vtkm::Id> policy(
-      vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numInstances);
+    constexpr vtkm::IdComponent maxThreadsPerBlock =
+      vtkm::cont::internal::HintFind<Hints,
+                                     vtkm::cont::internal::HintThreadsPerBlock<0>,
+                                     vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads;
+
+    Kokkos::RangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace,
+                        Kokkos::LaunchBounds<maxThreadsPerBlock, 0>,
+                        Kokkos::IndexType<vtkm::Id>>
+      policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(), 0, numInstances);
    Kokkos::parallel_for(policy, functor);
    CheckForErrors(); // synchronizes
  }

-  template <typename WType, typename IType>
+  template <typename WType, typename IType, typename Hints>
  VTKM_CONT static void ScheduleTask(
-    vtkm::exec::kokkos::internal::TaskBasic3D<WType, IType>& functor,
+    vtkm::exec::kokkos::internal::TaskBasic3D<WType, IType, Hints>& functor,
    vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
@ -706,7 +713,13 @@ public:

    functor.SetErrorMessageBuffer(GetErrorMessageBufferInstance());

+    constexpr vtkm::IdComponent maxThreadsPerBlock =
+      vtkm::cont::internal::HintFind<Hints,
+                                     vtkm::cont::internal::HintThreadsPerBlock<0>,
+                                     vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads;
+
    Kokkos::MDRangePolicy<vtkm::cont::kokkos::internal::ExecutionSpace,
+                          Kokkos::LaunchBounds<maxThreadsPerBlock, 0>,
                          Kokkos::Rank<3>,
                          Kokkos::IndexType<vtkm::Id>>
      policy(vtkm::cont::kokkos::internal::GetExecutionSpaceInstance(),
@ -729,24 +742,38 @@ public:
    CheckForErrors(); // synchronizes
  }

-  template <class Functor>
-  VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
+  template <typename Hints, typename Functor>
+  VTKM_CONT static void Schedule(Hints, Functor functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

-    vtkm::exec::kokkos::internal::TaskBasic1D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::kokkos::internal::TaskBasic1D<Functor, vtkm::internal::NullType, Hints> kernel(
+      functor);
    ScheduleTask(kernel, numInstances);
  }

-  template <class Functor>
-  VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
+  }
+
+  template <typename Hints, typename Functor>
+  VTKM_CONT static void Schedule(Hints, Functor functor, const vtkm::Id3& rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

-    vtkm::exec::kokkos::internal::TaskBasic3D<Functor, vtkm::internal::NullType> kernel(functor);
+    vtkm::exec::kokkos::internal::TaskBasic3D<Functor, vtkm::internal::NullType, Hints> kernel(
+      functor);
    ScheduleTask(kernel, rangeMax);
  }

+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
+  }
+
  //----------------------------------------------------------------------------
 private:
  template <typename T>
@ -1020,20 +1047,28 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagKokkos>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
-  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id)
+  template <typename Hints, typename WorkletType, typename InvocationType>
+  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType, Hints>
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id, Hints = Hints{})
  {
-    return vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType>(worklet,
-                                                                                  invocation);
+    return vtkm::exec::kokkos::internal::TaskBasic1D<WorkletType, InvocationType, Hints>(
+      worklet, invocation);
  }

-  template <typename WorkletType, typename InvocationType>
-  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType>
-  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3)
+  template <typename Hints, typename WorkletType, typename InvocationType>
+  VTKM_CONT static vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType, Hints>
+  MakeTask(WorkletType& worklet, InvocationType& invocation, vtkm::Id3, Hints = {})
  {
-    return vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType>(worklet,
-                                                                                  invocation);
+    return vtkm::exec::kokkos::internal::TaskBasic3D<WorkletType, InvocationType, Hints>(
+      worklet, invocation);
+  }
+
+  template <typename WorkletType, typename InvocationType, typename RangeType>
+  VTKM_CONT static auto MakeTask(WorkletType& worklet,
+                                 InvocationType& invocation,
+                                 const RangeType& range)
+  {
+    return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
  }
 };
 }
--- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h
+++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h
@ -359,8 +359,8 @@ public:
  VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::openmp::internal::TaskTiling3D& functor,
                                            vtkm::Id3 size);

-  template <class FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id numInstances)
+  template <typename Hints, typename FunctorType>
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

@ -368,8 +368,14 @@ public:
    ScheduleTask(kernel, numInstances);
  }

-  template <class FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 rangeMax)
+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, numInstances);
+  }
+
+  template <typename Hints, typename FunctorType>
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

@ -377,6 +383,12 @@ public:
    ScheduleTask(kernel, rangeMax);
  }

+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, rangeMax);
+  }
+
  VTKM_CONT static void Synchronize()
  {
    // Nothing to do. This device schedules all of its operations using a
@ -390,21 +402,33 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagOpenMP>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::openmp::internal::TaskTiling1D MakeTask(const WorkletType& worklet,
                                                             const InvocationType& invocation,
-                                                             vtkm::Id)
+                                                             vtkm::Id,
+                                                             Hints = Hints{})
  {
+    // Currently ignoring hints.
    return vtkm::exec::openmp::internal::TaskTiling1D(worklet, invocation);
  }

-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::openmp::internal::TaskTiling3D MakeTask(const WorkletType& worklet,
                                                             const InvocationType& invocation,
-                                                             vtkm::Id3)
+                                                             vtkm::Id3,
+                                                             Hints = Hints{})
  {
+    // Currently ignoring hints.
    return vtkm::exec::openmp::internal::TaskTiling3D(worklet, invocation);
  }
+
+  template <typename WorkletType, typename InvocationType, typename RangeType>
+  VTKM_CONT static auto MakeTask(WorkletType& worklet,
+                                 InvocationType& invocation,
+                                 const RangeType& range)
+  {
+    return MakeTask<vtkm::cont::internal::HintList<>>(worklet, invocation, range);
+  }
 };
 }
 } // namespace vtkm::cont
--- a/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h
+++ b/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h
@ -400,8 +400,8 @@ public:
  VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::serial::internal::TaskTiling3D& functor,
                                            vtkm::Id3 size);

-  template <class FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id size)
+  template <typename Hints, typename FunctorType>
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id size)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

@ -409,8 +409,14 @@ public:
    ScheduleTask(kernel, size);
  }

-  template <class FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 size)
+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id size)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, size);
+  }
+
+  template <typename Hints, typename FunctorType>
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 size)
  {
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

@ -418,6 +424,12 @@ public:
    ScheduleTask(kernel, size);
  }

+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 size)
+  {
+    Schedule(vtkm::cont::internal::HintList<>{}, functor, size);
+  }
+
 private:
  template <typename Vin,
            typename I,
@ -557,21 +569,33 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagSerial>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::serial::internal::TaskTiling1D MakeTask(WorkletType& worklet,
                                                             InvocationType& invocation,
-                                                             vtkm::Id)
+                                                             vtkm::Id,
+                                                             Hints = Hints{})
  {
+    // Currently ignoring hints.
    return vtkm::exec::serial::internal::TaskTiling1D(worklet, invocation);
  }

-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::serial::internal::TaskTiling3D MakeTask(WorkletType& worklet,
                                                             InvocationType& invocation,
-                                                             vtkm::Id3)
+                                                             vtkm::Id3,
+                                                             Hints = Hints{})
  {
+    // Currently ignoring hints.
    return vtkm::exec::serial::internal::TaskTiling3D(worklet, invocation);
  }
+
+  /// Overload for callers that pass no hints. It delegates to the hint-aware
+  /// `MakeTask` overloads with an empty `HintList` given as the explicit
+  /// `Hints` template argument.
+  template <typename WorkletType, typename InvocationType, typename RangeType>
+  VTKM_CONT static auto MakeTask(WorkletType& worklet,
+                                 InvocationType& invocation,
+                                 const RangeType& range)
+  {
+    using EmptyHints = vtkm::cont::internal::HintList<>;
+    return MakeTask<EmptyHints>(worklet, invocation, range);
+  }
 };
 }
 } // namespace vtkm::cont
--- a/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h
+++ b/vtkm/cont/tbb/internal/DeviceAdapterAlgorithmTBB.h
@ -259,8 +259,8 @@ public:
  VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::tbb::internal::TaskTiling3D& functor,
                                            vtkm::Id3 size);

-  template <class FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id numInstances)
+  template <typename Hints, typename FunctorType>
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id numInstances)
  {
    VTKM_LOG_SCOPE(vtkm::cont::LogLevel::Perf,
                   "Schedule TBB 1D: '%s'",
@ -270,8 +270,14 @@ public:
    ScheduleTask(kernel, numInstances);
  }

-  template <class FunctorType>
-  VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 rangeMax)
+  /// 1D scheduling without hints: delegates to the hint-aware overload with an
+  /// empty `HintList`. `functor` is handed on as an lvalue, so that overload's
+  /// by-value parameter receives a copy.
+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id numInstances)
+  {
+    using NoHints = vtkm::cont::internal::HintList<>;
+    Schedule(NoHints{}, functor, numInstances);
+  }
+
+  template <typename Hints, typename FunctorType>
+  VTKM_CONT static inline void Schedule(Hints, FunctorType functor, vtkm::Id3 rangeMax)
  {
    VTKM_LOG_SCOPE(vtkm::cont::LogLevel::Perf,
                   "Schedule TBB 3D: '%s'",
@ -281,6 +287,12 @@ public:
    ScheduleTask(kernel, rangeMax);
  }

+  /// 3D scheduling without hints: delegates to the hint-aware overload with an
+  /// empty `HintList`. `functor` is handed on as an lvalue, so that overload's
+  /// by-value parameter receives a copy.
+  template <typename FunctorType>
+  VTKM_CONT static inline void Schedule(FunctorType&& functor, vtkm::Id3 rangeMax)
+  {
+    using NoHints = vtkm::cont::internal::HintList<>;
+    Schedule(NoHints{}, functor, rangeMax);
+  }
+
  //1. We need functions for each of the following


@ -421,21 +433,33 @@ template <>
 class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagTBB>
 {
 public:
-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::tbb::internal::TaskTiling1D MakeTask(WorkletType& worklet,
                                                          InvocationType& invocation,
-                                                          vtkm::Id)
+                                                          vtkm::Id,
+                                                          Hints = Hints{})
  {
+    // Currently ignoring hints.
    return vtkm::exec::tbb::internal::TaskTiling1D(worklet, invocation);
  }

-  template <typename WorkletType, typename InvocationType>
+  template <typename Hints, typename WorkletType, typename InvocationType>
  static vtkm::exec::tbb::internal::TaskTiling3D MakeTask(WorkletType& worklet,
                                                          InvocationType& invocation,
-                                                          vtkm::Id3)
+                                                          vtkm::Id3,
+                                                          Hints = Hints{})
  {
+    // Currently ignoring hints.
    return vtkm::exec::tbb::internal::TaskTiling3D(worklet, invocation);
  }
+
+  /// Overload for callers that pass no hints. It delegates to the hint-aware
+  /// `MakeTask` overloads with an empty `HintList` given as the explicit
+  /// `Hints` template argument.
+  template <typename WorkletType, typename InvocationType, typename RangeType>
+  VTKM_CONT static auto MakeTask(WorkletType& worklet,
+                                 InvocationType& invocation,
+                                 const RangeType& range)
+  {
+    using EmptyHints = vtkm::cont::internal::HintList<>;
+    return MakeTask<EmptyHints>(worklet, invocation, range);
+  }
 };
 }
 } // namespace vtkm::cont
--- a/vtkm/cont/testing/CMakeLists.txt
+++ b/vtkm/cont/testing/CMakeLists.txt
@ -107,6 +107,7 @@ set(unit_tests_device
  UnitTestDataSetPermutation.cxx
  UnitTestDataSetSingleType.cxx
  UnitTestDeviceAdapterAlgorithmDependency.cxx
+  UnitTestHints.cxx
  UnitTestImplicitFunction.cxx
  UnitTestParticleArrayCopy.cxx
  UnitTestPointLocatorSparseGrid.cxx
@ -131,6 +132,6 @@ endif()

 vtkm_unit_tests(SOURCES ${unit_tests} DEVICE_SOURCES ${unit_tests_device})

 #add distributed tests i.e.test to run with MPI
 #if MPI is enabled.
 set(mpi_unit_tests
--- a/vtkm/cont/testing/UnitTestHints.cxx
+++ b/vtkm/cont/testing/UnitTestHints.cxx
@ -0,0 +1,108 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+
+#include <vtkm/cont/internal/Hints.h>
+
+#include <vtkm/cont/Algorithm.h>
+#include <vtkm/cont/DeviceAdapter.h>
+
+#include <vtkm/exec/FunctorBase.h>
+
+#include <vtkm/cont/testing/Testing.h>
+
+namespace UnitTestHintNamespace
+{
+
+// Checks which hint `HintFind` reports for a given hint list, fallback hint, and
+// device tag by comparing the `MaxThreads` member of the result.
+//
+// Every condition carries its own pair of parentheses: the template argument
+// lists contain commas, and the preprocessor splits macro arguments at any comma
+// that is not nested in parentheses (angle brackets do not protect them).
+void CheckFind()
+{
+  std::cout << "Empty list returns default.\n";
+  VTKM_TEST_ASSERT(
+    (vtkm::cont::internal::HintFind<vtkm::cont::internal::HintList<>,
+                                    vtkm::cont::internal::HintThreadsPerBlock<128>,
+                                    vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == 128));
+
+  std::cout << "Find a hint that matches.\n";
+  // Hint given without a device list.
+  VTKM_TEST_ASSERT(
+    (vtkm::cont::internal::HintFind<
+       vtkm::cont::internal::HintList<vtkm::cont::internal::HintThreadsPerBlock<128>>,
+       vtkm::cont::internal::HintThreadsPerBlock<0>,
+       vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == 128));
+  // Hint explicitly tied to the queried device.
+  VTKM_TEST_ASSERT(
+    (vtkm::cont::internal::HintFind<
+       vtkm::cont::internal::HintList<
+         vtkm::cont::internal::HintThreadsPerBlock<128,
+                                                   vtkm::List<vtkm::cont::DeviceAdapterTagKokkos>>>,
+       vtkm::cont::internal::HintThreadsPerBlock<0>,
+       vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads == 128));
+
+  std::cout << "Skip a hint that does not match.\n";
+  // The only hint is tied to Kokkos; querying Serial must yield the fallback's 0.
+  VTKM_TEST_ASSERT(
+    (vtkm::cont::internal::HintFind<
+       vtkm::cont::internal::HintList<
+         vtkm::cont::internal::HintThreadsPerBlock<128,
+                                                   vtkm::List<vtkm::cont::DeviceAdapterTagKokkos>>>,
+       vtkm::cont::internal::HintThreadsPerBlock<0>,
+       vtkm::cont::DeviceAdapterTagSerial>::MaxThreads == 0));
+
+  std::cout << "Given a list of hints, pick the last one that matches\n";
+  {
+    // One hint without a device list, followed by one tied to Cuda and one tied to Kokkos.
+    using HList = vtkm::cont::internal::HintList<
+      vtkm::cont::internal::HintThreadsPerBlock<64>,
+      vtkm::cont::internal::HintThreadsPerBlock<128, vtkm::List<vtkm::cont::DeviceAdapterTagCuda>>,
+      vtkm::cont::internal::HintThreadsPerBlock<256,
+                                                vtkm::List<vtkm::cont::DeviceAdapterTagKokkos>>>;
+    using HInit = vtkm::cont::internal::HintThreadsPerBlock<0>;
+    VTKM_TEST_ASSERT((vtkm::cont::internal::
+                        HintFind<HList, HInit, vtkm::cont::DeviceAdapterTagSerial>::MaxThreads ==
+                      64));
+    VTKM_TEST_ASSERT(
+      (vtkm::cont::internal::HintFind<HList, HInit, vtkm::cont::DeviceAdapterTagCuda>::MaxThreads ==
+       128));
+    VTKM_TEST_ASSERT((vtkm::cont::internal::
+                        HintFind<HList, HInit, vtkm::cont::DeviceAdapterTagKokkos>::MaxThreads ==
+                      256));
+  }
+}
+
+// Functor whose 1D and 3D call operators are both empty; it only exists so that
+// there is something to hand to Schedule.
+struct MyFunctor : vtkm::exec::FunctorBase
+{
+  // Called with a vtkm::Id index; intentionally does nothing.
+  VTKM_EXEC void operator()(vtkm::Id vtkmNotUsed(index)) const {}
+
+  // Called with a vtkm::Id3 index; intentionally does nothing.
+  VTKM_EXEC void operator()(vtkm::Id3 vtkmNotUsed(index)) const {}
+};
+
+// Smoke test: passes a hint list to Schedule once with a scalar size and once
+// with a vtkm::Id3 size. Device adapters are free to ignore hints, so no effect
+// of the hint can be observed here; the only thing verified is that such calls
+// are accepted.
+void CheckSchedule()
+{
+  std::cout << "Schedule a functor using hints.\n";
+
+  using ThreadHint = vtkm::cont::internal::HintThreadsPerBlock<128>;
+  using Hints = vtkm::cont::internal::HintList<ThreadHint>;
+
+  // Scalar size.
+  vtkm::cont::Algorithm::Schedule(Hints{}, MyFunctor{}, 10);
+  // vtkm::Id3 size.
+  vtkm::cont::Algorithm::Schedule(Hints{}, MyFunctor{}, vtkm::Id3{ 2 });
+}
+
+// Runs every check in this file: hint lookup first, then scheduling with hints.
+void Run()
+{
+  CheckFind();
+  CheckSchedule();
+}
+
+} // namespace UnitTestHintNamespace
+
+// Test entry point: hands Run to the VTK-m testing harness and returns its result.
+int UnitTestHints(int argc, char* argv[])
+{
+  using vtkm::cont::testing::Testing;
+  return Testing::Run(UnitTestHintNamespace::Run, argc, argv);
+}
--- a/vtkm/exec/TaskBase.h
+++ b/vtkm/exec/TaskBase.h
@ -12,6 +12,8 @@

 #include <vtkm/Types.h>

+#include <vtkm/cont/internal/Hints.h>
+
 #include <vtkm/exec/internal/ErrorMessageBuffer.h>

 namespace vtkm
--- a/vtkm/exec/cuda/internal/TaskStrided.h
+++ b/vtkm/exec/cuda/internal/TaskStrided.h
@ -50,9 +50,11 @@ protected:
  SetErrorBufferSignature SetErrorBufferFunction = nullptr;
 };

-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskStrided1D : public TaskStrided
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  TaskStrided1D(const WType& worklet, const IType& invocation)
    : TaskStrided()
@ -90,9 +92,11 @@ private:
  const IType Invocation;
 };

-template <typename WType>
-class TaskStrided1D<WType, vtkm::internal::NullType> : public TaskStrided
+template <typename WType, typename Hints>
+class TaskStrided1D<WType, vtkm::internal::NullType, Hints> : public TaskStrided
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  TaskStrided1D(WType& worklet)
    : TaskStrided()
@ -116,9 +120,11 @@ private:
  typename std::remove_const<WType>::type Worklet;
 };

-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskStrided3D : public TaskStrided
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  TaskStrided3D(const WType& worklet, const IType& invocation)
    : TaskStrided()
@ -165,9 +171,11 @@ private:
  const IType Invocation;
 };

-template <typename WType>
-class TaskStrided3D<WType, vtkm::internal::NullType> : public TaskStrided
+template <typename WType, typename Hints>
+class TaskStrided3D<WType, vtkm::internal::NullType, Hints> : public TaskStrided
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  TaskStrided3D(WType& worklet)
    : TaskStrided()
--- a/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu
+++ b/vtkm/exec/cuda/testing/UnitTestTaskStrided.cu
@ -342,8 +342,8 @@ void TestErrorFunctorInvoke()
      TestExecObject(input.PrepareForInPlace(DeviceAdapter(), token)),
      TestExecObject(output.PrepareForInPlace(DeviceAdapter(), token)));

-  using TaskStrided1 =
-    vtkm::exec::cuda::internal::TaskStrided1D<TestWorkletErrorProxy, InvocationType1>;
+  using TaskStrided1 = vtkm::exec::cuda::internal::
+    TaskStrided1D<TestWorkletErrorProxy, InvocationType1, vtkm::cont::internal::HintList<>>;
  TestWorkletErrorProxy worklet;
  InvocationType1 invocation(execObjects);

--- a/vtkm/exec/kokkos/internal/TaskBasic.h
+++ b/vtkm/exec/kokkos/internal/TaskBasic.h
@ -24,9 +24,11 @@ namespace kokkos
 namespace internal
 {

-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskBasic1D : public vtkm::exec::TaskBase
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  TaskBasic1D(const WType& worklet, const IType& invocation)
    : Worklet(worklet)
@ -57,9 +59,11 @@ private:
  IType Invocation;
 };

-template <typename WType>
-class TaskBasic1D<WType, vtkm::internal::NullType> : public vtkm::exec::TaskBase
+template <typename WType, typename Hints>
+class TaskBasic1D<WType, vtkm::internal::NullType, Hints> : public vtkm::exec::TaskBase
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  explicit TaskBasic1D(const WType& worklet)
    : Worklet(worklet)
@ -78,9 +82,11 @@ private:
  typename std::remove_const<WType>::type Worklet;
 };

-template <typename WType, typename IType>
+template <typename WType, typename IType, typename Hints>
 class TaskBasic3D : public vtkm::exec::TaskBase
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  TaskBasic3D(const WType& worklet, const IType& invocation)
    : Worklet(worklet)
@ -112,9 +118,11 @@ private:
  IType Invocation;
 };

-template <typename WType>
-class TaskBasic3D<WType, vtkm::internal::NullType> : public vtkm::exec::TaskBase
+template <typename WType, typename Hints>
+class TaskBasic3D<WType, vtkm::internal::NullType, Hints> : public vtkm::exec::TaskBase
 {
+  VTKM_IS_HINT_LIST(Hints);
+
 public:
  explicit TaskBasic3D(const WType& worklet)
    : Worklet(worklet)
--- a/vtkm/worklet/internal/DispatcherBase.h
+++ b/vtkm/worklet/internal/DispatcherBase.h
@ -792,7 +792,8 @@ private:
    // vtkm::exec::internal::TaskSingular
    // vtkm::exec::internal::TaskTiling1D
    // vtkm::exec::internal::TaskTiling3D
-    auto task = TaskTypes::MakeTask(this->Worklet, invocation, range);
+    auto task =
+      TaskTypes::MakeTask(this->Worklet, invocation, range, typename WorkletType::Hints{});
    Algorithm::ScheduleTask(task, range);
  }
 };
--- a/vtkm/worklet/internal/WorkletBase.h
+++ b/vtkm/worklet/internal/WorkletBase.h
@ -40,6 +40,8 @@
 #include <vtkm/cont/arg/TypeCheckTagCellSet.h>
 #include <vtkm/cont/arg/TypeCheckTagExecObject.h>

+#include <vtkm/cont/internal/Hints.h>
+
 #include <vtkm/worklet/MaskNone.h>
 #include <vtkm/worklet/ScatterIdentity.h>
 #include <vtkm/worklet/internal/Placeholders.h>
@ -136,6 +138,11 @@ public:
  /// everything in the output domain.
  using MaskType = vtkm::worklet::MaskNone;

+  /// Worklets can provide hints to the scheduler by defining a `Hints` type that
+  /// resolves to a `vtkm::cont::internal::HintList`. The default hint list is empty
+  /// so that scheduling uses all defaults.
+  using Hints = vtkm::cont::internal::HintList<>;
+
  /// @brief `ControlSignature` tag for whole input arrays.
  ///
  /// The `WholeArrayIn` control signature tag specifies a `vtkm::cont::ArrayHandle`