Move VTK-m Cuda backend over to a grid-stride iteration pattern.

This simplifies the host-side logic for determining grid and block
sizes, and reduces the library size by moving some of that logic into
functions compiled into the library.
Robert Maynard 2018-04-27 14:19:15 -04:00
parent 5c5fb020a8
commit b56894dd09
12 changed files with 596 additions and 539 deletions
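For reference, the grid-stride pattern this change adopts (see the NVIDIA blog post linked from the new kernels) looks roughly like the sketch below; the kernel and variable names are illustrative only and are not part of the commit.

// Minimal grid-stride sketch (illustrative names, not VTK-m code).
// Each thread starts at its global index and advances by the total number
// of threads in the grid, so any grid/block configuration covers the range.
__global__ void AddOneKernel(float* data, int n)
{
  const int start = blockIdx.x * blockDim.x + threadIdx.x;
  const int inc = blockDim.x * gridDim.x;
  for (int i = start; i < n; i += inc)
  {
    data[i] += 1.0f;
  }
}
// Host side: the launch no longer has to be sized to cover n exactly,
// which is the "easier host side logic" mentioned above, e.g.
//   AddOneKernel<<<32 * numSMs, 128>>>(devData, n);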

@ -43,4 +43,5 @@ target_sources(vtkm_cont PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu
)

@ -31,6 +31,8 @@
// Here are the actual implementation of the algorithms.
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <cuda.h>
namespace vtkm
@ -257,24 +259,24 @@ class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
{
public:
template <typename WorkletType, typename InvocationType>
static vtkm::exec::internal::TaskSingular<WorkletType, InvocationType> MakeTask(
static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType> MakeTask(
WorkletType& worklet,
InvocationType& invocation,
vtkm::Id,
vtkm::Id globalIndexOffset = 0)
{
using Task = vtkm::exec::internal::TaskSingular<WorkletType, InvocationType>;
using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
return Task(worklet, invocation, globalIndexOffset);
}
template <typename WorkletType, typename InvocationType>
static vtkm::exec::internal::TaskSingular<WorkletType, InvocationType> MakeTask(
static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType> MakeTask(
WorkletType& worklet,
InvocationType& invocation,
vtkm::Id3,
vtkm::Id globalIndexOffset = 0)
{
using Task = vtkm::exec::internal::TaskSingular<WorkletType, InvocationType>;
using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
return Task(worklet, invocation, globalIndexOffset);
}
};

@ -0,0 +1,153 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
#include <atomic>
#include <mutex>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
VTKM_CONT_EXPORT int getNumSMs(int dId)
{
//check if the per-device SM lookup table has already been built
static bool lookupBuilt = false;
static std::vector<int> numSMs;
if (!lookupBuilt)
{
//lock the mutex
static std::mutex built_mutex;
std::lock_guard<std::mutex> lock(built_mutex);
//iterate over all devices
int numberOfSMs = 0;
int count = 0;
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
numSMs.reserve(static_cast<std::size_t>(count));
for (int deviceId = 0; deviceId < count; ++deviceId)
{ //get the number of sm's per deviceId
VTKM_CUDA_CALL(
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
numSMs.push_back(numberOfSMs);
}
lookupBuilt = true;
}
return numSMs[dId];
}
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
template <>
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
vtkm::Id& arraySize,
char** hostPointer)
{
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
static bool errorArrayInit = false;
static char* hostPtr = nullptr;
static char* devicePtr = nullptr;
if (!errorArrayInit)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
errorArrayInit = true;
}
//set the size of the array
arraySize = ERROR_ARRAY_SIZE;
//specify the host pointer to the memory
*hostPointer = hostPtr;
(void)hostPointer;
return devicePtr;
}
template <>
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
vtkm::exec::cuda::internal::TaskStrided& functor)
{
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
//clear the first character, which signals that no error is present
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
return hostErrorPtr;
}
template <>
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
int& grids,
int& blocks,
vtkm::Id size)
{
(void)size;
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
grids = 32 * getNumSMs(deviceId);
blocks = 128;
}
template <>
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
int& grids,
dim3& blocks,
const dim3& size)
{
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
grids = 32 * getNumSMs(deviceId);
if (size.x == 0)
{ //grids that have no x dimension
blocks.x = 1;
blocks.y = 16;
blocks.z = 8;
}
else if (size.x > 128)
{
blocks.x = 64;
blocks.y = 2;
blocks.z = 1;
}
else
{ //for really small grids
blocks.x = 8;
blocks.y = 4;
blocks.z = 4;
}
}
}
}
}
}

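A rough standalone illustration of the heuristic above (hypothetical kernel name; the real path goes through ScheduleTask in the header below): every 1D launch uses a fixed block size of 128 and 32 blocks per SM, so a device with, say, 80 SMs always gets 32 * 80 = 2560 blocks, independent of how many instances are scheduled.

// Standalone sketch of the 1D launch heuristic used above.
int deviceId = 0;
int numSMs = 0;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
const int grids = 32 * numSMs; // e.g. 2560 blocks on an 80-SM device
const int blocks = 128;        // fixed; the grid-stride loop absorbs any data size
// SomeGridStrideKernel<<<grids, blocks, 0, cudaStreamPerThread>>>(..., numInstances);
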
@ -21,6 +21,8 @@
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmThrust_h
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/TypeTraits.h>
#include <vtkm/Types.h>
#include <vtkm/UnaryPredicates.h>
@ -34,15 +36,16 @@
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <vtkm/exec/cuda/internal/WrappedOperators.h>
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
#include <vtkm/exec/internal/TaskSingular.h>
#include <vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h>
// #define ANALYZE_VTKM_SCHEDULER_1D
// #define ANALYZE_VTKM_SCHEDULER_3D
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D
#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D)
#include <vtkm/cont/cuda/internal/TaskTuner.h>
#endif
@ -62,7 +65,6 @@ VTKM_THIRDPARTY_PRE_INCLUDE
#include <vtkm/exec/cuda/internal/ExecutionPolicy.h>
VTKM_THIRDPARTY_POST_INCLUDE
#include <atomic>
namespace vtkm
{
@ -82,73 +84,20 @@ namespace internal
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
static __global__ void DetermineProperXGridSize(vtkm::UInt32 desired_size,
vtkm::UInt32* actual_size)
template <typename TaskType>
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size)
{
//used only to see if we can launch kernels with a x grid size that
//matches the max of the graphics card, or are we having to fall back
//to SM_2 grid sizes
if (blockIdx.x != 0)
{
return;
}
#if __CUDA_ARCH__ <= 200
const vtkm::UInt32 maxXGridSizeForSM2 = 65535;
*actual_size = maxXGridSizeForSM2;
#else
*actual_size = desired_size;
#endif
//see https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
//for why our inc is grid-stride
const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
const vtkm::Id inc = blockDim.x * gridDim.x;
task(start, size, inc);
}
template <class FunctorType>
__global__ void Schedule1DIndexKernel(FunctorType functor,
vtkm::Id numberOfKernelsInvoked,
vtkm::Id length)
{
//Note a cuda launch can only handle at most 2B iterations of a kernel
//because it holds all of the indexes inside UInt32, so for use to
//handle datasets larger than 2B, we need to execute multiple kernels
const vtkm::Id index =
numberOfKernelsInvoked + static_cast<vtkm::Id>(blockDim.x * blockIdx.x + threadIdx.x);
if (index < length)
{
functor(index);
}
}
template <class FunctorType>
__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size)
{
const vtkm::Id3 index(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);
if (index[0] >= size.x || index[1] >= size.y || index[2] >= size.z)
{
return;
}
functor(index);
}
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
// Currently we are getting compile failures with vtkm::worklet::wavelets::InverseTransformOdd
// for an unknown reason
template <class FunctorType>
__global__ void Schedule1DIndexKernel2(FunctorType functor,
vtkm::Id numberOfKernelsInvoked,
vtkm::Id length)
{
vtkm::Id index = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
for (; index < length; index += inc)
{
functor(index);
}
}
template <class FunctorType>
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
template <typename TaskType>
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size)
{
//This is the 3D version of executing in a grid-stride manner
const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);
@ -158,17 +107,11 @@ __global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
{
for (uint j = start.y; j < size.y; j += inc.y)
{
vtkm::Id3 index(start.x, j, k);
for (vtkm::Id i = start.x; i < size.x; i += inc.x)
{
index[0] = i;
functor(index);
}
task(start.x, size.x, inc.x, j, k);
}
}
}
#endif
template <typename T, typename BinaryOperationType>
__global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
@ -180,16 +123,6 @@ __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_
#pragma GCC diagnostic pop
#endif
inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d)
{
gridSize3d.x = (rangeMax.x % blockSize3d.x != 0) ? (rangeMax.x / blockSize3d.x + 1)
: (rangeMax.x / blockSize3d.x);
gridSize3d.y = (rangeMax.y % blockSize3d.y != 0) ? (rangeMax.y / blockSize3d.y + 1)
: (rangeMax.y / blockSize3d.y);
gridSize3d.z = (rangeMax.z % blockSize3d.z != 0) ? (rangeMax.z / blockSize3d.z + 1)
: (rangeMax.z / blockSize3d.z);
}
/// This class can be subclassed to implement the DeviceAdapterAlgorithm for a
/// device that uses thrust as its implementation. The subclass should pass in
/// the correct device adapter tag as the template parameter.
@ -1142,89 +1075,24 @@ public:
::thrust::equal_to<T>(),
binary_functor);
}
// Because of some funny code conversions in nvcc, kernels for devices have to
// be public.
#ifndef VTKM_CUDA
private:
#endif
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
VTKM_CONT
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer)
{
const vtkm::Id ERROR_ARRAY_SIZE = 1024;
static bool errorArrayInit = false;
static char* hostPtr = nullptr;
static char* devicePtr = nullptr;
if (!errorArrayInit)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
errorArrayInit = true;
}
//set the size of the array
arraySize = ERROR_ARRAY_SIZE;
VTKM_CONT_EXPORT
static char* GetPinnedErrorArray(vtkm::Id& arraySize, char** hostPointer);
//specify the host pointer to the memory
*hostPointer = hostPtr;
(void)hostPointer;
return devicePtr;
}
VTKM_CONT_EXPORT
static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
// we query cuda for the max blocks per grid for 1D scheduling
// and cache the values in static variables
VTKM_CONT
static vtkm::Vec<vtkm::UInt32, 3> GetMaxGridOfThreadBlocks()
{
static std::atomic<bool> gridQueryInit(false);
static vtkm::Vec<vtkm::UInt32, 3> maxGridSize;
// NOTE: The following code may still be executed by multiple threads
// but it should not cause any correctness issues.
if (!gridQueryInit)
{
int currDevice;
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
VTKM_CONT_EXPORT
static void GetGridsAndBlocks(int& grid, int& blocks, vtkm::Id size);
cudaDeviceProp properties;
VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice));
maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
//Note: While in practice SM_3+ devices can schedule up to (2^31-1) grids
//in the X direction, it is dependent on the code being compiled for SM3+.
//If not, it falls back to SM_2 limitation of 65535 being the largest grid
//size.
//Now since SM architecture is only available inside kernels we have to
//invoke one to see what the actual limit is for our device. So that is
//what we are going to do next, and than we will store that result
vtkm::UInt32* dev_actual_size;
VTKM_CUDA_CALL(cudaMalloc((void**)&dev_actual_size, sizeof(vtkm::UInt32)));
DetermineProperXGridSize<<<1, 1, 0, cudaStreamPerThread>>>(maxGridSize[0], dev_actual_size);
VTKM_CUDA_CALL(cudaMemcpyAsync(&maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost,
cudaStreamPerThread));
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
gridQueryInit = true;
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
}
return maxGridSize;
}
VTKM_CONT_EXPORT
static void GetGridsAndBlocks(int& grid, dim3& blocks, const dim3& size);
public:
template <class TaskType, typename RangeType>
VTKM_CONT static void ScheduleTask(TaskType task, RangeType size)
{ //for now defer to the schedule api, we need to do a significant
//amount of build infrastructure work to implement type erasure tasks
//for cuda
Schedule(task, size);
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
template <typename WType, typename IType>
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
vtkm::Id numInstances)
{
VTKM_ASSERT(numInstances >= 0);
if (numInstances < 1)
@ -1232,45 +1100,12 @@ public:
// No instances means nothing to run. Just return.
return;
}
char* hostErrorPtr = SetupErrorBuffer(functor);
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
int grids, blocks;
GetGridsAndBlocks(grids, blocks, numInstances);
//clear the first character which means that we don't contain an error
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
const vtkm::Id blockSizeAsId = 128;
const vtkm::UInt32 blockSize = 128;
const vtkm::UInt32 maxblocksPerLaunch = GetMaxGridOfThreadBlocks()[0];
const vtkm::UInt32 totalBlocks =
static_cast<vtkm::UInt32>((numInstances + blockSizeAsId - 1) / blockSizeAsId);
//Note a cuda launch can only handle at most 2B iterations of a kernel
//because it holds all of the indexes inside UInt32, so for use to
//handle datasets larger than 2B, we need to execute multiple kernels
if (totalBlocks < maxblocksPerLaunch)
{
Schedule1DIndexKernel<Functor><<<totalBlocks, blockSize, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), numInstances);
}
else
{
const vtkm::Id numberOfKernelsToRun =
blockSizeAsId * static_cast<vtkm::Id>(maxblocksPerLaunch);
for (vtkm::Id numberOfKernelsInvoked = 0; numberOfKernelsInvoked < numInstances;
numberOfKernelsInvoked += numberOfKernelsToRun)
{
Schedule1DIndexKernel<Functor><<<maxblocksPerLaunch, blockSize, 0, cudaStreamPerThread>>>(
functor, numberOfKernelsInvoked, numInstances);
}
}
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, numInstances);
//sync so that we can check the results of the call.
//In the future I want to move this before the schedule call, and throwing
@ -1284,13 +1119,14 @@ public:
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
#ifdef ANALYZE_VTKM_SCHEDULER_1D
compare_1d_dynamic_block_picker(functor, numInstances, totalBlocks, blockSize);
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
parameter_sweep_1d_schedule(functor, numInstances);
#endif
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
template <typename WType, typename IType>
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>& functor,
vtkm::Id3 rangeMax)
{
VTKM_ASSERT((rangeMax[0] >= 0) && (rangeMax[1] >= 0) && (rangeMax[2] >= 0));
if ((rangeMax[0] < 1) || (rangeMax[1] < 1) || (rangeMax[2] < 1))
@ -1298,46 +1134,17 @@ public:
// No instances means nothing to run. Just return.
return;
}
//since the memory is pinned we can access it safely on the host
//without a memcpy
vtkm::Id errorArraySize = 0;
char* hostErrorPtr = nullptr;
char* deviceErrorPtr = GetPinnedErrorArray(errorArraySize, &hostErrorPtr);
//clear the first character which means that we don't contain an error
hostErrorPtr[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
functor.SetErrorMessageBuffer(errorMessage);
char* hostErrorPtr = SetupErrorBuffer(functor);
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]));
//currently we presume that 3d scheduling access patterns prefer accessing
//memory in the X direction. Also should be good for thin in the Z axis
//algorithms.
dim3 blockSize3d(64, 2, 1);
//In general we need more information as this doesn't work well when
//executing on the points, and need to fetch all cells used, as the z
//width is not fat enough
int grids;
dim3 blocks;
GetGridsAndBlocks(grids, blocks, ranges);
//handle the simple use case of 'bad' datasets which are thin in X
//but larger in the other directions, allowing us decent performance with
//that use case.
if (rangeMax[0] <= 128 && (rangeMax[0] < rangeMax[1] || rangeMax[0] < rangeMax[2]))
{
blockSize3d = dim3(16, 4, 4);
}
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
//sync so that we can check the results of the call.
//In the future I want to move this before the schedule call, and throwing
@ -1351,23 +1158,25 @@ public:
throw vtkm::cont::ErrorExecution(hostErrorPtr);
}
#ifdef ANALYZE_VTKM_SCHEDULER_1D
compare_1d_dynamic_block_picker(functor,
rangeMax[0] * rangeMax[1] * rangeMax[2],
gridSize3d.x * gridSize3d.y * gridSize3d.z,
blockSize3d.x * blockSize3d.y * blockSize3d.z);
#endif
#ifdef ANALYZE_VTKM_SCHEDULER_3D
//requires the errormessage buffer be set
compare_3d_dynamic_block_picker(functor, rangeMax, gridSize3d, blockSize3d);
#endif
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
parameter_sweep_3d_schedule(functor, rangeMax);
#endif
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
{
vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
ScheduleTask(kernel, numInstances);
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, const vtkm::Id3& rangeMax)
{
vtkm::exec::cuda::internal::TaskStrided3D<Functor, vtkm::internal::NullType> kernel(functor);
ScheduleTask(kernel, rangeMax);
}
template <typename T, class Storage>
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
{

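To tie this hunk together: a plain functor handed to Schedule is wrapped in the vtkm::internal::NullType specialization of TaskStrided1D/3D (defined in TaskStrided.h below), so worklet tasks and raw functors share the same TaskStrided*Launch kernels. Roughly, with a hypothetical functor:

// Hypothetical functor, for illustration only.
struct DoubleValues
{
  VTKM_EXEC void operator()(vtkm::Id index) const { /* ... */ }
};
// Scheduling DoubleValues{} over numValues now expands to roughly:
//   TaskStrided1D<DoubleValues, vtkm::internal::NullType> kernel(functor);
//   ScheduleTask(kernel, numValues);
// which launches:
//   TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(kernel, numValues);
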
@ -41,293 +41,167 @@ namespace cuda
namespace internal
{
template <class FunctorType>
__global__ void Schedule1DIndexKernel(FunctorType functor, vtkm::Id, vtkm::Id);
template <class FunctorType>
__global__ void Schedule1DIndexKernel2(FunctorType functor, vtkm::Id, vtkm::Id);
template <class FunctorType>
__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size);
template <class FunctorType>
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size);
int getNumSMs(int dId);
void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d);
template <typename TaskType>
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
template <typename TaskType>
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
template <typename Task>
__global__ void TaskStrided1DLaunch(Task task, vtkm::Id size)
struct PerfRecord1d
{
const vtkm::Id start = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
for (vtkm::Id i = start; i < size; i += inc)
{
task(i);
}
}
class PerfRecord
{
public:
PerfRecord(float elapsedT, dim3 block)
PerfRecord1d(float elapsedT, int g, int b)
: elapsedTime(elapsedT)
, blockSize(block)
, grid(g)
, block(b)
{
}
bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
float elapsedTime;
dim3 blockSize;
int grid;
int block;
};
template <typename Task>
static void BlockSizeGuesser(vtkm::Id size, int& grids, int& blocks, float& occupancy)
inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on number of SM's
int device; // device to run on
int numSMs; // number of SMs on the active device
cudaGetDevice(&device);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, Schedule1DIndexKernel2<Task>, 0, 0);
blockSize /= (numSMs * 2);
// Round up according to array size
// gridSize = (size + blockSize - 1) / blockSize;
gridSize = 32 * numSMs;
// std::cout << "numSMs: " << numSMs << std::endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&maxActiveBlocks, Schedule1DIndexKernel2<Task>, blockSize, 0);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
grids = gridSize;
blocks = blockSize;
occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor / props.warpSize);
os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
<< ">>> required: " << record.elapsedTime << "\n";
return os;
}
template <class Functor>
static void compare_1d_dynamic_block_picker(Functor functor,
vtkm::Id size,
const vtkm::Id& currentGridSize,
const vtkm::Id& currentBlockSize)
struct PerfRecord3d
{
const std::type_info& ti = typeid(functor);
std::cout << "fixed 1d block size performance " << ti.name() << std::endl;
PerfRecord3d(float elapsedT, int g, dim3 b)
: elapsedTime(elapsedT)
, grid(g)
, block(b)
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), size);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "Schedule1DIndexKernel size: " << size << std::endl;
std::cout << "GridSize of: " << currentGridSize << " BlockSize of: " << currentBlockSize
<< " required: " << elapsedTimeMilliseconds << std::endl;
}
std::cout << "dynamic 1d block size performance " << ti.name() << std::endl;
{
bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
int grids, blocks;
float occupancy;
BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
float elapsedTime;
int grid;
dim3 block;
};
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
{
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule1DIndexKernel2<Functor><<<grids, blocks, 0, cudaStreamPerThread>>>(
functor, vtkm::Id(0), size);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "Schedule1DIndexKernel2 size: " << size << std::endl;
std::cout << "GridSize of: " << grids << " BlockSize of: " << blocks
<< " required: " << elapsedTimeMilliseconds << std::endl;
}
std::cout << std::endl;
os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
<< "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
return os;
}
template <class Functor>
static void compare_3d_dynamic_block_picker(Functor functor,
vtkm::Id3 ranges,
const dim3& gridSize3d,
const dim3& blockSize3d)
template <typename TaskT>
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
{
const std::type_info& ti = typeid(functor);
std::cout << "fixed 3d block size performance " << ti.name() << std::endl;
std::vector<PerfRecord1d> results;
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192 };
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
for (vtkm::UInt32 g = 0; g < 12; g++)
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
int grids = gridIndexTable[g] * getNumSMs(deviceId);
for (vtkm::UInt32 b = 0; b < 12; b++)
{
int blocks = blockIndexTable[b];
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
// std::cout << "Schedule3DIndexKernel size: " << size << std::endl;
// std::cout << "GridSize of: " << currentGridSize
// << " BlockSize of: " << currentBlockSize << " required: " << elapsedTimeMilliseconds << std::endl;
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
}
}
std::cout << "dynamic 3d block size performance " << ti.name() << std::endl;
std::sort(results.begin(), results.end());
for (auto&& i : results)
{
// int grids, blocks;
// float occupancy;
// BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
// cudaEvent_t start, stop;
// VTKM_CUDA_CALL(cudaEventCreate(&start));
// VTKM_CUDA_CALL(cudaEventCreate(&stop));
// VTKM_CUDA_CALL(cudaEventRecord(start, 0));
// Schedule3DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
// VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
// VTKM_CUDA_CALL(cudaEventSynchronize(stop));
// float elapsedTimeMilliseconds;
// VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
// VTKM_CUDA_CALL(cudaEventDestroy(start));
// VTKM_CUDA_CALL(cudaEventDestroy(stop));
// std::cout << "Schedule3DIndexKernel2 size: " << size << std::endl;
// std::cout << "GridSize of: " << grids
// << " BlockSize of: " << blocks << " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << i << std::endl;
}
std::cout << std::endl;
}
template <class Functor>
static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeMax)
template <typename TaskT>
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
{
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]));
std::vector<PerfRecord> results;
vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
std::vector<PerfRecord3d> results;
for (vtkm::UInt32 i = 0; i < 16; i++)
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24,
28, 30, 32, 64, 128, 256, 512, 1024 };
int deviceId;
for (vtkm::UInt32 g = 0; g < 12; g++)
{
for (vtkm::UInt32 j = 0; j < 16; j++)
int grids = gridIndexTable[g] * getNumSMs(deviceId);
for (vtkm::UInt32 i = 0; i < 16; i++)
{
for (vtkm::UInt32 k = 0; k < 16; k++)
for (vtkm::UInt32 j = 0; j < 16; j++)
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
dim3 gridSize3d;
if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
(blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
for (vtkm::UInt32 k = 0; k < 16; k++)
{
//cuda can't handle more than 1024 threads per block
//so don't try if we compute higher than that
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
//also don't try stupidly low numbers
dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);
//cuda can't handle more than 64 threads in the z direction
continue;
if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
blocks.z >= 64)
{
//cuda can't handle more than 1024 threads per block
//so don't try if we compute higher than that
//also don't try stupidly low numbers
//cuda can't handle more than 64 threads in the z direction
continue;
}
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
}
compute_block_size(ranges, blockSize3d, gridSize3d);
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
results.push_back(record);
}
}
}
std::sort(results.begin(), results.end());
const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
for (vtkm::Int64 i = 1; i <= size; i++)
for (auto&& i : results)
{
vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
vtkm::UInt32 x = results[index].blockSize.x;
vtkm::UInt32 y = results[index].blockSize.y;
vtkm::UInt32 z = results[index].blockSize.z;
float t = results[index].elapsedTime;
std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
}
std::cout << "fixed 3d block size performance " << std::endl;
{
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(64, 2, 1);
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d, 0, cudaStreamPerThread>>>(functor,
ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
<< " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
<< " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << i << std::endl;
}
}
}

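Usage note (from the surrounding hunks): these sweeps are only compiled in when the PARAMETER_SWEEP_VTKM_SCHEDULER_1D / PARAMETER_SWEEP_VTKM_SCHEDULER_3D defines in DeviceAdapterAlgorithmThrust.h are uncommented; ScheduleTask then runs the corresponding sweep after the regular launch and prints one PerfRecord line per grid/block combination, sorted fastest first.
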
@ -22,8 +22,9 @@ set(headers
ArrayPortalFromThrust.h
ExecutionPolicy.h
IteratorFromArrayPortal.h
WrappedOperators.h
TaskStrided.h
ThrustPatches.h
WrappedOperators.h
)
#-----------------------------------------------------------------------------

@ -0,0 +1,207 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_exec_cuda_internal_TaskStrided_h
#define vtk_m_exec_cuda_internal_TaskStrided_h
#include <vtkm/exec/TaskBase.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
//Todo: rename this header to TaskInvokeWorkletDetail.h
#include <vtkm/exec/internal/WorkletInvokeFunctorDetail.h>
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
template <typename WType>
void TaskStridedSetErrorBuffer(void* w, const vtkm::exec::internal::ErrorMessageBuffer& buffer)
{
using WorkletType = typename std::remove_cv<WType>::type;
WorkletType* const worklet = static_cast<WorkletType*>(w);
worklet->SetErrorMessageBuffer(buffer);
}
class TaskStrided : public vtkm::exec::TaskBase
{
public:
void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer& buffer)
{
(void)buffer;
this->SetErrorBufferFunction(this->WPtr, buffer);
}
protected:
void* WPtr = nullptr;
using SetErrorBufferSignature = void (*)(void*, const vtkm::exec::internal::ErrorMessageBuffer&);
SetErrorBufferSignature SetErrorBufferFunction = nullptr;
};
template <typename WType, typename IType>
class TaskStrided1D : public TaskStrided
{
public:
TaskStrided1D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0)
: TaskStrided()
, Worklet(worklet)
, Invocation(invocation)
, GlobalIndexOffset(globalIndexOffset)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const
{
for (vtkm::Id index = start; index < end; index += inc)
{
//Todo: rename this function to DoTaskInvokeWorklet
vtkm::exec::internal::detail::DoWorkletInvokeFunctor(
this->Worklet,
this->Invocation,
this->Worklet.GetThreadIndices(index,
this->Invocation.OutputToInputMap,
this->Invocation.VisitArray,
this->Invocation.GetInputDomain(),
this->GlobalIndexOffset));
}
}
private:
typename std::remove_const<WType>::type Worklet;
// This is held by value so that when we transfer the invocation object
// over to CUDA it gets properly copied to the device. While we would
// prefer to hold it by reference to reduce the number of copies, that is
// not currently possible.
const IType Invocation;
const vtkm::Id GlobalIndexOffset;
};
template <typename WType>
class TaskStrided1D<WType, vtkm::internal::NullType> : public TaskStrided
{
public:
TaskStrided1D(WType& worklet)
: TaskStrided()
, Worklet(worklet)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc) const
{
for (vtkm::Id index = start; index < end; index += inc)
{
this->Worklet(index);
}
}
private:
typename std::remove_const<WType>::type Worklet;
};
template <typename WType, typename IType>
class TaskStrided3D : public TaskStrided
{
public:
TaskStrided3D(const WType& worklet, const IType& invocation, vtkm::Id globalIndexOffset = 0)
: TaskStrided()
, Worklet(worklet)
, Invocation(invocation)
, GlobalIndexOffset(globalIndexOffset)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const
{
vtkm::Id3 index(start, j, k);
for (vtkm::Id i = start; i < end; i += inc)
{
index[0] = i;
//Todo: rename this function to DoTaskInvokeWorklet
vtkm::exec::internal::detail::DoWorkletInvokeFunctor(
this->Worklet,
this->Invocation,
this->Worklet.GetThreadIndices(index,
this->Invocation.OutputToInputMap,
this->Invocation.VisitArray,
this->Invocation.GetInputDomain(),
this->GlobalIndexOffset));
}
}
private:
typename std::remove_const<WType>::type Worklet;
// This is held by value so that when we transfer the invocation object
// over to CUDA it gets properly copied to the device. While we would
// prefer to hold it by reference to reduce the number of copies, that is
// not currently possible.
const IType Invocation;
const vtkm::Id GlobalIndexOffset;
};
template <typename WType>
class TaskStrided3D<WType, vtkm::internal::NullType> : public TaskStrided
{
public:
TaskStrided3D(WType& worklet)
: TaskStrided()
, Worklet(worklet)
{
this->SetErrorBufferFunction = &TaskStridedSetErrorBuffer<WType>;
//Bind the Worklet to void*
this->WPtr = (void*)&this->Worklet;
}
VTKM_EXEC
void operator()(vtkm::Id start, vtkm::Id end, vtkm::Id inc, vtkm::Id j, vtkm::Id k) const
{
vtkm::Id3 index(start, j, k);
for (vtkm::Id i = start; i < end; i += inc)
{
index[0] = i;
this->Worklet(index);
}
}
private:
typename std::remove_const<WType>::type Worklet;
};
}
}
}
} // vtkm::exec::cuda::internal
#endif //vtk_m_exec_cuda_internal_TaskStrided_h

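The TaskStrided base class above hides the concrete worklet type behind a void* plus a function pointer instantiated for that type, which is what lets the non-templated SetupErrorBuffer in DeviceAdapterAlgorithmThrust.cu install the error buffer without knowing the worklet. A stripped-down sketch of the same idea (illustrative names, not the actual VTK-m classes):

// Stripped-down illustration of the error-buffer type erasure used by TaskStrided.
// Assumes WorkletT provides SetErrorMessageBuffer(const Buffer&).
struct Buffer
{
  char* Memory;
  int Size;
};

template <typename WorkletT>
void SetBufferThunk(void* w, const Buffer& buffer)
{
  // The concrete worklet type is recovered here, so the base never needs it.
  static_cast<WorkletT*>(w)->SetErrorMessageBuffer(buffer);
}

class TaskBaseSketch
{
public:
  void SetErrorMessageBuffer(const Buffer& buffer) { this->Fn(this->WPtr, buffer); }

protected:
  void* WPtr = nullptr;
  void (*Fn)(void*, const Buffer&) = nullptr;
};

template <typename WorkletT>
class TaskSketch : public TaskBaseSketch
{
public:
  TaskSketch(const WorkletT& worklet)
    : Worklet(worklet)
  {
    this->Fn = &SetBufferThunk<WorkletT>;
    this->WPtr = &this->Worklet;
  }

private:
  WorkletT Worklet;
};
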
@ -22,6 +22,6 @@
set(unit_tests
UnitTestTextureMemorySupport.cu
UnitTestTaskSingularCuda.cu
UnitTestTaskStrided.cu
)
vtkm_unit_tests(SOURCES ${unit_tests})

@ -23,7 +23,7 @@
#include <vtkm/exec/FunctorBase.h>
#include <vtkm/exec/arg/BasicArg.h>
#include <vtkm/exec/arg/ThreadIndicesBasic.h>
#include <vtkm/exec/internal/TaskSingular.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <vtkm/StaticAssert.h>
@ -165,14 +165,14 @@ using InvocationType2 = vtkm::internal::Invocation<ExecutionParameterInterface,
MyVisitArrayPortal>;
template <typename TaskType>
static __global__ void ScheduleTaskSingular(TaskType task, vtkm::Id start, vtkm::Id end)
static __global__ void ScheduleTaskStrided(TaskType task, vtkm::Id start, vtkm::Id end)
{
const vtkm::Id index = blockIdx.x * blockDim.x + threadIdx.x;
const vtkm::Id inc = blockDim.x * gridDim.x;
if (index >= start && index < end)
{
task(index);
task(index, end, inc);
}
}
@ -269,14 +269,13 @@ void TestNormalFunctorInvoke()
TestExecObject(output.PrepareForOutput(3, DeviceAdapter())));
std::cout << " Try void return." << std::endl;
using TaskSingular1 = vtkm::exec::internal::TaskSingular<TestWorkletProxy, InvocationType1>;
TestWorkletProxy worklet;
InvocationType1 invocation1(execObjects);
using TaskTypes = typename vtkm::cont::DeviceTaskTypes<DeviceAdapter>;
auto task1 = TaskTypes::MakeTask(worklet, invocation1, vtkm::Id());
ScheduleTaskSingular<decltype(task1)><<<32, 256>>>(task1, 1, 2);
ScheduleTaskStrided<decltype(task1)><<<32, 256>>>(task1, 1, 2);
cudaDeviceSynchronize();
input.SyncControlArray();
output.SyncControlArray();
@ -291,13 +290,12 @@ void TestNormalFunctorInvoke()
TestExecObject(input.PrepareForInPlace(DeviceAdapter())),
TestExecObject(output.PrepareForOutput(3, DeviceAdapter())));
using TaskSingular2 = vtkm::exec::internal::TaskSingular<TestWorkletProxy, InvocationType2>;
InvocationType2 invocation2(execObjects);
using TaskTypes = typename vtkm::cont::DeviceTaskTypes<DeviceAdapter>;
auto task2 = TaskTypes::MakeTask(worklet, invocation2, vtkm::Id());
ScheduleTaskSingular<decltype(task2)><<<32, 256>>>(task2, 2, 3);
ScheduleTaskStrided<decltype(task2)><<<32, 256>>>(task2, 2, 3);
cudaDeviceSynchronize();
input.SyncControlArray();
output.SyncControlArray();
@ -323,7 +321,8 @@ void TestErrorFunctorInvoke()
TestExecObject(input.PrepareForInPlace(DeviceAdapter())),
TestExecObject(output.PrepareForInPlace(DeviceAdapter())));
using TaskSingular1 = vtkm::exec::internal::TaskSingular<TestWorkletErrorProxy, InvocationType1>;
using TaskStrided1 =
vtkm::exec::cuda::internal::TaskStrided1D<TestWorkletErrorProxy, InvocationType1>;
TestWorkletErrorProxy worklet;
InvocationType1 invocation(execObjects);
@ -340,7 +339,7 @@ void TestErrorFunctorInvoke()
vtkm::exec::internal::ErrorMessageBuffer errorMessage(deviceErrorPtr, errorArraySize);
task.SetErrorMessageBuffer(errorMessage);
ScheduleTaskSingular<decltype(task)><<<32, 256>>>(task, 1, 2);
ScheduleTaskStrided<decltype(task)><<<32, 256>>>(task, 1, 2);
cudaDeviceSynchronize();
VTKM_TEST_ASSERT(errorMessage.IsErrorRaised(), "Error not raised correctly.");
@ -348,7 +347,7 @@ void TestErrorFunctorInvoke()
}
template <typename DeviceAdapter>
void TestTaskSingular()
void TestTaskStrided()
{
TestNormalFunctorInvoke<DeviceAdapter>();
TestErrorFunctorInvoke<DeviceAdapter>();
@ -356,7 +355,7 @@ void TestTaskSingular()
} // anonymous namespace
int UnitTestTaskSingularCuda(int, char* [])
int UnitTestTaskStrided(int, char* [])
{
return vtkm::cont::testing::Testing::Run(TestTaskSingular<vtkm::cont::DeviceAdapterTagCuda>);
return vtkm::cont::testing::Testing::Run(TestTaskStrided<vtkm::cont::DeviceAdapterTagCuda>);
}

@ -28,9 +28,11 @@
#ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && \
!defined(vtk_m_exec_internal_TaskTiling_h) && \
!defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, or TaskStrided.h
#endif
#include <vtkm/internal/FunctionInterface.h>

@ -40,9 +40,11 @@ $# Ignore the following comment. It is meant for the generated file.
#ifndef vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#define vtk_m_exec_internal_WorkletInvokeFunctorDetail_h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && !defined(vtk_m_exec_internal_TaskTiling_h) && \\
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h or TaskTiling.h
#if !defined(vtk_m_exec_internal_TaskSingular_h) && \\
!defined(vtk_m_exec_internal_TaskTiling_h) && \\
!defined(vtk_m_exec_cuda_internal_TaskStrided_h) && \\
!defined(VTKM_TEST_HEADER_BUILD)
#error WorkletInvokeFunctorDetail.h must be included from TaskSingular.h, TaskTiling.h, or TaskStrided.h
#endif
#include <vtkm/internal/FunctionInterface.h>

@ -3007,42 +3007,49 @@ public:
template <typename InputPortalType, typename OutputPortalType>
VTKM_EXEC void operator()(const InputPortalType& coeffs,
OutputPortalType& sigOut,
const vtkm::Id& workIndex) const
vtkm::Id workIndex) const
{
if (workIndex < cALen2) // valid calculation region
if (workIndex >= cALen2) // outside the valid calculation region
{
vtkm::Id xi; // coeff indices
vtkm::Id k1, k2; // indices for low and high filter
VAL sum = 0.0;
if (workIndex % 2 != 0)
{
k1 = this->filterLen - 2;
k2 = this->filterLen - 1;
}
else
{
k1 = this->filterLen - 1;
k2 = this->filterLen - 2;
}
xi = (workIndex + 1) / 2;
while (k1 > -1) // k1 >= 0
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi++));
k1 -= 2;
}
xi = workIndex / 2;
while (k2 > -1) // k2 >= 0
{
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(this->cALenExtended + xi++));
k2 -= 2;
}
sigOut.Set(workIndex, static_cast<typename OutputPortalType::ValueType>(sum));
return;
}
vtkm::Id xi1 = (workIndex + 1) / 2; // coeff indices
vtkm::Id xi2 = this->cALenExtended + ((workIndex) / 2); // coeff indices
VAL sum = 0.0;
const bool odd = workIndex % 2 != 0;
if (odd)
{
vtkm::Id k1 = this->filterLen - 2;
vtkm::Id k2 = this->filterLen - 1;
for (; k1 >= 0; k1 -= 2, k2 -= 2)
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
}
if (k2 >= 0)
{
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
}
}
else //even
{
vtkm::Id k1 = this->filterLen - 1;
vtkm::Id k2 = this->filterLen - 2;
for (; k2 >= 0; k1 -= 2, k2 -= 2)
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
sum += highFilter.Get(k2) * MAKEVAL(coeffs.Get(xi2++));
}
if (k1 >= 0)
{
sum += lowFilter.Get(k1) * MAKEVAL(coeffs.Get(xi1++));
}
}
sigOut.Set(workIndex, static_cast<typename OutputPortalType::ValueType>(sum));
}
#undef MAKEVAL
#undef VAL
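A note on this last hunk (my reading, not part of the commit message): the two separate while loops over k1 and k2 have been fused into a single loop plus a tail term, and the enclosing if has become an early return. For example, with filterLen = 4 and an odd workIndex, both versions accumulate lowFilter(2) and lowFilter(0) against coeffs starting at (workIndex + 1) / 2, and highFilter(3) and highFilter(1) against coeffs starting at cALenExtended + workIndex / 2; the trailing if after each fused loop only fires when filterLen is odd and one filter has a leftover tap.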