From 047b64651771fe0057269280b983f76f6d7ccb9e Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 15 Apr 2019 08:06:33 -0400
Subject: [PATCH] VTK-m now provides better scheduling parameter controls

VTK-m now offers a more GPU-aware set of defaults for kernel scheduling.
When VTK-m first launches a kernel, we perform system introspection to
determine which GPUs are on the machine and then match this information to
a preset table of values. The implementation is designed so that VTK-m can
offer presets either for a specific GPU (e.g., V100) or for an entire
generation of cards (e.g., Pascal).

Currently VTK-m offers preset tables for the following GPUs:
- Tesla V100
- Tesla P100

If the hardware doesn't match a specific GPU card, we then try to find the
nearest known hardware generation and use those defaults. Currently we offer
defaults for:
- Older than Pascal hardware
- Pascal hardware
- Volta+ hardware

Some users have workloads that don't align with the defaults provided by
VTK-m. When that is the case, it is possible to override the defaults by
binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`,
as shown below:

```cpp
  ScheduleParameters CustomScheduleValues(char const* name,
                                          int major,
                                          int minor,
                                          int multiProcessorCount,
                                          int maxThreadsPerMultiProcessor,
                                          int maxThreadsPerBlock)
  {
    ScheduleParameters params {
        64 * multiProcessorCount, //1d blocks
        64,                       //1d threads per block
        64 * multiProcessorCount, //2d blocks
        { 8, 8, 1 },              //2d threads per block
        64 * multiProcessorCount, //3d blocks
        { 4, 4, 4 } };            //3d threads per block
    return params;
  }
  vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);
```
---
 docs/changelog/improve-cuda-scheduling.md  |  45 ++++
 vtkm/cont/cuda/internal/CMakeLists.txt     |   1 -
 .../internal/DeviceAdapterAlgorithmCuda.cu | 177 +++++++++++----
 .../internal/DeviceAdapterAlgorithmCuda.h  | 104 ++++++---
 vtkm/cont/cuda/internal/TaskTuner.h        | 211 ------------------
 5 files changed, 260 insertions(+), 278 deletions(-)
 create mode 100644 docs/changelog/improve-cuda-scheduling.md
 delete mode 100644 vtkm/cont/cuda/internal/TaskTuner.h

diff --git a/docs/changelog/improve-cuda-scheduling.md b/docs/changelog/improve-cuda-scheduling.md
new file mode 100644
index 000000000..c8a193e37
--- /dev/null
+++ b/docs/changelog/improve-cuda-scheduling.md
@@ -0,0 +1,45 @@
+# VTK-m CUDA kernel scheduling: improved defaults and user customization
+
+VTK-m now offers a more GPU-aware set of defaults for kernel scheduling.
+When VTK-m first launches a kernel, we perform system introspection to
+determine which GPUs are on the machine and then match this information
+to a preset table of values. The implementation is designed so that VTK-m
+can offer presets either for a specific GPU (e.g., V100) or for an entire
+generation of cards (e.g., Pascal).
+
+Currently VTK-m offers preset tables for the following GPUs:
+- Tesla V100
+- Tesla P100
+
+If the hardware doesn't match a specific GPU card, we then try to find the
+nearest known hardware generation and use those defaults. Currently we
+offer defaults for:
+- Older than Pascal hardware
+- Pascal hardware
+- Volta+ hardware
+
+Some users have workloads that don't align with the defaults provided by
+VTK-m. When that is the case, it is possible to override the defaults
+by binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`.
+As shown below: + +```cpp + ScheduleParameters CustomScheduleValues(char const* name, + int major, + int minor, + int multiProcessorCount, + int maxThreadsPerMultiProcessor, + int maxThreadsPerBlock) + { + + ScheduleParameters params { + 64 * multiProcessorCount, //1d blocks + 64, //1d threads per block + 64 * multiProcessorCount, //2d blocks + { 8, 8, 1 }, //2d threads per block + 64 * multiProcessorCount, //3d blocks + { 4, 4, 4 } }; //3d threads per block + return params; + } + vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues); +``` diff --git a/vtkm/cont/cuda/internal/CMakeLists.txt b/vtkm/cont/cuda/internal/CMakeLists.txt index 284389052..0a0aeb110 100644 --- a/vtkm/cont/cuda/internal/CMakeLists.txt +++ b/vtkm/cont/cuda/internal/CMakeLists.txt @@ -29,7 +29,6 @@ set(headers DeviceAdapterTimerImplementationCuda.h ExecutionArrayInterfaceBasicCuda.h MakeThrustIterator.h - TaskTuner.h ThrustExceptionHandler.h VirtualObjectTransferCuda.h ) diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu index afd3b06c7..aaec8b2c6 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu @@ -21,43 +21,143 @@ #include #include +#include +#include #include +#include + namespace vtkm { namespace cont { namespace cuda { + +static vtkm::cont::cuda::ScheduleParameters ( + *ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr; + +//Use the provided function as the the compute function for ScheduleParameterBuilder +VTKM_CONT_EXPORT void InitScheduleParameters( + vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int)) +{ + ComputeFromEnv = function; +} + namespace internal { -VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId) -{ - std::size_t index = 0; - if (dId > 0) - { - index = static_cast(dId); - } +//These represent the best block/threads-per for scheduling on each GPU +static std::vector> scheduling_1d_parameters; +static std::vector> scheduling_2d_parameters; +static std::vector> scheduling_3d_parameters; - //check +struct VTKM_CONT_EXPORT ScheduleParameterBuilder +{ + //This represents information that is used to compute the best + //ScheduleParameters for a given GPU + enum struct GPU_STRATA + { + ENV = 0, + OLDER = 5, + PASCAL = 6, + VOLTA = 7, + PASCAL_HPC = 6000, + VOLTA_HPC = 7000 + }; + + std::map Presets; + std::function Compute; + + // clang-format off + // The presets for [one,two,three]_d_blocks are before we multiply by the number of SMs on the hardware + ScheduleParameterBuilder() + : Presets{ + { GPU_STRATA::ENV, { 0, 0, 0, { 0, 0, 0 }, 0, { 0, 0, 0 } } }, //use env settings + { GPU_STRATA::OLDER, + { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal + { GPU_STRATA::PASCAL, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal + { GPU_STRATA::VOLTA, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta + { GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //P100 + { GPU_STRATA::VOLTA_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //V100 + } + , Compute(nullptr) + { + if (vtkm::cont::cuda::ComputeFromEnv != nullptr) + { + this->Compute = vtkm::cont::cuda::ComputeFromEnv; + } + else + { + this->Compute = [=] (const char* name, int major, int minor, + int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters { + return this->ComputeFromPreset(name, major, minor, numSMs, 
maxThreadsPerSM, maxThreadsPerBlock); }; + } + } + // clang-format on + + vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name, + int major, + int minor, + int numSMs, + int maxThreadsPerSM, + int maxThreadsPerBlock) + { + (void)minor; + (void)maxThreadsPerSM; + (void)maxThreadsPerBlock; + + const constexpr int GPU_STRATA_MAX_GEN = 7; + const constexpr int GPU_STRATA_MIN_GEN = 5; + int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN); + strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN); + if (strataAsInt > GPU_STRATA_MIN_GEN) + { //only pascal and above have fancy + + //Currently the only + bool is_tesla = (0 == std::strncmp("Tesla", name, 4)); //see if the name starts with Tesla + if (is_tesla) + { + strataAsInt *= 1000; //tesla modifier + } + } + + auto preset = this->Presets.find(static_cast(strataAsInt)); + ScheduleParameters params{ preset->second }; + params.one_d_blocks = params.one_d_blocks * numSMs; + params.two_d_blocks = params.two_d_blocks * numSMs; + params.three_d_blocks = params.three_d_blocks * numSMs; + return params; + } +}; + +VTKM_CONT_EXPORT void SetupKernelSchedulingParameters() +{ + //check flag static std::once_flag lookupBuiltFlag; - static std::vector numSMs; std::call_once(lookupBuiltFlag, []() { + ScheduleParameterBuilder builder; //iterate over all devices - int numberOfSMs = 0; int count = 0; VTKM_CUDA_CALL(cudaGetDeviceCount(&count)); - numSMs.reserve(static_cast(count)); for (int deviceId = 0; deviceId < count; ++deviceId) - { //get the number of sm's per deviceId - VTKM_CUDA_CALL( - cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId)); - numSMs.push_back(static_cast(numberOfSMs)); + { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, deviceId); + + ScheduleParameters params = builder.Compute(deviceProp.name, + deviceProp.major, + deviceProp.minor, + deviceProp.multiProcessorCount, + deviceProp.maxThreadsPerMultiProcessor, + deviceProp.maxThreadsPerBlock); + scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block); + scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block); + scheduling_3d_parameters.emplace_back(params.three_d_blocks, + params.three_d_threads_per_block); } }); - return numSMs[index]; } } } // end namespace cuda::internal @@ -101,44 +201,41 @@ void DeviceAdapterAlgorithm::CheckForErrors() } } -void DeviceAdapterAlgorithm::GetGridsAndBlocks( - vtkm::UInt32& grids, +void DeviceAdapterAlgorithm::GetBlocksAndThreads( vtkm::UInt32& blocks, + vtkm::UInt32& threadsPerBlock, vtkm::Id size) { (void)size; + vtkm::cont::cuda::internal::SetupKernelSchedulingParameters(); + int deviceId; VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda - grids = 32 * cuda::internal::getNumSMs(deviceId); - blocks = 128; + const auto& params = cuda::internal::scheduling_1d_parameters[static_cast(deviceId)]; + blocks = params.first; + threadsPerBlock = params.second; } -void DeviceAdapterAlgorithm::GetGridsAndBlocks( - vtkm::UInt32& grids, - dim3& blocks, +void DeviceAdapterAlgorithm::GetBlocksAndThreads( + vtkm::UInt32& blocks, + dim3& threadsPerBlock, const dim3& size) { + vtkm::cont::cuda::internal::SetupKernelSchedulingParameters(); + int deviceId; VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda - grids = 32 * cuda::internal::getNumSMs(deviceId); - - if (size.x == 0) - { //grids that have no x dimension - blocks.x = 1; - blocks.y = 8; - blocks.z = 8; - } - else if (size.x > 128) - { - blocks.x = 
8; - blocks.y = 8; - blocks.z = 4; + if (size.z <= 1) + { //2d images + const auto& params = cuda::internal::scheduling_2d_parameters[static_cast(deviceId)]; + blocks = params.first; + threadsPerBlock = params.second; } else - { //for really small grids - blocks.x = 4; - blocks.y = 4; - blocks.z = 4; + { //3d images + const auto& params = cuda::internal::scheduling_3d_parameters[static_cast(deviceId)]; + blocks = params.first; + threadsPerBlock = params.second; } } } diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h index 2990a3bf5..81889c5f6 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h @@ -48,12 +48,6 @@ #include #include -// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D -// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D -#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D) -#include -#endif - // Disable warnings we check vtkm for but Thrust does not. VTKM_THIRDPARTY_PRE_INCLUDE #include @@ -79,10 +73,73 @@ namespace vtkm { namespace cont { - - namespace cuda { +/// \brief Represents how to schedule 1D, 2D, and 3D Cuda kernels +/// +/// \c ScheduleParameters represents how VTK-m should schedule different +/// cuda kernel types. By default VTK-m uses a preset table based on the +/// GPU's found at runtime. +/// +/// When these defaults are insufficient for certain projects it is possible +/// to override the defaults by using \c InitScheduleParameters. +/// +/// +struct VTKM_CONT_EXPORT ScheduleParameters +{ + int one_d_blocks; + int one_d_threads_per_block; + + int two_d_blocks; + dim3 two_d_threads_per_block; + + int three_d_blocks; + dim3 three_d_threads_per_block; +}; + +/// \brief Specify the custom scheduling to use for VTK-m CUDA kernel launches +/// +/// By default VTK-m uses a preset table based on the GPU's found at runtime to +/// determine the best scheduling parameters for a worklet. When these defaults +/// are insufficient for certain projects it is possible to override the defaults +/// by binding a custom function to \c InitScheduleParameters. +/// +/// Note: The this function must be called before any invocation of any worklets +/// by VTK-m. +/// +/// Note: This function will be called for each GPU on a machine. 
+/// +/// \code{.cpp} +/// +/// ScheduleParameters CustomScheduleValues(char const* name, +/// int major, +/// int minor, +/// int multiProcessorCount, +/// int maxThreadsPerMultiProcessor, +/// int maxThreadsPerBlock) +/// { +/// +/// ScheduleParameters params { +/// 64 * multiProcessorCount, //1d blocks +/// 64, //1d threads per block +/// 64 * multiProcessorCount, //2d blocks +/// { 8, 8, 1 }, //2d threads per block +/// 64 * multiProcessorCount, //3d blocks +/// { 4, 4, 4 } }; //3d threads per block +/// return params; +/// } +/// \endcode +/// +/// +VTKM_CONT_EXPORT void InitScheduleParameters( + vtkm::cont::cuda::ScheduleParameters (*)(char const* name, + int major, + int minor, + int multiProcessorCount, + int maxThreadsPerMultiProcessor, + int maxThreadsPerBlock)); + + namespace internal { @@ -1358,10 +1415,12 @@ public: static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor); VTKM_CONT_EXPORT - static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size); + static void GetBlocksAndThreads(vtkm::UInt32& blocks, + vtkm::UInt32& threadsPerBlock, + vtkm::Id size); VTKM_CONT_EXPORT - static void GetGridsAndBlocks(vtkm::UInt32& grid, dim3& blocks, const dim3& size); + static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size); public: template @@ -1378,15 +1437,11 @@ public: CheckForErrors(); SetupErrorBuffer(functor); - vtkm::UInt32 grids, blocks; - GetGridsAndBlocks(grids, blocks, numInstances); + vtkm::UInt32 blocks, threadsPerBlock; + GetBlocksAndThreads(blocks, threadsPerBlock, numInstances); - cuda::internal::TaskStrided1DLaunch<<>>(functor, - numInstances); - -#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D - parameter_sweep_1d_schedule(functor, numInstances); -#endif + cuda::internal::TaskStrided1DLaunch<<>>( + functor, numInstances); } template @@ -1407,15 +1462,12 @@ public: static_cast(rangeMax[1]), static_cast(rangeMax[2])); - vtkm::UInt32 grids; - dim3 blocks; - GetGridsAndBlocks(grids, blocks, ranges); + vtkm::UInt32 blocks; + dim3 threadsPerBlock; + GetBlocksAndThreads(blocks, threadsPerBlock, ranges); - cuda::internal::TaskStrided3DLaunch<<>>(functor, ranges); - -#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D - parameter_sweep_3d_schedule(functor, rangeMax); -#endif + cuda::internal::TaskStrided3DLaunch<<>>( + functor, ranges); } template diff --git a/vtkm/cont/cuda/internal/TaskTuner.h b/vtkm/cont/cuda/internal/TaskTuner.h deleted file mode 100644 index 608646bc6..000000000 --- a/vtkm/cont/cuda/internal/TaskTuner.h +++ /dev/null @@ -1,211 +0,0 @@ -//============================================================================ -// Copyright (c) Kitware, Inc. -// All rights reserved. -// See LICENSE.txt for details. -// This software is distributed WITHOUT ANY WARRANTY; without even -// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR -// PURPOSE. See the above copyright notice for more information. -// -// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS). -// Copyright 2014 UT-Battelle, LLC. -// Copyright 2014 Los Alamos National Security. -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National -// Laboratory (LANL), the U.S. Government retains certain rights in -// this software. 
-//============================================================================ - -#ifndef vtk_m_cont_cuda_internal_TaskTuner_h -#define vtk_m_cont_cuda_internal_TaskTuner_h - -#include -#include - -#include - -#include -#include -#include -#include - -namespace vtkm -{ -namespace cont -{ -namespace cuda -{ -namespace internal -{ - -vtkm::UInt32 getNumSMs(int dId); - -template -__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id); -template -__global__ void TaskStrided3DLaunch(TaskType task, dim3 size); - -struct PerfRecord1d -{ - PerfRecord1d(float elapsedT, int g, int b) - : elapsedTime(elapsedT) - , grid(g) - , block(b) - { - } - - bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; } - - float elapsedTime; - int grid; - int block; -}; - -inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record) -{ - os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block - << ">>> required: " << record.elapsedTime << "\n"; - return os; -} - - -struct PerfRecord3d -{ - PerfRecord3d(float elapsedT, int g, dim3 b) - : elapsedTime(elapsedT) - , grid(g) - , block(b) - { - } - - bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; } - - float elapsedTime; - int grid; - dim3 block; -}; - -inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record) -{ - - os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y - << "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n"; - return os; -} - - -template -static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances) -{ - std::vector results; - constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 }; - constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128, - 256, 512, 1024, 2048, 4096, 8192 }; - - int deviceId; - VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda - - for (vtkm::UInt32 g = 0; g < 12; g++) - { - vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId); - for (vtkm::UInt32 b = 0; b < 12; b++) - { - vtkm::UInt32 blocks = blockIndexTable[b]; - - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - TaskStrided1DLaunch<<>>(task, numInstances); - - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - results.emplace_back(elapsedTimeMilliseconds, grids, blocks); - } - } - - std::sort(results.begin(), results.end()); - for (auto&& i : results) - { - std::cout << i << std::endl; - } -} - -template -static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax) -{ - const dim3 ranges(static_cast(rangeMax[0]), - static_cast(rangeMax[1]), - static_cast(rangeMax[2])); - std::vector results; - - constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 }; - constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, - 28, 30, 32, 64, 128, 256, 512, 1024 }; - - int deviceId; - for (vtkm::UInt32 g = 0; g < 12; g++) - { - vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId); - for (vtkm::UInt32 i = 0; i < 16; i++) - { - for (vtkm::UInt32 j = 0; j < 16; j++) - { - for (vtkm::UInt32 
k = 0; k < 16; k++) - { - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]); - - if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 || - blocks.z >= 64) - { - //cuda can't handle more than 1024 threads per block - //so don't try if we compute higher than that - - //also don't try stupidly low numbers - - //cuda can't handle more than 64 threads in the z direction - continue; - } - - VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); - TaskStrided3DLaunch<<>>(task, ranges); - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - results.emplace_back(elapsedTimeMilliseconds, grids, blocks); - } - } - } - } - - std::sort(results.begin(), results.end()); - for (auto&& i : results) - { - std::cout << i << std::endl; - } -} -} -} -} -} - -#endif
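
For reviewers who want to see the new hook in context, below is a minimal usage
sketch of registering custom scheduling values with the API this patch adds. Only
`vtkm::cont::cuda::ScheduleParameters` and `vtkm::cont::cuda::InitScheduleParameters`
come from the patch; the include path and the `main()` scaffolding are illustrative
assumptions, and per the header documentation the registration must happen before
any worklet is invoked.

```cpp
// Usage sketch (not part of the patch): register a custom schedule table from
// application code. The include path and main() scaffolding are assumptions for
// illustration; only ScheduleParameters and InitScheduleParameters are from VTK-m.
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>

namespace
{
// VTK-m calls this once per GPU the first time it needs scheduling parameters.
vtkm::cont::cuda::ScheduleParameters CustomScheduleValues(char const* /*name*/,
                                                          int /*major*/,
                                                          int /*minor*/,
                                                          int multiProcessorCount,
                                                          int /*maxThreadsPerMultiProcessor*/,
                                                          int /*maxThreadsPerBlock*/)
{
  vtkm::cont::cuda::ScheduleParameters params{
    64 * multiProcessorCount, //1d blocks
    64,                       //1d threads per block
    64 * multiProcessorCount, //2d blocks
    { 8, 8, 1 },              //2d threads per block
    64 * multiProcessorCount, //3d blocks
    { 4, 4, 4 }               //3d threads per block
  };
  return params;
}
} // anonymous namespace

int main()
{
  // Must run before any VTK-m worklet is invoked so that the first kernel
  // launch picks up the custom table instead of the built-in presets.
  vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);

  // ... construct data sets and invoke worklets as usual ...
  return 0;
}
```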