Merge topic 'support_cuda_scheduling_parameters_via_runtime'

047b64651 VTK-m now provides better scheduling parameters controls Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Kenneth Moreland <kmorel@sandia.gov> Merge-request: !1643
2019-04-17 14:04:02 +00:00 · 2019-04-17 14:04:02 +00:00 · 6c5c197a37
commit 6c5c197a37
parent da1fb63258 047b646517
5 changed files with 260 additions and 278 deletions
--- a/docs/changelog/improve-cuda-scheduling.md
+++ b/docs/changelog/improve-cuda-scheduling.md
@ -0,0 +1,45 @@
 # VTK-m CUDA kernel scheduling including improved defaults, and user customization
 VTK-m now offers a more GPU aware set of defaults for kernel scheduling.
 When VTK-m first launches a kernel we do system introspection and determine
 what GPU's are on the machine and than match this information to a preset
 table of values. The implementation is designed in a way that allows for
 VTK-m to offer both specific presets for a given GPU ( V100 ) or for
 an entire generation of cards ( Pascal ).
 Currently VTK-m offers preset tables for the following GPU's:
 - Tesla V100
 - Tesla P100
 If the hardware doesn't match a specific GPU card we than try to find the
 nearest know hardware generation and use those defaults. Currently we offer
 defaults for
 - Older than Pascal Hardware
 - Pascal Hardware
 - Volta+ Hardware
 Some users have workloads that don't align with the defaults provided by
 VTK-m. When that is the cause, it is possible to override the defaults
 by binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`.
 As shown below:
 ```cpp
  ScheduleParameters CustomScheduleValues(char const* name,
                                          int major,
                                          int minor,
                                          int multiProcessorCount,
                                          int maxThreadsPerMultiProcessor,
                                          int maxThreadsPerBlock)
  {
    ScheduleParameters params  {
        64 * multiProcessorCount,  //1d blocks
        64,                        //1d threads per block
        64 * multiProcessorCount,  //2d blocks
        { 8, 8, 1 },               //2d threads per block
        64 * multiProcessorCount,  //3d blocks
        { 4, 4, 4 } };             //3d threads per block
    return params;
  }
  vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);
 ```
--- a/vtkm/cont/cuda/internal/CMakeLists.txt
+++ b/vtkm/cont/cuda/internal/CMakeLists.txt
@ -29,7 +29,6 @@ set(headers
  DeviceAdapterTimerImplementationCuda.h
  ExecutionArrayInterfaceBasicCuda.h
  MakeThrustIterator.h
  TaskTuner.h
  ThrustExceptionHandler.h
  VirtualObjectTransferCuda.h
  )
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
@ -21,43 +21,143 @@
 #include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
 #include <atomic>
 #include <cstring>
 #include <functional>
 #include <mutex>
 #include <cuda.h>
 namespace vtkm
 {
 namespace cont
 {
 namespace cuda
 {
 static vtkm::cont::cuda::ScheduleParameters (
  *ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr;
 //Use the provided function as the the compute function for ScheduleParameterBuilder
 VTKM_CONT_EXPORT void InitScheduleParameters(
  vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int))
 {
  ComputeFromEnv = function;
 }
 namespace internal
 {
-VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
+//These represent the best block/threads-per for scheduling on each GPU
-{
+static std::vector<std::pair<int, int>> scheduling_1d_parameters;
-  std::size_t index = 0;
+static std::vector<std::pair<int, dim3>> scheduling_2d_parameters;
-  if (dId > 0)
+static std::vector<std::pair<int, dim3>> scheduling_3d_parameters;
  {
    index = static_cast<size_t>(dId);
  }
-  //check
+struct VTKM_CONT_EXPORT ScheduleParameterBuilder
 {
  //This represents information that is used to compute the best
  //ScheduleParameters for a given GPU
  enum struct GPU_STRATA
  {
    ENV = 0,
    OLDER = 5,
    PASCAL = 6,
    VOLTA = 7,
    PASCAL_HPC = 6000,
    VOLTA_HPC = 7000
  };
  std::map<GPU_STRATA, vtkm::cont::cuda::ScheduleParameters> Presets;
  std::function<vtkm::cont::cuda::ScheduleParameters(const char*, int, int, int, int, int)> Compute;
  // clang-format off
  // The presets for [one,two,three]_d_blocks are before we multiply by the number of SMs on the hardware
  ScheduleParameterBuilder()
    : Presets{
      { GPU_STRATA::ENV,        {  0,   0,  0, {  0,  0, 0 },  0, { 0, 0, 0 } } }, //use env settings
      { GPU_STRATA::OLDER,
                                { 32, 128,  8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal
      { GPU_STRATA::PASCAL,     { 32, 128,  8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal
      { GPU_STRATA::VOLTA,      { 32, 128,  8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta
      { GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //P100
      { GPU_STRATA::VOLTA_HPC,  { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //V100
    }
    , Compute(nullptr)
  {
    if (vtkm::cont::cuda::ComputeFromEnv != nullptr)
    {
      this->Compute = vtkm::cont::cuda::ComputeFromEnv;
    }
    else
    {
      this->Compute = [=] (const char* name, int major, int minor,
                          int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters  {
        return this->ComputeFromPreset(name, major, minor, numSMs, maxThreadsPerSM, maxThreadsPerBlock); };
    }
  }
  // clang-format on
  vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name,
                                                         int major,
                                                         int minor,
                                                         int numSMs,
                                                         int maxThreadsPerSM,
                                                         int maxThreadsPerBlock)
  {
    (void)minor;
    (void)maxThreadsPerSM;
    (void)maxThreadsPerBlock;
    const constexpr int GPU_STRATA_MAX_GEN = 7;
    const constexpr int GPU_STRATA_MIN_GEN = 5;
    int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN);
    strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN);
    if (strataAsInt > GPU_STRATA_MIN_GEN)
    { //only pascal and above have fancy
      //Currently the only
      bool is_tesla = (0 == std::strncmp("Tesla", name, 4)); //see if the name starts with Tesla
      if (is_tesla)
      {
        strataAsInt *= 1000; //tesla modifier
      }
    }
    auto preset = this->Presets.find(static_cast<GPU_STRATA>(strataAsInt));
    ScheduleParameters params{ preset->second };
    params.one_d_blocks = params.one_d_blocks * numSMs;
    params.two_d_blocks = params.two_d_blocks * numSMs;
    params.three_d_blocks = params.three_d_blocks * numSMs;
    return params;
  }
 };
 VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
 {
  //check flag
  static std::once_flag lookupBuiltFlag;
  static std::vector<vtkm::UInt32> numSMs;
  std::call_once(lookupBuiltFlag, []() {
    ScheduleParameterBuilder builder;
    //iterate over all devices
    int numberOfSMs = 0;
    int count = 0;
    VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
    numSMs.reserve(static_cast<std::size_t>(count));
    for (int deviceId = 0; deviceId < count; ++deviceId)
-    { //get the number of sm's per deviceId
+    {
-      VTKM_CUDA_CALL(
+      cudaDeviceProp deviceProp;
-        cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
+      cudaGetDeviceProperties(&deviceProp, deviceId);
-      numSMs.push_back(static_cast<vtkm::UInt32>(numberOfSMs));
+
      ScheduleParameters params = builder.Compute(deviceProp.name,
                                                  deviceProp.major,
                                                  deviceProp.minor,
                                                  deviceProp.multiProcessorCount,
                                                  deviceProp.maxThreadsPerMultiProcessor,
                                                  deviceProp.maxThreadsPerBlock);
      scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block);
      scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block);
      scheduling_3d_parameters.emplace_back(params.three_d_blocks,
                                            params.three_d_threads_per_block);
    }
  });
  return numSMs[index];
 }
 }
 } // end namespace cuda::internal
@ -101,44 +201,41 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
  }
 }
-void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
+void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& grids,
  vtkm::UInt32& blocks,
  vtkm::UInt32& threadsPerBlock,
  vtkm::Id size)
 {
  (void)size;
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
-  grids = 32 * cuda::internal::getNumSMs(deviceId);
+  const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
-  blocks = 128;
+  blocks = params.first;
  threadsPerBlock = params.second;
 }
-void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
+void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
-  vtkm::UInt32& grids,
+  vtkm::UInt32& blocks,
-  dim3& blocks,
+  dim3& threadsPerBlock,
  const dim3& size)
 {
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
-  grids = 32 * cuda::internal::getNumSMs(deviceId);
+  if (size.z <= 1)
-
+  { //2d images
-  if (size.x == 0)
+    const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];
-  { //grids that have no x dimension
+    blocks = params.first;
-    blocks.x = 1;
+    threadsPerBlock = params.second;
    blocks.y = 8;
    blocks.z = 8;
  }
  else if (size.x > 128)
  {
    blocks.x = 8;
    blocks.y = 8;
    blocks.z = 4;
  }
  else
-  { //for really small grids
+  { //3d images
-    blocks.x = 4;
+    const auto& params = cuda::internal::scheduling_3d_parameters[static_cast<size_t>(deviceId)];
-    blocks.y = 4;
+    blocks = params.first;
-    blocks.z = 4;
+    threadsPerBlock = params.second;
  }
 }
 }
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -48,12 +48,6 @@
 #include <vtkm/exec/cuda/internal/TaskStrided.h>
 #include <vtkm/exec/internal/ErrorMessageBuffer.h>
 // #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D
 // #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D
 #if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D)
 #include <vtkm/cont/cuda/internal/TaskTuner.h>
 #endif
 // Disable warnings we check vtkm for but Thrust does not.
 VTKM_THIRDPARTY_PRE_INCLUDE
 #include <cooperative_groups.h>
@ -79,10 +73,73 @@ namespace vtkm
 {
 namespace cont
 {
 namespace cuda
 {
 /// \brief Represents how to schedule 1D, 2D, and 3D Cuda kernels
 ///
 /// \c ScheduleParameters represents how VTK-m should schedule different
 /// cuda kernel types. By default VTK-m uses a preset table based on the
 /// GPU's found at runtime.
 ///
 /// When these defaults are insufficient for certain projects it is possible
 /// to override the defaults by using \c InitScheduleParameters.
 ///
 ///
 struct VTKM_CONT_EXPORT ScheduleParameters
 {
  int one_d_blocks;
  int one_d_threads_per_block;
  int two_d_blocks;
  dim3 two_d_threads_per_block;
  int three_d_blocks;
  dim3 three_d_threads_per_block;
 };
 /// \brief Specify the custom scheduling to use for VTK-m CUDA kernel launches
 ///
 /// By default VTK-m uses a preset table based on the GPU's found at runtime to
 /// determine the best scheduling parameters for a worklet.  When these defaults
 /// are insufficient for certain projects it is possible to override the defaults
 /// by binding a custom function to \c InitScheduleParameters.
 ///
 /// Note: The this function must be called before any invocation of any worklets
 /// by VTK-m.
 ///
 /// Note: This function will be called for each GPU on a machine.
 ///
 /// \code{.cpp}
 ///
 ///  ScheduleParameters CustomScheduleValues(char const* name,
 ///                                          int major,
 ///                                          int minor,
 ///                                          int multiProcessorCount,
 ///                                          int maxThreadsPerMultiProcessor,
 ///                                          int maxThreadsPerBlock)
 ///  {
 ///
 ///    ScheduleParameters params  {
 ///        64 * multiProcessorCount,  //1d blocks
 ///        64,                        //1d threads per block
 ///        64 * multiProcessorCount,  //2d blocks
 ///        { 8, 8, 1 },               //2d threads per block
 ///        64 * multiProcessorCount,  //3d blocks
 ///        { 4, 4, 4 } };             //3d threads per block
 ///    return params;
 ///  }
 /// \endcode
 ///
 ///
 VTKM_CONT_EXPORT void InitScheduleParameters(
  vtkm::cont::cuda::ScheduleParameters (*)(char const* name,
                                           int major,
                                           int minor,
                                           int multiProcessorCount,
                                           int maxThreadsPerMultiProcessor,
                                           int maxThreadsPerBlock));
 namespace internal
 {
@ -1358,10 +1415,12 @@ public:
  static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
  VTKM_CONT_EXPORT
-  static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size);
+  static void GetBlocksAndThreads(vtkm::UInt32& blocks,
                                  vtkm::UInt32& threadsPerBlock,
                                  vtkm::Id size);
  VTKM_CONT_EXPORT
-  static void GetGridsAndBlocks(vtkm::UInt32& grid, dim3& blocks, const dim3& size);
+  static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
 public:
  template <typename WType, typename IType>
@ -1378,15 +1437,11 @@ public:
    CheckForErrors();
    SetupErrorBuffer(functor);
-    vtkm::UInt32 grids, blocks;
+    vtkm::UInt32 blocks, threadsPerBlock;
-    GetGridsAndBlocks(grids, blocks, numInstances);
+    GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
-    cuda::internal::TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor,
+    cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
-                                                                                   numInstances);
+      functor, numInstances);
 #ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
    parameter_sweep_1d_schedule(functor, numInstances);
 #endif
  }
  template <typename WType, typename IType>
@ -1407,15 +1462,12 @@ public:
                      static_cast<vtkm::UInt32>(rangeMax[1]),
                      static_cast<vtkm::UInt32>(rangeMax[2]));
-    vtkm::UInt32 grids;
+    vtkm::UInt32 blocks;
-    dim3 blocks;
+    dim3 threadsPerBlock;
-    GetGridsAndBlocks(grids, blocks, ranges);
+    GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
-    cuda::internal::TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
+    cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
-
+      functor, ranges);
 #ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
    parameter_sweep_3d_schedule(functor, rangeMax);
 #endif
  }
  template <class Functor>
--- a/vtkm/cont/cuda/internal/TaskTuner.h
+++ b/vtkm/cont/cuda/internal/TaskTuner.h
@ -1,211 +0,0 @@
 //============================================================================
 //  Copyright (c) Kitware, Inc.
 //  All rights reserved.
 //  See LICENSE.txt for details.
 //  This software is distributed WITHOUT ANY WARRANTY; without even
 //  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 //  PURPOSE.  See the above copyright notice for more information.
 //
 //  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
 //  Copyright 2014 UT-Battelle, LLC.
 //  Copyright 2014 Los Alamos National Security.
 //
 //  Under the terms of Contract DE-NA0003525 with NTESS,
 //  the U.S. Government retains certain rights in this software.
 //
 //  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
 //  Laboratory (LANL), the U.S. Government retains certain rights in
 //  this software.
 //============================================================================
 #ifndef vtk_m_cont_cuda_internal_TaskTuner_h
 #define vtk_m_cont_cuda_internal_TaskTuner_h
 #include <vtkm/Types.h>
 #include <vtkm/cont/cuda/ErrorCuda.h>
 #include <cuda.h>
 #include <algorithm>
 #include <iostream>
 #include <string>
 #include <vector>
 namespace vtkm
 {
 namespace cont
 {
 namespace cuda
 {
 namespace internal
 {
 vtkm::UInt32 getNumSMs(int dId);
 template <typename TaskType>
 __global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
 template <typename TaskType>
 __global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
 struct PerfRecord1d
 {
  PerfRecord1d(float elapsedT, int g, int b)
    : elapsedTime(elapsedT)
    , grid(g)
    , block(b)
  {
  }
  bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
  float elapsedTime;
  int grid;
  int block;
 };
 inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
 {
  os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
     << ">>> required: " << record.elapsedTime << "\n";
  return os;
 }
 struct PerfRecord3d
 {
  PerfRecord3d(float elapsedT, int g, dim3 b)
    : elapsedTime(elapsedT)
    , grid(g)
    , block(b)
  {
  }
  bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
  float elapsedTime;
  int grid;
  dim3 block;
 };
 inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
 {
  os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
     << "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
  return os;
 }
 template <typename TaskT>
 static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
 {
  std::vector<PerfRecord1d> results;
  constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
  constexpr vtkm::UInt32 blockIndexTable[12] = { 4,   8,   16,   32,   64,   128,
                                                 256, 512, 1024, 2048, 4096, 8192 };
  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
  for (vtkm::UInt32 g = 0; g < 12; g++)
  {
    vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
    for (vtkm::UInt32 b = 0; b < 12; b++)
    {
      vtkm::UInt32 blocks = blockIndexTable[b];
      cudaEvent_t start, stop;
      VTKM_CUDA_CALL(cudaEventCreate(&start));
      VTKM_CUDA_CALL(cudaEventCreate(&stop));
      TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
      VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
      VTKM_CUDA_CALL(cudaEventSynchronize(stop));
      float elapsedTimeMilliseconds;
      VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
      VTKM_CUDA_CALL(cudaEventDestroy(start));
      VTKM_CUDA_CALL(cudaEventDestroy(stop));
      results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
    }
  }
  std::sort(results.begin(), results.end());
  for (auto&& i : results)
  {
    std::cout << i << std::endl;
  }
 }
 template <typename TaskT>
 static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
 {
  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
                    static_cast<vtkm::UInt32>(rangeMax[1]),
                    static_cast<vtkm::UInt32>(rangeMax[2]));
  std::vector<PerfRecord3d> results;
  constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
  constexpr vtkm::UInt32 blockIndexTable[16] = { 1,  2,  4,  8,  12,  16,  20,  24,
                                                 28, 30, 32, 64, 128, 256, 512, 1024 };
  int deviceId;
  for (vtkm::UInt32 g = 0; g < 12; g++)
  {
    vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
    for (vtkm::UInt32 i = 0; i < 16; i++)
    {
      for (vtkm::UInt32 j = 0; j < 16; j++)
      {
        for (vtkm::UInt32 k = 0; k < 16; k++)
        {
          cudaEvent_t start, stop;
          VTKM_CUDA_CALL(cudaEventCreate(&start));
          VTKM_CUDA_CALL(cudaEventCreate(&stop));
          dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);
          if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
              blocks.z >= 64)
          {
            //cuda can't handle more than 1024 threads per block
            //so don't try if we compute higher than that
            //also don't try stupidly low numbers
            //cuda can't handle more than 64 threads in the z direction
            continue;
          }
          VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
          TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
          VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
          VTKM_CUDA_CALL(cudaEventSynchronize(stop));
          float elapsedTimeMilliseconds;
          VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
          VTKM_CUDA_CALL(cudaEventDestroy(start));
          VTKM_CUDA_CALL(cudaEventDestroy(stop));
          results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
        }
      }
    }
  }
  std::sort(results.begin(), results.end());
  for (auto&& i : results)
  {
    std::cout << i << std::endl;
  }
 }
 }
 }
 }
 }
 #endif