From c11f29c093ec1edca980e6cfe8e8695013ccd660 Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Fri, 23 Jun 2017 13:27:33 -0400
Subject: [PATCH] Move the parameter sweeping code to a separate header.

The parameter sweeping code is only enabled when tuning for new GPUs, so we
should move it to a separate header to make DeviceAdapterAlgorithmThrust
easier to read.
---
 vtkm/cont/cuda/internal/CMakeLists.txt        |   1 +
 .../internal/DeviceAdapterAlgorithmThrust.h   | 210 ++++------
 vtkm/cont/cuda/internal/TaskTuner.h           | 333 ++++++++++++++++++
 3 files changed, 405 insertions(+), 139 deletions(-)
 create mode 100644 vtkm/cont/cuda/internal/TaskTuner.h

diff --git a/vtkm/cont/cuda/internal/CMakeLists.txt b/vtkm/cont/cuda/internal/CMakeLists.txt
index 2bf1a27c3..944ca3d5e 100644
--- a/vtkm/cont/cuda/internal/CMakeLists.txt
+++ b/vtkm/cont/cuda/internal/CMakeLists.txt
@@ -25,6 +25,7 @@ set(headers
   DeviceAdapterAlgorithmThrust.h
   DeviceAdapterTagCuda.h
   MakeThrustIterator.h
+  TaskTuner.h
   ThrustExceptionHandler.h
   VirtualObjectTransferCuda.h
 )

diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
index df0d7f5ba..b6b59b7c4 100644
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
@@ -38,9 +38,14 @@
 #include 
 #include 
-//#include 
 #include 
 
+// #define ANALYZE_VTKM_SCHEDULER_1D
+// #define ANALYZE_VTKM_SCHEDULER_3D
+#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
+#include <vtkm/cont/cuda/internal/TaskTuner.h>
+#endif
+
 // Disable warnings we check vtkm for but Thrust does not.
 VTKM_THIRDPARTY_PRE_INCLUDE
 //our own custom thrust execution policy
@@ -118,6 +123,47 @@ __global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size)
   functor(index);
 }
 
+#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
+
+// Currently we are getting compile failures with vtkm::worklet::wavelets::InverseTransformOdd
+// for an unknown reason
+template <class FunctorType>
+__global__ void Schedule1DIndexKernel2(FunctorType functor,
+                                       vtkm::Id numberOfKernelsInvoked,
+                                       vtkm::Id length)
+{
+  vtkm::Id index = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
+  const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
+  for (; index < length; index += inc)
+  {
+    functor(index);
+  }
+}
+
+template <class FunctorType>
+__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
+{
+  const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
+                   blockIdx.y * blockDim.y + threadIdx.y,
+                   blockIdx.z * blockDim.z + threadIdx.z);
+  const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
+
+  for (uint k = start.z; k < size.z; k += inc.z)
+  {
+    for (uint j = start.y; j < size.y; j += inc.y)
+    {
+      vtkm::Id3 index(start.x, j, k);
+      for (vtkm::Id i = start.x; i < size.x; i += inc.x)
+      {
+        index[0] = i;
+        functor(index);
+      }
+    }
+  }
+}
+
+#endif
+
 template <typename T, typename BinaryOperationType>
 __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
 {
@@ -134,140 +180,6 @@ inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3
                        : (rangeMax.z / blockSize3d.z);
 }
 
-#ifdef ANALYZE_VTKM_SCHEDULER
-class PerfRecord
-{
-public:
-  PerfRecord(float elapsedT, dim3 block)
-    : elapsedTime(elapsedT)
-    , blockSize(block)
-  {
-  }
-
-  bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
-
-  float elapsedTime;
-  dim3 blockSize;
-};
-
-template <class Functor>
-static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& rangeMax)
-{
-  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
-                    static_cast<vtkm::UInt32>(rangeMax[1]),
-                    static_cast<vtkm::UInt32>(rangeMax[2]));
-  std::vector<PerfRecord> results;
-  vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
-
-  for (vtkm::UInt32 i = 0; i < 16; i++)
-  {
-    for (vtkm::UInt32 j = 0; j < 16; j++)
-    {
-      for (vtkm::UInt32 k = 0; k < 16; k++)
-      {
-        cudaEvent_t start, stop;
-        VTKM_CUDA_CALL(cudaEventCreate(&start));
-        VTKM_CUDA_CALL(cudaEventCreate(&stop));
-
-        dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
-        dim3 gridSize3d;
-
-        if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
-            (blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
-        {
-          //cuda can't handle more than 1024 threads per block
-          //so don't try if we compute higher than that
-
-          //also don't try stupidly low numbers
-
-          //cuda can't handle more than 64 threads in the z direction
-          continue;
-        }
-
-        compute_block_size(ranges, blockSize3d, gridSize3d);
-        VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-        Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-        VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
-
-        VTKM_CUDA_CALL(cudaEventSynchronize(stop));
-        float elapsedTimeMilliseconds;
-        VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
-
-        VTKM_CUDA_CALL(cudaEventDestroy(start));
-        VTKM_CUDA_CALL(cudaEventDestroy(stop));
-
-        PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
-        results.push_back(record);
-      }
-    }
-  }
-
-  std::sort(results.begin(), results.end());
-  const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
-  for (vtkm::Int64 i = 1; i <= size; i++)
-  {
-    vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
-    vtkm::UInt32 x = results[index].blockSize.x;
-    vtkm::UInt32 y = results[index].blockSize.y;
-    vtkm::UInt32 z = results[index].blockSize.z;
-    float t = results[index].elapsedTime;
-
-    std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
-  }
-
-  std::cout << "flat array performance " << std::endl;
-  {
-    cudaEvent_t start, stop;
-    VTKM_CUDA_CALL(cudaEventCreate(&start));
-    VTKM_CUDA_CALL(cudaEventCreate(&stop));
-
-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    typedef vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
-      vtkm::cont::DeviceAdapterTagCuda>
-      Algorithm;
-    Algorithm::Schedule(functor, numInstances);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
-
-    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
-    float elapsedTimeMilliseconds;
-    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
-
-    VTKM_CUDA_CALL(cudaEventDestroy(start));
-    VTKM_CUDA_CALL(cudaEventDestroy(stop));
-
-    std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
-  }
-
-  std::cout << "fixed 3d block size performance " << std::endl;
-  {
-    cudaEvent_t start, stop;
-    VTKM_CUDA_CALL(cudaEventCreate(&start));
-    VTKM_CUDA_CALL(cudaEventCreate(&stop));
-
-    dim3 blockSize3d(64, 2, 1);
-    dim3 gridSize3d;
-
-    compute_block_size(ranges, blockSize3d, gridSize3d);
-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
-
-    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
-    float elapsedTimeMilliseconds;
-    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
-
-    VTKM_CUDA_CALL(cudaEventDestroy(start));
-    VTKM_CUDA_CALL(cudaEventDestroy(stop));
-
-    std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
-              << " required: " << elapsedTimeMilliseconds << std::endl;
-    std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
"GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z - << " required: " << elapsedTimeMilliseconds << std::endl; - } -} - -#endif - /// This class can be subclassed to implement the DeviceAdapterAlgorithm for a /// device that uses thrust as its implementation. The subclass should pass in /// the correct device adapter tag as the template parameter. @@ -1365,6 +1277,10 @@ public: { throw vtkm::cont::ErrorExecution(hostErrorPtr); } + +#ifdef ANALYZE_VTKM_SCHEDULER_1D + compare_1d_dynamic_block_picker(functor, numInstances, totalBlocks, blockSize); +#endif } template @@ -1390,10 +1306,6 @@ public: functor.SetErrorMessageBuffer(errorMessage); -#ifdef ANALYZE_VTKM_SCHEDULER - //requires the errormessage buffer be set - compare_3d_schedule_patterns(functor, rangeMax); -#endif const dim3 ranges(static_cast(rangeMax[0]), static_cast(rangeMax[1]), static_cast(rangeMax[2])); @@ -1402,6 +1314,9 @@ public: //memory in the X direction. Also should be good for thin in the Z axis //algorithms. dim3 blockSize3d(64, 2, 1); + //In general we need more information as this doesn't work well when + //executing on the points, and need to fetch all cells used, as the z + //width is not fat enough //handle the simple use case of 'bad' datasets which are thin in X //but larger in the other directions, allowing us decent performance with @@ -1411,6 +1326,7 @@ public: blockSize3d = dim3(16, 4, 4); } + dim3 gridSize3d; compute_block_size(ranges, blockSize3d, gridSize3d); @@ -1427,6 +1343,22 @@ public: { throw vtkm::cont::ErrorExecution(hostErrorPtr); } + +#ifdef ANALYZE_VTKM_SCHEDULER_1D + compare_1d_dynamic_block_picker(functor, + rangeMax[0] * rangeMax[1] * rangeMax[2], + gridSize3d.x * gridSize3d.y * gridSize3d.z, + blockSize3d.x * blockSize3d.y * blockSize3d.z); +#endif + +#ifdef ANALYZE_VTKM_SCHEDULER_3D + //requires the errormessage buffer be set + compare_3d_dynamic_block_picker(functor, rangeMax, gridSize3d, blockSize3d); +#endif + +#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D + parameter_sweep_3d_schedule(functor, rangeMax); +#endif } template diff --git a/vtkm/cont/cuda/internal/TaskTuner.h b/vtkm/cont/cuda/internal/TaskTuner.h new file mode 100644 index 000000000..fe8b15fd0 --- /dev/null +++ b/vtkm/cont/cuda/internal/TaskTuner.h @@ -0,0 +1,333 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. 
+//============================================================================

+#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
+#define vtk_m_cont_cuda_internal_TaskTuner_h
+
+#include <vtkm/Types.h>
+#include <vtkm/cont/cuda/ErrorCuda.h>
+
+#include <cuda.h>
+
+#include <algorithm>
+#include <iostream>
+#include <typeinfo>
+#include <vector>
+
+namespace vtkm
+{
+namespace cont
+{
+namespace cuda
+{
+namespace internal
+{
+
+template <class FunctorType>
+__global__ void Schedule1DIndexKernel(FunctorType functor, vtkm::Id, vtkm::Id);
+template <class FunctorType>
+__global__ void Schedule1DIndexKernel2(FunctorType functor, vtkm::Id, vtkm::Id);
+template <class FunctorType>
+__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size);
+template <class FunctorType>
+__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size);
+
+inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d);
+
+
+template <typename Task>
+__global__ void TaskStrided1DLaunch(Task task, vtkm::Id size)
+{
+  const vtkm::Id start = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
+  const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
+  for (vtkm::Id i = start; i < size; i += inc)
+  {
+    task(i);
+  }
+}
+
+class PerfRecord
+{
+public:
+  PerfRecord(float elapsedT, dim3 block)
+    : elapsedTime(elapsedT)
+    , blockSize(block)
+  {
+  }
+
+  bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
+
+  float elapsedTime;
+  dim3 blockSize;
+};
+
+template <class Functor>
+static void BlockSizeGuesser(vtkm::Id size, int& grids, int& blocks, float& occupancy)
+{
+  int blockSize;   // The launch configurator returned block size
+  int minGridSize; // The minimum grid size needed to achieve the
+                   // maximum occupancy for a full device launch
+  int gridSize;    // The actual grid size needed, based on number of SMs
+  int device;      // device to run on
+  int numSMs;      // number of SMs on the active device
+
+  cudaGetDevice(&device);
+  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
+
+  cudaOccupancyMaxPotentialBlockSize(
+    &minGridSize, &blockSize, Schedule1DIndexKernel2<Functor>, 0, 0);
+
+
+  blockSize /= (numSMs * 2);
+  // Round up according to array size
+  // gridSize = (size + blockSize - 1) / blockSize;
+  gridSize = 32 * numSMs;
+  // std::cout << "numSMs: " << numSMs << std::endl;
+
+  // calculate theoretical occupancy
+  int maxActiveBlocks;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &maxActiveBlocks, Schedule1DIndexKernel2<Functor>, blockSize, 0);
+
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, device);
+
+  grids = gridSize;
+  blocks = blockSize;
+  occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
+    (float)(props.maxThreadsPerMultiProcessor / props.warpSize);
+}
+
+template <class Functor>
+static void compare_1d_dynamic_block_picker(Functor functor,
+                                            vtkm::Id size,
+                                            const vtkm::Id& currentGridSize,
+                                            const vtkm::Id& currentBlockSize)
+{
+  const std::type_info& ti = typeid(functor);
+  std::cout << "fixed 1d block size performance " << ti.name() << std::endl;
+  {
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
+      functor, vtkm::Id(0), size);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    std::cout << "Schedule1DIndexKernel size: " << size << std::endl;
+    std::cout << "GridSize of: " << currentGridSize << " BlockSize of: " << currentBlockSize
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+
+  std::cout << "dynamic 1d block size performance " << ti.name() << std::endl;
+  {
+
+    int grids, blocks;
+    float occupancy;
+    BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
+
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    std::cout << "Schedule1DIndexKernel2 size: " << size << std::endl;
+    std::cout << "GridSize of: " << grids << " BlockSize of: " << blocks
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+template <class Functor>
+static void compare_3d_dynamic_block_picker(Functor functor,
+                                            vtkm::Id3 ranges,
+                                            const dim3& gridSize3d,
+                                            const dim3& blockSize3d)
+{
+  const std::type_info& ti = typeid(functor);
+  std::cout << "fixed 3d block size performance " << ti.name() << std::endl;
+  {
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    // std::cout << "Schedule3DIndexKernel size: " << size << std::endl;
+    // std::cout << "GridSize of: " << currentGridSize
+    //           << " BlockSize of: " << currentBlockSize << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+
+  std::cout << "dynamic 3d block size performance " << ti.name() << std::endl;
+  {
+
+    // int grids, blocks;
+    // float occupancy;
+    // BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
+
+    // cudaEvent_t start, stop;
+    // VTKM_CUDA_CALL(cudaEventCreate(&start));
+    // VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+
+    // VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    // Schedule3DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
+    // VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    // VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    // float elapsedTimeMilliseconds;
+    // VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    // VTKM_CUDA_CALL(cudaEventDestroy(start));
+    // VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    // std::cout << "Schedule3DIndexKernel2 size: " << size << std::endl;
+    // std::cout << "GridSize of: " << grids
+    //           << " BlockSize of: " << blocks << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+template <class Functor>
+static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeMax)
+{
+  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
+                    static_cast<vtkm::UInt32>(rangeMax[1]),
+                    static_cast<vtkm::UInt32>(rangeMax[2]));
+  std::vector<PerfRecord> results;
+  vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
+
+  for (vtkm::UInt32 i = 0; i < 16; i++)
+  {
+    for (vtkm::UInt32 j = 0; j < 16; j++)
+    {
+      for (vtkm::UInt32 k = 0; k < 16; k++)
+      {
+        cudaEvent_t start, stop;
+        VTKM_CUDA_CALL(cudaEventCreate(&start));
+        VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+        dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
+        dim3 gridSize3d;
+
+        if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
+            (blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
+        {
+          //cuda can't handle more than 1024 threads per block
+          //so don't try if we compute higher than that
+
+          //also don't try stupidly low numbers
+
+          //cuda can't handle more than 64 threads in the z direction
+          continue;
+        }
+
+        compute_block_size(ranges, blockSize3d, gridSize3d);
+        VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+        Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+        VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+        VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+        float elapsedTimeMilliseconds;
+        VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+        VTKM_CUDA_CALL(cudaEventDestroy(start));
+        VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+        PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
+        results.push_back(record);
+      }
+    }
+  }
+
+  std::sort(results.begin(), results.end());
+  const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
+  for (vtkm::Int64 i = 1; i <= size; i++)
+  {
+    vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
+    vtkm::UInt32 x = results[index].blockSize.x;
+    vtkm::UInt32 y = results[index].blockSize.y;
+    vtkm::UInt32 z = results[index].blockSize.z;
+    float t = results[index].elapsedTime;
+
+    std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
+  }
+
+  std::cout << "fixed 3d block size performance " << std::endl;
+  {
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+    dim3 blockSize3d(64, 2, 1);
+    dim3 gridSize3d;
+
+    compute_block_size(ranges, blockSize3d, gridSize3d);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+    std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+}
+}
+}
+}
+}
+
+#endif
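
Usage note: the tuner only compiles when one of the analysis macros is
defined before DeviceAdapterAlgorithmThrust.h is parsed, either by
uncommenting the two "#define" lines the patch adds or via the compiler
command line. A minimal sketch, assuming a CUDA translation unit and a
hypothetical exec-side functor named MyFunctor that provides
operator()(vtkm::Id) / operator()(vtkm::Id3) and SetErrorMessageBuffer():

    // Build with, e.g.:
    //   nvcc -DANALYZE_VTKM_SCHEDULER_1D -DANALYZE_VTKM_SCHEDULER_3D ...
    // Optionally add -DPARAMETER_SWEEP_VTKM_SCHEDULER_3D to also run the
    // 16x16x16 block-size sweep in parameter_sweep_3d_schedule().
    #include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>

    using Algorithm = vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
      vtkm::cont::DeviceAdapterTagCuda>;

    // Schedule() runs the real kernel first; the #ifdef blocks added above
    // then re-run the same work through compare_1d_dynamic_block_picker()
    // (and the 3D variants for Id3 ranges), printing the fixed versus
    // occupancy-guided grid/block timings to stdout.
    Algorithm::Schedule(MyFunctor{}, vtkm::Id(1024 * 1024));    // 1D path
    Algorithm::Schedule(MyFunctor{}, vtkm::Id3(128, 128, 128)); // 3D path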