From e982ebe41ea64868b6d00f3666d902aaf8faee98 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Fri, 24 Jul 2015 15:22:10 -0600
Subject: [PATCH] Measurement and general improvements to the benchmark suite

- A warm-up run is performed but not timed, so that any allocation of output
  storage happens before measurement begins. Previously this allocation time
  was included in the measured run time of the benchmark.
- Benchmarks are run multiple times and statistics are computed over the run
  times to give a better picture of the expected cost of the function. Each
  benchmark is run either 500 times or for 1.5s, whichever limit is reached
  first (both limits are easy to change). Outliers are then limited by
  Winsorising the data (similar to how Rust's benchmarking library works),
  and the median, mean, min and max run times are printed along with the
  median absolute deviation and standard deviation (a small worked example of
  these statistics is sketched below).
- Because benchmarks are run many times, one-time setup can now be done in
  the benchmark's constructor, e.g. filling an input data array with test
  values so the timed loop itself runs faster.
- To allow benchmarks to have members of the data type being benchmarked, the
  struct must now be templated on that type, which leads to a bit of
  awkwardness. I've worked around this by adding the `VTKM_MAKE_BENCHMARK`
  and `VTKM_RUN_BENCHMARK` macros: the make macro generates a struct whose
  `operator()` is templated on the value type and constructs and returns the
  benchmark functor for that type, and the run macro then uses this generated
  struct to run the benchmark functor on each type in the list passed (see
  the usage sketch below). You can also pass arguments to the benchmark
  functor's constructor through the make macro, though this is a little more
  awkward because the generated MakeBench struct must have a different name
  for each variation of constructor arguments (see `BenchLowerBounds` for an
  example).
- Added a short comment on how to write benchmarks in
  `vtkm/benchmarking/Benchmarker.h`, since the new system is a bit different
  from how the tests work.
- You can now pass extra arguments when running the benchmark suite to
  benchmark only specific functions, e.g. `Benchmarks_TBB
  BenchmarkDeviceAdapter ScanInclusive Sort` will benchmark only
  ScanInclusive and Sort. Running without any extra arguments runs all the
  benchmarks as before.
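To illustrate the intended usage of the macros, here is a minimal sketch. It
mirrors the example documented in `vtkm/benchmarking/Benchmarker.h` and is not
part of the diff; the `Value` template parameter name, the `vtkm::Int32` type
list and the extra includes are assumptions for illustration only:

    #include <vtkm/benchmarking/Benchmarker.h>
    #include <string>
    // A header providing vtkm::ListTagBase is also assumed to be included.

    // A trivial benchmark functor: one-time setup goes in the constructor,
    // operator() times one run and returns the elapsed seconds.
    template<typename Value>
    struct BenchSilly {
      VTKM_CONT_EXPORT BenchSilly() {}

      VTKM_CONT_EXPORT
      vtkm::Float64 operator()() {
        // Time the operation being benchmarked here; this stub just
        // pretends each run took 0.05s.
        return 0.05;
      }

      // Used when printing the run time statistics.
      VTKM_CONT_EXPORT
      std::string Description() const { return "A silly benchmark"; }
    };

    // Generate the maker functor for the benchmark. Constructor arguments,
    // if any, are forwarded through the trailing macro arguments.
    VTKM_MAKE_BENCHMARK(Silly, BenchSilly);

    int main(int, char**) {
      // Run the benchmark once for every type in the list.
      VTKM_RUN_BENCHMARK(Silly, vtkm::ListTagBase<vtkm::Int32>());
      return 0;
    }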
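And purely to illustrate the statistics described above (again, not part of
the diff), the Winsorising and median-absolute-deviation steps work roughly as
in this standalone sketch; it uses only the C++ standard library and made-up
timings rather than the vtkm types and helpers added in Benchmarker.h:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Percentile of already-sorted samples, linearly interpolating between
    // the two nearest values (as done in Benchmarker.h).
    double Percentile(const std::vector<double> &sorted, double percent) {
      const double rank = percent / 100.0 * (sorted.size() - 1.0);
      const std::size_t low = static_cast<std::size_t>(std::floor(rank));
      const std::size_t high = std::min(low + 1, sorted.size() - 1);
      return sorted[low] + (sorted[high] - sorted[low]) * (rank - low);
    }

    int main() {
      // Toy run times in seconds, pre-sorted; the last one is an outlier.
      const double raw[] = { 0.010, 0.011, 0.011, 0.012, 0.013, 0.095 };
      std::vector<double> t(raw, raw + 6);

      // Winsorise at 5%: clamp samples below the 5th percentile and above
      // the 95th percentile to those percentile values.
      const double lo = Percentile(t, 5.0);
      const double hi = Percentile(t, 95.0);
      for (std::size_t i = 0; i < t.size(); ++i)
        t[i] = std::min(std::max(t[i], lo), hi);

      // Median and median absolute deviation of the clamped samples.
      const double median = Percentile(t, 50.0);
      std::vector<double> dev;
      for (std::size_t i = 0; i < t.size(); ++i)
        dev.push_back(std::fabs(t[i] - median));
      std::sort(dev.begin(), dev.end());
      std::printf("median = %gs, median abs dev = %gs\n",
                  median, Percentile(dev, 50.0));
      return 0;
    }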
--- vtkm/benchmarking/BenchmarkDeviceAdapter.cxx | 55 +- vtkm/benchmarking/BenchmarkDeviceAdapter.h | 750 ++++++++++++------- vtkm/benchmarking/Benchmarker.h | 256 +++++++ 3 files changed, 792 insertions(+), 269 deletions(-) create mode 100644 vtkm/benchmarking/Benchmarker.h diff --git a/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx b/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx index 20a49e1a0..44cffefb2 100644 --- a/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx +++ b/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx @@ -22,9 +22,58 @@ #include -int BenchmarkDeviceAdapter(int, char *[]) +#include +#include +#include +#include + +int BenchmarkDeviceAdapter(int argc, char *argv[]) { - return vtkm::benchmarking::BenchmarkDeviceAdapter - ::Run(); + int benchmarks = 0; + if (argc < 2){ + benchmarks = vtkm::benchmarking::ALL; + } + else { + for (int i = 1; i < argc; ++i){ + std::string arg = argv[i]; + std::transform(arg.begin(), arg.end(), arg.begin(), ::tolower); + if (arg == "lowerbounds"){ + benchmarks |= vtkm::benchmarking::LOWER_BOUNDS; + } + else if (arg == "reduce"){ + benchmarks |= vtkm::benchmarking::REDUCE; + } + else if (arg == "reducebykey"){ + benchmarks |= vtkm::benchmarking::REDUCE_BY_KEY; + } + else if (arg == "scaninclusive"){ + benchmarks |= vtkm::benchmarking::SCAN_INCLUSIVE; + } + else if (arg == "scanexclusive"){ + benchmarks |= vtkm::benchmarking::SCAN_EXCLUSIVE; + } + else if (arg == "sort"){ + benchmarks |= vtkm::benchmarking::SORT; + } + else if (arg == "sortbykey"){ + benchmarks |= vtkm::benchmarking::SORT_BY_KEY; + } + else if (arg == "streamcompact"){ + benchmarks |= vtkm::benchmarking::STREAM_COMPACT; + } + else if (arg == "unique"){ + benchmarks |= vtkm::benchmarking::UNIQUE; + } + else if (arg == "upperbounds"){ + benchmarks |= vtkm::benchmarking::UPPER_BOUNDS; + } + else { + std::cout << "Unrecognized benchmark: " << argv[i] << std::endl; + return 1; + } + } + } + return vtkm::benchmarking::BenchmarkDeviceAdapter + ::Run(benchmarks); } diff --git a/vtkm/benchmarking/BenchmarkDeviceAdapter.h b/vtkm/benchmarking/BenchmarkDeviceAdapter.h index 9309ab79f..37ed747c3 100644 --- a/vtkm/benchmarking/BenchmarkDeviceAdapter.h +++ b/vtkm/benchmarking/BenchmarkDeviceAdapter.h @@ -33,10 +33,9 @@ #include #include #include - #include - #include +#include #include @@ -58,16 +57,30 @@ namespace vtkm { namespace benchmarking { -#define ARRAY_SIZE (1 << 20) +#define ARRAY_SIZE (1 << 21) const static std::string DIVIDER(40, '-'); +enum BenchmarkName { + LOWER_BOUNDS = 1, + REDUCE = 1 << 1, + REDUCE_BY_KEY = 1 << 2, + SCAN_INCLUSIVE = 1 << 3, + SCAN_EXCLUSIVE = 1 << 4, + SORT = 1 << 5, + SORT_BY_KEY = 1 << 6, + STREAM_COMPACT = 1 << 7, + UNIQUE = 1 << 8, + UPPER_BOUNDS = 1 << 9, + ALL = LOWER_BOUNDS | REDUCE | REDUCE_BY_KEY | SCAN_INCLUSIVE + | SCAN_EXCLUSIVE | SORT | SORT_BY_KEY | STREAM_COMPACT | UNIQUE + | UPPER_BOUNDS +}; + /// This class runs a series of micro-benchmarks to measure /// performance of the parallel primitives provided by each /// device adapter -/// template -struct BenchmarkDeviceAdapter { -private: +class BenchmarkDeviceAdapter { typedef vtkm::cont::StorageTagBasic StorageTagBasic; typedef vtkm::cont::StorageTagBasic StorageTag; @@ -78,314 +91,470 @@ private: typedef vtkm::cont::Timer Timer; +public: + // Various kernels used by the different benchmarks to accelerate + // initialization of data + template + struct FillTestValueKernel : vtkm::exec::FunctorBase { + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef typename ValueArrayHandle::template 
ExecutionTypes + ::Portal PortalType; + + PortalType Output; + + VTKM_CONT_EXPORT + FillTestValueKernel(PortalType out) : Output(out){} + + VTKM_EXEC_EXPORT void operator()(vtkm::Id i) const { + Output.Set(i, TestValue(i, Value())); + } + }; + + template + struct FillScaledTestValueKernel : vtkm::exec::FunctorBase { + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef typename ValueArrayHandle::template ExecutionTypes + ::Portal PortalType; + + PortalType Output; + const vtkm::Id IdScale; + + VTKM_CONT_EXPORT + FillScaledTestValueKernel(vtkm::Id id_scale, PortalType out) : Output(out), IdScale(id_scale) {} + + VTKM_EXEC_EXPORT void operator()(vtkm::Id i) const { + Output.Set(i, TestValue(i * IdScale, Value())); + } + }; + + template + struct FillModuloTestValueKernel : vtkm::exec::FunctorBase { + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef typename ValueArrayHandle::template ExecutionTypes + ::Portal PortalType; + + PortalType Output; + const vtkm::Id Modulus; + + VTKM_CONT_EXPORT + FillModuloTestValueKernel(vtkm::Id modulus, PortalType out) : Output(out), Modulus(modulus) {} + + VTKM_EXEC_EXPORT void operator()(vtkm::Id i) const { + Output.Set(i, TestValue(i % Modulus, Value())); + } + }; + + template + struct FillBinaryTestValueKernel : vtkm::exec::FunctorBase { + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef typename ValueArrayHandle::template ExecutionTypes + ::Portal PortalType; + + PortalType Output; + const vtkm::Id Modulus; + + VTKM_CONT_EXPORT + FillBinaryTestValueKernel(vtkm::Id modulus, PortalType out) : Output(out), Modulus(modulus) {} + + VTKM_EXEC_EXPORT void operator()(vtkm::Id i) const { + Output.Set(i, i % Modulus == 0 ? TestValue(vtkm::Id(1), Value()) : Value()); + } + }; + +private: + template struct BenchLowerBounds { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + const vtkm::Id N_VALS; + ValueArrayHandle InputHandle, ValueHandle; + IdArrayHandle OutHandle; + + VTKM_CONT_EXPORT + BenchLowerBounds(vtkm::Id value_percent) : N_VALS((ARRAY_SIZE * value_percent) / 100) + { + Algorithm::Schedule(FillTestValueKernel( + InputHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + Algorithm::Schedule(FillScaledTestValueKernel(2, + ValueHandle.PrepareForOutput(N_VALS, DeviceAdapterTag())), N_VALS); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ Timer timer; + Algorithm::LowerBounds(InputHandle, ValueHandle, OutHandle); + return timer.GetElapsedTime(); + } - std::vector input(ARRAY_SIZE, Value()); - for (size_t i = 0; i < input.size(); ++i){ - input[i] = TestValue(vtkm::Id(i), Value()); - } - ValueArrayHandle input_handle = vtkm::cont::make_ArrayHandle(input); - - // We benchmark finding indices for the elements using various - // ratios of values to input from 5-30% of # of elements in input - for (size_t p = 5; p <= 30; p += 5){ - size_t n_vals = (ARRAY_SIZE * p) / 100; - std::vector values(n_vals, Value()); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(2 * i), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - IdArrayHandle out_handle; - timer.Reset(); - Algorithm::LowerBounds(input_handle, value_handle, out_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "LowerBounds on " << ARRAY_SIZE << " input and " - << n_vals << " values took " << elapsed << "s\n"; - } + VTKM_CONT_EXPORT + 
std::string Description() const { + std::stringstream description; + description << "LowerBounds on " << ARRAY_SIZE << " input and " + << N_VALS << " values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(LowerBounds5, BenchLowerBounds, 5); + VTKM_MAKE_BENCHMARK(LowerBounds10, BenchLowerBounds, 10); + VTKM_MAKE_BENCHMARK(LowerBounds15, BenchLowerBounds, 15); + VTKM_MAKE_BENCHMARK(LowerBounds20, BenchLowerBounds, 20); + VTKM_MAKE_BENCHMARK(LowerBounds25, BenchLowerBounds, 25); + VTKM_MAKE_BENCHMARK(LowerBounds30, BenchLowerBounds, 30); + template struct BenchReduce { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + ValueArrayHandle InputHandle; + + VTKM_CONT_EXPORT + BenchReduce(){ + Algorithm::Schedule(FillTestValueKernel( + InputHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ Timer timer; - std::vector input(ARRAY_SIZE, Value()); - for (size_t i = 0; i < input.size(); ++i){ - input[i] = TestValue(vtkm::Id(i), Value()); - } - ValueArrayHandle input_handle = vtkm::cont::make_ArrayHandle(input); - timer.Reset(); - Algorithm::Reduce(input_handle, Value()); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "Reduce on " << ARRAY_SIZE - << " values took " << elapsed << "s\n"; + Algorithm::Reduce(InputHandle, Value()); + return timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "Reduce on " << ARRAY_SIZE << " values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(Reduce, BenchReduce); + template struct BenchReduceByKey { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + const vtkm::Id N_KEYS; + ValueArrayHandle ValueHandle, ValuesOut; + IdArrayHandle KeyHandle, KeysOut; + + VTKM_CONT_EXPORT + BenchReduceByKey(vtkm::Id key_percent) : N_KEYS((ARRAY_SIZE * key_percent) / 100) + { + Algorithm::Schedule(FillTestValueKernel( + ValueHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + Algorithm::Schedule(FillModuloTestValueKernel(N_KEYS, + KeyHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + Algorithm::SortByKey(KeyHandle, ValueHandle); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ Timer timer; - // We benchmark 5% to 30% of ARRAY_SIZE keys in 5% increments - for (size_t p = 5; p <= 30; p += 5){ - size_t n_keys = (ARRAY_SIZE * p) / 100; - std::vector values(ARRAY_SIZE, Value()); - std::vector keys(ARRAY_SIZE, 0); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(i), Value()); - keys[i] = vtkm::Id(i % n_keys); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - ValueArrayHandle values_out; - IdArrayHandle key_handle = vtkm::cont::make_ArrayHandle(keys); - IdArrayHandle keys_out; - Algorithm::SortByKey(key_handle, value_handle); - timer.Reset(); - Algorithm::ReduceByKey(key_handle, value_handle, keys_out, values_out, - vtkm::internal::Add()); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "ReduceByKey on " << ARRAY_SIZE - << " values with " << n_keys << " distinct vtkm::Id" - << " keys took " << elapsed << "s\n"; - } + Algorithm::ReduceByKey(KeyHandle, ValueHandle, KeysOut, ValuesOut, + vtkm::internal::Add()); + return 
timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "ReduceByKey on " << ARRAY_SIZE + << " values with " << N_KEYS << " distinct vtkm::Id keys"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(ReduceByKey5, BenchReduceByKey, 5); + VTKM_MAKE_BENCHMARK(ReduceByKey10, BenchReduceByKey, 10); + VTKM_MAKE_BENCHMARK(ReduceByKey15, BenchReduceByKey, 15); + VTKM_MAKE_BENCHMARK(ReduceByKey20, BenchReduceByKey, 20); + VTKM_MAKE_BENCHMARK(ReduceByKey25, BenchReduceByKey, 25); + VTKM_MAKE_BENCHMARK(ReduceByKey30, BenchReduceByKey, 30); + template struct BenchScanInclusive { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + ValueArrayHandle ValueHandle, OutHandle; + VTKM_CONT_EXPORT + BenchScanInclusive(){ + Algorithm::Schedule(FillTestValueKernel( + ValueHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ Timer timer; - std::vector values(ARRAY_SIZE, Value()); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(i), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - ValueArrayHandle out_handle; - timer.Reset(); - Algorithm::ScanInclusive(value_handle, out_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "ScanInclusive on " << ARRAY_SIZE - << " values took " << elapsed << "s\n"; + Algorithm::ScanInclusive(ValueHandle, OutHandle); + return timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "ScanInclusive on " << ARRAY_SIZE << " values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(ScanInclusive, BenchScanInclusive); + template struct BenchScanExclusive { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + ValueArrayHandle ValueHandle, OutHandle; + + VTKM_CONT_EXPORT + BenchScanExclusive(){ + Algorithm::Schedule(FillTestValueKernel( + ValueHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ Timer timer; - std::vector values(ARRAY_SIZE, Value()); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(i), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - ValueArrayHandle out_handle; - timer.Reset(); - Algorithm::ScanExclusive(value_handle, out_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "ScanExclusive on " << ARRAY_SIZE - << " values took " << elapsed << "s\n"; + Algorithm::ScanExclusive(ValueHandle, OutHandle); + return timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "ScanExclusive on " << ARRAY_SIZE << " values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(ScanExclusive, BenchScanExclusive); - /// This benchmark tests sort on a few configurations of data - /// sorted, reverse-ordered, almost sorted and random - /// TODO: Is it really worth testing all these possible configurations - /// of data? How often will we really care about anything besides unsorted data? 
+ template struct BenchSort { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + std::vector Values; + ValueArrayHandle ValueHandle; + boost::mt19937 Rng; + + VTKM_CONT_EXPORT + BenchSort() : Values(ARRAY_SIZE, Value()) { + ValueHandle = vtkm::cont::make_ArrayHandle(Values); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ + for (size_t i = 0; i < Values.size(); ++i){ + ValueHandle.GetPortalControl().Set(vtkm::Id(i), TestValue(vtkm::Id(Rng()), Value())); + } Timer timer; - std::vector values(ARRAY_SIZE, Value()); - // Test sort on already sorted data - { - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(i), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - timer.Reset(); - Algorithm::Sort(value_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "Sort on " << ARRAY_SIZE << " already sorted " - << " values took " << elapsed << "s\n"; - } - // Test sort on reverse-sorted data - { - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(values.size() - i), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - timer.Reset(); - Algorithm::Sort(value_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "Sort on " << ARRAY_SIZE << " reverse-ordered " - << " values took " << elapsed << "s\n"; - } - // Test on almost sorted data - { - size_t modulus = values.size() / 4; - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(i % modulus), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - timer.Reset(); - Algorithm::Sort(value_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "Sort on " << ARRAY_SIZE << " almost-sorted " - << " values took " << elapsed << "s\n"; - } - // Test on random data - { - boost::mt19937 rng; - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(rng()), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - timer.Reset(); - Algorithm::Sort(value_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "Sort on " << ARRAY_SIZE << " random " - << " values took " << elapsed << "s\n"; - } + Algorithm::Sort(ValueHandle); + return timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "Sort on " << ARRAY_SIZE << " random values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(Sort, BenchSort); + template struct BenchSortByKey { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; - Timer timer; - boost::mt19937 rng; - // We benchmark 5% to 30% of ARRAY_SIZE keys in 5% increments - for (size_t p = 5; p <= 30; p += 5){ - size_t n_keys = (ARRAY_SIZE * p) / 100; - std::vector values(ARRAY_SIZE, Value()); - std::vector keys(ARRAY_SIZE, 0); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(rng()), Value()); - keys[i] = vtkm::Id(i % n_keys); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - IdArrayHandle key_handle = vtkm::cont::make_ArrayHandle(keys); - timer.Reset(); - Algorithm::SortByKey(value_handle, key_handle); - vtkm::Float64 elapsed = 
timer.GetElapsedTime(); - std::cout << "SortByKey on " << ARRAY_SIZE - << " random values with " << n_keys << " different vtkm::Id keys took " - << elapsed << "s\n"; + boost::mt19937 Rng; + vtkm::Id N_KEYS; + std::vector Values; + ValueArrayHandle ValueHandle; + IdArrayHandle KeyHandle; + + VTKM_CONT_EXPORT + BenchSortByKey(vtkm::Id percent_key) : N_KEYS((ARRAY_SIZE * percent_key) / 100), + Values(ARRAY_SIZE, Value()) + { + ValueHandle = vtkm::cont::make_ArrayHandle(Values); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ + for (size_t i = 0; i < Values.size(); ++i){ + ValueHandle.GetPortalControl().Set(vtkm::Id(i), TestValue(vtkm::Id(Rng()), Value())); } + Algorithm::Schedule(FillModuloTestValueKernel(N_KEYS, + KeyHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + Timer timer; + Algorithm::SortByKey(ValueHandle, KeyHandle); + return timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "SortByKey on " << ARRAY_SIZE + << " random values with " << N_KEYS << " different vtkm::Id keys"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(SortByKey5, BenchSortByKey, 5); + VTKM_MAKE_BENCHMARK(SortByKey10, BenchSortByKey, 10); + VTKM_MAKE_BENCHMARK(SortByKey15, BenchSortByKey, 15); + VTKM_MAKE_BENCHMARK(SortByKey20, BenchSortByKey, 20); + VTKM_MAKE_BENCHMARK(SortByKey25, BenchSortByKey, 25); + VTKM_MAKE_BENCHMARK(SortByKey30, BenchSortByKey, 30); + template struct BenchStreamCompact { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + const vtkm::Id N_VALID; + ValueArrayHandle ValueHandle; + IdArrayHandle OutHandle; + + VTKM_CONT_EXPORT + BenchStreamCompact(vtkm::Id percent_valid) : N_VALID((ARRAY_SIZE * percent_valid) / 100) + { + vtkm::Id modulo = ARRAY_SIZE / N_VALID; + Algorithm::Schedule(FillBinaryTestValueKernel(modulo, + ValueHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()() { Timer timer; - // We benchmark 5% to 30% valid values in 5% increments - for (size_t p = 5; p <= 30; p += 5){ - size_t n_valid = (ARRAY_SIZE * p) / 100; - size_t modulo = ARRAY_SIZE / n_valid; - std::vector values(ARRAY_SIZE, Value()); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = i % modulo == 0 ? TestValue(1, Value()) : Value(); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - IdArrayHandle out_handle; - timer.Reset(); - Algorithm::StreamCompact(value_handle, out_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "StreamCompact on " << ARRAY_SIZE << " " - << " values with " << out_handle.GetNumberOfValues() - << " valid values took " << elapsed << "s\n"; + Algorithm::StreamCompact(ValueHandle, OutHandle); + return timer.GetElapsedTime(); + } - std::vector stencil(ARRAY_SIZE, 0); - for (size_t i = 0; i < stencil.size(); ++i){ - stencil[i] = i % modulo == 0 ? 
1 : vtkm::Id(); - } - IdArrayHandle stencil_handle = vtkm::cont::make_ArrayHandle(stencil); - ValueArrayHandle out_val_handle; - timer.Reset(); - Algorithm::StreamCompact(value_handle, stencil_handle, out_val_handle); - elapsed = timer.GetElapsedTime(); - std::cout << "StreamCompact with stencil on " << ARRAY_SIZE - << " values with " << out_val_handle.GetNumberOfValues() - << " valid values took " << elapsed << "s\n"; - } + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "StreamCompact on " << ARRAY_SIZE << " " + << " values with " << OutHandle.GetNumberOfValues() + << " valid values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(StreamCompact5, BenchStreamCompact, 5); + VTKM_MAKE_BENCHMARK(StreamCompact10, BenchStreamCompact, 10); + VTKM_MAKE_BENCHMARK(StreamCompact15, BenchStreamCompact, 15); + VTKM_MAKE_BENCHMARK(StreamCompact20, BenchStreamCompact, 20); + VTKM_MAKE_BENCHMARK(StreamCompact25, BenchStreamCompact, 25); + VTKM_MAKE_BENCHMARK(StreamCompact30, BenchStreamCompact, 30); + template + struct BenchStreamCompactStencil { + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + + const vtkm::Id N_VALID; + ValueArrayHandle ValueHandle; + IdArrayHandle StencilHandle, OutHandle; + + VTKM_CONT_EXPORT + BenchStreamCompactStencil(vtkm::Id percent_valid) : N_VALID((ARRAY_SIZE * percent_valid) / 100) + { + vtkm::Id modulo = ARRAY_SIZE / N_VALID; + Algorithm::Schedule(FillTestValueKernel( + ValueHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + Algorithm::Schedule(FillBinaryTestValueKernel(modulo, + StencilHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()() { + Timer timer; + Algorithm::StreamCompact(ValueHandle, StencilHandle, OutHandle); + return timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "StreamCompactStencil on " << ARRAY_SIZE << " " + << " values with " << OutHandle.GetNumberOfValues() + << " valid values"; + return description.str(); + } + }; + VTKM_MAKE_BENCHMARK(StreamCompactStencil5, BenchStreamCompactStencil, 5); + VTKM_MAKE_BENCHMARK(StreamCompactStencil10, BenchStreamCompactStencil, 10); + VTKM_MAKE_BENCHMARK(StreamCompactStencil15, BenchStreamCompactStencil, 15); + VTKM_MAKE_BENCHMARK(StreamCompactStencil20, BenchStreamCompactStencil, 20); + VTKM_MAKE_BENCHMARK(StreamCompactStencil25, BenchStreamCompactStencil, 25); + VTKM_MAKE_BENCHMARK(StreamCompactStencil30, BenchStreamCompactStencil, 30); + + template struct BenchUnique { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + const vtkm::Id N_VALID; + ValueArrayHandle ValueHandle; + + VTKM_CONT_EXPORT + BenchUnique(vtkm::Id percent_valid) : N_VALID((ARRAY_SIZE * percent_valid) / 100) + {} + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ + Algorithm::Schedule(FillModuloTestValueKernel(N_VALID, + ValueHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + Algorithm::Sort(ValueHandle); Timer timer; - // We benchmark 5% to 30% valid values in 5% increments - for (size_t p = 5; p <= 30; p += 5){ - size_t n_valid = (ARRAY_SIZE * p) / 100; - std::vector values(ARRAY_SIZE, Value()); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(i % n_valid), Value()); - } - ValueArrayHandle value_handle = 
vtkm::cont::make_ArrayHandle(values); - Algorithm::Sort(value_handle); - timer.Reset(); - Algorithm::Unique(value_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "Unique on " << ARRAY_SIZE << " values with " - << value_handle.GetNumberOfValues() << " valid values took " - << elapsed << "s\n"; - } + Algorithm::Unique(ValueHandle); + return timer.GetElapsedTime(); + } + + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "Unique on " << ARRAY_SIZE << " values with " + << ValueHandle.GetNumberOfValues() << " valid values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(Unique5, BenchUnique, 5); + VTKM_MAKE_BENCHMARK(Unique10, BenchUnique, 10); + VTKM_MAKE_BENCHMARK(Unique15, BenchUnique, 15); + VTKM_MAKE_BENCHMARK(Unique20, BenchUnique, 20); + VTKM_MAKE_BENCHMARK(Unique25, BenchUnique, 25); + VTKM_MAKE_BENCHMARK(Unique30, BenchUnique, 30); + template struct BenchUpperBounds { - template - VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const { - typedef vtkm::cont::ArrayHandle ValueArrayHandle; + typedef vtkm::cont::ArrayHandle ValueArrayHandle; + const vtkm::Id N_VALS; + ValueArrayHandle InputHandle, ValueHandle; + IdArrayHandle OutHandle; + + VTKM_CONT_EXPORT + BenchUpperBounds(vtkm::Id percent_vals) : N_VALS((ARRAY_SIZE * percent_vals) / 100) + { + Algorithm::Schedule(FillTestValueKernel( + InputHandle.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag())), ARRAY_SIZE); + Algorithm::Schedule(FillScaledTestValueKernel(2, + ValueHandle.PrepareForOutput(N_VALS, DeviceAdapterTag())), N_VALS); + } + + VTKM_CONT_EXPORT + vtkm::Float64 operator()(){ Timer timer; - std::vector input(ARRAY_SIZE, Value()); - for (size_t i = 0; i < input.size(); ++i){ - input[i] = TestValue(vtkm::Id(i), Value()); - } - ValueArrayHandle input_handle = vtkm::cont::make_ArrayHandle(input); + Algorithm::UpperBounds(InputHandle, ValueHandle, OutHandle); + return timer.GetElapsedTime(); + } - // We benchmark finding indices for the elements using various - // ratios of values to input from 5-30% of # of elements in input - for (size_t p = 5; p <= 30; p += 5){ - size_t n_vals = (ARRAY_SIZE * p) / 100; - std::vector values(n_vals, Value()); - for (size_t i = 0; i < values.size(); ++i){ - values[i] = TestValue(vtkm::Id(2 * i), Value()); - } - ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values); - IdArrayHandle out_handle; - timer.Reset(); - Algorithm::UpperBounds(input_handle, value_handle, out_handle); - vtkm::Float64 elapsed = timer.GetElapsedTime(); - std::cout << "UpperBounds on " << ARRAY_SIZE << " input and " - << n_vals << " values took " << elapsed << "s\n"; - } + VTKM_CONT_EXPORT + std::string Description() const { + std::stringstream description; + description << "UpperBounds on " << ARRAY_SIZE << " input and " + << N_VALS << " values"; + return description.str(); } }; + VTKM_MAKE_BENCHMARK(UpperBounds5, BenchUpperBounds, 5); + VTKM_MAKE_BENCHMARK(UpperBounds10, BenchUpperBounds, 10); + VTKM_MAKE_BENCHMARK(UpperBounds15, BenchUpperBounds, 15); + VTKM_MAKE_BENCHMARK(UpperBounds20, BenchUpperBounds, 20); + VTKM_MAKE_BENCHMARK(UpperBounds25, BenchUpperBounds, 25); + VTKM_MAKE_BENCHMARK(UpperBounds30, BenchUpperBounds, 30); public: @@ -395,40 +564,89 @@ public: vtkm::Float64, vtkm::Vec, vtkm::Vec >{}; + static VTKM_CONT_EXPORT int Run(int benchmarks){ + std::cout << DIVIDER << "\nRunning DeviceAdapter benchmarks\n"; - static VTKM_CONT_EXPORT int Run(){ - std::cout << DIVIDER << "\nRunning DeviceAdapter 
benchmarks\n"; - + if (benchmarks & LOWER_BOUNDS){ std::cout << DIVIDER << "\nBenchmarking LowerBounds\n"; - vtkm::testing::Testing::TryTypes(BenchLowerBounds(), ValueTypes()); + VTKM_RUN_BENCHMARK(LowerBounds5, ValueTypes()); + VTKM_RUN_BENCHMARK(LowerBounds10, ValueTypes()); + VTKM_RUN_BENCHMARK(LowerBounds15, ValueTypes()); + VTKM_RUN_BENCHMARK(LowerBounds20, ValueTypes()); + VTKM_RUN_BENCHMARK(LowerBounds25, ValueTypes()); + VTKM_RUN_BENCHMARK(LowerBounds30, ValueTypes()); + } + if (benchmarks & REDUCE){ std::cout << "\n" << DIVIDER << "\nBenchmarking Reduce\n"; - vtkm::testing::Testing::TryTypes(BenchReduce(), ValueTypes()); + VTKM_RUN_BENCHMARK(Reduce, ValueTypes()); + } + if (benchmarks & REDUCE_BY_KEY){ std::cout << "\n" << DIVIDER << "\nBenchmarking ReduceByKey\n"; - vtkm::testing::Testing::TryTypes(BenchReduceByKey(), ValueTypes()); + VTKM_RUN_BENCHMARK(ReduceByKey5, ValueTypes()); + VTKM_RUN_BENCHMARK(ReduceByKey10, ValueTypes()); + VTKM_RUN_BENCHMARK(ReduceByKey15, ValueTypes()); + VTKM_RUN_BENCHMARK(ReduceByKey20, ValueTypes()); + VTKM_RUN_BENCHMARK(ReduceByKey25, ValueTypes()); + VTKM_RUN_BENCHMARK(ReduceByKey30, ValueTypes()); + } + if (benchmarks & SCAN_INCLUSIVE){ std::cout << "\n" << DIVIDER << "\nBenchmarking ScanInclusive\n"; - vtkm::testing::Testing::TryTypes(BenchScanInclusive(), ValueTypes()); + VTKM_RUN_BENCHMARK(ScanInclusive, ValueTypes()); + } + if (benchmarks & SCAN_EXCLUSIVE){ std::cout << "\n" << DIVIDER << "\nBenchmarking ScanExclusive\n"; - vtkm::testing::Testing::TryTypes(BenchScanExclusive(), ValueTypes()); + VTKM_RUN_BENCHMARK(ScanExclusive, ValueTypes()); + } + if (benchmarks & SORT){ std::cout << "\n" << DIVIDER << "\nBenchmarking Sort\n"; - vtkm::testing::Testing::TryTypes(BenchSort(), ValueTypes()); + VTKM_RUN_BENCHMARK(Sort, ValueTypes()); + } + if (benchmarks & SORT_BY_KEY){ std::cout << "\n" << DIVIDER << "\nBenchmarking SortByKey\n"; - vtkm::testing::Testing::TryTypes(BenchSortByKey(), ValueTypes()); + VTKM_RUN_BENCHMARK(SortByKey5, ValueTypes()); + VTKM_RUN_BENCHMARK(SortByKey10, ValueTypes()); + VTKM_RUN_BENCHMARK(SortByKey15, ValueTypes()); + VTKM_RUN_BENCHMARK(SortByKey20, ValueTypes()); + VTKM_RUN_BENCHMARK(SortByKey25, ValueTypes()); + VTKM_RUN_BENCHMARK(SortByKey30, ValueTypes()); + } + if (benchmarks & STREAM_COMPACT){ std::cout << "\n" << DIVIDER << "\nBenchmarking StreamCompact\n"; - vtkm::testing::Testing::TryTypes(BenchStreamCompact(), ValueTypes()); + VTKM_RUN_BENCHMARK(StreamCompact5, ValueTypes()); + VTKM_RUN_BENCHMARK(StreamCompact10, ValueTypes()); + VTKM_RUN_BENCHMARK(StreamCompact15, ValueTypes()); + VTKM_RUN_BENCHMARK(StreamCompact20, ValueTypes()); + VTKM_RUN_BENCHMARK(StreamCompact25, ValueTypes()); + VTKM_RUN_BENCHMARK(StreamCompact30, ValueTypes()); + } + if (benchmarks & UNIQUE){ std::cout << "\n" << DIVIDER << "\nBenchmarking Unique\n"; - vtkm::testing::Testing::TryTypes(BenchUnique(), ValueTypes()); + VTKM_RUN_BENCHMARK(Unique5, ValueTypes()); + VTKM_RUN_BENCHMARK(Unique10, ValueTypes()); + VTKM_RUN_BENCHMARK(Unique15, ValueTypes()); + VTKM_RUN_BENCHMARK(Unique20, ValueTypes()); + VTKM_RUN_BENCHMARK(Unique25, ValueTypes()); + VTKM_RUN_BENCHMARK(Unique30, ValueTypes()); + } + if (benchmarks & UPPER_BOUNDS){ std::cout << "\n" << DIVIDER << "\nBenchmarking UpperBounds\n"; - vtkm::testing::Testing::TryTypes(BenchUpperBounds(), ValueTypes()); - return 0; + VTKM_RUN_BENCHMARK(UpperBounds5, ValueTypes()); + VTKM_RUN_BENCHMARK(UpperBounds10, ValueTypes()); + VTKM_RUN_BENCHMARK(UpperBounds15, ValueTypes()); + 
VTKM_RUN_BENCHMARK(UpperBounds20, ValueTypes()); + VTKM_RUN_BENCHMARK(UpperBounds25, ValueTypes()); + VTKM_RUN_BENCHMARK(UpperBounds30, ValueTypes()); + } + return 0; } }; diff --git a/vtkm/benchmarking/Benchmarker.h b/vtkm/benchmarking/Benchmarker.h new file mode 100644 index 000000000..48b8915f1 --- /dev/null +++ b/vtkm/benchmarking/Benchmarker.h @@ -0,0 +1,256 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ + +#ifndef vtk_m_benchmarking_Benchmarker_h +#define vtk_m_benchmarking_Benchmarker_h + +#include + +#include +#include +#include + +/* + * Writing a Benchmark + * ------------------- + * To write a benchmark you must provide a functor that will run the operations + * you want to time and return the run time of those operations using the timer + * for the device. The benchmark should also be templated on the value type being + * operated on. Then use VTKM_MAKE_BENCHMARK to generate a maker functor and + * VTKM_RUN_BENCHMARK to run the benchmark on a list of types. + * + * For Example: + * + * template + * struct BenchSilly { + * // Setup anything that doesn't need to change per run in the constructor + * VTKM_CONT_EXPORT BenchSilly(){} + * + * // The overloaded call operator will run the operations being timed and + * // return the execution time + * VTKM_CONT_EXPORT + * vtkm::Float64 operator()(){ + * return 0.05; + * } + * + * // The benchmark must also provide a method describing itself, this is + * // used when printing out run time statistics + * VTKM_CONT_EXPORT + * std::string Description() const { + * return "A silly benchmark"; + * } + * }; + * + * // Now use the VTKM_MAKE_BENCHMARK macro to generate a maker functor for + * // your benchmark. This lets us generate the benchmark functor for each type + * // we want to test + * VTKM_MAKE_BENCHMARK(Silly, BenchSilly); + * + * // You can also optionally pass arguments to the constructor like so: + * // VTKM_MAKE_BENCHMARK(Blah, BenchBlah, 1, 2, 3); + * // Note that benchmark names (the first argument) must be unique so different + * // parameters to the constructor should have different names + * + * // We can now run our benchmark using VTKM_RUN_BENCHMARK, passing the + * // benchmark name and type list to run on + * int main(int, char**){ + * VTKM_RUN_BENCHMARK(Silly, vtkm::ListTagBase()); + * return 0; + * } + * + * Check out vtkm/benchmarking/BenchmarkDeviceAdapter.h for some example usage + */ + +/* + * Use the VTKM_MAKE_BENCHMARK macro to define a maker functor for your benchmark. + * This is used to allow you to template the benchmark functor on the type being benchmarked + * so you can write init code in the constructor. 
Then the maker will return a constructed + * instance of your benchmark for the type being benchmarked. The VA_ARGS are used to + * pass any extra arguments needed by your benchmark + */ +#define VTKM_MAKE_BENCHMARK(Name, Bench, ...) \ + struct MakeBench##Name { \ + template \ + VTKM_CONT_EXPORT \ + Bench operator()(const Value vtkmNotUsed(v)) const { \ + return Bench(__VA_ARGS__); \ + } \ + } + +/* + * Use the VTKM_RUN_BENCHMARK macro to run your benchmark on the type list passed. + * You must have previously defined a maker functor with VTKM_MAKE_BENCHMARK that this + * macro will look for and use + */ +#define VTKM_RUN_BENCHMARK(Name, Types) \ + vtkm::benchmarking::BenchmarkTypes(MakeBench##Name(), (Types)) + +namespace vtkm { +namespace benchmarking { +namespace stats { + +// Get the value representing the `percent` percentile of the +// sorted samples using linear interpolation +vtkm::Float64 PercentileValue(const std::vector &samples, const vtkm::Float64 percent){ + VTKM_ASSERT_CONT(!samples.empty()); + if (samples.size() == 1){ + return samples.front(); + } + VTKM_ASSERT_CONT(percent >= 0.0); + VTKM_ASSERT_CONT(percent <= 100.0); + VTKM_ASSERT_CONT(std::is_sorted(samples.begin(), samples.end())); + if (percent == 100.0){ + return samples.back(); + } + // Find the two nearest percentile values and linearly + // interpolate between them + const vtkm::Float64 rank = percent / 100.0 * (static_cast(samples.size()) - 1.0); + const vtkm::Float64 low_rank = vtkm::Floor(rank); + const vtkm::Float64 dist = rank - low_rank; + const size_t k = static_cast(low_rank); + const vtkm::Float64 low = samples[k]; + const vtkm::Float64 high = samples[k + 1]; + return low + (high - low) * dist; +} +// Winsorize the samples to clean up any very extreme outliers +// Will replace all samples below `percent` and above 100 - `percent` percentiles +// with the value at the percentile +// NOTE: Assumes the samples have been sorted, as we make use of PercentileValue +void Winsorize(std::vector &samples, const vtkm::Float64 percent){ + const vtkm::Float64 low_percentile = PercentileValue(samples, percent); + const vtkm::Float64 high_percentile = PercentileValue(samples, 100.0 - percent); + for (std::vector::iterator it = samples.begin(); it != samples.end(); ++it){ + if (*it < low_percentile){ + *it = low_percentile; + } + else if (*it > high_percentile){ + *it = high_percentile; + } + } +} +// Compute the mean value of the dataset +vtkm::Float64 Mean(const std::vector &samples){ + vtkm::Float64 mean = 0; + for (std::vector::const_iterator it = samples.begin(); it != samples.end(); ++it){ + mean += *it; + } + return mean / static_cast(samples.size()); +} +// Compute the sample variance of the samples +vtkm::Float64 Variance(const std::vector &samples){ + vtkm::Float64 mean = Mean(samples); + vtkm::Float64 square_deviations = 0; + for (std::vector::const_iterator it = samples.begin(); it != samples.end(); ++it){ + square_deviations += vtkm::Pow(*it - mean, 2.0); + } + return square_deviations / (static_cast(samples.size()) - 1.0); +} +// Compute the standard deviation of the samples +vtkm::Float64 StandardDeviation(const std::vector &samples){ + return vtkm::Sqrt(Variance(samples)); +} +// Compute the median absolute deviation of the dataset +vtkm::Float64 MedianAbsDeviation(const std::vector &samples){ + std::vector abs_deviations; + abs_deviations.reserve(samples.size()); + const vtkm::Float64 median = PercentileValue(samples, 50.0); + for (std::vector::const_iterator it = samples.begin(); it != samples.end(); 
++it){ + abs_deviations.push_back(vtkm::Abs(*it - median)); + } + return PercentileValue(abs_deviations, 50.0); +} +} // stats + +/* + * The benchmarker takes a functor to benchmark and runs it multiple times, + * printing out statistics of the run time at the end. + * The functor passed should return the run time of the thing being benchmarked + * in seconds, this lets us avoid including any per-run setup time in the benchmark. + * However any one-time setup should be done in the functor's constructor + */ +struct Benchmarker { + const vtkm::Float64 MAX_RUNTIME; + const size_t MAX_ITERATIONS; + + Benchmarker() : MAX_RUNTIME(1.5), MAX_ITERATIONS(500){} + + template + VTKM_CONT_EXPORT + void operator()(Functor func) const { + std::vector samples; + // Do a warm-up run. If the benchmark allocates any additional memory + // eg. storage for output results, this will let it do that and + // allow us to avoid measuring the allocation time in the actual benchmark run + func(); + + samples.reserve(MAX_ITERATIONS); + // Run each benchmark for MAX_RUNTIME seconds or MAX_ITERATIONS iterations, whichever + // takes less time. This kind of assumes that running for 500 iterations or 1.5s will give + // good statistics, but if median abs dev and/or std dev are too high both these limits + // could be increased + size_t iter = 0; + for (vtkm::Float64 elapsed = 0.0; elapsed < MAX_RUNTIME && iter < MAX_ITERATIONS; + elapsed += samples.back(), ++iter) + { + samples.push_back(func()); + } + std::sort(samples.begin(), samples.end()); + stats::Winsorize(samples, 5.0); + std::cout << "Benchmark \'" + << func.Description() << "\' results:\n" + << "\tmedian = " << stats::PercentileValue(samples, 50.0) << "s\n" + << "\tmedian abs dev = " << stats::MedianAbsDeviation(samples) << "s\n" + << "\tmean = " << stats::Mean(samples) << "s\n" + << "\tstd dev = " << stats::StandardDeviation(samples) << "s\n" + << "\tmin = " << samples.front() << "s\n" + << "\tmax = " << samples.back() << "s\n"; + } +}; + +template +class InternalPrintTypeAndBench { + MakerFunctor Maker; + +public: + VTKM_CONT_EXPORT + InternalPrintTypeAndBench(MakerFunctor maker) : Maker(maker) {} + + template + VTKM_CONT_EXPORT + void operator()(T t) const { + std::cout << "*** " + << vtkm::testing::TypeName::Name() + << " ***************" << std::endl; + Benchmarker bench; + bench(Maker(t)); + } +}; + +template +VTKM_CONT_EXPORT +void BenchmarkTypes(const MakerFunctor &maker, TypeList){ + vtkm::ListForEach(InternalPrintTypeAndBench(maker), TypeList()); +} + +} +} + +#endif +