Port benchmarking framework to Google Benchmark.

Allison Vacanti 2019-12-26 14:48:51 -05:00
parent 39d981bcf9
commit 539f6e5ad7
14 changed files with 4038 additions and 5465 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -8,23 +8,19 @@
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#include <vtkm/TypeTraits.h>
#include "Benchmarker.h"
#include <vtkm/cont/Algorithm.h>
#include <vtkm/cont/DeviceAdapter.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/ErrorBadAllocation.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/serial/DeviceAdapterSerial.h>
#include <vtkm/internal/Configure.h>
#include <vtkm/testing/Testing.h>
#include <iomanip>
#include <iostream>
#include <vtkm/List.h>
#include <sstream>
#ifdef VTKM_ENABLE_TBB
@@ -34,145 +30,78 @@
// For the TBB implementation, the number of threads can be customized using a
// "NumThreads [numThreads]" argument.
namespace vtkm
{
namespace benchmarking
{
const vtkm::UInt64 COPY_SIZE_MIN = (1 << 10); // 1 KiB
const vtkm::UInt64 COPY_SIZE_MAX = (1 << 29); // 512 MiB
const vtkm::UInt64 COPY_SIZE_INC = 1; // Used as 'size <<= INC'
const size_t COL_WIDTH = 32;
template <typename ValueType, typename DeviceAdapter>
struct MeasureCopySpeed
{
using Algo = vtkm::cont::Algorithm;
vtkm::cont::ArrayHandle<ValueType> Source;
vtkm::cont::ArrayHandle<ValueType> Destination;
vtkm::UInt64 NumBytes;
VTKM_CONT
MeasureCopySpeed(vtkm::UInt64 bytes)
: NumBytes(bytes)
{
vtkm::Id numValues = static_cast<vtkm::Id>(bytes / sizeof(ValueType));
this->Source.Allocate(numValues);
}
VTKM_CONT vtkm::Float64 operator()()
{
vtkm::cont::Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Copy(this->Source, this->Destination);
return timer.GetElapsedTime();
}
VTKM_CONT std::string Description() const
{
vtkm::UInt64 actualSize = sizeof(ValueType);
actualSize *= static_cast<vtkm::UInt64>(this->Source.GetNumberOfValues());
std::ostringstream out;
out << "Copying " << vtkm::cont::GetHumanReadableSize(this->NumBytes)
<< " (actual=" << vtkm::cont::GetHumanReadableSize(actualSize) << ") of "
<< vtkm::testing::TypeName<ValueType>::Name() << "\n";
return out.str();
}
};
void PrintRow(std::ostream& out, const std::string& label, const std::string& data)
{
out << "| " << std::setw(COL_WIDTH) << label << " | " << std::setw(COL_WIDTH) << data << " |"
<< std::endl;
}
void PrintDivider(std::ostream& out)
{
const std::string fillStr(COL_WIDTH, '-');
out << "|-" << fillStr << "-|-" << fillStr << "-|" << std::endl;
}
template <typename ValueType, typename DeviceAdapter>
void BenchmarkValueType(vtkm::cont::DeviceAdapterId id)
{
PrintRow(std::cout, vtkm::testing::TypeName<ValueType>::Name(), id.GetName());
PrintDivider(std::cout);
Benchmarker bench(15, 100);
for (vtkm::UInt64 size = COPY_SIZE_MIN; size <= COPY_SIZE_MAX; size <<= COPY_SIZE_INC)
{
MeasureCopySpeed<ValueType, DeviceAdapter> functor(size);
bench.Reset();
std::string speedStr;
try
{
bench.GatherSamples(functor);
vtkm::Float64 speed = static_cast<Float64>(size) / stats::Mean(bench.GetSamples());
speedStr = vtkm::cont::GetHumanReadableSize(static_cast<UInt64>(speed)) + std::string("/s");
}
catch (vtkm::cont::ErrorBadAllocation&)
{
speedStr = "[allocation too large]";
}
PrintRow(std::cout, vtkm::cont::GetHumanReadableSize(size), speedStr);
}
std::cout << "\n";
}
}
} // end namespace vtkm::benchmarking
namespace
{
using namespace vtkm::benchmarking;
struct BenchmarkValueTypeFunctor
// Make this global so benchmarks can access the current device id:
vtkm::cont::InitializeResult Config;
const vtkm::UInt64 COPY_SIZE_MIN = (1 << 10); // 1 KiB
const vtkm::UInt64 COPY_SIZE_MAX = (1 << 30); // 1 GiB
using TypeList = vtkm::List<vtkm::UInt8,
vtkm::Vec2ui_8,
vtkm::Vec3ui_8,
vtkm::Vec4ui_8,
vtkm::UInt32,
vtkm::Vec2ui_32,
vtkm::UInt64,
vtkm::Vec2ui_64,
vtkm::Float32,
vtkm::Vec2f_32,
vtkm::Float64,
vtkm::Vec2f_64,
vtkm::Pair<vtkm::UInt32, vtkm::Float32>,
vtkm::Pair<vtkm::UInt32, vtkm::Float64>,
vtkm::Pair<vtkm::UInt64, vtkm::Float32>,
vtkm::Pair<vtkm::UInt64, vtkm::Float64>>;
template <typename ValueType>
void CopySpeed(benchmark::State& state)
{
template <typename DeviceAdapter>
bool operator()(DeviceAdapter id)
const vtkm::cont::DeviceAdapterId device = Config.Device;
const vtkm::UInt64 numBytes = static_cast<vtkm::UInt64>(state.range(0));
const vtkm::Id numValues = static_cast<vtkm::Id>(numBytes / sizeof(ValueType));
state.SetLabel(vtkm::cont::GetHumanReadableSize(numBytes));
vtkm::cont::ArrayHandle<ValueType> src;
vtkm::cont::ArrayHandle<ValueType> dst;
src.Allocate(numValues);
dst.Allocate(numValues);
vtkm::cont::Timer timer(device);
for (auto _ : state)
{
BenchmarkValueType<vtkm::UInt8, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec2ui_8, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec3ui_8, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec4ui_8, DeviceAdapter>(id);
(void)_;
timer.Start();
vtkm::cont::Algorithm::Copy(device, src, dst);
timer.Stop();
BenchmarkValueType<vtkm::UInt32, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec2ui_32, DeviceAdapter>(id);
BenchmarkValueType<vtkm::UInt64, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec2ui_64, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Float32, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec2f_32, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Float64, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec2f_64, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt32, vtkm::Float32>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt32, vtkm::Float64>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt64, vtkm::Float32>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt64, vtkm::Float64>, DeviceAdapter>(id);
return true;
state.SetIterationTime(timer.GetElapsedTime());
}
};
const int64_t iterations = static_cast<int64_t>(state.iterations());
state.SetBytesProcessed(static_cast<int64_t>(numBytes) * iterations);
state.SetItemsProcessed(static_cast<int64_t>(numValues) * iterations);
}
VTKM_BENCHMARK_TEMPLATES_OPTS(CopySpeed,
->Range(COPY_SIZE_MIN, COPY_SIZE_MAX)
->ArgName("Bytes"),
TypeList);
} // end anon namespace
int main(int argc, char* argv[])
{
auto opts = vtkm::cont::InitializeOptions::RequireDevice |
vtkm::cont::InitializeOptions::ErrorOnBadOption | vtkm::cont::InitializeOptions::AddHelp;
auto config = vtkm::cont::Initialize(argc, argv, opts);
// Parse VTK-m options:
auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp;
Config = vtkm::cont::Initialize(argc, argv, opts);
// Set up device:
vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
// Handle NumThreads command-line arg:
#ifdef VTKM_ENABLE_TBB
int numThreads = tbb::task_scheduler_init::automatic;
#endif // TBB
@@ -196,6 +125,6 @@ int main(int argc, char* argv[])
tbb::task_scheduler_init init(numThreads);
#endif // TBB
BenchmarkValueTypeFunctor functor;
vtkm::cont::TryExecuteOnDevice(config.Device, functor);
// Handle benchmarking-related args and run benchmarks:
VTKM_EXECUTE_BENCHMARKS(argc, argv);
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -14,6 +14,7 @@
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/testing/MakeTestDataSet.h>
@@ -31,114 +32,97 @@
#include <string>
#include <vector>
using namespace vtkm::benchmarking;
namespace vtkm
{
namespace benchmarking
namespace
{
template <typename Precision, typename DeviceAdapter>
struct BenchRayTracing
// Hold configuration state (e.g. active device)
vtkm::cont::InitializeResult Config;
void BenchRayTracing(::benchmark::State& state)
{
vtkm::rendering::raytracing::RayTracer Tracer;
vtkm::rendering::raytracing::Camera RayCamera;
vtkm::cont::ArrayHandle<vtkm::Id4> Indices;
vtkm::rendering::raytracing::Ray<Precision> Rays;
vtkm::cont::CoordinateSystem Coords;
vtkm::cont::DataSet Data;
const vtkm::Id3 dims(128, 128, 128);
VTKM_CONT ~BenchRayTracing() {}
vtkm::cont::testing::MakeTestDataSet maker;
auto dataset = maker.Make3DUniformDataSet3(dims);
auto coords = dataset.GetCoordinateSystem();
VTKM_CONT BenchRayTracing()
vtkm::rendering::Camera camera;
vtkm::Bounds bounds = dataset.GetCoordinateSystem().GetBounds();
camera.ResetToBounds(bounds);
vtkm::cont::DynamicCellSet cellset = dataset.GetCellSet();
vtkm::rendering::raytracing::TriangleExtractor triExtractor;
triExtractor.ExtractCells(cellset);
auto triIntersector = std::make_shared<vtkm::rendering::raytracing::TriangleIntersector>(
vtkm::rendering::raytracing::TriangleIntersector());
vtkm::rendering::raytracing::RayTracer tracer;
triIntersector->SetData(coords, triExtractor.GetTriangles());
tracer.AddShapeIntersector(triIntersector);
vtkm::rendering::CanvasRayTracer canvas(1920, 1080);
vtkm::rendering::raytracing::Camera rayCamera;
rayCamera.SetParameters(camera, canvas);
vtkm::rendering::raytracing::Ray<vtkm::Float32> rays;
rayCamera.CreateRays(rays, coords.GetBounds());
rays.Buffers.at(0).InitConst(0.f);
vtkm::cont::Field field = dataset.GetField("pointvar");
vtkm::Range range = field.GetRange().GetPortalConstControl().Get(0);
tracer.SetField(field, range);
vtkm::cont::ArrayHandle<vtkm::Vec4ui_8> temp;
vtkm::cont::ColorTable table("cool to warm");
table.Sample(100, temp);
vtkm::cont::ArrayHandle<vtkm::Vec4f_32> colors;
colors.Allocate(100);
auto portal = colors.GetPortalControl();
auto colorPortal = temp.GetPortalConstControl();
constexpr vtkm::Float32 conversionToFloatSpace = (1.0f / 255.0f);
for (vtkm::Id i = 0; i < 100; ++i)
{
vtkm::Id3 dims(128, 128, 128);
vtkm::cont::testing::MakeTestDataSet maker;
Data = maker.Make3DUniformDataSet3(dims);
Coords = Data.GetCoordinateSystem();
vtkm::rendering::Camera camera;
vtkm::Bounds bounds = Data.GetCoordinateSystem().GetBounds();
camera.ResetToBounds(bounds);
vtkm::cont::DynamicCellSet cellset = Data.GetCellSet();
vtkm::rendering::raytracing::TriangleExtractor triExtractor;
triExtractor.ExtractCells(cellset);
auto triIntersector = std::make_shared<vtkm::rendering::raytracing::TriangleIntersector>(
vtkm::rendering::raytracing::TriangleIntersector());
triIntersector->SetData(Coords, triExtractor.GetTriangles());
Tracer.AddShapeIntersector(triIntersector);
vtkm::rendering::CanvasRayTracer canvas(1920, 1080);
RayCamera.SetParameters(camera, canvas);
RayCamera.CreateRays(Rays, Coords.GetBounds());
Rays.Buffers.at(0).InitConst(0.f);
vtkm::cont::Field field = Data.GetField("pointvar");
vtkm::Range range = field.GetRange().GetPortalConstControl().Get(0);
Tracer.SetField(field, range);
vtkm::cont::ArrayHandle<vtkm::Vec4ui_8> temp;
vtkm::cont::ColorTable table("cool to warm");
table.Sample(100, temp);
vtkm::cont::ArrayHandle<vtkm::Vec4f_32> colors;
colors.Allocate(100);
auto portal = colors.GetPortalControl();
auto colorPortal = temp.GetPortalConstControl();
constexpr vtkm::Float32 conversionToFloatSpace = (1.0f / 255.0f);
for (vtkm::Id i = 0; i < 100; ++i)
{
auto color = colorPortal.Get(i);
vtkm::Vec4f_32 t(color[0] * conversionToFloatSpace,
color[1] * conversionToFloatSpace,
color[2] * conversionToFloatSpace,
color[3] * conversionToFloatSpace);
portal.Set(i, t);
}
Tracer.SetColorMap(colors);
Tracer.Render(Rays);
auto color = colorPortal.Get(i);
vtkm::Vec4f_32 t(color[0] * conversionToFloatSpace,
color[1] * conversionToFloatSpace,
color[2] * conversionToFloatSpace,
color[3] * conversionToFloatSpace);
portal.Set(i, t);
}
VTKM_CONT
vtkm::Float64 operator()()
tracer.SetColorMap(colors);
tracer.Render(rays);
vtkm::cont::Timer timer{ Config.Device };
for (auto _ : state)
{
vtkm::cont::Timer timer{ DeviceAdapter() };
(void)_;
timer.Start();
rayCamera.CreateRays(rays, coords.GetBounds());
tracer.Render(rays);
timer.Stop();
RayCamera.CreateRays(Rays, Coords.GetBounds());
try
{
Tracer.Render(Rays);
}
catch (vtkm::cont::ErrorBadValue& e)
{
std::cout << "exception " << e.what() << "\n";
}
return timer.GetElapsedTime();
state.SetIterationTime(timer.GetElapsedTime());
}
VTKM_CONT
std::string Description() const { return "A ray tracing benchmark"; }
};
VTKM_MAKE_BENCHMARK(RayTracing, BenchRayTracing);
}
} // end namespace vtkm::benchmarking
VTKM_BENCHMARK(BenchRayTracing);
} // end anon namespace
int main(int argc, char* argv[])
{
auto opts =
vtkm::cont::InitializeOptions::DefaultAnyDevice | vtkm::cont::InitializeOptions::Strict;
auto config = vtkm::cont::Initialize(argc, argv, opts);
// Parse VTK-m options:
auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp;
Config = vtkm::cont::Initialize(argc, argv, opts);
VTKM_RUN_BENCHMARK(RayTracing, vtkm::List<vtkm::Float32>(), config.Device);
return 0;
// Set up device:
vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
// Handle benchmarking-related args and run benchmarks:
VTKM_EXECUTE_BENCHMARKS(argc, argv);
}

@@ -7,40 +7,37 @@
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#include "Benchmarker.h"
#include <vtkm/Math.h>
#include <vtkm/VectorAnalysis.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/CellSetStructured.h>
#include <vtkm/cont/Invoker.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/worklet/DispatcherMapField.h>
#include <vtkm/worklet/DispatcherMapTopology.h>
#include <vtkm/worklet/WorkletMapField.h>
#include <vtkm/worklet/WorkletMapTopology.h>
#include "Benchmarker.h"
#include <vtkm/cont/testing/Testing.h>
#include <cctype>
#include <random>
#include <string>
namespace vtkm
{
namespace benchmarking
namespace
{
#define CUBE_SIZE 256
static const std::string DIVIDER(40, '-');
enum BenchmarkName
{
CELL_TO_POINT = 1 << 1,
POINT_TO_CELL = 1 << 2,
MC_CLASSIFY = 1 << 3,
ALL = CELL_TO_POINT | POINT_TO_CELL | MC_CLASSIFY
};
using ValueTypes = vtkm::List<vtkm::UInt32, vtkm::Int32, vtkm::Int64, vtkm::Float32, vtkm::Float64>;
using ValueVariantHandle = vtkm::cont::VariantArrayHandleBase<ValueTypes>;
// Hold configuration state (e.g. active device)
vtkm::cont::InitializeResult Config;
class AveragePointToCell : public vtkm::worklet::WorkletVisitCellsWithPoints
{
@@ -118,373 +115,277 @@ public:
}
};
using ValueTypes = vtkm::List<vtkm::UInt32, vtkm::Int32, vtkm::Int64, vtkm::Float32, vtkm::Float64>;
/// This class runs a series of micro-benchmarks to measure
/// performance of different field operations
class BenchmarkTopologyAlgorithms
template <typename T, typename Enable = void>
struct NumberGenerator
{
using StorageTag = vtkm::cont::StorageTagBasic;
};
using Timer = vtkm::cont::Timer;
using ValueVariantHandle = vtkm::cont::VariantArrayHandleBase<ValueTypes>;
private:
template <typename T, typename Enable = void>
struct NumberGenerator
template <typename T>
struct NumberGenerator<T, typename std::enable_if<std::is_floating_point<T>::value>::type>
{
std::mt19937 rng;
std::uniform_real_distribution<T> distribution;
NumberGenerator(T low, T high)
: rng()
, distribution(low, high)
{
};
}
T next() { return distribution(rng); }
};
template <typename T>
struct NumberGenerator<T, typename std::enable_if<std::is_floating_point<T>::value>::type>
template <typename T>
struct NumberGenerator<T, typename std::enable_if<!std::is_floating_point<T>::value>::type>
{
std::mt19937 rng;
std::uniform_int_distribution<T> distribution;
NumberGenerator(T low, T high)
: rng()
, distribution(low, high)
{
std::mt19937 rng;
std::uniform_real_distribution<T> distribution;
NumberGenerator(T low, T high)
: rng()
, distribution(low, high)
{
}
T next() { return distribution(rng); }
};
}
T next() { return distribution(rng); }
};
template <typename T>
struct NumberGenerator<T, typename std::enable_if<!std::is_floating_point<T>::value>::type>
// Returns an extra random value.
// Like, an additional random value.
// Not a random value that's somehow "extra random".
template <typename ArrayT>
VTKM_CONT typename ArrayT::ValueType FillRandomValues(ArrayT& array,
vtkm::Id size,
vtkm::Float64 min,
vtkm::Float64 max)
{
using ValueType = typename ArrayT::ValueType;
NumberGenerator<ValueType> generator{ static_cast<ValueType>(min), static_cast<ValueType>(max) };
array.Allocate(size);
auto portal = array.GetPortalControl();
for (vtkm::Id i = 0; i < size; ++i)
{
std::mt19937 rng;
std::uniform_int_distribution<T> distribution;
portal.Set(i, generator.next());
}
return generator.next();
}
NumberGenerator(T low, T high)
: rng()
, distribution(low, high)
{
}
T next() { return distribution(rng); }
};
template <typename Value>
struct BenchCellToPointAvgImpl
{
vtkm::cont::ArrayHandle<Value> Input;
template <typename Value, typename DeviceAdapter>
struct BenchCellToPointAvg
::benchmark::State& State;
vtkm::Id CubeSize;
vtkm::Id NumCells;
vtkm::cont::Timer Timer;
vtkm::cont::Invoker Invoker;
VTKM_CONT
BenchCellToPointAvgImpl(::benchmark::State& state)
: State{ state }
, CubeSize{ CUBE_SIZE }
, NumCells{ (this->CubeSize - 1) * (this->CubeSize - 1) * (this->CubeSize - 1) }
, Timer{ Config.Device }
, Invoker{ Config.Device }
{
std::vector<Value> input;
vtkm::cont::ArrayHandle<Value, StorageTag> InputHandle;
std::size_t DomainSize;
FillRandomValues(this->Input, this->NumCells, 1., 100.);
VTKM_CONT
BenchCellToPointAvg()
{
NumberGenerator<Value> generator(static_cast<Value>(1.0), static_cast<Value>(100.0));
//cube size is points in each dim
this->DomainSize = (CUBE_SIZE - 1) * (CUBE_SIZE - 1) * (CUBE_SIZE - 1);
this->input.resize(DomainSize);
for (std::size_t i = 0; i < DomainSize; ++i)
{
this->input[i] = generator.next();
}
this->InputHandle = vtkm::cont::make_ArrayHandle(this->input);
{ // Configure label:
std::ostringstream desc;
desc << "CubeSize:" << this->CubeSize;
this->State.SetLabel(desc.str());
}
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE));
vtkm::cont::ArrayHandle<Value, StorageTag> result;
Timer timer{ DeviceAdapter() };
timer.Start();
vtkm::worklet::DispatcherMapTopology<AverageCellToPoint> dispatcher;
dispatcher.Invoke(this->InputHandle, cellSet, result);
return timer.GetElapsedTime();
}
virtual std::string Type() const { return std::string("Static"); }
VTKM_CONT
std::string Description() const
{
std::stringstream description;
description << "Computing Cell To Point Average "
<< "[" << this->Type() << "] "
<< "with a domain size of: " << this->DomainSize;
return description.str();
}
};
template <typename Value, typename DeviceAdapter>
struct BenchCellToPointAvgDynamic : public BenchCellToPointAvg<Value, DeviceAdapter>
template <typename BenchArrayType>
VTKM_CONT void Run(const BenchArrayType& input)
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3{ this->CubeSize, this->CubeSize, this->CubeSize });
vtkm::cont::ArrayHandle<Value> result;
VTKM_CONT
vtkm::Float64 operator()()
for (auto _ : this->State)
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE));
(void)_;
this->Timer.Start();
this->Invoker(AverageCellToPoint{}, input, cellSet, result);
this->Timer.Stop();
ValueVariantHandle dinput(this->InputHandle);
vtkm::cont::ArrayHandle<Value, StorageTag> result;
Timer timer{ DeviceAdapter() };
timer.Start();
vtkm::worklet::DispatcherMapTopology<AverageCellToPoint> dispatcher;
dispatcher.Invoke(dinput, cellSet, result);
return timer.GetElapsedTime();
this->State.SetIterationTime(this->Timer.GetElapsedTime());
}
virtual std::string Type() const { return std::string("Dynamic"); }
};
VTKM_MAKE_BENCHMARK(CellToPointAvg, BenchCellToPointAvg);
VTKM_MAKE_BENCHMARK(CellToPointAvgDynamic, BenchCellToPointAvgDynamic);
template <typename Value, typename DeviceAdapter>
struct BenchPointToCellAvg
{
std::vector<Value> input;
vtkm::cont::ArrayHandle<Value, StorageTag> InputHandle;
std::size_t DomainSize;
VTKM_CONT
BenchPointToCellAvg()
{
NumberGenerator<Value> generator(static_cast<Value>(1.0), static_cast<Value>(100.0));
this->DomainSize = (CUBE_SIZE) * (CUBE_SIZE) * (CUBE_SIZE);
this->input.resize(DomainSize);
for (std::size_t i = 0; i < DomainSize; ++i)
{
this->input[i] = generator.next();
}
this->InputHandle = vtkm::cont::make_ArrayHandle(this->input);
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE));
vtkm::cont::ArrayHandle<Value, StorageTag> result;
Timer timer{ DeviceAdapter() };
timer.Start();
vtkm::worklet::DispatcherMapTopology<AveragePointToCell> dispatcher;
dispatcher.Invoke(this->InputHandle, cellSet, result);
return timer.GetElapsedTime();
}
virtual std::string Type() const { return std::string("Static"); }
VTKM_CONT
std::string Description() const
{
std::stringstream description;
description << "Computing Point To Cell Average "
<< "[" << this->Type() << "] "
<< "with a domain size of: " << this->DomainSize;
return description.str();
}
};
template <typename Value, typename DeviceAdapter>
struct BenchPointToCellAvgDynamic : public BenchPointToCellAvg<Value, DeviceAdapter>
{
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE));
ValueVariantHandle dinput(this->InputHandle);
vtkm::cont::ArrayHandle<Value, StorageTag> result;
Timer timer{ DeviceAdapter() };
timer.Start();
vtkm::worklet::DispatcherMapTopology<AveragePointToCell> dispatcher;
dispatcher.Invoke(dinput, cellSet, result);
return timer.GetElapsedTime();
}
virtual std::string Type() const { return std::string("Dynamic"); }
};
VTKM_MAKE_BENCHMARK(PointToCellAvg, BenchPointToCellAvg);
VTKM_MAKE_BENCHMARK(PointToCellAvgDynamic, BenchPointToCellAvgDynamic);
template <typename Value, typename DeviceAdapter>
struct BenchClassification
{
std::vector<Value> input;
vtkm::cont::ArrayHandle<Value, StorageTag> InputHandle;
Value IsoValue;
size_t DomainSize;
VTKM_CONT
BenchClassification()
{
NumberGenerator<Value> generator(static_cast<Value>(1.0), static_cast<Value>(100.0));
this->DomainSize = (CUBE_SIZE) * (CUBE_SIZE) * (CUBE_SIZE);
this->input.resize(DomainSize);
for (std::size_t i = 0; i < DomainSize; ++i)
{
this->input[i] = generator.next();
}
this->InputHandle = vtkm::cont::make_ArrayHandle(this->input);
this->IsoValue = generator.next();
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE));
vtkm::cont::ArrayHandle<vtkm::IdComponent, StorageTag> result;
ValueVariantHandle dinput(this->InputHandle);
Timer timer{ DeviceAdapter() };
timer.Start();
Classification<Value> worklet(this->IsoValue);
vtkm::worklet::DispatcherMapTopology<Classification<Value>> dispatcher(worklet);
dispatcher.Invoke(dinput, cellSet, result);
return timer.GetElapsedTime();
}
virtual std::string Type() const { return std::string("Static"); }
VTKM_CONT
std::string Description() const
{
std::stringstream description;
description << "Computing Marching Cubes Classification "
<< "[" << this->Type() << "] "
<< "with a domain size of: " << this->DomainSize;
return description.str();
}
};
template <typename Value, typename DeviceAdapter>
struct BenchClassificationDynamic : public BenchClassification<Value, DeviceAdapter>
{
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE));
vtkm::cont::ArrayHandle<vtkm::IdComponent, StorageTag> result;
Timer timer{ DeviceAdapter() };
timer.Start();
Classification<Value> worklet(this->IsoValue);
vtkm::worklet::DispatcherMapTopology<Classification<Value>> dispatcher(worklet);
dispatcher.Invoke(this->InputHandle, cellSet, result);
timer.Stop();
return timer.GetElapsedTime();
}
virtual std::string Type() const { return std::string("Dynamic"); }
};
VTKM_MAKE_BENCHMARK(Classification, BenchClassification);
VTKM_MAKE_BENCHMARK(ClassificationDynamic, BenchClassificationDynamic);
public:
static VTKM_CONT int Run(int benchmarks, vtkm::cont::DeviceAdapterId id)
{
std::cout << DIVIDER << "\nRunning Topology Algorithm benchmarks\n";
if (benchmarks & CELL_TO_POINT)
{
std::cout << DIVIDER << "\nBenchmarking Cell To Point Average\n";
VTKM_RUN_BENCHMARK(CellToPointAvg, ValueTypes(), id);
VTKM_RUN_BENCHMARK(CellToPointAvgDynamic, ValueTypes(), id);
}
if (benchmarks & POINT_TO_CELL)
{
std::cout << DIVIDER << "\nBenchmarking Point to Cell Average\n";
VTKM_RUN_BENCHMARK(PointToCellAvg, ValueTypes(), id);
VTKM_RUN_BENCHMARK(PointToCellAvgDynamic, ValueTypes(), id);
}
if (benchmarks & MC_CLASSIFY)
{
std::cout << DIVIDER << "\nBenchmarking Hex/Voxel MC Classification\n";
VTKM_RUN_BENCHMARK(Classification, ValueTypes(), id);
VTKM_RUN_BENCHMARK(ClassificationDynamic, ValueTypes(), id);
}
return 0;
// #items = #points
const int64_t iterations = static_cast<int64_t>(this->State.iterations());
this->State.SetItemsProcessed(static_cast<int64_t>(cellSet.GetNumberOfPoints()) * iterations);
}
};
#undef ARRAY_SIZE
}
} // namespace vtkm::benchmarking
int main(int argc, char* argv[])
template <typename ValueType>
void BenchCellToPointAvgStatic(::benchmark::State& state)
{
auto opts = vtkm::cont::InitializeOptions::DefaultAnyDevice;
auto config = vtkm::cont::Initialize(argc, argv, opts);
BenchCellToPointAvgImpl<ValueType> impl{ state };
impl.Run(impl.Input);
};
VTKM_BENCHMARK_TEMPLATES(BenchCellToPointAvgStatic, ValueTypes);
int benchmarks = 0;
if (argc <= 1)
template <typename ValueType>
void BenchCellToPointAvgDynamic(::benchmark::State& state)
{
BenchCellToPointAvgImpl<ValueType> impl{ state };
impl.Run(ValueVariantHandle{ impl.Input });
};
VTKM_BENCHMARK_TEMPLATES(BenchCellToPointAvgDynamic, ValueTypes);
template <typename Value>
struct BenchPointToCellAvgImpl
{
vtkm::cont::ArrayHandle<Value> Input;
::benchmark::State& State;
vtkm::Id CubeSize;
vtkm::Id NumPoints;
vtkm::cont::Timer Timer;
vtkm::cont::Invoker Invoker;
VTKM_CONT
BenchPointToCellAvgImpl(::benchmark::State& state)
: State{ state }
, CubeSize{ CUBE_SIZE }
, NumPoints{ (this->CubeSize) * (this->CubeSize) * (this->CubeSize) }
, Timer{ Config.Device }
, Invoker{ Config.Device }
{
benchmarks = vtkm::benchmarking::ALL;
}
else
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
std::transform(arg.begin(), arg.end(), arg.begin(), [](char c) {
return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
});
if (arg == "celltopoint")
{
benchmarks |= vtkm::benchmarking::CELL_TO_POINT;
}
else if (arg == "pointtocell")
{
benchmarks |= vtkm::benchmarking::POINT_TO_CELL;
}
else if (arg == "classify")
{
benchmarks |= vtkm::benchmarking::MC_CLASSIFY;
}
else
{
std::cerr << "Unrecognized benchmark: " << argv[i] << std::endl;
std::cerr << "USAGE: " << argv[0] << " [options] [<benchmarks>]" << std::endl;
std::cerr << "Options are: " << std::endl;
std::cerr << config.Usage << std::endl;
std::cerr << "Benchmarks are one or more of the following:" << std::endl;
std::cerr << " CellToPoint\tFind average of point data on each cell" << std::endl;
std::cerr << " PointToCell\tFind average of cell data on each point" << std::endl;
std::cerr << " Classify\tFind Marching Cube case of each cell" << std::endl;
std::cerr << "If no benchmarks are specified, all are run." << std::endl;
return 1;
}
FillRandomValues(this->Input, this->NumPoints, 1., 100.);
{ // Configure label:
std::ostringstream desc;
desc << "CubeSize:" << this->CubeSize;
this->State.SetLabel(desc.str());
}
}
//now actually execute the benchmarks
template <typename BenchArrayType>
VTKM_CONT void Run(const BenchArrayType& input)
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3{ this->CubeSize, this->CubeSize, this->CubeSize });
vtkm::cont::ArrayHandle<Value> result;
return vtkm::benchmarking::BenchmarkTopologyAlgorithms::Run(benchmarks, config.Device);
for (auto _ : this->State)
{
(void)_;
this->Timer.Start();
this->Invoker(AveragePointToCell{}, input, cellSet, result);
this->Timer.Stop();
this->State.SetIterationTime(this->Timer.GetElapsedTime());
}
// #items = #cells
const int64_t iterations = static_cast<int64_t>(this->State.iterations());
this->State.SetItemsProcessed(static_cast<int64_t>(cellSet.GetNumberOfCells()) * iterations);
}
};
template <typename ValueType>
void BenchPointToCellAvgStatic(::benchmark::State& state)
{
BenchPointToCellAvgImpl<ValueType> impl{ state };
impl.Run(impl.Input);
};
VTKM_BENCHMARK_TEMPLATES(BenchPointToCellAvgStatic, ValueTypes);
template <typename ValueType>
void BenchPointToCellAvgDynamic(::benchmark::State& state)
{
BenchPointToCellAvgImpl<ValueType> impl{ state };
impl.Run(ValueVariantHandle{ impl.Input });
};
VTKM_BENCHMARK_TEMPLATES(BenchPointToCellAvgDynamic, ValueTypes);
template <typename Value>
struct BenchClassificationImpl
{
vtkm::cont::ArrayHandle<Value> Input;
::benchmark::State& State;
vtkm::Id CubeSize;
vtkm::Id DomainSize;
Value IsoValue;
vtkm::cont::Timer Timer;
vtkm::cont::Invoker Invoker;
VTKM_CONT
BenchClassificationImpl(::benchmark::State& state)
: State{ state }
, CubeSize{ CUBE_SIZE }
, DomainSize{ this->CubeSize * this->CubeSize * this->CubeSize }
, Timer{ Config.Device }
, Invoker{ Config.Device }
{
this->IsoValue = FillRandomValues(this->Input, this->DomainSize, 1., 100.);
{ // Configure label:
std::ostringstream desc;
desc << "CubeSize:" << this->CubeSize;
this->State.SetLabel(desc.str());
}
}
template <typename BenchArrayType>
VTKM_CONT void Run(const BenchArrayType& input)
{
vtkm::cont::CellSetStructured<3> cellSet;
cellSet.SetPointDimensions(vtkm::Id3{ this->CubeSize, this->CubeSize, this->CubeSize });
vtkm::cont::ArrayHandle<vtkm::IdComponent> result;
Classification<Value> worklet(this->IsoValue);
for (auto _ : this->State)
{
(void)_;
this->Timer.Start();
this->Invoker(worklet, input, cellSet, result);
this->Timer.Stop();
this->State.SetIterationTime(this->Timer.GetElapsedTime());
}
// #items = #cells
const int64_t iterations = static_cast<int64_t>(this->State.iterations());
this->State.SetItemsProcessed(static_cast<int64_t>(cellSet.GetNumberOfCells()) * iterations);
}
};
template <typename ValueType>
void BenchClassificationStatic(::benchmark::State& state)
{
BenchClassificationImpl<ValueType> impl{ state };
impl.Run(impl.Input);
};
VTKM_BENCHMARK_TEMPLATES(BenchClassificationStatic, ValueTypes);
template <typename ValueType>
void BenchClassificationDynamic(::benchmark::State& state)
{
BenchClassificationImpl<ValueType> impl{ state };
impl.Run(ValueVariantHandle{ impl.Input });
};
VTKM_BENCHMARK_TEMPLATES(BenchClassificationDynamic, ValueTypes);
} // end anon namespace
int main(int argc, char* argv[])
{
// Parse VTK-m options:
auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp;
Config = vtkm::cont::Initialize(argc, argv, opts);
// Set up device:
vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
// Handle benchmarking-related args and run benchmarks:
VTKM_EXECUTE_BENCHMARKS(argc, argv);
}

@@ -11,330 +11,385 @@
#ifndef vtk_m_benchmarking_Benchmarker_h
#define vtk_m_benchmarking_Benchmarker_h
#include <vtkm/Math.h>
#include <vtkm/cont/DeviceAdapterTag.h>
#include <vtkm/cont/TryExecute.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/testing/Testing.h>
#include <algorithm>
#include <iostream>
#include <vector>
#include <vtkm/internal/brigand.hpp>
/*
* Writing a Benchmark
* -------------------
* To write a benchmark you must provide a functor that will run the operations
* you want to time and return the run time of those operations using the timer
* for the device. The benchmark should also be templated on the value type being
* operated on. Then use VTKM_MAKE_BENCHMARK to generate a maker functor and
* VTKM_RUN_BENCHMARK to run the benchmark on a list of types.
*
* For Example:
*
* template<typename Value>
* struct BenchSilly {
* // Setup anything that doesn't need to change per run in the constructor
* VTKM_CONT BenchSilly(){}
*
* // The overloaded call operator will run the operations being timed and
* // return the execution time
* VTKM_CONT
* vtkm::Float64 operator()(){
* return 0.05;
* }
*
* // The benchmark must also provide a method describing itself, this is
* // used when printing out run time statistics
* VTKM_CONT
* std::string Description() const {
* return "A silly benchmark";
* }
* };
*
* // Now use the VTKM_MAKE_BENCHMARK macro to generate a maker functor for
* // your benchmark. This lets us generate the benchmark functor for each type
* // we want to test
* VTKM_MAKE_BENCHMARK(Silly, BenchSilly);
*
* // You can also optionally pass arguments to the constructor like so:
* // VTKM_MAKE_BENCHMARK(Blah, BenchBlah, 1, 2, 3);
* // Note that benchmark names (the first argument) must be unique so different
* // parameters to the constructor should have different names
*
* // We can now run our benchmark using VTKM_RUN_BENCHMARK, passing the
* // benchmark name and type list to run on
* int main(int, char**){
* VTKM_RUN_BENCHMARK(Silly, vtkm::List<vtkm::Float32>());
* return 0;
* }
*
* Check out vtkm/benchmarking/BenchmarkDeviceAdapter.h for some example usage
*/
#include <benchmark/benchmark.h>
/*
* Use the VTKM_MAKE_BENCHMARK macro to define a maker functor for your benchmark.
* This is used to allow you to template the benchmark functor on the type being benchmarked
* and the device adapter so you can write init code in the constructor. Then the maker will
* return a constructed instance of your benchmark for the type being benchmarked.
* The VA_ARGS are used to pass any extra arguments needed by your benchmark
*/
#define VTKM_MAKE_BENCHMARK(Name, Bench, ...) \
struct MakeBench##Name \
{ \
template <typename Value, typename DeviceAdapter> \
VTKM_CONT Bench<Value, DeviceAdapter> operator()(const Value vtkmNotUsed(v), \
DeviceAdapter vtkmNotUsed(id)) const \
{ \
return Bench<Value, DeviceAdapter>(__VA_ARGS__); \
} \
}
#include <ostream>
/*
* Use the VTKM_RUN_BENCHMARK macro to run your benchmark on the type list passed.
* You must have previously defined a maker functor with VTKM_MAKE_BENCHMARK that this
* macro will look for and use
*/
#define VTKM_RUN_BENCHMARK(Name, Types, Id) \
vtkm::benchmarking::BenchmarkTypes(MakeBench##Name(), (Types), (Id))
/// \file Benchmarker.h
/// \brief Benchmarking utilities
///
/// VTK-m's benchmarking framework is built on top of Google Benchmark.
///
/// A benchmark is now a single function, which is passed to a macro:
///
/// ```
/// void MyBenchmark(::benchmark::State& state)
/// {
/// MyClass someClass;
///
/// // Optional: Add a descriptive label with additional benchmark details:
/// state.SetLabel("Blah blah blah.");
///
/// // Must use a vtkm timer to properly capture e.g. CUDA execution times.
/// vtkm::cont::Timer timer;
/// for (auto _ : state)
/// {
/// someClass.Reset();
///
/// timer.Start();
/// someClass.DoWork();
/// timer.Stop();
///
/// state.SetIterationTime(timer.GetElapsedTime());
/// }
///
/// // Optional: Report items and/or bytes processed per iteration in output:
/// state.SetItemsProcessed(state.iterations() * someClass.GetNumberOfItems());
/// state.SetBytesProcessed(state.iterations() * someClass.GetNumberOfBytes());
/// }
/// VTKM_BENCHMARK(MyBenchmark);
/// ```
///
/// Google Benchmark also makes it easy to implement parameter sweep benchmarks:
///
/// ```
/// void MyParameterSweep(::benchmark::State& state)
/// {
/// // The current value in the sweep:
/// const vtkm::Id currentValue = state.range(0);
///
/// MyClass someClass;
/// someClass.SetSomeParameter(currentValue);
///
/// vtkm::cont::Timer timer;
/// for (auto _ : state)
/// {
/// someClass.Reset();
///
/// timer.Start();
/// someClass.DoWork();
/// timer.Stop();
///
/// state.SetIterationTime(timer.GetElapsedTime());
/// }
/// }
/// VTKM_BENCHMARK_OPTS(MyParameterSweep, ->ArgName("Param")->Range(32, 1024 * 1024));
/// ```
///
/// will generate and launch several benchmarks, exploring the parameter space of
/// `SetSomeParameter` between the values of 32 and (1024*1024). The chain of
/// function calls in the second argument is applied to an instance of
/// ::benchmark::internal::Benchmark. See Google Benchmark's documentation for
/// more details.
///
/// For more complex benchmark configurations, the VTKM_BENCHMARK_APPLY macro
/// accepts a function with the signature
/// `void Func(::benchmark::internal::Benchmark*)`, which can apply arbitrary
/// settings to the generated benchmark.
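///
/// As an illustration, a sketch that registers explicit sizes (the function name
/// `ConfigureSizes` and the specific values are hypothetical):
///
/// ```
/// void ConfigureSizes(::benchmark::internal::Benchmark* bm)
/// {
///   bm->ArgName("Size");
///   for (int64_t size = 512; size <= 1024 * 1024; size *= 8)
///   {
///     bm->Arg(size);
///   }
/// }
/// VTKM_BENCHMARK_APPLY(MyBenchmark, ConfigureSizes);
/// ```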
///
/// To instantiate a templated benchmark across a list of types, the
/// VTKM_BENCHMARK_TEMPLATE* macros take a vtkm::List of types as an additional
/// parameter. The templated benchmark function will be instantiated and called
/// for each type in the list:
///
/// ```
/// template <typename T>
/// void MyBenchmark(::benchmark::State& state)
/// {
/// MyClass<T> someClass;
///
/// // Must use a vtkm timer to properly capture e.g. CUDA execution times.
/// vtkm::cont::Timer timer;
/// for (auto _ : state)
/// {
/// someClass.Reset();
///
/// timer.Start();
/// someClass.DoWork();
/// timer.Stop();
///
/// state.SetIterationTime(timer.GetElapsedTime());
/// }
/// }
/// VTKM_BENCHMARK_TEMPLATES(MyBenchmark, vtkm::List<vtkm::Float32, vtkm::Vec3f_32>);
/// ```
///
/// The benchmarks are executed by calling the `VTKM_EXECUTE_BENCHMARKS(argc, argv)`
/// macro from `main`. There is also a `VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, some_string)`
/// macro that appends the contents of `some_string` to the Google Benchmark preamble.
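///
/// A typical `main` then looks like the following sketch, modeled on the existing
/// benchmarks in vtk-m/benchmarking/:
///
/// ```
/// int main(int argc, char* argv[])
/// {
///   // Parse VTK-m options and force the requested device:
///   auto opts = vtkm::cont::InitializeOptions::RequireDevice |
///     vtkm::cont::InitializeOptions::AddHelp;
///   auto config = vtkm::cont::Initialize(argc, argv, opts);
///   vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(config.Device);
///
///   // Handle benchmarking-related args and run benchmarks:
///   VTKM_EXECUTE_BENCHMARKS(argc, argv);
/// }
/// ```
///
/// (The ported benchmarks store the `InitializeResult` in a global so the benchmark
/// functions can query the active device.)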
///
/// If a benchmark is not compatible with some configuration, it may call
/// `state.SkipWithError("Error message");` on the `::benchmark::State` object and return. This is
/// useful, for instance, in the filter tests when the input is not compatible with the filter.
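///
/// For instance, a sketch of a benchmark that skips gracefully when an allocation
/// fails (the names and the size argument are illustrative):
///
/// ```
/// void MyAllocatingBenchmark(::benchmark::State& state)
/// {
///   vtkm::cont::ArrayHandle<vtkm::Float32> data;
///   try
///   {
///     data.Allocate(static_cast<vtkm::Id>(state.range(0)));
///   }
///   catch (vtkm::cont::ErrorBadAllocation&)
///   {
///     state.SkipWithError("Allocation too large for this device.");
///     return;
///   }
///   // ... timing loop as shown above ...
/// }
/// ```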
///
/// When launching a benchmark executable, the following options are supported by Google Benchmark:
///
/// - `--benchmark_list_tests`: List all available tests.
/// - `--benchmark_filter="[regex]"`: Only run benchmark with names that match `[regex]`.
/// - `--benchmark_filter="-[regex]"`: Only run benchmark with names that DON'T match `[regex]`.
/// - `--benchmark_min_time=[float]`: Make sure each benchmark repetition gathers `[float]` seconds
/// of data.
/// - `--benchmark_repetitions=[int]`: Run each benchmark `[int]` times and report aggregate statistics
/// (mean, stdev, etc.). A "repetition" refers to a single execution of the benchmark function, not
/// an "iteration", which is a single pass through the `for (auto _ : state) { ... }` loop.
/// - `--benchmark_report_aggregates_only="true|false"`: If true, only the aggregate statistics are
/// reported (affects both console and file output). Requires `--benchmark_repetitions` to be useful.
/// - `--benchmark_display_aggregates_only="true|false"`: If true, only the aggregate statistics are
/// printed to the terminal. Any file output will still contain all repetition info.
/// - `--benchmark_format="console|json|csv"`: Specify terminal output format: human readable
/// (`console`) or `csv`/`json` formats.
/// - `--benchmark_out_format="console|json|csv"`: Specify file output format: human readable
/// (`console`) or `csv`/`json` formats.
/// - `--benchmark_out=[filename]`: Specify output file.
/// - `--benchmark_color="true|false"`: Toggle color output in terminal when using `console` output.
/// - `--benchmark_counters_tabular="true|false"`: Print counter information (e.g. bytes/sec, items/sec)
/// in the table, rather than appending them as a label.
///
/// For more information and examples of practical usage, take a look at the existing benchmarks in
/// vtk-m/benchmarking/.
/// \def VTKM_EXECUTE_BENCHMARKS(argc, argv)
///
/// Run the benchmarks defined in the current file. Benchmarks may be filtered
/// and modified using the passed arguments; see the Google Benchmark documentation
/// for more details.
#define VTKM_EXECUTE_BENCHMARKS(argc, argv) vtkm::bench::detail::ExecuteBenchmarks(argc, argv)
/// \def VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, preamble)
///
/// Run the benchmarks defined in the current file. Benchmarks may be filtered
/// and modified using the passed arguments; see the Google Benchmark documentation
/// for more details. The `preamble` string may be used to supply additional
/// information that will be appended to the output's preamble.
#define VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, preamble) \
vtkm::bench::detail::ExecuteBenchmarks(argc, argv, preamble)
/// \def VTKM_BENCHMARK(BenchFunc)
///
/// Define a simple benchmark. A single benchmark will be generated that executes
/// `BenchFunc`. `BenchFunc` must have the signature:
///
/// ```
/// void BenchFunc(::benchmark::State& state)
/// ```
#define VTKM_BENCHMARK(BenchFunc) BENCHMARK(BenchFunc)->UseManualTime()
/// \def VTKM_BENCHMARK_OPTS(BenchFunc, Args)
///
/// Similar to `VTKM_BENCHMARK`, but allows additional options to be specified
/// on the `::benchmark::internal::Benchmark` object. Example usage:
///
/// ```
/// VTKM_BENCHMARK_OPTS(MyBenchmark, ->ArgName("MyParam")->Range(32, 1024*1024));
/// ```
///
/// Note the similarity to the raw Google Benchmark usage of
/// `BENCHMARK(MyBenchmark)->ArgName("MyParam")->Range(32, 1024*1024);`. See
/// the Google Benchmark documentation for more details on the available options.
#define VTKM_BENCHMARK_OPTS(BenchFunc, options) BENCHMARK(BenchFunc)->UseManualTime() options
/// \def VTKM_BENCHMARK_APPLY(BenchFunc, ConfigFunc)
///
/// Similar to `VTKM_BENCHMARK`, but allows advanced benchmark configuration
/// via a supplied ConfigFunc, similar to Google Benchmark's
/// `BENCHMARK(BenchFunc)->Apply(ConfigFunc)`. `ConfigFunc` must have the
/// signature:
///
/// ```
/// void ConfigFunc(::benchmark::internal::Benchmark*);
/// ```
///
/// See the Google Benchmark documentation for more details on the available options.
#define VTKM_BENCHMARK_APPLY(BenchFunc, applyFunctor) \
BENCHMARK(BenchFunc)->Apply(applyFunctor)->UseManualTime()
/// \def VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList)
///
/// Define a family of benchmark that vary by template argument. A single
/// benchmark will be generated for each type in `TypeList` (a vtkm::List of
/// types) that executes `BenchFunc<T>`. `BenchFunc` must have the signature:
///
/// ```
/// template <typename T>
/// void BenchFunc(::benchmark::State& state)
/// ```
#define VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList) \
VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, vtkm::bench::detail::NullApply, TypeList)
/// \def VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, Args, TypeList)
///
/// Similar to `VTKM_BENCHMARK_TEMPLATES`, but allows additional options to be specified
/// on the `::benchmark::internal::Benchmark` object. Example usage:
///
/// ```
/// VTKM_BENCHMARK_TEMPLATES_OPTS(MyBenchmark,
/// ->ArgName("MyParam")->Range(32, 1024*1024),
/// vtkm::List<vtkm::Float32, vtkm::Vec3f_32>);
/// ```
#define VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, options, TypeList) \
VTKM_BENCHMARK_TEMPLATES_APPLY( \
BenchFunc, [](::benchmark::internal::Benchmark* bm) { bm options; }, TypeList)
/// \def VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ConfigFunc, TypeList)
///
/// Similar to `VTKM_BENCHMARK_TEMPLATES`, but allows advanced benchmark configuration
/// via a supplied ConfigFunc, similar to Google Benchmark's
/// `BENCHMARK(BenchFunc)->Apply(ConfigFunc)`. `ConfigFunc` must have the
/// signature:
///
/// ```
/// void ConfigFunc(::benchmark::internal::Benchmark*);
/// ```
///
/// See the Google Benchmark documentation for more details on the available options.
#define VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ApplyFunctor, TypeList) \
namespace \
{ /* A template function cannot be used as a template parameter, so wrap the function with \
* a template struct to get it into the GenerateTemplateBenchmarks class. */ \
template <typename... Ts> \
struct VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc) \
{ \
static ::benchmark::internal::Function* GetFunction() { return BenchFunc<Ts...>; } \
}; \
} /* end anon namespace */ \
int BENCHMARK_PRIVATE_NAME(BenchFunc) = vtkm::bench::detail::GenerateTemplateBenchmarks< \
brigand::bind<VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)>, \
TypeList>::Register(#BenchFunc, ApplyFunctor)
// Internal use only:
#define VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc) \
BENCHMARK_PRIVATE_CONCAT(_wrapper_, BenchFunc, __LINE__)
namespace vtkm
{
namespace benchmarking
namespace bench
{
namespace stats
namespace detail
{
// Checks that the sequence is sorted, returns true if it's sorted, false
// otherwise
template <typename ForwardIt>
bool is_sorted(ForwardIt first, ForwardIt last)
static inline void NullApply(::benchmark::internal::Benchmark*)
{
ForwardIt next = first;
++next;
for (; next != last; ++next, ++first)
}
/// Do not use directly. The VTKM_BENCHMARK_TEMPLATES macros should be used
/// instead.
// TypeLists could be expanded to compute cross products if we ever have that
// need.
template <typename BoundBench, typename TypeLists>
struct GenerateTemplateBenchmarks;
template <template <typename...> class BenchType, typename TypeList>
struct GenerateTemplateBenchmarks<brigand::bind<BenchType>, TypeList>
{
private:
template <typename T>
using MakeBenchType = BenchType<T>;
using Benchmarks = brigand::transform<TypeList, brigand::bind<MakeBenchType, brigand::_1>>;
template <typename ApplyFunctor>
struct RegisterImpl
{
if (*first > *next)
std::string BenchName;
ApplyFunctor Apply;
template <typename P>
void operator()(brigand::type_<BenchType<P>>) const
{
std::ostringstream name;
name << this->BenchName << "<" << vtkm::testing::TypeName<P>::Name() << ">";
auto bm = ::benchmark::internal::RegisterBenchmarkInternal(
new ::benchmark::internal::FunctionBenchmark(name.str().c_str(),
BenchType<P>::GetFunction()));
this->Apply(bm);
// Always use manual time with vtkm::cont::Timer to capture CUDA times accurately.
bm->UseManualTime();
}
};
public:
template <typename ApplyFunctor>
static int Register(const std::string& benchName, ApplyFunctor&& apply)
{
brigand::for_each<Benchmarks>(
RegisterImpl<ApplyFunctor>{ benchName, std::forward<ApplyFunctor>(apply) });
return 0;
}
};
class VTKmConsoleReporter : public ::benchmark::ConsoleReporter
{
std::string UserPreamble;
public:
VTKmConsoleReporter() = default;
explicit VTKmConsoleReporter(const std::string& preamble)
: UserPreamble{ preamble }
{
}
bool ReportContext(const Context& context) override
{
if (!::benchmark::ConsoleReporter::ReportContext(context))
{
return false;
}
}
return true;
}
// Get the value representing the `percent` percentile of the
// sorted samples using linear interpolation
vtkm::Float64 PercentileValue(const std::vector<vtkm::Float64>& samples,
const vtkm::Float64 percent)
{
VTKM_ASSERT(!samples.empty());
if (samples.size() == 1)
{
return samples.front();
}
VTKM_ASSERT(percent >= 0.0);
VTKM_ASSERT(percent <= 100.0);
VTKM_ASSERT(vtkm::benchmarking::stats::is_sorted(samples.begin(), samples.end()));
if (percent == 100.0)
{
return samples.back();
}
// Find the two nearest percentile values and linearly
// interpolate between them
const vtkm::Float64 rank = percent / 100.0 * (static_cast<vtkm::Float64>(samples.size()) - 1.0);
const vtkm::Float64 low_rank = vtkm::Floor(rank);
const vtkm::Float64 dist = rank - low_rank;
const size_t k = static_cast<size_t>(low_rank);
const vtkm::Float64 low = samples[k];
const vtkm::Float64 high = samples[k + 1];
return low + (high - low) * dist;
}
// Winsorize the samples to clean up any very extreme outliers
// Will replace all samples below `percent` and above 100 - `percent` percentiles
// with the value at the percentile
// NOTE: Assumes the samples have been sorted, as we make use of PercentileValue
void Winsorize(std::vector<vtkm::Float64>& samples, const vtkm::Float64 percent)
{
const vtkm::Float64 low_percentile = PercentileValue(samples, percent);
const vtkm::Float64 high_percentile = PercentileValue(samples, 100.0 - percent);
for (std::vector<vtkm::Float64>::iterator it = samples.begin(); it != samples.end(); ++it)
{
if (*it < low_percentile)
// The rest of the preamble is printed to the error stream, so be consistent:
auto& out = this->GetErrorStream();
// Print list of devices:
out << "VTK-m Device State:\n";
vtkm::cont::GetRuntimeDeviceTracker().PrintSummary(out);
if (!this->UserPreamble.empty())
{
*it = low_percentile;
out << this->UserPreamble << "\n";
}
else if (*it > high_percentile)
{
*it = high_percentile;
}
}
}
// Compute the mean value of the dataset
vtkm::Float64 Mean(const std::vector<vtkm::Float64>& samples)
{
vtkm::Float64 mean = 0;
for (std::vector<vtkm::Float64>::const_iterator it = samples.begin(); it != samples.end(); ++it)
{
mean += *it;
}
return mean / static_cast<vtkm::Float64>(samples.size());
}
// Compute the sample variance of the samples
vtkm::Float64 Variance(const std::vector<vtkm::Float64>& samples)
{
vtkm::Float64 mean = Mean(samples);
vtkm::Float64 square_deviations = 0;
for (std::vector<vtkm::Float64>::const_iterator it = samples.begin(); it != samples.end(); ++it)
{
square_deviations += vtkm::Pow(*it - mean, 2.0);
}
return square_deviations / (static_cast<vtkm::Float64>(samples.size()) - 1.0);
}
// Compute the standard deviation of the samples
vtkm::Float64 StandardDeviation(const std::vector<vtkm::Float64>& samples)
{
return vtkm::Sqrt(Variance(samples));
}
// Compute the median absolute deviation of the dataset
vtkm::Float64 MedianAbsDeviation(const std::vector<vtkm::Float64>& samples)
{
std::vector<vtkm::Float64> abs_deviations;
abs_deviations.reserve(samples.size());
const vtkm::Float64 median = PercentileValue(samples, 50.0);
for (std::vector<vtkm::Float64>::const_iterator it = samples.begin(); it != samples.end(); ++it)
{
abs_deviations.push_back(vtkm::Abs(*it - median));
}
std::sort(abs_deviations.begin(), abs_deviations.end());
return PercentileValue(abs_deviations, 50.0);
}
} // stats
out.flush();
/*
* The benchmarker takes a functor to benchmark and runs it multiple times,
* printing out statistics of the run time at the end.
* The functor passed should return the run time of the thing being benchmarked
* in seconds, this lets us avoid including any per-run setup time in the benchmark.
* However any one-time setup should be done in the functor's constructor
*/
struct Benchmarker
{
std::vector<vtkm::Float64> Samples;
std::string BenchmarkName;
const vtkm::Float64 MaxRuntime;
const size_t MaxIterations;
public:
VTKM_CONT
Benchmarker(vtkm::Float64 maxRuntime = 30, std::size_t maxIterations = 100)
: MaxRuntime(maxRuntime)
, MaxIterations(maxIterations)
{
}
template <typename Functor>
VTKM_CONT void GatherSamples(Functor func)
{
this->Samples.clear();
this->BenchmarkName = func.Description();
// Do a warm-up run. If the benchmark allocates any additional memory
// eg. storage for output results, this will let it do that and
// allow us to avoid measuring the allocation time in the actual benchmark run
func();
this->Samples.reserve(this->MaxIterations);
// Run each benchmark for MAX_RUNTIME seconds or MAX_ITERATIONS iterations, whichever
// takes less time. This kind of assumes that running for 500 iterations or 30s will give
// good statistics, but if median abs dev and/or std dev are too high both these limits
// could be increased
size_t iter = 0;
for (vtkm::Float64 elapsed = 0.0; elapsed < this->MaxRuntime && iter < this->MaxIterations;
elapsed += this->Samples.back(), ++iter)
{
this->Samples.push_back(func());
}
std::sort(this->Samples.begin(), this->Samples.end());
stats::Winsorize(this->Samples, 5.0);
}
VTKM_CONT void PrintSummary(std::ostream& out = std::cout)
{
out << "Benchmark \'" << this->BenchmarkName << "\' results:\n";
if (this->Samples.empty())
{
out << "\tNo samples gathered!\n";
return;
}
out << "\tnumSamples = " << this->Samples.size() << "\n"
<< "\tmedian = " << stats::PercentileValue(this->Samples, 50.0) << "s\n"
<< "\tmedian abs dev = " << stats::MedianAbsDeviation(this->Samples) << "s\n"
<< "\tmean = " << stats::Mean(this->Samples) << "s\n"
<< "\tstd dev = " << stats::StandardDeviation(this->Samples) << "s\n"
<< "\tmin = " << this->Samples.front() << "s\n"
<< "\tmax = " << this->Samples.back() << "s\n";
}
template <typename DeviceAdapter, typename MakerFunctor, typename T>
VTKM_CONT bool operator()(DeviceAdapter id, MakerFunctor&& makerFunctor, T t)
{
auto func = makerFunctor(t, id);
std::cout << "Running '" << func.Description() << "'" << std::endl;
this->GatherSamples(func);
this->PrintSummary();
return true;
}
VTKM_CONT const std::vector<vtkm::Float64>& GetSamples() const { return this->Samples; }
VTKM_CONT void Reset()
{
this->Samples.clear();
this->BenchmarkName.clear();
}
};
template <typename MakerFunctor>
class InternalPrintTypeAndBench
// Returns the number of executed benchmarks:
static inline vtkm::Id ExecuteBenchmarks(int& argc,
char* argv[],
const std::string& preamble = std::string{})
{
MakerFunctor Maker;
public:
VTKM_CONT
InternalPrintTypeAndBench(MakerFunctor maker)
: Maker(maker)
::benchmark::Initialize(&argc, argv);
if (::benchmark::ReportUnrecognizedArguments(argc, argv))
{
return 1;
}
template <typename T>
VTKM_CONT void operator()(T t, vtkm::cont::DeviceAdapterId id) const
{
std::cout << "*** " << vtkm::testing::TypeName<T>::Name() << " on device " << id.GetName()
<< " ***************" << std::endl;
Benchmarker bench;
try
{
vtkm::cont::TryExecuteOnDevice(id, bench, Maker, t);
}
catch (std::exception& e)
{
std::cout << "\n"
<< "An exception occurring during a benchmark:\n\t" << e.what() << "\n"
<< "Attempting to continue with remaining benchmarks...\n\n";
}
}
};
VTKmConsoleReporter reporter{ preamble };
template <class MakerFunctor, class TypeList>
VTKM_CONT void BenchmarkTypes(MakerFunctor&& maker, TypeList, vtkm::cont::DeviceAdapterId id)
{
vtkm::ListForEach(
InternalPrintTypeAndBench<MakerFunctor>(std::forward<MakerFunctor>(maker)), TypeList(), id);
vtkm::cont::Timer timer;
timer.Start();
std::size_t num = ::benchmark::RunSpecifiedBenchmarks(&reporter);
timer.Stop();
reporter.GetOutputStream().flush();
reporter.GetErrorStream().flush();
reporter.GetErrorStream() << "Ran " << num << " benchmarks in " << timer.GetElapsedTime()
<< " seconds." << std::endl;
return static_cast<vtkm::Id>(num);
}
}
}
} // end namespace vtkm::bench::detail
#endif

@@ -7,18 +7,24 @@
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##============================================================================
# Find Google Benchmark. Note that benchmark_DIR must be pointed at an
# installation, not a build directory.
find_package(benchmark)
function(add_benchmark)
set(options)
set(oneValueArgs NAME FILE)
set(multiValueArgs LIBS)
cmake_parse_arguments(VTKm_AB
"${options}" "${oneValueArgs}" "${multiValueArgs}"
${ARGN}
)
"${options}" "${oneValueArgs}" "${multiValueArgs}"
${ARGN}
)
set(exe_name ${VTKm_AB_NAME})
add_executable(${exe_name} ${VTKm_AB_FILE})
target_link_libraries(${exe_name} PRIVATE ${VTKm_AB_LIBS})
target_link_libraries(${exe_name} PRIVATE benchmark::benchmark)
vtkm_add_drop_unused_function_flags(${exe_name})
vtkm_add_target_information(${exe_name})

@@ -0,0 +1,133 @@
# Updated Benchmark Framework
The benchmarking framework has been updated to use Google Benchmark.
A benchmark is now a single function, which is passed to a macro:
```
void MyBenchmark(::benchmark::State& state)
{
MyClass someClass;
// Optional: Add a descriptive label with additional benchmark details:
state.SetLabel("Blah blah blah.");
// Must use a vtkm timer to properly capture e.g. CUDA execution times.
vtkm::cont::Timer timer;
for (auto _ : state)
{
someClass.Reset();
timer.Start();
someClass.DoWork();
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
}
// Optional: Report items and/or bytes processed per iteration in output:
state.SetItemsProcessed(state.iterations() * someClass.GetNumberOfItems());
state.SetBytesProcessed(state.iterations() * someClass.GetNumberOfBytes());
}
VTKM_BENCHMARK(MyBenchmark);
```
Google Benchmark also makes it easy to implement parameter sweep benchmarks:
```
void MyParameterSweep(::benchmark::State& state)
{
// The current value in the sweep:
const vtkm::Id currentValue = state.range(0);
MyClass someClass;
someClass.SetSomeParameter(currentValue);
vtkm::cont::Timer timer;
for (auto _ : state)
{
someClass.Reset();
timer.Start();
someClass.DoWork();
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
}
}
VTKM_BENCHMARK_OPTS(MyParameterSweep, ->ArgName("Param")->Range(32, 1024 * 1024));
```
will generate and launch several benchmarks, exploring the parameter space of
`SetSomeParameter` between the values of 32 and (1024*1024). The chain of
function calls in the second argument is applied to an instance of
::benchmark::internal::Benchmark. See Google Benchmark's documentation for
more details.
For more complex benchmark configurations, the VTKM_BENCHMARK_APPLY macro
accepts a function with the signature
`void Func(::benchmark::internal::Benchmark*)`, which can apply arbitrary
settings to the generated benchmark.
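
As an illustration, a sketch that registers explicit sizes (the function name
`ConfigureSizes` and the specific values are hypothetical):

```
void ConfigureSizes(::benchmark::internal::Benchmark* bm)
{
  bm->ArgName("Size");
  for (int64_t size = 512; size <= 1024 * 1024; size *= 8)
  {
    bm->Arg(size);
  }
}
VTKM_BENCHMARK_APPLY(MyBenchmark, ConfigureSizes);
```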
To instantiate a templated benchmark across a list of types, the
VTKM_BENCHMARK_TEMPLATE* macros take a vtkm::List of types as an additional
parameter. The templated benchmark function will be instantiated and called
for each type in the list:
```
template <typename T>
void MyBenchmark(::benchmark::State& state)
{
MyClass<T> someClass;
// Must use a vtkm timer to properly capture e.g. CUDA execution times.
vtkm::cont::Timer timer;
for (auto _ : state)
{
someClass.Reset();
timer.Start();
someClass.DoWork();
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
}
}
VTKM_BENCHMARK_TEMPLATES(MyBenchmark, vtkm::List<vtkm::Float32, vtkm::Vec3f_32>);
```
The benchmarks are executed by calling the `VTKM_EXECUTE_BENCHMARKS(argc, argv)`
macro from `main`. There is also a `VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, some_string)`
macro that appends the contents of `some_string` to the Google Benchmark preamble.
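
A typical `main` then looks like the following sketch, modeled on the ported
benchmark executables:

```
int main(int argc, char* argv[])
{
  // Parse VTK-m options and force the requested device:
  auto opts = vtkm::cont::InitializeOptions::RequireDevice |
    vtkm::cont::InitializeOptions::AddHelp;
  auto config = vtkm::cont::Initialize(argc, argv, opts);
  vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(config.Device);

  // Handle benchmarking-related args and run benchmarks:
  VTKM_EXECUTE_BENCHMARKS(argc, argv);
}
```

(The ported benchmarks store the `InitializeResult` in a global so the benchmark
functions can query the active device.)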
If a benchmark is not compatible with some configuration, it may call
`state.SkipWithError("Error message");` on the `::benchmark::State` object and return. This is
useful, for instance, in the filter tests when the input is not compatible with the filter.
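
For instance, a sketch of a benchmark that skips gracefully when an allocation
fails (the names and the size argument are illustrative):

```
void MyAllocatingBenchmark(::benchmark::State& state)
{
  vtkm::cont::ArrayHandle<vtkm::Float32> data;
  try
  {
    data.Allocate(static_cast<vtkm::Id>(state.range(0)));
  }
  catch (vtkm::cont::ErrorBadAllocation&)
  {
    state.SkipWithError("Allocation too large for this device.");
    return;
  }
  // ... timing loop as shown above ...
}
```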
When launching a benchmark executable, the following options are supported by Google Benchmark:
- `--benchmark_list_tests`: List all available tests.
- `--benchmark_filter="[regex]"`: Only run benchmark with names that match `[regex]`.
- `--benchmark_filter="-[regex]"`: Only run benchmark with names that DON'T match `[regex]`.
- `--benchmark_min_time=[float]`: Make sure each benchmark repetition gathers `[float]` seconds
of data.
- `--benchmark_repetitions=[int]`: Run each benchmark `[int]` times and report aggregate statistics
(mean, stdev, etc.). A "repetition" refers to a single execution of the benchmark function, not
an "iteration", which is a single pass through the `for (auto _ : state) { ... }` loop.
- `--benchmark_report_aggregates_only="true|false"`: If true, only the aggregate statistics are
reported (affects both console and file output). Requires `--benchmark_repetitions` to be useful.
- `--benchmark_display_aggregates_only="true|false"`: If true, only the aggregate statistics are
printed to the terminal. Any file output will still contain all repetition info.
- `--benchmark_format="console|json|csv"`: Specify terminal output format: human readable
(`console`) or `csv`/`json` formats.
- `--benchmark_out_format="console|json|csv"`: Specify file output format: human readable
(`console`) or `csv`/`json` formats.
- `--benchmark_out=[filename]`: Specify output file.
- `--benchmark_color="true|false"`: Toggle color output in terminal when using `console` output.
- `--benchmark_counters_tabular="true|false"`: Print counter information (e.g. bytes/sec, items/sec)
in the table, rather than appending them as a label.
For more information and examples of practical usage, take a look at the existing benchmarks in
vtk-m/benchmarking/.

@@ -68,6 +68,13 @@ public:
{
}
};
/// A convenience function for creating an ArrayHandleIndex. It takes the
/// size of the array and generates an array holding vtkm::Id values in [0, size - 1].
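///
/// For example, an implicit index array of 100 values:
/// ```
/// // Holds [0, 1, ..., 99]:
/// vtkm::cont::ArrayHandleIndex indices = vtkm::cont::make_ArrayHandleIndex(100);
/// ```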
VTKM_CONT inline vtkm::cont::ArrayHandleIndex make_ArrayHandleIndex(vtkm::Id length)
{
return vtkm::cont::ArrayHandleIndex(length);
}
}
} // namespace vtkm::cont

@@ -47,7 +47,6 @@ void DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>::Reset()
void DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>::Start()
{
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
this->StartReady = true;
}

@@ -22,6 +22,35 @@ namespace vtkm
namespace exec
{
namespace detail
{
// Clang-7 as host compiler under nvcc returns types from std::make_unsigned
// that are not compatible with the AtomicInterface API, so we define our own
// mapping. This must exist for every entry in vtkm::cont::AtomicArrayTypeList.
template <typename>
struct MakeUnsigned;
template <>
struct MakeUnsigned<vtkm::UInt32>
{
using type = vtkm::UInt32;
};
template <>
struct MakeUnsigned<vtkm::Int32>
{
using type = vtkm::UInt32;
};
template <>
struct MakeUnsigned<vtkm::UInt64>
{
using type = vtkm::UInt64;
};
template <>
struct MakeUnsigned<vtkm::Int64>
{
using type = vtkm::UInt64;
};
}
template <typename T, typename Device>
class AtomicArrayExecutionObject
{
@@ -66,7 +95,7 @@ public:
// We only support 32/64 bit signed/unsigned ints, and AtomicInterface
// currently only provides API for unsigned types.
// We'll cast the signed types to unsigned to work around this.
using APIType = typename std::make_unsigned<ValueType>::type;
using APIType = typename detail::MakeUnsigned<ValueType>::type;
return static_cast<T>(
AtomicInterface::Load(reinterpret_cast<const APIType*>(this->Data + index)));
@@ -89,7 +118,7 @@ public:
// This is safe, since the only difference between signed/unsigned types
// is how overflow works, and signed overflow is already undefined. We also
// document that overflow is undefined for this operation.
using APIType = typename std::make_unsigned<ValueType>::type;
using APIType = typename detail::MakeUnsigned<ValueType>::type;
return static_cast<T>(AtomicInterface::Add(reinterpret_cast<APIType*>(this->Data + index),
static_cast<APIType>(value)));
@@ -116,7 +145,7 @@ public:
// This is safe, since the only difference between signed/unsigned types
// is how overflow works, and signed overflow is already undefined. We also
// document that overflow is undefined for this operation.
using APIType = typename std::make_unsigned<ValueType>::type;
using APIType = typename detail::MakeUnsigned<ValueType>::type;
AtomicInterface::Store(reinterpret_cast<APIType*>(this->Data + index),
static_cast<APIType>(value));
@@ -169,7 +198,7 @@ public:
// We'll cast the signed types to unsigned to work around this.
// This is safe, since the only difference between signed/unsigned types
// is how overflow works, and signed overflow is already undefined.
using APIType = typename std::make_unsigned<ValueType>::type;
using APIType = typename detail::MakeUnsigned<ValueType>::type;
return static_cast<T>(
AtomicInterface::CompareAndSwap(reinterpret_cast<APIType*>(this->Data + index),