From 238d4fa7594330486bc31c2023c05af6d0735cc0 Mon Sep 17 00:00:00 2001
From: Will Usher <wusher@lanl.gov>
Date: Mon, 6 Jul 2015 15:44:29 -0600
Subject: [PATCH] Adding micro benchmark suite

---
 CMake/VTKmMacros.cmake                       | 144 ++++++
 CMakeLists.txt                               |   1 +
 vtkm/CMakeLists.txt                          |   5 +
 vtkm/benchmarking/BenchmarkDeviceAdapter.cxx |  30 ++
 vtkm/benchmarking/BenchmarkDeviceAdapter.h   | 441 +++++++++++++++++++
 vtkm/benchmarking/CMakeLists.txt             |  35 ++
 6 files changed, 656 insertions(+)
 create mode 100644 vtkm/benchmarking/BenchmarkDeviceAdapter.cxx
 create mode 100644 vtkm/benchmarking/BenchmarkDeviceAdapter.h
 create mode 100644 vtkm/benchmarking/CMakeLists.txt

diff --git a/CMake/VTKmMacros.cmake b/CMake/VTKmMacros.cmake
index 02f97393e..23daf6e3b 100644
--- a/CMake/VTKmMacros.cmake
+++ b/CMake/VTKmMacros.cmake
@@ -388,6 +388,150 @@ function(vtkm_worklet_unit_tests device_adapter)
   set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
 endfunction(vtkm_worklet_unit_tests)
 
+# Record the benchmark sources that will be built for each device adapter
+# This is based on vtkm_save_worklet_unit_tests
+# Usage:
+#
+# vtkm_save_benchmarks( sources )
+#
+# notes: appends to the global properties vtkm_benchmarks_sources (absolute
+# paths), vtkm_benchmarks_cu_sources (.cu copies) and vtkm_benchmarks_drivers
+function(vtkm_save_benchmarks)
+
+  #generate BenchmarkDriver.cxx now, from the directory that lists the
+  #benchmark files, with one entry per file passed in ARGN. The
+  #bench_sources result is not used again in this function.
+  #TODO: confirm the test driver generator is appropriate for benchmarks
+  create_test_sourcelist(bench_sources BenchmarkDriver.cxx ${ARGN})
+
+  #absolute path of the generated driver, plus empty accumulators for
+  #the host (.cxx) and cuda (.cu) source lists
+  set(driver ${CMAKE_CURRENT_BINARY_DIR}/BenchmarkDriver.cxx)
+  set(cxx_sources)
+  set(cu_sources)
+
+  #for every benchmark file we keep its absolute source path so it
+  #can be compiled into the benchmark driver. At the same time
+  #the file is copied unchanged (COPYONLY) into the build
+  #directory as a .cu file so that we can compile it with cuda
+  #if needed
+  foreach(fname ${ARGN})
+    set(absPath)
+
+    get_filename_component(absPath ${fname} ABSOLUTE)
+    get_filename_component(file_name_only ${fname} NAME_WE)
+
+    set(cuda_file_name "${CMAKE_CURRENT_BINARY_DIR}/${file_name_only}.cu")
+    configure_file("${absPath}"
+                   "${cuda_file_name}"
+                   COPYONLY)
+    list(APPEND cxx_sources ${absPath})
+    list(APPEND cu_sources ${cuda_file_name})
+  endforeach()
+
+  #publish the collected lists as global properties; no executable
+  #is created here.
+  #That is done per device adapter by vtkm_benchmarks
+  set_property( GLOBAL APPEND
+                PROPERTY vtkm_benchmarks_sources ${cxx_sources})
+  set_property( GLOBAL APPEND
+                PROPERTY vtkm_benchmarks_cu_sources ${cu_sources})
+  set_property( GLOBAL APPEND
+                PROPERTY vtkm_benchmarks_drivers ${driver})
+
+endfunction(vtkm_save_benchmarks)
+
+# Build the benchmark executable for the given device adapter
+# Usage:
+#
+# vtkm_benchmarks( device_adapter )
+#
+# notes: reads the global properties filled in by
+# vtkm_save_benchmarks; no target is created unless both
+# VTKm_ENABLE_BENCHMARKS and VTKm_ENABLE_TESTING are on
+function(vtkm_benchmarks device_adapter)
+
+  set(benchmark_srcs)
+  get_property(benchmark_srcs GLOBAL
+               PROPERTY vtkm_benchmarks_sources )
+
+  set(benchmark_drivers)
+  get_property(benchmark_drivers GLOBAL
+               PROPERTY vtkm_benchmarks_drivers )
+
+  #detect if we are generating .cu files
+  set(is_cuda FALSE)
+  set(old_nvcc_flags ${CUDA_NVCC_FLAGS})
+  if("${device_adapter}" STREQUAL "VTKM_DEVICE_ADAPTER_CUDA")
+    set(is_cuda TRUE)
+    #if we are generating cu files we need to set up four things.
+    #1. use the configured .cu files
+    #2. Explicitly set the cuda device adapter as a define this is currently
+    #   done as a work around since the cuda executable ignores compile
+    #   definitions
+    #3. Set BOOST_SP_DISABLE_THREADS to disable threading warnings
+    #4. Disable warnings with -w (this silences every nvcc warning)
+    #   the FindCUDA module and helper methods don't read target level
+    #   properties so we have to modify CUDA_NVCC_FLAGS  instead of using
+    #   target and source level COMPILE_FLAGS and COMPILE_DEFINITIONS
+    get_property(benchmark_srcs GLOBAL PROPERTY vtkm_benchmarks_cu_sources )
+
+    list(APPEND CUDA_NVCC_FLAGS "-DVTKM_DEVICE_ADAPTER=${device_adapter}")
+    list(APPEND CUDA_NVCC_FLAGS "-DBOOST_SP_DISABLE_THREADS")
+    list(APPEND CUDA_NVCC_FLAGS "-w")
+  endif()
+
+
+  if(VTKm_ENABLE_BENCHMARKS AND VTKm_ENABLE_TESTING)
+    string(REPLACE "VTKM_DEVICE_ADAPTER_" "" device_type "${device_adapter}")
+
+    vtkm_get_kit_name(kit)
+
+    #inject the device adapter into the benchmark program name so each one is unique
+    set(benchmark_prog Benchmarks_${device_type})
+
+    if(is_cuda)
+      cuda_add_executable(${benchmark_prog} ${benchmark_drivers} ${benchmark_srcs})
+    else()
+      add_executable(${benchmark_prog} ${benchmark_drivers} ${benchmark_srcs})
+      if("${device_adapter}" STREQUAL "VTKM_DEVICE_ADAPTER_TBB")
+        target_link_libraries(${benchmark_prog} ${TBB_LIBRARIES})
+      endif()
+    endif()
+
+    if(MSVC)
+      #disable MSVC CRT and SCL warnings as they recommend using non standard
+      #c++ extensions
+      set_property(TARGET ${benchmark_prog}
+                   APPEND PROPERTY COMPILE_DEFINITIONS
+                   "_SCL_SECURE_NO_WARNINGS"
+                   "_CRT_SECURE_NO_WARNINGS"
+                   )
+
+      #enable large object support 2^32 addressable sections
+      set_property(TARGET ${benchmark_prog}
+                   APPEND_STRING PROPERTY COMPILE_FLAGS
+                   " /bigobj"
+                   )
+    endif()
+
+    #increase warning level if needed. COMPILE_FLAGS is one string, so the
+    #flags are added with APPEND_STRING and a leading space (APPEND would
+    #join them with ';'). Per the note above nvcc does not read this property
+    if(VTKm_EXTRA_COMPILER_WARNINGS)
+      set_property(TARGET ${benchmark_prog}
+                   APPEND_STRING PROPERTY COMPILE_FLAGS " ${CMAKE_CXX_FLAGS_WARN_EXTRA}" )
+    endif()
+
+    #set the device adapter on the executable
+    set_property(TARGET ${benchmark_prog}
+                 APPEND
+                 PROPERTY COMPILE_DEFINITIONS "VTKM_DEVICE_ADAPTER=${device_adapter}" )
+  endif()
+
+  set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
+endfunction(vtkm_benchmarks)
+
 # The Thrust project is not as careful as the VTKm project in avoiding warnings
 # on shadow variables and unused arguments.  With a real GCC compiler, you
 # can disable these warnings inline, but with something like nvcc, those
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df60c1487..91d8bdae1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,7 @@ include(CMake/VTKmCompilerExtras.cmake)
 option(VTKm_ENABLE_CUDA "Enable Cuda support" OFF)
 option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
 option(VTKm_ENABLE_TESTING "Enable VTKm Testing" ON)
+option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF)
 
 option(VTKm_USE_DOUBLE_PRECISION
   "Use double precision for floating point calculations"
diff --git a/vtkm/CMakeLists.txt b/vtkm/CMakeLists.txt
index db1a920bc..3796750b8 100644
--- a/vtkm/CMakeLists.txt
+++ b/vtkm/CMakeLists.txt
@@ -53,3 +53,8 @@ add_subdirectory(exec)
 #-----------------------------------------------------------------------------
 #add the worklet folder
 add_subdirectory(worklet)
+
+#-----------------------------------------------------------------------------
+#add the benchmarking folder
+add_subdirectory(benchmarking)
+
diff --git a/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx b/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx
new file mode 100644
index 000000000..20a49e1a0
--- /dev/null
+++ b/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx
@@ -0,0 +1,30 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 Sandia Corporation.
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#include <vtkm/cont/DeviceAdapter.h>
+
+#include <vtkm/benchmarking/BenchmarkDeviceAdapter.h>
+
+int BenchmarkDeviceAdapter(int, char *[])
+{ // benchmark entry point; both arguments are ignored
+  typedef vtkm::benchmarking::BenchmarkDeviceAdapter<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> Suite;
+  return Suite::Run();
+}
+
diff --git a/vtkm/benchmarking/BenchmarkDeviceAdapter.h b/vtkm/benchmarking/BenchmarkDeviceAdapter.h
new file mode 100644
index 000000000..9309ab79f
--- /dev/null
+++ b/vtkm/benchmarking/BenchmarkDeviceAdapter.h
@@ -0,0 +1,441 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 Sandia Corporation.
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#ifndef vtk_m_benchmarking_BenchmarkDeviceAdapter_h
+#define vtk_m_benchmarking_BenchmarkDeviceAdapter_h
+
+#include <vtkm/TypeTraits.h>
+#include <vtkm/cont/ArrayHandle.h>
+#include <vtkm/cont/ArrayHandleCounting.h>
+#include <vtkm/cont/ArrayHandleConstant.h>
+#include <vtkm/cont/ArrayHandlePermutation.h>
+#include <vtkm/cont/ArrayHandleZip.h>
+#include <vtkm/cont/ArrayPortalToIterators.h>
+#include <vtkm/cont/ErrorControlOutOfMemory.h>
+#include <vtkm/cont/ErrorExecution.h>
+#include <vtkm/cont/StorageBasic.h>
+#include <vtkm/cont/Timer.h>
+#include <vtkm/cont/DeviceAdapterAlgorithm.h>
+
+#include <vtkm/cont/internal/DeviceAdapterError.h>
+
+#include <vtkm/cont/testing/Testing.h>
+
+#include <boost/random.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <utility>
+#include <vector>
+#include <string>
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#endif
+
+namespace vtkm {
+namespace benchmarking {
+
+#define ARRAY_SIZE (1 << 20)
+const static std::string DIVIDER(40, '-');
+
+/// This class runs a series of micro-benchmarks to measure
+/// performance of the parallel primitives provided by each
+/// device adapter
+///
+template<class DeviceAdapterTag>
+struct BenchmarkDeviceAdapter {
+private:
+  typedef vtkm::cont::StorageTagBasic StorageTagBasic;
+  typedef vtkm::cont::StorageTagBasic StorageTag;
+
+  typedef vtkm::cont::ArrayHandle<vtkm::Id, StorageTag> IdArrayHandle;
+
+  typedef vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapterTag>
+      Algorithm;
+
+  typedef vtkm::cont::Timer<DeviceAdapterTag> Timer;
+
+  struct BenchLowerBounds {
+    // Times Algorithm::LowerBounds: an ARRAY_SIZE-element input is searched with
+    // value sets sized 5%, 10%, ..., 30% of ARRAY_SIZE; one line is printed per run.
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+
+      Timer stopwatch;
+
+      std::vector<Value> haystack(ARRAY_SIZE, Value());
+      for (size_t idx = 0; idx != haystack.size(); ++idx){
+        haystack[idx] = TestValue(vtkm::Id(idx), Value());
+      }
+      ValueArrayHandle haystack_handle = vtkm::cont::make_ArrayHandle(haystack);
+
+      for (size_t percent = 5; percent <= 30; percent += 5){
+        const size_t needle_count = (ARRAY_SIZE * percent) / 100;
+        std::vector<Value> needles(needle_count, Value());
+        for (size_t idx = 0; idx != needles.size(); ++idx){
+          needles[idx] = TestValue(vtkm::Id(2 * idx), Value());
+        }
+        ValueArrayHandle needle_handle = vtkm::cont::make_ArrayHandle(needles);
+        IdArrayHandle positions;
+        stopwatch.Reset();
+        Algorithm::LowerBounds(haystack_handle, needle_handle, positions);
+        const vtkm::Float64 seconds = stopwatch.GetElapsedTime();
+        std::cout << "LowerBounds on " << ARRAY_SIZE << " input and "
+          << needle_count << " values took " << seconds << "s\n";
+      }
+    }
+  };
+
+  struct BenchReduce {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+      // One timed Reduce over ARRAY_SIZE TestValue elements, seeded with Value().
+      Timer stopwatch;
+      std::vector<Value> data(ARRAY_SIZE, Value());
+      for (size_t idx = 0; idx != data.size(); ++idx){
+        data[idx] = TestValue(vtkm::Id(idx), Value());
+      }
+      ValueArrayHandle source = vtkm::cont::make_ArrayHandle(data);
+      stopwatch.Reset();
+      Algorithm::Reduce(source, Value());
+      const vtkm::Float64 seconds = stopwatch.GetElapsedTime();
+      std::cout << "Reduce on " << ARRAY_SIZE
+        << " values took " << seconds << "s\n";
+    }
+  };
+
+  struct BenchReduceByKey {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+
+      // Key count sweeps 5%, 10%, ..., 30% of ARRAY_SIZE; the SortByKey pass is untimed.
+      Timer stopwatch;
+      for (size_t percent = 5; percent <= 30; percent += 5){
+        const size_t key_count = (ARRAY_SIZE * percent) / 100;
+        std::vector<vtkm::Id> key_data(ARRAY_SIZE, 0);
+        std::vector<Value> value_data(ARRAY_SIZE, Value());
+        for (size_t idx = 0; idx != value_data.size(); ++idx){
+          key_data[idx] = vtkm::Id(idx % key_count);
+          value_data[idx] = TestValue(vtkm::Id(idx), Value());
+        }
+        IdArrayHandle keys_in = vtkm::cont::make_ArrayHandle(key_data);
+        ValueArrayHandle values_in = vtkm::cont::make_ArrayHandle(value_data);
+        IdArrayHandle reduced_keys;
+        ValueArrayHandle reduced_values;
+        Algorithm::SortByKey(keys_in, values_in);
+        stopwatch.Reset();
+        Algorithm::ReduceByKey(keys_in, values_in, reduced_keys, reduced_values,
+            vtkm::internal::Add());
+        const vtkm::Float64 seconds = stopwatch.GetElapsedTime();
+        std::cout << "ReduceByKey on " << ARRAY_SIZE
+          << " values with " << key_count << " distinct vtkm::Id"
+          << " keys took " << seconds << "s\n";
+      }
+    }
+  };
+
+  struct BenchScanInclusive {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+      // One timed ScanInclusive over ARRAY_SIZE TestValue elements.
+      Timer stopwatch;
+      std::vector<Value> data(ARRAY_SIZE, Value());
+      for (size_t idx = 0; idx != data.size(); ++idx){
+        data[idx] = TestValue(vtkm::Id(idx), Value());
+      }
+      ValueArrayHandle source = vtkm::cont::make_ArrayHandle(data);
+      ValueArrayHandle scanned;
+      stopwatch.Reset();
+      Algorithm::ScanInclusive(source, scanned);
+      const vtkm::Float64 seconds = stopwatch.GetElapsedTime();
+      std::cout << "ScanInclusive on " << ARRAY_SIZE
+        << " values took " << seconds << "s\n";
+    }
+  };
+
+  struct BenchScanExclusive {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+      // One timed ScanExclusive over ARRAY_SIZE TestValue elements.
+      Timer stopwatch;
+      std::vector<Value> data(ARRAY_SIZE, Value());
+      for (size_t idx = 0; idx != data.size(); ++idx){
+        data[idx] = TestValue(vtkm::Id(idx), Value());
+      }
+      ValueArrayHandle source = vtkm::cont::make_ArrayHandle(data);
+      ValueArrayHandle scanned;
+      stopwatch.Reset();
+      Algorithm::ScanExclusive(source, scanned);
+      const vtkm::Float64 seconds = stopwatch.GetElapsedTime();
+      std::cout << "ScanExclusive on " << ARRAY_SIZE
+        << " values took " << seconds << "s\n";
+    }
+  };
+
+  /// Times Algorithm::Sort on four fill patterns of the same ARRAY_SIZE
+  /// vector: ascending index, descending index, index modulo size/4, and
+  /// mt19937 output. Each case refills the vector and makes a new handle
+  /// from it. TODO: decide whether all four patterns are worth measuring.
+  struct BenchSort {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+
+      Timer timer;
+      std::vector<Value> values(ARRAY_SIZE, Value());
+      // Ascending indices (sorted only if TestValue is increasing - TODO confirm)
+      {
+        for (size_t i = 0; i < values.size(); ++i){
+          values[i] = TestValue(vtkm::Id(i), Value());
+        }
+        ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values);
+        timer.Reset();
+        Algorithm::Sort(value_handle);
+        vtkm::Float64 elapsed = timer.GetElapsedTime();
+        std::cout << "Sort on " << ARRAY_SIZE << " already sorted "
+          << " values took " << elapsed << "s\n";
+      }
+      // Descending indices: size, size - 1, ..., 1
+      {
+        for (size_t i = 0; i < values.size(); ++i){
+          values[i] = TestValue(vtkm::Id(values.size() - i), Value());
+        }
+        ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values);
+        timer.Reset();
+        Algorithm::Sort(value_handle);
+        vtkm::Float64 elapsed = timer.GetElapsedTime();
+        std::cout << "Sort on " << ARRAY_SIZE << " reverse-ordered "
+          << " values took " << elapsed << "s\n";
+      }
+      // "Almost sorted": four repetitions of the ascending run 0 .. size/4 - 1
+      {
+        size_t modulus = values.size() / 4;
+        for (size_t i = 0; i < values.size(); ++i){
+          values[i] = TestValue(vtkm::Id(i % modulus), Value());
+        }
+        ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values);
+        timer.Reset();
+        Algorithm::Sort(value_handle);
+        vtkm::Float64 elapsed = timer.GetElapsedTime();
+        std::cout << "Sort on " << ARRAY_SIZE << " almost-sorted "
+          << " values took " << elapsed << "s\n";
+      }
+      // Random indices from a default-constructed mt19937 (no seed is set here)
+      {
+        boost::mt19937 rng;
+        for (size_t i = 0; i < values.size(); ++i){
+          values[i] = TestValue(vtkm::Id(rng()), Value());
+        }
+        ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values);
+        timer.Reset();
+        Algorithm::Sort(value_handle);
+        vtkm::Float64 elapsed = timer.GetElapsedTime();
+        std::cout << "Sort on " << ARRAY_SIZE << " random "
+          << " values took " << elapsed << "s\n";
+      }
+    }
+  };
+
+  struct BenchSortByKey {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+      // Times Algorithm::SortByKey on ARRAY_SIZE random values keyed by vtkm::Id.
+      Timer timer;
+      boost::mt19937 rng; // declared once, so every pass draws fresh numbers
+      // We benchmark 5% to 30% of ARRAY_SIZE keys in 5% increments
+      for (size_t p = 5; p <= 30; p += 5){
+        size_t n_keys = (ARRAY_SIZE * p) / 100;
+        std::vector<Value> values(ARRAY_SIZE, Value());
+        std::vector<vtkm::Id> keys(ARRAY_SIZE, 0);
+        for (size_t i = 0; i < values.size(); ++i){
+          values[i] = TestValue(vtkm::Id(rng()), Value());
+          keys[i] = vtkm::Id(i % n_keys);
+        }
+        ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values);
+        IdArrayHandle key_handle = vtkm::cont::make_ArrayHandle(keys);
+        timer.Reset();
+        Algorithm::SortByKey(key_handle, value_handle); // keys first, then values
+        vtkm::Float64 elapsed = timer.GetElapsedTime();
+        std::cout << "SortByKey on " << ARRAY_SIZE
+          << " random values with " << n_keys << " different vtkm::Id keys took "
+          << elapsed << "s\n";
+      }
+    }
+  };
+
+  struct BenchStreamCompact {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+      // Times both StreamCompact overloads: values only, then values plus stencil.
+      Timer timer;
+      // Target 5%..30% marked entries; integer division makes the real share about 1/modulo
+      for (size_t p = 5; p <= 30; p += 5){
+        size_t n_valid = (ARRAY_SIZE * p) / 100;
+        size_t modulo = ARRAY_SIZE / n_valid;
+        std::vector<Value> values(ARRAY_SIZE, Value());
+        for (size_t i = 0; i < values.size(); ++i){
+          values[i] = i % modulo == 0 ? TestValue(1, Value()) : Value();
+        }
+        ValueArrayHandle value_handle = vtkm::cont::make_ArrayHandle(values);
+        IdArrayHandle out_handle;
+        timer.Reset();
+        Algorithm::StreamCompact(value_handle, out_handle);
+        vtkm::Float64 elapsed = timer.GetElapsedTime();
+        std::cout << "StreamCompact on " << ARRAY_SIZE << " "
+          << " values with " << out_handle.GetNumberOfValues()
+          << " valid values took " << elapsed << "s\n";
+        // Same marking pattern again, this time as an explicit vtkm::Id stencil of 1s and 0s.
+        std::vector<vtkm::Id> stencil(ARRAY_SIZE, 0);
+        for (size_t i = 0; i < stencil.size(); ++i){
+          stencil[i] = i % modulo == 0 ? 1 : vtkm::Id();
+        }
+        IdArrayHandle stencil_handle = vtkm::cont::make_ArrayHandle(stencil);
+        ValueArrayHandle out_val_handle;
+        timer.Reset();
+        Algorithm::StreamCompact(value_handle, stencil_handle, out_val_handle);
+        elapsed = timer.GetElapsedTime();
+        std::cout << "StreamCompact with stencil on " << ARRAY_SIZE
+          << " values with " << out_val_handle.GetNumberOfValues()
+          << " valid values took " << elapsed << "s\n";
+      }
+    }
+  };
+
+  struct BenchUnique {
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+
+      Timer stopwatch;
+      // Distinct-index count sweeps 5%..30% of ARRAY_SIZE; the Sort pass is untimed.
+      for (size_t percent = 5; percent <= 30; percent += 5){
+        const size_t distinct = (ARRAY_SIZE * percent) / 100;
+        std::vector<Value> data(ARRAY_SIZE, Value());
+        for (size_t idx = 0; idx != data.size(); ++idx){
+          data[idx] = TestValue(vtkm::Id(idx % distinct), Value());
+        }
+        ValueArrayHandle handle = vtkm::cont::make_ArrayHandle(data);
+        Algorithm::Sort(handle);
+        stopwatch.Reset();
+        Algorithm::Unique(handle);
+        const vtkm::Float64 seconds = stopwatch.GetElapsedTime();
+        std::cout << "Unique on " << ARRAY_SIZE << " values with "
+          << handle.GetNumberOfValues() << " valid values took "
+          << seconds << "s\n";
+      }
+    }
+  };
+
+  struct BenchUpperBounds {
+    // Times Algorithm::UpperBounds: an ARRAY_SIZE-element input is searched with
+    // value sets sized 5%, 10%, ..., 30% of ARRAY_SIZE; one line is printed per run.
+    template<typename Value>
+    VTKM_CONT_EXPORT void operator()(const Value vtkmNotUsed(v)) const {
+      typedef vtkm::cont::ArrayHandle<Value, StorageTag> ValueArrayHandle;
+
+      Timer stopwatch;
+      std::vector<Value> haystack(ARRAY_SIZE, Value());
+      for (size_t idx = 0; idx != haystack.size(); ++idx){
+        haystack[idx] = TestValue(vtkm::Id(idx), Value());
+      }
+      ValueArrayHandle haystack_handle = vtkm::cont::make_ArrayHandle(haystack);
+
+      for (size_t percent = 5; percent <= 30; percent += 5){
+        const size_t needle_count = (ARRAY_SIZE * percent) / 100;
+        std::vector<Value> needles(needle_count, Value());
+        for (size_t idx = 0; idx != needles.size(); ++idx){
+          needles[idx] = TestValue(vtkm::Id(2 * idx), Value());
+        }
+        ValueArrayHandle needle_handle = vtkm::cont::make_ArrayHandle(needles);
+        IdArrayHandle positions;
+        stopwatch.Reset();
+        Algorithm::UpperBounds(haystack_handle, needle_handle, positions);
+        const vtkm::Float64 seconds = stopwatch.GetElapsedTime();
+        std::cout << "UpperBounds on " << ARRAY_SIZE << " input and "
+          << needle_count << " values took " << seconds << "s\n";
+      }
+    }
+  };
+
+public:
+  /// List of value types that Run() passes to TryTypes for every benchmark functor.
+  struct ValueTypes : vtkm::ListTagBase<vtkm::UInt8, vtkm::UInt32, vtkm::Int32,
+                                        vtkm::Int64, vtkm::Vec<vtkm::Int32, 2>,
+                                        vtkm::Vec<vtkm::UInt8, 4>, vtkm::Float32,
+                                        vtkm::Float64, vtkm::Vec<vtkm::Float64, 3>,
+                                        vtkm::Vec<vtkm::Float32, 4> >{};
+
+
+  static VTKM_CONT_EXPORT int Run(){
+      // Announce and run each benchmark functor over ValueTypes; returns 0 at the end.
+      const std::string section_break = "\n" + DIVIDER;
+      std::cout << DIVIDER << "\nRunning DeviceAdapter benchmarks\n";
+      std::cout << DIVIDER << "\nBenchmarking LowerBounds\n";
+      vtkm::testing::Testing::TryTypes(BenchLowerBounds(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking Reduce\n";
+      vtkm::testing::Testing::TryTypes(BenchReduce(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking ReduceByKey\n";
+      vtkm::testing::Testing::TryTypes(BenchReduceByKey(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking ScanInclusive\n";
+      vtkm::testing::Testing::TryTypes(BenchScanInclusive(), ValueTypes());
+      std::cout << section_break << "\nBenchmarking ScanExclusive\n";
+      vtkm::testing::Testing::TryTypes(BenchScanExclusive(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking Sort\n";
+      vtkm::testing::Testing::TryTypes(BenchSort(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking SortByKey\n";
+      vtkm::testing::Testing::TryTypes(BenchSortByKey(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking StreamCompact\n";
+      vtkm::testing::Testing::TryTypes(BenchStreamCompact(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking Unique\n";
+      vtkm::testing::Testing::TryTypes(BenchUnique(), ValueTypes());
+
+      std::cout << section_break << "\nBenchmarking UpperBounds\n";
+      vtkm::testing::Testing::TryTypes(BenchUpperBounds(), ValueTypes());
+      return 0;
+  }
+};
+
+#undef ARRAY_SIZE
+
+}
+} // namespace vtkm::benchmarking
+
+#endif
+
diff --git a/vtkm/benchmarking/CMakeLists.txt b/vtkm/benchmarking/CMakeLists.txt
new file mode 100644
index 000000000..8934552d3
--- /dev/null
+++ b/vtkm/benchmarking/CMakeLists.txt
@@ -0,0 +1,35 @@
+##============================================================================
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##
+##  Copyright 2014 Sandia Corporation.
+##  Copyright 2014 UT-Battelle, LLC.
+##  Copyright 2014 Los Alamos National Security.
+##
+##  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+##  the U.S. Government retains certain rights in this software.
+##
+##  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+##  Laboratory (LANL), the U.S. Government retains certain rights in
+##  this software.
+##============================================================================
+
+#skip all configure-time work unless the benchmark executables get built
+if(VTKm_ENABLE_BENCHMARKS AND VTKm_ENABLE_TESTING)
+  set(benchmarks BenchmarkDeviceAdapter.cxx)
+  vtkm_save_benchmarks(${benchmarks})
+
+  #the serial adapter is always benchmarked; the others only when enabled
+  vtkm_benchmarks(VTKM_DEVICE_ADAPTER_SERIAL)
+  if (VTKm_ENABLE_CUDA)
+    vtkm_benchmarks(VTKM_DEVICE_ADAPTER_CUDA)
+  endif()
+  if (VTKm_ENABLE_TBB)
+    vtkm_benchmarks(VTKM_DEVICE_ADAPTER_TBB)
+  endif()
+endif()
+