diff --git a/Utilities/Scripts/benchCompare.py b/Utilities/Scripts/benchCompare.py deleted file mode 100755 index c7c6b42a8..000000000 --- a/Utilities/Scripts/benchCompare.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -# -# Compares the output from BenchmarkDeviceAdapter from the serial -# device to a parallel device and prints a table containing the results. -# -# Example usage: -# -# $ BenchmarkDeviceAdapter_SERIAL > serial.out -# $ BenchmarkDeviceAdapter_TBB > tbb.out -# $ benchCompare.py serial.out tbb.out -# -# -# The number of threads (optional -- only used to generate the "Warn" column) -maxThreads = 4 -# -# Print debugging output: -doDebug = False -# -# End config options. - -import re -import sys - -assert(len(sys.argv) == 3) - -def debug(str): - if (doDebug): print(str) - -# Parses "*** vtkm::Float64 ***************" --> vtkm::Float64 -typeParser = re.compile("\\*{3} ([^*]+) on device ([^*]+) \\*{15}") - -# Parses "Benchmark 'Benchmark name' results:" --> Benchmark name -nameParser = re.compile("Benchmark '([^-]+)' results:") - -# Parses "mean = 0.0125s" --> 0.0125 -meanParser = re.compile("\\s+mean = ([0-9.Ee+-]+)s") - -# Parses "std dev = 0.0125s" --> 0.0125 -stdDevParser = re.compile("\\s+std dev = ([naN0-9.Ee+-]+)s") - -serialFilename = sys.argv[1] -parallelFilename = sys.argv[2] - -serialFile = open(serialFilename, 'r') -parallelFile = open(parallelFilename, 'r') - -class BenchKey: - def __init__(self, name_, type_): - self.name = name_ - self.type = type_ - - def __eq__(self, other): - return self.name == other.name and self.type == other.type - - def __lt__(self, other): - if self.name < other.name: return True - elif self.name > other.name: return False - else: return self.type < other.type - - def __hash__(self): - return (self.name + self.type).__hash__() - -class BenchData: - def __init__(self, mean_, stdDev_): - self.mean = mean_ - self.stdDev = stdDev_ - -def parseFile(f, benchmarks): - type = "" - bench = "" - mean = -1. - stdDev = -1. - for line in f: - debug("Line: {}".format(line)) - - typeRes = typeParser.match(line) - if typeRes: - type = typeRes.group(1) - debug("Found type: {}".format(type)) - continue - - nameRes = nameParser.match(line) - if nameRes: - name = nameRes.group(1) - debug("Found name: {}".format(name)) - continue - - meanRes = meanParser.match(line) - if meanRes: - mean = float(meanRes.group(1)) - debug("Found mean: {}".format(mean)) - continue - - stdDevRes = stdDevParser.match(line) - if stdDevRes: - stdDev = float(stdDevRes.group(1)) - debug("Found stddev: {}".format(stdDev)) - - # stdDev is always the last parse for a given benchmark, add entry now - benchmarks[BenchKey(name, type)] = BenchData(mean, stdDev) - debug("{} records found.".format(len(benchmarks))) - - mean = -1. - stdDev = -1. 
- - continue - -serialBenchmarks = {} -parallelBenchmarks = {} - -parseFile(serialFile, serialBenchmarks) -parseFile(parallelFile, parallelBenchmarks) - -serialKeys = set(serialBenchmarks.keys()) -parallelKeys = set(parallelBenchmarks.keys()) - -commonKeys = sorted(list(serialKeys.intersection(parallelKeys))) - -serialOnlyKeys = sorted(list(serialKeys.difference(parallelKeys))) -parallelOnlyKeys = sorted(list(parallelKeys.difference(serialKeys))) - -debug("{} serial keys\n{} parallel keys\n{} common keys\n{} serialOnly keys\n{} parallelOnly keys.".format( - len(serialKeys), len(parallelKeys), len(commonKeys), len(serialOnlyKeys), len(parallelOnlyKeys))) - -if len(serialOnlyKeys) > 0: - print("Keys found only in serial:") - for k in serialOnlyKeys: - print("%s (%s)"%(k.name, k.type)) - print("") - -if len(parallelOnlyKeys) > 0: - print("Keys found only in parallel:") - for k in parallelOnlyKeys: - print("%s (%s)"%(k.name, k.type)) - print("") - -print("Comparison:") -print("| %7s | %4s | %8s %8s | %8s %8s | %s (%s) |"%( - "Speedup", "Warn", "serial", "", "parallel", "", "Benchmark", "Type")) -print("|-%7s-|-%4s-|-%8s----%8s-|-%8s----%8s-|-%s--%s--|"%( - "-"*7, "-"*4, "-"*8, "-"*8, "-"*8, "-"*8, "-"*9, "-"*4)) -for key in commonKeys: - sData = serialBenchmarks[key] - pData = parallelBenchmarks[key] - speedup = sData.mean / pData.mean if pData.mean != 0. else 0. - if speedup > maxThreads * .9: - flag = " " - elif speedup > maxThreads * .75: - flag = "! " - elif speedup > maxThreads * .5: - flag = "!! " - elif speedup > maxThreads * .25: - flag = "!!! " - else: - flag = "!!!!" - print("| %7.3f | %4s | %08.6f +- %08.6f | %08.6f +- %08.6f | %s (%s) |"%( - speedup, flag, sData.mean, sData.stdDev, pData.mean, pData.stdDev, key.name, key.type)) diff --git a/Utilities/Scripts/benchSummary.py b/Utilities/Scripts/benchSummary.py deleted file mode 100755 index 722add00c..000000000 --- a/Utilities/Scripts/benchSummary.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python -# -# Prints a concise summary of a benchmark output as a TSV blob. -# -# Example usage: -# -# $ BenchmarkXXX_DEVICE > bench.out -# $ benchSummary.py bench.out -# -# Options SortByType, SortByName, or SortByMean may be passed after the -# filename to sort the output by the indicated quantity. If no sort option -# is provided, the output order matches the input. If multiple options are -# specified, the list will be sorted repeatedly in the order requested. 
- -import re -import sys - -assert(len(sys.argv) >= 2) - -# Parses "*** vtkm::Float64 ***************" --> vtkm::Float64 -typeParser = re.compile("\\*{3} ([^*]+) \\*{15}") - -# Parses "Benchmark 'Benchmark name' results:" --> Benchmark name -nameParser = re.compile("Benchmark '([^-]+)' results:") - -# Parses "mean = 0.0125s" --> 0.0125 -meanParser = re.compile("\\s+mean = ([0-9.Ee+-]+)s") - -# Parses "std dev = 0.0125s" --> 0.0125 -stdDevParser = re.compile("\\s+std dev = ([naN0-9.Ee+-]+)s") - -filename = sys.argv[1] -benchFile = open(filename, 'r') - -sortOpt = None -if len(sys.argv) > 2: - sortOpt = sys.argv[2:] - -class BenchKey: - def __init__(self, name_, type_): - self.name = name_ - self.type = type_ - - def __eq__(self, other): - return self.name == other.name and self.type == other.type - - def __lt__(self, other): - if self.name < other.name: return True - elif self.name > other.name: return False - else: return self.type < other.type - - def __hash__(self): - return (self.name + self.type).__hash__() - -class BenchData: - def __init__(self, mean_, stdDev_): - self.mean = mean_ - self.stdDev = stdDev_ - -def parseFile(f, benchmarks): - type = "" - bench = "" - mean = -1. - stdDev = -1. - for line in f: - typeRes = typeParser.match(line) - if typeRes: - type = typeRes.group(1) - continue - - nameRes = nameParser.match(line) - if nameRes: - name = nameRes.group(1) - continue - - meanRes = meanParser.match(line) - if meanRes: - mean = float(meanRes.group(1)) - continue - - stdDevRes = stdDevParser.match(line) - if stdDevRes: - stdDev = float(stdDevRes.group(1)) - - # stdDev is always the last parse for a given benchmark, add entry now - benchmarks[BenchKey(name, type)] = BenchData(mean, stdDev) - - mean = -1. - stdDev = -1. - - continue - -benchmarks = {} -parseFile(benchFile, benchmarks) - -# Sort keys by type: -keys = benchmarks.keys() -if sortOpt: - for opt in sortOpt: - if opt.lower() == "sortbytype": - keys = sorted(keys, key=lambda k: k.type) - elif opt.lower() == "sortbyname": - keys = sorted(keys, key=lambda k: k.name) - elif opt.lower() == "sortbymean": - keys = sorted(keys, key=lambda k: benchmarks[k].mean) - -print("# Summary: (%s)"%filename) -print("%-9s\t%-9s\t%-9s\t%-s"%("Mean", "Stdev", "Stdev%", "Benchmark (type)")) -for key in keys: - data = benchmarks[key] - print("%9.6f\t%9.6f\t%9.6f\t%s (%s)"%(data.mean, data.stdDev, data.stdDev / data.mean * 100., key.name, key.type)) diff --git a/Utilities/Scripts/benchSummaryWithBaselines.py b/Utilities/Scripts/benchSummaryWithBaselines.py deleted file mode 100755 index c875b07ba..000000000 --- a/Utilities/Scripts/benchSummaryWithBaselines.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -# -# Prints a concise summary of a benchmark output as a TSV blob. Benchmarks are -# expected to have "Baseline" in the name, and a matching benchmark with the -# same name but Baseline replaced with something else. For example, -# -# Baseline benchmark name: "Some benchmark: Baseline, Size=4" -# Test benchmark name: "Some benchmark: Blahblah, Size=4" -# -# The output will print the baseline, test, and overhead times for the -# benchmarks. -# -# Example usage: -# -# $ BenchmarkXXX_DEVICE > bench.out -# $ benchSummaryWithBaselines.py bench.out -# -# Options SortByType, SortByName, SortByOverhead, or SortByRatio -# (testtime/baseline) may be passed after the filename to sort the output by -# the indicated quantity. If no sort option is provided, the output order -# matches the input. 
If multiple options are specified, the list will be sorted -# repeatedly in the order requested. - -import re -import sys - -assert(len(sys.argv) >= 2) - -# Parses "*** vtkm::Float64 ***************" --> vtkm::Float64 -typeParser = re.compile("\\*{3} ([^*]+) \\*{15}") - -# Parses "Benchmark 'Benchmark name' results:" --> Benchmark name -nameParser = re.compile("Benchmark '([^-]+)' results:") - -# Parses "mean = 0.0125s" --> 0.0125 -meanParser = re.compile("\\s+mean = ([0-9.Ee+-]+)s") - -# Parses "std dev = 0.0125s" --> 0.0125 -stdDevParser = re.compile("\\s+std dev = ([naN0-9.Ee+-]+)s") - -# Parses "SomeText Baseline Other Text" --> ("SomeText ", " Other Text") -baselineParser = re.compile("(.*)Baseline(.*)") - -filename = sys.argv[1] -benchFile = open(filename, 'r') - -sortOpt = None -if len(sys.argv) > 2: - sortOpt = sys.argv[2:] - -class BenchKey: - def __init__(self, name_, type_): - self.name = name_ - self.type = type_ - - def __eq__(self, other): - return self.name == other.name and self.type == other.type - - def __lt__(self, other): - if self.name < other.name: return True - elif self.name > other.name: return False - else: return self.type < other.type - - def __hash__(self): - return (self.name + self.type).__hash__() - -class BenchData: - def __init__(self, mean_, stdDev_): - self.mean = mean_ - self.stdDev = stdDev_ - -def parseFile(f, benchmarks): - type = "" - bench = "" - mean = -1. - stdDev = -1. - for line in f: - typeRes = typeParser.match(line) - if typeRes: - type = typeRes.group(1) - continue - - nameRes = nameParser.match(line) - if nameRes: - name = nameRes.group(1) - continue - - meanRes = meanParser.match(line) - if meanRes: - mean = float(meanRes.group(1)) - continue - - stdDevRes = stdDevParser.match(line) - if stdDevRes: - stdDev = float(stdDevRes.group(1)) - - # stdDev is always the last parse for a given benchmark, add entry now - benchmarks[BenchKey(name, type)] = BenchData(mean, stdDev) - - mean = -1. - stdDev = -1. 
- - continue - -class BaselinedBenchData: - def __init__(self, baseline, test): - self.baseline = baseline.mean - self.test = test.mean - self.overhead = test.mean - baseline.mean - -def findBaselines(benchmarks): - result = {} - - for baseKey in benchmarks.keys(): - # Look for baseline entries - baselineRes = baselineParser.match(baseKey.name) - if baselineRes: - prefix = baselineRes.group(1) - suffix = baselineRes.group(2) - - # Find the test entry matching the baseline: - for testKey in benchmarks.keys(): - if baseKey.type != testKey.type: # Need same type - continue - if baseKey.name == testKey.name: # Skip the base key - continue - if testKey.name.startswith(prefix) and testKey.name.endswith(suffix): - newName = (prefix + suffix).replace(", ,", ",") - newKey = BenchKey(newName, testKey.type) - newVal = BaselinedBenchData(benchmarks[baseKey], benchmarks[testKey]) - result[newKey] = newVal - return result - -benchmarks = {} -parseFile(benchFile, benchmarks) -benchmarks = findBaselines(benchmarks) - -# Sort keys by type: -keys = benchmarks.keys() -if sortOpt: - for opt in sortOpt: - if opt.lower() == "sortbytype": - keys = sorted(keys, key=lambda k: k.type) - elif opt.lower() == "sortbyname": - keys = sorted(keys, key=lambda k: k.name) - elif opt.lower() == "sortbyoverhead": - keys = sorted(keys, key=lambda k: benchmarks[k].overhead) - elif opt.lower() == "sortbyratio": - keys = sorted(keys, key=lambda k: benchmarks[k].overhead / benchmarks[k].baseline) - -print("# Summary: (%s)"%filename) -print("%-9s\t%-9s\t%-9s\t%-9s\t%-s"%("Baseline", "TestTime", "Overhead", "Test/Base", "Benchmark (type)")) -for key in keys: - data = benchmarks[key] - print("%9.6f\t%9.6f\t%9.6f\t%9.6f\t%s (%s)"%(data.baseline, data.test, - data.overhead, data.test / data.baseline, key.name, key.type)) diff --git a/Utilities/Scripts/compare-benchmarks.py b/Utilities/Scripts/compare-benchmarks.py new file mode 100755 index 000000000..9aa676d6f --- /dev/null +++ b/Utilities/Scripts/compare-benchmarks.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +""" +compare-benchmarks.py - VTKm + Google Benchmarks compare.py +""" + +import getopt +import subprocess +import sys +import time +import os + +CURRENT_DIR = os.path.dirname(os.path.realpath(__file__)) +COMPARE_PY_PATH = os.path.join(CURRENT_DIR, 'compare.py') +COMPARE_PY = sys.executable + " " + COMPARE_PY_PATH + +class Bench(): + def __init__(self): + self.__cmd = None + + @property + def cmd(self): + return self.__cmd + + @cmd.setter + def cmd(self, c): + self.__cmd = c + + def launch(self): + output_file = "bench-%d.json" % time.time() + cmd_exec = "%s --benchmark_out=%s --benchmark_out_format=json" \ + % (self.cmd, output_file) + print(cmd_exec) + subprocess.call(cmd_exec, shell=True) + return output_file + +def print_help(error_msg = None): + if error_msg != None: + print(error_msg) + + print("usage: compare-benchmarks \n" \ + " --benchmark1=' [arg1] [arg2] ...'"\ + " [--filter1=]\n"\ + " --benchmark2=' [arg1] [arg2] ...'"\ + " [--filter2=]\n"\ + " -- [-opt] benchmarks|filters|benchmarksfiltered\n\n" \ + "compare.py help:") + + subprocess.call(COMPARE_PY, shell=True) + sys.exit(0) + +# ----------------------------------------------------------------------------- +def main(): + is_filters = False + filter1 = str() + filter2 = str() + bench1 = Bench() + bench2 = Bench() + + options, remainder = getopt.gnu_getopt(sys.argv[1:], '', + ['help','benchmark1=', 'benchmark2=', 'filter1=', 'filter2=']) + + for opt, arg in options: + if opt == "--benchmark1": + bench1.cmd = arg + + 
if opt == "--benchmark2": + bench2.cmd = arg + + if opt == "--filter1": + filter1 = arg + + if opt == "--filter2": + filter2 = arg + + if opt == "--help": + print_help() + + if bench1.cmd == None: + print_help("ERROR: no benchmarks chosen") + + for arg in remainder: + if arg == "filters": + is_filters = True + + if is_filters and bench2.cmd != None: + print_help("ERROR: filters option can only accept --benchmark1= and --filter1") + + b1_output = bench1.launch() + b2_output = bench2.launch() if not is_filters else filter1 + " " + filter2 + + cmd = "%s %s %s %s" % (COMPARE_PY, " ".join(remainder), b1_output, b2_output) + print(cmd) + subprocess.call(cmd, shell=True) + + os.remove(b1_output) + + if not is_filters: + os.remove(b2_output) + +if __name__ == '__main__': + main() diff --git a/Utilities/Scripts/compare.py b/Utilities/Scripts/compare.py new file mode 100755 index 000000000..539ace6fb --- /dev/null +++ b/Utilities/Scripts/compare.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python + +import unittest +""" +compare.py - versatile benchmark output compare tool +""" + +import argparse +from argparse import ArgumentParser +import sys +import gbench +from gbench import util, report +from gbench.util import * + + +def check_inputs(in1, in2, flags): + """ + Perform checking on the user provided inputs and diagnose any abnormalities + """ + in1_kind, in1_err = classify_input_file(in1) + in2_kind, in2_err = classify_input_file(in2) + output_file = find_benchmark_flag('--benchmark_out=', flags) + output_type = find_benchmark_flag('--benchmark_out_format=', flags) + if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file: + print(("WARNING: '--benchmark_out=%s' will be passed to both " + "benchmarks causing it to be overwritten") % output_file) + if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0: + print("WARNING: passing optional flags has no effect since both " + "inputs are JSON") + if output_type is not None and output_type != 'json': + print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`" + " is not supported.") % output_type) + sys.exit(1) + + +def create_parser(): + parser = ArgumentParser( + description='versatile benchmark output compare tool') + + parser.add_argument( + '-a', + '--display_aggregates_only', + dest='display_aggregates_only', + action="store_true", + help="If there are repetitions, by default, we display everything - the" + " actual runs, and the aggregates computed. Sometimes, it is " + "desirable to only view the aggregates. E.g. when there are a lot " + "of repetitions. Do note that only the display is affected. " + "Internally, all the actual runs are still used, e.g. for U test.") + + utest = parser.add_argument_group() + utest.add_argument( + '--no-utest', + dest='utest', + default=True, + action="store_false", + help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS)) + alpha_default = 0.05 + utest.add_argument( + "--alpha", + dest='utest_alpha', + default=alpha_default, + type=float, + help=("significance level alpha. 
if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") % + alpha_default) + + subparsers = parser.add_subparsers( + help='This tool has multiple modes of operation:', + dest='mode') + + parser_a = subparsers.add_parser( + 'benchmarks', + help='The most simple use-case, compare all the output of these two benchmarks') + baseline = parser_a.add_argument_group( + 'baseline', 'The benchmark baseline') + baseline.add_argument( + 'test_baseline', + metavar='test_baseline', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + contender = parser_a.add_argument_group( + 'contender', 'The benchmark that will be compared against the baseline') + contender.add_argument( + 'test_contender', + metavar='test_contender', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + parser_a.add_argument( + 'benchmark_options', + metavar='benchmark_options', + nargs=argparse.REMAINDER, + help='Arguments to pass when running benchmark executables') + + parser_b = subparsers.add_parser( + 'filters', help='Compare filter one with the filter two of benchmark') + baseline = parser_b.add_argument_group( + 'baseline', 'The benchmark baseline') + baseline.add_argument( + 'test', + metavar='test', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + baseline.add_argument( + 'filter_baseline', + metavar='filter_baseline', + type=str, + nargs=1, + help='The first filter, that will be used as baseline') + contender = parser_b.add_argument_group( + 'contender', 'The benchmark that will be compared against the baseline') + contender.add_argument( + 'filter_contender', + metavar='filter_contender', + type=str, + nargs=1, + help='The second filter, that will be compared against the baseline') + parser_b.add_argument( + 'benchmark_options', + metavar='benchmark_options', + nargs=argparse.REMAINDER, + help='Arguments to pass when running benchmark executables') + + parser_c = subparsers.add_parser( + 'benchmarksfiltered', + help='Compare filter one of first benchmark with filter two of the second benchmark') + baseline = parser_c.add_argument_group( + 'baseline', 'The benchmark baseline') + baseline.add_argument( + 'test_baseline', + metavar='test_baseline', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + baseline.add_argument( + 'filter_baseline', + metavar='filter_baseline', + type=str, + nargs=1, + help='The first filter, that will be used as baseline') + contender = parser_c.add_argument_group( + 'contender', 'The benchmark that will be compared against the baseline') + contender.add_argument( + 'test_contender', + metavar='test_contender', + type=argparse.FileType('r'), + nargs=1, + help='The second benchmark executable or JSON output file, that will be compared against the baseline') + contender.add_argument( + 'filter_contender', + metavar='filter_contender', + type=str, + nargs=1, + help='The second filter, that will be compared against the baseline') + parser_c.add_argument( + 'benchmark_options', + metavar='benchmark_options', + nargs=argparse.REMAINDER, + help='Arguments to pass when running benchmark executables') + + return parser + + +def main(): + # Parse the command line flags + parser = create_parser() + args, unknown_args = parser.parse_known_args() + if args.mode is None: + parser.print_help() + exit(1) + assert not unknown_args + 
benchmark_options = args.benchmark_options + + if args.mode == 'benchmarks': + test_baseline = args.test_baseline[0].name + test_contender = args.test_contender[0].name + filter_baseline = '' + filter_contender = '' + + # NOTE: if test_baseline == test_contender, you are analyzing the stdev + + description = 'Comparing %s to %s' % (test_baseline, test_contender) + elif args.mode == 'filters': + test_baseline = args.test[0].name + test_contender = args.test[0].name + filter_baseline = args.filter_baseline[0] + filter_contender = args.filter_contender[0] + + # NOTE: if filter_baseline == filter_contender, you are analyzing the + # stdev + + description = 'Comparing %s to %s (from %s)' % ( + filter_baseline, filter_contender, args.test[0].name) + elif args.mode == 'benchmarksfiltered': + test_baseline = args.test_baseline[0].name + test_contender = args.test_contender[0].name + filter_baseline = args.filter_baseline[0] + filter_contender = args.filter_contender[0] + + # NOTE: if test_baseline == test_contender and + # filter_baseline == filter_contender, you are analyzing the stdev + + description = 'Comparing %s (from %s) to %s (from %s)' % ( + filter_baseline, test_baseline, filter_contender, test_contender) + else: + # should never happen + print("Unrecognized mode of operation: '%s'" % args.mode) + parser.print_help() + exit(1) + + check_inputs(test_baseline, test_contender, benchmark_options) + + if args.display_aggregates_only: + benchmark_options += ['--benchmark_display_aggregates_only=true'] + + options_baseline = [] + options_contender = [] + + if filter_baseline and filter_contender: + options_baseline = ['--benchmark_filter=%s' % filter_baseline] + options_contender = ['--benchmark_filter=%s' % filter_contender] + + # Run the benchmarks and report the results + json1 = json1_orig = gbench.util.run_or_load_benchmark( + test_baseline, benchmark_options + options_baseline) + json2 = json2_orig = gbench.util.run_or_load_benchmark( + test_contender, benchmark_options + options_contender) + + # Now, filter the benchmarks so that the difference report can work + if filter_baseline and filter_contender: + replacement = '[%s vs. 
%s]' % (filter_baseline, filter_contender) + json1 = gbench.report.filter_benchmark( + json1_orig, filter_baseline, replacement) + json2 = gbench.report.filter_benchmark( + json2_orig, filter_contender, replacement) + + # Diff and output + output_lines = gbench.report.generate_difference_report( + json1, json2, args.display_aggregates_only, + args.utest, args.utest_alpha) + print(description) + for ln in output_lines: + print(ln) + + +class TestParser(unittest.TestCase): + def setUp(self): + self.parser = create_parser() + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'gbench', + 'Inputs') + self.testInput0 = os.path.join(testInputs, 'test1_run1.json') + self.testInput1 = os.path.join(testInputs, 'test1_run2.json') + + def test_benchmarks_basic(self): + parsed = self.parser.parse_args( + ['benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_without_utest(self): + parsed = self.parser.parse_args( + ['--no-utest', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertFalse(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.05) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_display_aggregates_only(self): + parsed = self.parser.parse_args( + ['-a', 'benchmarks', self.testInput0, self.testInput1]) + self.assertTrue(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_with_utest_alpha(self): + parsed = self.parser.parse_args( + ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.314) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_without_utest_with_utest_alpha(self): + parsed = self.parser.parse_args( + ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertFalse(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.314) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_with_remainder(self): + parsed = self.parser.parse_args( + ['benchmarks', self.testInput0, self.testInput1, 'd']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + 
self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.benchmark_options, ['d']) + + def test_benchmarks_with_remainder_after_doubleminus(self): + parsed = self.parser.parse_args( + ['benchmarks', self.testInput0, self.testInput1, '--', 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.benchmark_options, ['e']) + + def test_filters_basic(self): + parsed = self.parser.parse_args( + ['filters', self.testInput0, 'c', 'd']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'filters') + self.assertEqual(parsed.test[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.filter_contender[0], 'd') + self.assertFalse(parsed.benchmark_options) + + def test_filters_with_remainder(self): + parsed = self.parser.parse_args( + ['filters', self.testInput0, 'c', 'd', 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'filters') + self.assertEqual(parsed.test[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.filter_contender[0], 'd') + self.assertEqual(parsed.benchmark_options, ['e']) + + def test_filters_with_remainder_after_doubleminus(self): + parsed = self.parser.parse_args( + ['filters', self.testInput0, 'c', 'd', '--', 'f']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'filters') + self.assertEqual(parsed.test[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.filter_contender[0], 'd') + self.assertEqual(parsed.benchmark_options, ['f']) + + def test_benchmarksfiltered_basic(self): + parsed = self.parser.parse_args( + ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarksfiltered') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.filter_contender[0], 'e') + self.assertFalse(parsed.benchmark_options) + + def test_benchmarksfiltered_with_remainder(self): + parsed = self.parser.parse_args( + ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarksfiltered') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.filter_contender[0], 'e') + self.assertEqual(parsed.benchmark_options[0], 'f') + + def test_benchmarksfiltered_with_remainder_after_doubleminus(self): + parsed = self.parser.parse_args( + ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarksfiltered') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + 
self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.filter_contender[0], 'e') + self.assertEqual(parsed.benchmark_options[0], 'g') + + +if __name__ == '__main__': + # unittest.main() + main() + +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; +# kate: indent-mode python; remove-trailing-spaces modified; diff --git a/Utilities/Scripts/gbench/__init__.py b/Utilities/Scripts/gbench/__init__.py new file mode 100644 index 000000000..fce1a1acf --- /dev/null +++ b/Utilities/Scripts/gbench/__init__.py @@ -0,0 +1,8 @@ +"""Google Benchmark tooling""" + +__author__ = 'Eric Fiselier' +__email__ = 'eric@efcs.ca' +__versioninfo__ = (0, 5, 0) +__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev' + +__all__ = [] diff --git a/Utilities/Scripts/gbench/report.py b/Utilities/Scripts/gbench/report.py new file mode 100644 index 000000000..5bd3a8d85 --- /dev/null +++ b/Utilities/Scripts/gbench/report.py @@ -0,0 +1,541 @@ +import unittest +"""report.py - Utilities for reporting statistics about benchmark results +""" +import os +import re +import copy + +from scipy.stats import mannwhitneyu + + +class BenchmarkColor(object): + def __init__(self, name, code): + self.name = name + self.code = code + + def __repr__(self): + return '%s%r' % (self.__class__.__name__, + (self.name, self.code)) + + def __format__(self, format): + return self.code + + +# Benchmark Colors Enumeration +BC_NONE = BenchmarkColor('NONE', '') +BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m') +BC_CYAN = BenchmarkColor('CYAN', '\033[96m') +BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m') +BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m') +BC_HEADER = BenchmarkColor('HEADER', '\033[92m') +BC_WARNING = BenchmarkColor('WARNING', '\033[93m') +BC_WHITE = BenchmarkColor('WHITE', '\033[97m') +BC_FAIL = BenchmarkColor('FAIL', '\033[91m') +BC_ENDC = BenchmarkColor('ENDC', '\033[0m') +BC_BOLD = BenchmarkColor('BOLD', '\033[1m') +BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m') + +UTEST_MIN_REPETITIONS = 2 +UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better. +UTEST_COL_NAME = "_pvalue" + + +def color_format(use_color, fmt_str, *args, **kwargs): + """ + Return the result of 'fmt_str.format(*args, **kwargs)' after transforming + 'args' and 'kwargs' according to the value of 'use_color'. If 'use_color' + is False then all color codes in 'args' and 'kwargs' are replaced with + the empty string. + """ + assert use_color is True or use_color is False + if not use_color: + args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE + for arg in args] + kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE + for key, arg in kwargs.items()} + return fmt_str.format(*args, **kwargs) + + +def find_longest_name(benchmark_list): + """ + Return the length of the longest benchmark name in a given list of + benchmark JSON objects + """ + longest_name = 1 + for bc in benchmark_list: + if len(bc['name']) > longest_name: + longest_name = len(bc['name']) + return longest_name + + +def calculate_change(old_val, new_val): + """ + Return a float representing the decimal change between old_val and new_val. 
+ """ + if old_val == 0 and new_val == 0: + return 0.0 + if old_val == 0: + return float(new_val - old_val) / (float(old_val + new_val) / 2) + return float(new_val - old_val) / abs(old_val) + + +def filter_benchmark(json_orig, family, replacement=""): + """ + Apply a filter to the json, and only leave the 'family' of benchmarks. + """ + regex = re.compile(family) + filtered = {} + filtered['benchmarks'] = [] + for be in json_orig['benchmarks']: + if not regex.search(be['name']): + continue + filteredbench = copy.deepcopy(be) # Do NOT modify the old name! + filteredbench['name'] = regex.sub(replacement, filteredbench['name']) + filtered['benchmarks'].append(filteredbench) + return filtered + + +def get_unique_benchmark_names(json): + """ + While *keeping* the order, give all the unique 'names' used for benchmarks. + """ + seen = set() + uniqued = [x['name'] for x in json['benchmarks'] + if x['name'] not in seen and + (seen.add(x['name']) or True)] + return uniqued + + +def intersect(list1, list2): + """ + Given two lists, get a new list consisting of the elements only contained + in *both of the input lists*, while preserving the ordering. + """ + return [x for x in list1 if x in list2] + + +def is_potentially_comparable_benchmark(x): + return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x) + + +def partition_benchmarks(json1, json2): + """ + While preserving the ordering, find benchmarks with the same names in + both of the inputs, and group them. + (i.e. partition/filter into groups with common name) + """ + json1_unique_names = get_unique_benchmark_names(json1) + json2_unique_names = get_unique_benchmark_names(json2) + names = intersect(json1_unique_names, json2_unique_names) + partitions = [] + for name in names: + time_unit = None + # Pick the time unit from the first entry of the lhs benchmark. + # We should be careful not to crash with unexpected input. + for x in json1['benchmarks']: + if (x['name'] == name and is_potentially_comparable_benchmark(x)): + time_unit = x['time_unit'] + break + if time_unit is None: + continue + # Filter by name and time unit. + # All the repetitions are assumed to be comparable. + lhs = [x for x in json1['benchmarks'] if x['name'] == name and + x['time_unit'] == time_unit] + rhs = [x for x in json2['benchmarks'] if x['name'] == name and + x['time_unit'] == time_unit] + partitions.append([lhs, rhs]) + return partitions + + +def extract_field(partition, field_name): + # The count of elements may be different. We want *all* of them. + lhs = [x[field_name] for x in partition[0]] + rhs = [x[field_name] for x in partition[1]] + return [lhs, rhs] + +def calc_utest(timings_cpu, timings_time): + min_rep_cnt = min(len(timings_time[0]), + len(timings_time[1]), + len(timings_cpu[0]), + len(timings_cpu[1])) + + # Does *everything* has at least UTEST_MIN_REPETITIONS repetitions? 
+ if min_rep_cnt < UTEST_MIN_REPETITIONS: + return False, None, None + + time_pvalue = mannwhitneyu( + timings_time[0], timings_time[1], alternative='two-sided').pvalue + cpu_pvalue = mannwhitneyu( + timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue + + return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue + +def print_utest(partition, utest_alpha, first_col_width, use_color=True): + def get_utest_color(pval): + return BC_FAIL if pval >= utest_alpha else BC_OKGREEN + + timings_time = extract_field(partition, 'real_time') + timings_cpu = extract_field(partition, 'cpu_time') + have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time) + + # Check if we failed miserably with minimum required repetitions for utest + if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None: + return [] + + dsc = "U Test, Repetitions: {} vs {}".format( + len(timings_cpu[0]), len(timings_cpu[1])) + dsc_color = BC_OKGREEN + + # We still got some results to show but issue a warning about it. + if not have_optimal_repetitions: + dsc_color = BC_WARNING + dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format( + UTEST_OPTIMAL_REPETITIONS) + + special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}" + + last_name = partition[0][0]['name'] + return [color_format(use_color, + special_str, + BC_HEADER, + "{}{}".format(last_name, UTEST_COL_NAME), + first_col_width, + get_utest_color(time_pvalue), time_pvalue, + get_utest_color(cpu_pvalue), cpu_pvalue, + dsc_color, dsc, + endc=BC_ENDC)] + + +def generate_difference_report( + json1, + json2, + display_aggregates_only=False, + utest=False, + utest_alpha=0.05, + use_color=True): + """ + Calculate and report the difference between each test of two benchmarks + runs specified as 'json1' and 'json2'. + """ + assert utest is True or utest is False + first_col_width = find_longest_name(json1['benchmarks']) + + def find_test(name): + for b in json2['benchmarks']: + if b['name'] == name: + return b + return None + + first_col_width = max( + first_col_width, + len('Benchmark')) + first_col_width += len(UTEST_COL_NAME) + first_line = "{:<{}s}Time CPU Time Old Time New CPU Old CPU New".format( + 'Benchmark', 12 + first_col_width) + output_strs = [first_line, '-' * len(first_line)] + + partitions = partition_benchmarks(json1, json2) + for partition in partitions: + # Careful, we may have different repetition count. + for i in range(min(len(partition[0]), len(partition[1]))): + bn = partition[0][i] + other_bench = partition[1][i] + + # *If* we were asked to only display aggregates, + # and if it is non-aggregate, then skip it. 
+ if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench: + assert bn['run_type'] == other_bench['run_type'] + if bn['run_type'] != 'aggregate': + continue + + fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" + + def get_color(res): + if res > 0.05: + return BC_FAIL + elif res > -0.07: + return BC_WHITE + else: + return BC_CYAN + + tres = calculate_change(bn['real_time'], other_bench['real_time']) + cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) + output_strs += [color_format(use_color, + fmt_str, + BC_HEADER, + bn['name'], + first_col_width, + get_color(tres), + tres, + get_color(cpures), + cpures, + bn['real_time'], + other_bench['real_time'], + bn['cpu_time'], + other_bench['cpu_time'], + endc=BC_ENDC)] + + # After processing the whole partition, if requested, do the U test. + if utest: + output_strs += print_utest(partition, + utest_alpha=utest_alpha, + first_col_width=first_col_width, + use_color=use_color) + + return output_strs + + +############################################################################### +# Unit tests + + +class TestGetUniqueBenchmarkNames(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput = os.path.join(testInputs, 'test3_run0.json') + with open(testOutput, 'r') as f: + json = json.load(f) + return json + + def test_basic(self): + expect_lines = [ + 'BM_One', + 'BM_Two', + 'short', # These two are not sorted + 'medium', # These two are not sorted + ] + json = self.load_results() + output_lines = get_unique_benchmark_names(json) + print("\n") + print("\n".join(output_lines)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + self.assertEqual(expect_lines[i], output_lines[i]) + + +class TestReportDifference(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test1_run1.json') + testOutput2 = os.path.join(testInputs, 'test1_run2.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_basic(self): + expect_lines = [ + ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'], + ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'], + ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'], + ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'], + ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'], + ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'], + ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'], + ['BM_100xSlower', '+99.0000', '+99.0000', + '100', '10000', '100', '10000'], + ['BM_100xFaster', '-0.9900', '-0.9900', + '10000', '100', '10000', '100'], + ['BM_10PercentCPUToTime', '+0.1000', + '-0.1000', '100', '110', '100', '90'], + ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'], + ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in 
range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(len(parts), 7) + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceBetweenFamilies(unittest.TestCase): + def load_result(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput = os.path.join(testInputs, 'test2_run.json') + with open(testOutput, 'r') as f: + json = json.load(f) + return json + + def test_basic(self): + expect_lines = [ + ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'], + ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'], + ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'], + ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'], + ] + json = self.load_result() + json1 = filter_benchmark(json, "BM_Z.ro", ".") + json2 = filter_benchmark(json, "BM_O.e", ".") + output_lines_with_header = generate_difference_report( + json1, json2, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(len(parts), 7) + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceWithUTest(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_utest(self): + expect_lines = [] + expect_lines = [ + ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], + ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], + ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], + ['BM_Two_pvalue', + '0.6985', + '0.6985', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '2.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], + ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], + ['short_pvalue', + '0.7671', + '0.1489', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '3.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, utest=True, utest_alpha=0.05, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly( + unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_utest(self): + expect_lines 
= [] + expect_lines = [ + ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], + ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], + ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], + ['BM_Two_pvalue', + '0.6985', + '0.6985', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '2.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], + ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], + ['short_pvalue', + '0.7671', + '0.1489', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '3.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, display_aggregates_only=True, + utest=True, utest_alpha=0.05, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) + + +if __name__ == '__main__': + unittest.main() + +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; +# kate: indent-mode python; remove-trailing-spaces modified; diff --git a/Utilities/Scripts/gbench/util.py b/Utilities/Scripts/gbench/util.py new file mode 100644 index 000000000..1f8e8e2c4 --- /dev/null +++ b/Utilities/Scripts/gbench/util.py @@ -0,0 +1,164 @@ +"""util.py - General utilities for running, loading, and processing benchmarks +""" +import json +import os +import tempfile +import subprocess +import sys + +# Input file type enumeration +IT_Invalid = 0 +IT_JSON = 1 +IT_Executable = 2 + +_num_magic_bytes = 2 if sys.platform.startswith('win') else 4 + + +def is_executable_file(filename): + """ + Return 'True' if 'filename' names a valid file which is likely + an executable. A file is considered an executable if it starts with the + magic bytes for a EXE, Mach O, or ELF file. + """ + if not os.path.isfile(filename): + return False + with open(filename, mode='rb') as f: + magic_bytes = f.read(_num_magic_bytes) + if sys.platform == 'darwin': + return magic_bytes in [ + b'\xfe\xed\xfa\xce', # MH_MAGIC + b'\xce\xfa\xed\xfe', # MH_CIGAM + b'\xfe\xed\xfa\xcf', # MH_MAGIC_64 + b'\xcf\xfa\xed\xfe', # MH_CIGAM_64 + b'\xca\xfe\xba\xbe', # FAT_MAGIC + b'\xbe\xba\xfe\xca' # FAT_CIGAM + ] + elif sys.platform.startswith('win'): + return magic_bytes == b'MZ' + else: + return magic_bytes == b'\x7FELF' + + +def is_json_file(filename): + """ + Returns 'True' if 'filename' names a valid JSON output file. + 'False' otherwise. + """ + try: + with open(filename, 'r') as f: + json.load(f) + return True + except BaseException: + pass + return False + + +def classify_input_file(filename): + """ + Return a tuple (type, msg) where 'type' specifies the classified type + of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable + string represeting the error. 
+ """ + ftype = IT_Invalid + err_msg = None + if not os.path.exists(filename): + err_msg = "'%s' does not exist" % filename + elif not os.path.isfile(filename): + err_msg = "'%s' does not name a file" % filename + elif is_executable_file(filename): + ftype = IT_Executable + elif is_json_file(filename): + ftype = IT_JSON + else: + err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename + return ftype, err_msg + + +def check_input_file(filename): + """ + Classify the file named by 'filename' and return the classification. + If the file is classified as 'IT_Invalid' print an error message and exit + the program. + """ + ftype, msg = classify_input_file(filename) + if ftype == IT_Invalid: + print("Invalid input file: %s" % msg) + sys.exit(1) + return ftype + + +def find_benchmark_flag(prefix, benchmark_flags): + """ + Search the specified list of flags for a flag matching `` and + if it is found return the arg it specifies. If specified more than once the + last value is returned. If the flag is not found None is returned. + """ + assert prefix.startswith('--') and prefix.endswith('=') + result = None + for f in benchmark_flags: + if f.startswith(prefix): + result = f[len(prefix):] + return result + + +def remove_benchmark_flags(prefix, benchmark_flags): + """ + Return a new list containing the specified benchmark_flags except those + with the specified prefix. + """ + assert prefix.startswith('--') and prefix.endswith('=') + return [f for f in benchmark_flags if not f.startswith(prefix)] + + +def load_benchmark_results(fname): + """ + Read benchmark output from a file and return the JSON object. + REQUIRES: 'fname' names a file containing JSON benchmark output. + """ + with open(fname, 'r') as f: + return json.load(f) + + +def run_benchmark(exe_name, benchmark_flags): + """ + Run a benchmark specified by 'exe_name' with the specified + 'benchmark_flags'. The benchmark is run directly as a subprocess to preserve + real time console output. + RETURNS: A JSON object representing the benchmark output + """ + output_name = find_benchmark_flag('--benchmark_out=', + benchmark_flags) + is_temp_output = False + if output_name is None: + is_temp_output = True + thandle, output_name = tempfile.mkstemp() + os.close(thandle) + benchmark_flags = list(benchmark_flags) + \ + ['--benchmark_out=%s' % output_name] + + cmd = [exe_name] + benchmark_flags + print("RUNNING: %s" % ' '.join(cmd)) + exitCode = subprocess.call(cmd) + if exitCode != 0: + print('TEST FAILED...') + sys.exit(exitCode) + json_res = load_benchmark_results(output_name) + if is_temp_output: + os.unlink(output_name) + return json_res + + +def run_or_load_benchmark(filename, benchmark_flags): + """ + Get the results for a specified benchmark. If 'filename' specifies + an executable benchmark then the results are generated by running the + benchmark. Otherwise 'filename' must name a valid JSON output file, + which is loaded and the result returned. 
+ """ + ftype = check_input_file(filename) + if ftype == IT_JSON: + return load_benchmark_results(filename) + elif ftype == IT_Executable: + return run_benchmark(filename, benchmark_flags) + else: + assert False # This branch is unreachable diff --git a/Utilities/Scripts/strip_asm.py b/Utilities/Scripts/strip_asm.py new file mode 100755 index 000000000..9030550b4 --- /dev/null +++ b/Utilities/Scripts/strip_asm.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +""" +strip_asm.py - Cleanup ASM output for the specified file +""" + +from argparse import ArgumentParser +import sys +import os +import re + +def find_used_labels(asm): + found = set() + label_re = re.compile("\s*j[a-z]+\s+\.L([a-zA-Z0-9][a-zA-Z0-9_]*)") + for l in asm.splitlines(): + m = label_re.match(l) + if m: + found.add('.L%s' % m.group(1)) + return found + + +def normalize_labels(asm): + decls = set() + label_decl = re.compile("^[.]{0,1}L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)") + for l in asm.splitlines(): + m = label_decl.match(l) + if m: + decls.add(m.group(0)) + if len(decls) == 0: + return asm + needs_dot = next(iter(decls))[0] != '.' + if not needs_dot: + return asm + for ld in decls: + asm = re.sub("(^|\s+)" + ld + "(?=:|\s)", '\\1.' + ld, asm) + return asm + + +def transform_labels(asm): + asm = normalize_labels(asm) + used_decls = find_used_labels(asm) + new_asm = '' + label_decl = re.compile("^\.L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)") + for l in asm.splitlines(): + m = label_decl.match(l) + if not m or m.group(0) in used_decls: + new_asm += l + new_asm += '\n' + return new_asm + + +def is_identifier(tk): + if len(tk) == 0: + return False + first = tk[0] + if not first.isalpha() and first != '_': + return False + for i in range(1, len(tk)): + c = tk[i] + if not c.isalnum() and c != '_': + return False + return True + +def process_identifiers(l): + """ + process_identifiers - process all identifiers and modify them to have + consistent names across all platforms; specifically across ELF and MachO. + For example, MachO inserts an additional understore at the beginning of + names. This function removes that. 
+ """ + parts = re.split(r'([a-zA-Z0-9_]+)', l) + new_line = '' + for tk in parts: + if is_identifier(tk): + if tk.startswith('__Z'): + tk = tk[1:] + elif tk.startswith('_') and len(tk) > 1 and \ + tk[1].isalpha() and tk[1] != 'Z': + tk = tk[1:] + new_line += tk + return new_line + + +def process_asm(asm): + """ + Strip the ASM of unwanted directives and lines + """ + new_contents = '' + asm = transform_labels(asm) + + # TODO: Add more things we want to remove + discard_regexes = [ + re.compile("\s+\..*$"), # directive + re.compile("\s*#(NO_APP|APP)$"), #inline ASM + re.compile("\s*#.*$"), # comment line + re.compile("\s*\.globa?l\s*([.a-zA-Z_][a-zA-Z0-9$_.]*)"), #global directive + re.compile("\s*\.(string|asciz|ascii|[1248]?byte|short|word|long|quad|value|zero)"), + ] + keep_regexes = [ + + ] + fn_label_def = re.compile("^[a-zA-Z_][a-zA-Z0-9_.]*:") + for l in asm.splitlines(): + # Remove Mach-O attribute + l = l.replace('@GOTPCREL', '') + add_line = True + for reg in discard_regexes: + if reg.match(l) is not None: + add_line = False + break + for reg in keep_regexes: + if reg.match(l) is not None: + add_line = True + break + if add_line: + if fn_label_def.match(l) and len(new_contents) != 0: + new_contents += '\n' + l = process_identifiers(l) + new_contents += l + new_contents += '\n' + return new_contents + +def main(): + parser = ArgumentParser( + description='generate a stripped assembly file') + parser.add_argument( + 'input', metavar='input', type=str, nargs=1, + help='An input assembly file') + parser.add_argument( + 'out', metavar='output', type=str, nargs=1, + help='The output file') + args, unknown_args = parser.parse_known_args() + input = args.input[0] + output = args.out[0] + if not os.path.isfile(input): + print(("ERROR: input file '%s' does not exist") % input) + sys.exit(1) + contents = None + with open(input, 'r') as f: + contents = f.read() + new_contents = process_asm(contents) + with open(output, 'w') as f: + f.write(new_contents) + + +if __name__ == '__main__': + main() + +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; +# kate: indent-mode python; remove-trailing-spaces modified; diff --git a/benchmarking/BenchmarkArrayTransfer.cxx b/benchmarking/BenchmarkArrayTransfer.cxx index 55358c816..71b463f56 100644 --- a/benchmarking/BenchmarkArrayTransfer.cxx +++ b/benchmarking/BenchmarkArrayTransfer.cxx @@ -473,12 +473,25 @@ VTKM_BENCHMARK_TEMPLATES_OPTS(BenchExecToContReadWrite, int main(int argc, char* argv[]) { - // Parse VTK-m options: - auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp; - Config = vtkm::cont::Initialize(argc, argv, opts); + auto opts = vtkm::cont::InitializeOptions::RequireDevice; - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + // Initialize command line args + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); + + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + } // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS(argc, argv); + VTKM_EXECUTE_BENCHMARKS(argc, args.data()); } diff --git a/benchmarking/BenchmarkAtomicArray.cxx b/benchmarking/BenchmarkAtomicArray.cxx index 
48002aa1a..e32205867 100644 --- a/benchmarking/BenchmarkAtomicArray.cxx +++ b/benchmarking/BenchmarkAtomicArray.cxx @@ -506,11 +506,24 @@ VTKM_BENCHMARK_TEMPLATES_OPTS( int main(int argc, char* argv[]) { // Parse VTK-m options: - auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp; - Config = vtkm::cont::Initialize(argc, argv, opts); + auto opts = vtkm::cont::InitializeOptions::RequireDevice; - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); + + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + } // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS(argc, argv); + VTKM_EXECUTE_BENCHMARKS(argc, args.data()); } diff --git a/benchmarking/BenchmarkCopySpeeds.cxx b/benchmarking/BenchmarkCopySpeeds.cxx index 8deddad47..1bd5fe244 100644 --- a/benchmarking/BenchmarkCopySpeeds.cxx +++ b/benchmarking/BenchmarkCopySpeeds.cxx @@ -95,11 +95,23 @@ VTKM_BENCHMARK_TEMPLATES_OPTS(CopySpeed, int main(int argc, char* argv[]) { // Parse VTK-m options: - auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp; - Config = vtkm::cont::Initialize(argc, argv, opts); + auto opts = vtkm::cont::InitializeOptions::RequireDevice; - // Setup device: - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); + + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + } // Handle NumThreads command-line arg: #ifdef VTKM_ENABLE_TBB @@ -126,5 +138,5 @@ int main(int argc, char* argv[]) #endif // TBB // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS(argc, argv); + VTKM_EXECUTE_BENCHMARKS(argc, args.data()); } diff --git a/benchmarking/BenchmarkDeviceAdapter.cxx b/benchmarking/BenchmarkDeviceAdapter.cxx index 049e93500..637a91982 100644 --- a/benchmarking/BenchmarkDeviceAdapter.cxx +++ b/benchmarking/BenchmarkDeviceAdapter.cxx @@ -39,6 +39,40 @@ namespace { +// Parametrize the input size samples for most of the benchmarks +// +// Define at compile time: +// +// Being VTKm_BENCHS_RANGE_LOWER_BOUNDARY b0 and, +// being VTKm_BENCHS_RANGE_UPPER_BOUNDARY b1 +// +// This will create the following sample sizes b0, b0*2^3, b0*2^6, ..., b1. +// +// Notice that setting up VTKm_BENCHS_RANGE_LOWER_BOUNDARY / VTKm_BENCHS_RANGE_UPPER_BOUNDARY +// will affect both ShortRange and FullRange. 
+// +#ifndef VTKm_BENCHS_RANGE_LOWER_BOUNDARY +#define FULL_RANGE_LOWER_BOUNDARY (1 << 12) // 4 KiB +#define SHORT_RANGE_LOWER_BOUNDARY (1 << 15) // 32 KiB + +#else +#define FULL_RANGE_LOWER_BOUNDARY (VTKm_BENCHS_RANGE_LOWER_BOUNDARY) +#define SHORT_RANGE_LOWER_BOUNDARY (VTKm_BENCHS_RANGE_LOWER_BOUNDARY) + +#endif + +#ifndef VTKm_BENCHS_RANGE_UPPER_BOUNDARY +#define FULL_RANGE_UPPER_BOUNDARY (1 << 27) // 128 MiB +#define SHORT_RANGE_UPPER_BOUNDARY (1 << 27) // 128 MiB +#define BITFIELD_TO_UNORDEREDSET_MAX_SAMPLING (1 << 26) // 64 MiB + +#else +#define FULL_RANGE_UPPER_BOUNDARY (VTKm_BENCHS_RANGE_UPPER_BOUNDARY) +#define SHORT_RANGE_UPPER_BOUNDARY (VTKm_BENCHS_RANGE_UPPER_BOUNDARY) +#define BITFIELD_TO_UNORDEREDSET_MAX_SAMPLING (VTKm_BENCHS_RANGE_UPPER_BOUNDARY) + +#endif + // Default sampling rate is x8 and always includes min/max, // so this will generate 7 samples at: // 1: 4 KiB @@ -47,15 +81,17 @@ namespace // 4: 2 MiB // 5: 16 MiB // 6: 128 MiB -static const std::pair FullRange{ 1 << 12, 1 << 27 }; // 4KiB, 128MiB +static const std::pair FullRange{ FULL_RANGE_LOWER_BOUNDARY, + FULL_RANGE_UPPER_BOUNDARY }; // Smaller range that can be used to reduce the number of benchmarks. Used // with `RangeMultiplier(SmallRangeMultiplier)`, this produces: // 1: 32 KiB // 2: 2 MiB // 3: 128 MiB -static const std::pair SmallRange{ 1 << 15, 1 << 27 }; // 4KiB, 128MiB -static constexpr int SmallRangeMultiplier = 1 << 21; // Ensure a sample at 2MiB +static const std::pair SmallRange{ SHORT_RANGE_LOWER_BOUNDARY, + SHORT_RANGE_UPPER_BOUNDARY }; +static constexpr int SmallRangeMultiplier = 1 << 21; // Ensure a sample at 2MiB using TypeList = vtkm::ListUseManualTime(); bm->ArgNames({ "Size", "C" }); @@ -393,6 +429,7 @@ void BenchCopy(benchmark::State& state) state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; + VTKM_BENCHMARK_TEMPLATES_OPTS(BenchCopy, ->Ranges({ FullRange })->ArgName("Size"), TypeList); template @@ -534,7 +571,7 @@ void BenchCountSetBitsGenerator(benchmark::internal::Benchmark* bm) for (int64_t config = 0; config < 6; ++config) { - bm->Ranges({ FullRange, { config, config } }); + bm->Ranges({ { FullRange.first, FullRange.second }, { config, config } }); } } VTKM_BENCHMARK_APPLY(BenchCountSetBits, BenchCountSetBitsGenerator); @@ -1053,8 +1090,10 @@ void BenchmarkStableSortIndicesUniqueGenerator(benchmark::internal::Benchmark* b bm->ArgNames({ "Size", "%Uniq" }); for (int64_t pcntUnique = 0; pcntUnique <= 100; pcntUnique += 25) { - // Cap the max size here at 21 MiB. This sort is too slow. - bm->Ranges({ { SmallRange.first, 1 << 21 }, { pcntUnique, pcntUnique } }); + // Cap the max size here at 2 MiB. This sort is too slow. 
+ const int64_t maxSize = 1 << 21; + bm->Ranges( + { { SmallRange.first, std::min(maxSize, SmallRange.second) }, { pcntUnique, pcntUnique } }); } } @@ -1167,12 +1206,23 @@ VTKM_BENCHMARK_TEMPLATES_OPTS(BenchUpperBounds, int main(int argc, char* argv[]) { - // Parse VTK-m options: - auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp; - Config = vtkm::cont::Initialize(argc, argv, opts); + auto opts = vtkm::cont::InitializeOptions::RequireDevice; - // Setup device: - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); + + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + } // Handle NumThreads command-line arg: #ifdef VTKM_ENABLE_TBB @@ -1199,5 +1249,5 @@ int main(int argc, char* argv[]) #endif // TBB // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS(argc, argv); + VTKM_EXECUTE_BENCHMARKS(argc, args.data()); } diff --git a/benchmarking/BenchmarkFieldAlgorithms.cxx b/benchmarking/BenchmarkFieldAlgorithms.cxx index 4a607145f..a0817f2ca 100644 --- a/benchmarking/BenchmarkFieldAlgorithms.cxx +++ b/benchmarking/BenchmarkFieldAlgorithms.cxx @@ -942,12 +942,24 @@ VTKM_BENCHMARK(Bench2VirtualImplicitFunctions); int main(int argc, char* argv[]) { // Parse VTK-m options: - auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp; - Config = vtkm::cont::Initialize(argc, argv, opts); + auto opts = vtkm::cont::InitializeOptions::RequireDevice; - // Setup device: - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); + + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + } // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS(argc, argv); + VTKM_EXECUTE_BENCHMARKS(argc, args.data()); } diff --git a/benchmarking/BenchmarkFilters.cxx b/benchmarking/BenchmarkFilters.cxx index 033bcaeb6..d3e3c6e85 100644 --- a/benchmarking/BenchmarkFilters.cxx +++ b/benchmarking/BenchmarkFilters.cxx @@ -1040,12 +1040,23 @@ void InitDataSet(int& argc, char** argv) int main(int argc, char* argv[]) { auto opts = vtkm::cont::InitializeOptions::RequireDevice; - Config = vtkm::cont::Initialize(argc, argv, opts); - // Setup device: - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); - InitDataSet(argc, argv); + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + InitDataSet(argc, args.data()); + } const std::string dataSetSummary = []() -> std::string { std::ostringstream out; @@ -1054,5 +1065,5 @@ int main(int argc, char* 
argv[]) }(); // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, dataSetSummary); + VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, args.data(), dataSetSummary); } diff --git a/benchmarking/BenchmarkRayTracing.cxx b/benchmarking/BenchmarkRayTracing.cxx index a8a9ffb6a..1b1585984 100644 --- a/benchmarking/BenchmarkRayTracing.cxx +++ b/benchmarking/BenchmarkRayTracing.cxx @@ -116,13 +116,24 @@ VTKM_BENCHMARK(BenchRayTracing); int main(int argc, char* argv[]) { - // Parse VTK-m options: - auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp; - Config = vtkm::cont::Initialize(argc, argv, opts); + auto opts = vtkm::cont::InitializeOptions::RequireDevice; - // Setup device: - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); + + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + } // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS(argc, argv); + VTKM_EXECUTE_BENCHMARKS(argc, args.data()); } diff --git a/benchmarking/BenchmarkTopologyAlgorithms.cxx b/benchmarking/BenchmarkTopologyAlgorithms.cxx index 708028d4a..b55f5a783 100644 --- a/benchmarking/BenchmarkTopologyAlgorithms.cxx +++ b/benchmarking/BenchmarkTopologyAlgorithms.cxx @@ -380,12 +380,24 @@ VTKM_BENCHMARK_TEMPLATES(BenchClassificationDynamic, ValueTypes); int main(int argc, char* argv[]) { // Parse VTK-m options: - auto opts = vtkm::cont::InitializeOptions::RequireDevice | vtkm::cont::InitializeOptions::AddHelp; - Config = vtkm::cont::Initialize(argc, argv, opts); + auto opts = vtkm::cont::InitializeOptions::RequireDevice; - // Setup device: - vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + std::vector args(argv, argv + argc); + vtkm::bench::detail::InitializeArgs(&argc, args, opts); + + // Parse VTK-m options: + Config = vtkm::cont::Initialize(argc, args.data(), opts); + + // This occurs when it is help + if (opts == vtkm::cont::InitializeOptions::None) + { + std::cout << Config.Usage << std::endl; + } + else + { + vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); + } // handle benchmarking related args and run benchmarks: - VTKM_EXECUTE_BENCHMARKS(argc, argv); + VTKM_EXECUTE_BENCHMARKS(argc, args.data()); } diff --git a/benchmarking/Benchmarker.h b/benchmarking/Benchmarker.h index f4035767e..0e44da5d2 100644 --- a/benchmarking/Benchmarker.h +++ b/benchmarking/Benchmarker.h @@ -388,6 +388,37 @@ static inline vtkm::Id ExecuteBenchmarks(int& argc, return static_cast(num); } + +void InitializeArgs(int* argc, std::vector& args, vtkm::cont::InitializeOptions& opts) +{ + bool isHelp = false; + + // Inject --help + if (*argc == 1) + { + const char* help = "--help"; // We want it to be static + args.push_back(const_cast(help)); + *argc = *argc + 1; + } + + args.push_back(nullptr); + + for (size_t i = 0; i < static_cast(*argc); ++i) + { + auto opt_s = std::string(args[i]); + if (opt_s == "--help" || opt_s == "-help" || opt_s == "-h") + { + isHelp = true; + } + } + + if (!isHelp) + { + return; + } + + opts = vtkm::cont::InitializeOptions::None; +} } } } // end namespace vtkm::bench::detail diff --git a/benchmarking/CMakeLists.txt 
b/benchmarking/CMakeLists.txt index ba603e031..a504d395e 100644 --- a/benchmarking/CMakeLists.txt +++ b/benchmarking/CMakeLists.txt @@ -47,10 +47,17 @@ set(benchmarks BenchmarkTopologyAlgorithms ) +set(VTKm_BENCHS_RANGE_LOWER_BOUNDARY 4096 CACHE STRING "Smallest sample for input size bench for BenchmarkDeviceAdapter") +set(VTKm_BENCHS_RANGE_UPPER_BOUNDARY 134217728 CACHE STRING "Biggest sample for input size bench for BenchmarkDeviceAdapter") +mark_as_advanced(VTKm_BENCHS_RANGE_LOWER_BOUNDARY VTKm_BENCHS_RANGE_UPPER_BOUNDARY) + foreach (benchmark ${benchmarks}) add_benchmark(NAME ${benchmark} FILE ${benchmark}.cxx LIBS vtkm_source vtkm_filter) endforeach () +target_compile_definitions(BenchmarkDeviceAdapter PUBLIC VTKm_BENCHS_RANGE_LOWER_BOUNDARY=${VTKm_BENCHS_RANGE_LOWER_BOUNDARY}) +target_compile_definitions(BenchmarkDeviceAdapter PUBLIC VTKm_BENCHS_RANGE_UPPER_BOUNDARY=${VTKm_BENCHS_RANGE_UPPER_BOUNDARY}) + if(TARGET vtkm_rendering) add_benchmark(NAME BenchmarkRayTracing FILE BenchmarkRayTracing.cxx LIBS vtkm_rendering) endif() diff --git a/benchmarking/README.md b/benchmarking/README.md new file mode 100644 index 000000000..a8e1b5948 --- /dev/null +++ b/benchmarking/README.md @@ -0,0 +1,120 @@ +# BENCHMARKING VTK-m + +## TL;DR + +When configuring _VTK-m_ with _CMake_, pass the flag `-DVTKm_ENABLE_BENCHMARKS=1`. +In the build directory you will then see the following binaries: + + $ ls bin/Benchmark* + bin/BenchmarkArrayTransfer* bin/BenchmarkCopySpeeds* bin/BenchmarkFieldAlgorithms* + bin/BenchmarkRayTracing* bin/BenchmarkAtomicArray* bin/BenchmarkDeviceAdapter* + bin/BenchmarkFilters* bin/BenchmarkTopologyAlgorithms* + +Taking `BenchmarkArrayTransfer` as an example, we can run it as: + + $ bin/BenchmarkArrayTransfer -d Any + +--- + +## Parts of this Document + +0. [TL;DR](#TL;DR) +1. [Devices](#choosing-devices) +2. [Filters](#run-a-subset-of-your-benchmarks) +3. [Compare with baseline](#compare-with-baseline) +4. [Installing compare-benchmarks.py](#installing-compare-benchmarkspy) + +--- + +## Choosing devices + +Taking `BenchmarkArrayTransfer` as an example, we can determine which +devices it can run on by simply running it without arguments: + + $ bin/BenchmarkArrayTransfer + ... + Valid devices: "Any" "Serial" + ... + +From the listed _Valid devices_ you can then choose the device on which to run the benchmark: + + $ bin/BenchmarkArrayTransfer -d Serial + + +## Run a subset of your benchmarks + +_VTK-m_ benchmarks use [Google Benchmarks], which allows you to choose a subset +of benchmarks with the flag `--benchmark_filter=REGEX`. + +For instance, if you want to run all the benchmarks that write something, you +would run: + + $ bin/BenchmarkArrayTransfer -d Serial --benchmark_filter='Write' + +Note that you can list all of the available benchmarks with the option +`--benchmark_list_tests`. + +## Compare with baseline + +_VTK-m_ ships with a helper script named `compare-benchmarks.py`, based on the +[Google Benchmarks] script `compare.py`, which lets you compare benchmarks across different +devices, filters, and binaries. After building _VTK-m_ it appears in the +`bin` directory of your build directory.
+ +When running `compare-benchmarks.py`: + - Specify the baseline benchmark binary path and its arguments with + `--benchmark1=` + - Specify the contender benchmark binary path and its arguments with `--benchmark2=` + - Extra options to be passed to `compare.py` must come after `--` + +### Compare between filters + +When comparing filters, only one benchmark binary with a single device can be used, +as shown in the following example: + +```sh +$ ./compare-benchmarks.py --benchmark1='./BenchmarkArrayTransfer -d Any +--benchmark_filter=1024' --filter1='Read' --filter2=Write -- filters + +# It will output something like this: + +Benchmark Time CPU Time Old Time New CPU Old CPU New +--------------------------------------------------------------------------------------------------------------------------------------------------------------- +BenchContToExec[Read vs. Write]/Bytes:1024/manual_time +0.2694 +0.2655 18521 23511 18766 23749 +BenchExecToCont[Read vs. Write]/Bytes:1024/manual_time +0.0212 +0.0209 25910 26460 26152 26698 +``` + +### Compare between devices + +When comparing runs of the same benchmarks on two different devices, use the _option_ `benchmarks` +after `--` and call `./compare-benchmarks.py` as follows: + +```sh +$ ./compare-benchmarks.py --benchmark1='./BenchmarkArrayTransfer -d Serial +--benchmark_filter=1024' --benchmark2='./BenchmarkArrayTransfer -d Cuda +--benchmark_filter=1024' -- benchmarks + + +# It will output something like this: + +Benchmark Time CPU Time Old Time New CPU Old CPU New +--------------------------------------------------------------------------------------------------------------------------------------------------- +BenchContToExecRead/Bytes:1024/manual_time +0.0127 +0.0120 18388 18622 18632 18856 +BenchContToExecWrite/Bytes:1024/manual_time +0.0010 +0.0006 23471 23496 23712 23726 +BenchContToExecReadWrite/Bytes:1024/manual_time -0.0034 -0.0041 26363 26274 26611 26502 +BenchRoundTripRead/Bytes:1024/manual_time +0.0055 +0.0056 20635 20748 21172 21291 +BenchRoundTripReadWrite/Bytes:1024/manual_time +0.0084 +0.0082 29288 29535 29662 29905 +BenchExecToContRead/Bytes:1024/manual_time +0.0025 +0.0021 25883 25947 26122 26178 +BenchExecToContWrite/Bytes:1024/manual_time -0.0027 -0.0038 26375 26305 26622 26522 +BenchExecToContReadWrite/Bytes:1024/manual_time +0.0041 +0.0039 25639 25745 25871 25972 +``` + +## Installing compare-benchmarks.py + +`compare-benchmarks.py` relies on `compare.py` from [Google Benchmarks], which in turn +requires `SciPy`; you can find installation instructions [here][SciPy]. + +[Google Benchmarks]: https://github.com/google/benchmark +[Compare.py]: https://github.com/google/benchmark/blob/master/tools/compare.py +[SciPy]: https://www.scipy.org/install.html
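The CMakeLists.txt hunk above also introduces two advanced cache variables, `VTKm_BENCHS_RANGE_LOWER_BOUNDARY` and `VTKm_BENCHS_RANGE_UPPER_BOUNDARY`, which bound the input-size sweep of `BenchmarkDeviceAdapter` (per the source comment they affect both the ShortRange and FullRange samples). Below is a minimal configure-time sketch; the values shown are the cache defaults from the diff (4 KiB and 128 MiB), and the source path is a placeholder.

```sh
# Sketch: override the BenchmarkDeviceAdapter input-size range at configure time.
# 4096 and 134217728 are the cache defaults added in benchmarking/CMakeLists.txt;
# /path/to/vtk-m is hypothetical, and this is run from an empty build directory.
cmake /path/to/vtk-m \
      -DVTKm_ENABLE_BENCHMARKS=1 \
      -DVTKm_BENCHS_RANGE_LOWER_BOUNDARY=4096 \
      -DVTKm_BENCHS_RANGE_UPPER_BOUNDARY=134217728
```

Since the variables are marked advanced and turned into compile definitions for `BenchmarkDeviceAdapter` only, changing them triggers a rebuild of that one benchmark rather than the whole tree.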