Merge branch 'master' of https://gitlab.kitware.com/vtk/vtk-m into temporalparticleadvection

2024-09-16 17:22:55 +00:00 · 2018-05-15 11:46:26 -07:00 · 2018-05-15 11:46:26 -07:00 · eb2ed4f275
commit eb2ed4f275
parent 24dd063810 4e2e96dd6f
76 changed files with 2043 additions and 468 deletions
--- a/CMake/VTKmCompilerFlags.cmake
+++ b/CMake/VTKmCompilerFlags.cmake
@ -44,9 +44,23 @@ endif()


 #-----------------------------------------------------------------------------
-# vtkm_compiler_flags is used by all the vtkm targets
+# vtkm_compiler_flags is used by all the vtkm targets and consumers of VTK-m
+# The flags on vtkm_compiler_flags are needed when using/building vtk-m
 add_library(vtkm_compiler_flags INTERFACE)

+# When building libraries/tests that are part of the VTK-m repository
+# inherit the properties from vtkm_developer_flags and vtkm_vectorization_flags.
+# The flags are intended only for VTK-m itself and are not needed by consumers.
+# We will export vtkm_vectorization_flags in general so consumers can enable
+# vectorization if they so desire.
+if (VTKm_ENABLE_DEVELOPER_FLAGS)
+  target_link_libraries(vtkm_compiler_flags
+    INTERFACE $<BUILD_INTERFACE:vtkm_developer_flags>)
+endif()
+target_link_libraries(vtkm_compiler_flags
+  INTERFACE $<BUILD_INTERFACE:vtkm_vectorization_flags>)
+
+
 # setup that we need C++11 support
 if(CMAKE_VERSION VERSION_LESS 3.8)
  target_compile_features(vtkm_compiler_flags INTERFACE cxx_nullptr)
@ -56,14 +70,9 @@ endif()

 # Enable large object support so we can have 2^32 addressable sections
 if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-  if(CMAKE_VERSION VERSION_LESS 3.11)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=\"/bigobj\"")
-  else()
-    target_compile_options(vtkm_compiler_flags INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:/bigobj>>)
-    if(TARGET vtkm::cuda)
-      target_compile_options(vtkm_compiler_flags INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/bigobj">>)
-    endif()
+  target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CXX>:/bigobj>)
+  if(TARGET vtkm::cuda)
+    target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/bigobj">)
  endif()
 endif()

@ -74,29 +83,22 @@ target_include_directories(vtkm_compiler_flags INTERFACE
  $<INSTALL_INTERFACE:${VTKm_INSTALL_INCLUDE_DIR}>
  )

-# Additional warnings just for Clang 3.5+, and AppleClang 7+ we specify
-# for all build types, since these failures to vectorize are not limited
-# to developer builds
-if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
-    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.4)
-  target_compile_options(vtkm_compiler_flags INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>>)
-elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" AND
-       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.99)
-  target_compile_options(vtkm_compiler_flags INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>>)
-endif()
-
-# When building libraries/tests that are part of the VTK-m repository
-# inherit the properties from vtkm_developer_flags
-target_link_libraries(vtkm_compiler_flags
-  INTERFACE $<BUILD_INTERFACE:vtkm_developer_flags vtkm_vectorization_flags>)
-
-
 #-----------------------------------------------------------------------------
 # vtkm_developer_flags is used ONLY BY libraries that are built as part of this
 # repository
 add_library(vtkm_developer_flags INTERFACE)
 target_link_libraries(vtkm_developer_flags INTERFACE vtkm_compiler_flags)

+# Suppress the warnings that Clang 3.5+ and AppleClang 7+ emit
+# about failures to vectorize (-Wno-pass-failed).
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.4)
+  target_compile_options(vtkm_developer_flags INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>>)
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" AND
+       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.99)
+  target_compile_options(vtkm_developer_flags INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>>)
+endif()
+
 if(VTKM_COMPILER_IS_MSVC)
  target_compile_definitions(vtkm_developer_flags INTERFACE "_SCL_SECURE_NO_WARNINGS"
                                                            "_CRT_SECURE_NO_WARNINGS")
@ -125,11 +127,22 @@ elseif(VTKM_COMPILER_IS_ICC)
 elseif(VTKM_COMPILER_IS_GNU OR VTKM_COMPILER_IS_CLANG)
  set(cxx_flags -Wall -Wno-long-long -Wcast-align -Wconversion -Wchar-subscripts -Wextra -Wpointer-arith -Wformat -Wformat-security -Wshadow -Wunused-parameter -fno-common)
  set(cuda_flags -Xcudafe=--display_error_number -Xcompiler=-Wall,-Wno-unknown-pragmas,-Wno-unused-local-typedefs,-Wno-unused-local-typedefs,-Wno-unused-function,-Wno-long-long,-Wcast-align,-Wconversion,-Wchar-subscripts,-Wpointer-arith,-Wformat,-Wformat-security,-Wshadow,-Wunused-parameter,-fno-common)
-  target_compile_options(vtkm_compiler_flags
+
+  #GCC 5, 6 don't properly handle strict-overflow suppression through pragmas.
+  #Instead of suppressing around the location of the strict-overflow you
+  #have to suppress around the entry point, or in VTK-m's case the worklet
+  #invocation site. This is incredibly tedious and has been fixed in GCC 7.
+  #
+  if(VTKM_COMPILER_IS_GNU AND
+    (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.99) AND
+    (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.99) )
+    list(APPEND cxx_flags -Wno-strict-overflow)
+  endif()
+  target_compile_options(vtkm_developer_flags
    INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>>
    )
  if(TARGET vtkm::cuda)
-    target_compile_options(vtkm_compiler_flags
+    target_compile_options(vtkm_developer_flags
      INTERFACE $<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>>
      )
  endif()
--- a/CMake/VTKmDeviceAdapters.cmake
+++ b/CMake/VTKmDeviceAdapters.cmake
@ -66,13 +66,9 @@ if(VTKm_ENABLE_CUDA AND NOT TARGET vtkm::cuda)
    add_library(vtkm::cuda UNKNOWN IMPORTED GLOBAL)
  endif()

-if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND CMAKE_VERSION VERSION_LESS 3.11)
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-else()
  set_target_properties(vtkm::cuda PROPERTIES
    INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
  )
-  endif()

  # We can't have this location/lib empty, so we provide a location that is
  # valid and will have no effect on compilation
@ -115,6 +111,8 @@ else()
  # for all major virtual architectures, guaranteeing that the code will run
  # anywhere.
  #
+  # The option 'none' is provided so that when being built as part of another
+  # project, its own custom flags can be used.
  #
  # 1 - native
  #   - Uses system introspection to determine compile flags
@ -135,21 +133,19 @@ else()
  #   - Uses: --generate-code=arch=compute_50,code=sm_50
  #   - Uses: --generate-code=arch=compute_60,code=sm_60
  #   - Uses: --generate-code=arch=compute_70,code=sm_70
+  # 8 - none
  #

  #specify the property
  set(VTKm_CUDA_Architecture "native" CACHE STRING "Which GPU Architecture(s) to compile for")
-  set_property(CACHE VTKm_CUDA_Architecture PROPERTY STRINGS native fermi kepler maxwell pascal volta all)
+  set_property(CACHE VTKm_CUDA_Architecture PROPERTY STRINGS native fermi kepler maxwell pascal volta all none)

  #detect what the propery is set too
  if(VTKm_CUDA_Architecture STREQUAL "native")

    if(VTKM_CUDA_NATIVE_EXE_PROCESS_RAN_OUTPUT)
      #Use the cached value
-      # replace any semicolons with an empty space as CMAKE_CUDA_FLAGS is
-      # a string not a list and this could be cached from when it was a list
-      string(REPLACE ";" " " run_output "${VTKM_CUDA_NATIVE_EXE_PROCESS_RAN_OUTPUT}")
-      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${run_output}")
+      set(arch_flags ${VTKM_CUDA_NATIVE_EXE_PROCESS_RAN_OUTPUT})
    else()

      #run execute_process to do auto_detection
@ -174,11 +170,7 @@ else()
        string(FIND "${run_output}" "--generate-code" position)
        string(SUBSTRING "${run_output}" ${position} -1 run_output)

-        # replace any semicolons with an empty space as CMAKE_CUDA_FLAGS is
-        # a string not a list
-        string(REPLACE ";" " " run_output "${run_output}")
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${run_output}")
-
+        set(arch_flags ${run_output})
        set(VTKM_CUDA_NATIVE_EXE_PROCESS_RAN_OUTPUT ${run_output} CACHE INTERNAL
                "device type(s) for cuda[native]")
      else()
@ -190,22 +182,27 @@ else()
  #since when we are native we can fail, and fall back to "kepler" these have
  #to happen after, and separately of the native check
  if(VTKm_CUDA_Architecture STREQUAL "fermi")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_20,code=sm_20")
+    set(arch_flags --generate-code=arch=compute_20,code=sm_20)
  elseif(VTKm_CUDA_Architecture STREQUAL "kepler")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_30,code=sm_30")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_35,code=sm_35")
+    set(arch_flags --generate-code=arch=compute_30,code=sm_30
+                   --generate-code=arch=compute_35,code=sm_35)
  elseif(VTKm_CUDA_Architecture STREQUAL "maxwell")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_50,code=sm_50")
+    set(arch_flags --generate-code=arch=compute_50,code=sm_50)
  elseif(VTKm_CUDA_Architecture STREQUAL "pascal")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_60,code=sm_60")
+    set(arch_flags --generate-code=arch=compute_60,code=sm_60)
  elseif(VTKm_CUDA_Architecture STREQUAL "volta")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_70,code=sm_70")
+    set(arch_flags --generate-code=arch=compute_70,code=sm_70)
  elseif(VTKm_CUDA_Architecture STREQUAL "all")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_30,code=sm_30")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_35,code=sm_35")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_50,code=sm_50")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_60,code=sm_60")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --generate-code=arch=compute_70,code=sm_70")
+    set(arch_flags --generate-code=arch=compute_30,code=sm_30
+                   --generate-code=arch=compute_35,code=sm_35
+                   --generate-code=arch=compute_50,code=sm_50
+                   --generate-code=arch=compute_60,code=sm_60
+                   --generate-code=arch=compute_70,code=sm_70)
  endif()

+  string(REPLACE ";" " " arch_flags "${arch_flags}")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${arch_flags}")
+
+  set_target_properties(vtkm::cuda PROPERTIES VTKm_CUDA_Architecture_Flags "${arch_flags}")
+
 endif()
--- a/CMake/VTKmWrappers.cmake
+++ b/CMake/VTKmWrappers.cmake
@ -414,9 +414,8 @@ function(vtkm_unit_tests)
  set_property(TARGET ${test_prog} PROPERTY RUNTIME_OUTPUT_DIRECTORY ${VTKm_EXECUTABLE_OUTPUT_PATH})

  target_link_libraries(${test_prog} PRIVATE vtkm_cont ${VTKm_UT_LIBRARIES})
-
-  if(VTKm_UT_NO_TESTS)
-    return()
+  if(backend)
+    target_compile_definitions(${test_prog} PRIVATE "VTKM_DEVICE_ADAPTER=VTKM_DEVICE_ADAPTER_${backend}")
  endif()

  #determine the timeout for all the tests based on the backend. CUDA tests
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -22,11 +22,16 @@
 #   - Support for target_sources
 #   - Support for usage requirements
 #
-# If you want CUDA support, you will need to have CMake 3.9 on Linux/OSX or
-# CMake 3.10 on windows.
+# If you want CUDA support, you will need to have CMake 3.9 on Linux/OSX.
+# We require CMake 3.11 on windows as the $<COMPILE_LANGUAGE:> generator
+# expression is not supported on older versions.
 cmake_minimum_required(VERSION 3.3)
 project (VTKm)

+if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  cmake_minimum_required(VERSION 3.11 FATAL_ERROR)
+endif()
+
 # Update module path
 set(VTKm_CMAKE_MODULE_PATH ${VTKm_SOURCE_DIR}/CMake)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${VTKm_CMAKE_MODULE_PATH})
@ -112,10 +117,16 @@ vtkm_option(VTKm_USE_DEFAULT_SYMBOL_VISIBILITY "Don't explicitly hide symbols fr
 vtkm_option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON)
 set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})

+# This flag can be used to prevent VTK-m from exporting its warning flags in its
+# build interface. This is useful when building VTK-m as a thirdparty library
+# and the warnings are too strict for the parent project.
+vtkm_option(VTKm_ENABLE_DEVELOPER_FLAGS "Enable compiler flags that are usefull while developing VTK-m" ON)
+
 mark_as_advanced(
  VTKm_NO_ASSERT
  VTKm_INSTALL_ONLY_LIBRARIES
  VTKm_USE_DEFAULT_SYMBOL_VISIBILITY
+  VTKm_ENABLE_DEVELOPER_FLAGS
  )

 #-----------------------------------------------------------------------------
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@ -21,4 +21,22 @@
 list(APPEND CTEST_CUSTOM_WARNING_EXCEPTION
  ".*warning: ignoring loop annotation.*"
  ".*warning: Included by graph for.*not generated, too many nodes. Consider increasing DOT_GRAPH_MAX_NODES."
+
+  # disable doxygen warning about potential recursion.
+  ".*warning: Detected potential recursive class relation between class vtkm::exec::internal::ArrayPortalTransform"
+
+  # disable doxygen warning about not generating graph
+  ".*warning: Included by graph for"
+
+  # disable doxygen warnings from CONTRIBUTING.md, CodingConventions.md.
+  # these files are really intended for Gitlab, hence we don't want to use
+  # doxygen tags in them.
+  "CONTRIBUTING.md.*warning"
+  "CodingConventions.md.*warning"
+
+  # disable PTX warning about recursive functions. These look like they can't be silenced
+  # without disabling all PTX warnings, so hide them on the dashboard.
+  # We explicitly only suppress specific worklets so we can see when new recursive
+  # worklets are added.
+  "ptxas warning : Stack size for entry function.*NearestNeighborSearch3DWorklet.*"
 )
--- a/examples/demo/Demo.cxx
+++ b/examples/demo/Demo.cxx
@ -72,7 +72,7 @@ int main(int argc, char* argv[])
    std::cout << "using: " << argv[1] << " as MarchingCubes input file" << std::endl;
    vtkm::io::reader::VTKDataSetReader reader(argv[1]);
    inputData = reader.ReadDataSet();
-    isovalue = atof(argv[2]);
+    isovalue = static_cast<vtkm::Float32>(atof(argv[2]));
    fieldName = "SCALARS:pointvar";
  }

--- a/examples/game_of_life/GameOfLife.cxx
+++ b/examples/game_of_life/GameOfLife.cxx
@ -94,10 +94,10 @@ struct UpdateLifeState : public vtkm::worklet::WorkletPointNeighborhood3x3x3
    // Any live cell with two or three live neighbors lives on to the next generation.
    // Any live cell with more than three live neighbors dies, as if by overcrowding.
    // Any dead cell with exactly three live neighbors becomes a live cell, as if by reproduction.
-    vtkm::UInt8 current = prevstate.Get(0, 0, 0);
-    vtkm::UInt8 count = prevstate.Get(-1, -1, 0) + prevstate.Get(-1, 0, 0) +
-      prevstate.Get(-1, 1, 0) + prevstate.Get(0, -1, 0) + prevstate.Get(0, 1, 0) +
-      prevstate.Get(1, -1, 0) + prevstate.Get(1, 0, 0) + prevstate.Get(1, 1, 0);
+    auto current = prevstate.Get(0, 0, 0);
+    auto count = prevstate.Get(-1, -1, 0) + prevstate.Get(-1, 0, 0) + prevstate.Get(-1, 1, 0) +
+      prevstate.Get(0, -1, 0) + prevstate.Get(0, 1, 0) + prevstate.Get(1, -1, 0) +
+      prevstate.Get(1, 0, 0) + prevstate.Get(1, 1, 0);

    if (current == 1 && (count == 2 || count == 3))
    {
@ -113,8 +113,8 @@ struct UpdateLifeState : public vtkm::worklet::WorkletPointNeighborhood3x3x3
    }

    color[0] = 0;
-    color[1] = state * (100 + (count * 32));
-    color[2] = (state && !current) ? (100 + (count * 32)) : 0;
+    color[1] = static_cast<vtkm::UInt8>(state * (100 + (count * 32)));
+    color[2] = (state && !current) ? static_cast<vtkm::UInt8>(100 + (count * 32)) : 0;
    color[3] = 255; //alpha channel
  }
 };
--- a/examples/histogram/HistogramMPI.hxx
+++ b/examples/histogram/HistogramMPI.hxx
@ -84,12 +84,12 @@ public:
    // now reduce across ranks using MPI.

    // converting to std::vector
-    std::vector<vtkm::Id> send_buf(numBins);
+    std::vector<vtkm::Id> send_buf(static_cast<std::size_t>(numBins));
    std::copy(vtkm::cont::ArrayPortalToIteratorBegin(local.GetPortalConstControl()),
              vtkm::cont::ArrayPortalToIteratorEnd(local.GetPortalConstControl()),
              send_buf.begin());

-    std::vector<vtkm::Id> recv_buf(numBins);
+    std::vector<vtkm::Id> recv_buf(static_cast<std::size_t>(numBins));
    MPI_Reduce(&send_buf[0],
               &recv_buf[0],
               static_cast<int>(numBins),
--- a/examples/multi_backend/CMakeLists.txt
+++ b/examples/multi_backend/CMakeLists.txt
@ -23,15 +23,28 @@ cmake_minimum_required(VERSION 3.3 FATAL_ERROR)
 project(MultiBackend CXX)

 #Find the VTK-m package
-find_package(VTKm REQUIRED QUIET)
+find_package(VTKm REQUIRED)
+find_package(Threads REQUIRED QUIET)
+
+set(headers
+  IOGenerator.h
+  MultiDeviceGradient.h
+  TaskQueue.h
+  )
+
+set(device_srcs
+  MultiDeviceGradient.cxx
+  )

 set(srcs
-    MultiBackend.cxx)
+  IOGenerator.cxx
+  MultiBackend.cxx
+  )

 if(TARGET vtkm::cuda)
-  vtkm_compile_as_cuda(cuda_srcs ${srcs})
-  set(srcs ${cuda_srcs})
+  vtkm_compile_as_cuda(cuda_srcs ${device_srcs})
+  set(device_srcs ${cuda_srcs})
 endif()

-add_executable(MultiBackend ${srcs})
-target_link_libraries(MultiBackend PRIVATE vtkm_cont)
+add_executable(MultiBackend ${device_srcs} ${srcs} ${headers})
+target_link_libraries(MultiBackend PRIVATE vtkm_cont Threads::Threads)
--- a/examples/multi_backend/IOGenerator.cxx
+++ b/examples/multi_backend/IOGenerator.cxx
@ -0,0 +1,111 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#include "IOGenerator.h"
+
+#include <vtkm/Math.h>
+
+#include <vtkm/cont/DataSetBuilderUniform.h>
+#include <vtkm/cont/DataSetFieldAdd.h>
+
+#include <vtkm/worklet/DispatcherMapField.h>
+#include <vtkm/worklet/WorkletMapField.h>
+
+#include <vtkm/cont/serial/DeviceAdapterSerial.h>
+
+#include <chrono>
+#include <random>
+
+struct WaveField : public vtkm::worklet::WorkletMapField
+{
+  typedef void ControlSignature(FieldIn<Vec3>, FieldOut<Vec3>);
+  typedef void ExecutionSignature(_1, _2);
+
+  template <typename T>
+  VTKM_EXEC void operator()(const vtkm::Vec<T, 3>& input, vtkm::Vec<T, 3>& output) const
+  {
+    output[0] = input[0];
+    output[1] = 0.25f * vtkm::Sin(input[0]) * vtkm::Cos(input[2]);
+    output[2] = input[2];
+  }
+};
+
+vtkm::cont::DataSet make_test3DImageData(int xdim, int ydim, int zdim)
+{
+  using Builder = vtkm::cont::DataSetBuilderUniform;
+  using FieldAdd = vtkm::cont::DataSetFieldAdd;
+  vtkm::cont::DataSet ds = Builder::Create(vtkm::Id3{ xdim, ydim, zdim });
+
+  vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>> field;
+  vtkm::worklet::DispatcherMapField<WaveField, vtkm::cont::DeviceAdapterTagSerial> dispatcher;
+  dispatcher.Invoke(ds.GetCoordinateSystem(), field);
+
+  FieldAdd::AddPointField(ds, "vec_field", field);
+  return ds;
+}
+
+//=================================================================
+void io_generator(TaskQueue<vtkm::cont::MultiBlock>& queue, std::size_t numberOfTasks)
+{
+  //Step 1. We want to build an initial set of blocks
+  //that vary in size. This way we can generate uneven
+  //work to show off the vtk-m filter work distribution
+  vtkm::cont::DataSet small = make_test3DImageData(128, 128, 128);
+  vtkm::cont::DataSet medium = make_test3DImageData(256, 256, 128);
+  vtkm::cont::DataSet large = make_test3DImageData(512, 512, 128);
+
+  std::vector<vtkm::cont::DataSet> blocks;
+  blocks.push_back(small);
+  blocks.push_back(medium);
+  blocks.push_back(large);
+
+
+  std::mt19937 rng;
+  //uniform_int_distribution is a closed interval [] so both the min and max
+  //can be chosen values
+  std::uniform_int_distribution<vtkm::Id> blockNumGen(6, 32);
+  std::uniform_int_distribution<std::size_t> blockPicker(0, blocks.size() - 1);
+  for (std::size_t i = 0; i < numberOfTasks; ++i)
+  {
+    //Step 2. Construct a random number of blocks
+    const vtkm::Id numberOfBlocks = blockNumGen(rng);
+
+    //Step 3. Randomly pick the blocks in the dataset
+    vtkm::cont::MultiBlock mb(numberOfBlocks);
+    for (vtkm::Id b = 0; b < numberOfBlocks; ++b)
+    {
+      mb.AddBlock(blocks[blockPicker(rng)]);
+    }
+
+    std::cout << "adding multi-block with " << mb.GetNumberOfBlocks() << " blocks" << std::endl;
+
+    //Step 4. Add the multi-block to the queue. We explicitly
+    //use std::move to signal that this thread can't use the
+    //mb object after this call
+    queue.push(std::move(mb));
+
+    //Step 5. Go to sleep for a period of time to replicate
+    //data stream in
+    // std::this_thread::sleep_for(std::chrono::seconds(1));
+  }
+
+  //Step 6. Tell the queue that we are done submitting work
+  queue.shutdown();
+  std::cout << "io_generator finished" << std::endl;
+}
--- a/examples/multi_backend/IOGenerator.h
+++ b/examples/multi_backend/IOGenerator.h
@ -0,0 +1,30 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_examples_multibackend_IOWorker_h
+#define vtk_m_examples_multibackend_IOWorker_h
+
+#include "TaskQueue.h"
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/MultiBlock.h>
+
+vtkm::cont::DataSet make_test3DImageData(int xdim, int ydim, int zdim);
+void io_generator(TaskQueue<vtkm::cont::MultiBlock>& queue, std::size_t numberOfTasks);
+
+#endif
--- a/examples/multi_backend/MultiBackend.cxx
+++ b/examples/multi_backend/MultiBackend.cxx
@ -17,118 +17,96 @@
 //  Laboratory (LANL), the U.S. Government retains certain rights in
 //  this software.
 //============================================================================
+
 #include <iostream>
+#include <thread>

-#include <vtkm/Math.h>
-#include <vtkm/cont/ArrayHandle.h>
-#include <vtkm/cont/RuntimeDeviceInformation.h>
+#include <vtkm/cont/MultiBlock.h>

-#include <vtkm/worklet/DispatcherMapField.h>
-#include <vtkm/worklet/WorkletMapField.h>
+#include "IOGenerator.h"
+#include "MultiDeviceGradient.h"
+#include "TaskQueue.h"

-#include <vtkm/cont/TryExecute.h>
-#include <vtkm/cont/cuda/DeviceAdapterCuda.h>
-#include <vtkm/cont/serial/DeviceAdapterSerial.h>
-#include <vtkm/cont/tbb/DeviceAdapterTBB.h>
-
-using FloatVec3 = vtkm::Vec<vtkm::Float32, 3>;
-using Uint8Vec4 = vtkm::Vec<vtkm::UInt8, 4>;
-
-struct GenerateSurfaceWorklet : public vtkm::worklet::WorkletMapField
-{
-  vtkm::Float32 t;
-  GenerateSurfaceWorklet(vtkm::Float32 st)
-    : t(st)
-  {
-  }
-
-  typedef void ControlSignature(FieldIn<>, FieldOut<>, FieldOut<>);
-  typedef void ExecutionSignature(_1, _2, _3);
-
-  template <typename T>
-  VTKM_EXEC void operator()(const vtkm::Vec<T, 3>& input,
-                            vtkm::Vec<T, 3>& output,
-                            vtkm::Vec<vtkm::UInt8, 4>& color) const
-  {
-    output[0] = input[0];
-    output[1] = 0.25f * vtkm::Sin(input[0] * 10.f + t) * vtkm::Cos(input[2] * 10.f + t);
-    output[2] = input[2];
-
-    color[0] = 0;
-    color[1] = static_cast<vtkm::UInt8>(160 + (96 * vtkm::Sin(input[0] * 10.f + t)));
-    color[2] = static_cast<vtkm::UInt8>(160 + (96 * vtkm::Cos(input[2] * 5.f + t)));
-    color[3] = 255;
-  }
-};
-
-struct RunGenerateSurfaceWorklet
-{
-  template <typename DeviceAdapterTag>
-  bool operator()(DeviceAdapterTag) const
-  {
-    //At this point we know we have runtime support
-    using DeviceTraits = vtkm::cont::DeviceAdapterTraits<DeviceAdapterTag>;
-
-    using DispatcherType =
-      vtkm::worklet::DispatcherMapField<GenerateSurfaceWorklet, DeviceAdapterTag>;
-
-    std::cout << "Running a worklet on device adapter: " << DeviceTraits::GetName() << std::endl;
-
-    GenerateSurfaceWorklet worklet(0.05f);
-    DispatcherType(worklet).Invoke(this->In, this->Out, this->Color);
-
-    return true;
-  }
-
-  vtkm::cont::ArrayHandle<FloatVec3> In;
-  vtkm::cont::ArrayHandle<FloatVec3> Out;
-  vtkm::cont::ArrayHandle<Uint8Vec4> Color;
-};
-
-template <typename T>
-std::vector<vtkm::Vec<T, 3>> make_testData(int size)
-{
-  std::vector<vtkm::Vec<T, 3>> data;
-  data.reserve(static_cast<std::size_t>(size * size));
-  for (int i = 0; i < size; ++i)
-  {
-    for (int j = 0; j < size; ++j)
-    {
-      data.push_back(vtkm::Vec<T, 3>(
-        2.f * static_cast<T>(i / size) - 1.f, 0.f, 2.f * static_cast<T>(j / size) - 1.f));
-    }
-  }
-  return data;
-}
-
-//This is the list of devices to compile in support for. The order of the
-//devices determines the runtime preference.
-struct DevicesToTry : vtkm::ListTagBase<vtkm::cont::DeviceAdapterTagCuda,
-                                        vtkm::cont::DeviceAdapterTagTBB,
-                                        vtkm::cont::DeviceAdapterTagSerial>
-{
-};
+//This demo shows off using vtk-m in multiple threads in two different ways.
+//
+//At a high level we have 2 primary threads, an IO thread and a Worker thread
+//The IO thread will generate all data using the vtk-m serial device, and
+//will post this data to a worker queue as a vtk-m multiblock.
+//The Worker thread will pull down these vtk-m multiblock data and run a
+//vtk-m filter on the multiblock.
+//The vtk-m filter it runs will itself have a worker pool which it will
+//distribute work to. The number of workers is based on what device adapters
+//are enabled but uses the following logic:
+// -  If TBB is enabled construct a single TBB worker
+// -  If CUDA is enabled construct 4 workers for each GPU on the machine
+//
+//Unfortunately due to some thread unsafe logic in VTK-m it is currently not
+//possible to have CUDA and TBB workers at the same time. So the class will
+//choose CUDA over TBB when possible.
+//Once the thread unsafe logic is fixed, on a machine that has a single CPU
+//and a single GPU we should expect to have 2 primary 'main loop'
+//threads, and 5 threads for heavy 'task' work.

+void multiblock_processing(TaskQueue<vtkm::cont::MultiBlock>& queue);
 int main(int, char**)
 {
-  std::vector<FloatVec3> data = make_testData<vtkm::Float32>(1024);
+  //Step 1. Construct the two primary 'main loops'. The threads
+  //share a queue object so we need to explicitly pass it
+  //by reference (the std::ref call)
+  TaskQueue<vtkm::cont::MultiBlock> queue;
+  std::thread io(io_generator, std::ref(queue), 12);
+  std::thread worker(multiblock_processing, std::ref(queue));

-  //make array handles for the data
-
-  // TryExecutes takes a functor and a list of devices. It then tries to run
-  // the functor for each device (in the order given in the list) until the
-  // execution succeeds. This allows you to compile in support for multiple
-  // devices which have runtime requirements ( GPU / HW Accelerator ) and
-  // correctly choose the best device at runtime.
-  //
-  // The functor parentheses operator should take exactly one argument, which is
-  // the DeviceAdapterTag to use. The functor should return true if the execution
-  // succeeds.
-  //
-  // This function also optionally takes a vtkm::cont::RuntimeDeviceTracker, which
-  // will monitor for certain failures across calls to TryExecute and skip trying
-  // devices with a history of failure.
-  RunGenerateSurfaceWorklet task;
-  task.In = vtkm::cont::make_ArrayHandle(data);
-  vtkm::cont::TryExecute(task, DevicesToTry());
+  //Step N. Wait for the work to finish
+  io.join();
+  worker.join();
+  return 0;
+}
+
+//=================================================================
+void multiblock_processing(TaskQueue<vtkm::cont::MultiBlock>& queue)
+{
+  //Step 1. Construct the gradient filter outside the work loop
+  //so that we can reuse the thread pool it constructs
+  MultiDeviceGradient gradient;
+  gradient.SetComputePointGradient(true);
+  while (queue.hasTasks())
+  {
+    //Step 2. grab the next multi-block skipping any that are empty
+    //as empty ones can be returned when the queue is about
+    //to say it has no work
+    vtkm::cont::MultiBlock mb = queue.pop();
+    if (mb.GetNumberOfBlocks() == 0)
+    {
+      continue;
+    }
+
+    //Step 3. Get the first field name from the multi-block
+    std::string fieldName = mb.GetBlock(0).GetField(0).GetName();
+
+    //Step 4. Run a multi device gradient
+    gradient.SetActiveField(fieldName);
+    vtkm::cont::MultiBlock result = gradient.Execute(mb);
+    std::cout << "finished processing a multi-block" << std::endl;
+
+    //Step 5. Verify each block has a "Gradients" field
+    for (auto&& block : result)
+    {
+      // std::cout << std::endl << std::endl << std::endl;
+      // std::cout << "block: " << std::endl;
+      // block.PrintSummary(std::cout);
+      try
+      {
+        const auto& field = block.GetField("Gradients", vtkm::cont::Field::ASSOC_POINTS);
+        (void)field;
+      }
+      catch (vtkm::cont::ErrorBadValue)
+      {
+        std::cerr << "gradient filter failed!" << std::endl;
+        break;
+      }
+    }
+  }
+
+  std::cout << "multiblock_processing finished" << std::endl;
 }
--- a/examples/multi_backend/MultiDeviceGradient.cxx
+++ b/examples/multi_backend/MultiDeviceGradient.cxx
@ -0,0 +1,28 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#define vtk_m_examples_multibackend_MultiDeviceGradient_cxx
+
+#include "MultiDeviceGradient.h"
+#include "MultiDeviceGradient.hxx"
+
+template vtkm::cont::MultiBlock MultiDeviceGradient::PrepareForExecution<
+  vtkm::filter::PolicyDefault>(const vtkm::cont::MultiBlock&,
+                               const vtkm::filter::PolicyBase<vtkm::filter::PolicyDefault>&);
--- a/examples/multi_backend/MultiDeviceGradient.h
+++ b/examples/multi_backend/MultiDeviceGradient.h
@ -0,0 +1,94 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_examples_multibackend_MultiDeviceGradient_h
+#define vtk_m_examples_multibackend_MultiDeviceGradient_h
+
+
+#include <vtkm/filter/FilterField.h>
+#include <vtkm/filter/FilterTraits.h>
+
+#include "TaskQueue.h"
+
+#include <thread>
+
+using RuntimeTaskQueue = TaskQueue<std::function<void(const vtkm::cont::RuntimeDeviceTracker&)>>;
+
+/// \brief Construct a MultiDeviceGradient for a given multiblock dataset
+///
+/// The Policy used with MultiDeviceGradient must include the TBB and CUDA
+/// backends.
+///
+/// The filter owns a task queue and a collection of worker threads. The
+/// constructor, destructor and PrepareForExecution are only declared here;
+/// their definitions are in MultiDeviceGradient.hxx.
+class MultiDeviceGradient : public vtkm::filter::FilterField<MultiDeviceGradient>
+{
+public:
+  //Construct a MultiDeviceGradient and worker pool
+  VTKM_CONT
+  MultiDeviceGradient();
+
+  //Needed so that we can shut down the worker pool properly
+  VTKM_CONT
+  ~MultiDeviceGradient();
+
+  /// When this flag is on (default is off), the gradient filter will provide
+  /// point based gradients, which are significantly more costly since for each
+  /// point we need to compute the gradient of each cell that uses it.
+  void SetComputePointGradient(bool enable) { ComputePointGradient = enable; }
+  bool GetComputePointGradient() const { return ComputePointGradient; }
+
+  /// Will submit each block to a work queue that the threads will
+  /// pull work from. Takes the input MultiBlock and the policy to execute
+  /// with, and returns a MultiBlock holding the results.
+  template <typename DerivedPolicy>
+  VTKM_CONT vtkm::cont::MultiBlock PrepareForExecution(
+    const vtkm::cont::MultiBlock&,
+    const vtkm::filter::PolicyBase<DerivedPolicy>&);
+
+private:
+  // Value handed to the per-block gradient filter; see SetComputePointGradient
+  bool ComputePointGradient;
+  // Queue of pending per-block tasks shared with the worker threads
+  RuntimeTaskQueue Queue;
+  // Worker threads; joined in the destructor
+  std::vector<std::thread> Workers;
+};
+
+namespace vtkm
+{
+namespace filter
+{
+// Specialization of FilterTraits for MultiDeviceGradient. It sets
+// InputFieldTypeList to the scalar and 3-component vector types built on
+// Float32 and Float64.
+template <>
+class FilterTraits<MultiDeviceGradient>
+{
+public:
+  // List of the four value types named above
+  struct TypeListTagGradientInputs : vtkm::ListTagBase<vtkm::Float32,
+                                                       vtkm::Float64,
+                                                       vtkm::Vec<vtkm::Float32, 3>,
+                                                       vtkm::Vec<vtkm::Float64, 3>>
+  {
+  };
+
+  using InputFieldTypeList = TypeListTagGradientInputs;
+};
+}
+} // namespace vtkm::filter
+
+
+// Declare the PolicyDefault instantiation of PrepareForExecution as extern so
+// that includers do not instantiate it themselves. The guard macro is defined
+// by MultiDeviceGradient.cxx, which supplies the explicit instantiation.
+#ifndef vtk_m_examples_multibackend_MultiDeviceGradient_cxx
+extern template vtkm::cont::MultiBlock MultiDeviceGradient::PrepareForExecution<
+  vtkm::filter::PolicyDefault>(const vtkm::cont::MultiBlock&,
+                               const vtkm::filter::PolicyBase<vtkm::filter::PolicyDefault>&);
+#endif
+
+#endif
--- a/examples/multi_backend/MultiDeviceGradient.hxx
+++ b/examples/multi_backend/MultiDeviceGradient.hxx
@ -0,0 +1,231 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#include <vtkm/cont/RuntimeDeviceTracker.h>
+#include <vtkm/cont/cuda/DeviceAdapterCuda.h>
+#include <vtkm/cont/tbb/DeviceAdapterTBB.h>
+
+#include <vtkm/filter/Gradient.h>
+
+
+namespace
+{
+//Report how many CUDA devices the runtime can see.
+//Returns 0 when VTKM_ENABLE_CUDA is not defined or when the
+//device count query does not succeed.
+vtkm::Id deterine_cuda_gpu_count()
+{
+#if defined(VTKM_ENABLE_CUDA)
+  int deviceCount = 0;
+  if (cudaGetDeviceCount(&deviceCount) == cudaSuccess)
+  {
+    return static_cast<vtkm::Id>(deviceCount);
+  }
+#endif
+  return 0;
+}
+
+//Worker loop for a thread that executes queued tasks on the TBB backend.
+//
+//queue: the task queue shared with the producer and the other workers.
+//The loop keeps running until queue.hasTasks() reports false.
+void process_block_tbb(RuntimeTaskQueue& queue)
+{
+  //Step 1. Set the device adapter to this thread to TBB.
+  //This makes sure that any vtkm::filters used by our
+  //task operate only on TBB
+  //
+  vtkm::cont::RuntimeDeviceTracker tracker;
+  tracker.ForceDevice(vtkm::cont::DeviceAdapterTagTBB{});
+
+  while (queue.hasTasks())
+  {
+    //Step 2. Get the task to run on TBB
+    auto task = queue.pop();
+
+    //Step 3. We could be given an empty task when the queue is empty
+    //and we are shutting down. An empty task was never pushed, so it
+    //must not be reported as completed (that would drive the queue's
+    //outstanding task count below zero) nor logged as a finished block.
+    if (task == nullptr)
+    {
+      continue;
+    }
+
+    //Step 4. Run the task on TBB
+    task(tracker);
+
+    //Step 5. Notify the queue that we finished processing this task
+    queue.completedTask();
+    std::cout << "finished a block on tbb (" << std::this_thread::get_id() << ")" << std::endl;
+  }
+}
+
+//Worker loop for a thread that executes queued tasks on the CUDA backend.
+//
+//queue: the task queue shared with the producer and the other workers.
+//gpuId: the CUDA device this thread selects before processing tasks. It is
+//       ignored when VTKM_ENABLE_CUDA is not defined.
+//The loop keeps running until queue.hasTasks() reports false.
+void process_block_cuda(RuntimeTaskQueue& queue, int gpuId)
+{
+  //Step 1. Set the device adapter to this thread to cuda.
+  //This makes sure that any vtkm::filters used by our
+  //task operate only on cuda
+  //
+  vtkm::cont::RuntimeDeviceTracker tracker;
+#if defined(VTKM_ENABLE_CUDA)
+  const auto error = cudaSetDevice(gpuId);
+  if (error != cudaSuccess)
+  {
+    //Report the failure instead of silently dropping it. The worker keeps
+    //running so that queued tasks are still consumed and anybody waiting
+    //on the queue is not left blocked forever.
+    std::cerr << "unable to select cuda device " << gpuId << ": " << cudaGetErrorString(error)
+              << std::endl;
+  }
+  tracker.ForceDevice(vtkm::cont::DeviceAdapterTagCuda{});
+#endif
+  (void)gpuId;
+
+  while (queue.hasTasks())
+  {
+    //Step 2. Get the task to run on cuda
+    auto task = queue.pop();
+
+    //Step 3. We could be given an empty task when the queue is empty
+    //and we are shutting down. An empty task was never pushed, so it
+    //must not be reported as completed (that would drive the queue's
+    //outstanding task count below zero) nor logged as a finished block.
+    if (task == nullptr)
+    {
+      continue;
+    }
+
+    //Step 4. Run the task on cuda
+    task(tracker);
+
+    //Step 5. Notify the queue that we finished processing this task
+    queue.completedTask();
+    std::cout << "finished a block on cuda (" << std::this_thread::get_id() << ")" << std::endl;
+  }
+}
+
+} //namespace
+
+//-----------------------------------------------------------------------------
+// Build the filter and start its pool of worker threads. CUDA workers are
+// preferred; a single TBB worker is started when no CUDA worker was created.
+VTKM_CONT MultiDeviceGradient::MultiDeviceGradient()
+  : ComputePointGradient(false)
+  , Queue()
+  , Workers()
+{
+  //Step 1. Determine the number of workers we want
+  vtkm::cont::RuntimeDeviceTracker tracker;
+  const bool runOnTbb = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagTBB{});
+  const bool runOnCuda = tracker.CanRunOn(vtkm::cont::DeviceAdapterTagCuda{});
+
+  //Note currently the virtual implementation has some issues
+  //In a multi-threaded environment only cuda can be used or
+  //all SMP backends ( Serial, TBB, OpenMP ).
+  //Once this issue is resolved we can enable CUDA + TBB in
+  //this example
+
+  //Step 2. Launch workers that will use cuda (if enabled).
+  //The threads share a queue object so we need to explicitly pass it
+  //by reference (the std::ref call)
+  if (runOnCuda)
+  {
+    std::cout << "adding cuda workers" << std::endl;
+
+    //The number of workers per GPU is purely arbitrary currently,
+    //but in general we want multiple of them so we can overlap compute
+    //and transfer
+    const int workersPerGpu = 4;
+
+    const vtkm::Id gpu_count = deterine_cuda_gpu_count();
+    for (vtkm::Id i = 0; i < gpu_count; ++i)
+    {
+      //process_block_cuda takes the device id as an int
+      const int gpuId = static_cast<int>(i);
+      for (int worker = 0; worker < workersPerGpu; ++worker)
+      {
+        this->Workers.emplace_back(process_block_cuda, std::ref(this->Queue), gpuId);
+      }
+    }
+  }
+
+  //Step 3. Launch a worker that will use tbb (if enabled).
+  //This is only done when no cuda worker was started, which also covers
+  //the case where cuda is reported as usable but no GPU could be counted.
+  //The threads share a queue object so we need to explicitly pass it
+  //by reference (the std::ref call)
+  if (this->Workers.empty() && runOnTbb)
+  {
+    std::cout << "adding a tbb worker" << std::endl;
+    this->Workers.emplace_back(process_block_tbb, std::ref(this->Queue));
+  }
+
+  //Step 4. Without any worker nothing will ever take tasks off the queue,
+  //so make the situation visible rather than letting a later execute
+  //wait forever without explanation.
+  if (this->Workers.empty())
+  {
+    std::cerr << "warning: MultiDeviceGradient could not start any cuda or tbb workers; "
+              << "submitted blocks will never be processed" << std::endl;
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Stop the worker pool: flag the queue as shutting down, then wait for
+// every worker thread to exit.
+VTKM_CONT MultiDeviceGradient::~MultiDeviceGradient()
+{
+  //mark the queue as shutting down; this also notifies the waiting workers
+  this->Queue.shutdown();
+
+  //block until each worker thread has returned
+  for (std::thread& worker : this->Workers)
+  {
+    worker.join();
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Compute the gradient of every block of `mb` using the worker pool.
+//
+// mb:     the input blocks; may be empty, in which case an empty result
+//         is returned without submitting any work.
+// policy: the policy forwarded to the per-block vtkm::filter::Gradient.
+// Returns a MultiBlock with one result DataSet per input block, stored at
+// the same index as its input.
+template <typename DerivedPolicy>
+inline VTKM_CONT vtkm::cont::MultiBlock MultiDeviceGradient::PrepareForExecution(
+  const vtkm::cont::MultiBlock& mb,
+  const vtkm::filter::PolicyBase<DerivedPolicy>& policy)
+{
+  //Step 1. Reset the queue state (shutdown flag and outstanding task count).
+  //This is needed to happen for each execute as we want to support
+  //the same filter being used for multiple inputs
+  this->Queue.reset();
+
+  //Step 2. Construct the multi-block we are going to fill. It is given one
+  //default constructed DataSet per input block, which the tasks replace
+  vtkm::cont::MultiBlock output;
+  output.AddBlocks(std::vector<vtkm::cont::DataSet>(mb.GetNumberOfBlocks()));
+  vtkm::cont::MultiBlock* outPtr = &output;
+
+  //With no input blocks there is nothing to submit, and the first block
+  //must not be dereferenced below
+  if (mb.GetNumberOfBlocks() == 0)
+  {
+    return output;
+  }
+
+  //Step 3. Construct the filter we want to run on each block
+  vtkm::filter::Gradient gradient;
+  gradient.SetComputePointGradient(this->GetComputePointGradient());
+  gradient.SetActiveField(this->GetActiveFieldName());
+
+  //Helper that queues the work for one block. The task copies everything
+  //it needs (filter, input, policy, output pointer and index) so that it
+  //does not refer to any local of this helper once it runs on a worker.
+  auto submitBlock = [&](vtkm::Id index, const vtkm::cont::DataSet& input) {
+    this->Queue.push( //build a lambda that is the work to do
+      [=](const vtkm::cont::RuntimeDeviceTracker& tracker) {
+        //make a per thread copy of the filter
+        //and give it the device tracker
+        vtkm::filter::Gradient perThreadGrad = gradient;
+        perThreadGrad.SetRuntimeDeviceTracker(tracker);
+
+        vtkm::cont::DataSet result = perThreadGrad.Execute(input, policy);
+        outPtr->ReplaceBlock(index, result);
+      });
+  };
+
+  //Step 3b. Post 1 block up as work and block until it is
+  //complete. This is needed as currently constructing the virtual
+  //Point Coordinates is not thread safe.
+  auto block = mb.cbegin();
+  submitBlock(0, *block);
+  this->Queue.waitForAllTasksToComplete();
+  ++block;
+
+  //Step 4. Add a task for each remaining input block to the queue for
+  //workers to take. This allows us to have multiple workers execute in a
+  //non blocking manner
+  vtkm::Id index = 1;
+  for (; block != mb.cend(); ++block)
+  {
+    submitBlock(index, *block);
+    index++;
+  }
+
+  // Step 5. Wait on all workers to finish. `output` is only safe to return
+  // once no task still holds a pointer to it.
+  this->Queue.waitForAllTasksToComplete();
+
+  return output;
+}
--- a/examples/multi_backend/TaskQueue.h
+++ b/examples/multi_backend/TaskQueue.h
@ -0,0 +1,151 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_examples_multibackend_TaskQueue_h
+#define vtk_m_examples_multibackend_TaskQueue_h
+
+#include <vtkm/cont/MultiBlock.h>
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+/// \brief A mutex protected FIFO of tasks shared by a producer and workers.
+///
+/// Besides the queued items the class tracks how many pushed tasks have not
+/// yet been reported as finished through completedTask(), which is what
+/// waitForAllTasksToComplete() waits on. T must be default constructible
+/// and movable; a value-initialized T is what pop() hands back when the
+/// queue is shutting down and holds nothing.
+template <typename T>
+class TaskQueue
+{
+public:
+  TaskQueue() = default;
+
+  //Clear the shutdown flag and set the outstanding task count back
+  //to zero. Items still sitting in the queue are left untouched.
+  void reset()
+  {
+    {
+      std::unique_lock<std::mutex> lock(this->Lock);
+      this->ShutdownOnceTasksCompleted = false;
+      this->TaskCount = 0;
+    }
+    this->CV.notify_all();
+  }
+
+  //Flag that no more tasks will be submitted and wake every waiter
+  void shutdown()
+  {
+    {
+      std::unique_lock<std::mutex> lock(this->Lock);
+      this->ShutdownOnceTasksCompleted = true;
+    }
+    this->CV.notify_all();
+  }
+
+  //Say we always have tasks while the producer (IO) hasn't
+  //reported it is finished adding tasks. Once it has finished
+  //submitting tasks, we run until the queue is empty
+  bool hasTasks()
+  {
+    std::unique_lock<std::mutex> lock(this->Lock);
+    if (this->ShutdownOnceTasksCompleted)
+    {
+      return !this->Queue.empty();
+    }
+    return true;
+  }
+
+  //Add a task to the Queue. The item is moved into the queue
+  //rather than copied.
+  void push(T&& item)
+  {
+    {
+      std::unique_lock<std::mutex> lock(this->Lock);
+      this->Queue.push(std::move(item));
+      this->TaskCount++;
+    } //unlock before we notify so we don't deadlock
+    this->CV.notify_all();
+  }
+
+  //Get a task from the Queue. Blocks while the queue is empty and
+  //shutdown() has not been called. Returns a value-initialized T when
+  //woken by a shutdown with nothing left in the queue.
+  T pop()
+  {
+    //value-initialize so that the "nothing to pop" result is well
+    //defined for every T, including built-in types
+    T item{};
+    {
+      //wait for a job to come into the queue
+      std::unique_lock<std::mutex> lock(this->Lock);
+      this->CV.wait(lock, [this] {
+        //if we are shutting down we need to always wake up
+        if (this->ShutdownOnceTasksCompleted)
+        {
+          return true;
+        }
+        //if we aren't shutting down sleep when we have no work
+        return !this->Queue.empty();
+      });
+
+      //When shutting down we don't check the queue size
+      //so make sure we have something to pop
+      if (!this->Queue.empty())
+      {
+        //take the job; the front element is discarded right after
+        //so it can be moved from
+        item = std::move(this->Queue.front());
+        this->Queue.pop();
+      }
+    } //unlock before we notify so we don't deadlock
+
+    this->CV.notify_all();
+    return item;
+  }
+
+  //Report that you finished processing a task popped from
+  //the Queue
+  void completedTask()
+  {
+    {
+      std::unique_lock<std::mutex> lock(this->Lock);
+      this->TaskCount--;
+    } //unlock before we notify so we don't deadlock
+    this->CV.notify_all();
+  }
+
+  //Wait for all tasks to be removed from the queue
+  //and to be completed.
+  //For this to work, threads must call completedTask() after
+  //processing the data they got from pop()
+  //
+
+  void waitForAllTasksToComplete()
+  {
+    {
+      std::unique_lock<std::mutex> lock(this->Lock);
+      this->CV.wait(lock, [this] { return this->TaskCount == 0; });
+    }
+    this->CV.notify_all();
+  }
+
+private:
+  //Guards every member below
+  std::mutex Lock;
+  std::queue<T> Queue;
+  std::condition_variable CV;
+  //Number of pushed tasks not yet reported through completedTask()
+  int TaskCount = 0;
+  bool ShutdownOnceTasksCompleted = false;
+
+  //don't want copies of this
+  TaskQueue(const TaskQueue& rhs) = delete;
+  TaskQueue& operator=(const TaskQueue& rhs) = delete;
+  TaskQueue(TaskQueue&& rhs) = delete;
+  TaskQueue& operator=(TaskQueue&& rhs) = delete;
+};
+
+#endif
--- a/examples/unified_memory/UnifiedMemory.cu
+++ b/examples/unified_memory/UnifiedMemory.cu
@ -129,7 +129,10 @@ public:
  typedef _2 ExecutionSignature(_1, WorkIndex);

  VTKM_EXEC
-  vtkm::Float32 operator()(vtkm::Int64 x, vtkm::Id& index) const { return (vtkm::Sin(1.0 * x)); }
+  vtkm::Float32 operator()(vtkm::Int64 x, vtkm::Id&) const
+  {
+    return (vtkm::Sin(static_cast<vtkm::Float32>(x)));
+  }
 };
 }
 }
--- a/vtkm/cont/CMakeLists.txt
+++ b/vtkm/cont/CMakeLists.txt
@ -78,7 +78,7 @@ set(headers
  ErrorFilterExecution.h
  ErrorExecution.h
  ErrorInternal.h
-  ExecutionObjectFactoryBase.h
+  ExecutionObjectBase.h
  Field.h
  FieldRangeCompute.h
  FieldRangeGlobalCompute.h
--- a/vtkm/cont/CellLocatorTwoLevelUniformGrid.h
+++ b/vtkm/cont/CellLocatorTwoLevelUniformGrid.h
@ -610,7 +610,7 @@ public:
    ArrayPortalConst<vtkm::Id> CellIds;
  };

-  struct TwoLevelUniformGridExecutionObjectFactory : public vtkm::cont::ExecutionObjectFactoryBase
+  struct TwoLevelUniformGridExecutionObjectFactory : public vtkm::cont::ExecutionObjectBase
  {
    template <typename DeviceAdapter>
    VTKM_CONT TwoLevelUniformGridExecution<DeviceAdapter> PrepareForExecution(
--- a/vtkm/cont/DataSetBuilderUniform.h
+++ b/vtkm/cont/DataSetBuilderUniform.h
@ -128,7 +128,7 @@ private:
                                           std::string coordNm,
                                           std::string cellNm)
  {
-    vtkm::Id dims[3];
+    vtkm::Id dims[3] = { 1, 1, 1 };
    int ndims = 0;
    for (int i = 0; i < 3; ++i)
    {
--- a/vtkm/cont/ExecutionObjectFactoryBase.h
+++ b/vtkm/cont/ExecutionObjectFactoryBase.h
@ -17,23 +17,23 @@
 //  Laboratory (LANL), the U.S. Government retains certain rights in
 //  this software.
 //============================================================================
-#ifndef vtk_m_cont_ExecutionObjectFactoryBase_h
-#define vtk_m_cont_ExecutionObjectFactoryBase_h
+#ifndef vtk_m_cont_ExecutionObjectBase_h
+#define vtk_m_cont_ExecutionObjectBase_h
 #include <vtkm/Types.h>
 namespace vtkm
 {
 namespace cont
 {
-/// Base \c ExecutionObjectFactoryBase for execution objects to inherit from so that
+/// Base \c ExecutionObjectBase for execution objects to inherit from so that
 /// you can use an arbitrary object as a parameter in an execution environment
 /// function. Any method you want to use on the execution side must have the
 /// VTKM_EXEC modifier.
 /// \tparam Device

-class ExecutionObjectFactoryBase
+class ExecutionObjectBase
 {
 };
 }
 } // namespace vtkm::cont

-#endif //vtk_m_cont_ExecutionObjectFactoryBase_h
+#endif //vtk_m_cont_ExecutionObjectBase_h
--- a/vtkm/cont/PointLocatorUniformGrid.h
+++ b/vtkm/cont/PointLocatorUniformGrid.h
@ -173,7 +173,8 @@ public:
  /// \param coords An ArrayHandle of x, y, z coordinates of input points.
  /// \param device Tag for selecting device adapter
  template <typename DeviceAdapter>
-  void Build(const vtkm::cont::ArrayHandle<vtkm::Vec<T, 3>>& coords, DeviceAdapter)
+  void Build(const vtkm::cont::ArrayHandle<vtkm::Vec<T, 3>>& coords,
+             DeviceAdapter vtkmNotUsed(device))
  {
    using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>;
    // generate unique id for each input point
--- a/vtkm/cont/arg/TransportTagExecObject.h
+++ b/vtkm/cont/arg/TransportTagExecObject.h
@ -24,7 +24,7 @@

 #include <vtkm/cont/arg/Transport.h>

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>


 namespace vtkm
@ -48,10 +48,10 @@ struct Transport<vtkm::cont::arg::TransportTagExecObject, ContObjectType, Device
 {
  // If you get a compile error here, it means you tried to use an object that is not an execution
  // object as an argument that is expected to be one. All execution objects are expected to
-  // inherit from vtkm::exec::ExecutionObjectFactoryBase.
+  // inherit from vtkm::cont::ExecutionObjectBase.
  VTKM_STATIC_ASSERT_MSG(
-    (std::is_base_of<vtkm::cont::ExecutionObjectFactoryBase, ContObjectType>::value),
-    "All execution objects are expected to inherit from vtkm::exec::ExecutionObjectFactoryBase");
+    (std::is_base_of<vtkm::cont::ExecutionObjectBase, ContObjectType>::value),
+    "All execution objects are expected to inherit from vtkm::cont::ExecutionObjectBase");

  using ExecObjectType = decltype(std::declval<ContObjectType>().PrepareForExecution(Device()));
  template <typename InputDomainType>
--- a/vtkm/cont/arg/TypeCheckTagExecObject.h
+++ b/vtkm/cont/arg/TypeCheckTagExecObject.h
@ -24,7 +24,7 @@

 #include <vtkm/cont/arg/TypeCheck.h>

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 #include <type_traits>

@ -36,7 +36,7 @@ namespace arg
 {

 /// The ExecObject type check passes for any object that inherits from \c
-/// ExecutionObjectFactoryBase. This is supposed to signify that the object can be
+/// ExecutionObjectBase. This is supposed to signify that the object can be
 /// used in the execution environment although there is no way to verify that.
 ///
 struct TypeCheckTagExecObject
@ -46,8 +46,7 @@ struct TypeCheckTagExecObject
 template <typename Type>
 struct TypeCheck<TypeCheckTagExecObject, Type>
 {
-  static constexpr bool value =
-    std::is_base_of<vtkm::cont::ExecutionObjectFactoryBase, Type>::value;
+  static constexpr bool value = std::is_base_of<vtkm::cont::ExecutionObjectBase, Type>::value;
 };
 }
 }
--- a/vtkm/cont/arg/testing/UnitTestTransportExecObject.cxx
+++ b/vtkm/cont/arg/testing/UnitTestTransportExecObject.cxx
@ -24,7 +24,7 @@

 #include <vtkm/cont/serial/DeviceAdapterSerial.h>

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 #include <vtkm/cont/testing/Testing.h>

@ -39,7 +39,7 @@ struct ExecutionObject
  vtkm::Int32 Number;
 };

-struct TestExecutionObject : public vtkm::cont::ExecutionObjectFactoryBase
+struct TestExecutionObject : public vtkm::cont::ExecutionObjectBase
 {
  vtkm::Int32 Number;

--- a/vtkm/cont/arg/testing/UnitTestTypeCheckExecObject.cxx
+++ b/vtkm/cont/arg/testing/UnitTestTypeCheckExecObject.cxx
@ -27,7 +27,7 @@
 namespace
 {

-struct TestExecutionObject : vtkm::cont::ExecutionObjectFactoryBase
+struct TestExecutionObject : vtkm::cont::ExecutionObjectBase
 {
 };
 struct TestNotExecutionObject
--- a/vtkm/cont/cuda/internal/CMakeLists.txt
+++ b/vtkm/cont/cuda/internal/CMakeLists.txt
@ -24,7 +24,9 @@ set(headers
  CudaAllocator.h
  DeviceAdapterAlgorithmCuda.h
  DeviceAdapterAlgorithmThrust.h
+  DeviceAdapterRuntimeDetectorCuda.h
  DeviceAdapterTagCuda.h
+  DeviceAdapterTimerImplementationCuda.h
  ExecutionArrayInterfaceBasicCuda.h
  MakeThrustIterator.h
  TaskTuner.h
@ -41,7 +43,9 @@ endif()

 target_sources(vtkm_cont PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterRuntimeDetectorCuda.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterTimerImplementationCuda.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
 )
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -31,29 +31,14 @@
 // Here are the actual implementation of the algorithms.
 #include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>

+// Here are the implementations of device adapter specific classes
+#include <vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h>
+#include <vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.h>
+
 #include <vtkm/exec/cuda/internal/TaskStrided.h>

 #include <cuda.h>

-namespace vtkm
-{
-namespace cont
-{
-namespace cuda
-{
-namespace internal
-{
-
-static __global__ void DetermineIfValidCudaDevice()
-{
-  //used only to see if we can launch kernels. It is possible to have a
-  //CUDA capable device, but still fail to have CUDA support.
-}
-}
-}
-}
-}
-
 namespace vtkm
 {
 namespace cont
@ -71,129 +56,6 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
  }
 };

-/// CUDA contains its own high resolution timer.
-///
-template <>
-class DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>
-{
-public:
-  VTKM_CONT DeviceAdapterTimerImplementation()
-  {
-    VTKM_CUDA_CALL(cudaEventCreate(&this->StartEvent));
-    VTKM_CUDA_CALL(cudaEventCreate(&this->EndEvent));
-    this->Reset();
-  }
-  VTKM_CONT ~DeviceAdapterTimerImplementation()
-  {
-    // These aren't wrapped in VTKM_CUDA_CALL because we can't throw errors
-    // from destructors. We're relying on cudaGetLastError in the
-    // VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR catching any issues from these calls
-    // later.
-    cudaEventDestroy(this->StartEvent);
-    cudaEventDestroy(this->EndEvent);
-  }
-
-  VTKM_CONT void Reset()
-  {
-    VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
-    VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
-  }
-
-  VTKM_CONT vtkm::Float64 GetElapsedTime()
-  {
-    VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
-    VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
-    float elapsedTimeMilliseconds;
-    VTKM_CUDA_CALL(
-      cudaEventElapsedTime(&elapsedTimeMilliseconds, this->StartEvent, this->EndEvent));
-    return static_cast<vtkm::Float64>(0.001f * elapsedTimeMilliseconds);
-  }
-
-private:
-  // Copying CUDA events is problematic.
-  DeviceAdapterTimerImplementation(
-    const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) = delete;
-  void operator=(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) =
-    delete;
-
-  cudaEvent_t StartEvent;
-  cudaEvent_t EndEvent;
-};
-
-/// \brief Class providing a CUDA runtime support detector.
-///
-/// The class provide the actual implementation used by
-/// vtkm::cont::RuntimeDeviceInformation for the CUDA backend.
-///
-/// We will verify at runtime that the machine has at least one CUDA
-/// capable device, and said device is from the 'fermi' (SM_20) generation
-/// or newer.
-///
-template <>
-class DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>
-{
-public:
-  VTKM_CONT DeviceAdapterRuntimeDetector()
-    : NumberOfDevices(0)
-    , HighestArchSupported(0)
-  {
-    static bool deviceQueryInit = false;
-    static int numDevices = 0;
-    static int archVersion = 0;
-
-    if (!deviceQueryInit)
-    {
-      deviceQueryInit = true;
-
-      //first query for the number of devices
-      VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
-
-      for (vtkm::Int32 i = 0; i < numDevices; i++)
-      {
-        cudaDeviceProp prop;
-        VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
-        const vtkm::Int32 arch = (prop.major * 10) + prop.minor;
-        archVersion = vtkm::Max(arch, archVersion);
-      }
-
-      //Make sure we can actually launch a kernel. This could fail for any
-      //of the following reasons:
-      //
-      // 1. cudaErrorInsufficientDriver, caused by out of data drives
-      // 2. cudaErrorDevicesUnavailable, caused by another process locking the
-      //    device or somebody disabling cuda support on the device
-      // 3. cudaErrorNoKernelImageForDevice we built for a compute version
-      //    greater than the device we are running on
-      // Most likely others that I don't even know about
-      vtkm::cont::cuda::internal::DetermineIfValidCudaDevice<<<1, 1, 0, cudaStreamPerThread>>>();
-      if (cudaSuccess != cudaGetLastError())
-      {
-        numDevices = 0;
-        archVersion = 0;
-      }
-    }
-
-    this->NumberOfDevices = numDevices;
-    this->HighestArchSupported = archVersion;
-  }
-
-  /// Returns true if the given device adapter is supported on the current
-  /// machine.
-  ///
-  /// Only returns true if we have at-least one CUDA capable device of SM_20 or
-  /// greater ( fermi ).
-  ///
-  VTKM_CONT bool Exists() const
-  {
-    //
-    return this->NumberOfDevices > 0 && this->HighestArchSupported >= 20;
-  }
-
-private:
-  vtkm::Int32 NumberOfDevices;
-  vtkm::Int32 HighestArchSupported;
-};
-
 /// CUDA contains its own atomic operations
 ///
 template <typename T>
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu
@ -32,11 +32,17 @@ namespace cuda
 namespace internal
 {

-VTKM_CONT_EXPORT int getNumSMs(int dId)
+VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
 {
+  std::size_t index = 0;
+  if (dId > 0)
+  {
+    index = static_cast<size_t>(dId);
+  }
+
  //check
  static bool lookupBuilt = false;
-  static std::vector<int> numSMs;
+  static std::vector<vtkm::UInt32> numSMs;

  if (!lookupBuilt)
  {
@ -53,11 +59,11 @@ VTKM_CONT_EXPORT int getNumSMs(int dId)
    { //get the number of sm's per deviceId
      VTKM_CUDA_CALL(
        cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
-      numSMs.push_back(numberOfSMs);
+      numSMs.push_back(static_cast<vtkm::UInt32>(numberOfSMs));
    }
    lookupBuilt = true;
  }
-  return numSMs[dId];
+  return numSMs[index];
 }

 // we use cuda pinned memory to reduce the amount of synchronization
@ -107,8 +113,8 @@ char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupError

 template <>
 void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
-  int& grids,
-  int& blocks,
+  vtkm::UInt32& grids,
+  vtkm::UInt32& blocks,
  vtkm::Id size)
 {
  (void)size;
@ -120,7 +126,7 @@ void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAnd

 template <>
 void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
-  int& grids,
+  vtkm::UInt32& grids,
  dim3& blocks,
  const dim3& size)
 {
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
@ -1084,10 +1084,10 @@ public:
  static char* SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);

  VTKM_CONT_EXPORT
-  static void GetGridsAndBlocks(int& grid, int& blocks, vtkm::Id size);
+  static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size);

  VTKM_CONT_EXPORT
-  static void GetGridsAndBlocks(int& grid, dim3& blocks, const dim3& size);
+  static void GetGridsAndBlocks(vtkm::UInt32& grid, dim3& blocks, const dim3& size);

 public:
  template <typename WType, typename IType>
@ -1102,7 +1102,7 @@ public:
    }
    char* hostErrorPtr = SetupErrorBuffer(functor);

-    int grids, blocks;
+    vtkm::UInt32 grids, blocks;
    GetGridsAndBlocks(grids, blocks, numInstances);

    TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, numInstances);
@ -1140,7 +1140,7 @@ public:
                      static_cast<vtkm::UInt32>(rangeMax[1]),
                      static_cast<vtkm::UInt32>(rangeMax[2]));

-    int grids;
+    vtkm::UInt32 grids;
    dim3 blocks;
    GetGridsAndBlocks(grids, blocks, ranges);

--- a/vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.cu
@ -0,0 +1,100 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#include <vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h>
+
+#include <vtkm/Math.h>
+#include <vtkm/cont/cuda/ErrorCuda.h>
+
+#include <cuda.h>
+
+namespace vtkm
+{
+namespace cont
+{
+namespace cuda
+{
+namespace internal
+{
+
+/// Empty probe kernel. It does no work; the runtime detector in this file
+/// launches it once and inspects the launch status to learn whether kernels
+/// can actually be executed.
+static __global__ void DetermineIfValidCudaDevice()
+{
+  //used only to see if we can launch kernels. It is possible to have a
+  //CUDA capable device, but still fail to have CUDA support.
+}
+}
+}
+}
+}
+
+namespace vtkm
+{
+namespace cont
+{
+
+/// Queries the CUDA runtime for the number of devices and the highest compute
+/// capability among them, then verifies that a kernel can really be launched.
+///
+/// The query is performed only by the first constructed detector; the results
+/// are cached in function-local statics and copied into every later instance.
+/// NOTE(review): the cache is guarded by a plain bool with no synchronization,
+/// so concurrent first-time construction from several threads is not protected.
+///
+/// A failing query is treated as "no usable device" rather than as an error:
+/// the CUDA return codes are inspected directly, so constructing a detector on
+/// a machine without a working CUDA runtime leaves both counters at zero.
+DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>::DeviceAdapterRuntimeDetector()
+  : NumberOfDevices(0)
+  , HighestArchSupported(0)
+{
+  static bool deviceQueryInit = false;
+  static int numDevices = 0;
+  static int archVersion = 0;
+
+  if (!deviceQueryInit)
+  {
+    deviceQueryInit = true;
+
+    //first query for the number of devices. A failure here (for example no
+    //driver or no device installed) simply means there is nothing to use.
+    if (cudaGetDeviceCount(&numDevices) != cudaSuccess)
+    {
+      numDevices = 0;
+    }
+
+    for (vtkm::Int32 i = 0; i < numDevices; i++)
+    {
+      cudaDeviceProp prop;
+      //skip devices whose properties cannot be read instead of aborting the
+      //whole detection
+      if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
+      {
+        //encode the compute capability as major*10 + minor (e.g. 3.5 -> 35)
+        const vtkm::Int32 arch = (prop.major * 10) + prop.minor;
+        archVersion = vtkm::Max(arch, archVersion);
+      }
+    }
+
+    //Make sure we can actually launch a kernel. This could fail for any
+    //of the following reasons:
+    //
+    // 1. cudaErrorInsufficientDriver, caused by out-of-date drivers
+    // 2. cudaErrorDevicesUnavailable, caused by another process locking the
+    //    device or somebody disabling cuda support on the device
+    // 3. cudaErrorNoKernelImageForDevice we built for a compute version
+    //    greater than the device we are running on
+    // Most likely others that I don't even know about
+    vtkm::cont::cuda::internal::DetermineIfValidCudaDevice<<<1, 1, 0, cudaStreamPerThread>>>();
+    //cudaGetLastError also resets the recorded error, so a failure detected
+    //here (or by the queries above) is not left pending for later CUDA calls
+    if (cudaSuccess != cudaGetLastError())
+    {
+      numDevices = 0;
+      archVersion = 0;
+    }
+  }
+
+  this->NumberOfDevices = numDevices;
+  this->HighestArchSupported = archVersion;
+}
+
+/// Reports whether the CUDA backend can be used: at least one device must
+/// have been counted and the highest recorded architecture value must be 20
+/// or greater.
+bool DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>::Exists() const
+{
+  // Without any device there is nothing further to check.
+  if (this->NumberOfDevices <= 0)
+  {
+    return false;
+  }
+
+  // Architecture is stored as major*10 + minor, so 20 corresponds to SM_20.
+  return this->HighestArchSupported >= 20;
+}
+}
+} // namespace vtkm::cont
--- a/vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h
@ -0,0 +1,67 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_cont_cuda_internal_DeviceAdapterRuntimeDetectorCuda_h
+#define vtk_m_cont_cuda_internal_DeviceAdapterRuntimeDetectorCuda_h
+
+#include <vtkm/cont/vtkm_cont_export.h>
+
+#include <vtkm/Types.h>
+
+#include <vtkm/cont/DeviceAdapterAlgorithm.h>
+#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
+
+
+namespace vtkm
+{
+namespace cont
+{
+
+/// \brief Class providing a CUDA runtime support detector.
+///
+/// This class provides the actual implementation used by
+/// vtkm::cont::RuntimeDeviceInformation for the CUDA backend.
+///
+/// We verify at runtime that the machine has at least one CUDA
+/// capable device, and that said device is from the 'fermi' (SM_20)
+/// generation or newer.
+///
+template <>
+class VTKM_CONT_EXPORT DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>
+{
+public:
+  /// Performs the device query and stores its results in this instance.
+  VTKM_CONT DeviceAdapterRuntimeDetector();
+
+  /// Returns true if the given device adapter is supported on the current
+  /// machine.
+  ///
+  /// Only returns true if we have at least one CUDA capable device of SM_20 or
+  /// greater ( fermi ).
+  ///
+  VTKM_CONT bool Exists() const;
+
+private:
+  // Number of CUDA devices found by the query (0 when none is usable).
+  vtkm::Int32 NumberOfDevices;
+  // Highest compute capability found, encoded as major * 10 + minor.
+  vtkm::Int32 HighestArchSupported;
+};
+}
+} // namespace vtkm::cont
+
+
+#endif
--- a/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.cu
@ -0,0 +1,66 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#include <vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.h>
+
+#include <vtkm/Types.h>
+#include <vtkm/cont/cuda/ErrorCuda.h>
+
+#include <cuda.h>
+
+namespace vtkm
+{
+namespace cont
+{
+
+/// Creates the two CUDA events used as start and end markers, then calls
+/// Reset() so that a start event has been recorded as soon as the timer
+/// exists.
+DeviceAdapterTimerImplementation<
+  vtkm::cont::DeviceAdapterTagCuda>::DeviceAdapterTimerImplementation()
+{
+  VTKM_CUDA_CALL(cudaEventCreate(&this->StartEvent));
+  VTKM_CUDA_CALL(cudaEventCreate(&this->EndEvent));
+  // Both events must exist before Reset() records the start event.
+  this->Reset();
+}
+
+/// Destroys the start and end events created by the constructor. The return
+/// codes of the destroy calls are intentionally not checked here.
+DeviceAdapterTimerImplementation<
+  vtkm::cont::DeviceAdapterTagCuda>::~DeviceAdapterTimerImplementation()
+{
+  // These aren't wrapped in VTKM_CUDA_CALL because we can't throw errors
+  // from destructors. We're relying on cudaGetLastError in the
+  // VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR catching any issues from these calls
+  // later.
+  cudaEventDestroy(this->StartEvent);
+  cudaEventDestroy(this->EndEvent);
+}
+
+/// Restarts the timer: records the start event on cudaStreamPerThread and
+/// waits for that event before returning.
+void DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>::Reset()
+{
+  VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
+  // Synchronize on the event so the start marker is in place when we return.
+  VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
+}
+
+/// Returns the time in seconds between the start event recorded by the last
+/// Reset() and an end event recorded by this call. The start event is left
+/// untouched, so the method may be called repeatedly.
+vtkm::Float64 DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>::GetElapsedTime()
+{
+  // Mark "now" on the per-thread stream and wait until the marker is reached.
+  VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
+  VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
+
+  // The CUDA runtime reports the interval in single-precision milliseconds.
+  float milliseconds;
+  VTKM_CUDA_CALL(cudaEventElapsedTime(&milliseconds, this->StartEvent, this->EndEvent));
+
+  // Convert to seconds in single precision, then widen for the caller.
+  const float seconds = 0.001f * milliseconds;
+  return static_cast<vtkm::Float64>(seconds);
+}
+}
+} // namespace vtkm::cont
--- a/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.h
@ -0,0 +1,69 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_cont_cuda_internal_DeviceAdapterTimerImplementationCuda_h
+#define vtk_m_cont_cuda_internal_DeviceAdapterTimerImplementationCuda_h
+
+#include <vtkm/cont/vtkm_cont_export.h>
+
+#include <vtkm/Types.h>
+
+#include <vtkm/cont/DeviceAdapterAlgorithm.h>
+#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
+
+#include <cuda.h>
+
+namespace vtkm
+{
+namespace cont
+{
+
+///
+/// Specialization of DeviceAdapterTimerImplementation for CUDA.
+/// CUDA contains its own high resolution timers that are able
+/// to track how long it takes to execute async kernels.
+/// If we simply measured time on the CPU it would incorrectly
+/// just capture how long it takes to launch a kernel.
+template <>
+class VTKM_CONT_EXPORT DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>
+{
+public:
+  /// Creates the start/end events and starts the timer.
+  VTKM_CONT DeviceAdapterTimerImplementation();
+
+  /// Destroys the start/end events.
+  VTKM_CONT ~DeviceAdapterTimerImplementation();
+
+  /// Restarts the timer by recording a new start event.
+  VTKM_CONT void Reset();
+
+  /// Returns the time in seconds elapsed since construction or the last
+  /// call to Reset().
+  VTKM_CONT vtkm::Float64 GetElapsedTime();
+
+private:
+  // Copying CUDA events is problematic.
+  DeviceAdapterTimerImplementation(
+    const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) = delete;
+  void operator=(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) =
+    delete;
+
+  // Event recorded by Reset(); marks the beginning of the timed interval.
+  cudaEvent_t StartEvent;
+  // Event recorded by GetElapsedTime(); marks the end of the timed interval.
+  cudaEvent_t EndEvent;
+};
+}
+} // namespace vtkm::cont
+
+
+#endif
--- a/vtkm/cont/cuda/internal/TaskTuner.h
+++ b/vtkm/cont/cuda/internal/TaskTuner.h
@ -41,7 +41,7 @@ namespace cuda
 namespace internal
 {

-int getNumSMs(int dId);
+vtkm::UInt32 getNumSMs(int dId);

 template <typename TaskType>
 __global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
@ -110,10 +110,10 @@ static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numIn

  for (vtkm::UInt32 g = 0; g < 12; g++)
  {
-    int grids = gridIndexTable[g] * getNumSMs(deviceId);
+    vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
    for (vtkm::UInt32 b = 0; b < 12; b++)
    {
-      int blocks = blockIndexTable[b];
+      vtkm::UInt32 blocks = blockIndexTable[b];

      cudaEvent_t start, stop;
      VTKM_CUDA_CALL(cudaEventCreate(&start));
@ -156,7 +156,7 @@ static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rang
  int deviceId;
  for (vtkm::UInt32 g = 0; g < 12; g++)
  {
-    int grids = gridIndexTable[g] * getNumSMs(deviceId);
+    vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
    for (vtkm::UInt32 i = 0; i < 16; i++)
    {
      for (vtkm::UInt32 j = 0; j < 16; j++)
--- a/vtkm/cont/internal/ArrayHandleBasicImpl.cxx
+++ b/vtkm/cont/internal/ArrayHandleBasicImpl.cxx
@ -258,6 +258,10 @@ bool ArrayHandleImpl::PrepareForDevice(DeviceAdapterId devId, vtkm::UInt64 sizeO
  VTKM_ASSERT(!this->ExecutionArrayValid);
  switch (devId)
  {
+    case VTKM_DEVICE_ADAPTER_ERROR:
+      throw vtkm::cont::ErrorBadValue("device should not be VTKM_DEVICE_ADAPTER_ERROR");
+      break;
+
 #ifdef VTKM_ENABLE_TBB
    case VTKM_DEVICE_ADAPTER_TBB:
      this->ExecutionInterface =
@ -274,6 +278,9 @@ bool ArrayHandleImpl::PrepareForDevice(DeviceAdapterId devId, vtkm::UInt64 sizeO
        new ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>(*this->ControlArray);
      break;
 #endif
+
+    case VTKM_DEVICE_ADAPTER_SERIAL:
+      VTKM_FALLTHROUGH;
    default:
      this->ExecutionInterface =
        new ExecutionArrayInterfaceBasic<DeviceAdapterTagSerial>(*this->ControlArray);
--- a/vtkm/cont/serial/testing/CMakeLists.txt
+++ b/vtkm/cont/serial/testing/CMakeLists.txt
@ -32,4 +32,4 @@ set(unit_tests
  UnitTestSerialPointLocatorUniformGrid.cxx
  UnitTestSerialVirtualObjectHandle.cxx
  )
-vtkm_unit_tests(SOURCES ${unit_tests} BACKEND SERIAL)
+vtkm_unit_tests(SOURCES ${unit_tests})
--- a/vtkm/cont/tbb/testing/CMakeLists.txt
+++ b/vtkm/cont/tbb/testing/CMakeLists.txt
@ -33,4 +33,4 @@ set(unit_tests
  UnitTestTBBVirtualObjectHandle.cxx
  )

-vtkm_unit_tests(SOURCES ${unit_tests} BACKEND TBB)
+vtkm_unit_tests(SOURCES ${unit_tests})
--- a/vtkm/exec/AtomicArray.h
+++ b/vtkm/exec/AtomicArray.h
@ -23,7 +23,7 @@
 #include <vtkm/ListTag.h>
 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/DeviceAdapter.h>
-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 namespace vtkm
 {
@ -51,7 +51,7 @@ struct AtomicArrayTypeListTag : vtkm::ListTagBase<vtkm::Int32, vtkm::Int64>
 ///
 ///
 template <typename T, typename DeviceAdapterTag>
-class AtomicArray : public vtkm::cont::ExecutionObjectFactoryBase
+class AtomicArray : public vtkm::cont::ExecutionObjectBase
 {
 public:
  using ValueType = T;
--- a/vtkm/exec/CellDerivative.h
+++ b/vtkm/exec/CellDerivative.h
@ -556,7 +556,8 @@ VTKM_EXEC vtkm::Vec<typename FieldVecType::ComponentType, 3> CellDerivative(

  using T = typename FieldVecType::ComponentType;

-  return vtkm::Vec<T, 3>((field[1] - field[0]) / wCoords.GetSpacing()[0], T(0), T(0));
+  return vtkm::Vec<T, 3>(
+    static_cast<T>((field[1] - field[0]) / wCoords.GetSpacing()[0]), T(0), T(0));
 }

 //-----------------------------------------------------------------------------
@ -857,7 +858,9 @@ VTKM_EXEC vtkm::Vec<typename FieldVecType::ComponentType, 3> CellDerivative(
  sum = sum + field[2] * VecT(pc[1], pc[0]);
  sum = sum + field[3] * VecT(-pc[1], rc[0]);

-  return vtkm::Vec<T, 3>(sum[0] / wCoords.GetSpacing()[0], sum[1] / wCoords.GetSpacing()[1], T(0));
+  return vtkm::Vec<T, 3>(static_cast<T>(sum[0] / wCoords.GetSpacing()[0]),
+                         static_cast<T>(sum[1] / wCoords.GetSpacing()[1]),
+                         T(0));
 }

 //-----------------------------------------------------------------------------
--- a/vtkm/exec/ExecutionWholeArray.h
+++ b/vtkm/exec/ExecutionWholeArray.h
@ -22,7 +22,7 @@

 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/DeviceAdapter.h>
-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 namespace vtkm
 {
@ -39,7 +39,7 @@ namespace exec
 /// structure.
 ///
 template <typename T, typename StorageTag, typename DeviceAdapterTag>
-class ExecutionWholeArray : public vtkm::cont::ExecutionObjectFactoryBase
+class ExecutionWholeArray : public vtkm::cont::ExecutionObjectBase
 {
 public:
  using ValueType = T;
@ -89,7 +89,7 @@ private:
 /// structure
 ///
 template <typename T, typename StorageTag, typename DeviceAdapterTag>
-class ExecutionWholeArrayConst : public vtkm::cont::ExecutionObjectFactoryBase
+class ExecutionWholeArrayConst : public vtkm::cont::ExecutionObjectBase
 {
 public:
  using ValueType = T;
@ -126,4 +126,4 @@ private:
 }
 } // namespace vtkm::exec

-#endif //vtk_m_exec_ExecutionObjectFactoryBase_h
+#endif //vtk_m_exec_ExecutionObjectBase_h
--- a/vtkm/exec/arg/FetchTagWholeCellSetIn.h
+++ b/vtkm/exec/arg/FetchTagWholeCellSetIn.h
@ -23,8 +23,6 @@
 #include <vtkm/exec/arg/AspectTagDefault.h>
 #include <vtkm/exec/arg/Fetch.h>

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
-
 #include <type_traits>

 namespace vtkm
--- a/vtkm/exec/arg/testing/UnitTestFetchExecObject.cxx
+++ b/vtkm/exec/arg/testing/UnitTestFetchExecObject.cxx
@ -22,7 +22,7 @@

 #include <vtkm/exec/arg/testing/ThreadIndicesTesting.h>

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 #include <vtkm/testing/Testing.h>

@ -31,7 +31,7 @@
 namespace
 {

-struct TestExecutionObject : public vtkm::cont::ExecutionObjectFactoryBase
+struct TestExecutionObject : public vtkm::cont::ExecutionObjectBase
 {
  TestExecutionObject()
    : Number(static_cast<vtkm::Int32>(0xDEADDEAD))
--- a/vtkm/exec/cuda/internal/testing/UnitTestTaskStrided.cu
+++ b/vtkm/exec/cuda/internal/testing/UnitTestTaskStrided.cu
@ -32,6 +32,20 @@

 #include <vtkm/cont/testing/Testing.h>

+#if defined(VTKM_MSVC)
+#pragma warning(push)
+#pragma warning(disable : 4068) //unknown pragma
+#endif
+
+#if defined(__NVCC__) && defined(__CUDACC_VER_MAJOR__)
+// Disable warning "declared but never referenced"
+// This file produces several false-positive warnings
+// Eg: TestExecObject::TestExecObject, MyOutputToInputMapPortal::Get,
+//     TestWorkletProxy::operator()
+#pragma push
+#pragma diag_suppress 177
+#endif
+
 namespace
 {

@ -359,3 +373,11 @@ int UnitTestTaskStrided(int, char* [])
 {
  return vtkm::cont::testing::Testing::Run(TestTaskStrided<vtkm::cont::DeviceAdapterTagCuda>);
 }
+
+#if defined(__NVCC__) && defined(__CUDACC_VER_MAJOR__)
+#pragma pop
+#endif
+
+#if defined(VTKM_MSVC)
+#pragma warning(pop)
+#endif
--- a/vtkm/exec/internal/ReduceByKeyLookup.h
+++ b/vtkm/exec/internal/ReduceByKeyLookup.h
@ -20,7 +20,7 @@
 #ifndef vtk_m_exec_internal_ReduceByKeyLookup_h
 #define vtk_m_exec_internal_ReduceByKeyLookup_h

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 #include <vtkm/StaticAssert.h>
 #include <vtkm/Types.h>
@ -41,7 +41,7 @@ namespace internal
 /// state.
 ///
 template <typename KeyPortalType, typename IdPortalType, typename IdComponentPortalType>
-struct ReduceByKeyLookup : vtkm::cont::ExecutionObjectFactoryBase
+struct ReduceByKeyLookup : vtkm::cont::ExecutionObjectBase
 {
  using KeyType = typename KeyPortalType::ValueType;

--- a/vtkm/exec/testing/UnitTestCellDerivative.cxx
+++ b/vtkm/exec/testing/UnitTestCellDerivative.cxx
@ -61,9 +61,10 @@ struct LinearField
  template <typename T>
  FieldType GetValue(vtkm::Vec<T, 3> coordinates) const
  {
-    return ((coordinates[0] * this->Gradient[0] + coordinates[1] * this->Gradient[1] +
-             coordinates[2] * this->Gradient[2]) +
-            this->OriginValue);
+    return static_cast<FieldType>((coordinates[0] * this->Gradient[0] +
+                                   coordinates[1] * this->Gradient[1] +
+                                   coordinates[2] * this->Gradient[2]) +
+                                  this->OriginValue);
  }
 };

--- a/vtkm/filter/Streamline.hxx
+++ b/vtkm/filter/Streamline.hxx
@ -95,7 +95,7 @@ inline VTKM_CONT vtkm::cont::DataSet Streamline::DoExecute(

  //RGEvalType eval(input.GetCoordinateSystem(), input.GetCellSet(0), field);
  RGEvalType eval(coords, cells, field);
-  RK4RGType rk4(eval, static_cast<vtkm::FloatDefault>(this->StepSize));
+  RK4RGType rk4(eval, static_cast<T>(this->StepSize));

  vtkm::worklet::Streamline streamline;
  vtkm::worklet::StreamlineResult<T> res;
--- a/vtkm/internal/Configure.h.in
+++ b/vtkm/internal/Configure.h.in
@ -312,4 +312,23 @@

 #endif

+//----------------------------------------------------------------------------
+// Switch case fall-through policy.
+
+// Use "VTKM_FALLTHROUGH;" to annotate deliberate fall-through in switches,
+// use it analogously to "break;".  The trailing semi-colon is required.
+//
+// The macro expands to the first attribute the compiler reports support for:
+// the standard [[fallthrough]] (C++17), then [[gnu::fallthrough]], then
+// [[clang::fallthrough]] (both C++11). A VTKM_FALLTHROUGH that is already
+// defined before this point is left untouched.
+#if !defined(VTKM_FALLTHROUGH) && defined(__has_cpp_attribute)
+# if __cplusplus >= 201703L && __has_cpp_attribute(fallthrough)
+#  define VTKM_FALLTHROUGH [[fallthrough]]
+# elif __cplusplus >= 201103L && __has_cpp_attribute(gnu::fallthrough)
+#  define VTKM_FALLTHROUGH [[gnu::fallthrough]]
+# elif __cplusplus >= 201103L && __has_cpp_attribute(clang::fallthrough)
+#  define VTKM_FALLTHROUGH [[clang::fallthrough]]
+# endif
+#endif
+
+// No attribute was selected above: fall back to a no-op expression so that
+// "VTKM_FALLTHROUGH;" is still a valid statement.
+#ifndef VTKM_FALLTHROUGH
+# define VTKM_FALLTHROUGH ((void)0)
+#endif
+
 #endif //vtkm_internal_Configure_h
--- a/vtkm/io/reader/BOVDataSetReader.h
+++ b/vtkm/io/reader/BOVDataSetReader.h
@ -75,7 +75,7 @@ private:
    if (stream.fail())
      throw vtkm::io::ErrorIO("Failed to open file: " + this->FileName);

-    DataFormat dataFormat;
+    DataFormat dataFormat = ByteData;
    std::string bovFile, line, token, options, variableName;
    vtkm::Id numComponents = 1;
    vtkm::Id3 dim;
--- a/vtkm/rendering/Texture2D.h
+++ b/vtkm/rendering/Texture2D.h
@ -25,7 +25,7 @@
 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/DeviceAdapter.h>
 #include <vtkm/cont/DeviceAdapterAlgorithm.h>
-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 namespace vtkm
 {
@ -211,7 +211,7 @@ public:
    TextureWrapMode WrapMode;
  };

-  class Texture2DSampler : public vtkm::cont::ExecutionObjectFactoryBase
+  class Texture2DSampler : public vtkm::cont::ExecutionObjectBase
  {
  public:
    VTKM_CONT
--- a/vtkm/rendering/raytracing/Camera.cxx
+++ b/vtkm/rendering/raytracing/Camera.cxx
@ -455,7 +455,7 @@ void Camera::SetHeight(const vtkm::Int32& height)
  if (Height != height)
  {
    this->Height = height;
-    this->SetFieldOfView(this->FovX);
+    this->SetFieldOfView(this->FovY);
  }
 }

@ -475,7 +475,7 @@ void Camera::SetWidth(const vtkm::Int32& width)
  if (this->Width != width)
  {
    this->Width = width;
-    this->SetFieldOfView(this->FovX);
+    this->SetFieldOfView(this->FovY);
  }
 }

@ -529,8 +529,32 @@ void Camera::SetFieldOfView(const vtkm::Float32& degrees)
    throw vtkm::cont::ErrorBadValue("Camera feild of view must be less than 180.");
  }

-  vtkm::Float32 newFOVY = (vtkm::Float32(this->Height) / vtkm::Float32(this->Width)) * degrees;
-  vtkm::Float32 newFOVX = degrees;
+  vtkm::Float32 newFOVY = degrees;
+  vtkm::Float32 newFOVX;
+
+  if (this->Width != this->Height)
+  {
+    vtkm::Float32 fovyRad = (newFOVY * static_cast<vtkm::Float32>(vtkm::Pi())) / 180.0f;
+
+    // Use the tan function to find the distance from the center of the image to the top (or
+    // bottom). (Actually, we are finding the ratio of this distance to the near plane distance,
+    // but since we scale everything by the near plane distance, we can use this ratio as a scaled
+    // proxy of the distances we need.)
+    vtkm::Float32 verticalDistance = vtkm::Tan(0.5f * fovyRad);
+
+    // Scale the vertical distance by the aspect ratio to get the horizontal distance.
+    vtkm::Float32 aspectRatio = vtkm::Float32(this->Width) / vtkm::Float32(this->Height);
+    vtkm::Float32 horizontalDistance = aspectRatio * verticalDistance;
+
+    // Now use the arctan function to get the proper field of view in the x direction.
+    vtkm::Float32 fovxRad = 2.0f * vtkm::ATan(horizontalDistance);
+    newFOVX = 180.0f * (fovxRad / static_cast<vtkm::Float32>(vtkm::Pi()));
+  }
+  else
+  {
+    newFOVX = newFOVY;
+  }
+
  if (newFOVX != this->FovX)
  {
    this->IsViewDirty = true;
@ -541,13 +565,13 @@ void Camera::SetFieldOfView(const vtkm::Float32& degrees)
  }
  this->FovX = newFOVX;
  this->FovY = newFOVY;
-  this->CameraView.SetFieldOfView(this->FovX);
+  this->CameraView.SetFieldOfView(this->FovY);
 }

 VTKM_CONT
 vtkm::Float32 Camera::GetFieldOfView() const
 {
-  return this->FovX;
+  return this->FovY;
 }

 VTKM_CONT
--- a/vtkm/rendering/raytracing/MortonCodes.h
+++ b/vtkm/rendering/raytracing/MortonCodes.h
@ -213,8 +213,7 @@ public:
      // the wonders of floating point math. This is bad. If we calculate in the same order
      // for all faces, then at worst, two different faces can enter the same bucket, which
      // we currently check for.
-      vtkm::Vec<vtkm::Id, 4> faceIndices;
-      faceIndices[3] = -1;
+      vtkm::Vec<vtkm::Id, 4> faceIndices(-1);
      //Number of indices this face has
      const vtkm::Int32 indiceCount = tables.ShapesFaceList(tableOffset + i, 0);
      for (vtkm::Int32 j = 1; j <= indiceCount; j++)
--- a/vtkm/worklet/CMakeLists.txt
+++ b/vtkm/worklet/CMakeLists.txt
@ -54,6 +54,7 @@ set(headers
  ParticleAdvection.h
  PointAverage.h
  PointElevation.h
+  PointTransform.h
  Probe.h
  RemoveUnusedPoints.h
  ScalarsToColors.h
--- a/vtkm/worklet/Clip.h
+++ b/vtkm/worklet/Clip.h
@ -149,7 +149,7 @@ private:
  IdPortal IndexOffsets;
 };

-class ExecutionConnectivityExplicit : vtkm::cont::ExecutionObjectFactoryBase
+class ExecutionConnectivityExplicit : vtkm::cont::ExecutionObjectBase
 {
 public:
  VTKM_CONT
--- a/vtkm/worklet/ContourTreeUniform.h
+++ b/vtkm/worklet/ContourTreeUniform.h
@ -75,6 +75,15 @@
 #include <vtkm/worklet/DispatcherMapField.h>
 #include <vtkm/worklet/WorkletMapField.h>

+// For numerous functions inside contourTree GCC is able to determine if i is
+// always greater than j ( or vice-versa ) and optimizes those call sites.
+// But when it does these optimizations it presumes that i and j will not
+// overflow and emits a -Wstrict-overflow warning
+#ifdef VTKM_GCC
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+
 #include <vtkm/worklet/contourtree/ChainGraph.h>
 #include <vtkm/worklet/contourtree/ContourTree.h>
 #include <vtkm/worklet/contourtree/MergeTree.h>
@ -202,4 +211,8 @@ public:
 }
 } // namespace vtkm::worklet

+#ifdef VTKM_GCC
+#pragma GCC diagnostic pop
+#endif
+
 #endif // vtk_m_worklet_ContourTreeUniform_h
--- a/vtkm/worklet/Gradient.h
+++ b/vtkm/worklet/Gradient.h
@ -120,7 +120,7 @@ private:
 } //namespace gradient

 template <typename T>
-struct GradientOutputFields : public vtkm::cont::ExecutionObjectFactoryBase
+struct GradientOutputFields : public vtkm::cont::ExecutionObjectBase
 {

  using ValueType = T;
--- a/vtkm/worklet/PointTransform.h
+++ b/vtkm/worklet/PointTransform.h
@ -0,0 +1,124 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#ifndef vtk_m_worklet_PointTransform_h
+#define vtk_m_worklet_PointTransform_h
+
+#include <vtkm/Math.h>
+#include <vtkm/Matrix.h>
+#include <vtkm/Transform3D.h>
+#include <vtkm/worklet/WorkletMapField.h>
+
+namespace vtkm
+{
+namespace worklet
+{
+
+/// Worklet that maps every input point through a 4x4 transformation matrix.
+///
+/// \c T is the component type of the matrix and of the points handled by the
+/// functor. Each Set* method *replaces* the stored matrix; the methods do not
+/// compose with one another. Arguments of another arithmetic type \c S are
+/// converted to \c T before the matrix is built. A default-constructed
+/// worklet holds the identity matrix, i.e. it returns its input unchanged.
+template <typename T>
+class PointTransform : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<Vec3>, FieldOut<Vec3>);
+  typedef _2 ExecutionSignature(_1);
+
+  /// Starts out as the identity transform so that invoking the worklet before
+  /// any Set* call never reads an uninitialized matrix.
+  VTKM_CONT
+  PointTransform()
+    : matrix(vtkm::MatrixIdentity<T, 4>())
+  {
+  }
+
+  //Translation
+  /// Replaces the matrix with a translation by (tx, ty, tz).
+  template <typename S>
+  VTKM_CONT void SetTranslation(const S& tx, const S& ty, const S& tz)
+  {
+    matrix = vtkm::Transform3DTranslate(static_cast<T>(tx), static_cast<T>(ty), static_cast<T>(tz));
+  }
+
+  /// Replaces the matrix with a translation by the vector \c v.
+  template <typename S>
+  VTKM_CONT void SetTranslation(const vtkm::Vec<S, 3>& v)
+  {
+    SetTranslation(v[0], v[1], v[2]);
+  }
+
+  //Rotation
+  /// Replaces the matrix with a rotation of \c angleDegrees around \c axis.
+  /// Angle and axis are converted to \c T so the result always matches the
+  /// stored matrix type, as the translation and scale setters already do.
+  template <typename S>
+  VTKM_CONT void SetRotation(const S& angleDegrees, const vtkm::Vec<S, 3>& axis)
+  {
+    matrix = vtkm::Transform3DRotate(
+      static_cast<T>(angleDegrees),
+      vtkm::Vec<T, 3>(static_cast<T>(axis[0]), static_cast<T>(axis[1]), static_cast<T>(axis[2])));
+  }
+
+  /// Replaces the matrix with a rotation of \c angleDegrees around the axis
+  /// (rx, ry, rz). This is the overload used by SetRotationX/Y/Z.
+  template <typename S>
+  VTKM_CONT void SetRotation(const S& angleDegrees, const S& rx, const S& ry, const S& rz)
+  {
+    SetRotation(angleDegrees, vtkm::Vec<S, 3>(rx, ry, rz));
+  }
+
+  /// Four-argument spelling kept only so existing callers keep compiling;
+  /// despite its name it rotates around the arbitrary axis (rx, ry, rz).
+  /// Prefer SetRotation(angleDegrees, rx, ry, rz).
+  template <typename S>
+  VTKM_CONT void SetRotationX(const S& angleDegrees, const S& rx, const S& ry, const S& rz)
+  {
+    SetRotation(angleDegrees, rx, ry, rz);
+  }
+
+  /// Replaces the matrix with a rotation of \c angleDegrees around the X axis.
+  template <typename S>
+  VTKM_CONT void SetRotationX(const S& angleDegrees)
+  {
+    // The axis components are spelled as S so that template deduction of the
+    // four-argument overload is unambiguous.
+    SetRotation(angleDegrees, S(1), S(0), S(0));
+  }
+
+  /// Replaces the matrix with a rotation of \c angleDegrees around the Y axis.
+  template <typename S>
+  VTKM_CONT void SetRotationY(const S& angleDegrees)
+  {
+    SetRotation(angleDegrees, S(0), S(1), S(0));
+  }
+
+  /// Replaces the matrix with a rotation of \c angleDegrees around the Z axis.
+  template <typename S>
+  VTKM_CONT void SetRotationZ(const S& angleDegrees)
+  {
+    SetRotation(angleDegrees, S(0), S(0), S(1));
+  }
+
+  //Scaling
+  /// Replaces the matrix with a uniform scale by \c s.
+  template <typename S>
+  VTKM_CONT void SetScale(const S& s)
+  {
+    SetScale(s, s, s);
+  }
+
+  /// Replaces the matrix with a per-axis scale by (sx, sy, sz).
+  template <typename S>
+  VTKM_CONT void SetScale(const S& sx, const S& sy, const S& sz)
+  {
+    matrix = vtkm::Transform3DScale(static_cast<T>(sx), static_cast<T>(sy), static_cast<T>(sz));
+  }
+
+  /// Replaces the matrix with a per-axis scale by the components of \c v.
+  template <typename S>
+  VTKM_CONT void SetScale(const vtkm::Vec<S, 3>& v)
+  {
+    SetScale(v[0], v[1], v[2]);
+  }
+
+  //General transformation
+  /// Replaces the matrix with the caller-supplied 4x4 matrix \c mtx.
+  VTKM_CONT
+  void SetTransform(const vtkm::Matrix<T, 4, 4>& mtx) { matrix = mtx; }
+
+  //Functor
+  /// Applies the stored matrix to \c vec and returns the transformed point.
+  VTKM_EXEC
+  vtkm::Vec<T, 3> operator()(const vtkm::Vec<T, 3>& vec) const
+  {
+    return vtkm::Transform3DPoint(matrix, vec);
+  }
+
+private:
+  // Transformation applied by operator(); replaced by every Set* call.
+  vtkm::Matrix<T, 4, 4> matrix;
+};
+}
+} // namespace vtkm::worklet
+
+#endif // vtk_m_worklet_PointTransform_h
--- a/vtkm/worklet/connectivities/CMakeLists.txt
+++ b/vtkm/worklet/connectivities/CMakeLists.txt
@ -23,6 +23,8 @@ set(headers
  CellSetDualGraph.h
  GraphConnectivity.h
  InnerJoin.h
+  ImageConnectivity.h
+  UnionFind.h
  )

 #-----------------------------------------------------------------------------
--- a/vtkm/worklet/connectivities/CellSetConnectivity.h
+++ b/vtkm/worklet/connectivities/CellSetConnectivity.h
@ -25,6 +25,13 @@
 #include <vtkm/worklet/connectivities/CellSetDualGraph.h>
 #include <vtkm/worklet/connectivities/GraphConnectivity.h>

+namespace vtkm
+{
+namespace worklet
+{
+namespace connectivity
+{
+
 class CellSetConnectivity
 {
 public:
@ -45,4 +52,8 @@ public:
      numIndicesArray, indexOffsetArray, connectivityArray, componentArray);
  }
 };
+}
+}
+} // vtkm::worklet::connectivity
+
 #endif // vtk_m_worklet_connectivity_CellSetConnectivity_h
--- a/vtkm/worklet/connectivities/CellSetDualGraph.h
+++ b/vtkm/worklet/connectivities/CellSetDualGraph.h
@ -28,10 +28,20 @@
 #include <vtkm/worklet/ScatterCounting.h>
 #include <vtkm/worklet/WorkletMapTopology.h>

+namespace vtkm
+{
+namespace worklet
+{
+namespace connectivity
+{
+namespace detail
+{
 struct EdgeCount : public vtkm::worklet::WorkletMapPointToCell
 {
  typedef void ControlSignature(CellSetIn, FieldOutCell<> numEdgesInCell);
+
  typedef _2 ExecutionSignature(CellShape, PointCount);
+
  using InputDomain = _1;

  template <typename CellShapeTag>
@ -46,6 +56,7 @@ struct EdgeExtract : public vtkm::worklet::WorkletMapPointToCell
  typedef void ControlSignature(CellSetIn, FieldOutCell<> cellIndices, FieldOutCell<> edgeIndices);

  typedef void ExecutionSignature(CellShape, InputIndex, PointIndices, VisitIndex, _2, _3);
+
  using InputDomain = _1;

  using ScatterType = vtkm::worklet::ScatterCounting;
@ -75,6 +86,7 @@ struct CellToCellConnectivity : public vtkm::worklet::WorkletMapField
                                WholeArrayOut<> to);

  typedef void ExecutionSignature(_1, InputIndex, _2, _3, _4);
+
  using InputDomain = _1;

  template <typename ConnectivityPortalType, typename CellIdPortalType>
@ -90,6 +102,7 @@ struct CellToCellConnectivity : public vtkm::worklet::WorkletMapField
    to.Set(index * 2 + 1, cells.Get(offset));
  }
 };
+} // vtkm::worklet::connectivity::detail

 template <typename DeviceAdapter>
 class CellSetDualGraph
@ -110,12 +123,14 @@ public:
  {
    // Get number of edges for each cell and use it as scatter count.
    vtkm::cont::ArrayHandle<vtkm::IdComponent> numEdgesPerCell;
-    vtkm::worklet::DispatcherMapTopology<EdgeCount, DeviceAdapter> edgesPerCellDisp;
+    vtkm::worklet::DispatcherMapTopology<detail::EdgeCount, DeviceAdapter> edgesPerCellDisp;
    edgesPerCellDisp.Invoke(cellSet, numEdgesPerCell);

    // Get uncompress Cell to Edge mapping
    vtkm::worklet::ScatterCounting scatter{ numEdgesPerCell, DeviceAdapter() };
-    vtkm::worklet::DispatcherMapTopology<EdgeExtract, DeviceAdapter> edgeExtractDisp{ scatter };
+    vtkm::worklet::DispatcherMapTopology<detail::EdgeExtract, DeviceAdapter> edgeExtractDisp{
+      scatter
+    };
    edgeExtractDisp.Invoke(cellSet, cellIds, cellEdges);
  }

@ -157,7 +172,7 @@ public:
    vtkm::cont::ArrayHandle<vtkm::Id> connTo;
    connFrom.Allocate(sharedEdges.GetNumberOfValues() * 2);
    connTo.Allocate(sharedEdges.GetNumberOfValues() * 2);
-    vtkm::worklet::DispatcherMapField<CellToCellConnectivity, DeviceAdapter> c2cDisp;
+    vtkm::worklet::DispatcherMapField<detail::CellToCellConnectivity, DeviceAdapter> c2cDisp;
    c2cDisp.Invoke(lb, cellIds, connFrom, connTo);

    // Turn dual graph into Compressed Sparse Row format
@ -174,4 +189,8 @@ public:
    Algorithm::ScanExclusive(numIndicesArray, indexOffsetArray);
  }
 };
+}
+}
+}
+
 #endif //vtk_m_worklet_CellSetDualGraph_h
--- a/vtkm/worklet/connectivities/GraphConnectivity.h
+++ b/vtkm/worklet/connectivities/GraphConnectivity.h
@ -25,7 +25,16 @@

 #include <vtkm/worklet/connectivities/CellSetDualGraph.h>
 #include <vtkm/worklet/connectivities/InnerJoin.h>
+#include <vtkm/worklet/connectivities/UnionFind.h>

+namespace vtkm
+{
+namespace worklet
+{
+namespace connectivity
+{
+namespace detail
+{
 class Graft : public vtkm::worklet::WorkletMapField
 {
 public:
@ -36,6 +45,7 @@ public:
                                WholeArrayInOut<IdType> comp);

  typedef void ExecutionSignature(_1, _2, _3, _4, _5);
+
  using InputDomain = _1;

  // TODO: Use Scatter?
@ -56,38 +66,7 @@ public:
    }
  }
 };
-
-class PointerJumping : public vtkm::worklet::WorkletMapField
-{
-public:
-  typedef void ControlSignature(FieldIn<IdType> index, WholeArrayInOut<IdType> comp);
-  typedef void ExecutionSignature(_1, _2);
-  using InputDomain = _1;
-
-  template <typename InOutPortalType>
-  VTKM_EXEC void operator()(vtkm::Id index, InOutPortalType& comp) const
-  {
-    // keep updating component id until we reach the root of the tree.
-    for (auto parent = comp.Get(index); comp.Get(parent) != parent; parent = comp.Get(index))
-    {
-      comp.Set(index, comp.Get(parent));
-    }
-  }
-};
-
-class IsStar : public vtkm::worklet::WorkletMapField
-{
-public:
-  typedef void ControlSignature(FieldIn<IdType> index, WholeArrayIn<IdType> comp, FieldOut<>);
-  typedef _3 ExecutionSignature(_1, _2);
-  using InputDomain = _1;
-
-  template <typename InOutPortalType>
-  VTKM_EXEC bool operator()(vtkm::Id index, InOutPortalType& comp) const
-  {
-    return comp.Get(index) == comp.Get(comp.Get(index));
-  }
-};
+}

 template <typename DeviceAdapter>
 class GraphConnectivity
@ -111,7 +90,7 @@ public:

    do
    {
-      vtkm::worklet::DispatcherMapField<Graft, DeviceAdapter> graftDispatcher;
+      vtkm::worklet::DispatcherMapField<detail::Graft, DeviceAdapter> graftDispatcher;
      graftDispatcher.Invoke(
        cellIds, indexOffsetArray, numIndexArray, connectivityArray, components);

@ -142,4 +121,7 @@ public:
    Algorithm::SortByKey(cellIdsOut, componentsOut);
  }
 };
+}
+}
+}
 #endif //vtk_m_worklet_connectivity_graph_connectivity_h
--- a/vtkm/worklet/connectivities/ImageConnectivity.h
+++ b/vtkm/worklet/connectivities/ImageConnectivity.h
@ -0,0 +1,171 @@
+//=============================================================================
+//
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2018 UT-Battelle, LLC.
+//  Copyright 2018 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//
+//=============================================================================
+
+// Include guard. The macro has to be unique to this header: GraphConnectivity.h
+// already uses vtk_m_worklet_connectivity_graph_connectivity_h, and sharing it
+// would silently drop whichever of the two headers is included second.
+#ifndef vtk_m_worklet_connectivity_ImageConnectivity_h
+#define vtk_m_worklet_connectivity_ImageConnectivity_h
+
+#include <vtkm/worklet/DispatcherMapField.h>
+#include <vtkm/worklet/DispatcherPointNeighborhood.h>
+#include <vtkm/worklet/WorkletMapField.h>
+#include <vtkm/worklet/WorkletPointNeighborhood.h>
+
+#include <vtkm/worklet/connectivities/InnerJoin.h>
+#include <vtkm/worklet/connectivities/UnionFind.h>
+
+namespace vtkm
+{
+namespace worklet
+{
+namespace connectivity
+{
+namespace detail
+{
+// Grafting step of image connected-component labeling, selected by image
+// dimension. Only the 2-D specialization is provided in this header.
+template <int Dimension>
+class ImageGraft;
+
+// 2-D grafting worklet. For every pixel it returns the smallest component id
+// found among the pixels of the in-plane 3x3 neighborhood (the pixel itself
+// included) whose color equals the color of the center pixel. Although the
+// base class is the 3x3x3 neighborhood worklet, only offsets with z == 0 are
+// ever read.
+// NOTE(review): what the neighborhood accessors return for offsets that fall
+// outside the image is decided by the neighborhood machinery, not by this
+// class - confirm the boundary behavior there.
+template <>
+class ImageGraft<2> : public vtkm::worklet::WorkletPointNeighborhood3x3x3
+{
+public:
+  typedef void ControlSignature(CellSetIn,
+                                FieldInNeighborhood<> comp,
+                                FieldInNeighborhood<> color,
+                                FieldOut<> newComp);
+
+  typedef _4 ExecutionSignature(_2, _3);
+
+  // comp:  neighborhood view of the current component ids.
+  // color: neighborhood view of the pixel colors.
+  // Returns the new component id for the center pixel.
+  template <typename Comp, typename NeighborColor>
+  VTKM_EXEC vtkm::Id operator()(const Comp& comp, const NeighborColor& color) const
+  {
+    const auto centerColor = color.Get(0, 0, 0);
+    vtkm::Id smallest = comp.Get(0, 0, 0);
+
+    for (int dy = -1; dy <= 1; ++dy)
+    {
+      for (int dx = -1; dx <= 1; ++dx)
+      {
+        // Differently colored neighbors never contribute a component id.
+        if (color.Get(dx, dy, 0) != centerColor)
+        {
+          continue;
+        }
+        smallest = vtkm::Min(smallest, comp.Get(dx, dy, 0));
+      }
+    }
+    return smallest;
+  }
+};
+}
+
+// Connected-component labeling of a 2-D structured image: pixels that share a
+// color and are linked through the neighborhood examined by detail::ImageGraft<2>
+// receive the same component id. The ids written to the output are renumbered
+// into the range [0, number of components).
+class ImageConnectivity
+{
+public:
+  // Functor holding the actual algorithm; the Run() overloads below hand it to
+  // vtkm::cont::CastAndCall.
+  class RunImpl
+  {
+  public:
+    // pixels:        one UInt8 color per pixel.
+    // input:         the 2-D structured cell set describing the image.
+    // componentsOut: receives one component id per pixel. Despite the template
+    //                parameter's name it is used as the destination of
+    //                Algorithm::Copy, i.e. like an array handle.
+    // The last (unnamed) argument only selects the device adapter.
+    template <typename StorageT, typename OutputPortalType, typename Device>
+    void operator()(const vtkm::cont::ArrayHandle<vtkm::UInt8, StorageT>& pixels,
+                    const vtkm::cont::CellSetStructured<2>& input,
+                    OutputPortalType& componentsOut,
+                    Device) const
+    {
+      using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<Device>;
+
+      // TODO: template pixel type?
+
+      // Start with every pixel in its own component: component id == pixel index.
+      Algorithm::Copy(vtkm::cont::ArrayHandleCounting<vtkm::Id>(0, 1, pixels.GetNumberOfValues()),
+                      componentsOut);
+
+      // Scratch array written by the graft step in each iteration.
+      vtkm::cont::ArrayHandle<vtkm::Id> newComponents;
+
+      // Pixel indices 0..N-1, used as the input domain of the per-pixel worklets.
+      vtkm::cont::ArrayHandle<vtkm::Id> pixelIds;
+      Algorithm::Copy(vtkm::cont::ArrayHandleCounting<vtkm::Id>(0, 1, pixels.GetNumberOfValues()),
+                      pixelIds);
+
+      bool allStar = false;
+      vtkm::cont::ArrayHandle<bool> isStar;
+
+      using DispatcherType =
+        vtkm::worklet::DispatcherPointNeighborhood<detail::ImageGraft<2>, Device>;
+
+      // Repeat graft + pointer jumping until the IsStar check reports true for
+      // every pixel right after a graft step.
+      do
+      {
+        // Graft: each pixel adopts the smallest component id among its
+        // same-colored neighbors.
+        DispatcherType().Invoke(input, componentsOut, pixels, newComponents);
+
+        // Detection of allStar has to come before pointer jumping. Don't try to rearrange it.
+        vtkm::worklet::DispatcherMapField<IsStar, Device> isStarDisp;
+        isStarDisp.Invoke(pixelIds, newComponents, isStar);
+        allStar = Algorithm::Reduce(isStar, true, vtkm::LogicalAnd());
+
+        // Shorten the component chains in newComponents.
+        vtkm::worklet::DispatcherMapField<PointerJumping, Device> pointJumpingDispatcher;
+        pointJumpingDispatcher.Invoke(pixelIds, newComponents);
+
+        // The result of this round is the input of the next one.
+        Algorithm::Copy(newComponents, componentsOut);
+
+      } while (!allStar);
+
+      // Renumber the connected components to the range [0, number of components).
+      // First collect the distinct component ids in sorted order.
+      vtkm::cont::ArrayHandle<vtkm::Id> uniqueComponents;
+      Algorithm::Copy(componentsOut, uniqueComponents);
+      Algorithm::Sort(uniqueComponents);
+      Algorithm::Unique(uniqueComponents);
+
+      // uniqueColor[k] == k is the new, dense id paired with uniqueComponents[k].
+      vtkm::cont::ArrayHandle<vtkm::Id> uniqueColor;
+      Algorithm::Copy(
+        vtkm::cont::ArrayHandleCounting<vtkm::Id>(0, 1, uniqueComponents.GetNumberOfValues()),
+        uniqueColor);
+      vtkm::cont::ArrayHandle<vtkm::Id> cellColors;
+      vtkm::cont::ArrayHandle<vtkm::Id> pixelIdsOut;
+      // Join (component id, pixel id) with (unique component id, dense id) on the
+      // component id.
+      // NOTE(review): componentsOut is passed both as the first input and as the
+      // last output of InnerJoin::Run - confirm that InnerJoin tolerates this
+      // aliasing.
+      InnerJoin<Device>().Run(componentsOut,
+                              pixelIds,
+                              uniqueComponents,
+                              uniqueColor,
+                              cellColors,
+                              pixelIdsOut,
+                              componentsOut);
+
+      // Reorder the component ids by the joined pixel ids (the sort keys).
+      Algorithm::SortByKey(pixelIdsOut, componentsOut);
+    }
+  };
+
+  // Entry point for a dynamic (type-erased) pixel array. The array's type list
+  // is reset to vtkm::UInt8 only before CastAndCall dispatches to RunImpl.
+  template <typename T, typename S, typename OutputPortalType, typename Device>
+  void Run(const vtkm::cont::CellSetStructured<2>& input,
+           const vtkm::cont::DynamicArrayHandleBase<T, S>& pixels,
+           OutputPortalType& componentsOut,
+           Device device) const
+  {
+    using Types = vtkm::ListTagBase<vtkm::UInt8>;
+    vtkm::cont::CastAndCall(pixels.ResetTypeList(Types{}), RunImpl(), input, componentsOut, device);
+  }
+
+  // Entry point for a concretely typed pixel array.
+  // NOTE(review): RunImpl::operator() only accepts ArrayHandle<vtkm::UInt8, ...>,
+  // so presumably T has to be vtkm::UInt8 here - confirm.
+  template <typename T, typename S, typename OutputPortalType, typename Device>
+  void Run(const vtkm::cont::CellSetStructured<2>& input,
+           const vtkm::cont::ArrayHandle<T, S>& pixels,
+           OutputPortalType& componentsOut,
+           Device device) const
+  {
+    vtkm::cont::CastAndCall(pixels, RunImpl(), input, componentsOut, device);
+  }
+};
+}
+}
+}
+
+#endif
--- a/vtkm/worklet/connectivities/InnerJoin.h
+++ b/vtkm/worklet/connectivities/InnerJoin.h
@ -29,6 +29,12 @@
 #include <vtkm/worklet/ScatterCounting.h>
 #include <vtkm/worklet/WorkletMapField.h>

+namespace vtkm
+{
+namespace worklet
+{
+namespace connectivity
+{
 template <typename DeviceAdapter>
 class InnerJoin
 {
@ -93,4 +99,8 @@ public:
    mergeDisp.Invoke(key1, value1, lbs, value2, keyOut, value1Out, value2Out);
  }
 };
+}
+}
+} // vtkm::worklet::connectivity
+
 #endif //vtk_m_worklet_connectivity_InnerJoin_h
--- a/vtkm/worklet/connectivities/UnionFind.h
+++ b/vtkm/worklet/connectivities/UnionFind.h
@ -0,0 +1,58 @@
+//=============================================================================
+//
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2018 UT-Battelle, LLC.
+//  Copyright 2018 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//
+//=============================================================================
+
+#ifndef vtk_m_worklet_connectivity_union_find_h
+#define vtk_m_worklet_connectivity_union_find_h
+
+// The worklets below derive from vtkm::worklet::WorkletMapField; include its
+// header here so this file does not depend on the include order of its users.
+#include <vtkm/worklet/WorkletMapField.h>
+
+// Map-field worklet that redirects comp[index] along the chain of component
+// ids until it refers to a root, i.e. an entry r with comp[r] == r.
+class PointerJumping : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<IdType> index, WholeArrayInOut<IdType> comp);
+  typedef void ExecutionSignature(_1, _2);
+  using InputDomain = _1;
+
+  // index: the entry to update.
+  // comp:  whole-array portal of component ids, read and written in place.
+  template <typename InOutPortalType>
+  VTKM_EXEC void operator()(vtkm::Id index, InOutPortalType& comp) const
+  {
+    // Replace comp[index] by its parent's entry, one hop at a time, for as long
+    // as the current parent is not its own parent.
+    auto parent = comp.Get(index);
+    while (comp.Get(parent) != parent)
+    {
+      comp.Set(index, comp.Get(parent));
+      parent = comp.Get(index);
+    }
+  }
+};
+
+// Map-field worklet that reports, per entry, whether comp[index] already refers
+// to a root: true exactly when comp[index] == comp[comp[index]].
+class IsStar : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<IdType> index, WholeArrayIn<IdType> comp, FieldOut<>);
+  typedef _3 ExecutionSignature(_1, _2);
+  using InputDomain = _1;
+
+  // index: the entry to test.
+  // comp:  whole-array portal of component ids (only read here).
+  template <typename InOutPortalType>
+  VTKM_EXEC bool operator()(vtkm::Id index, InOutPortalType& comp) const
+  {
+    const auto parent = comp.Get(index);
+    const auto grandparent = comp.Get(parent);
+    return parent == grandparent;
+  }
+};
+
+#endif // vtk_m_worklet_connectivity_union_find_h
--- a/vtkm/worklet/contourtree/Mesh2D_DEM_VertexOutdegreeStarter.h
+++ b/vtkm/worklet/contourtree/Mesh2D_DEM_VertexOutdegreeStarter.h
@ -192,7 +192,7 @@ public:
    // and set the initial inverse index to a flag
    isCritical = (outDegree != 1) ? 1 : 0;
  }
-}; // Mesh2D_DEM_VertexStarter
+}; // Mesh2D_DEM_VertexOutdegreeStarter

 } // namespace contourtree
 } // namespace worklet
--- a/vtkm/worklet/contourtree/Mesh2D_DEM_VertexStarter.h
+++ b/vtkm/worklet/contourtree/Mesh2D_DEM_VertexStarter.h
@ -123,15 +123,6 @@ public:
  {
  }

-// For numerous calls of this function GCC is able to determine if i is
-// always greater than j ( or vice-versa ) and optimizes those call sites.
-// But when it does these optimizations is presumes that i and j will not
-// overflow and emits a Wstrict-overflow warning
-#ifdef VTKM_GCC
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-overflow"
-#endif
-
  // Locate the next vertex in direction indicated
  template <typename InFieldPortalType>
  VTKM_EXEC void operator()(const vtkm::Id& vertex,
@ -223,10 +214,6 @@ public:
    chain = destination;
  } // operator()

-#ifdef VTKM_GCC
-#pragma GCC diagnostic pop
-#endif
-
 }; // Mesh2D_DEM_VertexStarter
 }
 }
--- a/vtkm/worklet/contourtree/VertexValueComparator.h
+++ b/vtkm/worklet/contourtree/VertexValueComparator.h
@ -103,14 +103,6 @@ public:

  inline VTKM_EXEC bool operator()(const vtkm::Id& i, const vtkm::Id& j, bool ascending) const
  {
-// For numerous calls of this function GCC is able to determine if i is
-// always greater than j ( or vice-versa ) and optimizes those call sites.
-// But when it does these optimizations is presumes that i and j will not
-// overflow and emits a Wstrict-overflow warning
-#ifdef VTKM_GCC
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-overflow"
-#endif
    if (values.Get(i) < values.Get(j))
    {
      return ascending ^ true;
@ -127,9 +119,6 @@ public:
    {
      return ascending ^ false;
    }
-#ifdef VTKM_GCC
-#pragma GCC diagnostic pop
-#endif
    // fall through to return false
    return false;
  }
--- a/vtkm/worklet/gradient/GradientOutput.h
+++ b/vtkm/worklet/gradient/GradientOutput.h
@ -26,7 +26,7 @@
 #include <vtkm/cont/arg/TransportTagArrayOut.h>
 #include <vtkm/cont/arg/TransportTagExecObject.h>

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>
 #include <vtkm/exec/arg/FetchTagArrayDirectOut.h>

 #include <vtkm/worklet/gradient/Divergence.h>
@ -39,7 +39,7 @@ namespace exec
 {

 template <typename T, typename DeviceAdapter>
-struct GradientScalarOutput : public vtkm::cont::ExecutionObjectFactoryBase
+struct GradientScalarOutput : public vtkm::cont::ExecutionObjectBase
 {
  using ValueType = vtkm::Vec<T, 3>;
  using BaseTType = typename vtkm::BaseComponent<T>::Type;
@ -77,7 +77,7 @@ struct GradientScalarOutput : public vtkm::cont::ExecutionObjectFactoryBase
 };

 template <typename T, typename DeviceAdapter>
-struct GradientVecOutput : public vtkm::cont::ExecutionObjectFactoryBase
+struct GradientVecOutput : public vtkm::cont::ExecutionObjectBase
 {
  using ValueType = vtkm::Vec<T, 3>;
  using BaseTType = typename vtkm::BaseComponent<T>::Type;
--- a/vtkm/worklet/internal/TriangulateTables.h
+++ b/vtkm/worklet/internal/TriangulateTables.h
@ -23,7 +23,7 @@
 #include <vtkm/CellShape.h>
 #include <vtkm/Types.h>

-#include <vtkm/cont/ExecutionObjectFactoryBase.h>
+#include <vtkm/cont/ExecutionObjectBase.h>

 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/StorageBasic.h>
@ -147,7 +147,7 @@ private:
  PortalType Indices;
 };

-class TriangulateTablesExecutionObjectFactory : public vtkm::cont::ExecutionObjectFactoryBase
+class TriangulateTablesExecutionObjectFactory : public vtkm::cont::ExecutionObjectBase
 {
 public:
  template <typename Device>
@ -344,7 +344,7 @@ private:
  PortalType Indices;
 };

-class TetrahedralizeTablesExecutionObjectFactory : public vtkm::cont::ExecutionObjectFactoryBase
+class TetrahedralizeTablesExecutionObjectFactory : public vtkm::cont::ExecutionObjectBase
 {
 public:
  template <typename Device>
--- a/vtkm/worklet/internal/testing/UnitTestDispatcherBase.cxx
+++ b/vtkm/worklet/internal/testing/UnitTestDispatcherBase.cxx
@ -56,7 +56,7 @@ struct ExecutionObject
  vtkm::Id Value;
 };

-struct TestExecObjectType : vtkm::cont::ExecutionObjectFactoryBase
+struct TestExecObjectType : vtkm::cont::ExecutionObjectBase
 {
  template <typename Functor, typename... Args>
  void CastAndCall(Functor f, Args&&... args) const
@ -74,7 +74,7 @@ struct TestExecObjectType : vtkm::cont::ExecutionObjectFactoryBase
 };

 struct TestExecObjectTypeBad
-{ //this will fail as it doesn't inherit from vtkm::cont::ExecutionObjectFactoryBase
+{ //this will fail as it doesn't inherit from vtkm::cont::ExecutionObjectBase
  template <typename Functor, typename... Args>
  void CastAndCall(Functor f, Args&&... args) const
  {
--- a/vtkm/worklet/testing/CMakeLists.txt
+++ b/vtkm/worklet/testing/CMakeLists.txt
@ -23,7 +23,7 @@ set(unit_tests
  UnitTestCellAverage.cxx
  UnitTestCellDeepCopy.cxx
  UnitTestCellGradient.cxx
-  UnitTestCellSetConnectivity.cpp
+  UnitTestCellSetConnectivity.cxx
  UnitTestCellSetDualGraph.cxx
  UnitTestClipping.cxx
  UnitTestContourTreeUniform.cxx
@ -37,6 +37,7 @@ set(unit_tests
  UnitTestFieldHistogram.cxx
  UnitTestFieldStatistics.cxx
  UnitTestInnerJoin.cxx
+  UnitTestImageConnectivity.cxx
  UnitTestKdTreeBuildNNS.cxx
  UnitTestKeys.cxx
  UnitTestMagnitude.cxx
@ -50,6 +51,7 @@ set(unit_tests
  UnitTestParticleAdvection.cxx
  UnitTestPointElevation.cxx
  UnitTestPointGradient.cxx
+  UnitTestPointTransform.cxx
  UnitTestProbe.cxx
  UnitTestRemoveUnusedPoints.cxx
  UnitTestScalarsToColors.cxx
--- a/vtkm/worklet/testing/UnitTestCellSetConnectivity.cxx
+++ b/vtkm/worklet/testing/UnitTestCellSetConnectivity.cxx
@ -140,7 +140,8 @@ public:

    auto cellSet = outputData.GetCellSet().Cast<vtkm::cont::CellSetSingleType<>>();
    vtkm::cont::ArrayHandle<vtkm::Id> componentArray;
-    CellSetConnectivity().Run(cellSet, componentArray, DeviceAdapter());
+    vtkm::worklet::connectivity::CellSetConnectivity().Run(
+      cellSet, componentArray, DeviceAdapter());

    using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>;
    Algorithm::Sort(componentArray);
@ -155,7 +156,8 @@ public:

    auto cellSet = dataSet.GetCellSet().Cast<vtkm::cont::CellSetExplicit<>>();
    vtkm::cont::ArrayHandle<vtkm::Id> componentArray;
-    CellSetConnectivity().Run(cellSet, componentArray, DeviceAdapter());
+    vtkm::worklet::connectivity::CellSetConnectivity().Run(
+      cellSet, componentArray, DeviceAdapter());

    using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>;
    Algorithm::Sort(componentArray);
@ -170,7 +172,8 @@ public:

    auto cellSet = dataSet.GetCellSet();
    vtkm::cont::ArrayHandle<vtkm::Id> componentArray;
-    CellSetConnectivity().Run(cellSet, componentArray, DeviceAdapter());
+    vtkm::worklet::connectivity::CellSetConnectivity().Run(
+      cellSet, componentArray, DeviceAdapter());

    using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>;
    Algorithm::Sort(componentArray);
@ -179,7 +182,8 @@ public:
                     "Wrong number of connected components");
  }

-  void operator()() const {
+  void operator()() const
+  {
    this->TestTangleIsosurface();
    this->TestExplicitDataSet();
    this->TestUniformDataSet();
--- a/vtkm/worklet/testing/UnitTestCellSetDualGraph.cxx
+++ b/vtkm/worklet/testing/UnitTestCellSetDualGraph.cxx
@ -28,7 +28,7 @@
 template <typename DeviceAdapter>
 class TestCellSetDualGraph
 {
-public:
+private:
  template <typename T, typename Storage>
  bool TestArrayHandle(const vtkm::cont::ArrayHandle<T, Storage>& ah,
                       const T* expected,
@ -50,6 +50,7 @@ public:
    return true;
  }

+public:
  void TestTriangleMesh() const
  {
    std::vector<vtkm::Id> connectivity = { 0, 2, 4, 1, 3, 5, 2, 6, 4, 5, 3, 7, 2, 9, 6, 4, 6, 8 };
@ -61,7 +62,7 @@ public:
    vtkm::cont::ArrayHandle<vtkm::Id> indexOffsetArray;
    vtkm::cont::ArrayHandle<vtkm::Id> connectivityArray;

-    CellSetDualGraph<DeviceAdapter>().Run(
+    vtkm::worklet::connectivity::CellSetDualGraph<DeviceAdapter>().Run(
      cellSet, numIndicesArray, indexOffsetArray, connectivityArray);

    vtkm::Id expectedNumIndices[] = { 1, 1, 3, 1, 1, 1 };
--- a/vtkm/worklet/testing/UnitTestImageConnectivity.cxx
+++ b/vtkm/worklet/testing/UnitTestImageConnectivity.cxx
@ -0,0 +1,74 @@
+//=============================================================================
+//
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2018 UT-Battelle, LLC.
+//  Copyright 2018 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//
+//=============================================================================
+#include <vtkm/cont/testing/MakeTestDataSet.h>
+#include <vtkm/cont/testing/Testing.h>
+#include <vtkm/filter/MarchingCubes.h>
+
+#include <vtkm/worklet/connectivities/ImageConnectivity.h>
+
+
+// Runs ImageConnectivity on a small 8x4 two-color image and compares every
+// resulting component id with a hard-coded expected labeling.
+template <typename DeviceAdapter>
+class TestImageConnectivity
+{
+public:
+  using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>;
+
+  void operator()() const
+  {
+    // Example image from "Connected Component Labeling in CUDA" by Ondřej Šťava
+    // and Bedřich Beneš.
+    std::vector<vtkm::UInt8> pixels(8 * 4, 0);
+    pixels[3] = pixels[4] = pixels[10] = pixels[11] = 1;
+    pixels[1] = pixels[9] = pixels[16] = pixels[17] = pixels[24] = pixels[25] = 1;
+    pixels[7] = pixels[15] = pixels[21] = pixels[23] = pixels[28] = pixels[29] = pixels[30] =
+      pixels[31] = 1;
+
+    // 8x4x1 uniform data set carrying the pixel colors as a point field.
+    vtkm::cont::DataSetBuilderUniform builder;
+    vtkm::cont::DataSet data = builder.Create(vtkm::Id3(8, 4, 1));
+
+    auto colorField = vtkm::cont::make_Field("color", vtkm::cont::Field::ASSOC_POINTS, pixels);
+    data.AddField(colorField);
+
+    vtkm::cont::ArrayHandle<vtkm::Id> component;
+    vtkm::worklet::connectivity::ImageConnectivity().Run(
+      data.GetCellSet(0).Cast<vtkm::cont::CellSetStructured<2>>(),
+      colorField.GetData(),
+      component,
+      DeviceAdapter());
+
+    std::vector<vtkm::Id> componentExpected = { 0, 1, 2, 1, 1, 3, 3, 4, 0, 1, 1, 1, 3, 3, 3, 4,
+                                                1, 1, 3, 3, 3, 4, 3, 4, 1, 1, 3, 3, 4, 4, 4, 4 };
+
+    // Check the length first: without it an empty result would pass the loop
+    // below vacuously and an oversized one would index past componentExpected.
+    VTKM_TEST_ASSERT(component.GetNumberOfValues() ==
+                       static_cast<vtkm::Id>(componentExpected.size()),
+                     "Wrong number of component labels.");
+
+    auto portal = component.GetPortalConstControl();
+    for (vtkm::Id index = 0; index < component.GetNumberOfValues(); index++)
+    {
+      VTKM_TEST_ASSERT(portal.Get(index) == componentExpected[static_cast<std::size_t>(index)],
+                       "Components has unexpected value.");
+    }
+  }
+};
+
+// Entry point of this unit test: hands a TestImageConnectivity functor for the
+// default device adapter to the testing harness and returns the harness result.
+int UnitTestImageConnectivity(int, char* [])
+{
+  using TestFunctor = TestImageConnectivity<VTKM_DEFAULT_DEVICE_ADAPTER_TAG>;
+  return vtkm::cont::testing::Testing::Run(TestFunctor{});
+}
--- a/vtkm/worklet/testing/UnitTestInnerJoin.cxx
+++ b/vtkm/worklet/testing/UnitTestInnerJoin.cxx
@ -23,6 +23,7 @@
 #include <vtkm/cont/testing/Testing.h>
 #include <vtkm/worklet/connectivities/InnerJoin.h>

+
 template <typename DeviceAdapter>
 class TestInnerJoin
 {
@ -67,7 +68,8 @@ public:
    vtkm::cont::ArrayHandle<vtkm::Id> outA;
    vtkm::cont::ArrayHandle<vtkm::Id> outB;

-    InnerJoin<DeviceAdapter>().Run(A_arr, idxA, B_arr, idxB, joinedIndex, outA, outB);
+    vtkm::worklet::connectivity::InnerJoin<DeviceAdapter>().Run(
+      A_arr, idxA, B_arr, idxB, joinedIndex, outA, outB);

    vtkm::Id expectedIndex[] = { 5, 5, 8, 8, 9 };
    VTKM_TEST_ASSERT(TestArrayHandle(joinedIndex, expectedIndex, 5), "Wrong joined keys");
--- a/vtkm/worklet/testing/UnitTestPointTransform.cxx
+++ b/vtkm/worklet/testing/UnitTestPointTransform.cxx
@ -0,0 +1,201 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#include <vtkm/cont/CellSetExplicit.h>
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/testing/Testing.h>
+#include <vtkm/worklet/DispatcherMapField.h>
+#include <vtkm/worklet/PointTransform.h>
+
+#include <random>
+#include <vector>
+
+namespace
+{
+// Pseudo-random engine shared by the randomized PointTransform test cases.
+// It is default-constructed, so it starts from the engine's standard default
+// seed and produces the same raw sequence on every run.
+std::mt19937 randGenerator;
+
+// Builds the data set used by the PointTransform tests: a 5x5 grid of points
+// whose x and z components are evenly spaced over [0, 1] and whose y component
+// is (x^2 + z^2) / 2, connected by a 4x4 arrangement of quad cells held in an
+// explicit cell set constructed with the name "cells".
+vtkm::cont::DataSet MakePointTransformTestDataSet()
+{
+  using PointType = vtkm::Vec<vtkm::FloatDefault, 3>;
+
+  const vtkm::Id dim = 5;
+  const vtkm::FloatDefault lastIndex = static_cast<vtkm::FloatDefault>(dim - 1);
+
+  // Points are appended row by row, so the point at (col, row) ends up at
+  // flat index row * dim + col.
+  std::vector<PointType> points;
+  for (vtkm::Id row = 0; row < dim; ++row)
+  {
+    const vtkm::FloatDefault z = static_cast<vtkm::FloatDefault>(row) / lastIndex;
+    for (vtkm::Id col = 0; col < dim; ++col)
+    {
+      const vtkm::FloatDefault x = static_cast<vtkm::FloatDefault>(col) / lastIndex;
+      const vtkm::FloatDefault y = (x * x + z * z) / 2.0f;
+      points.push_back(vtkm::make_Vec(x, y, z));
+    }
+  }
+
+  vtkm::cont::DataSet dataSet;
+  dataSet.AddCoordinateSystem(
+    vtkm::cont::make_CoordinateSystem("coordinates", points, vtkm::CopyFlag::On));
+
+  // One quad per pair of adjacent rows and columns, four point ids per quad.
+  const vtkm::Id quadsPerSide = dim - 1;
+  const vtkm::Id numCells = quadsPerSide * quadsPerSide;
+
+  vtkm::cont::CellSetExplicit<> cellSet("cells");
+  cellSet.PrepareToAddCells(numCells, numCells * 4);
+  for (vtkm::Id row = 0; row < quadsPerSide; ++row)
+  {
+    for (vtkm::Id col = 0; col < quadsPerSide; ++col)
+    {
+      // Flat indices of the quad's first point and of the point one row further on.
+      const vtkm::Id current = row * dim + col;
+      const vtkm::Id nextRow = (row + 1) * dim + col;
+      cellSet.AddCell(vtkm::CELL_SHAPE_QUAD,
+                      4,
+                      vtkm::make_Vec<vtkm::Id>(current, current + 1, nextRow + 1, nextRow));
+    }
+  }
+  cellSet.CompleteAddingCells(static_cast<vtkm::Id>(points.size()));
+
+  dataSet.AddCellSet(cellSet);
+  return dataSet;
+}
+
+// Checks the output of a point transform against a reference computed here.
+// `result` must hold as many values as the data array of `coords`, and every
+// entry must compare equal (via test_equal) to vtkm::Transform3DPoint applied
+// with `matrix` to the input point at the same index.
+void ValidatePointTransform(const vtkm::cont::CoordinateSystem& coords,
+                            const vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::FloatDefault, 3>>& result,
+                            const vtkm::Matrix<vtkm::FloatDefault, 4, 4>& matrix)
+{
+  auto inputPoints = coords.GetData();
+  const vtkm::Id numPoints = inputPoints.GetNumberOfValues();
+  VTKM_TEST_ASSERT(numPoints == result.GetNumberOfValues(),
+                   "Incorrect number of points in point transform");
+
+  auto inputPortal = inputPoints.GetPortalConstControl();
+  auto outputPortal = result.GetPortalConstControl();
+
+  for (vtkm::Id index = 0; index < numPoints; ++index)
+  {
+    const auto expected = vtkm::Transform3DPoint(matrix, inputPortal.Get(index));
+    VTKM_TEST_ASSERT(test_equal(outputPortal.Get(index), expected),
+                     "Wrong result for PointTransform worklet");
+  }
+}
+
+
+// Runs a PointTransform worklet configured with the translation `trans` over
+// the coordinate system of `ds`, then validates the output against the matrix
+// returned by Transform3DTranslate for the same vector.
+void TestPointTransformTranslation(const vtkm::cont::DataSet& ds,
+                                   const vtkm::Vec<vtkm::FloatDefault, 3>& trans)
+{
+  using WorkletType = vtkm::worklet::PointTransform<vtkm::FloatDefault>;
+
+  WorkletType translator;
+  translator.SetTranslation(trans);
+
+  const auto& coords = ds.GetCoordinateSystem();
+  vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::FloatDefault, 3>> transformed;
+  vtkm::worklet::DispatcherMapField<WorkletType> dispatcher(translator);
+  dispatcher.Invoke(coords, transformed);
+
+  const auto expected = vtkm::Transform3DTranslate(trans);
+  ValidatePointTransform(coords, transformed, expected);
+}
+
+// Runs a PointTransform worklet configured with the per-axis factors in
+// `scale` over the coordinate system of `ds`, then validates the output
+// against the matrix returned by Transform3DScale for the same vector.
+void TestPointTransformScale(const vtkm::cont::DataSet& ds,
+                             const vtkm::Vec<vtkm::FloatDefault, 3>& scale)
+{
+  using WorkletType = vtkm::worklet::PointTransform<vtkm::FloatDefault>;
+
+  WorkletType scaler;
+  scaler.SetScale(scale);
+
+  const auto& coords = ds.GetCoordinateSystem();
+  vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::FloatDefault, 3>> transformed;
+  vtkm::worklet::DispatcherMapField<WorkletType> dispatcher(scaler);
+  dispatcher.Invoke(coords, transformed);
+
+  const auto expected = vtkm::Transform3DScale(scale);
+  ValidatePointTransform(coords, transformed, expected);
+}
+
+// Runs a PointTransform worklet configured to rotate by `angle` about `axis`
+// over the coordinate system of `ds`, then validates the output against the
+// matrix returned by Transform3DRotate for the same angle and axis.
+void TestPointTransformRotation(const vtkm::cont::DataSet& ds,
+                                const vtkm::FloatDefault& angle,
+                                const vtkm::Vec<vtkm::FloatDefault, 3>& axis)
+{
+  using WorkletType = vtkm::worklet::PointTransform<vtkm::FloatDefault>;
+
+  WorkletType rotator;
+  rotator.SetRotation(angle, axis);
+
+  const auto& coords = ds.GetCoordinateSystem();
+  vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::FloatDefault, 3>> transformed;
+  vtkm::worklet::DispatcherMapField<WorkletType> dispatcher(rotator);
+  dispatcher.Invoke(coords, transformed);
+
+  const auto expected = vtkm::Transform3DRotate(angle, axis);
+  ValidatePointTransform(coords, transformed, expected);
+}
+}
+
+// Exercises the PointTransform worklet on the generated test data set with
+// translations, scales, and rotations, using both hand-picked and
+// pseudo-random parameters. Each individual case is run and checked by the
+// corresponding TestPointTransform* helper.
+void TestPointTransform()
+{
+  using Vec3 = vtkm::Vec<vtkm::FloatDefault, 3>;
+  using RealDistribution = std::uniform_real_distribution<vtkm::FloatDefault>;
+
+  std::cout << "Testing PointTransform Worklet" << std::endl;
+
+  vtkm::cont::DataSet ds = MakePointTransformTestDataSet();
+  const int N = 41; // number of pseudo-random samples drawn per parameter kind
+
+  // Draws the x, y, and z components, in that order, from `dist`. Each draw
+  // gets its own statement because the evaluation order of function arguments
+  // is unspecified; writing the three draws as constructor arguments would
+  // let the compiler decide which component receives which random value.
+  auto randomVec3 = [](RealDistribution& dist) -> Vec3 {
+    const vtkm::FloatDefault x = dist(randGenerator);
+    const vtkm::FloatDefault y = dist(randGenerator);
+    const vtkm::FloatDefault z = dist(randGenerator);
+    return Vec3(x, y, z);
+  };
+
+  //Test translation
+  TestPointTransformTranslation(ds, Vec3(0, 0, 0));
+  TestPointTransformTranslation(ds, Vec3(1, 1, 1));
+  TestPointTransformTranslation(ds, Vec3(-1, -1, -1));
+
+  RealDistribution transDist(-100, 100);
+  for (int i = 0; i < N; i++)
+  {
+    TestPointTransformTranslation(ds, randomVec3(transDist));
+  }
+
+  //Test scaling
+  TestPointTransformScale(ds, Vec3(1, 1, 1));
+  TestPointTransformScale(ds, Vec3(.23f, .23f, .23f));
+  TestPointTransformScale(ds, Vec3(1, 2, 3));
+  TestPointTransformScale(ds, Vec3(3.23f, 9.23f, 4.23f));
+
+  RealDistribution scaleDist(0.0001f, 100);
+  for (int i = 0; i < N; i++)
+  {
+    // First a uniform scale (one factor for all axes), then an independent
+    // factor per axis.
+    TestPointTransformScale(ds, Vec3(scaleDist(randGenerator)));
+    TestPointTransformScale(ds, randomVec3(scaleDist));
+  }
+
+  //Test rotation
+  std::vector<vtkm::FloatDefault> angles;
+  RealDistribution angleDist(0, 360);
+  for (int i = 0; i < N; i++)
+  {
+    angles.push_back(angleDist(randGenerator));
+  }
+
+  // Fixed axes: the three coordinate axes, the (1, 1, 1) diagonal, and the
+  // negation of each of those four.
+  std::vector<Vec3> axes;
+  axes.push_back(Vec3(1, 0, 0));
+  axes.push_back(Vec3(0, 1, 0));
+  axes.push_back(Vec3(0, 0, 1));
+  axes.push_back(Vec3(1, 1, 1));
+  const std::size_t numFixedAxes = axes.size();
+  for (std::size_t k = 0; k < numFixedAxes; k++)
+  {
+    // The negation is a temporary, so growing the vector cannot invalidate it.
+    axes.push_back(-axes[k]);
+  }
+
+  RealDistribution axisDist(-1, 1);
+  for (int i = 0; i < N; i++)
+  {
+    axes.push_back(randomVec3(axisDist));
+  }
+
+  // Every sampled angle is combined with every fixed and random axis.
+  for (const vtkm::FloatDefault& angle : angles)
+  {
+    for (const Vec3& axis : axes)
+    {
+      TestPointTransformRotation(ds, angle, axis);
+    }
+  }
+}
+
+// Test-driver entry point. Passes TestPointTransform to the VTK-m testing
+// runner and returns the value the runner reports. The command-line arguments
+// are unused.
+int UnitTestPointTransform(int, char* [])
+{
+  const int status = vtkm::cont::testing::Testing::Run(TestPointTransform);
+  return status;
+}