Merge branch 'master' into particle_density

2020-10-07 12:24:07 -06:00 · 2020-10-07 12:24:07 -06:00 · c1681d2f2a
commit c1681d2f2a
parent 5a47c39ac1 1b2623664b
851 changed files with 30216 additions and 17343 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,17 +1,19 @@
 ---
-# This configuration requires clang-format 3.8 or higher.
+# This configuration requires clang-format 9 or higher.
 BasedOnStyle: Mozilla
 AlignAfterOpenBracket: Align
+AlignEscapedNewlines: true
 AlignOperands: false
-AlwaysBreakAfterReturnType: None
+AllowAllParametersOfDeclarationOnNextLine: false
 AlwaysBreakAfterDefinitionReturnType: None
-BreakBeforeBraces: Allman
+AlwaysBreakAfterReturnType: None
 BinPackArguments: false
 BinPackParameters: false
+BreakBeforeBraces: Allman
 ColumnLimit: 100
+# FixNamespaceComments: true
 MaxEmptyLinesToKeep: 4
-Standard: Cpp11
-# This requires clang-format 4.0 (at least).
-#FixNamespaceComments: true
 ReflowComments: false
+SpaceAfterTemplateKeyword: true
+Standard: Cpp11
 ...
--- a/.gitattributes
+++ b/.gitattributes
@ -1,5 +1,5 @@
 # Attributes used for formatting.
-[attr]our-c-style   whitespace=tab-in-indent  format.clang-format
+[attr]our-c-style   whitespace=tab-in-indent  format.clang-format=9

 *.cxx   our-c-style
 *.h     our-c-style
--- a/.gitlab-ci-ecp.yml
+++ b/.gitlab-ci-ecp.yml
@ -1,125 +0,0 @@
-
-
-.slurm_p9_cuda: &slurm_p9_cuda
-    tags:
-        - nmc
-        - slurm
-        - nmc-xxfe1-sched-001
-        - xx-fe1
-    variables:
-        NMC_FE1_SLURM_PARAMETERS: " -N1 -p ecp-p9-4v100 --extra-node-info=*:*:* -t 1:30:00 "
-        CC: "gcc"
-        CXX: "g++"
-        CUDAHOSTCXX: "g++"
-    before_script:
-        # We need gcc-4.8.5, which is the system default compiler but not a compiler
-        # listed under the module system.
-        #
-        # That means to get this to work properly we explicitly do not request
-        # any compiler.
-        - module load cuda cmake/3.14.5
-
-
-.slurm_p9_opemp: &slurm_p9_opemp
-    tags:
-        - nmc
-        - slurm
-        - nmc-xxfe1-sched-001
-        - xx-fe1
-    variables:
-        NMC_FE1_SLURM_PARAMETERS: " -N1 -p ecp-p9-4v100 --extra-node-info=*:*:* -t 1:30:00 "
-    before_script:
-        - module load gcc/8.3.0 openmpi/3.1.4 cmake/3.14.5
-
-.cmake_build_artifacts: &cmake_build_artifacts
-    artifacts:
-        expire_in: 24 hours
-        when: always
-        paths:
-            # The artifacts of the build.
-            - vtkm-build/bin/
-            - vtkm-build/include/
-
-            # CTest files.
-            # XXX(globbing): Can be simplified with support from
-            # https://gitlab.com/gitlab-org/gitlab-runner/issues/4840
-            - vtkm-build/CTestCustom*.cmake
-            - vtkm-build/CTestTestfile.cmake
-            - vtkm-build/*/CTestTestfile.cmake
-            - vtkm-build/*/*/CTestTestfile.cmake
-            - vtkm-build/*/*/*/CTestTestfile.cmake
-            - vtkm-build/*/*/*/*/CTestTestfile.cmake
-            - vtkm-build/*/*/*/*/*/CTestTestfile.cmake
-            - vtkm-build/Testing/
-
-            # CDash files.
-            - vtkm-build/DartConfiguration.tcl
-
-.cmake_build_p9_cuda: &cmake_build_p9_cuda
-    stage: build
-    script:
-        - srun env | grep SLURM_JOB_NAME
-        - mkdir vtkm-build
-        - pushd vtkm-build
-        - cmake -DCMAKE_BUILD_TYPE=Release -DVTKm_ENABLE_CUDA=ON -S ../
-        - cmake --build . -j20
-        - popd
-
-.cmake_build_p9_openmp: &cmake_build_p9_openmp
-    stage: build
-    script:
-        - srun env | grep SLURM_JOB_NAME
-        - mkdir vtkm-build
-        - pushd vtkm-build
-        - cmake -DCMAKE_BUILD_TYPE=Release -DVTKm_ENABLE_OPENMP=ON -S ../
-        - cmake --build . -j20
-        - popd
-
-
-
-.cmake_test_p9: &cmake_test_p9
-    stage: test
-    script:
-        - echo "running the test using artifacts of the build"
-        - pushd vtkm-build
-        # We need to exclude the following tests
-        #   - CopyrightStatement
-        #   - TestInstallSetup
-        #   - SourceInInstall
-        # Which we can do by using an exclude regex
-        - ctest -E "Install|CopyrightStatement"
-        - popd
-
-stages:
-    - build
-    - test
-
-build:p9_openmp:
-    extends:
-        - .slurm_p9_opemp
-        - .cmake_build_artifacts
-        - .cmake_build_p9_openmp
-
-test:p9_openmp:
-    extends:
-        - .slurm_p9_opemp
-        - .cmake_test_p9
-    dependencies:
-        - build:p9_openmp
-    needs:
-        - build:p9_openmp
-
-build:p9_cuda:
-    extends:
-        - .slurm_p9_cuda
-        - .cmake_build_artifacts
-        - .cmake_build_p9_cuda
-
-test:p9_cuda:
-    extends:
-        - .slurm_p9_cuda
-        - .cmake_test_p9
-    dependencies:
-        - build:p9_cuda
-    needs:
-        - build:p9_cuda
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -49,55 +49,64 @@
    GIT_CLONE_PATH: $CI_BUILDS_DIR/gitlab-kitware-sciviz-ci

 .centos7: &centos7
-  image: "kitware/vtkm:ci-centos7_cuda10.2-20200601"
+  image: "kitware/vtkm:ci-centos7_cuda10.2-20200820"
  extends:
    - .docker_image

 .centos8: &centos8
-  image: "kitware/vtkm:ci-centos8-20200601"
+  image: "kitware/vtkm:ci-centos8-20200820"
  extends:
    - .docker_image

 .rhel8: &rhel8
-  image: "kitware/vtkm:ci-rhel8_cuda10.2-20200601"
+  image: "kitware/vtkm:ci-rhel8_cuda10.2-20200820"
  extends:
    - .docker_image

 .ubuntu1604: &ubuntu1604
-  image: "kitware/vtkm:ci-ubuntu1604-20200601"
+  image: "kitware/vtkm:ci-ubuntu1604-20200820"
  extends:
    - .docker_image

 .ubuntu1604_cuda: &ubuntu1604_cuda
-  image: "kitware/vtkm:ci-ubuntu1604_cuda9.2-20200601"
+  image: "kitware/vtkm:ci-ubuntu1604_cuda9.2-20200820"
  extends:
    - .docker_image

 .ubuntu1804: &ubuntu1804
-  image: "kitware/vtkm:ci-ubuntu1804-20200601"
+  image: "kitware/vtkm:ci-ubuntu1804-20200820"
  extends:
    - .docker_image

 .ubuntu1804_cuda: &ubuntu1804_cuda
-  image: "kitware/vtkm:ci-ubuntu1804_cuda10.1-20200601"
+  image: "kitware/vtkm:ci-ubuntu1804_cuda10.1-20200820"
+  extends:
+    - .docker_image
+
+.ubuntu1804_cuda_kokkos: &ubuntu1804_cuda_kokkos
+  image: "kitware/vtkm:ci-ubuntu1804_cuda11_kokkos-20200820"
  extends:
    - .docker_image

 .ubuntu2004_doxygen: &ubuntu2004_doxygen
-  image: "kitware/vtkm:ci-doxygen-20200601"
+  image: "kitware/vtkm:ci-doxygen-20200820"
  extends:
    - .docker_image

+.ubuntu2004_kokkos: &ubuntu2004_kokkos
+  image: "kitware/vtkm:ci-ubuntu2004_kokkos-20200820"
+  extends:
+    - .docker_image

 .only-default: &only-default
  only:
-    - master
+    - master@vtk/vtk-m
+    - tags@vtk/vtk-m
    - merge_requests
-    - tags

 .only-master: &only-master
  only:
-    - master
+    - master@vtk/vtk-m


 # General Longer Term Tasks:
@ -178,4 +187,5 @@ include:
  - local: '/.gitlab/ci/rhel8.yml'
  - local: '/.gitlab/ci/ubuntu1604.yml'
  - local: '/.gitlab/ci/ubuntu1804.yml'
+  - local: '/.gitlab/ci/ubuntu2004.yml'
  - local: '/.gitlab/ci/windows10.yml'
--- a/.gitlab/ci/centos7.yml
+++ b/.gitlab/ci/centos7.yml
@ -7,6 +7,7 @@ build:centos7_gcc48:
    - vtkm
    - docker
    - linux
+    - cuda-rt
    - large-memory
  extends:
    - .centos7
@ -20,15 +21,17 @@ build:centos7_gcc48:
 test:centos7_gcc48:
  tags:
    - test
-    - cuda-rt
-    - turing
    - vtkm
    - docker
    - linux
+    - cuda-rt
+    - turing
  extends:
    - .centos7
    - .cmake_test_linux
    - .only-default
+  variables:
+      CTEST_EXCLUSIONS: "UnitTestContourTreeUniformAugmentedFilterCUDA|UnitTestContourTreeUniformAugmentedCUDA"
  dependencies:
    - build:centos7_gcc48
  needs:
@ -37,17 +40,17 @@ test:centos7_gcc48:
 test:rhel8_test_centos7:
  tags:
    - test
-    - cuda-rt
-    - turing
    - vtkm
    - docker
    - linux
+    - cuda-rt
+    - turing
  extends:
    - .rhel8
    - .cmake_test_linux
    - .only-default
  variables:
-      CTEST_EXCLUSIONS: "built_against_test_install"
+      CTEST_EXCLUSIONS: "built_against_test_install|UnitTestContourTreeUniformAugmentedFilterCUDA|UnitTestContourTreeUniformAugmentedCUDA"
  dependencies:
    - build:centos7_gcc48
  needs:
--- a/.gitlab/ci/config/initial_config.cmake
+++ b/.gitlab/ci/config/initial_config.cmake
@ -10,10 +10,16 @@
 ##
 ##=============================================================================

+# Default to Release builds.
+if ("$ENV{CMAKE_BUILD_TYPE}" STREQUAL "")
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
+else ()
+  set(CMAKE_BUILD_TYPE "$ENV{CMAKE_BUILD_TYPE}" CACHE STRING "")
+endif ()
+
 string(REPLACE "+" ";" options "$ENV{VTKM_SETTINGS}")

 foreach(option IN LISTS options)
-
  if(static STREQUAL option)
    set(BUILD_SHARED_LIBS "OFF" CACHE STRING "")

@ -43,6 +49,9 @@ foreach(option IN LISTS options)
  elseif(no_rendering STREQUAL option)
    set(VTKm_ENABLE_RENDERING "OFF" CACHE STRING "")

+  elseif(no_virtual STREQUAL option)
+    set(VTKm_NO_DEPRECATED_VIRTUAL "ON" CACHE STRING "")
+
  elseif(examples STREQUAL option)
    set(VTKm_ENABLE_EXAMPLES "ON" CACHE STRING "")

@ -64,6 +73,9 @@ foreach(option IN LISTS options)
  elseif(cuda STREQUAL option)
    set(VTKm_ENABLE_CUDA "ON" CACHE STRING "")

+  elseif(kokkos STREQUAL option)
+    set(VTKm_ENABLE_KOKKOS "ON" CACHE STRING "")
+
  elseif(maxwell STREQUAL option)
    set(VTKm_CUDA_Architecture "maxwell" CACHE STRING "")

@ -88,7 +100,10 @@ find_program(SCCACHE_COMMAND NAMES sccache)
 if(SCCACHE_COMMAND)
  set(CMAKE_C_COMPILER_LAUNCHER "${SCCACHE_COMMAND}" CACHE STRING "")
  set(CMAKE_CXX_COMPILER_LAUNCHER "${SCCACHE_COMMAND}" CACHE STRING "")
-  if(VTKm_ENABLE_CUDA)
+
+  # Use VTKm_CUDA_Architecture to determine if we need CUDA sccache setup
+  # since this will also capture when kokkos is being used with CUDA backing
+  if(DEFINED VTKm_CUDA_Architecture)
    set(CMAKE_CUDA_COMPILER_LAUNCHER "${SCCACHE_COMMAND}" CACHE STRING "")
  endif()
 endif()
--- a/.gitlab/ci/docker/centos7/cuda10.2/Dockerfile
+++ b/.gitlab/ci/docker/centos7/cuda10.2/Dockerfile
@ -1,7 +1,7 @@
 FROM nvidia/cuda:10.2-devel-centos7
 LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"

-RUN yum install cmake make gcc gcc-c++ -y
+RUN yum install make gcc gcc-c++ curl cuda-compat-10-2 -y
 RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | bash
 RUN yum install git git-lfs -y

--- a/.gitlab/ci/docker/rhel8/cuda10.2/Dockerfile
+++ b/.gitlab/ci/docker/rhel8/cuda10.2/Dockerfile
@ -1,7 +1,7 @@
 FROM nvidia/cuda:10.2-devel-ubi8
 LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"

-RUN yum install make gcc gcc-c++ curl -y
+RUN yum install make gcc gcc-c++ curl cuda-compat-10-2 -y
 RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | bash
 RUN yum install git git-lfs -y

--- a/.gitlab/ci/docker/ubuntu1804/cuda10.1/Dockerfile
+++ b/.gitlab/ci/docker/ubuntu1804/cuda10.1/Dockerfile
@ -5,6 +5,7 @@ LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"
 RUN apt-get update && apt-get install -y --no-install-recommends \
      curl \
      g++ \
+      clang-8 \
      git \
      git-lfs \
      libmpich-dev \
--- a/.gitlab/ci/docker/ubuntu1804/kokkos-cuda/Dockerfile
+++ b/.gitlab/ci/docker/ubuntu1804/kokkos-cuda/Dockerfile
@ -0,0 +1,47 @@
+FROM nvidia/cuda:11.0-devel-ubuntu18.04
+LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"
+
+# Base dependencies for building VTK-m projects
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      curl \
+      g++ \
+      git \
+      git-lfs \
+      ninja-build \
+      && \
+    rm -rf /var/lib/apt/lists/*
+
+# Need to run git-lfs install manually on ubuntu based images when using the
+# system packaged version
+RUN git-lfs install
+
+# kokkos backend requires cmake 3.18
+RUN mkdir /opt/cmake/ && \
+    curl -L https://github.com/Kitware/CMake/releases/download/v3.18.1/cmake-3.18.1-Linux-x86_64.sh > cmake-3.18.1-Linux-x86_64.sh && \
+    sh cmake-3.18.1-Linux-x86_64.sh --prefix=/opt/cmake/ --exclude-subdir --skip-license && \
+    rm cmake-3.18.1-Linux-x86_64.sh && \
+    ln -s /opt/cmake/bin/ctest /opt/cmake/bin/ctest-latest
+
+ENV PATH "/opt/cmake/bin:${PATH}"
+
+# Build and install Kokkos
+RUN mkdir -p /opt/kokkos/build && \
+    cd /opt/kokkos/build && \
+    curl -L https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz > kokkos-3.1.01.tar.gz && \
+    tar -xf kokkos-3.1.01.tar.gz && \
+    mkdir bld && cd bld && \
+    CXX=/opt/kokkos/build/kokkos-3.1.01/bin/nvcc_wrapper \
+    cmake -B . -S ../kokkos-3.1.01 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_INSTALL_PREFIX=/opt/kokkos \
+          -DCMAKE_CXX_FLAGS=-fPIC \
+          -DCMAKE_CXX_STANDARD=14 \
+          -DKokkos_ENABLE_CUDA=ON \
+          -DKokkos_ENABLE_CUDA_CONSTEXPR=ON \
+          -DKokkos_ENABLE_CUDA_LAMBDA=ON \
+          -DKokkos_ENABLE_CUDA_LDG_INTRINSIC=ON \
+          -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \
+          -DKokkos_ENABLE_CUDA_UVM=ON \
+          -DKokkos_ARCH_TURING75=ON && \
+    cmake --build . -j 8 && \
+    cmake --install .
--- a/.gitlab/ci/docker/ubuntu2004/kokkos/Dockerfile
+++ b/.gitlab/ci/docker/ubuntu2004/kokkos/Dockerfile
@ -0,0 +1,41 @@
+FROM ubuntu:20.04
+LABEL maintainer "Sujin Philip<sujin.philip@kitware.com>"
+
+# Base dependencies for building VTK-m projects
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+      cmake \
+      curl \
+      g++ \
+      git \
+      git-lfs \
+      libmpich-dev \
+      libomp-dev \
+      mpich \
+      ninja-build \
+      rsync \
+      ssh \
+      software-properties-common
+
+# Need to run git-lfs install manually on ubuntu based images when using the
+# system packaged version
+RUN git-lfs install
+
+# Provide CMake 3.17 so we can re-run tests easily
+# This will be used when we run just the tests
+RUN mkdir /opt/cmake/ && \
+    curl -L https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3-Linux-x86_64.sh > cmake-3.17.3-Linux-x86_64.sh && \
+    sh cmake-3.17.3-Linux-x86_64.sh --prefix=/opt/cmake/ --exclude-subdir --skip-license && \
+    rm cmake-3.17.3-Linux-x86_64.sh && \
+    ln -s /opt/cmake/bin/ctest /opt/cmake/bin/ctest-latest
+
+ENV PATH "${PATH}:/opt/cmake/bin"
+
+# Build and install Kokkos
+RUN mkdir -p /opt/kokkos/build && \
+    cd /opt/kokkos/build && \
+    curl -L https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz > kokkos-3.1.01.tar.gz && \
+    tar -xf kokkos-3.1.01.tar.gz && \
+    mkdir bld && cd bld && \
+    cmake -GNinja -DCMAKE_INSTALL_PREFIX=/opt/kokkos -DCMAKE_CXX_FLAGS=-fPIC -DKokkos_ENABLE_SERIAL=ON ../kokkos-3.1.01 &&\
+    ninja all && \
+    ninja install
--- a/.gitlab/ci/docker/update_all.sh
+++ b/.gitlab/ci/docker/update_all.sh
@ -34,10 +34,18 @@ cd ubuntu1804/cuda10.1
 sudo docker build -t kitware/vtkm:ci-ubuntu1804_cuda10.1-$date .
 cd ../..

+cd ubuntu1804/kokkos-cuda
+sudo docker build -t kitware/vtkm:ci-ubuntu1804_cuda11_kokkos-$date .
+cd ../..
+
 cd ubuntu2004/doxygen/
 sudo docker build -t kitware/vtkm:ci-doxygen-$date .
 cd ../..

+cd ubuntu2004/kokkos
+sudo docker build -t kitware/vtkm:ci-ubuntu2004_kokkos-$date .
+cd ../..
+
 # sudo docker login --username=<docker_hub_name>
 sudo docker push kitware/vtkm
 sudo docker system prune
--- a/.gitlab/ci/doxygen.yml
+++ b/.gitlab/ci/doxygen.yml
@ -25,10 +25,9 @@ doxygen:
    - "cmake -V -P .gitlab/ci/config/gitlab_ci_setup.cmake"
    - "ctest -VV -S .gitlab/ci/ctest_configure.cmake"
  script:
-    - eval `ssh-agent -s`
-    - ssh-add <(echo "$DOC_API_KEY_BASE64" | base64 --decode)
    - doxygen build/docs/doxyfile
-    - rsync -tv --recursive --delete -e "ssh -o StrictHostKeyChecking=no" build/docs/doxygen/html/ vtkm.documentation
+    - chmod 400 $DOC_KEY_FILE
+    - rsync -tv --recursive --delete -e "ssh -i $DOC_KEY_FILE -o StrictHostKeyChecking=no" build/docs/doxygen/html/ kitware@public.kitware.com:vtkm_documentation/
  variables:
    CMAKE_BUILD_TYPE: Release
    VTKM_SETTINGS: "tbb+openmp+mpi+shared+docs"
--- a/.gitlab/ci/ubuntu1604.yml
+++ b/.gitlab/ci/ubuntu1604.yml
@ -7,6 +7,7 @@ build:ubuntu1604_gcc5:
    - vtkm
    - docker
    - linux
+    - cuda-rt
    - large-memory
  extends:
    - .ubuntu1604_cuda
@ -16,41 +17,24 @@ build:ubuntu1604_gcc5:
    CC: "gcc-5"
    CXX: "g++-5"
    CMAKE_BUILD_TYPE: RelWithDebInfo
-    VTKM_SETTINGS: "cuda+pascal"
+    VTKM_SETTINGS: "cuda+pascal+no_virtual"

-# Temporarily disabled as we don't have a pascal hw gitlab-runner
-# test:ubuntu1604_gcc5:
-#   tags:
-#     - test
-#     - cuda-rt
-#     - pascal
-#     - vtkm
-#     - docker
-#     - linux
-#   extends:
-#     - .ubuntu1604_cuda
-#     - .cmake_test_linux
-#     - .only-default
-#   dependencies:
-#     - build:ubuntu1604_gcc5
-#   needs:
-#     - build:ubuntu1604_gcc5
-# test:ubuntu1804_test_ubuntu1604_gcc5:
-#   tags:
-#     - test
-#     - cuda-rt
-#     - pascal
-#     - vtkm
-#     - docker
-#     - linux
-#   extends:
-#     - .ubuntu1804_cuda
-#     - .cmake_test_linux
-#     - .only-default
-#   dependencies:
-#     - build:ubuntu1604_gcc5
-#   needs:
-#     - build:ubuntu1604_gcc5
+test:ubuntu1604_gcc5:
+  tags:
+    - test
+    - vtkm
+    - docker
+    - linux
+    - cuda-rt
+    - pascal
+  extends:
+    - .ubuntu1604_cuda
+    - .cmake_test_linux
+    - .only-default
+  dependencies:
+    - build:ubuntu1604_gcc5
+  needs:
+    - build:ubuntu1604_gcc5

 # Build on ubuntu1704 with OpenMP + CUDA
 # Runs only on nightlies
@ -60,6 +44,7 @@ build:ubuntu1604_gcc5_2:
    - vtkm
    - docker
    - linux
+    - cuda-rt
    - large-memory
  extends:
    - .ubuntu1604_cuda
@ -71,6 +56,25 @@ build:ubuntu1604_gcc5_2:
    CMAKE_BUILD_TYPE: Release
    VTKM_SETTINGS: "openmp+cuda+pascal+examples"

+test:ubuntu1804_test_ubuntu1604_gcc5_2:
+  tags:
+    - test
+    - vtkm
+    - docker
+    - linux
+    - cuda-rt
+    - pascal
+  extends:
+    - .ubuntu1804_cuda
+    - .cmake_test_linux
+    - .only-master
+  variables:
+      CTEST_EXCLUSIONS: "built_against_test_install"
+  dependencies:
+    - build:ubuntu1604_gcc5_2
+  needs:
+    - build:ubuntu1604_gcc5_2
+
 # Build on ubuntu1604 with mpi + tbb and test on ubuntu1604
 # Uses gcc 4.8
 # Uses OpenMPI
--- a/.gitlab/ci/ubuntu1804.yml
+++ b/.gitlab/ci/ubuntu1804.yml
@ -46,6 +46,7 @@ build:ubuntu1804_gcc7:
    - vtkm
    - docker
    - linux
+    - cuda-rt
    - large-memory
  extends:
    - .ubuntu1804_cuda
@ -54,16 +55,16 @@ build:ubuntu1804_gcc7:
  variables:
    CC: "gcc-7"
    CXX: "g++-7"
-    VTKM_SETTINGS: "cuda+turing+mpi+64bit_floats"
+    VTKM_SETTINGS: "cuda+turing+mpi+64bit_floats+no_virtual"

 test:ubuntu1804_gcc7:
  tags:
    - test
-    - cuda-rt
-    - turing
    - vtkm
    - docker
    - linux
+    - cuda-rt
+    - turing
  extends:
    - .ubuntu1804_cuda
    - .cmake_test_linux
@ -74,42 +75,45 @@ test:ubuntu1804_gcc7:
    - build:ubuntu1804_gcc7


-# Build on ubuntu1804 with OpenMP and test on ubuntu1804
-# Uses gcc 7.4
+# Build on ubuntu1804 with CUDA+TBB and test on ubuntu1804
+# Uses clang as CUDA host compiler
 # Runs only on nightlies
-build:ubuntu1804_gcc7_2:
+build:ubuntu1804_clang_cuda:
  tags:
    - build
    - vtkm
    - docker
    - linux
+    - cuda-rt
+    - large-memory
  extends:
-    - .ubuntu1804
+    - .ubuntu1804_cuda
    - .cmake_build_linux
-    - .only-master
+    - .only-default
+    # - .only-master
  variables:
-    CC: "gcc-7"
-    CXX: "g++-7"
-    VTKM_SETTINGS: "openmp+shared+examples"
+    CC: "clang-8"
+    CXX: "clang++-8"
+    CUDAHOSTCXX: "clang++-8"
+    VTKM_SETTINGS: "cuda+pascal+tbb+static+examples"

-test:ubuntu1804_gcc7_2:
+test:ubuntu1804_clang_cuda:
  tags:
    - test
    - vtkm
    - docker
    - linux
+    - cuda-rt
+    - pascal
  extends:
-    - .ubuntu1804
+    - .ubuntu1804_cuda
    - .cmake_test_linux
-    - .only-master
-  variables:
-    #Restrict OpenMP number of threads since multiple test stages
-    #execute on the same hardware concurrently
-    OMP_NUM_THREADS: 4
+    - .only-default
+    # - .only-master
  dependencies:
-    - build:ubuntu1804_gcc7_2
+    - build:ubuntu1804_clang_cuda
  needs:
-    - build:ubuntu1804_gcc7_2
+    - build:ubuntu1804_clang_cuda

 # Build on ubuntu1804 with OpenMP and test on ubuntu1804
 # Uses gcc 6.5
@ -179,3 +183,41 @@ test:ubuntu1804_clang8:
    - build:ubuntu1804_clang8
  needs:
    - build:ubuntu1804_clang8
+
+# Build on ubuntu1804 with kokkos and test on ubuntu1804
+# Uses CUDA 11
+build:ubuntu1804_kokkos:
+  tags:
+    - build
+    - vtkm
+    - docker
+    - linux
+    - cuda-rt
+    - large-memory
+  extends:
+    - .ubuntu1804_cuda_kokkos
+    - .cmake_build_linux
+    - .only-default
+  variables:
+    CMAKE_GENERATOR: "Ninja"
+    CMAKE_BUILD_TYPE: Release
+    VTKM_SETTINGS: "kokkos+turing+static+64bit_floats"
+
+test:ubuntu1804_kokkos:
+  tags:
+    - test
+    - vtkm
+    - docker
+    - linux
+    - cuda-rt
+    - turing
+  extends:
+    - .ubuntu1804_cuda_kokkos
+    - .cmake_test_linux
+    - .only-default
+  dependencies:
+    - build:ubuntu1804_kokkos
+  needs:
+    - build:ubuntu1804_kokkos
+  variables:
+    CUDA_LAUNCH_BLOCKING: "1"
--- a/.gitlab/ci/ubuntu2004.yml
+++ b/.gitlab/ci/ubuntu2004.yml
@ -0,0 +1,28 @@
+build:ubuntu2004_kokkos:
+  tags:
+    - build
+    - vtkm
+    - docker
+    - linux
+  extends:
+    - .ubuntu2004_kokkos
+    - .cmake_build_linux
+    - .only-default
+  variables:
+    CMAKE_BUILD_TYPE: RelWithDebInfo
+    VTKM_SETTINGS: "kokkos+shared+64bit_floats"
+
+test:ubuntu2004_kokkos:
+  tags:
+    - test
+    - vtkm
+    - docker
+    - linux
+  extends:
+    - .ubuntu2004_kokkos
+    - .cmake_test_linux
+    - .only-default
+  dependencies:
+    - build:ubuntu2004_kokkos
+  needs:
+    - build:ubuntu2004_kokkos
--- a/CMake/VTKmCMakeBackports.cmake
+++ b/CMake/VTKmCMakeBackports.cmake
@ -0,0 +1,23 @@
+##============================================================================
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##============================================================================
+
+file(GLOB cmake_version_backports
+  LIST_DIRECTORIES true
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}/patches"
+  "${CMAKE_CURRENT_LIST_DIR}/patches/*")
+
+foreach (cmake_version_backport IN LISTS cmake_version_backports)
+  if (NOT IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/patches/${cmake_version_backport}")
+    continue ()
+  endif ()
+  if (CMAKE_VERSION VERSION_LESS "${cmake_version_backport}")
+    list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_LIST_DIR}/patches/${cmake_version_backport}")
+  endif ()
+endforeach ()
--- a/CMake/VTKmCompilerFlags.cmake
+++ b/CMake/VTKmCompilerFlags.cmake
@ -22,6 +22,8 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  set(VTKM_COMPILER_IS_CLANG 1)
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  set(VTKM_COMPILER_IS_GNU 1)
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XLClang")
+  set(VTKM_COMPILER_IS_XL 1)
 endif()

 #-----------------------------------------------------------------------------
@ -51,7 +53,7 @@ if(VTKM_COMPILER_IS_MSVC)
  if(TARGET vtkm::cuda)
    target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/Gy">)
  endif()
-elseif(NOT VTKM_COMPILER_IS_PGI) #can't find an equivalant PGI flag
+elseif(NOT (VTKM_COMPILER_IS_PGI OR VTKM_COMPILER_IS_XL)) #can't find an equivalent PGI/XL flag
  target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-ffunction-sections>)
  if(TARGET vtkm::cuda)
    target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-ffunction-sections>)
@ -122,8 +124,8 @@ elseif(VTKM_COMPILER_IS_ICC)
  target_compile_options(vtkm_developer_flags INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-wd1478 -wd13379>)

 elseif(VTKM_COMPILER_IS_GNU OR VTKM_COMPILER_IS_CLANG)
-  set(cxx_flags -Wall -Wcast-align -Wchar-subscripts -Wextra -Wpointer-arith -Wformat -Wformat-security -Wshadow -Wunused -fno-common)
-  set(cuda_flags -Xcompiler=-Wall,-Wno-unknown-pragmas,-Wno-unused-local-typedefs,-Wno-unused-local-typedefs,-Wno-unused-function,-Wcast-align,-Wchar-subscripts,-Wpointer-arith,-Wformat,-Wformat-security,-Wshadow,-Wunused,-fno-common)
+  set(cxx_flags -Wall -Wcast-align -Wchar-subscripts -Wextra -Wpointer-arith -Wformat -Wformat-security -Wshadow -Wunused -fno-common -Wno-unused-function)
+  set(cuda_flags -Xcompiler=-Wall,-Wcast-align,-Wchar-subscripts,-Wpointer-arith,-Wformat,-Wformat-security,-Wshadow,-fno-common,-Wunused,-Wno-unknown-pragmas,-Wno-unused-local-typedefs,-Wno-unused-function)

  #Only add float-conversion warnings for gcc as the integer warnigns in GCC
  #include the implicit casting of all types smaller than int to ints.
@ -161,17 +163,21 @@ elseif(VTKM_COMPILER_IS_GNU OR VTKM_COMPILER_IS_CLANG)
  endif()
 endif()

-#common warnings for all platforms when building cuda
-if(TARGET vtkm::cuda)
+function(setup_cuda_flags)
  if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
    #nvcc 9 introduced specific controls to disable the stack size warning
    #otherwise we let the warning occur. We have to set this in CMAKE_CUDA_FLAGS
    #as it is passed to the device link step, unlike compile_options
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xnvlink=--suppress-stack-size-warning")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xnvlink=--suppress-stack-size-warning" PARENT_SCOPE)
  endif()

  set(display_error_nums -Xcudafe=--display_error_number)
  target_compile_options(vtkm_developer_flags INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:${display_error_nums}>)
+endfunction()
+
+#common warnings for all platforms when building cuda
+if ((TARGET vtkm::cuda) OR (TARGET vtkm::kokkos_cuda))
+  setup_cuda_flags()
 endif()

 if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
--- a/CMake/VTKmConfig.cmake.in
+++ b/CMake/VTKmConfig.cmake.in
@ -39,6 +39,7 @@
 #  VTKm_ENABLE_CUDA           Will be enabled if VTK-m was built with CUDA support
 #  VTKm_ENABLE_TBB            Will be enabled if VTK-m was built with TBB support
 #  VTKm_ENABLE_OPENMP         Will be enabled if VTK-m was built with OpenMP support
+#  VTKm_ENABLE_KOKKOS         Will be enabled if VTK-m was built with Kokkos support
 #  VTKm_ENABLE_LOGGING        Will be enabled if VTK-m was built with logging support
 #  VTKm_ENABLE_MPI            Will be enabled if VTK-m was built with MPI support
 #  VTKm_ENABLE_RENDERING      Will be enabled if VTK-m was built with rendering support
@ -67,8 +68,9 @@ set(VTKm_VERSION "@VTKm_VERSION@")

 set(VTKm_BUILD_SHARED_LIBS "@VTKm_BUILD_SHARED_LIBS@")
 set(VTKm_ENABLE_CUDA "@VTKm_ENABLE_CUDA@")
-set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@")
+set(VTKm_ENABLE_KOKKOS "@VTKm_ENABLE_KOKKOS@")
 set(VTKm_ENABLE_OPENMP "@VTKm_ENABLE_OPENMP@")
+set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@")
 set(VTKm_ENABLE_LOGGING "@VTKm_ENABLE_LOGGING@")
 set(VTKm_ENABLE_RENDERING "@VTKm_ENABLE_RENDERING@")
 set(VTKm_ENABLE_GL_CONTEXT "@VTKm_ENABLE_GL_CONTEXT@")
@ -101,6 +103,12 @@ endif()
 if(VTKm_ENABLE_CUDA AND VTKM_FROM_INSTALL_DIR)
  set_target_properties(vtkm::cuda PROPERTIES cuda_architecture_flags "@VTKm_CUDA_Architecture_Flags@")
  set_target_properties(vtkm::cuda PROPERTIES requires_static_builds TRUE)
+
+  # If VTK-m is built with 3.18+ and the consumer is < 3.18 we need to drop
+  # these properties as they break the VTK-m cuda flag logic
+  if(CMAKE_VERSION VERSION_LESS 3.18)
+    set_target_properties(vtkm::cuda PROPERTIES INTERFACE_LINK_OPTIONS "")
+  endif()
 endif()

 # VTKm requires some CMake Find modules not included with CMake, so
--- a/CMake/VTKmDeviceAdapters.cmake
+++ b/CMake/VTKmDeviceAdapters.cmake
@ -127,10 +127,13 @@ if(VTKm_ENABLE_CUDA)
      requires_static_builds TRUE
    )

+    target_compile_options(vtkm_cuda INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)

-    set_target_properties(vtkm_cuda PROPERTIES
-      INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-    )
+    if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
+      CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.0)
+      # CUDA 11+ deprecated C++11 support
+      target_compile_features(vtkm_cuda INTERFACE cxx_std_14)
+    endif()

    # add the -gencode flags so that all cuda code
    # way compiled properly
@ -241,13 +244,103 @@ if(VTKm_ENABLE_CUDA)
    endif()

    string(REPLACE ";" " " arch_flags "${arch_flags}")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${arch_flags}")
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+      #We propagate cuda flags via target* options so that they
+      #export cleanly
+      set(CMAKE_CUDA_ARCHITECTURES OFF)
+      target_compile_options(vtkm_cuda INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:${arch_flags}>)
+      target_link_options(vtkm_cuda INTERFACE $<DEVICE_LINK:${arch_flags}>)
+    else()
+      # Before 3.18 we had to use CMAKE_CUDA_FLAGS as we had no way
+      # to propagate flags to the device link step
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${arch_flags}")
+    endif()

    # This needs to be lower-case for the property to be properly exported
    # CMake 3.15 we can add `cuda_architecture_flags` to the EXPORT_PROPERTIES
    # target property to have this automatically exported for us
-    set_target_properties(vtkm_cuda PROPERTIES cuda_architecture_flags "${arch_flags}")
    set(VTKm_CUDA_Architecture_Flags "${arch_flags}")
+    set_target_properties(vtkm_cuda PROPERTIES cuda_architecture_flags "${arch_flags}")
+    unset(arch_flags)
+  endif()
+endif()
+
+#-----------------------------------------------------------------------------
+# Kokkos with its Cuda backend enabled expects everything to be compiled using its
+# `nvcc_wrapper` as the CXX compiler. As the name suggests, nvcc_wrapper is a wrapper around
+# Cuda's nvcc compiler. Kokkos targets have all of the flags meant for the nvcc compiler set as the
+# CXX compiler flags. This function changes all such flags to be CUDA flags so that we can use
+# CMake and vtk-m's existing infrastructure to compile for Cuda and Host separately. Without this
+# all of the files will be compiled using nvcc which can be very time consuming. It can also have
+# issues with calling host functions from device functions when compiling code for other backends.
+function(kokkos_fix_compile_options)
+  set(targets Kokkos::kokkos) # work queue of targets whose link libraries still need visiting
+  set(seen_targets)
+  set(cuda_arch)
+
+  while(targets) # pop the front target, visit its INTERFACE_LINK_LIBRARIES, queue new targets
+    list(GET targets 0 target_name)
+    list(REMOVE_AT targets 0)
+
+    get_target_property(link_libraries ${target_name} INTERFACE_LINK_LIBRARIES)
+    foreach(lib_target IN LISTS link_libraries)
+      if (TARGET ${lib_target})
+        if (lib_target IN_LIST seen_targets)
+          continue()
+        endif()
+
+        list(APPEND seen_targets ${lib_target})
+        list(APPEND targets ${lib_target})
+        get_target_property(compile_options ${lib_target} INTERFACE_COMPILE_OPTIONS)
+        if (compile_options)
+          string(REGEX MATCH "[$]<[$]<COMPILE_LANGUAGE:CXX>:-Xcompiler;.*>" cxx_compile_options "${compile_options}")
+          string(REGEX MATCH "-arch=sm_[0-9][0-9]" cuda_arch "${compile_options}")
+          string(REPLACE "-Xcompiler;" "" cxx_compile_options "${cxx_compile_options}")
+          list(TRANSFORM compile_options REPLACE "--relocatable-device-code=true" "") #We use CMake for this flag
+          list(TRANSFORM compile_options REPLACE "COMPILE_LANGUAGE:CXX" "COMPILE_LANGUAGE:CUDA")
+          list(APPEND compile_options "${cxx_compile_options}") # host flags, with -Xcompiler stripped, stay on CXX
+          set_property(TARGET ${lib_target} PROPERTY INTERFACE_COMPILE_OPTIONS ${compile_options})
+        endif()
+
+        set_property(TARGET ${lib_target} PROPERTY INTERFACE_LINK_OPTIONS "")
+      endif()
+    endforeach()
+  endwhile()
+
+  set_property(TARGET vtkm::kokkos PROPERTY INTERFACE_LINK_OPTIONS "$<DEVICE_LINK:${cuda_arch}>")
+  if (OPENMP IN_LIST Kokkos_DEVICES)
+    set_property(TARGET vtkm::kokkos APPEND PROPERTY INTERFACE_LINK_OPTIONS "$<HOST_LINK:-fopenmp>") # APPEND keeps the DEVICE_LINK arch option set above
+  endif()
+endfunction()
+
+if(VTKm_ENABLE_KOKKOS AND NOT TARGET vtkm::kokkos)
+  cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
+
+  find_package(Kokkos REQUIRED)
+  if (CUDA IN_LIST Kokkos_DEVICES)
+    cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+    enable_language(CUDA)
+
+    if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND
+       CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0" AND
+       CMAKE_BUILD_TYPE STREQUAL "Release")
+      message(WARNING "There is a known issue with Cuda 10 and -O3 optimization. Switching to -O2. Please refer to issue #555.")
+      string(REPLACE "-O3" "-O2" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") # quoted so an empty value is still passed as the input argument
+      string(REPLACE "-O3" "-O2" CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}") # rewrite the CUDA flags themselves rather than replacing them with the CXX flags
+    endif()
+
+    string(REGEX MATCH "[0-9][0-9]$" cuda_arch ${Kokkos_ARCH})
+    set(CMAKE_CUDA_ARCHITECTURES ${cuda_arch})
+    message(STATUS "Detected Cuda arch from Kokkos: ${cuda_arch}")
+
+    add_library(vtkm::kokkos_cuda INTERFACE IMPORTED GLOBAL) # its existence marks the Kokkos CUDA backend for later TARGET checks
+  endif()
+
+  add_library(vtkm::kokkos INTERFACE IMPORTED GLOBAL)
+  set_target_properties(vtkm::kokkos PROPERTIES INTERFACE_LINK_LIBRARIES "Kokkos::kokkos")
+
+  if (TARGET vtkm::kokkos_cuda)
+    kokkos_fix_compile_options()
+  endif()
+endif()

--- a/CMake/VTKmMPI.cmake
+++ b/CMake/VTKmMPI.cmake
@ -1,24 +0,0 @@
-##============================================================================
-##  Copyright (c) Kitware, Inc.
-##  All rights reserved.
-##  See LICENSE.txt for details.
-##
-##  This software is distributed WITHOUT ANY WARRANTY; without even
-##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
-##  PURPOSE.  See the above copyright notice for more information.
-##============================================================================
-
-if(VTKm_ENABLE_MPI AND NOT TARGET MPI::MPI_CXX)
-  if(CMAKE_VERSION VERSION_LESS 3.15)
-    #While CMake 3.10 introduced the new MPI module.
-    #Fixes related to MPI+CUDA that VTK-m needs are
-    #only found in CMake 3.15+.
-    find_package(MPI REQUIRED MODULE)
-  else()
-    #clunky but we need to make sure we use the upstream module if it exists
-    set(orig_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH})
-    set(CMAKE_MODULE_PATH "")
-    find_package(MPI REQUIRED MODULE)
-    set(CMAKE_MODULE_PATH ${orig_CMAKE_MODULE_PATH})
-  endif()
-endif()
--- a/CMake/VTKmWrappers.cmake
+++ b/CMake/VTKmWrappers.cmake
@ -10,9 +10,13 @@

 include(CMakeParseArguments)

+include(VTKmCMakeBackports)
 include(VTKmDeviceAdapters)
 include(VTKmCPUVectorization)
-include(VTKmMPI)
+
+if(VTKm_ENABLE_MPI AND NOT TARGET MPI::MPI_CXX)
+  find_package(MPI REQUIRED MODULE)
+endif()

 #-----------------------------------------------------------------------------
 # INTERNAL FUNCTIONS
@ -62,7 +66,7 @@ function(vtkm_generate_export_header lib_name)

  # Now generate a header that holds the macros needed to easily export
  # template classes. This
-  string(TOUPPER ${kit_name} BASE_NAME_UPPER)
+  string(TOUPPER ${lib_name} BASE_NAME_UPPER)
  set(EXPORT_MACRO_NAME "${BASE_NAME_UPPER}")

  set(EXPORT_IS_BUILT_STATIC 0)
@ -77,17 +81,17 @@ function(vtkm_generate_export_header lib_name)
  if(NOT EXPORT_IMPORT_CONDITION)
    #set EXPORT_IMPORT_CONDITION to what the DEFINE_SYMBOL would be when
    #building shared
-    set(EXPORT_IMPORT_CONDITION ${kit_name}_EXPORTS)
+    set(EXPORT_IMPORT_CONDITION ${lib_name}_EXPORTS)
  endif()


  configure_file(
      ${VTKm_SOURCE_DIR}/CMake/VTKmExportHeaderTemplate.h.in
-      ${VTKm_BINARY_DIR}/include/${dir_prefix}/${kit_name}_export.h
+      ${VTKm_BINARY_DIR}/include/${dir_prefix}/${lib_name}_export.h
    @ONLY)

  if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
-    install(FILES ${VTKm_BINARY_DIR}/include/${dir_prefix}/${kit_name}_export.h
+    install(FILES ${VTKm_BINARY_DIR}/include/${dir_prefix}/${lib_name}_export.h
      DESTINATION ${VTKm_INSTALL_INCLUDE_DIR}/${dir_prefix}
      )
  endif()
@ -146,9 +150,14 @@ endfunction()
 # Pass to consumers extra compile flags they need to add to CMAKE_CUDA_FLAGS
 # to have CUDA compatibility.
 #
-# This is required as currently the -sm/-gencode flags when specified inside
-# COMPILE_OPTIONS / target_compile_options are not propagated to the device
-# linker. Instead they must be specified in CMAKE_CUDA_FLAGS
+# If VTK-m was built with CMake 3.18+ and you are using CMake 3.18+ and have
+# a cmake_minimum_required of 3.18 or have set policy CMP0105 to new, this will
+# return an empty string as the `vtkm::cuda` target will correctly propagate
+# all the necessary flags.
+#
+# This is required for CMake < 3.18 as they don't support the `$<DEVICE_LINK>`
+# generator expression for `target_link_options`. Instead they need to be
+# specified in CMAKE_CUDA_FLAGS
 #
 #
 # add_library(lib_that_uses_vtkm ...)
@ -156,7 +165,18 @@ endfunction()
 # target_link_libraries(lib_that_uses_vtkm PRIVATE vtkm_filter)
 #
 function(vtkm_get_cuda_flags settings_var)
+
  if(TARGET vtkm::cuda)
+    if(POLICY CMP0105)
+      # With CMP0105 set to NEW, the device link step honors the link options
+      # stored on `vtkm::cuda`, so no extra flags need to be handed back and
+      # `settings_var` is left untouched.
+      cmake_policy(GET CMP0105 does_device_link)
+      get_property(arch_flags
+        TARGET vtkm::cuda
+        PROPERTY INTERFACE_LINK_OPTIONS)
+      # Test the queried policy state, not the policy name itself.
+      if(arch_flags AND does_device_link STREQUAL "NEW")
+        return()
+      endif()
+    endif()
+
    get_property(arch_flags
      TARGET    vtkm::cuda
      PROPERTY  cuda_architecture_flags)
@ -232,8 +252,14 @@ endfunction()
 #
 #
 #  MODIFY_CUDA_FLAGS: If enabled will add the required -arch=<ver> flags
-#  that VTK-m was compiled with. If you have multiple libraries that use
-#  VTK-m calling `vtkm_add_target_information` multiple times with
+#  that VTK-m was compiled with.
+#
+#  If VTK-m was built with CMake 3.18+ and you are using CMake 3.18+ and have
+#  a cmake_minimum_required of 3.18 or have set policy CMP0105 to new, this will
+#  return an empty string as the `vtkm::cuda` target will correctly propagate
+#  all the necessary flags.
+#
+#  Note: calling `vtkm_add_target_information` multiple times with
 #  `MODIFY_CUDA_FLAGS` will cause duplicate compiler flags. To resolve this issue
 #  you can; pass all targets and sources to a single `vtkm_add_target_information`
 #  call, have the first one use `MODIFY_CUDA_FLAGS`, or use the provided
@ -275,10 +301,11 @@ function(vtkm_add_target_information uses_vtkm_target)
    ${ARGN}
    )

-
  if(VTKm_TI_MODIFY_CUDA_FLAGS)
-    vtkm_get_cuda_flags(CMAKE_CUDA_FLAGS)
-    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} PARENT_SCOPE)
+    vtkm_get_cuda_flags(cuda_flags)
+    if(cuda_flags)
+      set(CMAKE_CUDA_FLAGS ${cuda_flags} PARENT_SCOPE)
+    endif()
  endif()

  set(targets ${uses_vtkm_target})
@ -291,6 +318,8 @@ function(vtkm_add_target_information uses_vtkm_target)
  # set the required target properties
  set_target_properties(${targets} PROPERTIES POSITION_INDEPENDENT_CODE ON)
  set_target_properties(${targets} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  # CUDA_ARCHITECTURES added in CMake 3.18
+  set_target_properties(${targets} PROPERTIES CUDA_ARCHITECTURES OFF)

  if(VTKm_TI_DROP_UNUSED_SYMBOLS)
    foreach(target IN LISTS targets)
@ -305,11 +334,16 @@ function(vtkm_add_target_information uses_vtkm_target)
  #
  # This is required as CUDA currently doesn't support device side calls across
  # dynamic library boundaries.
-  if(TARGET vtkm::cuda)
+  if((TARGET vtkm::cuda) OR (TARGET vtkm::kokkos_cuda))
    set_source_files_properties(${VTKm_TI_DEVICE_SOURCES} PROPERTIES LANGUAGE "CUDA")
    foreach(target IN LISTS targets)
      get_target_property(lib_type ${target} TYPE)
-      get_target_property(requires_static vtkm::cuda requires_static_builds)
+      if (TARGET vtkm::cuda)
+        get_target_property(requires_static vtkm::cuda requires_static_builds)
+      endif()
+      if (TARGET vtkm::kokkos)
+        get_target_property(requires_static vtkm::kokkos requires_static_builds)
+      endif()

      if(requires_static AND ${lib_type} STREQUAL "SHARED_LIBRARY" AND VTKm_TI_EXTENDS_VTKM)
        #We provide different error messages based on if we are building VTK-m
--- a/CMake/patches/3.15/FindMPI.cmake
+++ b/CMake/patches/3.15/FindMPI.cmake
--- a/CMake/patches/README.md
+++ b/CMake/patches/README.md
@ -0,0 +1,7 @@
+# CMake backports
+
+This directory contains backports from newer CMake versions to help support
+actually using older CMake versions for building VTK-m. The directory name is the
+minimum version of CMake for which the contained files are no longer necessary.
+For example, the files under the `3.15` directory are not needed for 3.15 or
+3.16, but are for 3.14.
--- a/CMake/testing/VTKmCheckPyexpander.cmake
+++ b/CMake/testing/VTKmCheckPyexpander.cmake
@ -37,11 +37,19 @@ if(NOT GENERATED_FILE)
  return()
 endif()

-execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${PYEXPANDER_COMMAND} ${SOURCE_FILE}.in
-  RESULT_VARIABLE pyexpander_result
-  OUTPUT_VARIABLE pyexpander_output
+if(MSVC)
+  execute_process(
+    COMMAND ${PYTHON_EXECUTABLE} ${PYEXPANDER_COMMAND} ${SOURCE_FILE}.in
+    RESULT_VARIABLE pyexpander_result
+    OUTPUT_VARIABLE pyexpander_output
  )
+else()
+  execute_process(
+    COMMAND ${PYEXPANDER_COMMAND} ${SOURCE_FILE}.in
+    RESULT_VARIABLE pyexpander_result
+    OUTPUT_VARIABLE pyexpander_output
+  )
+endif()

 if(pyexpander_result)
  # If pyexpander returned non-zero, it failed.
--- a/CMake/testing/VTKmCheckSourceInInstall.cmake
+++ b/CMake/testing/VTKmCheckSourceInInstall.cmake
@ -110,8 +110,15 @@ function(do_verify root_dir prefix)
    )

  set(file_exceptions
-    cont/ColorTablePrivate.hxx
    thirdparty/diy/vtkmdiy/cmake/mpi_types.h
+
+    # Ignore deprecated virtual classes (which are not installed if VTKm_NO_DEPRECATED_VIRTUAL
+    # is on). These exceptions can be removed when these files are completely removed.
+    cont/ArrayHandleVirtual.h
+    cont/ArrayHandleVirtual.hxx
+    cont/ArrayHandleVirtualCoordinates.h
+    cont/StorageVirtual.h
+    cont/StorageVirtual.hxx
    )

  #by default every header in a testing directory doesn't need to be installed
--- a/CMake/testing/VTKmTestInstall.cmake
+++ b/CMake/testing/VTKmTestInstall.cmake
@ -110,6 +110,10 @@ function(vtkm_test_against_install dir)
    )
  endif()

+  if(TARGET vtkm::kokkos)
+    list(APPEND args "-DKokkos_DIR=${Kokkos_DIR}")
+  endif()
+
  #determine if the test is expected to compile or fail to build. We use
  #this information to built the test name to make it clear to the user
  #what a 'passing' test means
--- a/CMake/testing/VTKmTestWrappers.cmake
+++ b/CMake/testing/VTKmTestWrappers.cmake
@ -27,7 +27,6 @@ function(vtkm_create_test_executable
  # for MPI tests, suffix test name and add MPI_Init/MPI_Finalize calls.
  if (is_mpi_test)
    set(extraArgs EXTRA_INCLUDE "vtkm/thirdparty/diy/environment.h")
-    set(CMAKE_TESTDRIVER_BEFORE_TESTMAIN "vtkmdiy::mpi::environment env(ac, av);")

    if (use_mpi)
      vtkm_diy_use_mpi(ON)
@ -50,7 +49,7 @@ function(vtkm_create_test_executable

  #if all backends are enabled, we can use cuda compiler to handle all possible backends.
  set(device_sources)
-  if(TARGET vtkm::cuda AND enable_all_backends)
+  if(((TARGET vtkm::cuda) OR (TARGET vtkm::kokkos_cuda)) AND enable_all_backends)
    set(device_sources ${sources})
  endif()
  vtkm_add_target_information(${prog} DEVICE_SOURCES ${device_sources})
@ -153,6 +152,13 @@ function(vtkm_unit_tests)
      #serially
      list(APPEND per_device_serial TRUE)
    endif()
+    if (VTKm_ENABLE_KOKKOS)
+      list(APPEND per_device_command_line_arguments --device=kokkos)
+      list(APPEND per_device_suffix "KOKKOS")
+      #may require more time because of kernel generation.
+      list(APPEND per_device_timeout 1500)
+      list(APPEND per_device_serial FALSE)
+    endif()
  endif()

  set(test_prog)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -8,16 +8,10 @@
 ##  PURPOSE.  See the above copyright notice for more information.
 ##============================================================================

-# If you want CUDA support, you will need to have CMake 3.9 on Linux/OSX.
-# We require CMake 3.11 with the MSVC generator as the $<COMPILE_LANGUAGE:>
-# generator expression is not supported on older versions.
+# If you want CUDA support, you will need to have CMake 3.13 on Linux/OSX.
 cmake_minimum_required(VERSION 3.12...3.15 FATAL_ERROR)
 project (VTKm)

-if(${CMAKE_GENERATOR} MATCHES "Visual Studio")
-  cmake_minimum_required(VERSION 3.12...3.15 FATAL_ERROR)
-endif()
-
 # Update module path
 set(VTKm_CMAKE_MODULE_PATH ${VTKm_SOURCE_DIR}/CMake)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${VTKm_CMAKE_MODULE_PATH})
@ -79,8 +73,9 @@ endmacro ()

 # Configurable Options
 vtkm_option(VTKm_ENABLE_CUDA "Enable Cuda support" OFF)
-vtkm_option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
+vtkm_option(VTKm_ENABLE_KOKKOS "Enable Kokkos support" OFF)
 vtkm_option(VTKm_ENABLE_OPENMP "Enable OpenMP support" OFF)
+vtkm_option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
 vtkm_option(VTKm_ENABLE_RENDERING "Enable rendering library" ON)
 vtkm_option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF)
 vtkm_option(VTKm_ENABLE_MPI "Enable MPI support" OFF)
@ -108,6 +103,17 @@ vtkm_option(VTKm_ENABLE_LOGGING "Enable VTKm Logging" ON)
 # performance.
 vtkm_option(VTKm_NO_ASSERT "Disable assertions in debugging builds." OFF)

+# The CUDA compiler (as of CUDA 11) takes a surprisingly long time to compile
+# kernels with asserts in them. By default we turn off asserts when compiling
+# for CUDA devices.
+vtkm_option(VTKm_NO_ASSERT_CUDA "Disable assertions for CUDA devices." ON)
+
+# The HIP compiler (as of ROCm 3.7) takes a surprisingly long time to compile
+# kernels with asserts in them. The asserts generate `printf` calls, which are
+# very slow (they cause massive register spillage). By default we turn off
+# asserts when compiling for HIP devices.
+vtkm_option(VTKm_NO_ASSERT_HIP "Disable assertions for HIP devices." ON)
+
 # When VTK-m is embedded into larger projects that wish to make end user
 # applications they want to only install libraries and don't want CMake/headers
 # installed.
@ -132,13 +138,22 @@ vtkm_option(VTKm_ENABLE_DEVELOPER_FLAGS "Enable compiler flags that are useful w
 # Some application might need not to install those, hence this option.
 vtkm_option(VTKm_NO_INSTALL_README_LICENSE "disable the installation of README and LICENSE files" OFF)

+# We are in the process of deprecating the use of virtual methods because they
+# are not well supported on many accelerators. Turn this option on to remove
+# the code entirely. Note that the deprecation of virtual methods is a work in
+# progress, so not all uses of virtual methods may be deprecated yet. In VTK-m
+# 2.0 virtual methods should be removed entirely and this option will be removed.
+vtkm_option(VTKm_NO_DEPRECATED_VIRTUAL "Do not compile support of deprecated virtual methods" OFF)
+
 mark_as_advanced(
  VTKm_ENABLE_LOGGING
  VTKm_NO_ASSERT
+  VTKm_NO_ASSERT_CUDA
+  VTKm_NO_ASSERT_HIP
  VTKm_INSTALL_ONLY_LIBRARIES
  VTKm_HIDE_PRIVATE_SYMBOLS
  VTKm_ENABLE_DEVELOPER_FLAGS
  VTKm_NO_INSTALL_README_LICENSE
+  VTKm_NO_DEPRECATED_VIRTUAL
  )

 #-----------------------------------------------------------------------------
@ -266,8 +281,9 @@ if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
  # Install helper configure files.
  install(
    FILES
+      ${VTKm_SOURCE_DIR}/CMake/VTKmCMakeBackports.cmake
      ${VTKm_SOURCE_DIR}/CMake/FindTBB.cmake
-      ${VTKm_SOURCE_DIR}/CMake/FindMPI.cmake
+      ${VTKm_SOURCE_DIR}/CMake/patches/3.15/FindMPI.cmake
    DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
    )

@ -279,7 +295,6 @@ if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
      ${VTKm_SOURCE_DIR}/CMake/VTKmDeviceAdapters.cmake
      ${VTKm_SOURCE_DIR}/CMake/VTKmDIYUtils.cmake
      ${VTKm_SOURCE_DIR}/CMake/VTKmExportHeaderTemplate.h.in
-      ${VTKm_SOURCE_DIR}/CMake/VTKmMPI.cmake
      ${VTKm_SOURCE_DIR}/CMake/VTKmRenderingContexts.cmake
      ${VTKm_SOURCE_DIR}/CMake/VTKmWrappers.cmake
    DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
@ -313,7 +328,7 @@ endif ()
 #-----------------------------------------------------------------------------
 #add the benchmarking folder
 if(VTKm_ENABLE_BENCHMARKS)
-    add_subdirectory(benchmarking)
+  add_subdirectory(benchmarking)
 endif()

 #-----------------------------------------------------------------------------
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@ -55,7 +55,7 @@ list(APPEND CTEST_CUSTOM_WARNING_EXCEPTION
  "nvlink warning : .*ArrayPortalVirtual.* has address taken but no possible call to it"
  "nvlink warning : .*CellLocatorBoundingIntervalHierarchyExec.* has address taken but no possible call to it"
  "nvlink warning : .*CellLocatorRectilinearGrid.* has address taken but no possible call to it"
-  "nvlink warning : .*CellLocatorUniformBins.* has address taken but no possible call to it"
+  "nvlink warning : .*CellLocatorTwoLevel.* has address taken but no possible call to it"
  "nvlink warning : .*CellLocatorUniformGrid.* has address taken but no possible call to it"

 )
--- a/Utilities/CI/reproduce_ci_env.py
+++ b/Utilities/CI/reproduce_ci_env.py
@ -1,4 +1,4 @@
-#!/bin/env python3
+#!/usr/bin/env python3

 #=============================================================================
 #
--- a/benchmarking/BenchmarkArrayTransfer.cxx
+++ b/benchmarking/BenchmarkArrayTransfer.cxx
@ -77,6 +77,14 @@ struct ReadWriteValues : vtkm::worklet::WorkletMapField
  }
 };

+// Takes a vector of data and creates a fresh ArrayHandle with memory just allocated
+// in the control environment.
+template <typename T>
+vtkm::cont::ArrayHandle<T> CreateFreshArrayHandle(const std::vector<T>& vec)
+{
+  return vtkm::cont::make_ArrayHandleMove(std::vector<T>(vec));
+}
+
 //------------- Benchmark functors -------------------------------------------

 // Copies NumValues from control environment to execution environment and
@ -97,14 +105,18 @@ void BenchContToExecRead(benchmark::State& state)
    state.SetLabel(desc.str());
  }

-  std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
-  ArrayType array = vtkm::cont::make_ArrayHandle(vec);
+  std::vector<ValueType> vec(static_cast<std::size_t>(numValues), 2);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
+
+    // Make a fresh array each iteration to force a copy from control to execution each time.
+    // (Prevents unified memory devices from caching data.)
+    ArrayType array = CreateFreshArrayHandle(vec);
+
    timer.Start();
    invoker(ReadValues{}, array);
    timer.Stop();
@ -181,19 +193,26 @@ void BenchContToExecReadWrite(benchmark::State& state)
    state.SetLabel(desc.str());
  }

-  std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
-  ArrayType array = vtkm::cont::make_ArrayHandle(vec);
+  std::vector<ValueType> vec(static_cast<std::size_t>(numValues), 2);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
+
+    // Make a fresh array each iteration to force a copy from control to execution each time.
+    // (Prevents unified memory devices from caching data.)
+    ArrayType array = CreateFreshArrayHandle(vec);
+
    timer.Start();
    invoker(ReadWriteValues{}, array);
    timer.Stop();

    state.SetIterationTime(timer.GetElapsedTime());
+
+    // Remove data from execution environment so it has to be transferred again.
+    array.ReleaseResourcesExecution();
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
@ -223,21 +242,23 @@ void BenchRoundTripRead(benchmark::State& state)
    state.SetLabel(desc.str());
  }

-  std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
-  ArrayType array = vtkm::cont::make_ArrayHandle(vec);
+  std::vector<ValueType> vec(static_cast<std::size_t>(numValues), 2);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
-    // Ensure data is in control before we start:
-    array.ReleaseResourcesExecution();
+
+    // Make a fresh array each iteration to force a copy from control to execution each time.
+    // (Prevents unified memory devices from caching data.)
+    ArrayType array = CreateFreshArrayHandle(vec);

    timer.Start();
    invoker(ReadValues{}, array);

    // Copy back to host and read:
+    // (Note, this probably does not copy. The array exists in both control and execution for read.)
    auto portal = array.ReadPortal();
    for (vtkm::Id i = 0; i < numValues; ++i)
    {
@ -277,21 +298,23 @@ void BenchRoundTripReadWrite(benchmark::State& state)
  }

  std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
-  ArrayType array = vtkm::cont::make_ArrayHandle(vec);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
-    // Ensure data is in control before we start:
-    array.ReleaseResourcesExecution();
+
+    // Make a fresh array each iteration to force a copy from control to execution each time.
+    // (Prevents unified memory devices from caching data.)
+    ArrayType array = CreateFreshArrayHandle(vec);

    timer.Start();

    // Do work on device:
    invoker(ReadWriteValues{}, array);

+    // Copy back to host and read/write:
    auto portal = array.WritePortal();
    for (vtkm::Id i = 0; i < numValues; ++i)
    {
@ -330,14 +353,14 @@ void BenchExecToContRead(benchmark::State& state)
    state.SetLabel(desc.str());
  }

-  ArrayType array;
-  array.Allocate(numValues);
-
  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
+    ArrayType array;
+    array.Allocate(numValues);
+
    // Time the copy:
    timer.Start();

@ -383,14 +406,14 @@ void BenchExecToContWrite(benchmark::State& state)
    state.SetLabel(desc.str());
  }

-  ArrayType array;
-  array.Allocate(numValues);
-
  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
+    ArrayType array;
+    array.Allocate(numValues);
+
    timer.Start();

    // Allocate/write data on device
@ -435,14 +458,14 @@ void BenchExecToContReadWrite(benchmark::State& state)
    state.SetLabel(desc.str());
  }

-  ArrayType array;
-  array.Allocate(numValues);
-
  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
+    ArrayType array;
+    array.Allocate(numValues);
+
    timer.Start();

    // Allocate/write data on device
--- a/benchmarking/BenchmarkFieldAlgorithms.cxx
+++ b/benchmarking/BenchmarkFieldAlgorithms.cxx
@ -13,13 +13,16 @@

 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/ArrayHandleMultiplexer.h>
-#include <vtkm/cont/ArrayHandleVirtual.h>
 #include <vtkm/cont/CellSetStructured.h>
 #include <vtkm/cont/ImplicitFunctionHandle.h>
 #include <vtkm/cont/Initialize.h>
 #include <vtkm/cont/Invoker.h>
 #include <vtkm/cont/Timer.h>

+#ifndef VTKM_NO_DEPRECATED_VIRTUAL
+#include <vtkm/cont/ArrayHandleVirtual.h>
+#endif
+
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/WorkletMapTopology.h>

@ -433,15 +436,19 @@ void BenchBlackScholesStatic(::benchmark::State& state)
 };
 VTKM_BENCHMARK_TEMPLATES(BenchBlackScholesStatic, ValueTypes);

+#ifndef VTKM_NO_DEPRECATED_VIRTUAL
 template <typename ValueType>
 void BenchBlackScholesDynamic(::benchmark::State& state)
 {
+  VTKM_DEPRECATED_SUPPRESS_BEGIN
  BenchBlackScholesImpl<ValueType> impl{ state };
  impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.StockPrice),
           vtkm::cont::make_ArrayHandleVirtual(impl.OptionStrike),
           vtkm::cont::make_ArrayHandleVirtual(impl.OptionYears));
+  VTKM_DEPRECATED_SUPPRESS_END
 };
 VTKM_BENCHMARK_TEMPLATES(BenchBlackScholesDynamic, ValueTypes);
+#endif //VTKM_NO_DEPRECATED_VIRTUAL

 template <typename ValueType>
 void BenchBlackScholesMultiplexer0(::benchmark::State& state)
@ -537,15 +544,19 @@ void BenchMathStatic(::benchmark::State& state)
 };
 VTKM_BENCHMARK_TEMPLATES(BenchMathStatic, ValueTypes);

+#ifndef VTKM_NO_DEPRECATED_VIRTUAL
 template <typename ValueType>
 void BenchMathDynamic(::benchmark::State& state)
 {
+  VTKM_DEPRECATED_SUPPRESS_BEGIN
  BenchMathImpl<ValueType> impl{ state };
  impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.InputHandle),
           vtkm::cont::make_ArrayHandleVirtual(impl.TempHandle1),
           vtkm::cont::make_ArrayHandleVirtual(impl.TempHandle2));
+  VTKM_DEPRECATED_SUPPRESS_END
 };
 VTKM_BENCHMARK_TEMPLATES(BenchMathDynamic, ValueTypes);
+#endif //VTKM_NO_DEPRECATED_VIRTUAL

 template <typename ValueType>
 void BenchMathMultiplexer0(::benchmark::State& state)
@ -636,13 +647,17 @@ void BenchFusedMathStatic(::benchmark::State& state)
 };
 VTKM_BENCHMARK_TEMPLATES(BenchFusedMathStatic, ValueTypes);

+#ifndef VTKM_NO_DEPRECATED_VIRTUAL
 template <typename ValueType>
 void BenchFusedMathDynamic(::benchmark::State& state)
 {
+  VTKM_DEPRECATED_SUPPRESS_BEGIN
  BenchFusedMathImpl<ValueType> impl{ state };
  impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.InputHandle));
+  VTKM_DEPRECATED_SUPPRESS_END
 };
 VTKM_BENCHMARK_TEMPLATES(BenchFusedMathDynamic, ValueTypes);
+#endif //VTKM_NO_DEPRECATED_VIRTUAL

 template <typename ValueType>
 void BenchFusedMathMultiplexer0(::benchmark::State& state)
@ -756,15 +771,19 @@ void BenchEdgeInterpStatic(::benchmark::State& state)
 };
 VTKM_BENCHMARK_TEMPLATES(BenchEdgeInterpStatic, InterpValueTypes);

+#ifndef VTKM_NO_DEPRECATED_VIRTUAL
 template <typename ValueType>
 void BenchEdgeInterpDynamic(::benchmark::State& state)
 {
+  VTKM_DEPRECATED_SUPPRESS_BEGIN
  BenchEdgeInterpImpl<ValueType> impl{ state };
  impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.EdgePairHandle),
           vtkm::cont::make_ArrayHandleVirtual(impl.WeightHandle),
           vtkm::cont::make_ArrayHandleVirtual(impl.FieldHandle));
+  VTKM_DEPRECATED_SUPPRESS_END
 };
 VTKM_BENCHMARK_TEMPLATES(BenchEdgeInterpDynamic, InterpValueTypes);
+#endif //VTKM_NO_DEPRECATED_VIRTUAL

 struct ImplicitFunctionBenchData
 {
--- a/benchmarking/BenchmarkFilters.cxx
+++ b/benchmarking/BenchmarkFilters.cxx
@ -24,7 +24,6 @@
 #include <vtkm/cont/ErrorInternal.h>
 #include <vtkm/cont/Logging.h>
 #include <vtkm/cont/RuntimeDeviceTracker.h>
-#include <vtkm/cont/StorageBasic.h>
 #include <vtkm/cont/Timer.h>

 #include <vtkm/cont/internal/OptionParser.h>
@ -166,8 +165,8 @@ void BenchGradient(::benchmark::State& state, int options)
  }
 }

-#define VTKM_PRIVATE_GRADIENT_BENCHMARK(Name, Opts)                                                \
-  void BenchGradient##Name(::benchmark::State& state) { BenchGradient(state, Opts); }              \
+#define VTKM_PRIVATE_GRADIENT_BENCHMARK(Name, Opts)                                   \
+  void BenchGradient##Name(::benchmark::State& state) { BenchGradient(state, Opts); } \
  VTKM_BENCHMARK(BenchGradient##Name)

 VTKM_PRIVATE_GRADIENT_BENCHMARK(Scalar, Gradient | ScalarInput);
@ -861,8 +860,12 @@ void InitDataSet(int& argc, char** argv)

  if (options[HELP])
  {
-    // FIXME: Print google benchmark usage too
-    option::printUsage(std::cerr, usage.data());
+    option::printUsage(std::cout, usage.data());
+    // Print google benchmark usage too
+    const char* helpstr = "--help";
+    char* tmpargv[] = { argv[0], const_cast<char*>(helpstr), nullptr };
+    int tmpargc = 2;
+    VTKM_EXECUTE_BENCHMARKS(tmpargc, tmpargv);
    exit(0);
  }

@ -1015,16 +1018,12 @@ int main(int argc, char* argv[])
  // Parse VTK-m options:
  Config = vtkm::cont::Initialize(argc, args.data(), opts);

-  // This occurs when it is help
-  if (opts == vtkm::cont::InitializeOptions::None)
-  {
-    std::cout << Config.Usage << std::endl;
-  }
-  else
+  // opts is changed to None when help is requested
+  if (opts != vtkm::cont::InitializeOptions::None)
  {
    vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
-    InitDataSet(argc, args.data());
  }
+  InitDataSet(argc, args.data());

  const std::string dataSetSummary = []() -> std::string {
    std::ostringstream out;
--- a/benchmarking/BenchmarkODEIntegrators.cxx
+++ b/benchmarking/BenchmarkODEIntegrators.cxx
@ -0,0 +1,97 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+
+#include "Benchmarker.h"
+
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/DataSetBuilderUniform.h>
+#include <vtkm/cont/ErrorInternal.h>
+#include <vtkm/cont/Logging.h>
+#include <vtkm/cont/RuntimeDeviceTracker.h>
+#include <vtkm/cont/Timer.h>
+#include <vtkm/cont/internal/OptionParser.h>
+#include <vtkm/filter/ParticleAdvection.h>
+#include <vtkm/worklet/particleadvection/EulerIntegrator.h>
+#include <vtkm/worklet/particleadvection/RK4Integrator.h>
+#ifdef VTKM_ENABLE_TBB
+#include <tbb/task_scheduler_init.h>
+#endif
+#ifdef VTKM_ENABLE_OPENMP
+#include <omp.h>
+#endif
+
+
+namespace
+{
+// Hold configuration state (e.g. active device):
+vtkm::cont::InitializeResult Config;
+
+// Benchmarks vtkm::filter::ParticleAdvection on a uniform data set built from
+// dims (5, 5, 5) whose point field "vector" is (1, 0, 0) everywhere. The
+// benchmark argument state.range(0) is used both as the number of steps and as
+// the reciprocal of the step size. Each iteration times one Execute() call
+// with a timer bound to the configured device.
+void BenchParticleAdvection(::benchmark::State& state)
+{
+  const vtkm::cont::DeviceAdapterId device = Config.Device;
+  const vtkm::Id3 dims(5, 5, 5);
+  const vtkm::Vec3f vecX(1, 0, 0);
+
+  const vtkm::Id numPoints = dims[0] * dims[1] * dims[2];
+
+  // One copy of vecX per point; the fill constructor replaces a manual loop.
+  std::vector<vtkm::Vec3f> vectorField(static_cast<std::size_t>(numPoints), vecX);
+
+  vtkm::cont::DataSetBuilderUniform dataSetBuilder;
+
+  vtkm::cont::DataSet ds = dataSetBuilder.Create(dims);
+  ds.AddPointField("vector", vectorField);
+
+  vtkm::cont::ArrayHandle<vtkm::Particle> seedArray =
+    vtkm::cont::make_ArrayHandle({ vtkm::Particle(vtkm::Vec3f(.2f, 1.0f, .2f), 0),
+                                   vtkm::Particle(vtkm::Vec3f(.2f, 2.0f, .2f), 1),
+                                   vtkm::Particle(vtkm::Vec3f(.2f, 3.0f, .2f), 2) });
+
+  vtkm::filter::ParticleAdvection particleAdvection;
+
+  // state.range(0) is an integer; convert it explicitly instead of relying on
+  // an implicit integer-to-float conversion inside the division.
+  const auto numSteps = state.range(0);
+  const vtkm::FloatDefault stepSize =
+    vtkm::FloatDefault(1) / static_cast<vtkm::FloatDefault>(numSteps);
+
+  particleAdvection.SetStepSize(stepSize);
+  particleAdvection.SetNumberOfSteps(static_cast<vtkm::Id>(numSteps));
+  particleAdvection.SetSeeds(seedArray);
+  particleAdvection.SetActiveField("vector");
+  vtkm::cont::Timer timer{ device };
+  for (auto _ : state)
+  {
+    (void)_;
+    timer.Start();
+    auto output = particleAdvection.Execute(ds);
+    // Keep the result alive so the filter execution cannot be optimized away.
+    ::benchmark::DoNotOptimize(output);
+    timer.Stop();
+
+    // The benchmark reports the manually measured time.
+    state.SetIterationTime(timer.GetElapsedTime());
+  }
+  state.SetComplexityN(numSteps);
+}
+VTKM_BENCHMARK_OPTS(BenchParticleAdvection,
+                      ->RangeMultiplier(2)
+                      ->Range(32, 4096)
+                      ->ArgName("Steps")
+                      ->Complexity());
+
+} // end anon namespace
+
+// Benchmark entry point: initializes VTK-m from the command line and then runs
+// the registered benchmarks.
+int main(int argc, char* argv[])
+{
+  auto opts = vtkm::cont::InitializeOptions::DefaultAnyDevice;
+  // Copy the argv pointers into a vector; `args` is what the calls below use.
+  std::vector<char*> args(argv, argv + argc);
+  // NOTE(review): InitializeArgs receives argc, args and opts and presumably
+  // adjusts them before VTK-m parses the command line -- confirm in Benchmarker.h.
+  vtkm::bench::detail::InitializeArgs(&argc, args, opts);
+  Config = vtkm::cont::Initialize(argc, args.data(), opts);
+  // Only force the runtime onto the selected device when opts is not None.
+  if (opts != vtkm::cont::InitializeOptions::None)
+  {
+    vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
+  }
+  VTKM_EXECUTE_BENCHMARKS(argc, args.data());
+}
--- a/benchmarking/BenchmarkRayTracing.cxx
+++ b/benchmarking/BenchmarkRayTracing.cxx
@ -26,8 +26,6 @@

 #include <vtkm/exec/FunctorBase.h>

-#include <vtkm/cont/ColorTable.hxx>
-
 #include <sstream>
 #include <string>
 #include <vector>
--- a/benchmarking/Benchmarker.h
+++ b/benchmarking/Benchmarker.h
@ -170,7 +170,7 @@
 /// and modified using the passed arguments; see the Google Benchmark documentation
 /// for more details. The `preamble` string may be used to supply additional
 /// information that will be appended to the output's preamble.
-#define VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, preamble)                                     \
+#define VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, preamble) \
  vtkm::bench::detail::ExecuteBenchmarks(argc, argv, preamble)

 /// \def VTKM_BENCHMARK(BenchFunc)
@ -181,7 +181,7 @@
 /// ```
 /// void BenchFunc(::benchmark::State& state)
 /// ```
-#define VTKM_BENCHMARK(BenchFunc)                                                                  \
+#define VTKM_BENCHMARK(BenchFunc) \
  BENCHMARK(BenchFunc)->UseManualTime()->Unit(benchmark::kMillisecond)

 /// \def VTKM_BENCHMARK_OPTS(BenchFunc, Args)
@ -196,7 +196,7 @@
 /// Note the similarity to the raw Google Benchmark usage of
 /// `BENCHMARK(MyBenchmark)->ArgName("MyParam")->Range(32, 1024*1024);`. See
 /// the Google Benchmark documentation for more details on the available options.
-#define VTKM_BENCHMARK_OPTS(BenchFunc, options)                                                    \
+#define VTKM_BENCHMARK_OPTS(BenchFunc, options) \
  BENCHMARK(BenchFunc)->UseManualTime()->Unit(benchmark::kMillisecond) options

 /// \def VTKM_BENCHMARK_APPLY(BenchFunc, ConfigFunc)
@ -211,7 +211,7 @@
 /// ```
 ///
 /// See the Google Benchmark documentation for more details on the available options.
-#define VTKM_BENCHMARK_APPLY(BenchFunc, applyFunctor)                                              \
+#define VTKM_BENCHMARK_APPLY(BenchFunc, applyFunctor) \
  BENCHMARK(BenchFunc)->Apply(applyFunctor)->UseManualTime()->Unit(benchmark::kMillisecond)

 /// \def VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList)
@ -224,7 +224,7 @@
 /// template <typename T>
 /// void BenchFunc(::benchmark::State& state)
 /// ```
-#define VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList)                                              \
+#define VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList) \
  VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, vtkm::bench::detail::NullApply, TypeList)

 /// \def VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, Args, TypeList)
@ -237,10 +237,10 @@
 ///                                ->ArgName("MyParam")->Range(32, 1024*1024),
 ///                              vtkm::List<vtkm::Float32, vtkm::Vec3f_32>);
 /// ```
-#define VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, options, TypeList)                                \
-  VTKM_BENCHMARK_TEMPLATES_APPLY(                                                                  \
-    BenchFunc,                                                                                     \
-    [](::benchmark::internal::Benchmark* bm) { bm options->Unit(benchmark::kMillisecond); },       \
+#define VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, options, TypeList)                          \
+  VTKM_BENCHMARK_TEMPLATES_APPLY(                                                            \
+    BenchFunc,                                                                               \
+    [](::benchmark::internal::Benchmark* bm) { bm options->Unit(benchmark::kMillisecond); }, \
    TypeList)

 /// \def VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ConfigFunc, TypeList)
@ -255,22 +255,22 @@
 /// ```
 ///
 /// See the Google Benchmark documentation for more details on the available options.
-#define VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ApplyFunctor, TypeList)                                                                                                             \
-  namespace                                                                                                                                                                           \
+#define VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ApplyFunctor, TypeList)                            \
+  namespace                                                                                          \
  { /* A template function cannot be used as a template parameter, so wrap the function with       \
     * a template struct to get it into the GenerateTemplateBenchmarks class. */ \
-  template <typename... Ts>                                                                                                                                                           \
-  struct VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)                                                                                                                                       \
-  {                                                                                                                                                                                   \
-    static ::benchmark::internal::Function* GetFunction() { return BenchFunc<Ts...>; }                                                                                                \
-  };                                                                                                                                                                                  \
-  } /* end anon namespace */                                                                                                                                                          \
-  int BENCHMARK_PRIVATE_NAME(BenchFunc) = vtkm::bench::detail::GenerateTemplateBenchmarks<                                                                                            \
-    brigand::bind<VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)>,                                                                                                                            \
+  template <typename... Ts>                                                                          \
+  struct VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)                                                      \
+  {                                                                                                  \
+    static ::benchmark::internal::Function* GetFunction() { return BenchFunc<Ts...>; }               \
+  };                                                                                                 \
+  } /* end anon namespace */                                                                         \
+  int BENCHMARK_PRIVATE_NAME(BenchFunc) = vtkm::bench::detail::GenerateTemplateBenchmarks<           \
+    brigand::bind<VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)>,                                           \
    TypeList>::Register(#BenchFunc, ApplyFunctor)

 // Internal use only:
-#define VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)                                                     \
+#define VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc) \
  BENCHMARK_PRIVATE_CONCAT(_wrapper_, BenchFunc, __LINE__)

 namespace vtkm
@ -280,9 +280,7 @@ namespace bench
 namespace detail
 {

-static inline void NullApply(::benchmark::internal::Benchmark*)
-{
-}
+static inline void NullApply(::benchmark::internal::Benchmark*) {}

 /// Do not use directly. The VTKM_BENCHMARK_TEMPLATES macros should be used
 /// instead.
--- a/benchmarking/CMakeLists.txt
+++ b/benchmarking/CMakeLists.txt
@ -44,6 +44,7 @@ set(benchmarks
  BenchmarkDeviceAdapter
  BenchmarkFieldAlgorithms
  BenchmarkFilters
+  BenchmarkODEIntegrators
  BenchmarkTopologyAlgorithms
  )

--- a/data/data/curvilinear/simple_structured_ascii.vtk
+++ b/data/data/curvilinear/simple_structured_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f85560cc05688d09c21b22e91c14cec22deecb3c51dc364d82cc9fd460c6ab6
+size 328
--- a/data/data/curvilinear/simple_structured_bin.vtk
+++ b/data/data/curvilinear/simple_structured_bin.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a47045b1ae5539ef0125273ee9c50a9a6e809f78411f6a850ac34e6fa43189bb
+size 535
--- a/data/data/rectilinear/fishtank.vtk
+++ b/data/data/rectilinear/fishtank.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef3dfd79f0c8d18780d0749014d71c0226134041283d33de0bcd994e343dd421
+size 2001070
--- a/data/data/rectilinear/fishtank_double_ascii.vtk
+++ b/data/data/rectilinear/fishtank_double_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bb3d36ea5ecef5e7ef1057d0dddebbc590424915083091ead3dac2928000524
+size 2904465
--- a/data/data/rectilinear/fishtank_double_big_endian.vtk
+++ b/data/data/rectilinear/fishtank_double_big_endian.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bffad7dae3dd6ef018ad7a9e109464ced0f3b9bc15cf1fb5d555f6d0d00b621f
+size 3001624
--- a/data/data/rectilinear/fusion.vtk
+++ b/data/data/rectilinear/fusion.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cbdf56fd5445ddc5b6bc05507b8825fb8d74fe1ccce894bde03e5ff2ecf5fb6
+size 525141
--- a/data/data/rectilinear/simple_rectilinear1_ascii.vtk
+++ b/data/data/rectilinear/simple_rectilinear1_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:752021630d25aff8dfd00064badd452896be70bc8b2f94b008900b4fc70d4dd5
+size 1811
--- a/data/data/rectilinear/simple_rectilinear2_ascii.vtk
+++ b/data/data/rectilinear/simple_rectilinear2_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d1dbb4c28f1c829769ad3e03fc58f667935d8a461d3515036d5d98f5e3841cb
+size 395
--- a/data/data/uniform/simple_structured_points_ascii.vtk
+++ b/data/data/uniform/simple_structured_points_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1860e747d7f460afc63e32de184e445ffb966a42fb07f9d44ba39020584864f
+size 496
--- a/data/data/uniform/simple_structured_points_bin.vtk
+++ b/data/data/uniform/simple_structured_points_bin.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d9bea2064cd3402f3f5b7862e6b775e37f33210ba099f59358857d4bdae1020
+size 255
--- a/data/data/uniform/simple_structured_points_visit_ascii.vtk
+++ b/data/data/uniform/simple_structured_points_visit_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e154ba13346e6998b864316868da3f155e99efe4f330c8e080b0d7ece22b505a
+size 488
--- a/data/data/unstructured/empty_poly.vtk
+++ b/data/data/unstructured/empty_poly.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d0ddc7c712a6d544db85660cd9d325884892b18d6f0ed451361aaeae2a96413
+size 204
--- a/data/data/unstructured/empty_unstructured.vtk
+++ b/data/data/unstructured/empty_unstructured.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75b5601eb23b1724d5309e69a51839615bce625f6e7641b52dc3d06e10b0c5ee
+size 745
--- a/data/data/unstructured/simple_poly_ascii.vtk
+++ b/data/data/unstructured/simple_poly_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff3108d009d2eef410593811857e38388001f7df624ddeaed3edceafbc838aea
+size 849
--- a/data/data/unstructured/simple_poly_bin.vtk
+++ b/data/data/unstructured/simple_poly_bin.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5aca6667b06deb4ec6236d5caa3d9518345bc1eb9021bc721289b81acc980af9
+size 789
--- a/data/data/unstructured/simple_unstructured_ascii.vtk
+++ b/data/data/unstructured/simple_unstructured_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:861fc904b7d4db43288fce85c8c1398726b54ac82d7bcbcebd8f12808cb5599b
+size 1002
--- a/data/data/unstructured/simple_unstructured_bin.vtk
+++ b/data/data/unstructured/simple_unstructured_bin.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29e43c695763535251ab22af815651caa53d103b5fd168c72dfb9188e72e4ff4
+size 1244
--- a/data/data/unstructured/simple_unstructured_visit_ascii.vtk
+++ b/data/data/unstructured/simple_unstructured_visit_ascii.vtk
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3731448fe4d87b204e185829237a6a6b0140aed2fb27eea0533883a4cf4ed79d
+size 1065
--- a/docs/CI-README.md
+++ b/docs/CI-README.md
@ -60,14 +60,14 @@ Current gitlab runner tags for VTK-m are:
        Used to state that we require a linux based gitlab-runner
    - large-memory
        Used to state that this step will require a machine that has lots of memory.
-        This is currently used for cuda `build` requests
+        This is currently used for CUDA `build` requests
    - cuda-rt
-        Used to state that the runner is required to have the cuda runtime enviornment.
-        This isn't required to `build` VTK-m, only `test`
+        Used to state that the runner is required to have the CUDA runtime environment.
+        This is required to `build` and `test` VTK-m when using CUDA 
    - maxwell
    - pascal
    - turing
-        Only used on a `test` stage to signifiy which GPU hardware is required to
+        Only used on a `test` stage to signify which GPU hardware is required to
        run the VTK-m tests

 # How to use docker builders locally
@ -259,22 +259,3 @@ sudo docker login --username=<docker_hub_name>
 cd .gitlab/ci/docker
 sudo ./update_all.sh 20201230
 ```
-
-# ECP OSTI CI
-
-`.gitlab-ci-ecp.yml` allows for VTK-m to run CI on provided by ECP at NMC.
-
-To have this work properly you will need to make sure that the gitlab repository
-has been updated to this non-standard yaml file location
-( "Settings" -> "CI/CD" -> "General pipelines" -> "Custom CI configuration path").
-
-The ECP CI is setup to verify VTK-m mainly on Power9 hardware as that currently is
-missing from VTK-m standard CI infrastructure.
-
-Currently we verify Power9 support with `cuda` and `openmp` builders. The `cuda` builder
-is setup to use the default cuda SDK on the machine and the required `c++` compiler which
-currently is `gcc-4.8.5`. The `openmp` builder is setup to use the newest `c++` compiler provided
-on the machine so that we maximimze compiler coverage.
-
-## Issues
-Currently these builders don't report back to the VTK-m CDash instance.
--- a/docs/changelog/VTKDataSetWriter-remove-justpoints.md
+++ b/docs/changelog/VTKDataSetWriter-remove-justpoints.md
@ -0,0 +1,7 @@
+# Remove VTKDataSetWriter::WriteDataSet just_points parameter
+
+In the method `VTKDataSetWriter::WriteDataSet`, the `just_points` parameter has
+been removed due to lack of usage.
+
+The purpose of `just_points` was to allow exporting only the points of a
+DataSet without its cell data.
--- a/docs/changelog/add-kokkos-backend.md
+++ b/docs/changelog/add-kokkos-backend.md
@ -0,0 +1,5 @@
+# Add Kokkos backend
+
+Adds a new device backend `Kokkos` which uses the Kokkos library for parallelism.
+Users must provide the Kokkos build, and VTK-m will use the default configured
+execution space.
--- a/docs/changelog/buffer-memory-ownership.md
+++ b/docs/changelog/buffer-memory-ownership.md
@ -0,0 +1,29 @@
+# `vtkm::cont::internal::Buffer` now can have ownership transferred
+
+Previously, memory transferred to a `Buffer` always had to be managed by VTK-m. This is
+problematic for applications that need VTK-m to allocate memory but need that memory to
+outlive VTK-m's management of it.
+
+`Buffer::TakeHostBufferOwnership` allows for easy transfer of memory ownership out of VTK-m.
+When taking ownership of a VTK-m buffer you are provided the following information:
+
+- Memory: A `void*` pointer to the array
+- Container: A `void*` pointer used to free the memory. This is necessary to support cases such as allocations transferred into VTK-m from a `std::vector`.
+- Delete: The function to call to actually delete the transferred memory
+- Reallocate: The function to call to re-allocate the transferred memory. This will throw an exception if users try
+to reallocate a buffer that was 'view' only
+- Size: The size in number of elements of the array 
+
+ 
+To properly steal memory from VTK-m you do the following:
+```cpp
+  vtkm::cont::ArrayHandle<T> arrayHandle;
+
+  ...
+
+  auto stolen = arrayHandle.GetBuffers()->TakeHostBufferOwnership();
+    
+  ...
+
+  stolen.Delete(stolen.Container);
+```
--- a/docs/changelog/buffer.md
+++ b/docs/changelog/buffer.md
@ -0,0 +1,202 @@
+# Redesign of ArrayHandle to access data using typeless buffers
+
+The original implementation of `ArrayHandle` is meant to be very generic.
+To define an `ArrayHandle`, you actually create a `Storage` class that
+maintains the data and provides portals to access it (on the host). Because
+the `Storage` can provide any type of data structure it wants, you also
+need to define an `ArrayTransfer` that describes how to move the
+`ArrayHandle` to and from a device. It also has to be repeated for every
+translation unit that uses them.
+
+This is a very powerful mechanism. However, one of the major problems with
+this approach is that every `ArrayHandle` type needs to have a separate
+compile path for every value type crossed with every device. Because of
+this limitation, the `ArrayHandle` for the basic storage has a special
+implementation that manages the actual data allocation and movement as
+`void *` arrays. In this way all the data management can be compiled once
+and put into the `vtkm_cont` library. This has dramatically improved the
+VTK-m compile time.
+
+This new design replicates the basic `ArrayHandle`'s success to all other
+storage types. The basic idea is to make the implementation of
+`ArrayHandle` storage slightly less generic. Instead of requiring it to
+manage the data it stores, it instead just builds `ArrayPortal`s from
+`void` pointers that it is given. The management of `void` pointers can be
+done in non-templated classes that are compiled into a library.
+
+This initial implementation does not convert all `ArrayHandle`s to avoid
+making non-backward compatible changes before the next minor revision of
+VTK-m. In particular, it would be particularly difficult to convert
+`ArrayHandleVirtual`. It could be done, but it would be a lot of work for a
+class that will likely be removed.
+
+## Buffer
+
+Key to these changes is the introduction of a
+`vtkm::cont::internal::Buffer` object. As the name implies, the `Buffer`
+object manages a single block of bytes. `Buffer` is agnostic to the type of
+data being stored. It only knows the length of the buffer in bytes. It is
+responsible for allocating space on the host and any devices as necessary
+and for transferring data among them. (Since `Buffer` knows nothing about
+the type of data, a precondition of VTK-m would be that the host and all
+devices have the same endianness.)
+
+The idea of the `Buffer` object is similar in nature to the existing
+`vtkm::cont::internal::ExecutionArrayInterfaceBasicBase` except that it
+will manage a buffer of data among the control and all devices rather than
+in one device through a templated subclass.
+
+As explained below, `ArrayHandle` holds some fixed number of `Buffer`
+objects. (The number can be zero for implicit `ArrayHandle`s.) Because all
+the interaction with the devices happens through `Buffer`, it will no longer
+be necessary to compile any reference to `ArrayHandle` for devices (e.g.
+you won’t have to use nvcc just because the code includes `ArrayHandle.h`).
+
+## Storage
+
+The `vtkm::cont::internal::Storage` class changes dramatically. Although an
+instance will be kept, the intention is for `Storage` itself to be a
+stateless object. It will manage its data through `Buffer` objects provided
+from the `ArrayHandle`.
+
+That said, it is possible for `Storage` to have some state. For example,
+the `Storage` for `ArrayHandleImplicit` must hold on to the instance of the
+portal used to manage the state.
+
+
+## ArrayTransfer
+
+The `vtkm::cont::internal::ArrayTransfer` class will be removed completely.
+All data transfers will be handled internally with the `Buffer` object.
+
+## Portals
+
+A big change for this design is that the type of a portal for an
+`ArrayHandle` will be the same for all devices and the host. Thus, we no
+longer need specialized versions of portals for each device. We only have
+one portal type. And since they are constructed from `void *` pointers, one
+method can create them all.
+
+
+## Advantages
+
+The `ArrayHandle` interface should not change significantly for external
+uses, but this redesign offers several advantages.
+
+### Faster Compiles
+
+Because the memory management is contained in a non-templated `Buffer`
+class, it can be compiled once in a library and used by all template
+instances of `ArrayHandle`. It should have similar compile advantages to
+our current specialization of the basic `ArrayHandle`, but applied to all
+types of `ArrayHandle`s.
+
+### Fewer Templates
+
+Hand-in-hand with faster compiles, the new design should require fewer
+templates and template instances. We have immediately gotten rid of
+`ArrayTransfer`. `Storage` is also much shorter. Because all
+`ArrayPortal`s are the same for every device and the host, we need many
+fewer versions of those classes. In the device adapter, we can probably
+collapse the three `ArrayManagerExecution` classes into a single, much
+simpler class that does simple memory allocation and copy.
+
+### Fewer files need to be compiled for CUDA
+
+Including `ArrayHandle.h` no longer adds code that compiles for a device.
+Thus, we should no longer need to compile for a specific device adapter
+just because we access an `ArrayHandle`. This should make it much easier to
+achieve our goal of a "firewall". That is, code that just calls VTK-m
+filters does not need to support all its compilers and flags.
+
+### Simpler ArrayHandle specialization
+
+The newer code should simplify the implementation of special `ArrayHandle`s
+a bit. You need only implement an `ArrayPortal` that operates on one or
+more `void *` arrays and a simple `Storage` class.
+
+### Out of band memory sharing
+
+With the current version of `ArrayHandle`, if you want to take data from
+one `ArrayHandle` you pretty much have to create a special template to wrap
+another `ArrayHandle` around that. With this new design, it is possible to
+take data from one `ArrayHandle` and give it to another `ArrayHandle` of a
+completely different type. You can’t do this willy-nilly since different
+`ArrayHandle` types will interpret buffers differently. But there can be
+some special important use cases.
+
+One such case could be an `ArrayHandle` that provides strided access to a
+buffer. (Let’s call it `ArrayHandleStride`.) The idea is that it interprets
+the buffer as an array for a particular type (like a basic `ArrayHandle`)
+but also defines a stride, skip, and repeat so that given an index it looks
+up the value `((index / skip) % repeat) * stride`. The point is that it can
+take an AoS array of tuples and represent an array of one of the
+components.
+
+The point would be that if you had a `VariantArrayHandle` or `Field`, you
+could pull out an array of one of the components as an `ArrayHandleStride`.
+An `ArrayHandleStride<vtkm::Float32>` could be used to represent that data
+that comes from any basic `ArrayHandle` with `vtkm::Float32` or a
+`vtkm::Vec` of that type. It could also represent data from an
+`ArrayHandleCartesianProduct` and `ArrayHandleSoA`. We could even represent
+an `ArrayHandleUniformPointCoordinates` by just making a small array. This
+allows us to statically access a whole bunch of potential array storage
+classes with a single type.
+
+### Potentially faster device transfers
+
+There is currently a fast-path for basic `ArrayHandle`s that does a block
+CUDA memcpy between host and device. But for other `ArrayHandle`s that do
+not defer their `ArrayTransfer` to a sub-array, the transfer first has to
+copy the data into a known buffer.
+
+Because this new design stores all data in `Buffer` objects, any of these
+can be easily and efficiently copied between devices.
+
+## Disadvantages
+
+This new design gives up some features of the original `ArrayHandle` design.
+
+### Can only interface data that can be represented in a fixed number of buffers
+
+Because the original `ArrayHandle` design required the `Storage` to
+completely manage the data, it could represent it in any way possible. In
+this redesign, the data need to be stored in some fixed number of memory
+buffers.
+
+This is a pretty open requirement. I suspect most data formats will be
+storable in this. The user’s guide has an example of data stored in a
+`std::deque` that will not be representable. But that is probably not a
+particularly practical example.
+
+### VTK-m would only be able to support hosts and devices with the same endianness
+
+Because data are transferred as `void *` blocks of memory, there is no way
+to correct words if the endianness of the two devices does not agree. As far
+as I know, there should be no issues with the proposed ECP machines.
+
+If endianness becomes an issue, it might be possible to specify a word length
+in the `Buffer`. That would assume that all numbers stored in the `Buffer`
+have the same word length.
+
+### ArrayPortals must be completely recompiled in each translation unit
+
+We can declare that an `ArrayHandle` does not need to include the device
+adapter header files in part because it no longer needs specialized
+`ArrayPortal`s for each device. However, that means that a translation unit
+compiled with the host compiler (say gcc) will produce different code for
+the `ArrayPortal`s than those with the device compiler (say nvcc). This
+could lead to numerous linking problems.
+
+To get around these issues, we will probably have to enforce no exporting
+of any of the `ArrayPortal` symbols and force them all to be recompiled for
+each translation unit. This will serve to increase the compile times a bit.
+We will probably also still encounter linking errors as there would be no
+way to enforce this requirement.
+
+### Cannot have specialized portals for the control environment
+
+Because the new design unifies `ArrayPortal` types across control and
+execution environments, it is no longer possible to have a special version
+for the control environment to manage resources. This will require removing
+some recent behavior of control portals such as with MR !1988.
--- a/docs/changelog/cuda-no-assert.md
+++ b/docs/changelog/cuda-no-assert.md
@ -0,0 +1,10 @@
+# Disable asserts for CUDA architecture builds
+
+`assert` is supported on recent CUDA cards, but compiling it appears to be
+very slow. By default, the `VTKM_ASSERT` macro has been disabled whenever
+compiling for a CUDA device (i.e. when `__CUDA_ARCH__` is defined).
+
+Asserts for CUDA devices can be turned back on by turning the
+`VTKm_NO_ASSERT_CUDA` CMake variable off. Turning this CMake variable off
+will enable assertions in CUDA kernels unless there is another reason
+for turning off all asserts (such as a release build).
--- a/docs/changelog/deprecate-arrayhandlevirtualcoordinates.md
+++ b/docs/changelog/deprecate-arrayhandlevirtualcoordinates.md
@ -0,0 +1,39 @@
+# Deprecate ArrayHandleVirtualCoordinates
+
+As we port VTK-m to more types of accelerator architectures, supporting
+virtual methods is becoming more problematic. Thus, we are working to back
+out of using virtual methods in the execution environment.
+
+One of the most widespread users of virtual methods in the execution
+environment is `ArrayHandleVirtual`. As a first step of deprecating this
+class, we first deprecate the `ArrayHandleVirtualCoordinates` subclass.
+
+Not surprisingly, `ArrayHandleVirtualCoordinates` is used directly by
+`CoordinateSystem`. The biggest change necessary was that the `GetData`
+method returned an `ArrayHandleVirtualCoordinates`, which obviously would
+not work if that class is deprecated.
+
+An oddness about this return type is that it is quite different from the
+superclass's method of the same name. Rather, `Field` returns a
+`VariantArrayHandle`. Since this had to be corrected anyway, it was decided
+to change `CoordinateSystem`'s `GetData` to also return a
+`VariantArrayHandle`, although its typelist is set to just `vtkm::Vec3f`.
+
+To try to still support old code that expects the deprecated behavior of
+returning an `ArrayHandleVirtualCoordinates`, `CoordinateSystem::GetData`
+actually returns a "hidden" subclass of `VariantArrayHandle` that
+automatically converts itself to an `ArrayHandleVirtualCoordinates`. (A
+deprecation warning is given if this is done.)
+
+This approach to support deprecated code is not perfect. The returned value
+for `CoordinateSystem::GetData` can only be used as an `ArrayHandle` if a
+method is directly called on it or if it is cast specifically to
+`ArrayHandleVirtualCoordinates` or its superclass. For example, if passing
+it to a method argument typed as `vtkm::cont::ArrayHandle<T, S>` where `T`
+and `S` are template parameters, then the conversion will fail.
+
+To continue to support ease of use, `CoordinateSystem` now has a method
+named `GetDataAsMultiplexer` that returns the data as an
+`ArrayHandleMultiplexer`. This can be employed to quickly use the
+`CoordinateSystem` as an array without the overhead of a `CastAndCall`.
+
--- a/docs/changelog/deprecate-virtual-methods.md
+++ b/docs/changelog/deprecate-virtual-methods.md
@ -0,0 +1,17 @@
+# Virtual methods in execution environment deprecated
+
+The use of classes with any virtual methods in the execution environment is
+deprecated. Although we had code to correctly build virtual methods on some
+devices such as CUDA, this feature was not universally supported on all
+programming models we wish to support. Plus, the implementation of virtual
+methods is not hugely convenient on CUDA because the virtual methods could
+not be embedded in a library. To get around virtual methods declared in
+different libraries, all builds had to be static, and a special linking
+step to pull in possible virtual method implementations was required.
+
+For these reasons, VTK-m is no longer relying on virtual methods. (Other
+approaches like multiplexers are used instead.) The code will be officially
+removed in version 2.0. It is still supported in a deprecated sense (you
+should get a warning). However, if you want to build without virtual
+methods, you can set the `VTKm_NO_DEPRECATED_VIRTUAL` CMake flag, and they
+will not be compiled.
--- a/docs/changelog/deprecation.md
+++ b/docs/changelog/deprecation.md
@ -40,11 +40,11 @@ using OldAlias VTKM_DEPRECATED(1.6, "Use NewClass instead.") = NewClass;
 ```

 Functions and methods are marked as deprecated by adding `VTKM_DEPRECATED`
-as a modifier before the return value.
+as a modifier before the return value and any markup (VTKM_CONT, VTKM_EXEC, or VTKM_EXEC_CONT).

 ``` cpp
-VTKM_EXEC_CONT
 VTKM_DEPRECATED(1.6, "You must now specify a tolerance.") void ImportantMethod(double x)
+VTKM_EXEC_CONT
 {
  this->ImportantMethod(x, 1e-6);
 }
@ -83,8 +83,8 @@ support this a pair of macros, `VTKM_DEPRECATED_SUPPRESS_BEGIN` and
 deprecated items should be wrapped in these macros.

 ``` cpp
-VTKM_EXEC_CONT
 VTKM_DEPRECATED(1.6, "You must now specify both a value and tolerance.")
+VTKM_EXEC_CONT
 void ImportantMethod()
 {
  // It can be the case that to implement a deprecated method you need to
--- a/docs/changelog/free-atomic-functions.md
+++ b/docs/changelog/free-atomic-functions.md
@ -0,0 +1,14 @@
+# Add atomic free functions
+
+Previously, all atomic functions were stored in classes named
+`AtomicInterfaceControl` and `AtomicInterfaceExecution`, which required
+you to know at compile time which device was using the methods. That in
+turn means that anything using an atomic needed to be templated on the
+device it is running on.
+
+That can be a big hassle (and is problematic for some code structure).
+Instead, these methods are moved to free functions in the `vtkm`
+namespace. These functions operate like those in `Math.h`. Using
+compiler directives, an appropriate version of the function is compiled
+for the current device the compiler is using.
+
--- a/docs/changelog/hip-no-assert.md
+++ b/docs/changelog/hip-no-assert.md
@ -0,0 +1,12 @@
+# Disable asserts for HIP architecture builds
+
+`assert` is supported on recent HIP cards, but compiling it is very slow,
+as it triggers the usage of `printf`. Currently (ROCm 3.7), `printf`
+has a severe performance penalty and should be avoided when possible.
+By default, the `VTKM_ASSERT` macro has been disabled whenever compiling
+for a HIP device via Kokkos.
+
+Asserts for HIP devices can be turned back on by turning the
+`VTKm_NO_ASSERT_HIP` CMake variable off. Turning this CMake variable off
+will enable assertions in HIP kernels unless there is another reason
+for turning off all asserts (such as a release build).
--- a/docs/changelog/move-std-vector.md
+++ b/docs/changelog/move-std-vector.md
@ -0,0 +1,120 @@
+# Improvements to moving data into ArrayHandle
+
+We have made several improvements to adding data into an `ArrayHandle`.
+
+## Moving data from an `std::vector`
+
+For numerous reasons, it is convenient to define data in a `std::vector`
+and then wrap that into an `ArrayHandle`. There are two obvious ways to do
+this. First, you could deep copy the data into an `ArrayHandle`, which has
+obvious drawbacks. Second, you could take the pointer for the data in the
+`std::vector` and use that as user-allocated memory in the `ArrayHandle`
+without deep copying it. The problem with this shallow copy is that it is
+unsafe. If the `std::vector` goes out of scope (or gets resized), then the
+data the `ArrayHandle` is pointing to becomes unallocated, which will lead
+to unpredictable behavior.
+
+However, there is a third option. It is often the case that an
+`std::vector` is filled and then becomes unused once it is converted to an
+`ArrayHandle`. In this case, what we really want is to pass the data off to
+the `ArrayHandle` so that the `ArrayHandle` is now managing the data and
+not the `std::vector`.
+
+C++11 has a mechanism to do this: move semantics. You can now pass
+variables to functions as an "rvalue" (right-hand value). When something is
+passed as an rvalue, it can pull state out of that variable and move it
+somewhere else. `std::vector` implements this movement so that an rvalue
+can be moved to another `std::vector` without actually copying the data.
+`make_ArrayHandle` now also takes advantage of this feature to move rvalue
+`std::vector`s.
+
+There is a special form of `make_ArrayHandle` named `make_ArrayHandleMove`
+that takes an rvalue. There is also a special overload of
+`make_ArrayHandle` itself that handles an rvalue `vector`. (However, using
+the explicit move version is better if you want to make sure the data is
+actually moved.)
+
+So if you create the `std::vector` in the call to `make_ArrayHandle`, then
+the data only gets created once.
+
+``` cpp
+auto array = vtkm::cont::make_ArrayHandleMove(std::vector<vtkm::Id>{ 2, 6, 1, 7, 4, 3, 9 });
+```
+
+Note that there is now a better way to express an initializer list to
+`ArrayHandle` documented below. But this form of `make_ArrayHandleMove` can be
+particularly useful for initializing an array to all of a particular value.
+For example, an easy way to initialize an array of 1000 elements all to 1
+is
+
+``` cpp
+auto array = vtkm::cont::make_ArrayHandleMove(std::vector<vtkm::Id>(1000, 1));
+```
+
+You can also move the data from an already created `std::vector` by using
+the `std::move` function to convert it to an rvalue. When you do this, the
+`std::vector` becomes invalid after the call and any use will be undefined.
+
+``` cpp
+std::vector<vtkm::Id> vector;
+// fill vector
+
+auto array = vtkm::cont::make_ArrayHandleMove(std::move(vector));
+```
+
+## Make `ArrayHandle` from initializer list
+
+A common use case for using `std::vector` (particularly in our unit tests)
+is to quickly add an initializer list into an `ArrayHandle`. Repeating the
+example from above:
+
+``` cpp
+auto array = vtkm::cont::make_ArrayHandleMove(std::vector<vtkm::Id>{ 2, 6, 1, 7, 4, 3, 9 });
+```
+
+However, creating the `std::vector` should be unnecessary. Why not be able
+to create the `ArrayHandle` directly from an initializer list? Now you can
+by simply passing an initializer list to `make_ArrayHandle`.
+
+``` cpp
+auto array = vtkm::cont::make_ArrayHandle({ 2, 6, 1, 7, 4, 3, 9 });
+```
+
+There is an issue here. The type here can be a little ambiguous (for
+humans). In this case, `array` will be of type
+`vtkm::cont::ArrayHandleBasic<int>`, since that is what an integer literal
+defaults to. This could be a problem if, for example, you want to use
+`array` as an array of `vtkm::Id`, which could be of type `vtkm::Int64`.
+This is easily remedied by specifying the desired value type as a template
+argument to `make_ArrayHandle`.
+
+``` cpp
+auto array = vtkm::cont::make_ArrayHandle<vtkm::Id>({ 2, 6, 1, 7, 4, 3, 9 });
+```
+
+## Deprecated `make_ArrayHandle` with default shallow copy
+
+For historical reasons, passing an `std::vector` or a pointer to
+`make_ArrayHandle` does a shallow copy (i.e. `CopyFlag` defaults to `Off`).
+Although more efficient, this mode is inherently unsafe, and making it the
+default is asking for trouble.
+
+To combat this, calling `make_ArrayHandle` without a copy flag is
+deprecated. In this way, if you wish to do the faster but more unsafe
+creation of an `ArrayHandle` you should explicitly express that.
+
+This required quite a few changes through the VTK-m source (particularly in
+the tests).
+
+## Similar changes to `Field`
+
+`vtkm::cont::Field` has a `make_Field` helper function that is similar to
+`make_ArrayHandle`. It also features the ability to create fields from
+`std::vector`s and C arrays. It also likewise had the same unsafe behavior
+by default of not copying from the source of the arrays.
+
+That behavior has similarly been deprecated. You now have to specify a
+copy flag.
+
+The ability to construct a `Field` from an initializer list of values has
+also been added.
--- a/docs/changelog/unknown-array-handle.md
+++ b/docs/changelog/unknown-array-handle.md
@ -0,0 +1,109 @@
+# UnknownArrayHandle and UncertainArrayHandle for runtime-determined types
+
+Two new classes have been added to VTK-m: `UnknownArrayHandle` and
+`UncertainArrayHandle`. These classes serve the same purpose as the set of
+`VariantArrayHandle` classes and will replace them.
+
+Motivated mostly by the desire to move away from `ArrayHandleVirtual`, we
+have multiple reasons to completely refactor the `VariantArrayHandle`
+class. These include changing the implementation, some behavior, and even
+the name.
+
+## Motivation
+
+We have several reasons that have accumulated to revisit the implementation
+of `VariantArrayHandle`.
+
+### Move away from `ArrayHandleVirtual`
+
+The current implementation of `VariantArrayHandle` internally stores the
+array wrapped in an `ArrayHandleVirtual`. That makes sense since you might
+as well consolidate the hierarchy of virtual objects into one.
+
+Except `ArrayHandleVirtual` is being deprecated, so it no longer makes
+sense to use that internally.
+
+So we will transition the class back to managing the data as typeless on
+its own. We will consider using function pointers rather than actual
+virtual functions because compilers can be slow in creating lots of virtual
+subclasses.
+
+### Reintroduce storage tag lists
+
+The original implementation of `VariantArrayHandle` (which at the time was
+called `DynamicArrayHandle`) actually had two type lists: one for the array
+value type and one for the storage type. The storage type list was removed
+soon after `ArrayHandleVirtual` was introduced because whatever the type of
+array it could be accessed as `ArrayHandleVirtual`.
+
+However, with `ArrayHandleVirtual` being deprecated, this feature is no
+longer possible. We are in need again for the list of storage types to try.
+Thus, we need to reintroduce this template argument to
+`VariantArrayHandle`.
+
+### More clear name
+
+The name of this class has always been unsatisfactory. The first name,
+`DynamicArrayHandle`, makes it sound like the data is always changing. The
+second name, `VariantArrayHandle`, makes it sound like an array that holds
+a value type that can vary (like an `std::variant`).
+
+We can use a more clear name that expresses better that it is holding an
+`ArrayHandle` of an _unknown_ type.
+
+### Take advantage of default types for less templating
+
+Once upon a time everything in VTK-m was a templated header library. Things
+have changed quite a bit since then. The most recent development is the
+ability to select the "default types" with CMake configuration that allows
+you to select a global set of types you care about during compilation. This
+is so units like filters can be compiled into a library with all types we
+care about, and we don't have to constantly recompile units.
+
+This means that we are becoming less concerned about maintaining type lists
+everywhere. Often we can drop the type list and pass data across libraries.
+
+With that in mind, it makes less sense for `VariantArrayHandle` to actually
+be a `using` alias for `VariantArrayHandleBase<VTKM_DEFAULT_TYPE_LIST>`.
+
+In response, we can reverse the is-a relationship between the two. Have a
+completely typeless version as the base class and have a second,
+templated version to express when the type of the array has been partially
+narrowed down to the given type lists.
+
+## New Name and Structure
+
+The ultimate purpose of this class is to store an `ArrayHandle` where the
+value and storage types are unknown. Thus, an appropriate name for the
+class is `UnknownArrayHandle`.
+
+`UnknownArrayHandle` is _not_ templated. It simply stores an `ArrayHandle`
+in a typeless (`void *`) buffer. It does, however, contain many templated
+methods that allow you to query whether the contained array matches given
+types, to cast to given types, and to cast and call to a given functor
+(from either given type lists or default lists).
+
+Rather than have a virtual class structure to manage the typeless array,
+the new management will use function pointers. This has been shown to sometimes
+improve compile times and generate less code.
+
+Sometimes it is the case that the set of potential types can be narrowed. In
+this case, the array ceases to be unknown and becomes _uncertain_. Thus,
+the companion class to `UnknownArrayHandle` is `UncertainArrayHandle`.
+
+`UncertainArrayHandle` has two template parameters: a list of potential
+value types and a list of potential storage types. The behavior of
+`UncertainArrayHandle` matches that of `UnknownArrayHandle` (and might
+inherit from it). However, for `CastAndCall` operations, it will use the
+type lists defined in its template parameters.
+
+## Serializing UnknownArrayHandle
+
+Because `UnknownArrayHandle` is not templated, it contains some
+opportunities to compile things into the `vtkm_cont` library. Templated
+methods like `CastAndCall` cannot be, but the specializations of DIY's
+serialize can be.
+
+And since it only has to be compiled once into a library, we can spend some
+extra time compiling for more types. We don't have to restrict ourselves to
+`VTKM_DEFAULT_TYPE_LIST`. We can compile for vtkm::TypeListTagAll.
--- a/docs/changelog/write-uniform-rectilinear.md
+++ b/docs/changelog/write-uniform-rectilinear.md
@ -0,0 +1,13 @@
+# Write uniform and rectilinear grids to legacy VTK files
+
+As a programming convenience, all `vtkm::cont::DataSet` written by
+`vtkm::io::VTKDataSetWriter` were written as a structured grid. Although
+technically correct, it changed the structure of the data. This meant that
+if you wanted to capture data to run elsewhere, it would run as a different
+data type. This was particularly frustrating if the data of that structure
+was causing problems and you wanted to debug it.
+
+Now, `VTKDataSetWriter` checks the type of the `CoordinateSystem` to
+determine whether the data should be written out as `STRUCTURED_POINTS`
+(i.e. a uniform grid), `RECTILINEAR_GRID`, or `STRUCTURED_GRID`
+(curvilinear).
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -19,14 +19,16 @@ if(VTKm_ENABLE_EXAMPLES)
  add_subdirectory(contour_tree_distributed)
  add_subdirectory(cosmotools)
  add_subdirectory(demo)
-  #add_subdirectory(game_of_life)
+  add_subdirectory(game_of_life)
  add_subdirectory(hello_worklet)
  add_subdirectory(histogram)
+  add_subdirectory(ising)
  add_subdirectory(lagrangian)
  add_subdirectory(mesh_quality)
  add_subdirectory(multi_backend)
  add_subdirectory(oscillator)
  add_subdirectory(particle_advection)
+  add_subdirectory(streamline_mpi)
  add_subdirectory(polyline_archimedean_helix)
  add_subdirectory(redistribute_points)
  add_subdirectory(temporal_advection)
--- a/examples/contour_tree_augmented/ContourTreeApp.cxx
+++ b/examples/contour_tree_augmented/ContourTreeApp.cxx
@ -370,28 +370,14 @@ int main(int argc, char* argv[])
    VTKM_LOG_IF_S(vtkm::cont::LogLevel::Info,
                  numLevels > 0,
                  std::endl
-                    << "    ------------ Settings Isolevel Selection -----------"
-                    << std::endl
-                    << "    levels="
-                    << numLevels
-                    << std::endl
-                    << "    eps="
-                    << eps
-                    << std::endl
-                    << "    comp"
-                    << numComp
-                    << std::endl
-                    << "    type="
-                    << contourType
-                    << std::endl
-                    << "    method="
-                    << contourSelectMethod
-                    << std::endl
-                    << "    mc="
-                    << useMarchingCubes
-                    << std::endl
-                    << "    use"
-                    << (usePersistenceSorter ? "PersistenceSorter" : "VolumeSorter"));
+                    << "    ------------ Settings Isolevel Selection -----------" << std::endl
+                    << "    levels=" << numLevels << std::endl
+                    << "    eps=" << eps << std::endl
+                    << "    comp" << numComp << std::endl
+                    << "    type=" << contourType << std::endl
+                    << "    method=" << contourSelectMethod << std::endl
+                    << "    mc=" << useMarchingCubes << std::endl
+                    << "    use" << (usePersistenceSorter ? "PersistenceSorter" : "VolumeSorter"));
  }
  currTime = totalTime.GetElapsedTime();
  vtkm::Float64 startUpTime = currTime - prevTime;
@ -401,8 +387,8 @@ int main(int argc, char* argv[])
 #ifdef WITH_MPI
 #ifdef DEBUG_PRINT
  // From https://www.unix.com/302983597-post2.html
-  char* cstr_filename = new char[15];
-  snprintf(cstr_filename, sizeof(filename), "cout_%d.log", rank);
+  char cstr_filename[32];
+  snprintf(cstr_filename, sizeof(cstr_filename), "cout_%d.log", rank);
  int out = open(cstr_filename, O_RDWR | O_CREAT | O_APPEND, 0600);
  if (-1 == out)
  {
@ -431,8 +417,6 @@ int main(int argc, char* argv[])
    perror("cannot redirect stderr");
    return 255;
  }
-
-  delete[] cstr_filename;
 #endif
 #endif

@ -458,23 +442,27 @@ int main(int argc, char* argv[])
    // Copy the data into the values array so we can construct a multiblock dataset
    // TODO All we should need to do to implement BOV support is to copy the values
    // in the values vector and copy the dimensions in the dims vector
-    vtkm::Id nRows, nCols, nSlices;
-    vtkm::worklet::contourtree_augmented::GetRowsColsSlices temp;
-    temp(inDataSet.GetCellSet(), nRows, nCols, nSlices);
-    dims[0] = nRows;
-    dims[1] = nCols;
-    dims[2] = nSlices;
-    auto tempField = inDataSet.GetField("values").GetData();
-    values.resize(static_cast<std::size_t>(tempField.GetNumberOfValues()));
-    auto tempFieldHandle = tempField.AsVirtual<ValueType>().ReadPortal();
-    for (vtkm::Id i = 0; i < tempField.GetNumberOfValues(); i++)
-    {
-      values[static_cast<std::size_t>(i)] = static_cast<ValueType>(tempFieldHandle.Get(i));
-    }
+    vtkm::Id3 meshSize;
+    vtkm::worklet::contourtree_augmented::GetPointDimensions temp;
+    temp(inDataSet.GetCellSet(), meshSize);
+    dims[0] = meshSize[0];
+    dims[1] = meshSize[1];
+    dims[2] = meshSize[2];
+    // TODO/FIXME: The following is commented out since it creates a warning that
+    // AsVirtual() will no longer be supported. Since this implementation is
+    // incomplete anyway, it currently makes more sense to comment it out than
+    // to fix the warning.
+    // auto tempField = inDataSet.GetField("values").GetData();
+    // values.resize(static_cast<std::size_t>(tempField.GetNumberOfValues()));
+    // auto tempFieldHandle = tempField.AsVirtual<ValueType>().ReadPortal();
+    // for (vtkm::Id i = 0; i < tempField.GetNumberOfValues(); i++)
+    // {
+    //   values[static_cast<std::size_t>(i)] = static_cast<ValueType>(tempFieldHandle.Get(i));
+    // }
    VTKM_LOG_S(vtkm::cont::LogLevel::Error,
               "BOV reader not yet support in MPI mode by this example");
    MPI_Finalize();
-    return EXIT_SUCCESS;
+    return EXIT_FAILURE;
 #endif
  }
  else // Read ASCII data input
@ -529,6 +517,9 @@ int main(int argc, char* argv[])
    dataReadTime = currTime - prevTime;
    prevTime = currTime;

+    // swap dims order
+    std::swap(dims[0], dims[1]);
+
 #ifndef WITH_MPI // We only need the inDataSet if are not using MPI otherwise we'll constructe a multi-block dataset
    // build the input dataset
    vtkm::cont::DataSetBuilderUniform dsb;
@ -536,16 +527,16 @@ int main(int argc, char* argv[])
    if (nDims == 2)
    {
      vtkm::Id2 vdims;
-      vdims[0] = static_cast<vtkm::Id>(dims[1]);
-      vdims[1] = static_cast<vtkm::Id>(dims[0]);
+      vdims[0] = static_cast<vtkm::Id>(dims[0]);
+      vdims[1] = static_cast<vtkm::Id>(dims[1]);
      inDataSet = dsb.Create(vdims);
    }
    // 3D data
    else
    {
      vtkm::Id3 vdims;
-      vdims[0] = static_cast<vtkm::Id>(dims[1]);
-      vdims[1] = static_cast<vtkm::Id>(dims[0]);
+      vdims[0] = static_cast<vtkm::Id>(dims[0]);
+      vdims[1] = static_cast<vtkm::Id>(dims[1]);
      vdims[2] = static_cast<vtkm::Id>(dims[2]);
      inDataSet = dsb.Create(vdims);
    }
@ -558,19 +549,17 @@ int main(int argc, char* argv[])
  {
    VTKM_LOG_S(vtkm::cont::LogLevel::Info,
               std::endl
-                 << "    ---------------- Input Mesh Properties --------------"
-                 << std::endl
-                 << "    Number of dimensions: "
-                 << nDims);
+                 << "    ---------------- Input Mesh Properties --------------" << std::endl
+                 << "    Number of dimensions: " << nDims);
  }

  // Check if marching cubes is enabled for non 3D data
  bool invalidMCOption = (useMarchingCubes && nDims != 3);
-  VTKM_LOG_IF_S(
-    vtkm::cont::LogLevel::Error,
-    invalidMCOption && (rank == 0),
-    "The input mesh is " << nDims << "D. "
-                         << "Contour tree using marching cubes is only supported for 3D data.");
+  VTKM_LOG_IF_S(vtkm::cont::LogLevel::Error,
+                invalidMCOption && (rank == 0),
+                "The input mesh is "
+                  << nDims << "D. "
+                  << "Contour tree using marching cubes is only supported for 3D data.");

  // If we found any errors in the setttings than finalize MPI and exit the execution
  if (invalidMCOption)
@ -583,7 +572,7 @@ int main(int argc, char* argv[])

 #ifndef WITH_MPI                              // construct regular, single-block VTK-M input dataset
  vtkm::cont::DataSet useDataSet = inDataSet; // Single block dataset
-#else                                         // Create a multi-block dataset for multi-block DIY-paralle processing
+#else  // Create a multi-block dataset for multi-block DIY-paralle processing
  vtkm::cont::PartitionedDataSet useDataSet; // Partitioned variant of the input dataset
  vtkm::Id3 blocksPerDim =
    nDims == 3 ? vtkm::Id3(1, 1, numBlocks) : vtkm::Id3(1, numBlocks, 1); // Decompose the data into
@ -610,8 +599,8 @@ int main(int argc, char* argv[])
    {
      VTKM_LOG_IF_S(vtkm::cont::LogLevel::Error,
                    rank == 0,
-                    "Number of ranks to large for data. Use " << lastDimSize / 2
-                                                              << "or fewer ranks");
+                    "Number of ranks too large for data. Use " << lastDimSize / 2
+                                                               << "or fewer ranks");
      MPI_Finalize();
      return EXIT_FAILURE;
    }
@ -645,8 +634,8 @@ int main(int argc, char* argv[])
      if (nDims == 2)
      {
        vtkm::Id2 vdims;
-        vdims[0] = static_cast<vtkm::Id>(currBlockSize);
-        vdims[1] = static_cast<vtkm::Id>(dims[0]);
+        vdims[0] = static_cast<vtkm::Id>(dims[0]);
+        vdims[1] = static_cast<vtkm::Id>(currBlockSize);
        vtkm::Vec<ValueType, 2> origin(0, blockIndex * blockSize);
        vtkm::Vec<ValueType, 2> spacing(1, 1);
        ds = dsb.Create(vdims, origin, spacing);
@ -661,8 +650,8 @@ int main(int argc, char* argv[])
      else
      {
        vtkm::Id3 vdims;
-        vdims[0] = static_cast<vtkm::Id>(dims[0]);
-        vdims[1] = static_cast<vtkm::Id>(dims[1]);
+        vdims[0] = static_cast<vtkm::Id>(dims[1]);
+        vdims[1] = static_cast<vtkm::Id>(dims[0]);
        vdims[2] = static_cast<vtkm::Id>(currBlockSize);
        vtkm::Vec<ValueType, 3> origin(0, 0, (blockIndex * blockSize));
        vtkm::Vec<ValueType, 3> spacing(1, 1, 1);
@ -683,7 +672,7 @@ int main(int argc, char* argv[])
      useDataSet.AppendPartition(ds);
    }
  }
-#endif                                        // WITH_MPI construct input dataset
+#endif // WITH_MPI construct input dataset

  currTime = totalTime.GetElapsedTime();
  buildDatasetTime = currTime - prevTime;
@ -706,6 +695,21 @@ int main(int argc, char* argv[])
  vtkm::Float64 computeContourTreeTime = currTime - prevTime;
  prevTime = currTime;

+#ifdef WITH_MPI
+#ifdef DEBUG_PRINT
+  std::cout << std::flush;
+  close(out);
+  std::cerr << std::flush;
+  close(err);
+
+  dup2(save_out, fileno(stdout));
+  dup2(save_err, fileno(stderr));
+
+  close(save_out);
+  close(save_err);
+#endif
+#endif
+
  ////////////////////////////////////////////
  // Compute the branch decomposition
  ////////////////////////////////////////////
@ -719,12 +723,12 @@ int main(int argc, char* argv[])
    ctaug_ns::IdArrayType superarcDependentWeight;
    ctaug_ns::IdArrayType supernodeTransferWeight;
    ctaug_ns::IdArrayType hyperarcDependentWeight;
-    ctaug_ns::ProcessContourTree::ComputeVolumeWeights(filter.GetContourTree(),
-                                                       filter.GetNumIterations(),
-                                                       superarcIntrinsicWeight,  // (output)
-                                                       superarcDependentWeight,  // (output)
-                                                       supernodeTransferWeight,  // (output)
-                                                       hyperarcDependentWeight); // (output)
+    ctaug_ns::ProcessContourTree::ComputeVolumeWeightsSerial(filter.GetContourTree(),
+                                                             filter.GetNumIterations(),
+                                                             superarcIntrinsicWeight,  // (output)
+                                                             superarcDependentWeight,  // (output)
+                                                             supernodeTransferWeight,  // (output)
+                                                             hyperarcDependentWeight); // (output)
    // Record the timings for the branch decomposition
    std::stringstream timingsStream; // Use a string stream to log in one message
    timingsStream << std::endl;
@ -740,14 +744,14 @@ int main(int argc, char* argv[])
    ctaug_ns::IdArrayType branchMaximum;
    ctaug_ns::IdArrayType branchSaddle;
    ctaug_ns::IdArrayType branchParent;
-    ctaug_ns::ProcessContourTree::ComputeVolumeBranchDecomposition(filter.GetContourTree(),
-                                                                   superarcDependentWeight,
-                                                                   superarcIntrinsicWeight,
-                                                                   whichBranch,   // (output)
-                                                                   branchMinimum, // (output)
-                                                                   branchMaximum, // (output)
-                                                                   branchSaddle,  // (output)
-                                                                   branchParent); // (output)
+    ctaug_ns::ProcessContourTree::ComputeVolumeBranchDecompositionSerial(filter.GetContourTree(),
+                                                                         superarcDependentWeight,
+                                                                         superarcIntrinsicWeight,
+                                                                         whichBranch,   // (output)
+                                                                         branchMinimum, // (output)
+                                                                         branchMaximum, // (output)
+                                                                         branchSaddle,  // (output)
+                                                                         branchParent); // (output)
    // Record and log the branch decompostion timings
    timingsStream << "    " << std::setw(38) << std::left << "Compute Volume Branch Decomposition"
                  << ": " << branchDecompTimer.GetElapsedTime() << " seconds" << std::endl;
@ -866,116 +870,47 @@ int main(int argc, char* argv[])
  currTime = totalTime.GetElapsedTime();
  VTKM_LOG_S(vtkm::cont::LogLevel::Info,
             std::endl
-               << "    -------------------------- Totals "
-               << rank
-               << " -----------------------------"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Start-up"
-               << ": "
-               << startUpTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Data Read"
-               << ": "
-               << dataReadTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Build VTKM Dataset"
-               << ": "
-               << buildDatasetTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Compute Contour Tree"
-               << ": "
-               << computeContourTreeTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Compute Branch Decomposition"
-               << ": "
-               << computeBranchDecompTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Total Time"
-               << ": "
-               << currTime
-               << " seconds");
+               << "    -------------------------- Totals " << rank
+               << " -----------------------------" << std::endl
+               << std::setw(42) << std::left << "    Start-up"
+               << ": " << startUpTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Data Read"
+               << ": " << dataReadTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Build VTKM Dataset"
+               << ": " << buildDatasetTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Compute Contour Tree"
+               << ": " << computeContourTreeTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Compute Branch Decomposition"
+               << ": " << computeBranchDecompTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Total Time"
+               << ": " << currTime << " seconds");

  const ctaug_ns::ContourTree& ct = filter.GetContourTree();
  VTKM_LOG_S(vtkm::cont::LogLevel::Info,
             std::endl
-               << "    ---------------- Contour Tree Array Sizes ---------------------"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Nodes"
-               << ": "
-               << ct.Nodes.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Arcs"
-               << ": "
-               << ct.Arcs.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Superparents"
-               << ": "
-               << ct.Superparents.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Superarcs"
-               << ": "
-               << ct.Superarcs.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Supernodes"
-               << ": "
-               << ct.Supernodes.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Hyperparents"
-               << ": "
-               << ct.Hyperparents.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #WhenTransferred"
-               << ": "
-               << ct.WhenTransferred.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Hypernodes"
-               << ": "
-               << ct.Hypernodes.GetNumberOfValues()
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    #Hyperarcs"
-               << ": "
-               << ct.Hyperarcs.GetNumberOfValues()
-               << std::endl);
+               << "    ---------------- Contour Tree Array Sizes ---------------------" << std::endl
+               << std::setw(42) << std::left << "    #Nodes"
+               << ": " << ct.Nodes.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #Arcs"
+               << ": " << ct.Arcs.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #Superparents"
+               << ": " << ct.Superparents.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #Superarcs"
+               << ": " << ct.Superarcs.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #Supernodes"
+               << ": " << ct.Supernodes.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #Hyperparents"
+               << ": " << ct.Hyperparents.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #WhenTransferred"
+               << ": " << ct.WhenTransferred.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #Hypernodes"
+               << ": " << ct.Hypernodes.GetNumberOfValues() << std::endl
+               << std::setw(42) << std::left << "    #Hyperarcs"
+               << ": " << ct.Hyperarcs.GetNumberOfValues() << std::endl);
  // Print hyperstructure statistics
  VTKM_LOG_S(vtkm::cont::LogLevel::Info,
             std::endl
-               << ct.PrintHyperStructureStatistics(false)
-               << std::endl);
+               << ct.PrintHyperStructureStatistics(false) << std::endl);

  // Flush ouput streams just to make sure everything has been logged (in particular when using MPI)
  std::cout << std::flush;
--- a/examples/contour_tree_distributed/CMakeLists.txt
+++ b/examples/contour_tree_distributed/CMakeLists.txt
@ -60,7 +60,7 @@ find_package(VTKm REQUIRED QUIET)
 ####################################
 if (VTKm_ENABLE_MPI)
  add_executable(ContourTree_Distributed ContourTreeApp.cxx)
-  target_link_libraries(ContourTree_Distributed vtkm_filter)
+  target_link_libraries(ContourTree_Distributed vtkm_filter MPI::MPI_CXX)
  vtkm_add_target_information(ContourTree_Distributed
                              MODIFY_CUDA_FLAGS
                              DEVICE_SOURCES ContourTreeApp.cxx)
--- a/examples/contour_tree_distributed/ContourTreeApp.cxx
+++ b/examples/contour_tree_distributed/ContourTreeApp.cxx
@ -160,7 +160,7 @@ int main(int argc, char* argv[])
  auto comm = MPI_COMM_WORLD;

  // Tell VTK-m which communicator it should use.
-  vtkm::cont::EnvironmentTracker::SetCommunicator(vtkmdiy::mpi::communicator(comm));
+  vtkm::cont::EnvironmentTracker::SetCommunicator(vtkmdiy::mpi::communicator());

  // get the rank and size
  int rank, size;
@ -252,25 +252,14 @@ int main(int argc, char* argv[])
  {
    VTKM_LOG_S(vtkm::cont::LogLevel::Info,
               std::endl
-                 << "    ------------ Settings -----------"
-                 << std::endl
-                 << "    filename="
-                 << filename
-                 << std::endl
-                 << "    device="
-                 << device.GetName()
-                 << std::endl
-                 << "    mc="
-                 << useMarchingCubes
-                 << std::endl
+                 << "    ------------ Settings -----------" << std::endl
+                 << "    filename=" << filename << std::endl
+                 << "    device=" << device.GetName() << std::endl
+                 << "    mc=" << useMarchingCubes << std::endl
 #ifdef ENABLE_SET_NUM_THREADS
-                 << "    numThreads="
-                 << numThreads
-                 << std::endl
+                 << "    numThreads=" << numThreads << std::endl
 #endif
-                 << "    nblocks="
-                 << numBlocks
-                 << std::endl);
+                 << "    nblocks=" << numBlocks << std::endl);
  }
  currTime = totalTime.GetElapsedTime();
  vtkm::Float64 startUpTime = currTime - prevTime;
@ -341,14 +330,9 @@ int main(int argc, char* argv[])
  {
    VTKM_LOG_S(vtkm::cont::LogLevel::Info,
               std::endl
-                 << "    ---------------- Input Mesh Properties --------------"
-                 << std::endl
-                 << "    Number of dimensions: "
-                 << nDims
-                 << std::endl
-                 << "    Number of mesh vertices: "
-                 << numVertices
-                 << std::endl);
+                 << "    ---------------- Input Mesh Properties --------------" << std::endl
+                 << "    Number of dimensions: " << nDims << std::endl
+                 << "    Number of mesh vertices: " << numVertices << std::endl);
  }

  // Check for fatal input errors
@ -359,13 +343,14 @@ int main(int argc, char* argv[])
  // Log any errors if found on rank 0
  VTKM_LOG_IF_S(vtkm::cont::LogLevel::Error,
                invalidNumDimensions && (rank == 0),
-                "The input mesh is " << nDims << "D. "
-                                                 "The input data must be either 2D or 3D.");
-  VTKM_LOG_IF_S(
-    vtkm::cont::LogLevel::Error,
-    invalidMCOption && (rank == 0),
-    "The input mesh is " << nDims << "D. "
-                         << "Contour tree using marching cubes is only supported for 3D data.");
+                "The input mesh is " << nDims
+                                     << "D. "
+                                        "The input data must be either 2D or 3D.");
+  VTKM_LOG_IF_S(vtkm::cont::LogLevel::Error,
+                invalidMCOption && (rank == 0),
+                "The input mesh is "
+                  << nDims << "D. "
+                  << "Contour tree using marching cubes is only supported for 3D data.");
  // If we found any errors in the setttings than finalize MPI and exit the execution
  if (invalidNumDimensions || invalidMCOption)
  {
@ -519,44 +504,18 @@ int main(int argc, char* argv[])
  currTime = totalTime.GetElapsedTime();
  VTKM_LOG_S(vtkm::cont::LogLevel::Info,
             std::endl
-               << "    -------------------------- Totals "
-               << rank
-               << " -----------------------------"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Start-up"
-               << ": "
-               << startUpTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Data Read"
-               << ": "
-               << dataReadTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Build VTKM Dataset"
-               << ": "
-               << buildDatasetTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Compute Contour Tree"
-               << ": "
-               << computeContourTreeTime
-               << " seconds"
-               << std::endl
-               << std::setw(42)
-               << std::left
-               << "    Total Time"
-               << ": "
-               << currTime
-               << " seconds");
+               << "    -------------------------- Totals " << rank
+               << " -----------------------------" << std::endl
+               << std::setw(42) << std::left << "    Start-up"
+               << ": " << startUpTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Data Read"
+               << ": " << dataReadTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Build VTKM Dataset"
+               << ": " << buildDatasetTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Compute Contour Tree"
+               << ": " << computeContourTreeTime << " seconds" << std::endl
+               << std::setw(42) << std::left << "    Total Time"
+               << ": " << currTime << " seconds");

  // Flush ouput streams just to make sure everything has been logged (in particular when using MPI)
  std::cout << std::flush;
--- a/examples/cosmotools/CosmoCenterFinder.cxx
+++ b/examples/cosmotools/CosmoCenterFinder.cxx
@ -52,11 +52,11 @@ void TestCosmoCenterFinder(const char* fileName)
  }

  vtkm::cont::ArrayHandle<vtkm::Float32> xLocArray =
-    vtkm::cont::make_ArrayHandle<vtkm::Float32>(xLocation, nParticles);
+    vtkm::cont::make_ArrayHandle<vtkm::Float32>(xLocation, nParticles, vtkm::CopyFlag::Off);
  vtkm::cont::ArrayHandle<vtkm::Float32> yLocArray =
-    vtkm::cont::make_ArrayHandle<vtkm::Float32>(yLocation, nParticles);
+    vtkm::cont::make_ArrayHandle<vtkm::Float32>(yLocation, nParticles, vtkm::CopyFlag::Off);
  vtkm::cont::ArrayHandle<vtkm::Float32> zLocArray =
-    vtkm::cont::make_ArrayHandle<vtkm::Float32>(zLocation, nParticles);
+    vtkm::cont::make_ArrayHandle<vtkm::Float32>(zLocation, nParticles, vtkm::CopyFlag::Off);

  // Output MBP particleId pairs array
  vtkm::Pair<vtkm::Id, vtkm::Float32> nxnResult;
--- a/examples/cosmotools/CosmoHaloFinder.cxx
+++ b/examples/cosmotools/CosmoHaloFinder.cxx
@ -53,11 +53,11 @@ void TestCosmoHaloFinder(const char* fileName)
  }

  vtkm::cont::ArrayHandle<vtkm::Float32> xLocArray =
-    vtkm::cont::make_ArrayHandle<vtkm::Float32>(xLocation, nParticles);
+    vtkm::cont::make_ArrayHandleMove<vtkm::Float32>(xLocation, nParticles);
  vtkm::cont::ArrayHandle<vtkm::Float32> yLocArray =
-    vtkm::cont::make_ArrayHandle<vtkm::Float32>(yLocation, nParticles);
+    vtkm::cont::make_ArrayHandleMove<vtkm::Float32>(yLocation, nParticles);
  vtkm::cont::ArrayHandle<vtkm::Float32> zLocArray =
-    vtkm::cont::make_ArrayHandle<vtkm::Float32>(zLocation, nParticles);
+    vtkm::cont::make_ArrayHandleMove<vtkm::Float32>(zLocation, nParticles);

  // Output halo id, mbp id and min potential per particle
  vtkm::cont::ArrayHandle<vtkm::Id> resultHaloId;
@ -88,10 +88,6 @@ void TestCosmoHaloFinder(const char* fileName)
  xLocArray.ReleaseResources();
  yLocArray.ReleaseResources();
  zLocArray.ReleaseResources();
-
-  delete[] xLocation;
-  delete[] yLocation;
-  delete[] zLocation;
 }

 /////////////////////////////////////////////////////////////////////
--- a/examples/demo/Demo.cxx
+++ b/examples/demo/Demo.cxx
@ -25,10 +25,10 @@
 // write that image to a file. It then computes an isosurface on the input data set and renders
 // this output data set in a separate image file

-using vtkm::rendering::MapperVolume;
-using vtkm::rendering::MapperRayTracer;
-using vtkm::rendering::MapperWireframer;
 using vtkm::rendering::CanvasRayTracer;
+using vtkm::rendering::MapperRayTracer;
+using vtkm::rendering::MapperVolume;
+using vtkm::rendering::MapperWireframer;

 int main(int argc, char* argv[])
 {
--- a/examples/game_of_life/GameOfLife.cxx
+++ b/examples/game_of_life/GameOfLife.cxx
@ -17,7 +17,6 @@
 #include <iostream>
 #include <random>

-#include <vtkm/Math.h>
 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/ArrayHandleCounting.h>
 #include <vtkm/cont/DataSetBuilderUniform.h>
@ -29,10 +28,8 @@
 #include <vtkm/filter/FilterDataSet.h>
 #include <vtkm/worklet/WorkletPointNeighborhood.h>

+#include <vtkm/cont/Invoker.h>
 #include <vtkm/cont/TryExecute.h>
-#include <vtkm/cont/cuda/DeviceAdapterCuda.h>
-#include <vtkm/cont/serial/DeviceAdapterSerial.h>
-#include <vtkm/cont/tbb/DeviceAdapterTBB.h>

 //Suppress warnings about glut being deprecated on OSX
 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
--- a/examples/ising/CMakeLists.txt
+++ b/examples/ising/CMakeLists.txt
@ -0,0 +1,21 @@
+##============================================================================
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##============================================================================
+cmake_minimum_required(VERSION 3.12...3.15 FATAL_ERROR)
+project(IsingModel CXX)
+
+#Find the VTK-m package
+find_package(VTKm REQUIRED QUIET)
+
+add_executable(Ising Ising.cxx)
+target_link_libraries(Ising PRIVATE vtkm_worklet vtkm_io vtkm_rendering)
+
+vtkm_add_target_information(Ising
+                            DROP_UNUSED_SYMBOLS MODIFY_CUDA_FLAGS
+                            DEVICE_SOURCES Ising.cxx)
--- a/examples/ising/Ising.cxx
+++ b/examples/ising/Ising.cxx
@ -0,0 +1,122 @@
+//
+// Created by ollie on 7/8/20.
+//
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+
+/// Simulation of ferromagnetism using the Ising Model
+/// Reference: Computational Physics 2nd Edition, Nicholas Giordano & Hisao Nakanishi
+
+#include <iomanip>
+#include <vtkm/cont/ArrayHandleRandomUniformReal.h>
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/DataSetBuilderUniform.h>
+#include <vtkm/cont/Initialize.h>
+#include <vtkm/rendering/CanvasRayTracer.h>
+#include <vtkm/rendering/MapperRayTracer.h>
+#include <vtkm/rendering/Scene.h>
+#include <vtkm/rendering/View2D.h>
+#include <vtkm/worklet/WorkletCellNeighborhood.h>
+
+/// Functor mapping a sample to a spin value: yields +1 when the sample is
+/// greater than 0.5 and -1 otherwise.
+struct UpDown
+{
+  VTKM_EXEC_CONT vtkm::Float32 operator()(vtkm::Float32 p) const
+  {
+    if (p > 0.5)
+    {
+      return 1.0f;
+    }
+    return -1.0f;
+  }
+};
+
+/// Creates a 2D uniform data set of the given dimensions and attaches a cell
+/// field named "spins" with one value per cell. Each value is a uniform random
+/// sample passed through UpDown, i.e. either +1 or -1.
+vtkm::cont::DataSet SpinField(vtkm::Id2 dims)
+{
+  const vtkm::Vec2f origin{ 0, 0 };
+  const vtkm::Vec2f spacing{ 1, 1 };
+  auto result = vtkm::cont::DataSetBuilderUniform::Create(dims, origin, spacing);
+
+  // Copy the transformed random samples into a plain array for the data set.
+  vtkm::cont::ArrayHandleRandomUniformReal<vtkm::Float32> samples(result.GetNumberOfCells());
+  vtkm::cont::ArrayHandle<vtkm::Float32> spins;
+  vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleTransform(samples, UpDown{}), spins);
+  result.AddCellField("spins", spins);
+
+  return result;
+}
+
+/// Worklet performing one update of the spin field. For every cell it reads the
+/// previous spins in the cell's neighborhood together with a random sample and
+/// writes the cell's new spin.
+struct UpdateSpins : public vtkm::worklet::WorkletCellNeighborhood
+{
+  using ControlSignature = void(CellSetIn,
+                                FieldInNeighborhood prevspin,
+                                FieldIn prob,
+                                FieldOut spin);
+  using ExecutionSignature = void(_2, _3, _4);
+
+  template <typename NeighIn>
+  VTKM_EXEC_CONT void operator()(const NeighIn& prevspin,
+                                 vtkm::Float32 p,
+                                 vtkm::Float32& spin) const
+  {
+    // TODO: what is the real value and unit of the exchange constant J and Boltzmann constant kB?
+    const vtkm::Float32 J = 1.f;
+    const vtkm::Float32 kB = 1.f;
+    // TODO: temperature in Kelvin
+    const vtkm::Float32 T = 5.f;
+    const auto mySpin = prevspin.Get(0, 0, 0);
+
+    // 1. Energy of flipping: J times this spin times the sum of the eight
+    //    surrounding spins in the same plane.
+    const auto neighborSum =
+      (prevspin.Get(-1, -1, 0) + prevspin.Get(-1, 0, 0) + prevspin.Get(-1, 1, 0) +
+       prevspin.Get(0, -1, 0) + prevspin.Get(0, 1, 0) + prevspin.Get(1, -1, 0) +
+       prevspin.Get(1, 0, 0) + prevspin.Get(1, 1, 0));
+    vtkm::Float32 E_flip = J * mySpin * neighborSum;
+
+    // 2. A non-positive E_flip always flips the spin.
+    // 3. Otherwise the spin flips only when the random sample p does not exceed
+    //    the Boltzmann factor exp(-E_flip / (kB * T)).
+    const bool flip = (E_flip <= 0) || (p <= vtkm::Exp(-E_flip / (kB * T)));
+    if (flip)
+    {
+      spin = -1.f * mySpin;
+    }
+    else
+    {
+      spin = mySpin;
+    }
+  }
+};
+
+// Entry point of the Ising example: initializes VTK-m, builds a 5x5 spin field,
+// renders it to spin0.png, then runs nine update steps and saves the images
+// spin1.png through spin9.png.
+int main(int argc, char** argv)
+{
+  auto opts =
+    vtkm::cont::InitializeOptions::DefaultAnyDevice | vtkm::cont::InitializeOptions::Strict;
+  vtkm::cont::Initialize(argc, argv, opts);
+
+  auto dataSet = SpinField({ 5, 5 });
+  vtkm::cont::ArrayHandle<vtkm::Float32> spins;
+  // NOTE(review): the images below only change between steps if `spins` shares
+  // storage with the data set's "spins" field after CopyTo -- confirm.
+  dataSet.GetCellField("spins").GetData().CopyTo(spins);
+
+  // Rendering setup: a single actor colored by the "spins" cell field, painted
+  // without shading into a 1024x1024 canvas.
+  vtkm::rendering::Scene scene;
+  vtkm::rendering::Actor actor(dataSet.GetCellSet(),
+                               dataSet.GetCoordinateSystem(),
+                               dataSet.GetCellField("spins"),
+                               vtkm::cont::ColorTable("Cool To Warm"));
+  scene.AddActor(actor);
+  vtkm::rendering::CanvasRayTracer canvas(1024, 1024);
+  vtkm::rendering::MapperRayTracer mapper;
+  mapper.SetShadingOn(false);
+  vtkm::rendering::View2D view(scene, mapper, canvas);
+  view.Paint();
+  view.SaveAs("spin0.png");
+
+  vtkm::cont::Invoker invoker;
+  for (vtkm::UInt32 i = 1; i < 10; ++i)
+  {
+    // A new random array is built per step; `{ i }` presumably seeds it with the
+    // step number so each step draws different samples -- TODO confirm.
+    vtkm::cont::ArrayHandleRandomUniformReal<vtkm::Float32> prob(dataSet.GetNumberOfCells(), { i });
+    // NOTE(review): `spins` is passed both as the neighborhood input and as the
+    // output, so a cell may read neighbors already updated in the same step --
+    // confirm this in-place update is intended.
+    invoker(UpdateSpins{}, dataSet.GetCellSet(), spins, prob, spins);
+    view.Paint();
+    view.SaveAs("spin" + std::to_string(i) + ".png");
+  }
+}
--- a/examples/multi_backend/IOGenerator.cxx
+++ b/examples/multi_backend/IOGenerator.cxx
@ -42,7 +42,7 @@ vtkm::cont::DataSet make_test3DImageData(vtkm::Id3 dims)

  vtkm::cont::ArrayHandle<vtkm::Vec3f> field;
  vtkm::cont::Invoker invoke;
-  invoke(WaveField{}, ds.GetCoordinateSystem(), field);
+  invoke(WaveField{}, ds.GetCoordinateSystem().GetDataAsMultiplexer(), field);

  ds.AddPointField("vec_field", field);
  return ds;
--- a/examples/oscillator/Oscillator.cxx
+++ b/examples/oscillator/Oscillator.cxx
@ -102,7 +102,12 @@ void read_oscillators(std::string filePath, vtkm::source::Oscillator& source)
 // ArcticViewer helper
 // ----------------------------------------------------------------------------

-void writeData(std::string& basePath, int timestep, int iSize, int jSize, int kSize, double* values)
+void writeData(std::string& basePath,
+               int timestep,
+               int iSize,
+               int jSize,
+               int kSize,
+               const double* values)
 {
  int size = iSize * jSize * kSize;
  std::ostringstream timeValues;
@ -158,7 +163,7 @@ void writeData(std::string& basePath, int timestep, int iSize, int jSize, int kS
  else
  {
    int stackSize = size * 8;
-    dataFilePathPointer.write((char*)values, stackSize);
+    dataFilePathPointer.write(reinterpret_cast<const char*>(values), stackSize);
    dataFilePathPointer.flush();
    dataFilePathPointer.close();
  }
@ -313,9 +318,9 @@ int main(int argc, char** argv)
    vtkm::cont::DataSet rdata = source.Execute();
    if (generateOutput)
    {
-      vtkm::cont::ArrayHandle<vtkm::Float64> tmp;
+      vtkm::cont::ArrayHandleBasic<vtkm::Float64> tmp;
      rdata.GetField("scalars", vtkm::cont::Field::Association::POINTS).GetData().CopyTo(tmp);
-      double* values = tmp.GetStorage().GetArray();
+      const double* values = tmp.GetReadPointer();
      writeData(outputDirectory, count++, sizeX, sizeY, sizeZ, values);
    }

--- a/examples/particle_advection/ParticleAdvection.cxx
+++ b/examples/particle_advection/ParticleAdvection.cxx
@ -71,7 +71,7 @@ int main(int argc, char** argv)
    p.ID = i;
    seeds.push_back(p);
  }
-  auto seedArray = vtkm::cont::make_ArrayHandle(seeds);
+  auto seedArray = vtkm::cont::make_ArrayHandle(seeds, vtkm::CopyFlag::Off);

  //compute streamlines
  vtkm::filter::Streamline streamline;
--- a/examples/redistribute_points/RedistributePoints.h
+++ b/examples/redistribute_points/RedistributePoints.h
@ -223,11 +223,12 @@ inline VTKM_CONT vtkm::cont::PartitionedDataSet RedistributePoints::PrepareForEx
  vtkmdiy::RegularDecomposer<vtkmdiy::ContinuousBounds> decomposer(
    /*dim*/ 3, internal::convert(gbounds), assigner.nblocks());

-  vtkmdiy::Master master(comm,
-                         /*threads*/ 1,
-                         /*limit*/ -1,
-                         []() -> void* { return new vtkm::cont::DataSet(); },
-                         [](void* ptr) { delete static_cast<vtkm::cont::DataSet*>(ptr); });
+  vtkmdiy::Master master(
+    comm,
+    /*threads*/ 1,
+    /*limit*/ -1,
+    []() -> void* { return new vtkm::cont::DataSet(); },
+    [](void* ptr) { delete static_cast<vtkm::cont::DataSet*>(ptr); });
  decomposer.decompose(comm.rank(), assigner, master);

  assert(static_cast<vtkm::Id>(master.size()) == input.GetNumberOfPartitions());
--- a/examples/streamline_mpi/CMakeLists.txt
+++ b/examples/streamline_mpi/CMakeLists.txt
@ -0,0 +1,27 @@
+##============================================================================
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##============================================================================
+cmake_minimum_required(VERSION 3.12...3.15 FATAL_ERROR)
+project(StreamlineMPI CXX)
+
+#Find the VTK-m package
+find_package(VTKm REQUIRED QUIET)
+
+if (VTKm_ENABLE_MPI)
+  add_executable(StreamlineMPI StreamlineMPI.cxx)
+  target_compile_definitions(StreamlineMPI PRIVATE "MPI_ENABLED")
+  target_link_libraries(StreamlineMPI PRIVATE vtkm_filter vtkm_io MPI::MPI_CXX)
+  vtkm_add_target_information(StreamlineMPI
+                              DROP_UNUSED_SYMBOLS MODIFY_CUDA_FLAGS
+                              DEVICE_SOURCES StreamlineMPI.cxx)
+endif()
+
+#if(TARGET vtkm::tbb)
+#  target_compile_definitions(streamline_mpi PRIVATE BUILDING_TBB_VERSION)
+#endif()
--- a/examples/streamline_mpi/StreamlineMPI.cxx
+++ b/examples/streamline_mpi/StreamlineMPI.cxx
@ -0,0 +1,120 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+
+#include <vtkm/cont/AssignerPartitionedDataSet.h>
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/EnvironmentTracker.h>
+#include <vtkm/cont/Field.h>
+#include <vtkm/cont/Initialize.h>
+#include <vtkm/cont/PartitionedDataSet.h>
+#include <vtkm/filter/Streamline.h>
+#include <vtkm/io/VTKDataSetReader.h>
+#include <vtkm/io/VTKDataSetWriter.h>
+#include <vtkm/io/reader/VTKDataSetReader.h>
+
+#include <mpi.h>
+#include <vtkm/thirdparty/diy/diy.h>
+#include <vtkm/thirdparty/diy/mpi-cast.h>
+
+
+#include <vtkm/filter/ParticleAdvection.h>
+#include <vtkm/filter/particleadvection/BoundsMap.h>
+#include <vtkm/filter/particleadvection/ParticleMessenger.h>
+
+
+/// Loads this rank's share of the blocks listed in a ".visit" index file.
+///
+/// The first line of the file must contain "!NBLOCKS <count>"; each following
+/// line names one VTK file, resolved relative to the directory of \a fname.
+/// Blocks are divided evenly among the ranks by index, and the last rank also
+/// takes any remainder. Each loaded data set must have a "grad" field stored as
+/// vtkm::Vec<double, 3>; its values are overwritten with (1, 0, 0) before the
+/// data set is appended to \a dataSets.
+///
+/// Throws a std::string when the file cannot be opened, when the "!NBLOCKS"
+/// header is missing, or when a block entry needed by this rank is missing.
+/// Throws a C string when \a fname does not contain ".visit".
+void LoadData(std::string& fname, std::vector<vtkm::cont::DataSet>& dataSets, int rank, int nRanks)
+{
+  std::string buff;
+  std::ifstream is;
+  is.open(fname);
+  std::cout << "Opening: " << fname << std::endl;
+  if (!is)
+  {
+    std::cout << "File not found! : " << fname << std::endl;
+    throw "unknown file: " + fname;
+  }
+
+  auto p0 = fname.rfind(".visit");
+  if (p0 == std::string::npos)
+    throw "Only .visit files are supported.";
+  auto tmp = fname.substr(0, p0);
+  auto p1 = tmp.rfind("/");
+  // A bare file name has no directory part; the block files are then looked up
+  // in the current directory instead of a directory named after the file.
+  const std::string dir = (p1 == std::string::npos) ? std::string(".") : tmp.substr(0, p1);
+
+  // Reject a header without the block-count tag instead of parsing whatever
+  // happens to sit at a wrapped-around offset.
+  const std::string blockTag = "!NBLOCKS ";
+  std::getline(is, buff);
+  const auto tagPos = buff.find(blockTag);
+  if (tagPos == std::string::npos)
+    throw "Missing !NBLOCKS header in: " + fname;
+  auto numBlocks = std::stoi(buff.substr(tagPos + blockTag.size()));
+  if (rank == 0)
+    std::cout << "numBlocks= " << numBlocks << std::endl;
+
+  // Half-open block range [b0, b1) owned by this rank.
+  int nPer = numBlocks / nRanks;
+  int b0 = rank * nPer, b1 = (rank + 1) * nPer;
+  if (rank == (nRanks - 1))
+    b1 = numBlocks;
+
+  for (int i = 0; i < numBlocks; i++)
+  {
+    // Every line is consumed so the stream stays aligned with the block index,
+    // but only the lines this rank owns must actually be present.
+    const bool haveLine = static_cast<bool>(std::getline(is, buff));
+    if (i >= b0 && i < b1)
+    {
+      if (!haveLine)
+        throw "Missing entry for block " + std::to_string(i) + " in: " + fname;
+
+      vtkm::cont::DataSet ds;
+      std::string vtkFile = dir + "/" + buff;
+      vtkm::io::reader::VTKDataSetReader reader(vtkFile);
+      ds = reader.ReadDataSet();
+      auto f = ds.GetField("grad").GetData();
+      vtkm::cont::ArrayHandle<vtkm::Vec<double, 3>> fieldArray;
+      fieldArray = f.Cast<vtkm::cont::ArrayHandle<vtkm::Vec<double, 3>>>();
+      // Replace the stored vectors with a constant (1, 0, 0) field.
+      const vtkm::Id n = fieldArray.GetNumberOfValues();
+      auto portal = fieldArray.WritePortal();
+      for (vtkm::Id ii = 0; ii < n; ii++)
+        portal.Set(ii, vtkm::Vec<double, 3>(1, 0, 0));
+
+      dataSets.push_back(ds);
+    }
+  }
+}
+
+// Example advecting particles through a partitioned data set with MPI.
+// The input is a ".visit" index file whose blocks each carry a "grad" vector
+// field (see LoadData). Three seeds are advected for 10000 steps with a step
+// size of 0.001 and a summary of the result is printed.
+// Example usage:
+//
+// StreamlineMPI <path-to-data-dir>/<file>.visit
+//
+
+int main(int argc, char** argv)
+{
+  MPI_Init(&argc, &argv);
+  auto comm = vtkm::cont::EnvironmentTracker::GetCommunicator();
+  int rank = comm.rank();
+  int size = comm.size();
+
+  // The index file is mandatory; stop cleanly rather than reading a missing argument.
+  if (argc < 2)
+  {
+    if (rank == 0)
+      std::cerr << "Usage: StreamlineMPI <file>.visit" << std::endl;
+    MPI_Finalize();
+    return 1;
+  }
+
+  std::string dataFile = argv[1];
+  std::vector<vtkm::cont::DataSet> dataSets;
+  LoadData(dataFile, dataSets, rank, size);
+
+  vtkm::filter::ParticleAdvection pa;
+
+  vtkm::cont::ArrayHandle<vtkm::Particle> seedArray;
+  std::vector<vtkm::Particle> seeds;
+  seeds.push_back(vtkm::Particle(vtkm::Vec3f(.1f, .1f, .9f), 0));
+  seeds.push_back(vtkm::Particle(vtkm::Vec3f(.1f, .6f, .6f), 1));
+  seeds.push_back(vtkm::Particle(vtkm::Vec3f(.1f, .9f, .1f), 2));
+  // `seeds` stays alive until the end of main, so the array may reference it
+  // without copying.
+  seedArray = vtkm::cont::make_ArrayHandle(seeds, vtkm::CopyFlag::Off);
+  pa.SetStepSize(0.001f);
+  pa.SetNumberOfSteps(10000);
+  pa.SetSeeds(seedArray);
+  pa.SetActiveField("grad");
+
+  vtkm::cont::PartitionedDataSet pds(dataSets);
+  auto output = pa.Execute(pds);
+  output.PrintSummary(std::cout);
+
+  // Every MPI_Init needs a matching MPI_Finalize before the process exits.
+  MPI_Finalize();
+  return 0;
+}
--- a/vtkm/Assert.h
+++ b/vtkm/Assert.h
@ -15,6 +15,17 @@

 #include <cassert>

+// Pick up conditions where we want to turn on/off assert.
+#ifndef VTKM_NO_ASSERT
+#if defined(NDEBUG)
+#define VTKM_NO_ASSERT
+#elif defined(VTKM_CUDA_DEVICE_PASS) && defined(VTKM_NO_ASSERT_CUDA)
+#define VTKM_NO_ASSERT
+#elif defined(VTKM_HIP) && defined(VTKM_NO_ASSERT_HIP)
+#define VTKM_NO_ASSERT
+#endif
+#endif // VTKM_NO_ASSERT
+
 /// \def VTKM_ASSERT(condition)
 ///
 /// Asserts that \a condition resolves to true.  If \a condition is false,
@ -28,11 +39,7 @@
 ///
 /// The VTKM_NO_ASSERT cmake and preprocessor option allows debugging builds
 /// to remove assertions for performance reasons.
-#if defined(VTKM_CUDA_VERSION_MAJOR) && (VTKM_CUDA_VERSION_MAJOR == 7)
-//CUDA 7.5 doesn't support assert in device code
-#define VTKM_ASSERT(condition) (void)(condition)
-#elif !defined(NDEBUG) && !defined(VTKM_NO_ASSERT)
-//Only assert if we are in debug mode and don't have VTKM_NO_ASSERT defined
+#ifndef VTKM_NO_ASSERT
 #define VTKM_ASSERT(condition) assert(condition)
 #define VTKM_ASSERTS_CHECKED
 #else
--- a/vtkm/Atomic.h
+++ b/vtkm/Atomic.h
@ -0,0 +1,821 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+#ifndef vtk_m_Atomic_h
+#define vtk_m_Atomic_h
+
+#include <vtkm/List.h>
+
+#include <vtkm/internal/Windows.h>
+
+#include <atomic>
+
+namespace vtkm
+{
+
+/// \brief Specifies memory order semantics for atomic operations.
+///
+/// The memory order parameter controls how all other memory operations are
+/// ordered around a specific atomic instruction.
+///
+/// Memory access is complicated. Compilers can reorder instructions to optimize
+/// scheduling, processors can speculatively read memory, and caches make
+/// assumptions about coherency that we may not normally be aware of. Because of
+/// this complexity, the order in which multiple updates to shared memory become
+/// visible to other threads is not guaranteed, nor is it guaranteed that each
+/// thread will see memory updates occur in the same order as any other thread.
+/// This can lead to surprising behavior and cause problems when using atomics
+/// to communicate between threads.
+///
+/// These problems are solved by using a standard set of memory orderings which
+/// describe common access patterns used for shared memory programming. Their
+/// goal is to provide guarantees that changes made in one thread will be visible
+/// to another thread at a specific and predictable point in execution, regardless
+/// of any hardware or compiler optimizations.
+///
+/// If unsure, use `SequentiallyConsistent` memory orderings. It will "do the right
+/// thing", but at the cost of increased and possibly unnecessary memory ordering
+/// restrictions. The other orderings are optimizations that are only applicable
+/// in very specific situations.
+///
+/// See https://en.cppreference.com/w/cpp/atomic/memory_order for a detailed
+/// description of the different orderings and their usage.
+///
+/// The memory order semantics follow those of other common atomic operations such as
+/// the `std::memory_order` identifiers used for `std::atomic`.
+///
+/// Note that when a memory order is specified, the enforced memory order is guaranteed
+/// to be as good or better than that requested.
+///
+enum class MemoryOrder
+{
+  /// An atomic operation with `Relaxed` memory order enforces no synchronization or ordering
+  /// constraints on local reads and writes. That is, a read or write to a local, non-atomic
+  /// variable may be moved to before or after an atomic operation with `Relaxed` memory order.
+  ///
+  Relaxed,
+
+  /// A load operation with `Acquire` memory order will enforce that any local read or write
+  /// operations listed in the program after the atomic will happen after the atomic.
+  ///
+  Acquire,
+
+  /// A store operation with `Release` memory order will enforce that any local read or write
+  /// operations listed in the program before the atomic will happen before the atomic.
+  ///
+  Release,
+
+  /// A read-modify-write operation with `AcquireAndRelease` memory order will enforce that any
+  /// local read or write operations listed in the program before the atomic will happen before the
+  /// atomic and likewise any read or write operations listed in the program after the atomic will
+  /// happen after the atomic.
+  ///
+  AcquireAndRelease,
+
+  /// An atomic with `SequentiallyConsistent` memory order will enforce any appropriate semantics
+  /// as `Acquire`, `Release`, and `AcquireAndRelease`. Additionally, `SequentiallyConsistent` will
+  /// enforce a consistent ordering of atomic operations across all threads. That is, all threads
+  /// observe the modifications in the same order.
+  ///
+  SequentiallyConsistent
+};
+
+namespace internal
+{
+
+/// Translates a vtkm::MemoryOrder into the corresponding std::memory_order.
+VTKM_EXEC_CONT inline std::memory_order StdAtomicMemOrder(vtkm::MemoryOrder order)
+{
+  // Default to sequential consistency so that a value outside the enumeration
+  // still produces a result and the compiler sees a value on every path.
+  std::memory_order stdOrder = std::memory_order_seq_cst;
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Relaxed:
+      stdOrder = std::memory_order_relaxed;
+      break;
+    case vtkm::MemoryOrder::Acquire:
+      stdOrder = std::memory_order_acquire;
+      break;
+    case vtkm::MemoryOrder::Release:
+      stdOrder = std::memory_order_release;
+      break;
+    case vtkm::MemoryOrder::AcquireAndRelease:
+      stdOrder = std::memory_order_acq_rel;
+      break;
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+      stdOrder = std::memory_order_seq_cst;
+      break;
+  }
+  return stdOrder;
+}
+
+} // namespace internal
+
+} // namespace vtkm
+
+
+#if defined(VTKM_CUDA_DEVICE_PASS)
+
+namespace vtkm
+{
+namespace detail
+{
+
+// Calls __threadfence() when the order is Release, AcquireAndRelease, or
+// SequentiallyConsistent, so that previous non-atomic stores are visible to
+// other threads. Does nothing for the remaining orders.
+VTKM_EXEC_CONT inline void AtomicStoreFence(vtkm::MemoryOrder order)
+{
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Release:
+    case vtkm::MemoryOrder::AcquireAndRelease:
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+      __threadfence();
+      break;
+    default:
+      break;
+  }
+}
+
+// Calls __threadfence() when the order is Acquire, AcquireAndRelease, or
+// SequentiallyConsistent, so that reads depending on a preceding atomic are
+// correctly ordered. Does nothing for the remaining orders.
+VTKM_EXEC_CONT inline void AtomicLoadFence(vtkm::MemoryOrder order)
+{
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Acquire:
+    case vtkm::MemoryOrder::AcquireAndRelease:
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+      __threadfence();
+      break;
+    default:
+      break;
+  }
+}
+
+// Reads the value at `addr` through a volatile pointer and returns it.
+// For SequentiallyConsistent order a __threadfence() is issued before the read;
+// AtomicLoadFence(order) is applied after the read for every order.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr, vtkm::MemoryOrder order)
+{
+  const volatile T* vaddr = addr; /* volatile to bypass cache*/
+  if (order == vtkm::MemoryOrder::SequentiallyConsistent)
+  {
+    __threadfence();
+  }
+  const T value = *vaddr;
+  /* fence to ensure that dependent reads are correctly ordered */
+  AtomicLoadFence(order);
+  return value;
+}
+
+// Writes `value` to `addr` through a volatile pointer, after applying
+// AtomicStoreFence(order). No fence follows the write.
+template <typename T>
+VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value, vtkm::MemoryOrder order)
+{
+  volatile T* vaddr = addr; /* volatile to bypass cache */
+  /* fence to ensure that previous non-atomic stores are visible to other threads */
+  AtomicStoreFence(order);
+  *vaddr = value;
+}
+
+// Performs atomicAdd(addr, arg) bracketed by AtomicStoreFence(order) before and
+// AtomicLoadFence(order) after, and returns the result of atomicAdd.
+// NOTE(review): per the CUDA documentation that result is the value held at
+// `addr` before the addition -- confirm.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  auto result = atomicAdd(addr, arg);
+  AtomicLoadFence(order);
+  return result;
+}
+
+// Performs atomicAnd(addr, mask) bracketed by AtomicStoreFence(order) before
+// and AtomicLoadFence(order) after, and returns the result of atomicAnd.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  auto result = atomicAnd(addr, mask);
+  AtomicLoadFence(order);
+  return result;
+}
+
+// Performs atomicOr(addr, mask) bracketed by AtomicStoreFence(order) before and
+// AtomicLoadFence(order) after, and returns the result of atomicOr.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  auto result = atomicOr(addr, mask);
+  AtomicLoadFence(order);
+  return result;
+}
+
+// Performs atomicXor(addr, mask) bracketed by AtomicStoreFence(order) before
+// and AtomicLoadFence(order) after, and returns the result of atomicXor.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  auto result = atomicXor(addr, mask);
+  AtomicLoadFence(order);
+  return result;
+}
+
+// Inverts every bit of the value at `addr` by XOR-ing it with an all-ones mask
+// of type T; fencing and the return value are those of AtomicXorImpl.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
+{
+  return AtomicXorImpl(addr, static_cast<T>(~T{ 0u }), order);
+}
+
+// Performs atomicCAS bracketed by AtomicStoreFence(order) before and
+// AtomicLoadFence(order) after, and returns the result of atomicCAS.
+// Mind the argument order: this function takes the new value (`desired`) before
+// the comparison value (`expected`), whereas atomicCAS is called with
+// `expected` first and `desired` second.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr,
+                                                 T desired,
+                                                 T expected,
+                                                 vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  auto result = atomicCAS(addr, expected, desired);
+  AtomicLoadFence(order);
+  return result;
+}
+}
+} // namespace vtkm::detail
+
+#elif defined(VTKM_ENABLE_KOKKOS)
+
+VTKM_THIRDPARTY_PRE_INCLUDE
+// Superhack! Kokkos_Macros.hpp defines macros to include modifiers like __device__.
+// However, we don't want to actually use those if compiling this with a standard
+// C++ compiler (because this particular code does not run on a device). Thus,
+// we want to disable that behavior when not using the device compiler. To do that,
+// we are going to have to load the KokkosCore_config.h file (which you are not
+// supposed to do), then undefine the device enables if necessary, then load
+// Kokkos_Macros.hpp to finish the state.
+#ifndef KOKKOS_MACROS_HPP
+#define KOKKOS_MACROS_HPP
+#include <KokkosCore_config.h>
+#undef KOKKOS_MACROS_HPP
+#define KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
+
+#if defined(KOKKOS_ENABLE_CUDA) && !defined(VTKM_CUDA)
+#undef KOKKOS_ENABLE_CUDA
+#endif
+#endif //KOKKOS_MACROS_HPP not loaded
+
+#include <Kokkos_Core.hpp>
+VTKM_THIRDPARTY_POST_INCLUDE
+
+namespace vtkm
+{
+namespace detail
+{
+
+// Fence issued before an atomic write so that previous non-atomic stores are visible
+// to other threads. Only Release, AcquireAndRelease, and SequentiallyConsistent
+// trigger Kokkos::memory_fence(); every other order is a no-op.
+VTKM_EXEC_CONT inline void AtomicStoreFence(vtkm::MemoryOrder order)
+{
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Release:
+    case vtkm::MemoryOrder::AcquireAndRelease:
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+      Kokkos::memory_fence();
+      break;
+    default:
+      break;
+  }
+}
+
+// Fence issued after an atomic read so that dependent reads are correctly ordered.
+// Only Acquire, AcquireAndRelease, and SequentiallyConsistent trigger
+// Kokkos::memory_fence(); every other order is a no-op.
+VTKM_EXEC_CONT inline void AtomicLoadFence(vtkm::MemoryOrder order)
+{
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Acquire:
+    case vtkm::MemoryOrder::AcquireAndRelease:
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+      Kokkos::memory_fence();
+      break;
+    default:
+      break;
+  }
+}
+
+// Kokkos atomic load. The vtkm memory order is translated to the Kokkos order tag:
+// Relaxed stays relaxed; Acquire, Release, and AcquireAndRelease all load with acquire
+// (release makes no sense for a load); everything else loads sequentially consistent.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr, vtkm::MemoryOrder order)
+{
+  if (order == vtkm::MemoryOrder::Relaxed)
+  {
+    return Kokkos::Impl::atomic_load(addr, Kokkos::Impl::memory_order_relaxed);
+  }
+
+  const bool useAcquire = (order == vtkm::MemoryOrder::Acquire) ||
+    (order == vtkm::MemoryOrder::Release) || (order == vtkm::MemoryOrder::AcquireAndRelease);
+  if (useAcquire)
+  {
+    return Kokkos::Impl::atomic_load(addr, Kokkos::Impl::memory_order_acquire);
+  }
+
+  // SequentiallyConsistent, and the fallback for values that should never occur.
+  return Kokkos::Impl::atomic_load(addr, Kokkos::Impl::memory_order_seq_cst);
+}
+
+// Kokkos atomic store. The vtkm memory order is translated to the Kokkos order tag:
+// Relaxed stays relaxed; Acquire, Release, and AcquireAndRelease all store with release
+// (acquire makes no sense for a store); SequentiallyConsistent stays as is. A value
+// outside the enumeration performs no store.
+template <typename T>
+VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value, vtkm::MemoryOrder order)
+{
+  if (order == vtkm::MemoryOrder::Relaxed)
+  {
+    Kokkos::Impl::atomic_store(addr, value, Kokkos::Impl::memory_order_relaxed);
+  }
+  else if ((order == vtkm::MemoryOrder::Acquire) || (order == vtkm::MemoryOrder::Release) ||
+           (order == vtkm::MemoryOrder::AcquireAndRelease))
+  {
+    Kokkos::Impl::atomic_store(addr, value, Kokkos::Impl::memory_order_release);
+  }
+  else if (order == vtkm::MemoryOrder::SequentiallyConsistent)
+  {
+    Kokkos::Impl::atomic_store(addr, value, Kokkos::Impl::memory_order_seq_cst);
+  }
+}
+
+// Kokkos atomic add: Kokkos::atomic_fetch_add wrapped in the store-side and load-side
+// fences selected by `order`. Returns the value produced by the fetch operation.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  const T previous = Kokkos::atomic_fetch_add(addr, arg);
+  AtomicLoadFence(order);
+  return previous;
+}
+
+// Kokkos atomic AND: Kokkos::atomic_fetch_and wrapped in the store-side and load-side
+// fences selected by `order`. Returns the value produced by the fetch operation.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  const T previous = Kokkos::atomic_fetch_and(addr, mask);
+  AtomicLoadFence(order);
+  return previous;
+}
+
+// Kokkos atomic OR: Kokkos::atomic_fetch_or wrapped in the store-side and load-side
+// fences selected by `order`. Returns the value produced by the fetch operation.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  const T previous = Kokkos::atomic_fetch_or(addr, mask);
+  AtomicLoadFence(order);
+  return previous;
+}
+
+// Kokkos atomic XOR: Kokkos::atomic_fetch_xor wrapped in the store-side and load-side
+// fences selected by `order`. Returns the value produced by the fetch operation.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  const T previous = Kokkos::atomic_fetch_xor(addr, mask);
+  AtomicLoadFence(order);
+  return previous;
+}
+
+// Kokkos atomic NOT, expressed as an XOR against a mask that has every bit of T set.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
+{
+  const T allBitsSet = static_cast<T>(~T{ 0u });
+  return AtomicXorImpl(addr, allBitsSet, order);
+}
+
+// Kokkos compare-and-swap: calls Kokkos::atomic_compare_exchange with `expected` as the
+// comparison value and `desired` as the replacement, bracketed by the fences selected
+// by `order`. The value returned by the exchange is passed back to the caller.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr,
+                                                 T desired,
+                                                 T expected,
+                                                 vtkm::MemoryOrder order)
+{
+  AtomicStoreFence(order);
+  const T previous = Kokkos::atomic_compare_exchange(addr, expected, desired);
+  AtomicLoadFence(order);
+  return previous;
+}
+}
+} // namespace vtkm::detail
+
+#elif defined(VTKM_MSVC)
+
+// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64
+
+#include <cstdint>
+#include <cstring>
+#include <intrin.h> // For MSVC atomics
+
+namespace vtkm
+{
+namespace detail
+{
+
+// Reinterprets the bytes of `src` as a value of type `To`. Both types must have the
+// same size, which is enforced at compile time.
+template <typename To, typename From>
+VTKM_EXEC_CONT inline To BitCast(const From& src)
+{
+  VTKM_STATIC_ASSERT(sizeof(From) == sizeof(To));
+  // Copying the bytes with memcpy avoids a host of issues with bitcasting through
+  // reinterpret_cast; the compiler should remove the memcpy when possible.
+  To converted;
+  std::memcpy(&converted, &src, sizeof(From));
+  return converted;
+}
+
+// Single-type overload of BitCast: no byte copy is performed, the argument is simply
+// forwarded and returned as a T.
+template <typename T>
+VTKM_EXEC_CONT inline T BitCast(T&& src)
+{
+  return std::forward<T>(src);
+}
+
+// Note about Load and Store implementations:
+//
+// "Simple reads and writes to properly-aligned 32-bit variables are atomic
+//  operations"
+//
+// "Simple reads and writes to properly aligned 64-bit variables are atomic on
+// 64-bit Windows. Reads and writes to 64-bit values are not guaranteed to be
+// atomic on 32-bit Windows."
+//
+// "Reads and writes to variables of other sizes [than 32 or 64 bits] are not
+// guaranteed to be atomic on any platform."
+//
+// https://docs.microsoft.com/en-us/windows/desktop/sync/interlocked-variable-access
+
+// MSVC atomic loads. Each overload reads the value through a volatile-qualified pointer
+// and then issues a std::atomic_thread_fence built from the requested memory order.
+VTKM_EXEC_CONT inline vtkm::UInt8 AtomicLoadImpl(const vtkm::UInt8* addr, vtkm::MemoryOrder order)
+{
+  // This assumes that the memory interface is smart enough to load a 32-bit
+  // word atomically and a properly aligned 8-bit word from it.
+  // We could build address masks and do shifts to perform this manually if
+  // this assumption is incorrect.
+  const volatile vtkm::UInt8* source = addr;
+  vtkm::UInt8 loaded = *source;
+  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
+  return loaded;
+}
+VTKM_EXEC_CONT inline vtkm::UInt16 AtomicLoadImpl(const vtkm::UInt16* addr, vtkm::MemoryOrder order)
+{
+  // This assumes that the memory interface is smart enough to load a 32-bit
+  // word atomically and a properly aligned 16-bit word from it.
+  // We could build address masks and do shifts to perform this manually if
+  // this assumption is incorrect.
+  const volatile vtkm::UInt16* source = addr;
+  vtkm::UInt16 loaded = *source;
+  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
+  return loaded;
+}
+VTKM_EXEC_CONT inline vtkm::UInt32 AtomicLoadImpl(const vtkm::UInt32* addr, vtkm::MemoryOrder order)
+{
+  const volatile vtkm::UInt32* source = addr;
+  vtkm::UInt32 loaded = *source;
+  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
+  return loaded;
+}
+VTKM_EXEC_CONT inline vtkm::UInt64 AtomicLoadImpl(const vtkm::UInt64* addr, vtkm::MemoryOrder order)
+{
+  const volatile vtkm::UInt64* source = addr;
+  vtkm::UInt64 loaded = *source;
+  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
+  return loaded;
+}
+
+// MSVC atomic stores.
+//
+// The 8-bit and 16-bit overloads write with an interlocked exchange and do not consult
+// the `order` argument. The 32-bit and 64-bit overloads issue a thread fence built from
+// `order` and then write with a plain assignment.
+// NOTE(review): unlike the loads, the 32/64-bit writes do not go through a volatile
+// pointer and no fence follows the write -- confirm this is sufficient for
+// SequentiallyConsistent stores.
+VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt8* addr,
+                                           vtkm::UInt8 val,
+                                           vtkm::MemoryOrder order)
+{
+  // There doesn't seem to be an atomic store instruction in the windows
+  // API, so just exchange and discard the result.
+  _InterlockedExchange8(reinterpret_cast<volatile CHAR*>(addr), BitCast<CHAR>(val));
+}
+VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt16* addr,
+                                           vtkm::UInt16 val,
+                                           vtkm::MemoryOrder order)
+{
+  // There doesn't seem to be an atomic store instruction in the windows
+  // API, so just exchange and discard the result.
+  _InterlockedExchange16(reinterpret_cast<volatile SHORT*>(addr), BitCast<SHORT>(val));
+}
+VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt32* addr,
+                                           vtkm::UInt32 val,
+                                           vtkm::MemoryOrder order)
+{
+  // Fence first, then perform the (aligned, 32-bit) write.
+  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
+  *addr = val;
+}
+VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt64* addr,
+                                           vtkm::UInt64 val,
+                                           vtkm::MemoryOrder order)
+{
+  // Fence first, then perform the (aligned, 64-bit) write.
+  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
+  *addr = val;
+}
+
+// Generates one read-modify-write overload named `vtkmName` for `vtkmType`. The
+// generated function forwards to the intrinsic `winName##suffix`, bit-casting the
+// operand to `winType` and the intrinsic's result back to `vtkmType`. The `order`
+// parameter of the generated function is accepted but not used.
+#define VTKM_ATOMIC_OP(vtkmName, winName, vtkmType, winType, suffix)                             \
+  VTKM_EXEC_CONT inline vtkmType vtkmName(vtkmType* addr, vtkmType arg, vtkm::MemoryOrder order) \
+  {                                                                                              \
+    return BitCast<vtkmType>(                                                                    \
+      winName##suffix(reinterpret_cast<volatile winType*>(addr), BitCast<winType>(arg)));        \
+  }
+
+// Generates the full set of read-modify-write overloads for one integer type:
+// AtomicAddImpl, AtomicAndImpl, AtomicOrImpl, and AtomicXorImpl (via VTKM_ATOMIC_OP),
+// AtomicNotImpl (an XOR against an all-ones mask), and AtomicCompareAndSwapImpl (via
+// _InterlockedCompareExchange##suffix, which receives `desired` before `expected`).
+#define VTKM_ATOMIC_OPS_FOR_TYPE(vtkmType, winType, suffix)                             \
+  VTKM_ATOMIC_OP(AtomicAddImpl, _InterlockedExchangeAdd, vtkmType, winType, suffix)     \
+  VTKM_ATOMIC_OP(AtomicAndImpl, _InterlockedAnd, vtkmType, winType, suffix)             \
+  VTKM_ATOMIC_OP(AtomicOrImpl, _InterlockedOr, vtkmType, winType, suffix)               \
+  VTKM_ATOMIC_OP(AtomicXorImpl, _InterlockedXor, vtkmType, winType, suffix)             \
+  VTKM_EXEC_CONT inline vtkmType AtomicNotImpl(vtkmType* addr, vtkm::MemoryOrder order) \
+  {                                                                                     \
+    return AtomicXorImpl(addr, static_cast<vtkmType>(~vtkmType{ 0u }), order);          \
+  }                                                                                     \
+  VTKM_EXEC_CONT inline vtkmType AtomicCompareAndSwapImpl(                              \
+    vtkmType* addr, vtkmType desired, vtkmType expected, vtkm::MemoryOrder order)       \
+  {                                                                                     \
+    return BitCast<vtkmType>(                                                           \
+      _InterlockedCompareExchange##suffix(reinterpret_cast<volatile winType*>(addr),    \
+                                          BitCast<winType>(desired),                    \
+                                          BitCast<winType>(expected)));                 \
+  }
+
+// Instantiate the read-modify-write overloads for every supported width. The third
+// argument is the suffix appended to the interlocked intrinsic name (empty for the
+// 32-bit LONG variants).
+VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt8, CHAR, 8)
+VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt16, SHORT, 16)
+VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt32, LONG, )
+VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt64, LONG64, 64)
+
+// Both helper macros are implementation details of this header; remove them so they
+// do not leak into every translation unit that includes it.
+#undef VTKM_ATOMIC_OPS_FOR_TYPE
+#undef VTKM_ATOMIC_OP
+}
+} // namespace vtkm::detail
+
+#else // gcc/clang for CPU
+
+// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64
+
+#include <cstdint>
+#include <cstring>
+
+namespace vtkm
+{
+namespace detail
+{
+
+// Translates a vtkm::MemoryOrder into the matching __ATOMIC_* constant used by the
+// gcc/clang atomic builtins. Values outside the enumeration yield __ATOMIC_SEQ_CST.
+VTKM_EXEC_CONT inline int GccAtomicMemOrder(vtkm::MemoryOrder order)
+{
+  // Start from the fallback; it should never survive the switch for a valid order.
+  int builtinOrder = __ATOMIC_SEQ_CST;
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Relaxed:
+      builtinOrder = __ATOMIC_RELAXED;
+      break;
+    case vtkm::MemoryOrder::Acquire:
+      builtinOrder = __ATOMIC_ACQUIRE;
+      break;
+    case vtkm::MemoryOrder::Release:
+      builtinOrder = __ATOMIC_RELEASE;
+      break;
+    case vtkm::MemoryOrder::AcquireAndRelease:
+      builtinOrder = __ATOMIC_ACQ_REL;
+      break;
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+      builtinOrder = __ATOMIC_SEQ_CST;
+      break;
+  }
+  return builtinOrder;
+}
+
+// gcc/clang atomic load.
+//
+// __atomic_load_n only accepts relaxed, consume, acquire, and seq_cst orderings, so the
+// release-flavored vtkm orders are mapped to acquire instead of being forwarded
+// verbatim (forwarding them is invalid for the builtin).
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr, vtkm::MemoryOrder order)
+{
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Relaxed:
+      return __atomic_load_n(addr, __ATOMIC_RELAXED);
+    case vtkm::MemoryOrder::Acquire:
+    case vtkm::MemoryOrder::Release:           // Release doesn't make sense. Use Acquire.
+    case vtkm::MemoryOrder::AcquireAndRelease: // Release doesn't make sense. Use Acquire.
+      return __atomic_load_n(addr, __ATOMIC_ACQUIRE);
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+      return __atomic_load_n(addr, __ATOMIC_SEQ_CST);
+  }
+
+  // Should never reach here, but avoid compiler warnings
+  return __atomic_load_n(addr, __ATOMIC_SEQ_CST);
+}
+
+// gcc/clang atomic store.
+//
+// __atomic_store_n only accepts relaxed, release, and seq_cst orderings, so the
+// acquire-flavored vtkm orders are mapped to release instead of being forwarded
+// verbatim (forwarding them is invalid for the builtin).
+template <typename T>
+VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value, vtkm::MemoryOrder order)
+{
+  switch (order)
+  {
+    case vtkm::MemoryOrder::Relaxed:
+      __atomic_store_n(addr, value, __ATOMIC_RELAXED);
+      break;
+    case vtkm::MemoryOrder::Acquire: // Acquire doesn't make sense. Use Release.
+    case vtkm::MemoryOrder::Release:
+    case vtkm::MemoryOrder::AcquireAndRelease: // Acquire doesn't make sense. Use Release.
+      __atomic_store_n(addr, value, __ATOMIC_RELEASE);
+      break;
+    case vtkm::MemoryOrder::SequentiallyConsistent:
+    default: // Should never be reached; fall back to the strongest ordering.
+      __atomic_store_n(addr, value, __ATOMIC_SEQ_CST);
+      break;
+  }
+}
+
+// gcc/clang atomic add: forwards to __atomic_fetch_add with the translated memory
+// order and returns the builtin's result.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg, vtkm::MemoryOrder order)
+{
+  const int builtinOrder = GccAtomicMemOrder(order);
+  return __atomic_fetch_add(addr, arg, builtinOrder);
+}
+
+// gcc/clang atomic AND: forwards to __atomic_fetch_and with the translated memory
+// order and returns the builtin's result.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  const int builtinOrder = GccAtomicMemOrder(order);
+  return __atomic_fetch_and(addr, mask, builtinOrder);
+}
+
+// gcc/clang atomic OR: forwards to __atomic_fetch_or with the translated memory
+// order and returns the builtin's result.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  const int builtinOrder = GccAtomicMemOrder(order);
+  return __atomic_fetch_or(addr, mask, builtinOrder);
+}
+
+// gcc/clang atomic XOR: forwards to __atomic_fetch_xor with the translated memory
+// order and returns the builtin's result.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask, vtkm::MemoryOrder order)
+{
+  const int builtinOrder = GccAtomicMemOrder(order);
+  return __atomic_fetch_xor(addr, mask, builtinOrder);
+}
+
+// gcc/clang atomic NOT, expressed as an XOR against a mask that has every bit of T set.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
+{
+  const T allBitsSet = static_cast<T>(~T{ 0u });
+  return AtomicXorImpl(addr, allBitsSet, order);
+}
+
+// gcc/clang compare-and-swap.
+//
+// Replaces the value at `addr` with `desired` when it currently equals `expected`
+// (strong variant, never fails spuriously). Returns the value found at `addr`: on
+// success that is `expected` itself, on failure the builtin overwrites the local
+// `expected` with the current value.
+//
+// The failure ordering of __atomic_compare_exchange_n may not be release or
+// acquire-release, so those success orders get a valid, no-stronger failure order
+// instead of reusing the success order verbatim.
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr,
+                                                 T desired,
+                                                 T expected,
+                                                 vtkm::MemoryOrder order)
+{
+  const int successOrder = GccAtomicMemOrder(order);
+  int failureOrder = successOrder;
+  if (order == vtkm::MemoryOrder::Release)
+  {
+    // A failed exchange performs no store, so there is nothing to release.
+    failureOrder = __ATOMIC_RELAXED;
+  }
+  else if (order == vtkm::MemoryOrder::AcquireAndRelease)
+  {
+    // Keep the acquire half for the load that observed the mismatch.
+    failureOrder = __ATOMIC_ACQUIRE;
+  }
+
+  __atomic_compare_exchange_n(addr, &expected, desired, false, successOrder, failureOrder);
+  return expected;
+}
+}
+} // namespace vtkm::detail
+
+#endif // gcc/clang
+
+namespace vtkm
+{
+
+namespace detail
+{
+
+// Maps an integer type to its counterpart of the other signedness: a signed T yields
+// std::make_unsigned<T>::type and an unsigned T yields std::make_signed<T>::type.
+// The public atomic functions use it to declare overloads whose operand has the
+// opposite sign of the pointed-to type; those overloads cast the operand back to T.
+template <typename T>
+using OppositeSign = typename std::conditional<std::is_signed<T>::value,
+                                               typename std::make_unsigned<T>::type,
+                                               typename std::make_signed<T>::type>::type;
+
+} // namespace detail
+
+/// \brief The preferred type to use for atomic operations.
+///
+/// This is currently an alias for `vtkm::UInt32`.
+///
+using AtomicTypePreferred = vtkm::UInt32;
+
+/// \brief A list of types that can be used with atomic operations.
+///
+/// The list currently holds `vtkm::UInt32` and `vtkm::UInt64`.
+///
+/// TODO: Adjust based on devices being compiled.
+///
+/// BUG: vtkm::UInt64 is provided in this list even though it is not supported on CUDA
+/// before compute capability 3.5.
+///
+using AtomicTypesSupported = vtkm::List<vtkm::UInt32, vtkm::UInt64>;
+
+/// \brief Atomic function to load a value from a shared memory location.
+///
+/// Reads and returns the value stored at `pointer`. When other threads write to the
+/// same location concurrently, the value returned is consistent with what was present
+/// either before or after such a write.
+///
+/// \param pointer Location to read from.
+/// \param order Memory ordering requested for the load (defaults to `Acquire`).
+/// \return The value read from `pointer`.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicLoad(const T* pointer,
+                                   vtkm::MemoryOrder order = vtkm::MemoryOrder::Acquire)
+{
+  const T loaded = detail::AtomicLoadImpl(pointer, order);
+  return loaded;
+}
+
+///@{
+/// \brief Atomic function to save a value to a shared memory location.
+///
+/// Writes `value` to the location given by `pointer`. When two threads call
+/// `AtomicStore` on the same location simultaneously, the location ends up holding one
+/// of the two values in full (as opposed to a mix of bits).
+///
+/// The second overload accepts a value whose signedness is the opposite of `T` and
+/// converts it to `T` before storing.
+///
+/// \param pointer Location to write to.
+/// \param value Value to store.
+/// \param order Memory ordering requested for the store (defaults to `Release`).
+///
+template <typename T>
+VTKM_EXEC_CONT inline void AtomicStore(T* pointer,
+                                       T value,
+                                       vtkm::MemoryOrder order = vtkm::MemoryOrder::Release)
+{
+  detail::AtomicStoreImpl(pointer, value, order);
+}
+template <typename T>
+VTKM_EXEC_CONT inline void AtomicStore(T* pointer,
+                                       detail::OppositeSign<T> value,
+                                       vtkm::MemoryOrder order = vtkm::MemoryOrder::Release)
+{
+  const T converted = static_cast<T>(value);
+  detail::AtomicStoreImpl(pointer, converted, order);
+}
+///@}
+
+///@{
+/// \brief Atomic function to add a value to a shared memory location.
+///
+/// Adds `operand` to the value stored at `pointer`, writes the sum back to that
+/// location, and returns the _old_ value that was there before the addition. For
+/// example, calling `AtomicAdd` with an operand of 3 on a location holding 5 leaves 8
+/// in memory and returns 5.
+///
+/// Simultaneous calls from multiple threads do not interfere with each other: the
+/// outcome is as if the calls ran one after the other (in an indeterminate order).
+///
+/// The second overload accepts an operand whose signedness is the opposite of `T` and
+/// converts it to `T` first.
+///
+/// \param pointer Location to update.
+/// \param operand Value to add.
+/// \param order Memory ordering requested (defaults to `SequentiallyConsistent`).
+/// \return The value held at `pointer` before the addition.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAdd(
+  T* pointer,
+  T operand,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T previous = detail::AtomicAddImpl(pointer, operand, order);
+  return previous;
+}
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAdd(
+  T* pointer,
+  detail::OppositeSign<T> operand,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T converted = static_cast<T>(operand);
+  return detail::AtomicAddImpl(pointer, converted, order);
+}
+///@}
+
+///@{
+/// \brief Atomic function to AND bits to a shared memory location.
+///
+/// Performs a bitwise AND of `operand` and the value stored at `pointer`, writes the
+/// result back to that location, and returns the _old_ value that was there before.
+/// For example, calling `AtomicAnd` with an operand of 0x3 on a location holding 0x6
+/// leaves 0x2 in memory and returns 0x6.
+///
+/// Simultaneous calls from multiple threads do not interfere with each other: the
+/// outcome is as if the calls ran one after the other (in an indeterminate order).
+///
+/// The second overload accepts an operand whose signedness is the opposite of `T` and
+/// converts it to `T` first.
+///
+/// \param pointer Location to update.
+/// \param operand Mask to AND into the stored value.
+/// \param order Memory ordering requested (defaults to `SequentiallyConsistent`).
+/// \return The value held at `pointer` before the operation.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAnd(
+  T* pointer,
+  T operand,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T previous = detail::AtomicAndImpl(pointer, operand, order);
+  return previous;
+}
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicAnd(
+  T* pointer,
+  detail::OppositeSign<T> operand,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T converted = static_cast<T>(operand);
+  return detail::AtomicAndImpl(pointer, converted, order);
+}
+///@}
+
+///@{
+/// \brief Atomic function to OR bits to a shared memory location.
+///
+/// Performs a bitwise OR of `operand` and the value stored at `pointer`, writes the
+/// result back to that location, and returns the _old_ value that was there before.
+/// For example, calling `AtomicOr` with an operand of 0x3 on a location holding 0x6
+/// leaves 0x7 in memory and returns 0x6.
+///
+/// Simultaneous calls from multiple threads do not interfere with each other: the
+/// outcome is as if the calls ran one after the other (in an indeterminate order).
+///
+/// The second overload accepts an operand whose signedness is the opposite of `T` and
+/// converts it to `T` first.
+///
+/// \param pointer Location to update.
+/// \param operand Mask to OR into the stored value.
+/// \param order Memory ordering requested (defaults to `SequentiallyConsistent`).
+/// \return The value held at `pointer` before the operation.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T
+AtomicOr(T* pointer, T operand, vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T previous = detail::AtomicOrImpl(pointer, operand, order);
+  return previous;
+}
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicOr(
+  T* pointer,
+  detail::OppositeSign<T> operand,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T converted = static_cast<T>(operand);
+  return detail::AtomicOrImpl(pointer, converted, order);
+}
+///@}
+
+///@{
+/// \brief Atomic function to XOR bits to a shared memory location.
+///
+/// Performs a bitwise exclusive-OR of `operand` and the value stored at `pointer`,
+/// writes the result back to that location, and returns the _old_ value that was there
+/// before. For example, calling `AtomicXor` with an operand of 0x3 on a location
+/// holding 0x6 leaves 0x5 in memory and returns 0x6.
+///
+/// Simultaneous calls from multiple threads do not interfere with each other: the
+/// outcome is as if the calls ran one after the other.
+///
+/// The second overload accepts an operand whose signedness is the opposite of `T` and
+/// converts it to `T` first.
+///
+/// \param pointer Location to update.
+/// \param operand Mask to XOR into the stored value.
+/// \param order Memory ordering requested (defaults to `SequentiallyConsistent`).
+/// \return The value held at `pointer` before the operation.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicXor(
+  T* pointer,
+  T operand,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T previous = detail::AtomicXorImpl(pointer, operand, order);
+  return previous;
+}
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicXor(
+  T* pointer,
+  detail::OppositeSign<T> operand,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T converted = static_cast<T>(operand);
+  return detail::AtomicXorImpl(pointer, converted, order);
+}
+///@}
+
+/// \brief Atomic function to NOT bits to a shared memory location.
+///
+/// Performs a bitwise NOT of the value stored at `pointer`, writes the result back to
+/// that location, and returns the _old_ value that was there before.
+///
+/// Simultaneous calls from multiple threads do not interfere with each other: the
+/// outcome is as if the calls ran one after the other.
+///
+/// \param pointer Location to update.
+/// \param order Memory ordering requested (defaults to `SequentiallyConsistent`).
+/// \return The value held at `pointer` before the operation.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicNot(
+  T* pointer,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T previous = detail::AtomicNotImpl(pointer, order);
+  return previous;
+}
+
+/// \brief Atomic function that replaces a value given a condition.
+///
+/// Compares the value stored at `pointer` with `expected`. When they are the same, the
+/// stored value is replaced with `desired`; otherwise the memory at `pointer` is left
+/// unchanged. Either way, the function returns the _old_ value that was at the pointer
+/// before the call.
+///
+/// Simultaneous calls from multiple threads produce a result that is consistent with
+/// the calls having run one after the other (in an indeterminate order).
+///
+/// \param pointer Location to update.
+/// \param desired Value to store when the comparison succeeds.
+/// \param expected Value the location must currently hold for the swap to happen.
+/// \param order Memory ordering requested (defaults to `SequentiallyConsistent`).
+/// \return The value held at `pointer` before the operation.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicCompareAndSwap(
+  T* pointer,
+  T desired,
+  T expected,
+  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
+{
+  const T previous = detail::AtomicCompareAndSwapImpl(pointer, desired, expected, order);
+  return previous;
+}
+
+} // namespace vtkm
+
+#endif //vtk_m_Atomic_h
--- a/vtkm/Bitset.h
+++ b/vtkm/Bitset.h
@ -55,6 +55,11 @@ struct Bitset
    return ((this->Mask & (static_cast<MaskType>(1) << bitIndex)) != 0);
  }

+  /// Returns true when this bitset and `otherBitset` hold the same `Mask` value.
+  VTKM_EXEC_CONT bool operator==(const vtkm::Bitset<MaskType>& otherBitset) const
+  {
+    return this->Mask == otherBitset.Mask;
+  }
+
 private:
  MaskType Mask = 0;
 };
--- a/vtkm/CMakeLists.txt
+++ b/vtkm/CMakeLists.txt
@ -19,6 +19,7 @@ vtkm_install_headers(
 set(headers
  Algorithms.h
  Assert.h
+  Atomic.h
  BinaryPredicates.h
  BinaryOperators.h
  Bitset.h
--- a/vtkm/CellShape.h
+++ b/vtkm/CellShape.h
@ -79,8 +79,8 @@ struct CellShapeTagVtkmToVtkc;
 /// concept check to make sure that a template argument is a proper cell shape
 /// tag.
 ///
-#define VTKM_IS_CELL_SHAPE_TAG(tag)                                                                \
-  VTKM_STATIC_ASSERT_MSG(::vtkm::internal::CellShapeTagCheck<tag>::value,                          \
+#define VTKM_IS_CELL_SHAPE_TAG(tag)                                       \
+  VTKM_STATIC_ASSERT_MSG(::vtkm::internal::CellShapeTagCheck<tag>::value, \
                         "Provided type is not a valid VTK-m cell shape tag.")

 /// A traits-like class to get an CellShapeId known at compile time to a tag.
@ -98,32 +98,32 @@ struct CellShapeIdToTag
 // Define a tag for each cell shape as well as the support structs to go
 // between tags and ids. The following macro is only valid here.

-#define VTKM_DEFINE_CELL_TAG(name, idname)                                                         \
-  struct CellShapeTag##name                                                                        \
-  {                                                                                                \
-    static constexpr vtkm::UInt8 Id = vtkm::idname;                                                \
-  };                                                                                               \
-  namespace internal                                                                               \
-  {                                                                                                \
-  template <>                                                                                      \
-  struct CellShapeTagCheck<vtkm::CellShapeTag##name> : std::true_type                              \
-  {                                                                                                \
-  };                                                                                               \
-  template <>                                                                                      \
-  struct CellShapeTagVtkmToVtkc<vtkm::CellShapeTag##name>                                          \
-  {                                                                                                \
-    using Type = lcl::name;                                                                        \
-  };                                                                                               \
-  }                                                                                                \
-  static inline VTKM_EXEC_CONT const char* GetCellShapeName(vtkm::CellShapeTag##name)              \
-  {                                                                                                \
-    return #name;                                                                                  \
-  }                                                                                                \
-  template <>                                                                                      \
-  struct CellShapeIdToTag<vtkm::idname>                                                            \
-  {                                                                                                \
-    using valid = std::true_type;                                                                  \
-    using Tag = vtkm::CellShapeTag##name;                                                          \
+#define VTKM_DEFINE_CELL_TAG(name, idname)                                            \
+  struct CellShapeTag##name                                                           \
+  {                                                                                   \
+    static constexpr vtkm::UInt8 Id = vtkm::idname;                                   \
+  };                                                                                  \
+  namespace internal                                                                  \
+  {                                                                                   \
+  template <>                                                                         \
+  struct CellShapeTagCheck<vtkm::CellShapeTag##name> : std::true_type                 \
+  {                                                                                   \
+  };                                                                                  \
+  template <>                                                                         \
+  struct CellShapeTagVtkmToVtkc<vtkm::CellShapeTag##name>                             \
+  {                                                                                   \
+    using Type = lcl::name;                                                           \
+  };                                                                                  \
+  }                                                                                   \
+  static inline VTKM_EXEC_CONT const char* GetCellShapeName(vtkm::CellShapeTag##name) \
+  {                                                                                   \
+    return #name;                                                                     \
+  }                                                                                   \
+  template <>                                                                         \
+  struct CellShapeIdToTag<vtkm::idname>                                               \
+  {                                                                                   \
+    using valid = std::true_type;                                                     \
+    using Tag = vtkm::CellShapeTag##name;                                             \
  }

 VTKM_DEFINE_CELL_TAG(Empty, CELL_SHAPE_EMPTY);
@ -189,12 +189,12 @@ inline lcl::Cell make_LclCellShapeTag(const vtkm::CellShapeTagGeneric& tag,

 } // namespace internal

-#define vtkmGenericCellShapeMacroCase(cellShapeId, call)                                           \
-  case vtkm::cellShapeId:                                                                          \
-  {                                                                                                \
-    using CellShapeTag = vtkm::CellShapeIdToTag<vtkm::cellShapeId>::Tag;                           \
-    call;                                                                                          \
-  }                                                                                                \
+#define vtkmGenericCellShapeMacroCase(cellShapeId, call)                 \
+  case vtkm::cellShapeId:                                                \
+  {                                                                      \
+    using CellShapeTag = vtkm::CellShapeIdToTag<vtkm::cellShapeId>::Tag; \
+    call;                                                                \
+  }                                                                      \
  break

 /// \brief A macro used in a \c switch statement to determine cell shape.
@ -227,17 +227,17 @@ inline lcl::Cell make_LclCellShapeTag(const vtkm::CellShapeTagGeneric& tag,
 /// Note that \c vtkmGenericCellShapeMacro does not have a default case. You
 /// should consider adding one that gives a
 ///
-#define vtkmGenericCellShapeMacro(call)                                                            \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_EMPTY, call);                                           \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_VERTEX, call);                                          \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_LINE, call);                                            \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_POLY_LINE, call);                                       \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_TRIANGLE, call);                                        \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_POLYGON, call);                                         \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_QUAD, call);                                            \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_TETRA, call);                                           \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_HEXAHEDRON, call);                                      \
-  vtkmGenericCellShapeMacroCase(CELL_SHAPE_WEDGE, call);                                           \
+#define vtkmGenericCellShapeMacro(call)                       \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_EMPTY, call);      \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_VERTEX, call);     \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_LINE, call);       \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_POLY_LINE, call);  \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_TRIANGLE, call);   \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_POLYGON, call);    \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_QUAD, call);       \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_TETRA, call);      \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_HEXAHEDRON, call); \
+  vtkmGenericCellShapeMacroCase(CELL_SHAPE_WEDGE, call);      \
  vtkmGenericCellShapeMacroCase(CELL_SHAPE_PYRAMID, call)

 } // namespace vtkm
--- a/vtkm/CellTraits.h
+++ b/vtkm/CellTraits.h
@ -81,23 +81,23 @@ struct CellTraits

 // Define traits for every cell type.

-#define VTKM_DEFINE_CELL_TRAITS(name, dimensions, numPoints)                                       \
-  template <>                                                                                      \
-  struct CellTraits<vtkm::CellShapeTag##name>                                                      \
-  {                                                                                                \
-    static constexpr vtkm::IdComponent TOPOLOGICAL_DIMENSIONS = dimensions;                        \
-    using TopologicalDimensionsTag = vtkm::CellTopologicalDimensionsTag<TOPOLOGICAL_DIMENSIONS>;   \
-    using IsSizeFixed = vtkm::CellTraitsTagSizeFixed;                                              \
-    static constexpr vtkm::IdComponent NUM_POINTS = numPoints;                                     \
+#define VTKM_DEFINE_CELL_TRAITS(name, dimensions, numPoints)                                     \
+  template <>                                                                                    \
+  struct CellTraits<vtkm::CellShapeTag##name>                                                    \
+  {                                                                                              \
+    static constexpr vtkm::IdComponent TOPOLOGICAL_DIMENSIONS = dimensions;                      \
+    using TopologicalDimensionsTag = vtkm::CellTopologicalDimensionsTag<TOPOLOGICAL_DIMENSIONS>; \
+    using IsSizeFixed = vtkm::CellTraitsTagSizeFixed;                                            \
+    static constexpr vtkm::IdComponent NUM_POINTS = numPoints;                                   \
  }

-#define VTKM_DEFINE_CELL_TRAITS_VARIABLE(name, dimensions)                                         \
-  template <>                                                                                      \
-  struct CellTraits<vtkm::CellShapeTag##name>                                                      \
-  {                                                                                                \
-    static constexpr vtkm::IdComponent TOPOLOGICAL_DIMENSIONS = dimensions;                        \
-    using TopologicalDimensionsTag = vtkm::CellTopologicalDimensionsTag<TOPOLOGICAL_DIMENSIONS>;   \
-    using IsSizeFixed = vtkm::CellTraitsTagSizeVariable;                                           \
+#define VTKM_DEFINE_CELL_TRAITS_VARIABLE(name, dimensions)                                       \
+  template <>                                                                                    \
+  struct CellTraits<vtkm::CellShapeTag##name>                                                    \
+  {                                                                                              \
+    static constexpr vtkm::IdComponent TOPOLOGICAL_DIMENSIONS = dimensions;                      \
+    using TopologicalDimensionsTag = vtkm::CellTopologicalDimensionsTag<TOPOLOGICAL_DIMENSIONS>; \
+    using IsSizeFixed = vtkm::CellTraitsTagSizeVariable;                                         \
  }

 VTKM_DEFINE_CELL_TRAITS(Empty, 0, 0);
--- a/vtkm/Deprecated.h
+++ b/vtkm/Deprecated.h
@ -13,9 +13,9 @@
 #include <vtkm/StaticAssert.h>
 #include <vtkm/Types.h>

-#define VTK_M_DEPRECATED_MAKE_MESSAGE(...)                                                         \
+#define VTK_M_DEPRECATED_MAKE_MESSAGE(...) \
  VTKM_EXPAND(VTK_M_DEPRECATED_MAKE_MESSAGE_IMPL(__VA_ARGS__, "", vtkm::internal::NullType{}))
-#define VTK_M_DEPRECATED_MAKE_MESSAGE_IMPL(version, message, ...)                                  \
+#define VTK_M_DEPRECATED_MAKE_MESSAGE_IMPL(version, message, ...) \
  message " Deprecated in version " #version "."

 /// \def VTKM_DEPRECATED(version, message)
@ -104,7 +104,7 @@
 #if defined(VTKM_GCC) || defined(VTKM_CLANG)

 #define VTKM_DEPRECATED_SUPPRESS_SUPPORTED
-#define VTKM_DEPRECATED_SUPPRESS_BEGIN                                                             \
+#define VTKM_DEPRECATED_SUPPRESS_BEGIN \
  _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
 #define VTKM_DEPRECATED_SUPPRESS_END _Pragma("GCC diagnostic pop")

--- a/vtkm/ErrorCode.h
+++ b/vtkm/ErrorCode.h
@ -108,14 +108,14 @@ VTKM_EXEC_CONT inline vtkm::ErrorCode LclErrorToVtkmError(lcl::ErrorCode code) n

 } // namespace vtkm

-#define VTKM_RETURN_ON_ERROR(call)                                                                 \
-  do                                                                                               \
-  {                                                                                                \
-    auto status = (call);                                                                          \
-    if (status != ::vtkm::ErrorCode::Success)                                                      \
-    {                                                                                              \
-      return status;                                                                               \
-    }                                                                                              \
+#define VTKM_RETURN_ON_ERROR(call)            \
+  do                                          \
+  {                                           \
+    auto status = (call);                     \
+    if (status != ::vtkm::ErrorCode::Success) \
+    {                                         \
+      return status;                          \
+    }                                         \
  } while (false)

 #endif //vtk_m_exec_ErrorCode_h
--- a/vtkm/Geometry.hxx
+++ b/vtkm/Geometry.hxx
@ -16,7 +16,7 @@ namespace vtkm

 template <typename CoordType, int Dim, bool IsTwoSided>
 template <int Dim_, typename std::enable_if<Dim_ == 2, int>::type>
-Ray<CoordType, Dim, IsTwoSided>::Ray()
+VTKM_EXEC_CONT Ray<CoordType, Dim, IsTwoSided>::Ray()
  : Origin{ 0.f }
  , Direction{ 1.f, 0.f }
 {
@ -24,50 +24,42 @@ Ray<CoordType, Dim, IsTwoSided>::Ray()

+/// Default constructor selected when Dim_ == 3 (via the enable_if template parameter).
+/// Origin is initialized from { 0.f } and Direction from { 1.f, 0.f, 0.f }.
 template <typename CoordType, int Dim, bool IsTwoSided>
 template <int Dim_, typename std::enable_if<Dim_ == 3, int>::type>
-Ray<CoordType, Dim, IsTwoSided>::Ray()
+VTKM_EXEC_CONT Ray<CoordType, Dim, IsTwoSided>::Ray()
   : Origin{ 0.f }
   , Direction{ 1.f, 0.f, 0.f }
 {
 }

+/// Builds a ray from a line segment: Origin is the segment's first endpoint and Direction is
+/// vtkm::Normal() applied to segment.Direction().
+/// NOTE(review): vtkm::Normal() presumably returns the unit-length vector -- confirm.
 template <typename CoordType, int Dim, bool IsTwoSided>
-Ray<CoordType, Dim, IsTwoSided>::Ray(const LineSegment<CoordType, Dim>& segment)
+VTKM_EXEC_CONT Ray<CoordType, Dim, IsTwoSided>::Ray(const LineSegment<CoordType, Dim>& segment)
   : Origin(segment.Endpoints[0])
   , Direction(vtkm::Normal(segment.Direction()))
 {
 }

+/// Builds a ray that starts at \a point; the stored Direction is vtkm::Normal(direction), not the
+/// raw \a direction argument.
 template <typename CoordType, int Dim, bool IsTwoSided>
-Ray<CoordType, Dim, IsTwoSided>::Ray(const Vector& point, const Vector& direction)
+VTKM_EXEC_CONT Ray<CoordType, Dim, IsTwoSided>::Ray(const Vector& point, const Vector& direction)
   : Origin(point)
   , Direction(vtkm::Normal(direction))
 {
 }

+/// Returns the point Origin + Direction * param.
+/// No range check is applied to \a param here.
 template <typename CoordType, int Dim, bool IsTwoSided>
-typename Ray<CoordType, Dim, IsTwoSided>::Vector Ray<CoordType, Dim, IsTwoSided>::Evaluate(
-  CoordType param) const
+VTKM_EXEC_CONT typename Ray<CoordType, Dim, IsTwoSided>::Vector
+Ray<CoordType, Dim, IsTwoSided>::Evaluate(CoordType param) const
 {
   auto pointOnLine = this->Origin + this->Direction * param;
   return pointOnLine;
 }

+/// Returns true when the first component of Direction is not infinite according to
+/// vtkm::IsInf(). Only component 0 is inspected.
 template <typename CoordType, int Dim, bool IsTwoSided>
-bool Ray<CoordType, Dim, IsTwoSided>::IsValid() const
+VTKM_EXEC_CONT bool Ray<CoordType, Dim, IsTwoSided>::IsValid() const
 {
-// At least on Ubuntu 17.10, cuda 9.1 will fail with an internal
-// compiler error when calling vtkm::IsInf() here. But the fix
-// below works. The fix should be removed as soon as our dashboards
-// allow it.
-#if __CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ == 1
-  return !isinf(this->Direction[0]);
-#else
   return !vtkm::IsInf(this->Direction[0]);
-#endif
 }

 template <typename CoordType, int Dim, bool IsTwoSided>
-CoordType Ray<CoordType, Dim, IsTwoSided>::DistanceTo(const Vector& point) const
+VTKM_EXEC_CONT CoordType Ray<CoordType, Dim, IsTwoSided>::DistanceTo(const Vector& point) const
 {
  Vector closest;
  CoordType param;
@ -75,9 +67,9 @@ CoordType Ray<CoordType, Dim, IsTwoSided>::DistanceTo(const Vector& point) const
 }

 template <typename CoordType, int Dim, bool IsTwoSided>
-CoordType Ray<CoordType, Dim, IsTwoSided>::DistanceTo(const Vector& point,
-                                                      CoordType& param,
-                                                      Vector& projectedPoint) const
+VTKM_EXEC_CONT CoordType Ray<CoordType, Dim, IsTwoSided>::DistanceTo(const Vector& point,
+                                                                     CoordType& param,
+                                                                     Vector& projectedPoint) const
 {
  const auto& dir = this->Direction;
  auto mag2 = vtkm::MagnitudeSquared(dir);
@ -105,9 +97,10 @@ CoordType Ray<CoordType, Dim, IsTwoSided>::DistanceTo(const Vector& point,

 template <typename CoordType, int Dim, bool IsTwoSided>
 template <bool OtherTwoSided, int Dim_, typename std::enable_if<Dim_ == 2, int>::type>
-bool Ray<CoordType, Dim, IsTwoSided>::Intersect(const Ray<CoordType, Dim, OtherTwoSided>& other,
-                                                Vector& point,
-                                                CoordType tol)
+VTKM_EXEC_CONT bool Ray<CoordType, Dim, IsTwoSided>::Intersect(
+  const Ray<CoordType, Dim, OtherTwoSided>& other,
+  Vector& point,
+  CoordType tol)
 {
  auto d1 = this->Direction;
  auto d2 = other.Direction;
@ -139,33 +132,33 @@ bool Ray<CoordType, Dim, IsTwoSided>::Intersect(const Ray<CoordType, Dim, OtherT

+/// Default constructor selected when Dim_ == 2 (via the enable_if template parameter).
+/// The endpoints are initialized from { 0.f } and { 1.f, 0.f }.
 template <typename CoordType, int Dim>
 template <int Dim_, typename std::enable_if<Dim_ == 2, int>::type>
-LineSegment<CoordType, Dim>::LineSegment()
+VTKM_EXEC_CONT LineSegment<CoordType, Dim>::LineSegment()
   : Endpoints{ { 0.f }, { 1.f, 0.f } }
 {
 }

+/// Default constructor selected when Dim_ == 3 (via the enable_if template parameter).
+/// The endpoints are initialized from { 0.f } and { 1.f, 0.f, 0.f }.
 template <typename CoordType, int Dim>
 template <int Dim_, typename std::enable_if<Dim_ == 3, int>::type>
-LineSegment<CoordType, Dim>::LineSegment()
+VTKM_EXEC_CONT LineSegment<CoordType, Dim>::LineSegment()
   : Endpoints{ { 0.f }, { 1.f, 0.f, 0.f } }
 {
 }

+/// Builds a segment whose endpoints are \a p0 and \a p1, stored in that order.
 template <typename CoordType, int Dim>
-LineSegment<CoordType, Dim>::LineSegment(const Vector& p0, const Vector& p1)
+VTKM_EXEC_CONT LineSegment<CoordType, Dim>::LineSegment(const Vector& p0, const Vector& p1)
   : Endpoints{ p0, p1 }
 {
 }

+/// Returns true when the squared magnitude of Direction() is strictly less than \a tol2.
+/// \a tol2 is compared against a squared length, so it is a squared tolerance.
 template <typename CoordType, int Dim>
-bool LineSegment<CoordType, Dim>::IsSingular(CoordType tol2) const
+VTKM_EXEC_CONT bool LineSegment<CoordType, Dim>::IsSingular(CoordType tol2) const
 {
   return vtkm::MagnitudeSquared(this->Direction()) < tol2;
 }

 template <typename CoordType, int Dim>
 template <int Dim_, typename std::enable_if<Dim_ == 2, int>::type>
-Ray<CoordType, Dim, true> LineSegment<CoordType, Dim>::PerpendicularBisector() const
+VTKM_EXEC_CONT Ray<CoordType, Dim, true> LineSegment<CoordType, Dim>::PerpendicularBisector() const
 {
  const Vector dir = this->Direction();
  const Vector perp(-dir[1], dir[0]);
@ -175,13 +168,13 @@ Ray<CoordType, Dim, true> LineSegment<CoordType, Dim>::PerpendicularBisector() c

+/// 3D overload (enabled when Dim_ == 3): returns a Plane constructed from this segment's
+/// Center() as the origin argument and its Direction() as the normal argument.
 template <typename CoordType, int Dim>
 template <int Dim_, typename std::enable_if<Dim_ == 3, int>::type>
-Plane<CoordType> LineSegment<CoordType, Dim>::PerpendicularBisector() const
+VTKM_EXEC_CONT Plane<CoordType> LineSegment<CoordType, Dim>::PerpendicularBisector() const
 {
   return Plane<CoordType>(this->Center(), this->Direction());
 }

 template <typename CoordType, int Dim>
-typename LineSegment<CoordType, Dim>::Vector LineSegment<CoordType, Dim>::Evaluate(
+VTKM_EXEC_CONT typename LineSegment<CoordType, Dim>::Vector LineSegment<CoordType, Dim>::Evaluate(
  CoordType param) const
 {
  auto pointOnLine = this->Endpoints[0] * (1.0f - param) + this->Endpoints[1] * param;
@ -189,7 +182,7 @@ typename LineSegment<CoordType, Dim>::Vector LineSegment<CoordType, Dim>::Evalua
 }

 template <typename CoordType, int Dim>
-CoordType LineSegment<CoordType, Dim>::DistanceTo(const Vector& point) const
+VTKM_EXEC_CONT CoordType LineSegment<CoordType, Dim>::DistanceTo(const Vector& point) const
 {
  Vector closest;
  CoordType param;
@ -197,9 +190,9 @@ CoordType LineSegment<CoordType, Dim>::DistanceTo(const Vector& point) const
 }

 template <typename CoordType, int Dim>
-CoordType LineSegment<CoordType, Dim>::DistanceTo(const Vector& point,
-                                                  CoordType& param,
-                                                  Vector& projectedPoint) const
+VTKM_EXEC_CONT CoordType LineSegment<CoordType, Dim>::DistanceTo(const Vector& point,
+                                                                 CoordType& param,
+                                                                 Vector& projectedPoint) const
 {
  auto dir = this->Endpoints[1] - this->Endpoints[0];
  auto mag2 = vtkm::MagnitudeSquared(dir);
@ -224,9 +217,10 @@ CoordType LineSegment<CoordType, Dim>::DistanceTo(const Vector& point,

 template <typename CoordType, int Dim>
 template <int Dim_, typename std::enable_if<Dim_ == 2, int>::type>
-bool LineSegment<CoordType, Dim>::IntersectInfinite(const LineSegment<CoordType, Dim>& other,
-                                                    Vector& point,
-                                                    CoordType tol)
+VTKM_EXEC_CONT bool LineSegment<CoordType, Dim>::IntersectInfinite(
+  const LineSegment<CoordType, Dim>& other,
+  Vector& point,
+  CoordType tol)
 {
  auto d1 = this->Direction();
  auto d2 = other.Direction();
@ -249,14 +243,14 @@ bool LineSegment<CoordType, Dim>::IntersectInfinite(const LineSegment<CoordType,
 // Plane

 template <typename CoordType>
-Plane<CoordType>::Plane()
+VTKM_EXEC_CONT VTKM_EXEC_CONT Plane<CoordType>::Plane()
  : Origin{ 0.f, 0.f, 0.f }
  , Normal{ 0.f, 0.f, 1.f }
 {
 }

 template <typename CoordType>
-Plane<CoordType>::Plane(const Vector& origin, const Vector& normal, CoordType tol2)
+VTKM_EXEC_CONT Plane<CoordType>::Plane(const Vector& origin, const Vector& normal, CoordType tol2)
  : Origin(origin)
  , Normal(vtkm::Normal(normal))
 {
@ -268,14 +262,15 @@ Plane<CoordType>::Plane(const Vector& origin, const Vector& normal, CoordType to
 }

+/// Returns the dot product of (point - Origin) with Normal.
+/// The value is the raw dot product, so it can be negative.
 template <typename CoordType>
-CoordType Plane<CoordType>::DistanceTo(const Vector& point) const
+VTKM_EXEC_CONT CoordType Plane<CoordType>::DistanceTo(const Vector& point) const
 {
   auto dist = vtkm::Dot(point - this->Origin, this->Normal);
   return dist;
 }

 template <typename CoordType>
-typename Plane<CoordType>::Vector Plane<CoordType>::ClosestPoint(const Vector& point) const
+VTKM_EXEC_CONT typename Plane<CoordType>::Vector Plane<CoordType>::ClosestPoint(
+  const Vector& point) const
 {
  auto vop = vtkm::Project(point - this->Origin, this->Normal);
  auto closest = point - vop;
@ -284,11 +279,11 @@ typename Plane<CoordType>::Vector Plane<CoordType>::ClosestPoint(const Vector& p

 template <typename CoordType>
 template <bool IsTwoSided>
-bool Plane<CoordType>::Intersect(const Ray<CoordType, 3, IsTwoSided>& ray,
-                                 CoordType& parameter,
-                                 Vector& point,
-                                 bool& lineInPlane,
-                                 CoordType tol) const
+VTKM_EXEC_CONT bool Plane<CoordType>::Intersect(const Ray<CoordType, 3, IsTwoSided>& ray,
+                                                CoordType& parameter,
+                                                Vector& point,
+                                                bool& lineInPlane,
+                                                CoordType tol) const
 {
  CoordType d0 = this->DistanceTo(ray.Origin);
  CoordType dirDot = vtkm::Dot(this->Normal, ray.Direction);
@ -330,19 +325,19 @@ bool Plane<CoordType>::Intersect(const Ray<CoordType, 3, IsTwoSided>& ray,
 }

+/// Convenience overload for callers that do not need the intersection point: it forwards to the
+/// four-argument Intersect() with a local Vector that is discarded on return.
 template <typename CoordType>
-bool Plane<CoordType>::Intersect(const LineSegment<CoordType>& segment,
-                                 CoordType& parameter,
-                                 bool& lineInPlane) const
+VTKM_EXEC_CONT bool Plane<CoordType>::Intersect(const LineSegment<CoordType>& segment,
+                                                CoordType& parameter,
+                                                bool& lineInPlane) const
 {
   Vector point;
   return this->Intersect(segment, parameter, point, lineInPlane);
 }

 template <typename CoordType>
-bool Plane<CoordType>::Intersect(const LineSegment<CoordType>& segment,
-                                 CoordType& parameter,
-                                 Vector& point,
-                                 bool& lineInPlane) const
+VTKM_EXEC_CONT bool Plane<CoordType>::Intersect(const LineSegment<CoordType>& segment,
+                                                CoordType& parameter,
+                                                Vector& point,
+                                                bool& lineInPlane) const
 {
  CoordType d0 = this->DistanceTo(segment.Endpoints[0]);
  CoordType d1 = this->DistanceTo(segment.Endpoints[1]);
@ -394,10 +389,10 @@ bool Plane<CoordType>::Intersect(const LineSegment<CoordType>& segment,
 }

 template <typename CoordType>
-bool Plane<CoordType>::Intersect(const Plane<CoordType>& other,
-                                 Ray<CoordType, 3, true>& ray,
-                                 bool& coincident,
-                                 CoordType tol2) const
+VTKM_EXEC_CONT bool Plane<CoordType>::Intersect(const Plane<CoordType>& other,
+                                                Ray<CoordType, 3, true>& ray,
+                                                bool& coincident,
+                                                CoordType tol2) const
 {
  auto dir = vtkm::Cross(this->Normal, other.Normal);
  auto mag2 = vtkm::MagnitudeSquared(dir);
@ -434,27 +429,27 @@ bool Plane<CoordType>::Intersect(const Plane<CoordType>& other,
 // Sphere

+/// Default constructor: Center is initialized from { 0.f } and Radius is 1.
 template <typename CoordType, int Dim>
-Sphere<CoordType, Dim>::Sphere()
+VTKM_EXEC_CONT Sphere<CoordType, Dim>::Sphere()
   : Center{ 0.f }
   , Radius(static_cast<CoordType>(1.f))
 {
 }

+/// Builds a sphere centered at \a center. A \a radius that is zero or negative is replaced by
+/// -1 in the stored Radius; any positive radius is stored unchanged.
 template <typename CoordType, int Dim>
-Sphere<CoordType, Dim>::Sphere(const Vector& center, CoordType radius)
+VTKM_EXEC_CONT Sphere<CoordType, Dim>::Sphere(const Vector& center, CoordType radius)
   : Center(center)
   , Radius(radius <= 0.f ? static_cast<CoordType>(-1.0f) : radius)
 {
 }

+/// Returns true when Classify(point, tol2) yields a negative value.
 template <typename CoordType, int Dim>
-bool Sphere<CoordType, Dim>::Contains(const Vector& point, CoordType tol2) const
+VTKM_EXEC_CONT bool Sphere<CoordType, Dim>::Contains(const Vector& point, CoordType tol2) const
 {
   return this->Classify(point, tol2) < 0;
 }

 template <typename CoordType, int Dim>
-int Sphere<CoordType, Dim>::Classify(const Vector& point, CoordType tol2) const
+VTKM_EXEC_CONT int Sphere<CoordType, Dim>::Classify(const Vector& point, CoordType tol2) const
 {
  if (!this->IsValid())
  {
@ -469,16 +464,17 @@ int Sphere<CoordType, Dim>::Classify(const Vector& point, CoordType tol2) const
 // Construction techniques

+/// Builds a plane through \a point whose normal argument is the cross product of ray.Direction
+/// with (point - ray.Origin). \a tol2 is passed straight through to the Plane constructor.
 template <typename CoordType, bool IsTwoSided>
-vtkm::Plane<CoordType> make_PlaneFromPointAndLine(const vtkm::Vec<CoordType, 3>& point,
-                                                  const vtkm::Ray<CoordType, 3, IsTwoSided>& ray,
-                                                  CoordType tol2)
+VTKM_EXEC_CONT vtkm::Plane<CoordType> make_PlaneFromPointAndLine(
+  const vtkm::Vec<CoordType, 3>& point,
+  const vtkm::Ray<CoordType, 3, IsTwoSided>& ray,
+  CoordType tol2)
 {
   auto tmpDir = point - ray.Origin;
   return vtkm::Plane<CoordType>(point, vtkm::Cross(ray.Direction, tmpDir), tol2);
 }

 template <typename CoordType>
-vtkm::Plane<CoordType> make_PlaneFromPointAndLineSegment(
+VTKM_EXEC_CONT vtkm::Plane<CoordType> make_PlaneFromPointAndLineSegment(
  const vtkm::Vec<CoordType, 3>& point,
  const vtkm::LineSegment3<CoordType>& segment,
  CoordType tol2)
@ -488,10 +484,11 @@ vtkm::Plane<CoordType> make_PlaneFromPointAndLineSegment(
 }

 template <typename CoordType>
-vtkm::Circle<CoordType> make_CircleFrom3Points(const typename vtkm::Vec<CoordType, 2>& p0,
-                                               const typename vtkm::Vec<CoordType, 2>& p1,
-                                               const typename vtkm::Vec<CoordType, 2>& p2,
-                                               CoordType tol)
+VTKM_EXEC_CONT vtkm::Circle<CoordType> make_CircleFrom3Points(
+  const typename vtkm::Vec<CoordType, 2>& p0,
+  const typename vtkm::Vec<CoordType, 2>& p1,
+  const typename vtkm::Vec<CoordType, 2>& p2,
+  CoordType tol)
 {
  constexpr int Dim = 2;
  using Vector = typename vtkm::Circle<CoordType>::Vector;
@ -518,11 +515,11 @@ vtkm::Circle<CoordType> make_CircleFrom3Points(const typename vtkm::Vec<CoordTyp
 }

 template <typename CoordType>
-vtkm::Sphere<CoordType, 3> make_SphereFrom4Points(const vtkm::Vec<CoordType, 3>& a0,
-                                                  const vtkm::Vec<CoordType, 3>& a1,
-                                                  const vtkm::Vec<CoordType, 3>& a2,
-                                                  const vtkm::Vec<CoordType, 3>& a3,
-                                                  CoordType tol)
+VTKM_EXEC_CONT vtkm::Sphere<CoordType, 3> make_SphereFrom4Points(const vtkm::Vec<CoordType, 3>& a0,
+                                                                 const vtkm::Vec<CoordType, 3>& a1,
+                                                                 const vtkm::Vec<CoordType, 3>& a2,
+                                                                 const vtkm::Vec<CoordType, 3>& a3,
+                                                                 CoordType tol)
 {
  // Choose p3 such that the min(p3 - p[012]) is larger than any other choice of p3.
  // From: http://steve.hollasch.net/cgindex/geometry/sphere4pts.html,
--- a/vtkm/ImplicitFunction.h
+++ b/vtkm/ImplicitFunction.h
@ -665,22 +665,18 @@ private:

 } // namespace vtkm

-#ifdef VTKM_CUDA
-
 // Cuda seems to have a bug where it expects the template class VirtualObjectTransfer
 // to be instantiated in a consistent order among all the translation units of an
 // executable. Failing to do so results in random crashes and incorrect results.
 // We work around this issue by explicitly instantiating VirtualObjectTransfer for
 // all the implicit functions here.
-
-#include <vtkm/cont/cuda/internal/VirtualObjectTransferCuda.h>
-
+#ifdef VTKM_CUDA
+#include <vtkm/cont/internal/VirtualObjectTransferInstantiate.h>
 VTKM_EXPLICITLY_INSTANTIATE_TRANSFER(vtkm::Box);
 VTKM_EXPLICITLY_INSTANTIATE_TRANSFER(vtkm::Cylinder);
 VTKM_EXPLICITLY_INSTANTIATE_TRANSFER(vtkm::Frustum);
 VTKM_EXPLICITLY_INSTANTIATE_TRANSFER(vtkm::Plane);
 VTKM_EXPLICITLY_INSTANTIATE_TRANSFER(vtkm::Sphere);
-
 #endif

 #endif //vtk_m_ImplicitFunction_h
--- a/vtkm/List.h
+++ b/vtkm/List.h
@ -57,8 +57,8 @@ using IsList = typename vtkm::internal::IsListImpl<T>::type;
 /// actually a device adapter tag. (You can get weird errors elsewhere in the
 /// code when a mistake is made.)
 ///
-#define VTKM_IS_LIST(type)                                                                         \
-  VTKM_STATIC_ASSERT_MSG((::vtkm::internal::IsList<type>::value),                                  \
+#define VTKM_IS_LIST(type)                                        \
+  VTKM_STATIC_ASSERT_MSG((::vtkm::internal::IsList<type>::value), \
                         "Provided type is not a valid VTK-m list type.")

 namespace detail
@ -226,8 +226,7 @@ template <vtkm::IdComponent NumSearched,
          typename... Ts>
 struct FindFirstOfType<NumSearched, Target, T0, T1, T2, T3, T4, T5, Ts...>
  : FindFirstOfSplit4<(std::is_same<Target, T0>::value || std::is_same<Target, T1>::value ||
-                       std::is_same<Target, T2>::value ||
-                       std::is_same<Target, T3>::value),
+                       std::is_same<Target, T2>::value || std::is_same<Target, T3>::value),
                      NumSearched,
                      Target,
                      T0,
@ -257,8 +256,7 @@ template <vtkm::IdComponent NumSearched,
          typename... Ts>
 struct FindFirstOfSplit8<true, NumSearched, Target, T0, T1, T2, T3, T4, T5, T6, T7, Ts...>
  : FindFirstOfSplit4<(std::is_same<Target, T0>::value || std::is_same<Target, T1>::value ||
-                       std::is_same<Target, T2>::value ||
-                       std::is_same<Target, T3>::value),
+                       std::is_same<Target, T2>::value || std::is_same<Target, T3>::value),
                      NumSearched,
                      Target,
                      T0,
@ -305,12 +303,9 @@ template <vtkm::IdComponent NumSearched,
          typename... Ts>
 struct FindFirstOfType<NumSearched, Target, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, Ts...>
  : FindFirstOfSplit8<(std::is_same<Target, T0>::value || std::is_same<Target, T1>::value ||
-                       std::is_same<Target, T2>::value ||
-                       std::is_same<Target, T3>::value ||
-                       std::is_same<Target, T4>::value ||
-                       std::is_same<Target, T5>::value ||
-                       std::is_same<Target, T6>::value ||
-                       std::is_same<Target, T7>::value),
+                       std::is_same<Target, T2>::value || std::is_same<Target, T3>::value ||
+                       std::is_same<Target, T4>::value || std::is_same<Target, T5>::value ||
+                       std::is_same<Target, T6>::value || std::is_same<Target, T7>::value),
                      NumSearched,
                      Target,
                      T0,
--- a/vtkm/ListTag.h
+++ b/vtkm/ListTag.h
@ -103,8 +103,8 @@ struct VTKM_DEPRECATED(1.6, "VTKM_IS_LIST_TAG replaced with VTKM_IS_LIST.") List
 /// actually a device adapter tag. (You can get weird errors elsewhere in the
 /// code when a mistake is made.)
 ///
-#define VTKM_IS_LIST_TAG(tag)                                                                      \
-  VTKM_STATIC_ASSERT_MSG((::vtkm::detail::ListTagAssert<tag>::value),                              \
+#define VTKM_IS_LIST_TAG(tag)                                         \
+  VTKM_STATIC_ASSERT_MSG((::vtkm::detail::ListTagAssert<tag>::value), \
                         "Provided type is not a valid VTK-m list tag.")

 namespace internal
--- a/vtkm/Math.h
+++ b/vtkm/Math.h
@ -17,8 +17,10 @@
 #include <vtkm/Types.h>
 #include <vtkm/VecTraits.h>

+#include <limits> // must be found with or without CUDA.
 #ifndef VTKM_CUDA
 #include <cmath>
+#include <cstring>
 #include <limits.h>
 #include <math.h>
 #include <stdlib.h>
@ -2584,6 +2586,105 @@ inline VTKM_EXEC_CONT vtkm::Float64 Ldexp(vtkm::Float64 x, vtkm::Int32 exponent)
 #endif
 }

+// See: https://randomascii.wordpress.com/2012/01/23/stupid-float-tricks-2/ for why this works.
+/// Measures how far apart two Float64 values are by comparing their bit patterns as unsigned
+/// 64-bit integers.
+///
+/// \param x first value
+/// \param y second value
+/// \return 0xFFFFFFFFFFFFFFFF when either argument is not finite (per vtkm::IsFinite); otherwise
+///         the absolute difference of the two bit patterns. For arguments of opposite sign the
+///         result is the sum of each magnitude's distance from 0.0.
+inline VTKM_EXEC_CONT vtkm::UInt64 FloatDistance(vtkm::Float64 x, vtkm::Float64 y)
+{
+  static_assert(sizeof(vtkm::Float64) == sizeof(vtkm::UInt64), "vtkm::Float64 is incorrect size.");
+  static_assert(std::numeric_limits<vtkm::Float64>::has_denorm == std::denorm_present, "FloatDistance presumes the floating-point type has subnormal numbers.");
+
+  // Non-finite inputs get the largest UInt64 value as a sentinel.
+  if (!vtkm::IsFinite(x) || !vtkm::IsFinite(y)) {
+    return 0xFFFFFFFFFFFFFFFFL;
+  }
+
+  // Signed zero is the sworn enemy of this process.
+  // Replacing a zero by its absolute value turns -0.0 into +0.0 before the sign tests below.
+  if (y == 0) {
+    y = vtkm::Abs(y);
+  }
+  if (x == 0) {
+    x = vtkm::Abs(x);
+  }
+
+  // Opposite signs: measure each magnitude from 0.0 and add the two distances.
+  if ( (x < 0 && y >= 0) || (x >= 0 && y < 0) )
+  {
+    vtkm::UInt64 dx, dy;
+    if (x < 0) {
+      dy = FloatDistance(0.0, y);
+      dx = FloatDistance(0.0, -x);
+    }
+    else {
+      dy = FloatDistance(0.0, -y);
+      dx = FloatDistance(0.0, x);
+    }
+
+    return dx + dy;
+  }
+
+  // Both negative: negate both and recurse, which then takes the non-negative path below.
+  if (x < 0 && y < 0) {
+    return FloatDistance(-x, -y);
+  }
+
+  // Note that:
+  // int64_t xi = *reinterpret_cast<int64_t*>(&x);
+  // int64_t yi = *reinterpret_cast<int64_t*>(&y);
+  // also works, but generates warnings.
+  // Good option to have if we get compile errors off memcpy or don't want to #include <cstring> though.
+  // At least on gcc, both versions generate the same assembly.
+  vtkm::UInt64 xi;
+  vtkm::UInt64 yi;
+  memcpy(&xi, &x, sizeof(vtkm::UInt64));
+  memcpy(&yi, &y, sizeof(vtkm::UInt64));
+  // Subtract the smaller pattern from the larger so the unsigned result never wraps.
+  if (yi > xi) {
+    return yi - xi;
+  }
+  return xi - yi;
+}
+
+/// Float32 overload: measures how far apart two Float32 values are by comparing their 32-bit
+/// patterns, widened to UInt64 before subtraction.
+///
+/// \param x first value
+/// \param y second value
+/// \return 0xFFFFFFFFFFFFFFFF when either argument is not finite (per vtkm::IsFinite); otherwise
+///         the absolute difference of the two bit patterns. For arguments of opposite sign the
+///         result is the sum of each magnitude's distance from 0.0f.
+inline VTKM_EXEC_CONT vtkm::UInt64 FloatDistance(vtkm::Float32 x, vtkm::Float32 y)
+{
+  // NOTE(review): this checks the size against vtkm::Int32 while the bits are copied into
+  // vtkm::UInt32 below -- presumably the same size; confirm.
+  static_assert(sizeof(vtkm::Float32) == sizeof(vtkm::Int32), "vtkm::Float32 is incorrect size.");
+  static_assert(std::numeric_limits<vtkm::Float32>::has_denorm == std::denorm_present, "FloatDistance presumes the floating-point type has subnormal numbers.");
+
+  // Non-finite inputs get the largest UInt64 value as a sentinel.
+  if (!vtkm::IsFinite(x) || !vtkm::IsFinite(y)) {
+    return 0xFFFFFFFFFFFFFFFFL;
+  }
+
+  // Replacing a zero by its absolute value turns -0.0f into +0.0f before the sign tests below.
+  if (y == 0) {
+    y = vtkm::Abs(y);
+  }
+  if (x == 0) {
+    x = vtkm::Abs(x);
+  }
+
+  // Opposite signs: measure each magnitude from 0.0f and add the two distances.
+  if ( (x < 0 && y >= 0) || (x >= 0 && y < 0) )
+  {
+    vtkm::UInt64 dx, dy;
+    if (x < 0) {
+      dy = FloatDistance(0.0f, y);
+      dx = FloatDistance(0.0f, -x);
+    }
+    else {
+      dy = FloatDistance(0.0f, -y);
+      dx = FloatDistance(0.0f, x);
+    }
+    return dx + dy;
+  }
+
+  // Both negative: negate both and recurse, which then takes the non-negative path below.
+  if (x < 0 && y < 0) {
+    return FloatDistance(-x, -y);
+  }
+
+  // Copy the raw 32-bit patterns, then widen to UInt64 to match the return type.
+  vtkm::UInt32 xi_32;
+  vtkm::UInt32 yi_32;
+  memcpy(&xi_32, &x, sizeof(vtkm::UInt32));
+  memcpy(&yi_32, &y, sizeof(vtkm::UInt32));
+  vtkm::UInt64 xi = xi_32;
+  vtkm::UInt64 yi = yi_32;
+  // Subtract the smaller pattern from the larger so the unsigned result never wraps.
+  if (yi > xi) {
+    return yi - xi;
+  }
+  return xi - yi;
+}
+
 /// Bitwise operations
 ///

--- a/vtkm/Math.h.in
+++ b/vtkm/Math.h.in
@ -29,8 +29,10 @@ $# Ignore the following comment. It is meant for the generated file.
 #include <vtkm/Types.h>
 #include <vtkm/VecTraits.h>

+#include <limits> // must be found with or without CUDA.
 #ifndef VTKM_CUDA
 #include <cmath>
+#include <cstring>
 #include <limits.h>
 #include <math.h>
 #include <stdlib.h>
@ -1186,6 +1188,105 @@ inline VTKM_EXEC_CONT vtkm::Float64 Ldexp(vtkm::Float64 x, vtkm::Int32 exponent)
 #endif
 }

+/// \brief Distance between two double-precision values, counted in representable doubles.
+///
+/// Returns how many representable \c vtkm::Float64 steps separate \c x and \c y
+/// (0 when they are equal, with +0.0 and -0.0 treated as the same value).
+/// If either argument is rejected by \c vtkm::IsFinite, the largest
+/// \c vtkm::UInt64 value is returned as a sentinel.
+///
+/// The implementation is deliberately non-recursive so that it can be compiled
+/// for execution environments where recursion is costly or discouraged.
+/// See: https://randomascii.wordpress.com/2012/01/23/stupid-float-tricks-2/ for why this works.
+inline VTKM_EXEC_CONT vtkm::UInt64 FloatDistance(vtkm::Float64 x, vtkm::Float64 y)
+{
+  static_assert(sizeof(vtkm::Float64) == sizeof(vtkm::UInt64), "vtkm::Float64 is incorrect size.");
+  static_assert(std::numeric_limits<vtkm::Float64>::has_denorm == std::denorm_present, "FloatDistance presumes the floating-point type has subnormal numbers.");
+
+  // No meaningful distance exists for non-finite inputs; return the all-ones sentinel.
+  if (!vtkm::IsFinite(x) || !vtkm::IsFinite(y)) {
+    return 0xFFFFFFFFFFFFFFFFULL;
+  }
+
+  // Signed zero is the sworn enemy of this process: -0.0 compares equal to
+  // +0.0 but has a different bit pattern, so fold it onto +0.0 first.
+  if (y == 0) {
+    y = vtkm::Abs(y);
+  }
+  if (x == 0) {
+    x = vtkm::Abs(x);
+  }
+
+  // Remember on which side of zero each value lies, then continue with magnitudes only.
+  const bool xIsNegative = (x < 0);
+  const bool yIsNegative = (y < 0);
+  if (xIsNegative) {
+    x = -x;
+  }
+  if (yIsNegative) {
+    y = -y;
+  }
+
+  // Note that:
+  // int64_t xi = *reinterpret_cast<int64_t*>(&x);
+  // int64_t yi = *reinterpret_cast<int64_t*>(&y);
+  // also works, but generates warnings.
+  // Good option to have if we get compile errors off memcpy or don't want to #include <cstring> though.
+  // At least on gcc, both versions generate the same assembly.
+  vtkm::UInt64 xi;
+  vtkm::UInt64 yi;
+  memcpy(&xi, &x, sizeof(vtkm::UInt64));
+  memcpy(&yi, &y, sizeof(vtkm::UInt64));
+
+  if (xIsNegative != yIsNegative) {
+    // The values straddle zero. The bit pattern of a magnitude is its distance
+    // from +0.0, so the total distance is the sum of both. Finite magnitudes
+    // have the top bit clear, so the sum cannot overflow 64 bits.
+    return xi + yi;
+  }
+
+  // Same side of zero: the distance is the absolute difference of the bit patterns.
+  if (yi > xi) {
+    return yi - xi;
+  }
+  return xi - yi;
+}
+
+/// \brief Distance between two single-precision values, counted in representable floats.
+///
+/// Returns how many representable \c vtkm::Float32 steps separate \c x and \c y
+/// (0 when they are equal, with +0.0f and -0.0f treated as the same value).
+/// If either argument is rejected by \c vtkm::IsFinite, the largest
+/// \c vtkm::UInt64 value is returned as a sentinel.
+///
+/// The implementation is deliberately non-recursive so that it can be compiled
+/// for execution environments where recursion is costly or discouraged.
+/// See https://randomascii.wordpress.com/2012/01/23/stupid-float-tricks-2/ for
+/// why comparing the integer bit patterns works.
+inline VTKM_EXEC_CONT vtkm::UInt64 FloatDistance(vtkm::Float32 x, vtkm::Float32 y)
+{
+  static_assert(sizeof(vtkm::Float32) == sizeof(vtkm::UInt32), "vtkm::Float32 is incorrect size.");
+  static_assert(std::numeric_limits<vtkm::Float32>::has_denorm == std::denorm_present, "FloatDistance presumes the floating-point type has subnormal numbers.");
+
+  // No meaningful distance exists for non-finite inputs; return the all-ones sentinel.
+  if (!vtkm::IsFinite(x) || !vtkm::IsFinite(y)) {
+    return 0xFFFFFFFFFFFFFFFFULL;
+  }
+
+  // Signed zero is the sworn enemy of this process: -0.0f compares equal to
+  // +0.0f but has a different bit pattern, so fold it onto +0.0f first.
+  if (y == 0) {
+    y = vtkm::Abs(y);
+  }
+  if (x == 0) {
+    x = vtkm::Abs(x);
+  }
+
+  // Remember on which side of zero each value lies, then continue with magnitudes only.
+  const bool xIsNegative = (x < 0);
+  const bool yIsNegative = (y < 0);
+  if (xIsNegative) {
+    x = -x;
+  }
+  if (yIsNegative) {
+    y = -y;
+  }
+
+  // memcpy is used for the type pun; a reinterpret_cast also works but generates warnings.
+  vtkm::UInt32 xi_32;
+  vtkm::UInt32 yi_32;
+  memcpy(&xi_32, &x, sizeof(vtkm::UInt32));
+  memcpy(&yi_32, &y, sizeof(vtkm::UInt32));
+  // Widen before doing arithmetic so that the opposite-sign sum cannot wrap in 32 bits.
+  const vtkm::UInt64 xi = xi_32;
+  const vtkm::UInt64 yi = yi_32;
+
+  if (xIsNegative != yIsNegative) {
+    // The values straddle zero. The bit pattern of a magnitude is its distance
+    // from +0.0f, so the total distance is the sum of both.
+    return xi + yi;
+  }
+
+  // Same side of zero: the distance is the absolute difference of the bit patterns.
+  if (yi > xi) {
+    return yi - xi;
+  }
+  return xi - yi;
+}
+
 /// Bitwise operations
 ///

--- a/Show More
+++ b/Show More