Merge branch 'master' into alpine_sampling_2d

This commit is contained in:
Subhashis Hazarika 2021-03-24 14:08:10 -06:00
commit ed8d2fb356
1238 changed files with 74155 additions and 29609 deletions

@ -1,17 +1,19 @@
---
# This configuration requires clang-format 3.8 or higher.
# This configuration requires clang-format 9 or higher.
BasedOnStyle: Mozilla
AlignAfterOpenBracket: Align
AlignEscapedNewlines: true
AlignOperands: false
AlwaysBreakAfterReturnType: None
AllowAllParametersOfDeclarationOnNextLine: false
AlwaysBreakAfterDefinitionReturnType: None
BreakBeforeBraces: Allman
AlwaysBreakAfterReturnType: None
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
ColumnLimit: 100
# FixNamespaceComments: true
MaxEmptyLinesToKeep: 4
Standard: Cpp11
# This requires clang-format 4.0 (at least).
#FixNamespaceComments: true
ReflowComments: false
SpaceAfterTemplateKeyword: true
Standard: Cpp11
...

2
.gitattributes vendored

@ -1,5 +1,5 @@
# Attributes used for formatting.
[attr]our-c-style whitespace=tab-in-indent format.clang-format
[attr]our-c-style whitespace=tab-in-indent format.clang-format=9
*.cxx our-c-style
*.h our-c-style

@ -1,125 +0,0 @@
.slurm_p9_cuda: &slurm_p9_cuda
tags:
- nmc
- slurm
- nmc-xxfe1-sched-001
- xx-fe1
variables:
NMC_FE1_SLURM_PARAMETERS: " -N1 -p ecp-p9-4v100 --extra-node-info=*:*:* -t 1:30:00 "
CC: "gcc"
CXX: "g++"
CUDAHOSTCXX: "g++"
before_script:
# We need gcc-4.8.5, which is the system default compiler but not a compiler
# listed under the module system.
#
# That means to get this to work properly we explicitly do not request
# any compiler.
- module load cuda cmake/3.14.5
.slurm_p9_opemp: &slurm_p9_opemp
tags:
- nmc
- slurm
- nmc-xxfe1-sched-001
- xx-fe1
variables:
NMC_FE1_SLURM_PARAMETERS: " -N1 -p ecp-p9-4v100 --extra-node-info=*:*:* -t 1:30:00 "
before_script:
- module load gcc/8.3.0 openmpi/3.1.4 cmake/3.14.5
.cmake_build_artifacts: &cmake_build_artifacts
artifacts:
expire_in: 24 hours
when: always
paths:
# The artifacts of the build.
- vtkm-build/bin/
- vtkm-build/include/
# CTest files.
# XXX(globbing): Can be simplified with support from
# https://gitlab.com/gitlab-org/gitlab-runner/issues/4840
- vtkm-build/CTestCustom*.cmake
- vtkm-build/CTestTestfile.cmake
- vtkm-build/*/CTestTestfile.cmake
- vtkm-build/*/*/CTestTestfile.cmake
- vtkm-build/*/*/*/CTestTestfile.cmake
- vtkm-build/*/*/*/*/CTestTestfile.cmake
- vtkm-build/*/*/*/*/*/CTestTestfile.cmake
- vtkm-build/Testing/
# CDash files.
- vtkm-build/DartConfiguration.tcl
.cmake_build_p9_cuda: &cmake_build_p9_cuda
stage: build
script:
- srun env | grep SLURM_JOB_NAME
- mkdir vtkm-build
- pushd vtkm-build
- cmake -DCMAKE_BUILD_TYPE=Release -DVTKm_ENABLE_CUDA=ON -S ../
- cmake --build . -j20
- popd
.cmake_build_p9_openmp: &cmake_build_p9_openmp
stage: build
script:
- srun env | grep SLURM_JOB_NAME
- mkdir vtkm-build
- pushd vtkm-build
- cmake -DCMAKE_BUILD_TYPE=Release -DVTKm_ENABLE_OPENMP=ON -S ../
- cmake --build . -j20
- popd
.cmake_test_p9: &cmake_test_p9
stage: test
script:
- echo "running the test using artifacts of the build"
- pushd vtkm-build
# We need to exclude the following tests
# - CopyrightStatement
# - TestInstallSetup
# - SourceInInstall
# Which we can do by using an exclude regex
- ctest -E "Install|CopyrightStatement"
- popd
stages:
- build
- test
build:p9_openmp:
extends:
- .slurm_p9_opemp
- .cmake_build_artifacts
- .cmake_build_p9_openmp
test:p9_openmp:
extends:
- .slurm_p9_opemp
- .cmake_test_p9
dependencies:
- build:p9_openmp
needs:
- build:p9_openmp
build:p9_cuda:
extends:
- .slurm_p9_cuda
- .cmake_build_artifacts
- .cmake_build_p9_cuda
test:p9_cuda:
extends:
- .slurm_p9_cuda
- .cmake_test_p9
dependencies:
- build:p9_cuda
needs:
- build:p9_cuda

@ -3,7 +3,7 @@
#
# * .gitlab/ci/docker/centos7/cuda10.2/
# - cuda
# - gcc 4.8.5
# - gcc 7.3.1
# * .gitlab/ci/docker/centos8/base/
# - gcc 8.3.1
# - clang 8.0.1
@ -13,7 +13,7 @@
# - cuda
# - gcc 8.2.1
# * .gitlab/ci/docker/ubuntu1604/base/
# - gcc 4.8
# - gcc 5.4.0
# - clang 3.8
# - clang 5.0
# - tbb
@ -32,9 +32,11 @@
# - tbb
# - openmp
# - mpich2
# * .gitlab/ci/docker/ubuntu1804/cuda10.1/
# - hdf5
# * .gitlab/ci/docker/ubuntu1804/cuda11.1/
# - cuda
# - gcc 7.4
# - gcc 7
# - gcc 8
# - tbb
# - openmp
# - mpich2
@ -49,55 +51,64 @@
GIT_CLONE_PATH: $CI_BUILDS_DIR/gitlab-kitware-sciviz-ci
.centos7: &centos7
image: "kitware/vtkm:ci-centos7_cuda10.2-20200601"
image: "kitware/vtkm:ci-centos7_cuda10.2-20210128"
extends:
- .docker_image
.centos8: &centos8
image: "kitware/vtkm:ci-centos8-20200601"
image: "kitware/vtkm:ci-centos8-20201016"
extends:
- .docker_image
.rhel8: &rhel8
image: "kitware/vtkm:ci-rhel8_cuda10.2-20200601"
image: "kitware/vtkm:ci-rhel8_cuda10.2-20201016"
extends:
- .docker_image
.ubuntu1604: &ubuntu1604
image: "kitware/vtkm:ci-ubuntu1604-20200601"
image: "kitware/vtkm:ci-ubuntu1604-20201016"
extends:
- .docker_image
.ubuntu1604_cuda: &ubuntu1604_cuda
image: "kitware/vtkm:ci-ubuntu1604_cuda9.2-20200601"
image: "kitware/vtkm:ci-ubuntu1604_cuda9.2-20201016"
extends:
- .docker_image
.ubuntu1804: &ubuntu1804
image: "kitware/vtkm:ci-ubuntu1804-20200601"
image: "kitware/vtkm:ci-ubuntu1804-20210107"
extends:
- .docker_image
.ubuntu1804_cuda: &ubuntu1804_cuda
image: "kitware/vtkm:ci-ubuntu1804_cuda10.1-20200601"
image: "kitware/vtkm:ci-ubuntu1804_cuda11.1-20201016"
extends:
- .docker_image
.ubuntu1804_cuda_kokkos: &ubuntu1804_cuda_kokkos
image: "kitware/vtkm:ci-ubuntu1804_cuda11_kokkos-20201016"
extends:
- .docker_image
.ubuntu2004_doxygen: &ubuntu2004_doxygen
image: "kitware/vtkm:ci-doxygen-20200601"
image: "kitware/vtkm:ci-doxygen-20201016"
extends:
- .docker_image
.ubuntu2004_kokkos: &ubuntu2004_kokkos
image: "kitware/vtkm:ci-ubuntu2004_kokkos-20201016"
extends:
- .docker_image
.only-default: &only-default
only:
- master
- master@vtk/vtk-m
- tags@vtk/vtk-m
- merge_requests
- tags
.only-master: &only-master
only:
- master
- master@vtk/vtk-m
# General Longer Term Tasks:
@ -120,6 +131,7 @@ stages:
- export PATH=$PWD/.gitlab:$PATH
- SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
- sccache --show-stats
- .gitlab/ci/config/google_benchmarks.sh
- "cmake --version"
- "cmake -V -P .gitlab/ci/config/gitlab_ci_setup.cmake"
- "ctest -VV -S .gitlab/ci/ctest_configure.cmake"
@ -163,6 +175,14 @@ stages:
#for running failed tests multiple times so failures
#due to system load are not reported
- "ctest-latest -VV -S .gitlab/ci/ctest_test.cmake"
artifacts:
expire_in: 24 hours
when: always
paths:
# The generated regression testing images
- build/*.png
- build/*.pnm
- build/*.pmm
.cmake_memcheck_linux: &cmake_memcheck_linux
stage: test
@ -170,6 +190,14 @@ stages:
interruptible: true
script:
- "ctest-latest -VV -S .gitlab/ci/ctest_memcheck.cmake"
artifacts:
expire_in: 24 hours
when: always
paths:
# The generated regression testing images
- build/*.png
- build/*.pnm
- build/*.pmm
include:
- local: '/.gitlab/ci/centos7.yml'
@ -178,4 +206,5 @@ include:
- local: '/.gitlab/ci/rhel8.yml'
- local: '/.gitlab/ci/ubuntu1604.yml'
- local: '/.gitlab/ci/ubuntu1804.yml'
- local: '/.gitlab/ci/ubuntu2004.yml'
- local: '/.gitlab/ci/windows10.yml'

@ -1,12 +1,13 @@
# Build on centos7 with CUDA and test on rhel8 and centos7
# gcc 4.8
build:centos7_gcc48:
# gcc 7.3.1
build:centos7_gcc73:
tags:
- build
- vtkm
- docker
- linux
- cuda-rt
- large-memory
extends:
- .centos7
@ -15,33 +16,33 @@ build:centos7_gcc48:
variables:
CMAKE_BUILD_TYPE: RelWithDebInfo
CMAKE_GENERATOR: "Unix Makefiles"
VTKM_SETTINGS: "cuda+turing+32bit_ids"
VTKM_SETTINGS: "cuda+turing+32bit_ids+no_rendering"
test:centos7_gcc48:
test:centos7_gcc73:
tags:
- test
- cuda-rt
- turing
- vtkm
- docker
- linux
- cuda-rt
- turing
extends:
- .centos7
- .cmake_test_linux
- .only-default
dependencies:
- build:centos7_gcc48
- build:centos7_gcc73
needs:
- build:centos7_gcc48
- build:centos7_gcc73
test:rhel8_test_centos7:
tags:
- test
- cuda-rt
- turing
- vtkm
- docker
- linux
- cuda-rt
- turing
extends:
- .rhel8
- .cmake_test_linux
@ -49,6 +50,6 @@ test:rhel8_test_centos7:
variables:
CTEST_EXCLUSIONS: "built_against_test_install"
dependencies:
- build:centos7_gcc48
- build:centos7_gcc73
needs:
- build:centos7_gcc48
- build:centos7_gcc73

@ -30,6 +30,7 @@ test:centos8_sanitizer:
variables:
OMP_NUM_THREADS: 4
CTEST_MEMORYCHECK_TYPE: LeakSanitizer
CTEST_EXCLUSIONS: "RegressionTest.*"
dependencies:
- build:centos8_sanitizer
needs:

@ -0,0 +1,27 @@
#!/bin/bash
set -xe
readonly version="v1.5.2"
readonly tarball="$version.tar.gz"
readonly url="https://github.com/google/benchmark/archive/$tarball"
readonly sha256sum="dccbdab796baa1043f04982147e67bb6e118fe610da2c65f88912d73987e700c"
readonly install_dir="$HOME/gbench"
if ! [[ "$VTKM_SETTINGS" =~ "benchmarks" ]]; then
exit 0
fi
cd "$HOME"
echo "$sha256sum $tarball" > gbenchs.sha256sum
curl --insecure -OL "$url"
sha256sum --check gbenchs.sha256sum
tar xf "$tarball"
mkdir build
mkdir "$install_dir"
cmake -GNinja -S benchmark* -B build -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
cmake --build build
cmake --install build --prefix "$install_dir"

@ -10,10 +10,16 @@
##
##=============================================================================
# Default to Release builds.
if ("$ENV{CMAKE_BUILD_TYPE}" STREQUAL "")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
else ()
set(CMAKE_BUILD_TYPE "$ENV{CMAKE_BUILD_TYPE}" CACHE STRING "")
endif ()
string(REPLACE "+" ";" options "$ENV{VTKM_SETTINGS}")
foreach(option IN LISTS options)
if(static STREQUAL option)
set(BUILD_SHARED_LIBS "OFF" CACHE STRING "")
@ -23,6 +29,10 @@ foreach(option IN LISTS options)
elseif(vtk_types STREQUAL option)
set(VTKm_USE_DEFAULT_TYPES_FOR_VTK "ON" CACHE STRING "")
elseif(ascent_types STREQUAL option)
# Note: ascent_types also requires 32bit_ids and 64bit_floats
set(VTKm_USE_DEFAULT_TYPES_FOR_ASCENT "ON" CACHE STRING "")
elseif(32bit_ids STREQUAL option)
set(VTKm_USE_64BIT_IDS "OFF" CACHE STRING "")
@ -37,6 +47,15 @@ foreach(option IN LISTS options)
set(VTKm_ENABLE_SANITIZER "ON" CACHE STRING "")
list(APPEND sanitizers "leak")
elseif(rendering STREQUAL option)
set(VTKm_ENABLE_RENDERING "ON" CACHE STRING "")
elseif(no_rendering STREQUAL option)
set(VTKm_ENABLE_RENDERING "OFF" CACHE STRING "")
elseif(no_virtual STREQUAL option)
set(VTKm_NO_DEPRECATED_VIRTUAL "ON" CACHE STRING "")
elseif(examples STREQUAL option)
set(VTKm_ENABLE_EXAMPLES "ON" CACHE STRING "")
@ -45,6 +64,7 @@ foreach(option IN LISTS options)
elseif(benchmarks STREQUAL option)
set(VTKm_ENABLE_BENCHMARKS "ON" CACHE STRING "")
set(ENV{CMAKE_PREFIX_PATH} "$ENV{HOME}/gbench")
elseif(mpi STREQUAL option)
set(VTKm_ENABLE_MPI "ON" CACHE STRING "")
@ -58,6 +78,12 @@ foreach(option IN LISTS options)
elseif(cuda STREQUAL option)
set(VTKm_ENABLE_CUDA "ON" CACHE STRING "")
elseif(kokkos STREQUAL option)
set(VTKm_ENABLE_KOKKOS "ON" CACHE STRING "")
elseif(hdf5 STREQUAL option)
set(VTKm_ENABLE_HDF5_IO "ON" CACHE STRING "")
elseif(maxwell STREQUAL option)
set(VTKm_CUDA_Architecture "maxwell" CACHE STRING "")
@ -82,7 +108,10 @@ find_program(SCCACHE_COMMAND NAMES sccache)
if(SCCACHE_COMMAND)
set(CMAKE_C_COMPILER_LAUNCHER "${SCCACHE_COMMAND}" CACHE STRING "")
set(CMAKE_CXX_COMPILER_LAUNCHER "${SCCACHE_COMMAND}" CACHE STRING "")
if(VTKm_ENABLE_CUDA)
# Use VTKm_CUDA_Architecture to determine if we need CUDA sccache setup
# since this will also capture when kokkos is being used with CUDA backing
if(DEFINED VTKm_CUDA_Architecture)
set(CMAKE_CUDA_COMPILER_LAUNCHER "${SCCACHE_COMMAND}" CACHE STRING "")
endif()
endif()

@ -10,7 +10,7 @@ readonly tarball="$filename.tar.gz"
cd .gitlab
echo "$sha256sum $tarball" > sccache.sha256sum
curl -OL "https://github.com/robertmaynard/sccache/releases/download/$version/$tarball"
curl --insecure -OL "https://github.com/robertmaynard/sccache/releases/download/$version/$tarball"
sha256sum --check sccache.sha256sum
tar xf "$tarball"
#mv "$filename/sccache" .

@ -1,14 +0,0 @@
$tempFile = "$env:temp\vcvars.txt"
if ($env:CI_JOB_NAME -eq "build:windows_vs2019") {
cmd.exe /c "call `"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat`" && set > $tempFile"
} else {
cmd.exe /c "call `"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat`" && set > $tempFile"
}
Get-Content "$tempFile" | Foreach-Object {
if ($_ -match "^(.*?)=(.*)$") {
Set-Content "env:\$($matches[1])" $matches[2]
}
}

@ -0,0 +1,9 @@
$erroractionpreference = "stop"
cmd /c "`"$env:VCVARSALL`" $VCVARSPLATFORM -vcvars_ver=$VCVARSVERSION & set" |
foreach {
if ($_ -match "=") {
$v = $_.split("=")
[Environment]::SetEnvironmentVariable($v[0], $v[1])
}
}

@ -33,7 +33,8 @@ if(NOT CTEST_MEMORYCHECK_SUPPRESSIONS_FILE)
endif()
set(test_exclusions
# placeholder for tests to exclude
# placeholder for tests to exclude provided by the env
$ENV{CTEST_EXCLUSIONS}
)
string(REPLACE ";" "|" test_exclusions "${test_exclusions}")

@ -1,9 +1,9 @@
FROM nvidia/cuda:10.2-devel-centos7
LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"
RUN yum install cmake make gcc gcc-c++ -y
RUN yum install make gcc gcc-c++ curl cuda-compat-10-2 centos-release-scl -y
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | bash
RUN yum install git git-lfs -y
RUN yum install git git-lfs devtoolset-7-gcc-c++ -y
# Provide a consistent CMake path across all images
# Install CMake 3.13 as it is the minium for cuda builds
@ -20,4 +20,4 @@ RUN mkdir /opt/cmake-latest/ && \
rm cmake-3.17.3-Linux-x86_64.sh && \
ln -s /opt/cmake-latest/bin/ctest /opt/cmake-latest/bin/ctest-latest
ENV PATH "/opt/cmake/bin:/opt/cmake-latest/bin:${PATH}"
ENV PATH "/opt/rh/devtoolset-7/root/bin:/opt/cmake/bin:/opt/cmake-latest/bin:${PATH}"

@ -1,7 +1,7 @@
FROM nvidia/cuda:10.2-devel-ubi8
LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"
RUN yum install make gcc gcc-c++ curl -y
RUN yum install make gcc gcc-c++ curl cuda-compat-10-2 -y
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | bash
RUN yum install git git-lfs -y

@ -12,12 +12,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libmpich-dev \
libomp-dev \
libtbb-dev \
libhdf5-dev \
mpich \
ninja-build \
software-properties-common
# extra dependencies for charm machine
RUN add-apt-repository ppa:jonathonf/gcc-9.2
RUN add-apt-repository ppa:jonathonf/gcc
RUN apt-get update && apt-get install -y --no-install-recommends \
clang-8 \
g++-9 \

@ -1,10 +1,11 @@
FROM nvidia/cuda:10.1-devel-ubuntu18.04
FROM nvidia/cuda:11.1-devel-ubuntu18.04
LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"
# Base dependencies for building VTK-m projects
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
g++ \
g++-8 \
clang-8 \
git \
git-lfs \
libmpich-dev \

@ -0,0 +1,47 @@
FROM nvidia/cuda:11.0-devel-ubuntu18.04
LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"
# Base dependencies for building VTK-m projects
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
g++ \
git \
git-lfs \
ninja-build \
&& \
rm -rf /var/lib/apt/lists/*
# Need to run git-lfs install manually on ubuntu based images when using the
# system packaged version
RUN git-lfs install
# kokkos backend requires cmake 3.18
RUN mkdir /opt/cmake/ && \
curl -L https://github.com/Kitware/CMake/releases/download/v3.18.1/cmake-3.18.1-Linux-x86_64.sh > cmake-3.18.1-Linux-x86_64.sh && \
sh cmake-3.18.1-Linux-x86_64.sh --prefix=/opt/cmake/ --exclude-subdir --skip-license && \
rm cmake-3.18.1-Linux-x86_64.sh && \
ln -s /opt/cmake/bin/ctest /opt/cmake/bin/ctest-latest
ENV PATH "/opt/cmake/bin:${PATH}"
# Build and install Kokkos
RUN mkdir -p /opt/kokkos/build && \
cd /opt/kokkos/build && \
curl -L https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz > kokkos-3.1.01.tar.gz && \
tar -xf kokkos-3.1.01.tar.gz && \
mkdir bld && cd bld && \
CXX=/opt/kokkos/build/kokkos-3.1.01/bin/nvcc_wrapper \
cmake -B . -S ../kokkos-3.1.01 \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=/opt/kokkos \
-DCMAKE_CXX_FLAGS=-fPIC \
-DCMAKE_CXX_STANDARD=14 \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_CONSTEXPR=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=ON \
-DKokkos_ENABLE_CUDA_LDG_INTRINSIC=ON \
-DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \
-DKokkos_ENABLE_CUDA_UVM=ON \
-DKokkos_ARCH_TURING75=ON && \
cmake --build . -j 8 && \
cmake --install .

@ -0,0 +1,41 @@
FROM ubuntu:20.04
LABEL maintainer "Sujin Philip<sujin.philip@kitware.com>"
# Base dependencies for building VTK-m projects
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
cmake \
curl \
g++ \
git \
git-lfs \
libmpich-dev \
libomp-dev \
mpich \
ninja-build \
rsync \
ssh \
software-properties-common
# Need to run git-lfs install manually on ubuntu based images when using the
# system packaged version
RUN git-lfs install
# Provide CMake 3.17 so we can re-run tests easily
# This will be used when we run just the tests
RUN mkdir /opt/cmake/ && \
curl -L https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3-Linux-x86_64.sh > cmake-3.17.3-Linux-x86_64.sh && \
sh cmake-3.17.3-Linux-x86_64.sh --prefix=/opt/cmake/ --exclude-subdir --skip-license && \
rm cmake-3.17.3-Linux-x86_64.sh && \
ln -s /opt/cmake/bin/ctest /opt/cmake/bin/ctest-latest
ENV PATH "${PATH}:/opt/cmake/bin"
# Build and install Kokkos
RUN mkdir -p /opt/kokkos/build && \
cd /opt/kokkos/build && \
curl -L https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz > kokkos-3.1.01.tar.gz && \
tar -xf kokkos-3.1.01.tar.gz && \
mkdir bld && cd bld && \
cmake -GNinja -DCMAKE_INSTALL_PREFIX=/opt/kokkos -DCMAKE_CXX_FLAGS=-fPIC -DKokkos_ENABLE_SERIAL=ON ../kokkos-3.1.01 &&\
ninja all && \
ninja install

@ -30,14 +30,22 @@ cd ubuntu1804/base
sudo docker build -t kitware/vtkm:ci-ubuntu1804-$date .
cd ../..
cd ubuntu1804/cuda10.1
sudo docker build -t kitware/vtkm:ci-ubuntu1804_cuda10.1-$date .
cd ubuntu1804/cuda11.1
sudo docker build -t kitware/vtkm:ci-ubuntu1804_cuda11.1-$date .
cd ../..
cd ubuntu1804/kokkos-cuda
sudo docker build -t kitware/vtkm:ci-ubuntu1804_cuda11_kokkos-$date .
cd ../..
cd ubuntu2004/doxygen/
sudo docker build -t kitware/vtkm:ci-doxygen-$date .
cd ../..
cd ubuntu2004/kokkos
sudo docker build -t kitware/vtkm:ci-ubuntu2004_kokkos-$date .
cd ../..
# sudo docker login --username=<docker_hub_name>
sudo docker push kitware/vtkm
sudo docker system prune

@ -25,10 +25,9 @@ doxygen:
- "cmake -V -P .gitlab/ci/config/gitlab_ci_setup.cmake"
- "ctest -VV -S .gitlab/ci/ctest_configure.cmake"
script:
- eval `ssh-agent -s`
- ssh-add <(echo "$DOC_API_KEY_BASE64" | base64 --decode)
- doxygen build/docs/doxyfile
- rsync -tv --recursive --delete -e "ssh -o StrictHostKeyChecking=no" build/docs/doxygen/html/ vtkm.documentation
- chmod 400 $DOC_KEY_FILE
- rsync -tv --recursive --delete -e "ssh -i $DOC_KEY_FILE -o StrictHostKeyChecking=no" build/docs/doxygen/html/ kitware@public.kitware.com:vtkm_documentation/
variables:
CMAKE_BUILD_TYPE: Release
VTKM_SETTINGS: "tbb+openmp+mpi+shared+docs"

@ -7,6 +7,7 @@ build:ubuntu1604_gcc5:
- vtkm
- docker
- linux
- cuda-rt
- large-memory
extends:
- .ubuntu1604_cuda
@ -16,41 +17,24 @@ build:ubuntu1604_gcc5:
CC: "gcc-5"
CXX: "g++-5"
CMAKE_BUILD_TYPE: RelWithDebInfo
VTKM_SETTINGS: "cuda+pascal"
VTKM_SETTINGS: "cuda+pascal+no_virtual+ascent_types+32bit_ids+64bit_floats"
# Temporarily disabled as we don't have a pascal hw gitlab-runner
# test:ubuntu1604_gcc5:
# tags:
# - test
# - cuda-rt
# - pascal
# - vtkm
# - docker
# - linux
# extends:
# - .ubuntu1604_cuda
# - .cmake_test_linux
# - .only-default
# dependencies:
# - build:ubuntu1604_gcc5
# needs:
# - build:ubuntu1604_gcc5
# test:ubuntu1804_test_ubuntu1604_gcc5:
# tags:
# - test
# - cuda-rt
# - pascal
# - vtkm
# - docker
# - linux
# extends:
# - .ubuntu1804_cuda
# - .cmake_test_linux
# - .only-default
# dependencies:
# - build:ubuntu1604_gcc5
# needs:
# - build:ubuntu1604_gcc5
test:ubuntu1604_gcc5:
tags:
- test
- vtkm
- docker
- linux
- cuda-rt
- pascal
extends:
- .ubuntu1604_cuda
- .cmake_test_linux
- .only-default
dependencies:
- build:ubuntu1604_gcc5
needs:
- build:ubuntu1604_gcc5
# Build on ubuntu1704 with OpenMP + CUDA
# Runs only on nightlies
@ -60,6 +44,7 @@ build:ubuntu1604_gcc5_2:
- vtkm
- docker
- linux
- cuda-rt
- large-memory
extends:
- .ubuntu1604_cuda
@ -71,49 +56,24 @@ build:ubuntu1604_gcc5_2:
CMAKE_BUILD_TYPE: Release
VTKM_SETTINGS: "openmp+cuda+pascal+examples"
# Build on ubuntu1604 with mpi + tbb and test on ubuntu1604
# Uses gcc 4.8
# Uses OpenMPI
build:ubuntu1604_gcc48:
tags:
- build
- vtkm
- docker
- linux
extends:
- .ubuntu1604
- .cmake_build_linux
- .only-default
variables:
CC: "gcc-4.8"
CXX: "g++-4.8"
CMAKE_BUILD_TYPE: Release
#custom openmpi install location
CMAKE_PREFIX_PATH: "/opt/openmpi/"
VTKM_SETTINGS: "tbb+mpi+shared"
test:ubuntu1604_gcc48:
test:ubuntu1804_test_ubuntu1604_gcc5_2:
tags:
- test
- vtkm
- docker
- linux
- cuda-rt
- pascal
extends:
- .ubuntu1604
- .ubuntu1804_cuda
- .cmake_test_linux
- .only-default
- .only-master
variables:
#env flags to allow openmpi to run as root user
OMPI_ALLOW_RUN_AS_ROOT: 1
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
#mpi location so that `built_against_test_install` tests
#pass
CMAKE_PREFIX_PATH: "/opt/openmpi/"
CTEST_EXCLUSIONS: "built_against_test_install"
dependencies:
- build:ubuntu1604_gcc48
- build:ubuntu1604_gcc5_2
needs:
- build:ubuntu1604_gcc48
- build:ubuntu1604_gcc5_2
# Build on ubuntu1604 with tbb and test on ubuntu1604
# Uses clang 5

@ -16,7 +16,7 @@ build:ubuntu1804_gcc9:
CC: "gcc-9"
CXX: "g++-9"
CMAKE_BUILD_TYPE: Debug
VTKM_SETTINGS: "tbb+openmp+mpi+shared"
VTKM_SETTINGS: "benchmarks+tbb+openmp+mpi+shared+hdf5"
test:ubuntu1804_gcc9:
tags:
@ -38,7 +38,7 @@ test:ubuntu1804_gcc9:
- build:ubuntu1804_gcc9
# Build on ubuntu1804 with CUDA + MPI and test on ubuntu1804
# Uses gcc 7.4
# Uses gcc 7
# Uses MPICH2
build:ubuntu1804_gcc7:
tags:
@ -46,6 +46,7 @@ build:ubuntu1804_gcc7:
- vtkm
- docker
- linux
- cuda-rt
- large-memory
extends:
- .ubuntu1804_cuda
@ -54,16 +55,17 @@ build:ubuntu1804_gcc7:
variables:
CC: "gcc-7"
CXX: "g++-7"
VTKM_SETTINGS: "cuda+turing+mpi+64bit_floats"
CUDAHOSTCXX: "g++-7"
VTKM_SETTINGS: "benchmarks+cuda+turing+mpi+64bit_floats+no_virtual"
test:ubuntu1804_gcc7:
tags:
- test
- cuda-rt
- turing
- vtkm
- docker
- linux
- cuda-rt
- turing
extends:
- .ubuntu1804_cuda
- .cmake_test_linux
@ -74,42 +76,45 @@ test:ubuntu1804_gcc7:
- build:ubuntu1804_gcc7
# Build on ubuntu1804 with OpenMP and test on ubuntu1804
# Uses gcc 7.4
# Build on ubuntu1804 with CUDA+TBB and test on ubuntu1804
# Uses clang as CUDA host compiler
# Runs only on nightlies
build:ubuntu1804_gcc7_2:
build:ubuntu1804_clang_cuda:
tags:
- build
- vtkm
- docker
- linux
- cuda-rt
- large-memory
extends:
- .ubuntu1804
- .ubuntu1804_cuda
- .cmake_build_linux
- .only-master
- .only-default
# - .only-master
variables:
CC: "gcc-7"
CXX: "g++-7"
VTKM_SETTINGS: "openmp+shared+examples"
CC: "clang-8"
CXX: "clang++-8"
CUDAHOSTCXX: "clang++-8"
VTKM_SETTINGS: "cuda+pascal+tbb+static+examples"
test:ubuntu1804_gcc7_2:
test:ubuntu1804_clang_cuda:
tags:
- test
- vtkm
- docker
- linux
- cuda-rt
- pascal
extends:
- .ubuntu1804
- .ubuntu1804_cuda
- .cmake_test_linux
- .only-master
variables:
#Restrict OpenMP number of threads since multiple test stages
#execute on the same hardware concurrently
OMP_NUM_THREADS: 4
- .only-default
# - .only-master
dependencies:
- build:ubuntu1804_gcc7_2
- build:ubuntu1804_clang_cuda
needs:
- build:ubuntu1804_gcc7_2
- build:ubuntu1804_clang_cuda
# Build on ubuntu1804 with OpenMP and test on ubuntu1804
# Uses gcc 6.5
@ -179,3 +184,39 @@ test:ubuntu1804_clang8:
- build:ubuntu1804_clang8
needs:
- build:ubuntu1804_clang8
# Build on ubuntu1804 with kokkos and test on ubuntu1804
# Uses CUDA 11
build:ubuntu1804_kokkos:
tags:
- build
- vtkm
- docker
- linux
- cuda-rt
- large-memory
extends:
- .ubuntu1804_cuda_kokkos
- .cmake_build_linux
- .only-default
variables:
CMAKE_GENERATOR: "Ninja"
CMAKE_BUILD_TYPE: Release
VTKM_SETTINGS: "benchmarks+kokkos+turing+static+64bit_floats"
test:ubuntu1804_kokkos:
tags:
- test
- vtkm
- docker
- linux
- cuda-rt
- turing
extends:
- .ubuntu1804_cuda_kokkos
- .cmake_test_linux
- .only-default
dependencies:
- build:ubuntu1804_kokkos
needs:
- build:ubuntu1804_kokkos

28
.gitlab/ci/ubuntu2004.yml Normal file

@ -0,0 +1,28 @@
build:ubuntu2004_kokkos:
tags:
- build
- vtkm
- docker
- linux
extends:
- .ubuntu2004_kokkos
- .cmake_build_linux
- .only-default
variables:
CMAKE_BUILD_TYPE: RelWithDebInfo
VTKM_SETTINGS: "kokkos+shared+64bit_floats"
test:ubuntu2004_kokkos:
tags:
- test
- vtkm
- docker
- linux
extends:
- .ubuntu2004_kokkos
- .cmake_test_linux
- .only-default
dependencies:
- build:ubuntu2004_kokkos
needs:
- build:ubuntu2004_kokkos

@ -1,10 +1,27 @@
.windows_build:
variables:
# Note that shell runners only support runners with a single
# concurrency level. We can't use `$CI_CONCURRENCY_ID` because this may
# change between the build and test stages which CMake doesn't support.
# Even if we could, it could change if other runners on the machine
# could run at the same time, so we drop it.
GIT_CLONE_PATH: "$CI_BUILDS_DIR\\vtkm ci"
.windows_vs2019:
variables:
VCVARSALL: "${VS160COMNTOOLS}\\..\\..\\VC\\Auxiliary\\Build\\vcvarsall.bat"
VCVARSPLATFORM: "x64"
VCVARSVERSION: "14.25"
.cmake_build_windows: &cmake_build_windows
extends:
- .windows_build
- .windows_vs2019
stage: build
timeout: 2 hours
interruptible: true
before_script:
- .gitlab/ci/config/setup_vs_powershell.ps1
- Invoke-Expression -Command .gitlab/ci/config/vcvarsall.ps1
- "cmake --version"
- "cmake -V -P .gitlab/ci/config/gitlab_ci_setup.cmake"
- "ctest -VV -S .gitlab/ci/ctest_configure.cmake"
@ -39,11 +56,14 @@
.cmake_test_windows: &cmake_test_windows
extends:
- .windows_build
- .windows_vs2019
stage: test
timeout: 50 minutes
interruptible: true
before_script:
- .gitlab/ci/config/setup_vs_powershell.ps1
- Invoke-Expression -Command .gitlab/ci/config/vcvarsall.ps1
script:
#Need to use our custom ctest-latest symlink
#This will allow us to use 3.17+ which has support
@ -55,11 +75,13 @@
# Will have CUDA 10.2 once build issues are resolved
build:windows_vs2019:
tags:
- vtkm # Since this is a bare runner, pin to a project.
- nonconcurrent
- build
- vtkm
- windows
- vs2019
- shell
- vs2019
- msvc-19.25
- large-memory
extends:
- .cmake_build_windows
@ -73,10 +95,13 @@ build:windows_vs2019:
test:windows_vs2019:
tags:
- vtkm # Since this is a bare runner, pin to a project.
- nonconcurrent
- test
- vtkm
- windows
- shell
- vs2019
- msvc-19.25
- cuda-rt
- turing
extends:

@ -0,0 +1,23 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
##
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##============================================================================
file(GLOB cmake_version_backports
LIST_DIRECTORIES true
RELATIVE "${CMAKE_CURRENT_LIST_DIR}/patches"
"${CMAKE_CURRENT_LIST_DIR}/patches/*")
foreach (cmake_version_backport IN LISTS cmake_version_backports)
if (NOT IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/patches/${cmake_version_backport}")
continue ()
endif ()
if (CMAKE_VERSION VERSION_LESS "${cmake_version_backport}")
list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_LIST_DIR}/patches/${cmake_version_backport}")
endif ()
endforeach ()

@ -77,7 +77,7 @@ endif()
set(vec_levels none native)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
#for now we presume gcc >= 4.8
#for now we presume gcc >= 5.4
list(APPEND vec_levels avx avx2)
#common flags for the avx and avx2 instructions for the gcc compiler

@ -22,6 +22,8 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(VTKM_COMPILER_IS_CLANG 1)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(VTKM_COMPILER_IS_GNU 1)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XLClang")
set(VTKM_COMPILER_IS_XL 1)
endif()
#-----------------------------------------------------------------------------
@ -51,7 +53,7 @@ if(VTKM_COMPILER_IS_MSVC)
if(TARGET vtkm::cuda)
target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/Gy">)
endif()
elseif(NOT VTKM_COMPILER_IS_PGI) #can't find an equivalant PGI flag
elseif(NOT (VTKM_COMPILER_IS_PGI OR VTKM_COMPILER_IS_XL)) #can't find an equivalant PGI/XL flag
target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-ffunction-sections>)
if(TARGET vtkm::cuda)
target_compile_options(vtkm_compiler_flags INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-ffunction-sections>)
@ -122,8 +124,15 @@ elseif(VTKM_COMPILER_IS_ICC)
target_compile_options(vtkm_developer_flags INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-wd1478 -wd13379>)
elseif(VTKM_COMPILER_IS_GNU OR VTKM_COMPILER_IS_CLANG)
set(cxx_flags -Wall -Wcast-align -Wchar-subscripts -Wextra -Wpointer-arith -Wformat -Wformat-security -Wshadow -Wunused -fno-common)
set(cuda_flags -Xcompiler=-Wall,-Wno-unknown-pragmas,-Wno-unused-local-typedefs,-Wno-unused-local-typedefs,-Wno-unused-function,-Wcast-align,-Wchar-subscripts,-Wpointer-arith,-Wformat,-Wformat-security,-Wshadow,-Wunused,-fno-common)
set(cxx_flags -Wall -Wcast-align -Wextra -Wpointer-arith -Wformat -Wformat-security -Wshadow -Wunused -fno-common -Wno-unused-function)
set(cuda_flags -Xcompiler=-Wall,-Wcast-align,-Wpointer-arith,-Wformat,-Wformat-security,-Wshadow,-fno-common,-Wunused,-Wno-unknown-pragmas,-Wno-unused-local-typedefs,-Wno-unused-function)
#Clang does not support the -Wchar-subscripts flag for warning if an array
#subscript has a char type.
if (VTKM_COMPILER_IS_GNU)
list(APPEND cxx_flags -Wchar-subscripts)
set(cuda_flags "${cuda_flags},-Wchar-subscripts")
endif()
#Only add float-conversion warnings for gcc as the integer warnigns in GCC
#include the implicit casting of all types smaller than int to ints.
@ -161,17 +170,21 @@ elseif(VTKM_COMPILER_IS_GNU OR VTKM_COMPILER_IS_CLANG)
endif()
endif()
#common warnings for all platforms when building cuda
if(TARGET vtkm::cuda)
function(setup_cuda_flags)
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
#nvcc 9 introduced specific controls to disable the stack size warning
#otherwise we let the warning occur. We have to set this in CMAKE_CUDA_FLAGS
#as it is passed to the device link step, unlike compile_options
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xnvlink=--suppress-stack-size-warning")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xnvlink=--suppress-stack-size-warning" PARENT_SCOPE)
endif()
set(display_error_nums -Xcudafe=--display_error_number)
target_compile_options(vtkm_developer_flags INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:${display_error_nums}>)
endfunction()
#common warnings for all platforms when building cuda
if ((TARGET vtkm::cuda) OR (TARGET vtkm::kokkos_cuda))
setup_cuda_flags()
endif()
if(NOT VTKm_INSTALL_ONLY_LIBRARIES)

@ -39,6 +39,7 @@
# VTKm_ENABLE_CUDA Will be enabled if VTK-m was built with CUDA support
# VTKm_ENABLE_TBB Will be enabled if VTK-m was built with TBB support
# VTKm_ENABLE_OPENMP Will be enabled if VTK-m was built with OpenMP support
# VTKm_ENABLE_KOKKOS Will be enabled if VTK-m was built with Kokkos support
# VTKm_ENABLE_LOGGING Will be enabled if VTK-m was built with logging support
# VTKm_ENABLE_MPI Will be enabled if VTK-m was built with MPI support
# VTKm_ENABLE_RENDERING Will be enabled if VTK-m was built with rendering support
@ -67,8 +68,9 @@ set(VTKm_VERSION "@VTKm_VERSION@")
set(VTKm_BUILD_SHARED_LIBS "@VTKm_BUILD_SHARED_LIBS@")
set(VTKm_ENABLE_CUDA "@VTKm_ENABLE_CUDA@")
set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@")
set(VTKm_ENABLE_KOKKOS "@VTKm_ENABLE_KOKKOS@")
set(VTKm_ENABLE_OPENMP "@VTKm_ENABLE_OPENMP@")
set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@")
set(VTKm_ENABLE_LOGGING "@VTKm_ENABLE_LOGGING@")
set(VTKm_ENABLE_RENDERING "@VTKm_ENABLE_RENDERING@")
set(VTKm_ENABLE_GL_CONTEXT "@VTKm_ENABLE_GL_CONTEXT@")
@ -101,6 +103,12 @@ endif()
if(VTKm_ENABLE_CUDA AND VTKM_FROM_INSTALL_DIR)
set_target_properties(vtkm::cuda PROPERTIES cuda_architecture_flags "@VTKm_CUDA_Architecture_Flags@")
set_target_properties(vtkm::cuda PROPERTIES requires_static_builds TRUE)
# If VTK-m is built with 3.18+ and the consumer is < 3.18 we need to drop
# these properties as they break the VTK-m cuda flag logic
if(CMAKE_VERSION VERSION_LESS 3.18)
set_target_properties(vtkm::cuda PROPERTIES INTERFACE_LINK_OPTIONS "")
endif()
endif()
# VTKm requires some CMake Find modules not included with CMake, so

@ -127,10 +127,13 @@ if(VTKm_ENABLE_CUDA)
requires_static_builds TRUE
)
target_compile_options(vtkm_cuda INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
set_target_properties(vtkm_cuda PROPERTIES
INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
)
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.0)
# CUDA 11+ deprecated C++11 support
target_compile_features(vtkm_cuda INTERFACE cxx_std_14)
endif()
# add the -gencode flags so that all cuda code
# is compiled properly
@ -164,7 +167,10 @@ if(VTKm_ENABLE_CUDA)
# 6 - volta
# - Uses: --generate-code=arch=compute_70,code=sm_70
# 7 - turing
# - Uses: --generate-code=arch=compute_75code=sm_75
# - Uses: --generate-code=arch=compute_75,code=sm_75
# 8 - ampere
# - Uses: --generate-code=arch=compute_80,code=sm_80
# - Uses: --generate-code=arch=compute_86,code=sm_86
# 8 - all
# - Uses: --generate-code=arch=compute_30,code=sm_30
# - Uses: --generate-code=arch=compute_35,code=sm_35
@ -172,12 +178,14 @@ if(VTKm_ENABLE_CUDA)
# - Uses: --generate-code=arch=compute_60,code=sm_60
# - Uses: --generate-code=arch=compute_70,code=sm_70
# - Uses: --generate-code=arch=compute_75,code=sm_75
# - Uses: --generate-code=arch=compute_80,code=sm_80
# - Uses: --generate-code=arch=compute_86,code=sm_86
# 8 - none
#
#specify the property
set(VTKm_CUDA_Architecture "native" CACHE STRING "Which GPU Architecture(s) to compile for")
set_property(CACHE VTKm_CUDA_Architecture PROPERTY STRINGS native fermi kepler maxwell pascal volta turing all none)
set_property(CACHE VTKm_CUDA_Architecture PROPERTY STRINGS native fermi kepler maxwell pascal volta turing ampere all none)
#detect what the property is set too
if(VTKm_CUDA_Architecture STREQUAL "native")
@ -231,23 +239,124 @@ if(VTKm_ENABLE_CUDA)
set(arch_flags --generate-code=arch=compute_70,code=sm_70)
elseif(VTKm_CUDA_Architecture STREQUAL "turing")
set(arch_flags --generate-code=arch=compute_75,code=sm_75)
elseif(VTKm_CUDA_Architecture STREQUAL "ampere")
  # Generate code for both ampere variants. A single set() with both flags is
  # required; the previous second set() overwrote the sm_80 entry, so only
  # sm_86 code was ever generated.
  set(arch_flags --generate-code=arch=compute_80,code=sm_80
                 --generate-code=arch=compute_86,code=sm_86)
elseif(VTKm_CUDA_Architecture STREQUAL "all")
set(arch_flags --generate-code=arch=compute_30,code=sm_30
--generate-code=arch=compute_35,code=sm_35
--generate-code=arch=compute_50,code=sm_50
--generate-code=arch=compute_60,code=sm_60
--generate-code=arch=compute_70,code=sm_70
--generate-code=arch=compute_75,code=sm_75)
--generate-code=arch=compute_75,code=sm_75
--generate-code=arch=compute_80,code=sm_80
--generate-code=arch=compute_86,code=sm_86)
endif()
string(REPLACE ";" " " arch_flags "${arch_flags}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${arch_flags}")
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
#We propagate cuda flags via target* options so that they
#export cleanly
set(CMAKE_CUDA_ARCHITECTURES OFF)
target_compile_options(vtkm_cuda INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:${arch_flags}>)
target_link_options(vtkm_cuda INTERFACE $<DEVICE_LINK:${arch_flags}>)
else()
# Before 3.18 we had to use CMAKE_CUDA_FLAGS as we had no way
# to propagate flags to the device link step
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${arch_flags}")
endif()
# This needs to be lower-case for the property to be properly exported
# CMake 3.15 we can add `cuda_architecture_flags` to the EXPORT_PROPERTIES
# target property to have this automatically exported for us
set_target_properties(vtkm_cuda PROPERTIES cuda_architecture_flags "${arch_flags}")
set(VTKm_CUDA_Architecture_Flags "${arch_flags}")
set_target_properties(vtkm_cuda PROPERTIES cuda_architecture_flags "${arch_flags}")
unset(arch_flags)
endif()
endif()
#-----------------------------------------------------------------------------
# Kokkos with its Cuda backend enabled, expects everything to be compiled using its
# `nvcc-wrapper` as the CXX compiler. As the name suggests, nvcc-wrapper is a wrapper around
# Cuda's nvcc compiler. Kokkos targets have all of the flags meant for the nvcc compiler set as the
# CXX compiler flags. This function changes all such flags to be CUDA flags so that we can use
# CMake and vtk-m's existing infrastructure to compile for Cuda and Host separately. Without this
# all of the files will be compiled using nvcc which can be very time consuming. It can also have
# issues with calling host functions from device functions when compiling code for other backends.
# Walk the interface-link graph of Kokkos::kokkos and move the compile flags
# Kokkos attached to the CXX language onto the CUDA language instead (see the
# block comment above for why this is necessary).
function(kokkos_fix_compile_options)
# Work list for a breadth-first traversal starting at the umbrella target.
set(targets Kokkos::kokkos)
set(seen_targets)
set(cuda_arch)
while(targets)
# Pop the first pending target from the work list.
list(GET targets 0 target_name)
list(REMOVE_AT targets 0)
get_target_property(link_libraries ${target_name} INTERFACE_LINK_LIBRARIES)
foreach(lib_target IN LISTS link_libraries)
# Only process real CMake targets; plain library names are left alone.
if (TARGET ${lib_target})
if (lib_target IN_LIST seen_targets)
continue()
endif()
list(APPEND seen_targets ${lib_target})
list(APPEND targets ${lib_target})
get_target_property(compile_options ${lib_target} INTERFACE_COMPILE_OPTIONS)
if (compile_options)
# Capture the CXX-guarded -Xcompiler options and the -arch=sm_NN flag
# Kokkos recorded, then re-target the remaining options at CUDA.
# NOTE(review): cuda_arch is overwritten on every target that matches;
# the last match in traversal order wins — confirm that is intended.
string(REGEX MATCH "[$]<[$]<COMPILE_LANGUAGE:CXX>:-Xcompiler;.*>" cxx_compile_options "${compile_options}")
string(REGEX MATCH "-arch=sm_[0-9][0-9]" cuda_arch "${compile_options}")
string(REPLACE "-Xcompiler;" "" cxx_compile_options "${cxx_compile_options}")
list(TRANSFORM compile_options REPLACE "--relocatable-device-code=true" "") #We use CMake for this flag
list(TRANSFORM compile_options REPLACE "COMPILE_LANGUAGE:CXX" "COMPILE_LANGUAGE:CUDA")
list(APPEND compile_options "${cxx_compile_options}")
set_property(TARGET ${lib_target} PROPERTY INTERFACE_COMPILE_OPTIONS ${compile_options})
endif()
# Clear link options on every visited target; the device-link arch flag
# is re-attached to vtkm::kokkos below.
set_property(TARGET ${lib_target} PROPERTY INTERFACE_LINK_OPTIONS "")
endif()
endforeach()
endwhile()
# Re-attach the detected -arch flag for the device-link step only.
set_property(TARGET vtkm::kokkos PROPERTY INTERFACE_LINK_OPTIONS "$<DEVICE_LINK:${cuda_arch}>")
# NOTE(review): this set_property does not use APPEND, so when both the CUDA
# and OpenMP backends are enabled it replaces the DEVICE_LINK options set just
# above — confirm that dropping the -arch device-link flag is intended here.
if (OPENMP IN_LIST Kokkos_DEVICES)
set_property(TARGET vtkm::kokkos PROPERTY INTERFACE_LINK_OPTIONS "$<HOST_LINK:-fopenmp>")
endif()
endfunction()
if(VTKm_ENABLE_KOKKOS AND NOT TARGET vtkm::kokkos)
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
find_package(Kokkos REQUIRED)
if (CUDA IN_LIST Kokkos_DEVICES)
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
enable_language(CUDA)
# Work around a known Cuda 10.x miscompilation with -O3 by downgrading the
# Release optimization level to -O2 for both host and device flags.
if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND
   CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "10.0" AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0" AND
   CMAKE_BUILD_TYPE STREQUAL "Release")
  message(WARNING "There is a known issue with Cuda 10 and -O3 optimization. Switching to -O2. Please refer to issue #555.")
  string(REPLACE "-O3" "-O2" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
  # Fix copy-paste bug: the CUDA release flags must be rewritten from
  # CMAKE_CUDA_FLAGS_RELEASE, not from the CXX flags (which previously
  # clobbered any existing CUDA-specific release flags). Inputs are quoted so
  # empty or multi-word flag strings survive as a single argument.
  string(REPLACE "-O3" "-O2" CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}")
endif()
string(REGEX MATCH "[0-9][0-9]$" cuda_arch ${Kokkos_ARCH})
set(CMAKE_CUDA_ARCHITECTURES ${cuda_arch})
message(STATUS "Detected Cuda arch from Kokkos: ${cuda_arch}")
add_library(vtkm::kokkos_cuda INTERFACE IMPORTED GLOBAL)
elseif(HIP IN_LIST Kokkos_DEVICES)
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
enable_language(HIP)
add_library(vtkm::kokkos_hip INTERFACE IMPORTED GLOBAL)
set_property(TARGET Kokkos::kokkoscore PROPERTY INTERFACE_COMPILE_OPTIONS "")
set_property(TARGET Kokkos::kokkoscore PROPERTY INTERFACE_LINK_OPTIONS "")
endif()
add_library(vtkm::kokkos INTERFACE IMPORTED GLOBAL)
set_target_properties(vtkm::kokkos PROPERTIES INTERFACE_LINK_LIBRARIES "Kokkos::kokkos")
if (TARGET vtkm::kokkos_cuda)
kokkos_fix_compile_options()
endif()
endif()

@ -1,24 +0,0 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
##
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##============================================================================
if(VTKm_ENABLE_MPI AND NOT TARGET MPI::MPI_CXX)
if(CMAKE_VERSION VERSION_LESS 3.15)
#While CMake 3.10 introduced the new MPI module.
#Fixes related to MPI+CUDA that VTK-m needs are
#only found in CMake 3.15+.
find_package(MPI REQUIRED MODULE)
else()
#clunky but we need to make sure we use the upstream module if it exists
set(orig_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "")
find_package(MPI REQUIRED MODULE)
set(CMAKE_MODULE_PATH ${orig_CMAKE_MODULE_PATH})
endif()
endif()

@ -61,6 +61,20 @@ function(vtkm_find_gl)
if(DO_GLUT_FIND AND NOT TARGET GLUT::GLUT)
find_package(GLUT ${GLUT_REQUIRED} ${QUIETLY})
if(APPLE AND CMAKE_VERSION VERSION_LESS 3.19.2)
get_target_property(lib_path GLUT::GLUT IMPORTED_LOCATION)
if(EXISTS "${lib_path}.tbd")
set_target_properties(GLUT::GLUT PROPERTIES
IMPORTED_LOCATION "${lib_path}.tbd")
endif()
get_target_property(lib_path GLUT::Cocoa IMPORTED_LOCATION)
if(EXISTS "${lib_path}.tbd")
set_target_properties(GLUT::Cocoa PROPERTIES
IMPORTED_LOCATION "${lib_path}.tbd")
endif()
endif()
endif()
endfunction()

@ -10,9 +10,13 @@
include(CMakeParseArguments)
include(VTKmCMakeBackports)
include(VTKmDeviceAdapters)
include(VTKmCPUVectorization)
include(VTKmMPI)
if(VTKm_ENABLE_MPI AND NOT TARGET MPI::MPI_CXX)
find_package(MPI REQUIRED MODULE)
endif()
#-----------------------------------------------------------------------------
# INTERNAL FUNCTIONS
@ -29,7 +33,7 @@ function(vtkm_get_kit_name kitvar)
# Optional second argument to get dir_prefix.
if (${ARGC} GREATER 1)
set(${ARGV1} "${dir_prefix}" PARENT_SCOPE)
endif (${ARGC} GREATER 1)
endif ()
endfunction(vtkm_get_kit_name)
#-----------------------------------------------------------------------------
@ -62,7 +66,7 @@ function(vtkm_generate_export_header lib_name)
# Now generate a header that holds the macros needed to easily export
# template classes. This
string(TOUPPER ${kit_name} BASE_NAME_UPPER)
string(TOUPPER ${lib_name} BASE_NAME_UPPER)
set(EXPORT_MACRO_NAME "${BASE_NAME_UPPER}")
set(EXPORT_IS_BUILT_STATIC 0)
@ -77,17 +81,17 @@ function(vtkm_generate_export_header lib_name)
if(NOT EXPORT_IMPORT_CONDITION)
#set EXPORT_IMPORT_CONDITION to what the DEFINE_SYMBOL would be when
#building shared
set(EXPORT_IMPORT_CONDITION ${kit_name}_EXPORTS)
set(EXPORT_IMPORT_CONDITION ${lib_name}_EXPORTS)
endif()
configure_file(
${VTKm_SOURCE_DIR}/CMake/VTKmExportHeaderTemplate.h.in
${VTKm_BINARY_DIR}/include/${dir_prefix}/${kit_name}_export.h
${VTKm_BINARY_DIR}/include/${dir_prefix}/${lib_name}_export.h
@ONLY)
if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
install(FILES ${VTKm_BINARY_DIR}/include/${dir_prefix}/${kit_name}_export.h
install(FILES ${VTKm_BINARY_DIR}/include/${dir_prefix}/${lib_name}_export.h
DESTINATION ${VTKm_INSTALL_INCLUDE_DIR}/${dir_prefix}
)
endif()
@ -146,9 +150,14 @@ endfunction()
# Pass to consumers extra compile flags they need to add to CMAKE_CUDA_FLAGS
# to have CUDA compatibility.
#
# This is required as currently the -sm/-gencode flags when specified inside
# COMPILE_OPTIONS / target_compile_options are not propagated to the device
# linker. Instead they must be specified in CMAKE_CUDA_FLAGS
# If VTK-m was built with CMake 3.18+ and you are using CMake 3.18+ and have
# a cmake_minimum_required of 3.18 or have set policy CMP0105 to new, this will
# return an empty string as the `vtkm::cuda` target will correctly propagate
# all the necessary flags.
#
# This is required for CMake < 3.18 as they don't support the `$<DEVICE_LINK>`
# generator expression for `target_link_options`. Instead they need to be
# specified in CMAKE_CUDA_FLAGS
#
#
# add_library(lib_that_uses_vtkm ...)
@ -156,7 +165,18 @@ endfunction()
# target_link_libraries(lib_that_uses_vtkm PRIVATE vtkm_filter)
#
function(vtkm_get_cuda_flags settings_var)
if(TARGET vtkm::cuda)
# When policy CMP0105 is NEW and vtkm::cuda already carries device-link
# options, the target propagates all necessary flags itself and nothing needs
# to be added to CMAKE_CUDA_FLAGS.
if(POLICY CMP0105)
  cmake_policy(GET CMP0105 does_device_link)
  get_property(arch_flags
    TARGET vtkm::cuda
    PROPERTY INTERFACE_LINK_OPTIONS)
  # Fix: compare the queried policy value. The previous test compared the
  # literal token CMP0105 against "NEW", which is never true, making the
  # early return dead code.
  if(arch_flags AND does_device_link STREQUAL "NEW")
    return()
  endif()
endif()
get_property(arch_flags
TARGET vtkm::cuda
PROPERTY cuda_architecture_flags)
@ -232,8 +252,14 @@ endfunction()
#
#
# MODIFY_CUDA_FLAGS: If enabled will add the required -arch=<ver> flags
# that VTK-m was compiled with. If you have multiple libraries that use
# VTK-m calling `vtkm_add_target_information` multiple times with
# that VTK-m was compiled with.
#
# If VTK-m was built with CMake 3.18+ and you are using CMake 3.18+ and have
# a cmake_minimum_required of 3.18 or have set policy CMP0105 to new, no
# flags need to be added, as the `vtkm::cuda` target will correctly propagate
# all the necessary flags.
#
# Note: calling `vtkm_add_target_information` multiple times with
# `MODIFY_CUDA_FLAGS` will cause duplicate compiler flags. To resolve this issue
# you can; pass all targets and sources to a single `vtkm_add_target_information`
# call, have the first one use `MODIFY_CUDA_FLAGS`, or use the provided
@ -275,10 +301,11 @@ function(vtkm_add_target_information uses_vtkm_target)
${ARGN}
)
if(VTKm_TI_MODIFY_CUDA_FLAGS)
vtkm_get_cuda_flags(CMAKE_CUDA_FLAGS)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} PARENT_SCOPE)
vtkm_get_cuda_flags(cuda_flags)
if(cuda_flags)
set(CMAKE_CUDA_FLAGS ${cuda_flags} PARENT_SCOPE)
endif()
endif()
set(targets ${uses_vtkm_target})
@ -291,6 +318,8 @@ function(vtkm_add_target_information uses_vtkm_target)
# set the required target properties
set_target_properties(${targets} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${targets} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# CUDA_ARCHITECTURES added in CMake 3.18
set_target_properties(${targets} PROPERTIES CUDA_ARCHITECTURES OFF)
if(VTKm_TI_DROP_UNUSED_SYMBOLS)
foreach(target IN LISTS targets)
@ -298,6 +327,12 @@ function(vtkm_add_target_information uses_vtkm_target)
endforeach()
endif()
if((TARGET vtkm::cuda) OR (TARGET vtkm::kokkos_cuda))
set_source_files_properties(${VTKm_TI_DEVICE_SOURCES} PROPERTIES LANGUAGE "CUDA")
elseif(TARGET vtkm::kokkos_hip)
set_source_files_properties(${VTKm_TI_DEVICE_SOURCES} PROPERTIES LANGUAGE "HIP")
endif()
# Validate that following:
# - We are building with CUDA enabled.
# - We are building a VTK-m library or a library that wants cross library
@ -305,11 +340,15 @@ function(vtkm_add_target_information uses_vtkm_target)
#
# This is required as CUDA currently doesn't support device side calls across
# dynamic library boundaries.
if(TARGET vtkm::cuda)
set_source_files_properties(${VTKm_TI_DEVICE_SOURCES} PROPERTIES LANGUAGE "CUDA")
if((TARGET vtkm::cuda) OR (TARGET vtkm::kokkos_cuda))
foreach(target IN LISTS targets)
get_target_property(lib_type ${target} TYPE)
get_target_property(requires_static vtkm::cuda requires_static_builds)
if (TARGET vtkm::cuda)
get_target_property(requires_static vtkm::cuda requires_static_builds)
endif()
if (TARGET vtkm::kokkos)
get_target_property(requires_static vtkm::kokkos requires_static_builds)
endif()
if(requires_static AND ${lib_type} STREQUAL "SHARED_LIBRARY" AND VTKm_TI_EXTENDS_VTKM)
#We provide different error messages based on if we are building VTK-m

@ -0,0 +1,18 @@
##=============================================================================
##
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
##
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
##=============================================================================
# This module is already included in new versions of CMake
if(CMAKE_VERSION VERSION_LESS 3.15)
include(${CMAKE_CURRENT_LIST_DIR}/3.15/FindMPI.cmake)
else()
include(${CMAKE_ROOT}/Modules/FindMPI.cmake)
endif()

7
CMake/patches/README.md Normal file

@ -0,0 +1,7 @@
# CMake backports
This directory contains backports from newer CMake versions to help support
actually using older CMake versions for building VTK-m. The directory name is the
minimum version of CMake for which the contained files are no longer necessary.
For example, the files under the `3.15` directory are not needed for 3.15 or
3.16, but are for 3.14.

@ -37,11 +37,19 @@ if(NOT GENERATED_FILE)
return()
endif()
execute_process(
COMMAND ${PYTHON_EXECUTABLE} ${PYEXPANDER_COMMAND} ${SOURCE_FILE}.in
RESULT_VARIABLE pyexpander_result
OUTPUT_VARIABLE pyexpander_output
if(MSVC)
execute_process(
COMMAND ${PYTHON_EXECUTABLE} ${PYEXPANDER_COMMAND} ${SOURCE_FILE}.in
RESULT_VARIABLE pyexpander_result
OUTPUT_VARIABLE pyexpander_output
)
else()
execute_process(
COMMAND ${PYEXPANDER_COMMAND} ${SOURCE_FILE}.in
RESULT_VARIABLE pyexpander_result
OUTPUT_VARIABLE pyexpander_output
)
endif()
if(pyexpander_result)
# If pyexpander returned non-zero, it failed.

@ -18,6 +18,7 @@
# -DVTKm_INSTALL_INCLUDE_DIR=<VTKm_INSTALL_INCLUDE_DIR>
# -DVTKm_ENABLE_RENDERING=<VTKm_ENABLE_RENDERING>
# -DVTKm_ENABLE_LOGGING=<VTKm_ENABLE_LOGGING>
# -DVTKm_ENABLE_HDF5_IO=<VTKm_ENABLE_HDF5_IO>
# -P <VTKm_SOURCE_DIR>/CMake/testing/VTKMCheckSourceInInstall.cmake
##
@ -39,7 +40,9 @@ endif ()
if (NOT DEFINED VTKm_ENABLE_LOGGING)
message(FATAL_ERROR "VTKm_ENABLE_LOGGING not defined.")
endif ()
if (NOT DEFINED VTKm_ENABLE_HDF5_IO)
message(FATAL_ERROR "VTKm_ENABLE_HDF5_IO not defined.")
endif()
include(CMakeParseArguments)
# -----------------------------------------------------------------------------
@ -110,8 +113,19 @@ function(do_verify root_dir prefix)
)
set(file_exceptions
cont/ColorTablePrivate.hxx
thirdparty/diy/vtkmdiy/cmake/mpi_types.h
# Ignore deprecated virtual classes (which are not installed if VTKm_NO_DEPRECATED_VIRTUAL
# is on). These exceptions can be removed when these files are completely removed.
cont/ArrayHandleVirtual.h
cont/ArrayHandleVirtual.hxx
cont/ArrayHandleVirtualCoordinates.h
cont/CellLocator.h
cont/PointLocator.h
cont/StorageVirtual.h
cont/StorageVirtual.hxx
exec/CellLocator.h
exec/PointLocator.h
)
#by default every header in a testing directory doesn't need to be installed
@ -124,7 +138,12 @@ function(do_verify root_dir prefix)
if(NOT VTKm_ENABLE_LOGGING)
list(APPEND directory_exceptions thirdparty/loguru)
endif()
if (NOT VTKm_ENABLE_HDF5_IO)
list(APPEND file_exceptions
io/ImageWriterHDF5.h
io/ImageReaderHDF5.h
)
endif()
#Step 2. Verify the installed files match what headers are listed in each
# source directory
verify_install_per_dir("${VTKm_SOURCE_DIR}/vtkm"

@ -17,6 +17,7 @@ function(vtkm_test_install )
"-DVTKm_INSTALL_INCLUDE_DIR=${VTKm_INSTALL_INCLUDE_DIR}"
"-DVTKm_ENABLE_RENDERING=${VTKm_ENABLE_RENDERING}"
"-DVTKm_ENABLE_LOGGING=${VTKm_ENABLE_LOGGING}"
"-DVTKm_ENABLE_HDF5_IO=${VTKm_ENABLE_HDF5_IO}"
)
#By having this as separate tests using fixtures, it will allow us in
@ -110,6 +111,10 @@ function(vtkm_test_against_install dir)
)
endif()
if(TARGET vtkm::kokkos)
list(APPEND args "-DKokkos_DIR=${Kokkos_DIR}")
endif()
#determine if the test is expected to compile or fail to build. We use
#this information to built the test name to make it clear to the user
#what a 'passing' test means

@ -27,7 +27,6 @@ function(vtkm_create_test_executable
# for MPI tests, suffix test name and add MPI_Init/MPI_Finalize calls.
if (is_mpi_test)
set(extraArgs EXTRA_INCLUDE "vtkm/thirdparty/diy/environment.h")
set(CMAKE_TESTDRIVER_BEFORE_TESTMAIN "vtkmdiy::mpi::environment env(ac, av);")
if (use_mpi)
vtkm_diy_use_mpi(ON)
@ -48,9 +47,15 @@ function(vtkm_create_test_executable
vtkm_add_drop_unused_function_flags(${prog})
target_compile_definitions(${prog} PRIVATE ${defines})
#if all backends are enabled, we can use cuda compiler to handle all possible backends.
#determine if we have a device that requires a separate compiler enabled
set(device_lang_enabled FALSE)
if( (TARGET vtkm::cuda) OR (TARGET vtkm::kokkos_cuda) OR (TARGET vtkm::kokkos_hip))
set(device_lang_enabled TRUE)
endif()
#if all backends are enabled, we can use the device compiler to handle all possible backends.
set(device_sources)
if(TARGET vtkm::cuda AND enable_all_backends)
if(device_lang_enabled AND enable_all_backends)
set(device_sources ${sources})
endif()
vtkm_add_target_information(${prog} DEVICE_SOURCES ${device_sources})
@ -63,7 +68,7 @@ function(vtkm_create_test_executable
set_property(TARGET ${prog} PROPERTY LIBRARY_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH})
set_property(TARGET ${prog} PROPERTY RUNTIME_OUTPUT_DIRECTORY ${VTKm_EXECUTABLE_OUTPUT_PATH})
target_link_libraries(${prog} PRIVATE vtkm_cont ${libraries})
target_link_libraries(${prog} PRIVATE vtkm_cont_testing ${libraries})
if(use_job_pool)
vtkm_setup_job_pool()
@ -153,6 +158,13 @@ function(vtkm_unit_tests)
#serially
list(APPEND per_device_serial TRUE)
endif()
if (VTKm_ENABLE_KOKKOS)
list(APPEND per_device_command_line_arguments --device=kokkos)
list(APPEND per_device_suffix "KOKKOS")
#may require more time because of kernel generation.
list(APPEND per_device_timeout 1500)
list(APPEND per_device_serial FALSE)
endif()
endif()
set(test_prog)
@ -172,6 +184,9 @@ function(vtkm_unit_tests)
# Add the path to the location where regression test images are to be stored
list(APPEND VTKm_UT_TEST_ARGS "--baseline-dir=${VTKm_SOURCE_DIR}/data/baseline")
# Add the path to the location where generated regression test images should be written
list(APPEND VTKm_UT_TEST_ARGS "--write-dir=${VTKm_BINARY_DIR}")
if(VTKm_UT_MPI)
if (VTKm_ENABLE_MPI)
vtkm_create_test_executable(

@ -8,15 +8,17 @@
## PURPOSE. See the above copyright notice for more information.
##============================================================================
# If you want CUDA support, you will need to have CMake 3.9 on Linux/OSX.
# We require CMake 3.11 with the MSVC generator as the $<COMPILE_LANGUAGE:>
# generator expression is not supported on older versions.
# If you want CUDA support, you will need to have CMake 3.13 on Linux/OSX.
cmake_minimum_required(VERSION 3.12...3.15 FATAL_ERROR)
project (VTKm)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio")
cmake_minimum_required(VERSION 3.12...3.15 FATAL_ERROR)
endif()
# We only allow c++14
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# When using C++14 support make sure you use the standard C++ extensions rather
# than compiler-specific versions of the extensions (to preserve portability).
set(CMAKE_CXX_EXTENSIONS OFF)
# Update module path
set(VTKm_CMAKE_MODULE_PATH ${VTKm_SOURCE_DIR}/CMake)
@ -79,8 +81,9 @@ endmacro ()
# Configurable Options
vtkm_option(VTKm_ENABLE_CUDA "Enable Cuda support" OFF)
vtkm_option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
vtkm_option(VTKm_ENABLE_KOKKOS "Enable Kokkos support" OFF)
vtkm_option(VTKm_ENABLE_OPENMP "Enable OpenMP support" OFF)
vtkm_option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
vtkm_option(VTKm_ENABLE_RENDERING "Enable rendering library" ON)
vtkm_option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF)
vtkm_option(VTKm_ENABLE_MPI "Enable MPI support" OFF)
@ -97,6 +100,11 @@ endif()
vtkm_option(VTKm_USE_DOUBLE_PRECISION "Use double precision for floating point calculations" OFF)
vtkm_option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON)
vtkm_option(VTKm_ENABLE_HDF5_IO "Enable HDF5 support" OFF)
if (VTKm_ENABLE_HDF5_IO)
find_package(HDF5 REQUIRED COMPONENTS HL)
endif()
# VTK-m will turn on logging by default, but will set the default
# logging level to WARN. This option should not be visible by default
# in the GUI, as ERROR and WARN level logging should not interfere
@ -108,6 +116,17 @@ vtkm_option(VTKm_ENABLE_LOGGING "Enable VTKm Logging" ON)
# performance.
vtkm_option(VTKm_NO_ASSERT "Disable assertions in debugging builds." OFF)
# The CUDA compiler (as of CUDA 11) takes a surprising long time to compile
# kernels with assert in them. By default we turn off asserts when compiling
# for CUDA devices.
vtkm_option(VTKm_NO_ASSERT_CUDA "Disable assertions for CUDA devices." ON)
# The HIP compiler (as of ROCm 3.7) takes a surprising long time to compile
# kernels with assert in them they generate `printf` calls which are very
# slow ( cause massive register spillage). By default we turn off asserts when
# compiling for HIP devices.
vtkm_option(VTKm_NO_ASSERT_HIP "Disable assertions for HIP devices." ON)
# When VTK-m is embedded into larger projects that wish to make end user
# applications they want to only install libraries and don't want CMake/headers
# installed.
@ -132,19 +151,26 @@ vtkm_option(VTKm_ENABLE_DEVELOPER_FLAGS "Enable compiler flags that are useful w
# Some application might need not to install those, hence this option.
vtkm_option(VTKm_NO_INSTALL_README_LICENSE "disable the installation of README and LICENSE files" OFF)
# We are in the process of deprecating the use of virtual methods because they
# are not well supported on many accelerators. Turn this option on to remove
# the code entirely. Note that the deprecation of virtual methods is work in
# progress, so not all use of virtual methods may be done. In VTK-m 2.0
# virtual methods should be removed entirely and this option will be removed.
vtkm_option(VTKm_NO_DEPRECATED_VIRTUAL "Do not compile support of deprecated virtual methods" OFF)
mark_as_advanced(
VTKm_ENABLE_LOGGING
VTKm_NO_ASSERT
VTKm_NO_ASSERT_CUDA
VTKm_NO_ASSERT_HIP
VTKm_INSTALL_ONLY_LIBRARIES
VTKm_HIDE_PRIVATE_SYMBOLS
VTKm_ENABLE_DEVELOPER_FLAGS
VTKm_NO_INSTALL_README_LICENSE
VTKm_NO_DEPRECATED_VIRTUAL
)
#-----------------------------------------------------------------------------
# When using C++11 support make sure you use the standard C++ extensions rather
# than compiler-specific versions of the extensions (to preserve portability).
set(CMAKE_CXX_EXTENSIONS Off)
# Setup default build types
include(VTKmBuildType)
@ -202,7 +228,7 @@ if (VTKm_ENABLE_TESTING)
# Setup compiler flags for dynamic analysis if needed
include(testing/VTKmCompilerDynamicAnalysisFlags)
endif (VTKm_ENABLE_TESTING)
endif()
#-----------------------------------------------------------------------------
# Check basic type sizes.
@ -266,10 +292,16 @@ if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
# Install helper configure files.
install(
FILES
${VTKm_SOURCE_DIR}/CMake/VTKmCMakeBackports.cmake
${VTKm_SOURCE_DIR}/CMake/FindTBB.cmake
${VTKm_SOURCE_DIR}/CMake/FindMPI.cmake
${VTKm_SOURCE_DIR}/CMake/patches/FindMPI.cmake
DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
)
install(
FILES
${VTKm_SOURCE_DIR}/CMake/patches/3.15/FindMPI.cmake
DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}/3.15
)
# Install support files.
install(
@ -279,7 +311,6 @@ if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
${VTKm_SOURCE_DIR}/CMake/VTKmDeviceAdapters.cmake
${VTKm_SOURCE_DIR}/CMake/VTKmDIYUtils.cmake
${VTKm_SOURCE_DIR}/CMake/VTKmExportHeaderTemplate.h.in
${VTKm_SOURCE_DIR}/CMake/VTKmMPI.cmake
${VTKm_SOURCE_DIR}/CMake/VTKmRenderingContexts.cmake
${VTKm_SOURCE_DIR}/CMake/VTKmWrappers.cmake
DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
@ -313,7 +344,7 @@ endif ()
#-----------------------------------------------------------------------------
#add the benchmarking folder
if(VTKm_ENABLE_BENCHMARKS)
add_subdirectory(benchmarking)
add_subdirectory(benchmarking)
endif()
#-----------------------------------------------------------------------------

@ -55,7 +55,7 @@ list(APPEND CTEST_CUSTOM_WARNING_EXCEPTION
"nvlink warning : .*ArrayPortalVirtual.* has address taken but no possible call to it"
"nvlink warning : .*CellLocatorBoundingIntervalHierarchyExec.* has address taken but no possible call to it"
"nvlink warning : .*CellLocatorRectilinearGrid.* has address taken but no possible call to it"
"nvlink warning : .*CellLocatorUniformBins.* has address taken but no possible call to it"
"nvlink warning : .*CellLocatorTwoLevel.* has address taken but no possible call to it"
"nvlink warning : .*CellLocatorUniformGrid.* has address taken but no possible call to it"
)

@ -64,7 +64,7 @@ effort.
VTK-m Requires:
+ C++11 Compiler. VTK-m has been confirmed to work with the following
+ GCC 4.8+
+ GCC 5.4+
+ Clang 5.0+
+ XCode 5.0+
+ MSVC 2015+
@ -76,8 +76,8 @@ VTK-m Requires:
Optional dependencies are:
+ CUDA Device Adapter
+ [Cuda Toolkit 9.2+](https://developer.nvidia.com/cuda-toolkit)
+ Note CUDA >= 10.1 is required on Windows
+ [Cuda Toolkit 9.2, >= 10.2](https://developer.nvidia.com/cuda-toolkit)
+ Note CUDA >= 10.2 is required on Windows
+ TBB Device Adapter
+ [TBB](https://www.threadingbuildingblocks.org/)
+ OpenMP Device Adapter
@ -103,14 +103,14 @@ Optional dependencies are:
VTK-m has been tested on the following configurations:
+ On Linux
+ GCC 4.8.5, 5.4, 6.5, 7.4, 8.2, 9.2; Clang 5, 8; Intel 17.0.4; 19.0.0
+ GCC 5.4.0, 6.5, 7.4, 8.2, 9.2; Clang 5, 8; Intel 17.0.4; 19.0.0
+ CMake 3.12, 3.13, 3.16, 3.17
+ CUDA 9.2.148, 10.0.130, 10.1.105, 10.2.89
+ CUDA 9.2, 10.2, 11.0, 11.1
+ TBB 4.4 U2, 2017 U7
+ On Windows
+ Visual Studio 2015, 2017
+ CMake 3.12, 3.17
+ CUDA 10.1
+ CUDA 10.2
+ TBB 2017 U3, 2018 U2
+ On MacOS
+ AppleClang 9.1
@ -200,7 +200,6 @@ scene.AddActor(vtkm::rendering::Actor(outputData.GetCellSet(),
outputData.GetField(fieldName),
colorTable));
vtkm::rendering::View3D view(scene, mapper, canvas, camera, bg);
view.Initialize();
view.Paint();
view.SaveAs("demo_output.png");
```

@ -1,4 +1,4 @@
#!/bin/env python3
#!/usr/bin/env python3
#=============================================================================
#
@ -201,11 +201,8 @@ ENV GITLAB_CI=1 \
COPY . /src
ENV $gitlab_env
WORKDIR /src
#Let git fix issues from copying across OS (such as windows EOL)
#Note that this will remove any changes not committed.
RUN echo "$before_script || true" >> /setup-gitlab-env.sh && \
echo "$script || true" >> /run-gitlab-stage.sh && \
git reset --hard && \
bash /setup-gitlab-env.sh
''')

@ -77,6 +77,14 @@ struct ReadWriteValues : vtkm::worklet::WorkletMapField
}
};
// Takes a vector of data and creates a fresh ArrayHandle with memory just allocated
// in the control environment.
template <typename T>
vtkm::cont::ArrayHandle<T> CreateFreshArrayHandle(const std::vector<T>& vec)
{
return vtkm::cont::make_ArrayHandleMove(std::vector<T>(vec));
}
//------------- Benchmark functors -------------------------------------------
// Copies NumValues from control environment to execution environment and
@ -97,14 +105,18 @@ void BenchContToExecRead(benchmark::State& state)
state.SetLabel(desc.str());
}
std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
ArrayType array = vtkm::cont::make_ArrayHandle(vec);
std::vector<ValueType> vec(static_cast<std::size_t>(numValues), 2);
vtkm::cont::Invoker invoker{ device };
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
// Make a fresh array each iteration to force a copy from control to execution each time.
// (Prevents unified memory devices from caching data.)
ArrayType array = CreateFreshArrayHandle(vec);
timer.Start();
invoker(ReadValues{}, array);
timer.Stop();
@ -181,19 +193,26 @@ void BenchContToExecReadWrite(benchmark::State& state)
state.SetLabel(desc.str());
}
std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
ArrayType array = vtkm::cont::make_ArrayHandle(vec);
std::vector<ValueType> vec(static_cast<std::size_t>(numValues), 2);
vtkm::cont::Invoker invoker{ device };
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
// Make a fresh array each iteration to force a copy from control to execution each time.
// (Prevents unified memory devices from caching data.)
ArrayType array = CreateFreshArrayHandle(vec);
timer.Start();
invoker(ReadWriteValues{}, array);
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
// Remove data from execution environment so it has to be transferred again.
array.ReleaseResourcesExecution();
}
const int64_t iterations = static_cast<int64_t>(state.iterations());
@ -223,21 +242,23 @@ void BenchRoundTripRead(benchmark::State& state)
state.SetLabel(desc.str());
}
std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
ArrayType array = vtkm::cont::make_ArrayHandle(vec);
std::vector<ValueType> vec(static_cast<std::size_t>(numValues), 2);
vtkm::cont::Invoker invoker{ device };
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
// Ensure data is in control before we start:
array.ReleaseResourcesExecution();
// Make a fresh array each iteration to force a copy from control to execution each time.
// (Prevents unified memory devices from caching data.)
ArrayType array = CreateFreshArrayHandle(vec);
timer.Start();
invoker(ReadValues{}, array);
// Copy back to host and read:
// (Note, this probably does not copy. The array exists in both control and execution for read.)
auto portal = array.ReadPortal();
for (vtkm::Id i = 0; i < numValues; ++i)
{
@ -277,21 +298,23 @@ void BenchRoundTripReadWrite(benchmark::State& state)
}
std::vector<ValueType> vec(static_cast<std::size_t>(numValues));
ArrayType array = vtkm::cont::make_ArrayHandle(vec);
vtkm::cont::Invoker invoker{ device };
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
// Ensure data is in control before we start:
array.ReleaseResourcesExecution();
// Make a fresh array each iteration to force a copy from control to execution each time.
// (Prevents unified memory devices from caching data.)
ArrayType array = CreateFreshArrayHandle(vec);
timer.Start();
// Do work on device:
invoker(ReadWriteValues{}, array);
// Copy back to host and read/write:
auto portal = array.WritePortal();
for (vtkm::Id i = 0; i < numValues; ++i)
{
@ -330,14 +353,14 @@ void BenchExecToContRead(benchmark::State& state)
state.SetLabel(desc.str());
}
ArrayType array;
array.Allocate(numValues);
vtkm::cont::Invoker invoker{ device };
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
ArrayType array;
array.Allocate(numValues);
// Time the copy:
timer.Start();
@ -383,14 +406,14 @@ void BenchExecToContWrite(benchmark::State& state)
state.SetLabel(desc.str());
}
ArrayType array;
array.Allocate(numValues);
vtkm::cont::Invoker invoker{ device };
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
ArrayType array;
array.Allocate(numValues);
timer.Start();
// Allocate/write data on device
@ -435,14 +458,14 @@ void BenchExecToContReadWrite(benchmark::State& state)
state.SetLabel(desc.str());
}
ArrayType array;
array.Allocate(numValues);
vtkm::cont::Invoker invoker{ device };
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
ArrayType array;
array.Allocate(numValues);
timer.Start();
// Allocate/write data on device

@ -260,7 +260,7 @@ VTKM_BENCHMARK_TEMPLATES_OPTS(
->ArgNames({ "Values", "Ops", "Stride" }),
vtkm::cont::AtomicArrayTypeList);
// Benchmarks AtomicArray::CompareAndSwap such that each work index writes to adjacent
// Benchmarks AtomicArray::CompareExchange such that each work index writes to adjacent
// indices.
struct CASSeqWorker : public vtkm::worklet::WorkletMapField
{
@ -273,12 +273,8 @@ struct CASSeqWorker : public vtkm::worklet::WorkletMapField
const vtkm::Id idx = i % portal.GetNumberOfValues();
const T val = static_cast<T>(i) + in;
T oldVal = portal.Get(idx);
T assumed = static_cast<T>(0);
do
{
assumed = oldVal;
oldVal = portal.CompareAndSwap(idx, assumed + val, assumed);
} while (assumed != oldVal);
while (!portal.CompareExchange(idx, &oldVal, oldVal + val))
;
}
};
@ -371,7 +367,7 @@ VTKM_BENCHMARK_TEMPLATES_OPTS(BenchCASSeqBaseline,
->ArgNames({ "Values", "Ops" }),
vtkm::cont::AtomicArrayTypeList);
// Benchmarks AtomicArray::CompareAndSwap such that each work index writes to
// Benchmarks AtomicArray::CompareExchange such that each work index writes to
// a strided index:
// ( floor(i / stride) + stride * (i % stride)
struct CASStrideWorker : public vtkm::worklet::WorkletMapField
@ -393,12 +389,8 @@ struct CASStrideWorker : public vtkm::worklet::WorkletMapField
const vtkm::Id idx = (i / this->Stride + this->Stride * (i % this->Stride)) % numVals;
const T val = static_cast<T>(i) + in;
T oldVal = portal.Get(idx);
T assumed = static_cast<T>(0);
do
{
assumed = oldVal;
oldVal = portal.CompareAndSwap(idx, assumed + val, assumed);
} while (assumed != oldVal);
while (!portal.CompareExchange(idx, &oldVal, oldVal + val))
;
}
};

@ -20,6 +20,7 @@
#include <vtkm/cont/Timer.h>
#include <vtkm/worklet/StableSortIndices.h>
#include <vtkm/worklet/WorkletMapField.h>
#include <algorithm>
#include <cmath>

@ -8,18 +8,21 @@
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#include <vtkm/ImplicitFunction.h>
#include <vtkm/Math.h>
#include <vtkm/VectorAnalysis.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ArrayHandleMultiplexer.h>
#include <vtkm/cont/ArrayHandleVirtual.h>
#include <vtkm/cont/CellSetStructured.h>
#include <vtkm/cont/ImplicitFunctionHandle.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/Invoker.h>
#include <vtkm/cont/Timer.h>
#ifndef VTKM_NO_DEPRECATED_VIRTUAL
#include <vtkm/cont/ArrayHandleVirtual.h>
#endif
#include <vtkm/worklet/WorkletMapField.h>
#include <vtkm/worklet/WorkletMapTopology.h>
@ -223,20 +226,20 @@ public:
using ExecutionSignature = void(_1, _2, _3, _4);
using InputDomain = _1;
template <typename WeightType, typename T, typename S, typename D>
template <typename WeightType, typename T, typename S>
VTKM_EXEC void operator()(const vtkm::Id2& low_high,
const WeightType& weight,
const vtkm::exec::ExecutionWholeArrayConst<T, S, D>& inPortal,
const vtkm::exec::ExecutionWholeArrayConst<T, S>& inPortal,
T& result) const
{
//fetch the low / high values from inPortal
result = vtkm::Lerp(inPortal.Get(low_high[0]), inPortal.Get(low_high[1]), weight);
}
template <typename WeightType, typename T, typename S, typename D, typename U>
template <typename WeightType, typename T, typename S, typename U>
VTKM_EXEC void operator()(const vtkm::Id2&,
const WeightType&,
const vtkm::exec::ExecutionWholeArrayConst<T, S, D>&,
const vtkm::exec::ExecutionWholeArrayConst<T, S>&,
U&) const
{
//the inPortal and result need to be the same type so this version only
@ -245,50 +248,35 @@ public:
}
};
template <typename ImplicitFunction>
class EvaluateImplicitFunction : public vtkm::worklet::WorkletMapField
{
public:
using ControlSignature = void(FieldIn, FieldOut);
using ExecutionSignature = void(_1, _2);
using ControlSignature = void(FieldIn, FieldOut, ExecObject);
using ExecutionSignature = void(_1, _2, _3);
EvaluateImplicitFunction(const ImplicitFunction* function)
: Function(function)
template <typename VecType, typename ScalarType, typename FunctionType>
VTKM_EXEC void operator()(const VecType& point,
ScalarType& val,
const FunctionType& function) const
{
val = function.Value(point);
}
template <typename VecType, typename ScalarType>
VTKM_EXEC void operator()(const VecType& point, ScalarType& val) const
{
val = this->Function->Value(point);
}
private:
const ImplicitFunction* Function;
};
template <typename T1, typename T2>
class Evaluate2ImplicitFunctions : public vtkm::worklet::WorkletMapField
{
public:
using ControlSignature = void(FieldIn, FieldOut);
using ExecutionSignature = void(_1, _2);
using ControlSignature = void(FieldIn, FieldOut, ExecObject, ExecObject);
using ExecutionSignature = void(_1, _2, _3, _4);
Evaluate2ImplicitFunctions(const T1* f1, const T2* f2)
: Function1(f1)
, Function2(f2)
template <typename VecType, typename ScalarType, typename FType1, typename FType2>
VTKM_EXEC void operator()(const VecType& point,
ScalarType& val,
const FType1& function1,
const FType2& function2) const
{
val = function1.Value(point) + function2.Value(point);
}
template <typename VecType, typename ScalarType>
VTKM_EXEC void operator()(const VecType& point, ScalarType& val) const
{
val = this->Function1->Value(point) + this->Function2->Value(point);
}
private:
const T1* Function1;
const T2* Function2;
};
struct PassThroughFunctor
@ -433,15 +421,19 @@ void BenchBlackScholesStatic(::benchmark::State& state)
};
VTKM_BENCHMARK_TEMPLATES(BenchBlackScholesStatic, ValueTypes);
#ifndef VTKM_NO_DEPRECATED_VIRTUAL
template <typename ValueType>
void BenchBlackScholesDynamic(::benchmark::State& state)
{
VTKM_DEPRECATED_SUPPRESS_BEGIN
BenchBlackScholesImpl<ValueType> impl{ state };
impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.StockPrice),
vtkm::cont::make_ArrayHandleVirtual(impl.OptionStrike),
vtkm::cont::make_ArrayHandleVirtual(impl.OptionYears));
VTKM_DEPRECATED_SUPPRESS_END
};
VTKM_BENCHMARK_TEMPLATES(BenchBlackScholesDynamic, ValueTypes);
#endif //VTKM_NO_DEPRECATED_VIRTUAL
template <typename ValueType>
void BenchBlackScholesMultiplexer0(::benchmark::State& state)
@ -537,15 +529,19 @@ void BenchMathStatic(::benchmark::State& state)
};
VTKM_BENCHMARK_TEMPLATES(BenchMathStatic, ValueTypes);
#ifndef VTKM_NO_DEPRECATED_VIRTUAL
template <typename ValueType>
void BenchMathDynamic(::benchmark::State& state)
{
VTKM_DEPRECATED_SUPPRESS_BEGIN
BenchMathImpl<ValueType> impl{ state };
impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.InputHandle),
vtkm::cont::make_ArrayHandleVirtual(impl.TempHandle1),
vtkm::cont::make_ArrayHandleVirtual(impl.TempHandle2));
VTKM_DEPRECATED_SUPPRESS_END
};
VTKM_BENCHMARK_TEMPLATES(BenchMathDynamic, ValueTypes);
#endif //VTKM_NO_DEPRECATED_VIRTUAL
template <typename ValueType>
void BenchMathMultiplexer0(::benchmark::State& state)
@ -636,13 +632,17 @@ void BenchFusedMathStatic(::benchmark::State& state)
};
VTKM_BENCHMARK_TEMPLATES(BenchFusedMathStatic, ValueTypes);
#ifndef VTKM_NO_DEPRECATED_VIRTUAL
template <typename ValueType>
void BenchFusedMathDynamic(::benchmark::State& state)
{
VTKM_DEPRECATED_SUPPRESS_BEGIN
BenchFusedMathImpl<ValueType> impl{ state };
impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.InputHandle));
VTKM_DEPRECATED_SUPPRESS_END
};
VTKM_BENCHMARK_TEMPLATES(BenchFusedMathDynamic, ValueTypes);
#endif //VTKM_NO_DEPRECATED_VIRTUAL
template <typename ValueType>
void BenchFusedMathMultiplexer0(::benchmark::State& state)
@ -756,15 +756,19 @@ void BenchEdgeInterpStatic(::benchmark::State& state)
};
VTKM_BENCHMARK_TEMPLATES(BenchEdgeInterpStatic, InterpValueTypes);
#ifndef VTKM_NO_DEPRECATED_VIRTUAL
template <typename ValueType>
void BenchEdgeInterpDynamic(::benchmark::State& state)
{
VTKM_DEPRECATED_SUPPRESS_BEGIN
BenchEdgeInterpImpl<ValueType> impl{ state };
impl.Run(vtkm::cont::make_ArrayHandleVirtual(impl.EdgePairHandle),
vtkm::cont::make_ArrayHandleVirtual(impl.WeightHandle),
vtkm::cont::make_ArrayHandleVirtual(impl.FieldHandle));
VTKM_DEPRECATED_SUPPRESS_END
};
VTKM_BENCHMARK_TEMPLATES(BenchEdgeInterpDynamic, InterpValueTypes);
#endif //VTKM_NO_DEPRECATED_VIRTUAL
struct ImplicitFunctionBenchData
{
@ -802,7 +806,7 @@ static ImplicitFunctionBenchData MakeImplicitFunctionBenchData()
void BenchImplicitFunction(::benchmark::State& state)
{
using EvalWorklet = EvaluateImplicitFunction<vtkm::Sphere>;
using EvalWorklet = EvaluateImplicitFunction;
const vtkm::cont::DeviceAdapterId device = Config.Device;
@ -814,10 +818,7 @@ void BenchImplicitFunction(::benchmark::State& state)
state.SetLabel(desc.str());
}
vtkm::cont::Token token;
auto handle = vtkm::cont::make_ImplicitFunctionHandle(data.Sphere1);
auto function = static_cast<const vtkm::Sphere*>(handle.PrepareForExecution(device, token));
EvalWorklet eval(function);
EvalWorklet eval;
vtkm::cont::Timer timer{ device };
vtkm::cont::Invoker invoker{ device };
@ -826,7 +827,7 @@ void BenchImplicitFunction(::benchmark::State& state)
{
(void)_;
timer.Start();
invoker(eval, data.Points, data.Result);
invoker(eval, data.Points, data.Result, data.Sphere1);
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
@ -836,7 +837,7 @@ VTKM_BENCHMARK(BenchImplicitFunction);
void BenchVirtualImplicitFunction(::benchmark::State& state)
{
using EvalWorklet = EvaluateImplicitFunction<vtkm::ImplicitFunction>;
using EvalWorklet = EvaluateImplicitFunction;
const vtkm::cont::DeviceAdapterId device = Config.Device;
@ -848,9 +849,7 @@ void BenchVirtualImplicitFunction(::benchmark::State& state)
state.SetLabel(desc.str());
}
vtkm::cont::Token token;
auto sphere = vtkm::cont::make_ImplicitFunctionHandle(data.Sphere1);
EvalWorklet eval(sphere.PrepareForExecution(device, token));
EvalWorklet eval;
vtkm::cont::Timer timer{ device };
vtkm::cont::Invoker invoker{ device };
@ -859,7 +858,7 @@ void BenchVirtualImplicitFunction(::benchmark::State& state)
{
(void)_;
timer.Start();
invoker(eval, data.Points, data.Result);
invoker(eval, data.Points, data.Result, data.Sphere1);
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
@ -869,7 +868,7 @@ VTKM_BENCHMARK(BenchVirtualImplicitFunction);
void Bench2ImplicitFunctions(::benchmark::State& state)
{
using EvalWorklet = Evaluate2ImplicitFunctions<vtkm::Sphere, vtkm::Sphere>;
using EvalWorklet = Evaluate2ImplicitFunctions;
const vtkm::cont::DeviceAdapterId device = Config.Device;
@ -881,12 +880,7 @@ void Bench2ImplicitFunctions(::benchmark::State& state)
state.SetLabel(desc.str());
}
vtkm::cont::Token token;
auto h1 = vtkm::cont::make_ImplicitFunctionHandle(data.Sphere1);
auto h2 = vtkm::cont::make_ImplicitFunctionHandle(data.Sphere2);
auto f1 = static_cast<const vtkm::Sphere*>(h1.PrepareForExecution(device, token));
auto f2 = static_cast<const vtkm::Sphere*>(h2.PrepareForExecution(device, token));
EvalWorklet eval(f1, f2);
EvalWorklet eval;
vtkm::cont::Timer timer{ device };
vtkm::cont::Invoker invoker{ device };
@ -895,7 +889,7 @@ void Bench2ImplicitFunctions(::benchmark::State& state)
{
(void)_;
timer.Start();
invoker(eval, data.Points, data.Result);
invoker(eval, data.Points, data.Result, data.Sphere1, data.Sphere2);
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
@ -903,40 +897,6 @@ void Bench2ImplicitFunctions(::benchmark::State& state)
}
VTKM_BENCHMARK(Bench2ImplicitFunctions);
void Bench2VirtualImplicitFunctions(::benchmark::State& state)
{
using EvalWorklet = Evaluate2ImplicitFunctions<vtkm::ImplicitFunction, vtkm::ImplicitFunction>;
const vtkm::cont::DeviceAdapterId device = Config.Device;
auto data = MakeImplicitFunctionBenchData();
{
std::ostringstream desc;
desc << data.Points.GetNumberOfValues() << " points";
state.SetLabel(desc.str());
}
vtkm::cont::Token token;
auto s1 = vtkm::cont::make_ImplicitFunctionHandle(data.Sphere1);
auto s2 = vtkm::cont::make_ImplicitFunctionHandle(data.Sphere2);
EvalWorklet eval(s1.PrepareForExecution(device, token), s2.PrepareForExecution(device, token));
vtkm::cont::Timer timer{ device };
vtkm::cont::Invoker invoker{ device };
for (auto _ : state)
{
(void)_;
timer.Start();
invoker(eval, data.Points, data.Result);
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
}
}
VTKM_BENCHMARK(Bench2VirtualImplicitFunctions);
} // end anon namespace
int main(int argc, char* argv[])

@ -24,8 +24,8 @@
#include <vtkm/cont/ErrorInternal.h>
#include <vtkm/cont/Logging.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/StorageBasic.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/testing/MakeTestDataSet.h>
#include <vtkm/cont/internal/OptionParser.h>
@ -39,6 +39,7 @@
#include <vtkm/filter/Tetrahedralize.h>
#include <vtkm/filter/Threshold.h>
#include <vtkm/filter/ThresholdPoints.h>
#include <vtkm/filter/Triangulate.h>
#include <vtkm/filter/VectorMagnitude.h>
#include <vtkm/filter/VertexClustering.h>
#include <vtkm/filter/WarpScalar.h>
@ -92,12 +93,15 @@ vtkm::cont::InitializeResult Config;
// The input dataset we'll use on the filters:
static vtkm::cont::DataSet InputDataSet;
static vtkm::cont::DataSet UnstructuredInputDataSet;
// The point scalars to use:
static std::string PointScalarsName;
// The cell scalars to use:
static std::string CellScalarsName;
// The point vectors to use:
static std::string PointVectorsName;
// Whether the input is a file or is generated
bool FileAsInput = false;
bool InputIsStructured()
{
@ -166,8 +170,8 @@ void BenchGradient(::benchmark::State& state, int options)
}
}
#define VTKM_PRIVATE_GRADIENT_BENCHMARK(Name, Opts) \
void BenchGradient##Name(::benchmark::State& state) { BenchGradient(state, Opts); } \
#define VTKM_PRIVATE_GRADIENT_BENCHMARK(Name, Opts) \
void BenchGradient##Name(::benchmark::State& state) { BenchGradient(state, Opts); } \
VTKM_BENCHMARK(BenchGradient##Name)
VTKM_PRIVATE_GRADIENT_BENCHMARK(Scalar, Gradient | ScalarInput);
@ -343,10 +347,11 @@ void BenchContour(::benchmark::State& state)
{
const vtkm::cont::DeviceAdapterId device = Config.Device;
const vtkm::Id numIsoVals = static_cast<vtkm::Id>(state.range(0));
const bool mergePoints = static_cast<bool>(state.range(1));
const bool normals = static_cast<bool>(state.range(2));
const bool fastNormals = static_cast<bool>(state.range(3));
const bool isStructured = static_cast<vtkm::Id>(state.range(0));
const vtkm::Id numIsoVals = static_cast<vtkm::Id>(state.range(1));
const bool mergePoints = static_cast<bool>(state.range(2));
const bool normals = static_cast<bool>(state.range(3));
const bool fastNormals = static_cast<bool>(state.range(4));
vtkm::filter::Contour filter;
filter.SetActiveField(PointScalarsName, vtkm::cont::Field::Association::POINTS);
@ -372,11 +377,14 @@ void BenchContour(::benchmark::State& state)
filter.SetComputeFastNormalsForUnstructured(fastNormals);
vtkm::cont::Timer timer{ device };
vtkm::cont::DataSet input = isStructured ? InputDataSet : UnstructuredInputDataSet;
for (auto _ : state)
{
(void)_;
timer.Start();
auto result = filter.Execute(InputDataSet);
auto result = filter.Execute(input);
::benchmark::DoNotOptimize(result);
timer.Stop();
@ -386,19 +394,25 @@ void BenchContour(::benchmark::State& state)
void BenchContourGenerator(::benchmark::internal::Benchmark* bm)
{
bm->ArgNames({ "NIsoVals", "MergePts", "GenNormals", "FastNormals" });
bm->ArgNames({ "IsStructuredDataSet", "NIsoVals", "MergePts", "GenNormals", "FastNormals" });
auto helper = [&](const vtkm::Id numIsoVals) {
bm->Args({ numIsoVals, 0, 0, 0 });
bm->Args({ numIsoVals, 1, 0, 0 });
bm->Args({ numIsoVals, 0, 1, 0 });
bm->Args({ numIsoVals, 0, 1, 1 });
bm->Args({ 0, numIsoVals, 0, 0, 0 });
bm->Args({ 0, numIsoVals, 1, 0, 0 });
bm->Args({ 0, numIsoVals, 0, 1, 0 });
bm->Args({ 0, numIsoVals, 0, 1, 1 });
bm->Args({ 1, numIsoVals, 0, 0, 0 });
bm->Args({ 1, numIsoVals, 1, 0, 0 });
bm->Args({ 1, numIsoVals, 0, 1, 0 });
bm->Args({ 1, numIsoVals, 0, 1, 1 });
};
helper(1);
helper(3);
helper(12);
}
// :TODO: Disabled until SIGSEGV in Countour when passings field is resolved
VTKM_BENCHMARK_APPLY(BenchContour, BenchContourGenerator);
void BenchExternalFaces(::benchmark::State& state)
@ -428,10 +442,9 @@ void BenchTetrahedralize(::benchmark::State& state)
const vtkm::cont::DeviceAdapterId device = Config.Device;
// This filter only supports structured datasets:
if (!InputIsStructured())
if (FileAsInput && !InputIsStructured())
{
state.SkipWithError("Tetrahedralize Filter requires structured data.");
return;
}
vtkm::filter::Tetrahedralize filter;
@ -456,10 +469,9 @@ void BenchVertexClustering(::benchmark::State& state)
const vtkm::Id numDivs = static_cast<vtkm::Id>(state.range(0));
// This filter only supports unstructured datasets:
if (InputIsStructured())
if (FileAsInput && InputIsStructured())
{
state.SkipWithError("VertexClustering Filter requires unstructured data.");
return;
state.SkipWithError("VertexClustering Filter requires unstructured data (use --tetra).");
}
vtkm::filter::VertexClustering filter;
@ -469,8 +481,9 @@ void BenchVertexClustering(::benchmark::State& state)
for (auto _ : state)
{
(void)_;
timer.Start();
auto result = filter.Execute(InputDataSet);
auto result = filter.Execute(UnstructuredInputDataSet);
::benchmark::DoNotOptimize(result);
timer.Stop();
@ -530,13 +543,12 @@ struct PrepareForInput
void BenchReverseConnectivityGen(::benchmark::State& state)
{
if (InputIsStructured())
if (FileAsInput && InputIsStructured())
{
state.SkipWithError("ReverseConnectivityGen requires unstructured data.");
return;
state.SkipWithError("ReverseConnectivityGen requires unstructured data (--use tetra).");
}
auto cellset = InputDataSet.GetCellSet();
auto cellset = UnstructuredInputDataSet.GetCellSet();
PrepareForInput functor;
for (auto _ : state)
{
@ -763,6 +775,10 @@ struct Arg : vtkm::cont::internal::option::Arg
bool msg)
{
if ((option.arg != nullptr) && (option.arg[0] != '\0'))
{
return vtkm::cont::internal::option::ARG_OK;
}
else
{
if (msg)
{
@ -770,10 +786,6 @@ struct Arg : vtkm::cont::internal::option::Arg
}
return vtkm::cont::internal::option::ARG_ILLEGAL;
}
else
{
return vtkm::cont::internal::option::ARG_OK;
}
}
};
@ -861,8 +873,12 @@ void InitDataSet(int& argc, char** argv)
if (options[HELP])
{
// FIXME: Print google benchmark usage too
option::printUsage(std::cerr, usage.data());
option::printUsage(std::cout, usage.data());
// Print google benchmark usage too
const char* helpstr = "--help";
char* tmpargv[] = { argv[0], const_cast<char*>(helpstr), nullptr };
int tmpargc = 2;
VTKM_EXECUTE_BENCHMARKS(tmpargc, tmpargv);
exit(0);
}
@ -975,6 +991,7 @@ void InitDataSet(int& argc, char** argv)
std::cerr << "[InitDataSet] Loading file: " << filename << "\n";
vtkm::io::VTKDataSetReader reader(filename);
InputDataSet = reader.ReadDataSet();
FileAsInput = true;
}
else
{
@ -986,17 +1003,20 @@ void InitDataSet(int& argc, char** argv)
InputDataSet = source.Execute();
}
if (tetra)
{
std::cerr << "[InitDataSet] Tetrahedralizing dataset...\n";
vtkm::filter::Tetrahedralize tet;
tet.SetFieldsToPass(vtkm::filter::FieldSelection(vtkm::filter::FieldSelection::MODE_ALL));
InputDataSet = tet.Execute(InputDataSet);
}
FindFields();
CreateMissingFields();
std::cerr
<< "[InitDataSet] Create UnstructuredInputDataSet from Tetrahedralized InputDataSet...\n";
vtkm::filter::Tetrahedralize tet;
tet.SetFieldsToPass(vtkm::filter::FieldSelection(vtkm::filter::FieldSelection::MODE_ALL));
UnstructuredInputDataSet = tet.Execute(InputDataSet);
if (tetra)
{
InputDataSet = UnstructuredInputDataSet;
}
inputGenTimer.Stop();
std::cerr << "[InitDataSet] DataSet initialization took " << inputGenTimer.GetElapsedTime()
@ -1015,16 +1035,12 @@ int main(int argc, char* argv[])
// Parse VTK-m options:
Config = vtkm::cont::Initialize(argc, args.data(), opts);
// This occurs when it is help
if (opts == vtkm::cont::InitializeOptions::None)
{
std::cout << Config.Usage << std::endl;
}
else
// This opts changes when it is help
if (opts != vtkm::cont::InitializeOptions::None)
{
vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
InitDataSet(argc, args.data());
}
InitDataSet(argc, args.data());
const std::string dataSetSummary = []() -> std::string {
std::ostringstream out;

@ -0,0 +1,97 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#include "Benchmarker.h"
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/DataSetBuilderUniform.h>
#include <vtkm/cont/ErrorInternal.h>
#include <vtkm/cont/Logging.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/internal/OptionParser.h>
#include <vtkm/filter/ParticleAdvection.h>
#include <vtkm/worklet/particleadvection/EulerIntegrator.h>
#include <vtkm/worklet/particleadvection/RK4Integrator.h>
#ifdef VTKM_ENABLE_TBB
#include <tbb/task_scheduler_init.h>
#endif
#ifdef VTKM_ENABLE_OPENMP
#include <omp.h>
#endif
namespace
{
// Hold configuration state (e.g. active device):
vtkm::cont::InitializeResult Config;
// Wrapper around RK4:
void BenchParticleAdvection(::benchmark::State& state)
{
const vtkm::cont::DeviceAdapterId device = Config.Device;
const vtkm::Id3 dims(5, 5, 5);
const vtkm::Vec3f vecX(1, 0, 0);
vtkm::Id numPoints = dims[0] * dims[1] * dims[2];
std::vector<vtkm::Vec3f> vectorField(static_cast<std::size_t>(numPoints));
for (std::size_t i = 0; i < static_cast<std::size_t>(numPoints); i++)
vectorField[i] = vecX;
vtkm::cont::DataSetBuilderUniform dataSetBuilder;
vtkm::cont::DataSet ds = dataSetBuilder.Create(dims);
ds.AddPointField("vector", vectorField);
vtkm::cont::ArrayHandle<vtkm::Particle> seedArray =
vtkm::cont::make_ArrayHandle({ vtkm::Particle(vtkm::Vec3f(.2f, 1.0f, .2f), 0),
vtkm::Particle(vtkm::Vec3f(.2f, 2.0f, .2f), 1),
vtkm::Particle(vtkm::Vec3f(.2f, 3.0f, .2f), 2) });
vtkm::filter::ParticleAdvection particleAdvection;
particleAdvection.SetStepSize(vtkm::FloatDefault(1) / state.range(0));
particleAdvection.SetNumberOfSteps(static_cast<vtkm::Id>(state.range(0)));
particleAdvection.SetSeeds(seedArray);
particleAdvection.SetActiveField("vector");
vtkm::cont::Timer timer{ device };
for (auto _ : state)
{
(void)_;
timer.Start();
auto output = particleAdvection.Execute(ds);
::benchmark::DoNotOptimize(output);
timer.Stop();
state.SetIterationTime(timer.GetElapsedTime());
}
state.SetComplexityN(state.range(0));
}
VTKM_BENCHMARK_OPTS(BenchParticleAdvection,
->RangeMultiplier(2)
->Range(32, 4096)
->ArgName("Steps")
->Complexity());
} // end anon namespace
int main(int argc, char* argv[])
{
auto opts = vtkm::cont::InitializeOptions::DefaultAnyDevice;
std::vector<char*> args(argv, argv + argc);
vtkm::bench::detail::InitializeArgs(&argc, args, opts);
Config = vtkm::cont::Initialize(argc, args.data(), opts);
if (opts != vtkm::cont::InitializeOptions::None)
{
vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
}
VTKM_EXECUTE_BENCHMARKS(argc, args.data());
}

@ -26,8 +26,6 @@
#include <vtkm/exec/FunctorBase.h>
#include <vtkm/cont/ColorTable.hxx>
#include <sstream>
#include <string>
#include <vector>

@ -170,7 +170,7 @@
/// and modified using the passed arguments; see the Google Benchmark documentation
/// for more details. The `preamble` string may be used to supply additional
/// information that will be appended to the output's preamble.
#define VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, preamble) \
#define VTKM_EXECUTE_BENCHMARKS_PREAMBLE(argc, argv, preamble) \
vtkm::bench::detail::ExecuteBenchmarks(argc, argv, preamble)
/// \def VTKM_BENCHMARK(BenchFunc)
@ -181,7 +181,7 @@
/// ```
/// void BenchFunc(::benchmark::State& state)
/// ```
#define VTKM_BENCHMARK(BenchFunc) \
#define VTKM_BENCHMARK(BenchFunc) \
BENCHMARK(BenchFunc)->UseManualTime()->Unit(benchmark::kMillisecond)
/// \def VTKM_BENCHMARK_OPTS(BenchFunc, Args)
@ -196,7 +196,7 @@
/// Note the similarity to the raw Google Benchmark usage of
/// `BENCHMARK(MyBenchmark)->ArgName("MyParam")->Range(32, 1024*1024);`. See
/// the Google Benchmark documentation for more details on the available options.
#define VTKM_BENCHMARK_OPTS(BenchFunc, options) \
#define VTKM_BENCHMARK_OPTS(BenchFunc, options) \
BENCHMARK(BenchFunc)->UseManualTime()->Unit(benchmark::kMillisecond) options
/// \def VTKM_BENCHMARK_APPLY(BenchFunc, ConfigFunc)
@ -211,7 +211,7 @@
/// ```
///
/// See the Google Benchmark documentation for more details on the available options.
#define VTKM_BENCHMARK_APPLY(BenchFunc, applyFunctor) \
#define VTKM_BENCHMARK_APPLY(BenchFunc, applyFunctor) \
BENCHMARK(BenchFunc)->Apply(applyFunctor)->UseManualTime()->Unit(benchmark::kMillisecond)
/// \def VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList)
@ -224,7 +224,7 @@
/// template <typename T>
/// void BenchFunc(::benchmark::State& state)
/// ```
#define VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList) \
#define VTKM_BENCHMARK_TEMPLATES(BenchFunc, TypeList) \
VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, vtkm::bench::detail::NullApply, TypeList)
/// \def VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, Args, TypeList)
@ -237,10 +237,10 @@
/// ->ArgName("MyParam")->Range(32, 1024*1024),
/// vtkm::List<vtkm::Float32, vtkm::Vec3f_32>);
/// ```
#define VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, options, TypeList) \
VTKM_BENCHMARK_TEMPLATES_APPLY( \
BenchFunc, \
[](::benchmark::internal::Benchmark* bm) { bm options->Unit(benchmark::kMillisecond); }, \
#define VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFunc, options, TypeList) \
VTKM_BENCHMARK_TEMPLATES_APPLY( \
BenchFunc, \
[](::benchmark::internal::Benchmark* bm) { bm options->Unit(benchmark::kMillisecond); }, \
TypeList)
/// \def VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ConfigFunc, TypeList)
@ -255,22 +255,22 @@
/// ```
///
/// See the Google Benchmark documentation for more details on the available options.
#define VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ApplyFunctor, TypeList) \
namespace \
#define VTKM_BENCHMARK_TEMPLATES_APPLY(BenchFunc, ApplyFunctor, TypeList) \
namespace \
{ /* A template function cannot be used as a template parameter, so wrap the function with \
* a template struct to get it into the GenerateTemplateBenchmarks class. */ \
template <typename... Ts> \
struct VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc) \
{ \
static ::benchmark::internal::Function* GetFunction() { return BenchFunc<Ts...>; } \
}; \
} /* end anon namespace */ \
int BENCHMARK_PRIVATE_NAME(BenchFunc) = vtkm::bench::detail::GenerateTemplateBenchmarks< \
brigand::bind<VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)>, \
template <typename... Ts> \
struct VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc) \
{ \
static ::benchmark::internal::Function* GetFunction() { return BenchFunc<Ts...>; } \
}; \
} /* end anon namespace */ \
int BENCHMARK_PRIVATE_NAME(BenchFunc) = vtkm::bench::detail::GenerateTemplateBenchmarks< \
brigand::bind<VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc)>, \
TypeList>::Register(#BenchFunc, ApplyFunctor)
// Internal use only:
#define VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc) \
#define VTKM_BENCHMARK_WRAPPER_NAME(BenchFunc) \
BENCHMARK_PRIVATE_CONCAT(_wrapper_, BenchFunc, __LINE__)
namespace vtkm
@ -280,9 +280,7 @@ namespace bench
namespace detail
{
static inline void NullApply(::benchmark::internal::Benchmark*)
{
}
static inline void NullApply(::benchmark::internal::Benchmark*) {}
/// Do not use directly. The VTKM_BENCHMARK_TEMPLATES macros should be used
/// instead.

@ -44,6 +44,7 @@ set(benchmarks
BenchmarkDeviceAdapter
BenchmarkFieldAlgorithms
BenchmarkFilters
BenchmarkODEIntegrators
BenchmarkTopologyAlgorithms
)

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5706bddc644b5b120ffbd424b3073ce989735272726de711ca8dac19b4a30ee1
size 2653

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:785051d9773c4a0ced2701de3499f9cd948da2a4c846a5187e30dfb5cb0783cb
size 10830

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d990b5f0e9ef27e4e5f87f4c62c4f9974992506521f32bd5901ac6670e71bfa
size 9656

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:54e09a09c97a20627e54c835d2d488bc9f692ef1315122ab60241c006ab78813
size 19742

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e1472e6002ca4ad4012e0c9f067f8254290fabe93c82713a4994ad97a7fdbdfc
size 31218

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ff6d72bd325ffe0fb3b22bfdc294b6d674384afd662290424bb77634202b4ef
size 71150

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:24c71e8846fe62e6f6eefdb72c9729639061af80bf9d3453d35c8c6838de9174
size 37162

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b4c905ec76e72513519515ec41cf5efd34490b98255ee7465f8b6746fcff41e5
size 51865

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ddf65aefbd8c8fe8fb479521af7e5fa894cc94b3f890e2cc527a8df5c6e5601c
size 728

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5f85560cc05688d09c21b22e91c14cec22deecb3c51dc364d82cc9fd460c6ab6
size 328

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a47045b1ae5539ef0125273ee9c50a9a6e809f78411f6a850ac34e6fa43189bb
size 535

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea0a0903fce2b7b42023ca0a2bdc008781a61fa74f75b2b107e6d0788c404551
size 1441

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58aed19216ce91b6c9bc7c0d8ee31c1062405ad6f5a4a977b49f213e2ce81307
size 1518

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef3dfd79f0c8d18780d0749014d71c0226134041283d33de0bcd994e343dd421
size 2001070

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bb3d36ea5ecef5e7ef1057d0dddebbc590424915083091ead3dac2928000524
size 2904465

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bffad7dae3dd6ef018ad7a9e109464ced0f3b9bc15cf1fb5d555f6d0d00b621f
size 3001624

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2cbdf56fd5445ddc5b6bc05507b8825fb8d74fe1ccce894bde03e5ff2ecf5fb6
size 525141

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:752021630d25aff8dfd00064badd452896be70bc8b2f94b008900b4fc70d4dd5
size 1811

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d1dbb4c28f1c829769ad3e03fc58f667935d8a461d3515036d5d98f5e3841cb
size 395

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bc4033483646c7e3c7be921ca4f821d1277c0d6d79063b1565dfb78c4766bf4d
size 1234

3
data/data/third_party/ecl_cc/README vendored Normal file

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6f5e6e3dc559fefc7990daaec071fcd620f620e5ab8652dddaa6b43ca4ba08e7
size 222

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:892470e152ccd46ddcca70e26bcd88816c247f08c496319cea80864b6b94ce46
size 3596536

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a2c0b3788197a48a305fc049f54d66c94c20298e617ef06dbe4fe0c2043f7366
size 3590

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1860e747d7f460afc63e32de184e445ffb966a42fb07f9d44ba39020584864f
size 496

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3d9bea2064cd3402f3f5b7862e6b775e37f33210ba099f59358857d4bdae1020
size 255

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e154ba13346e6998b864316868da3f155e99efe4f330c8e080b0d7ece22b505a
size 488

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7191ea7dec00129cb262239a508aeba4bb9387e581adfa2049211f4514ee4130
size 1020

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b7b7e73f60f3572e19178aa55fcd32cafb5c5823062241d28aa37d82b0031a2a
size 1145

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90aed1ed3c3eba58f1b0b1573b09e8c024e48f5ca822e9f88b0c1ff6593a978f
size 693

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3d0ddc7c712a6d544db85660cd9d325884892b18d6f0ed451361aaeae2a96413
size 204

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:75b5601eb23b1724d5309e69a51839615bce625f6e7641b52dc3d06e10b0c5ee
size 745

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ff3108d009d2eef410593811857e38388001f7df624ddeaed3edceafbc838aea
size 849

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5aca6667b06deb4ec6236d5caa3d9518345bc1eb9021bc721289b81acc980af9
size 789

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:861fc904b7d4db43288fce85c8c1398726b54ac82d7bcbcebd8f12808cb5599b
size 1002

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:29e43c695763535251ab22af815651caa53d103b5fd168c72dfb9188e72e4ff4
size 1244

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3731448fe4d87b204e185829237a6a6b0140aed2fb27eea0533883a4cf4ed79d
size 1065

@ -60,14 +60,14 @@ Current gitlab runner tags for VTK-m are:
Used to state that we require a linux based gitlab-runner
- large-memory
Used to state that this step will require a machine that has lots of memory.
This is currently used for cuda `build` requests
This is currently used for CUDA `build` requests
- cuda-rt
Used to state that the runner is required to have the cuda runtime enviornment.
This isn't required to `build` VTK-m, only `test`
Used to state that the runner is required to have the CUDA runtime environment.
This is required to `build` and `test` VTK-m when using CUDA
- maxwell
- pascal
- turing
Only used on a `test` stage to signifiy which GPU hardware is required to
Only used on a `test` stage to signify which GPU hardware is required to
run the VTK-m tests
# How to use docker builders locally
@ -118,10 +118,9 @@ compilation of VTK-m. Instead of doing the compilation, instead you will be give
./reproduce_ci_env.py run rhel8
```
To compile VTK-m from the the interactive shell you would do the following:
To compile VTK-m from the interactive shell with the settings of the CI job you would do the following:
```
> src]# cd build/
> build]# cmake --build .
> src]# bash /run-gitlab-stage.sh
```
# How to Add/Update Kitware Gitlab CI
@ -259,22 +258,3 @@ sudo docker login --username=<docker_hub_name>
cd .gitlab/ci/docker
sudo ./update_all.sh 20201230
```
# ECP OSTI CI
`.gitlab-ci-ecp.yml` allows for VTK-m to run CI on hardware provided by ECP at NMC.
To have this work properly you will need to make sure that the gitlab repository
has been updated to this non-standard yaml file location
( "Settings" -> "CI/CD" -> "General pipelines" -> "Custom CI configuration path").
The ECP CI is setup to verify VTK-m mainly on Power9 hardware as that currently is
missing from VTK-m standard CI infrastructure.
Currently we verify Power9 support with `cuda` and `openmp` builders. The `cuda` builder
is setup to use the default cuda SDK on the machine and the required `c++` compiler which
currently is `gcc-4.8.5`. The `openmp` builder is setup to use the newest `c++` compiler provided
on the machine so that we maximize compiler coverage.
## Issues
Currently these builders don't report back to the VTK-m CDash instance.

@ -0,0 +1,7 @@
# Remove VTKDataSetWriter::WriteDataSet just_points parameter
In the method `VTKDataSetWriter::WriteDataSet`, the `just_points` parameter has been
removed due to lack of usage.
The purpose of `just_points` was to allow exporting only the points of a
DataSet without its cell data.

@ -0,0 +1,5 @@
# Add Kokkos backend
Adds a new device backend `Kokkos` which uses the Kokkos library for parallelism.
User must provide the Kokkos build and VTK-m will use the default configured execution
space.

@ -0,0 +1,226 @@
# Extract component arrays from unknown arrays
One of the problems with the data structures of VTK-m is that non-templated
classes like `DataSet`, `Field`, and `UnknownArrayHandle` (formally
`VariantArrayHandle`) internally hold an `ArrayHandle` of a particular type
that has to be cast to the correct type before it can be reasonably used.
That in turn is problematic because the list of possible `ArrayHandle`
types is very long.
At one time we were trying to compensate for this by using
`ArrayHandleVirtual`. However, for technical reasons this class is
infeasible for every use case of VTK-m and has been deprecated. Also, this
was only a partial solution since using it still required different code
paths for, say, handling values of `vtkm::Float32` and `vtkm::Vec3f_32`
even though both are essentially arrays of 32-bit floats.
The extract component feature compensates for this problem by allowing you
to extract the components from an `ArrayHandle`. This feature allows you to
create a single code path to handle `ArrayHandle`s containing scalars or
vectors of any size. Furthermore, when you extract a component from an
array, the storage gets normalized so that one code path covers all storage
types.
## `ArrayExtractComponent`
The basic enabling feature is a new function named `ArrayExtractComponent`.
This function takes an `ArrayHandle` and an index to a component. It
then returns an `ArrayHandleStride` holding the selected component of each
entry in the original array.
We will get to the structure of `ArrayHandleStride` later. But the
important part is that `ArrayHandleStride` does _not_ depend on the storage
type of the original `ArrayHandle`. That means whether you extract a
component from `ArrayHandleBasic`, `ArrayHandleSOA`,
`ArrayHandleCartesianProduct`, or any other type, you get back the same
`ArrayHandleStride`. Likewise, regardless of whether the input
`ArrayHandle` has a `ValueType` of `FloatDefault`, `Vec2f`, `Vec3f`, or any
other `Vec` of a default float, you get the same `ArrayHandleStride`. Thus,
you can see how this feature can dramatically reduce code paths if used
correctly.
It should be noted that `ArrayExtractComponent` will (logically) flatten
the `ValueType` before extracting the component. Thus, nested `Vec`s such
as `Vec<Vec3f, 3>` will be treated as a `Vec<FloatDefault, 9>`. The
intention is so that the extracted component will always be a basic C type.
For the purposes of this document when we refer to the "component type", we
really mean the base component type.
Different `ArrayHandle` implementations provide their own implementations
for `ArrayExtractComponent` so that the component can be extracted without
deep copying all the data. We will visit how `ArrayHandleStride` can
represent different data layouts later, but first let's go into the main
use case.
## Extract components from `UnknownArrayHandle`
The principle use case for `ArrayExtractComponent` is to get an
`ArrayHandle` from an unknown array handle without iterating over _every_
possible type. (Rather, we iterate over a smaller set of types.) To
facilitate this, an `ExtractComponent` method has been added to
`UnknownArrayHandle`.
To use `UnknownArrayHandle::ExtractComponent`, you must give it the
component type. You can check for the correct component type by using the
`IsBaseComponentType` method. The method will then return an
`ArrayHandleStride` for the component type specified.
### Example
As an example, let's say you have a worklet, `FooWorklet`, that does some
per component operation on an array. Furthermore, let's say that you want
to implement a function that, to the best of your ability, can apply
`FooWorklet` on an array of any type. This function should be pre-compiled
into a library so it doesn't have to be compiled over and over again.
(`MapFieldPermutation` and `MapFieldMergeAverage` are real and important
examples that have this behavior.)
Without the extract component feature, the implementation might look
something like this (many practical details left out):
``` cpp
struct ApplyFooFunctor
{
template <typename ArrayType>
void operator()(const ArrayType& input, vtkm::cont::UnknownArrayHandle& output) const
{
ArrayType outputArray;
vtkm::cont::Invoke invoke;
invoke(FooWorklet{}, input, outputArray);
output = outputArray;
}
};
vtkm::cont::UnknownArrayHandle ApplyFoo(const vtkm::cont::UnknownArrayHandle& input)
{
vtkm::cont::UnknownArrayHandle output;
input.CastAndCallForTypes<vtkm::TypeListAll, VTKM_DEFAULT_STORAGE_LIST_TAG>(
ApplyFooFunctor{}, output);
return output;
}
```
Take a look specifically at the `CastAndCallForTypes` call near the bottom
of this example. It calls for all types in `vtkm::TypeListAll`, which is
about 40 instances. Then, it needs to be called for any type in the desired
storage list. This could include basic arrays, SOA arrays, and lots of
other specialized types. It would be expected for this code to generate
over 100 paths for `ApplyFooFunctor`. This in turn contains a worklet
invoke, which is not a small amount of code.
Now consider how we can use the `ExtractComponent` feature to reduce the
code paths:
``` cpp
struct ApplyFooFunctor
{
template <typename T>
void operator()(T,
const vtkm::cont::UnknownArrayHandle& input,
const vtkm::cont::UnknownArrayHandle& output) const
{
if (!input.IsBaseComponentType<T>()) { return; }
VTKM_ASSERT(output.IsBaseComponentType<T>());
vtkm::cont::Invoke invoke;
invoke(FooWorklet{}, input.ExtractComponent<T>(), output.ExtractComponent<T>());
}
};
vtkm::cont::UnknownArrayHandle ApplyFoo(const vtkm::cont::UnknownArrayHandle& input)
{
vtkm::cont::UnknownArrayHandle output = input.NewInstanceBasic();
output.Allocate(input.GetNumberOfValues());
vtkm::cont::ListForEach(ApplyFooFunctor{}, vtkm::TypeListScalarAll{}, input, output);
return output;
}
```
The number of lines of code is about the same, but take a look at the
`ListForEach` (which replaces the `CastAndCallForTypes`). This calling code
takes `TypeListScalarAll` instead of `TypeListAll`, which reduces the
instances created from around 40 to 13 (every basic C type). It is also no
longer dependent on the storage, so these 13 instances are it. As an
example of potential compile savings, changing the implementation of the
`MapFieldMergePermutation` and `MapFieldMergeAverage` functions in this way
reduced the filters_common library (on Mac, Debug build) by 24 MB (over a
third of the total size).
Another great advantage of this approach is that even though it takes less
time to compile and generates less code, it actually covers more cases.
Have an array containing values of `Vec<short, 13>`? No problem. The values
were actually stored in an `ArrayHandleReverse`? It will still work.
## `ArrayHandleStride`
This functionality is made possible with the new `ArrayHandleStride`. This
array behaves much like `ArrayHandleBasic`, except that it contains an
_offset_ parameter to specify where in the buffer array to start reading
and a _stride_ parameter to specify how many entries to skip for each
successive entry. `ArrayHandleStride` also has optional parameters
`divisor` and `modulo` that allow indices to be repeated at regular
intervals.
Here are how `ArrayHandleStride` extracts components from several common
arrays. For each of these examples, we assume that the `ValueType` of the
array is `Vec<T, N>`. They are each extracting _component_.
### Extracting from `ArrayHandleBasic`
When extracting from an `ArrayHandleBasic`, we just need to start at the
proper component and skip the length of the `Vec`.
* _offset_: _component_
* _stride_: `N`
### Extracting from `ArrayHandleSOA`
Since each component is held in a separate array, they are densely packed.
Each component could be represented by `ArrayHandleBasic`, but of course we
use `ArrayHandleStride` to keep the type consistent.
* _offset_: 0
* _stride_: 1
### Extracting from `ArrayHandleCartesianProduct`
This array is the basic reason for implementing the _divisor_ and _modulo_
parameters. Each of the 3 components have different parameters, which are
the following (given that _dims_[3] captures the size of the 3 arrays for
each dimension).
* _offset_: 0
* _stride_: 1
* case _component_ == 0
* _divisor_: _ignored_
* _modulo_: _dims_[0]
* case _component_ == 1
* _divisor_: _dims_[0]
* _modulo_: _dims_[1]
* case _component_ == 2
* _divisor_: _dims_[0]
* _modulo_: _ignored_
### Extracting from `ArrayHandleUniformPointCoordinates`
This array cannot be represented directly because it is fully implicit.
However, it can be trivially converted to `ArrayHandleCartesianProduct` in
typically very little memory. (In fact, EAVL always represented uniform
point coordinates by explicitly storing a Cartesian product.) Thus, for
very little overhead the `ArrayHandleStride` can be created.
## Runtime overhead of extracting components
These benefits come at a cost, but not a large one. The "biggest" cost is
the small cost of computing index arithmetic for each access into
`ArrayHandleStride`. To make this as efficient as possible, there are
conditions that skip over the modulo and divide steps if they are not
necessary. (Integer modulo and divide tend to take much longer than
addition and multiplication.) It is for this reason that we probably do not
want to use this method all the time.
Another cost is the fact that not every `ArrayHandle` can be represented by
`ArrayHandleStride` directly without copying. If you ask to extract a
component that cannot be directly represented, it will be copied into a
basic array, which is not great. To make matters worse, for technical
reasons this copy happens on the host rather than the device.

@ -0,0 +1,29 @@
# Create `ArrayHandleOffsetsToNumComponents`
`ArrayHandleOffsetsToNumComponents` is a fancy array that takes an array of
offsets and converts it to an array of the number of components for each
packed entry.
It is common in VTK-m to pack small vectors of variable sizes into a single
contiguous array. For example, cells in an explicit cell set can each have
a different amount of vertices (triangles = 3, quads = 4, tetra = 4, hexa =
8, etc.). Generally, to access items in this list, you need an array of
components in each entry and the offset for each entry. However, if you
have just the array of offsets in sorted order, you can easily derive the
number of components for each entry by subtracting adjacent entries. This
works best if the offsets array has a size that is one more than the number
of packed vectors with the first entry set to 0 and the last entry set to
the total size of the packed array (the offset to the end).
When packing data of this nature, it is common to start with an array that
is the number of components. You can convert that to an offsets array using
the `vtkm::cont::ConvertNumComponentsToOffsets` function. This will create
an offsets array with one extra entry as previously described. You can then
throw out the original number of components array and use the offsets with
`ArrayHandleOffsetsToNumComponents` to represent both the offsets and num
components while storing only one array.
This replaces the use of `ArrayHandleDecorator` in `CellSetExplicit`.
The two implementations should do the same thing, but the new
`ArrayHandleOffsetsToNumComponents` should be less complex for
compilers.

@ -0,0 +1,18 @@
# `ArrayRangeCompute` works on any array type without compiling device code
Originally, `ArrayRangeCompute` required you to know specifically the
`ArrayHandle` type (value type and storage type) and to compile using any
device compiler. The method is changed to include only overloads that have
precompiled versions of `ArrayRangeCompute`.
Additionally, an `ArrayRangeCompute` overload that takes an
`UnknownArrayHandle` has been added. In addition to allowing you to compute
the range of arrays of unknown types, this implementation of
`ArrayRangeCompute` serves as a fallback for `ArrayHandle` types that are
not otherwise explicitly supported.
If you really want to make sure that you compute the range directly on an
`ArrayHandle` of a particular type, you can include
`ArrayRangeComputeTemplate.h`, which contains a templated overload of
`ArrayRangeCompute` that directly computes the range of an `ArrayHandle`.
Including this header requires compiling for device code.

@ -0,0 +1,29 @@
# `vtkm::cont::internal::Buffer` now can have ownership transferred
Memory once transferred to `Buffer` always had to be managed by VTK-m. This is problematic
for applications that needed VTK-m to allocate memory, but have the memory ownership
be longer than VTK-m.
`Buffer::TakeHostBufferOwnership` allows for easy transfer ownership of memory out of VTK-m.
When taking ownership of a VTK-m buffer you are provided the following information:
- Memory: A `void*` pointer to the array
- Container: A `void*` pointer used to free the memory. This is necessary to support cases such as allocations transferred into VTK-m from a `std::vector`.
- Delete: The function to call to actually delete the transferred memory
- Reallocate: The function to call to re-allocate the transferred memory. This will throw an exception if users try
to reallocate a buffer that was 'view' only
- Size: The size in number of elements of the array
To properly steal memory from VTK-m you do the following:
```cpp
vtkm::cont::ArrayHandle<T> arrayHandle;
...
auto stolen = arrayHandle.GetBuffers()->TakeHostBufferOwnership();
...
stolen.Delete(stolen.Container);
```

202
docs/changelog/buffer.md Normal file

@ -0,0 +1,202 @@
# Redesign of ArrayHandle to access data using typeless buffers
The original implementation of `ArrayHandle` is meant to be very generic.
To define an `ArrayHandle`, you actually create a `Storage` class that
maintains the data and provides portals to access it (on the host). Because
the `Storage` can provide any type of data structure it wants, you also
need to define an `ArrayTransfer` that describes how to move the
`ArrayHandle` to and from a device. It also has to be repeated for every
translation unit that uses them.
This is a very powerful mechanism. However, one of the major problems with
this approach is that every `ArrayHandle` type needs to have a separate
compile path for every value type crossed with every device. Because of
this limitation, the `ArrayHandle` for the basic storage has a special
implementation that manages the actual data allocation and movement as
`void *` arrays. In this way all the data management can be compiled once
and put into the `vtkm_cont` library. This has dramatically improved the
VTK-m compile time.
This new design replicates the basic `ArrayHandle`'s success to all other
storage types. The basic idea is to make the implementation of
`ArrayHandle` storage slightly less generic. Instead of requiring it to
manage the data it stores, it instead just builds `ArrayPortal`s from
`void` pointers that it is given. The management of `void` pointers can be
done in non-templated classes that are compiled into a library.
This initial implementation does not convert all `ArrayHandle`s to avoid
making non-backward compatible changes before the next minor revision of
VTK-m. In particular, it would be particularly difficult to convert
`ArrayHandleVirtual`. It could be done, but it would be a lot of work for a
class that will likely be removed.
## Buffer
Key to these changes is the introduction of a
`vtkm::cont::internal::Buffer` object. As the name implies, the `Buffer`
object manages a single block of bytes. `Buffer` is agnostic to the type of
data being stored. It only knows the length of the buffer in bytes. It is
responsible for allocating space on the host and any devices as necessary
and for transferring data among them. (Since `Buffer` knows nothing about
the type of data, a precondition of VTK-m would be that the host and all
devices have to have the same endian.)
The idea of the `Buffer` object is similar in nature to the existing
`vtkm::cont::internal::ExecutionArrayInterfaceBasicBase` except that it
will manage a buffer of data among the control and all devices rather than
in one device through a templated subclass.
As explained below, `ArrayHandle` holds some fixed number of `Buffer`
objects. (The number can be zero for implicit `ArrayHandle`s.) Because all
the interaction with the devices happen through `Buffer`, it will no longer
be necessary to compile any reference to `ArrayHandle` for devices (e.g.
you won't have to use nvcc just because the code links `ArrayHandle.h`).
## Storage
The `vtkm::cont::internal::Storage` class changes dramatically. Although an
instance will be kept, the intention is for `Storage` itself to be a
stateless object. It will manage its data through `Buffer` objects provided
from the `ArrayHandle`.
That said, it is possible for `Storage` to have some state. For example,
the `Storage` for `ArrayHandleImplicit` must hold on to the instance of the
portal used to manage the state.
## ArrayTransfer
The `vtkm::cont::internal::ArrayTransfer` class will be removed completely.
All data transfers will be handled internally with the `Buffer` object.
## Portals
A big change for this design is that the type of a portal for an
`ArrayHandle` will be the same for all devices and the host. Thus, we no
longer need specialized versions of portals for each device. We only have
one portal type. And since they are constructed from `void *` pointers, one
method can create them all.
## Advantages
The `ArrayHandle` interface should not change significantly for external
uses, but this redesign offers several advantages.
### Faster Compiles
Because the memory management is contained in a non-templated `Buffer`
class, it can be compiled once in a library and used by all template
instances of `ArrayHandle`. It should have similar compile advantages to
our current specialization of the basic `ArrayHandle`, but applied to all
types of `ArrayHandle`s.
### Fewer Templates
Hand-in-hand with faster compiles, the new design should require fewer
templates and template instances. We have immediately gotten rid of
`ArrayTransfer`. `Storage` is also much shorter. Because all
`ArrayPortal`s are the same for every device and the host, we need many
fewer versions of those classes. In the device adapter, we can probably
collapse the three `ArrayManagerExecution` classes into a single, much
simpler class that does simple memory allocation and copy.
### Fewer files need to be compiled for CUDA
Including `ArrayHandle.h` no longer adds code that compiles for a device.
Thus, we should no longer need to compile for a specific device adapter
just because we access an `ArrayHandle`. This should make it much easier to
achieve our goal of a "firewall". That is, code that just calls VTK-m
filters does not need to support all its compilers and flags.
### Simpler ArrayHandle specialization
The newer code should simplify the implementation of special `ArrayHandle`s
a bit. You need only implement an `ArrayPortal` that operates on one or
more `void *` arrays and a simple `Storage` class.
### Out of band memory sharing
With the current version of `ArrayHandle`, if you want to take data from
one `ArrayHandle` you pretty much have to create a special template to wrap
another `ArrayHandle` around that. With this new design, it is possible to
take data from one `ArrayHandle` and give it to another `ArrayHandle` of a
completely different type. You can't do this willy-nilly since different
`ArrayHandle` types will interpret buffers differently. But there can be
some special important use cases.
One such case could be an `ArrayHandle` that provides strided access to a
buffer. (Let's call it `ArrayHandleStride`.) The idea is that it interprets
the buffer as an array for a particular type (like a basic `ArrayHandle`)
but also defines a stride, skip, and repeat so that given an index it looks
up the value `((index / skip) % repeat) * stride`. The point is that it can
take an AoS array of tuples and represent an array of one of the
components.
The point would be that if you had a `VariantArrayHandle` or `Field`, you
could pull out an array of one of the components as an `ArrayHandleStride`.
An `ArrayHandleStride<vtkm::Float32>` could be used to represent that data
that comes from any basic `ArrayHandle` with `vtkm::Float32` or a
`vtkm::Vec` of that type. It could also represent data from an
`ArrayHandleCartesianProduct` and `ArrayHandleSoA`. We could even represent
an `ArrayHandleUniformPointCoordinates` by just making a small array. This
allows us to statically access a whole bunch of potential array storage
classes with a single type.
### Potentially faster device transfers
There is currently a fast-path for basic `ArrayHandle`s that does a block
cuda memcpy between host and device. But for other `ArrayHandle`s that do
not defer their `ArrayTransfer` to a sub-array, the transfer first has to
copy the data into a known buffer.
Because this new design stores all data in `Buffer` objects, any of these
can be easily and efficiently copied between devices.
## Disadvantages
This new design gives up some features of the original `ArrayHandle` design.
### Can only interface data that can be represented in a fixed number of buffers
Because the original `ArrayHandle` design required the `Storage` to
completely manage the data, it could represent it in any way possible. In
this redesign, the data need to be stored in some fixed number of memory
buffers.
This is a pretty open requirement. I suspect most data formats will be
storable in this. The user's guide has an example of data stored in a
`std::deque` that will not be representable. But that is probably not a
particularly practical example.
### VTK-m would only be able to support hosts and devices with the same endian
Because data are transferred as `void *` blocks of memory, there is no way
to correct words if the endianness of the two devices does not agree. As far as
I know, there should be no issues with the proposed ECP machines.
If endian becomes an issue, it might be possible to specify a word length
in the `Buffer`. That would assume that all numbers stored in the `Buffer`
have the same word length.
### ArrayPortals must be completely recompiled in each translation unit
We can declare that an `ArrayHandle` does not need to include the device
adapter header files in part because it no longer needs specialized
`ArrayPortal`s for each device. However, that means that a translation unit
compiled with the host compiler (say gcc) will produce different code for
the `ArrayPortal`s than those with the device compiler (say nvcc). This
could lead to numerous linking problems.
To get around these issues, we will probably have to enforce no exporting
of any of the `ArrayPortal` symbols and force them all to be recompiled for
each translation unit. This will serve to increase the compile times a bit.
We will probably also still encounter linking errors as there would be no
way to enforce this requirement.
### Cannot have specialized portals for the control environment
Because the new design unifies `ArrayPortal` types across control and
execution environments, it is no longer possible to have a special version
for the control environment to manage resources. This will require removing
some recent behavior of control portals such as with MR !1988.

@ -0,0 +1,9 @@
# Precompiled `ArrayCopy` for `UnknownArrayHandle`
Previously, in order to copy an `UnknownArrayHandle`, you had to specify
some subset of types and then specially compile a copy for each potential
type. With the new ability to extract a component from an
`UnknownArrayHandle`, it is now feasible to precompile copying an
`UnknownArrayHandle` to another array. This greatly reduces the overhead of
using `ArrayCopy` to copy `UnknownArrayHandle`s while simultaneously
increasing the likelihood that the copy will be successful.

@ -0,0 +1,10 @@
# Disable asserts for CUDA architecture builds
`assert` is supported on recent CUDA cards, but compiling it appears to be
very slow. By default, the `VTKM_ASSERT` macro has been disabled whenever
compiling for a CUDA device (i.e. when `__CUDA_ARCH__` is defined).
Asserts for CUDA devices can be turned back on by turning the
`VTKm_NO_ASSERT_CUDA` CMake variable off. Turning this CMake variable off
will enable assertions in CUDA kernels unless there is another reason
for turning off all asserts (such as a release build).

@ -0,0 +1,39 @@
# Deprecate ArrayHandleVirtualCoordinates
As we port VTK-m to more types of accelerator architectures, supporting
virtual methods is becoming more problematic. Thus, we are working to back
out of using virtual methods in the execution environment.
One of the most widespread users of virtual methods in the execution
environment is `ArrayHandleVirtual`. As a first step of deprecating this
class, we first deprecate the `ArrayHandleVirtualCoordinates` subclass.
Not surprisingly, `ArrayHandleVirtualCoordinates` is used directly by
`CoordinateSystem`. The biggest change necessary was that the `GetData`
method returned an `ArrayHandleVirtualCoordinates`, which obviously would
not work if that class is deprecated.
An oddness about this return type is that it is quite different from the
superclass's method of the same name. Rather, `Field` returns a
`VariantArrayHandle`. Since this had to be corrected anyway, it was decided
to change `CoordinateSystem`'s `GetData` to also return a
`VariantArrayHandle`, although its typelist is set to just `vtkm::Vec3f`.
To try to still support old code that expects the deprecated behavior of
returning an `ArrayHandleVirtualCoordinates`, `CoordinateSystem::GetData`
actually returns a "hidden" subclass of `VariantArrayHandle` that
automatically converts itself to an `ArrayHandleVirtualCoordinates`. (A
deprecation warning is given if this is done.)
This approach to support deprecated code is not perfect. The returned value
for `CoordinateSystem::GetData` can only be used as an `ArrayHandle` if a
method is directly called on it or if it is cast specifically to
`ArrayHandleVirtualCoordinates` or its superclass. For example, if passing
it to a method argument typed as `vtkm::cont::ArrayHandle<T, S>` where `T`
and `S` are template parameters, then the conversion will fail.
To continue to support ease of use, `CoordinateSystem` now has a method
named `GetDataAsMultiplexer` that returns the data as an
`ArrayHandleMultiplexer`. This can be employed to quickly use the
`CoordinateSystem` as an array without the overhead of a `CastAndCall`.

@ -0,0 +1,17 @@
# Virtual methods in execution environment deprecated
The use of classes with any virtual methods in the execution environment is
deprecated. Although we had code to correctly build virtual methods on some
devices such as CUDA, this feature was not universally supported on all
programming models we wish to support. Plus, the implementation of virtual
methods is not hugely convenient on CUDA because the virtual methods could
not be embedded in a library. To get around virtual methods declared in
different libraries, all builds had to be static, and a special linking
step to pull in possible virtual method implementations was required.
For these reasons, VTK-m is no longer relying on virtual methods. (Other
approaches like multiplexers are used instead.) The code will be officially
removed in version 2.0. It is still supported in a deprecated sense (you
should get a warning). However, if you want to build without virtual
methods, you can set the `VTKm_NO_DEPRECATED_VIRTUAL` CMake flag, and they
will not be compiled.

Some files were not shown because too many files have changed in this diff Show More