Merge topic 'openmp'

183bcf109 Add initial version of an OpenMP backend.
7b5ad3e80 Expand device scheduler test to check for overlap.
e621b6ba3 Generalize the TBB radix sort implementation.
d60278434 Specialize swap for ArrayPortalValueReference types.
761f8986f Cache inputs to SSI::Unique benchmark.

Acked-by: Kitware Robot <kwrobot@kitware.com>
Acked-by: Robert Maynard <robert.maynard@kitware.com>
Merge-request: !1099
commit d833d9285f
Allison Vacanti, 2018-05-31 20:49:08 +00:00 (committed by Kitware Robot)
59 changed files with 5441 additions and 999 deletions

CMake/FindOpenMP.cmake (new file, 551 lines)

@ -0,0 +1,551 @@
##=============================================================================
##
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
##
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2018 UT-Battelle, LLC.
## Copyright 2018 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##
##=============================================================================
#.rst:
# FindOpenMP
# ----------
#
# Finds OpenMP support
#
# This module can be used to detect OpenMP support in a compiler. If
# the compiler supports OpenMP, the flags required to compile with
# OpenMP support are returned in variables for the different languages.
# The variables may be empty if the compiler does not need a special
# flag to support OpenMP.
#
# Variables
# ^^^^^^^^^
#
# The module exposes the components ``C``, ``CXX``, and ``Fortran``.
# Each component controls the search for OpenMP support in the corresponding language.
#
# Depending on the enabled components the following variables will be set:
#
# ``OpenMP_FOUND``
# Variable indicating that OpenMP flags for all requested languages have been found.
# If no components are specified, this is true if OpenMP settings for all enabled languages
# were detected.
# ``OpenMP_VERSION``
# Minimal version of the OpenMP standard detected among the requested languages,
# or all enabled languages if no components were specified.
#
# This module will set the following variables per language in your
# project, where ``<lang>`` is one of C, CXX, or Fortran:
#
# ``OpenMP_<lang>_FOUND``
# Variable indicating if OpenMP support for ``<lang>`` was detected.
# ``OpenMP_<lang>_FLAGS``
# OpenMP compiler flags for ``<lang>``, separated by spaces.
#
# For linking with OpenMP code written in ``<lang>``, the following
# variables are provided:
#
# ``OpenMP_<lang>_LIB_NAMES``
# :ref:`;-list <CMake Language Lists>` of libraries for OpenMP programs for ``<lang>``.
# ``OpenMP_<libname>_LIBRARY``
# Location of the individual libraries needed for OpenMP support in ``<lang>``.
# ``OpenMP_<lang>_LIBRARIES``
# A list of libraries needed to link with OpenMP code written in ``<lang>``.
#
# Additionally, the module provides :prop_tgt:`IMPORTED` targets:
#
# ``OpenMP::OpenMP_<lang>``
# Target for using OpenMP from ``<lang>``.
#
# Specifically for Fortran, the module sets the following variables:
#
# ``OpenMP_Fortran_HAVE_OMPLIB_HEADER``
# Boolean indicating if OpenMP is accessible through ``omp_lib.h``.
# ``OpenMP_Fortran_HAVE_OMPLIB_MODULE``
# Boolean indicating if OpenMP is accessible through the ``omp_lib`` Fortran module.
#
# The module will also try to provide the OpenMP version variables:
#
# ``OpenMP_<lang>_SPEC_DATE``
# Date of the OpenMP specification implemented by the ``<lang>`` compiler.
# ``OpenMP_<lang>_VERSION_MAJOR``
# Major version of OpenMP implemented by the ``<lang>`` compiler.
# ``OpenMP_<lang>_VERSION_MINOR``
# Minor version of OpenMP implemented by the ``<lang>`` compiler.
# ``OpenMP_<lang>_VERSION``
# OpenMP version implemented by the ``<lang>`` compiler.
#
# The specification date is formatted as given in the OpenMP standard:
# ``yyyymm`` where ``yyyy`` and ``mm`` represent the year and month of
# the OpenMP specification implemented by the ``<lang>`` compiler.
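#
# Usage sketch (not part of this module's documentation; the ``my_app`` target
# below is hypothetical). Once the module has been found, the imported targets
# can be consumed directly:
#
# .. code-block:: cmake
#
#   find_package(OpenMP REQUIRED COMPONENTS CXX)
#   add_executable(my_app main.cxx)
#   target_link_libraries(my_app PRIVATE OpenMP::OpenMP_CXX)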
cmake_policy(PUSH)
cmake_policy(SET CMP0012 NEW) # if() recognizes numbers and booleans
cmake_policy(SET CMP0054 NEW) # if() quoted variables not dereferenced
cmake_policy(SET CMP0057 NEW) # if IN_LIST
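# Build the list of candidate OpenMP flags for the given language. If the user
# has already set OpenMP_<LANG>_FLAG, only that value is tried.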
function(_OPENMP_FLAG_CANDIDATES LANG)
if(NOT OpenMP_${LANG}_FLAG)
unset(OpenMP_FLAG_CANDIDATES)
set(OMP_FLAG_GNU "-fopenmp")
set(OMP_FLAG_Clang "-fopenmp=libomp" "-fopenmp=libiomp5" "-fopenmp")
set(OMP_FLAG_AppleClang "-Xclang -fopenmp")
set(OMP_FLAG_HP "+Oopenmp")
if(WIN32)
set(OMP_FLAG_Intel "-Qopenmp")
elseif(CMAKE_${LANG}_COMPILER_ID STREQUAL "Intel" AND
"${CMAKE_${LANG}_COMPILER_VERSION}" VERSION_LESS "15.0.0.20140528")
set(OMP_FLAG_Intel "-openmp")
else()
set(OMP_FLAG_Intel "-qopenmp")
endif()
set(OMP_FLAG_MIPSpro "-mp")
set(OMP_FLAG_MSVC "-openmp")
set(OMP_FLAG_PathScale "-openmp")
set(OMP_FLAG_NAG "-openmp")
set(OMP_FLAG_Absoft "-openmp")
set(OMP_FLAG_PGI "-mp")
set(OMP_FLAG_Flang "-fopenmp")
set(OMP_FLAG_SunPro "-xopenmp")
set(OMP_FLAG_XL "-qsmp=omp")
# The Cray compiler activates OpenMP with -h omp, which is enabled by default.
set(OMP_FLAG_Cray " " "-h omp")
# If we know the correct flags, use those
if(DEFINED OMP_FLAG_${CMAKE_${LANG}_COMPILER_ID})
set(OpenMP_FLAG_CANDIDATES "${OMP_FLAG_${CMAKE_${LANG}_COMPILER_ID}}")
# Fall back to reasonable default tries otherwise
else()
set(OpenMP_FLAG_CANDIDATES "-openmp" "-fopenmp" "-mp" " ")
endif()
set(OpenMP_${LANG}_FLAG_CANDIDATES "${OpenMP_FLAG_CANDIDATES}" PARENT_SCOPE)
else()
set(OpenMP_${LANG}_FLAG_CANDIDATES "${OpenMP_${LANG}_FLAG}" PARENT_SCOPE)
endif()
endfunction()
# sample openmp source code to test
set(OpenMP_C_CXX_TEST_SOURCE
"
#include <omp.h>
int main() {
#ifdef _OPENMP
int n = omp_get_max_threads();
return 0;
#else
breaks_on_purpose
#endif
}
")
# In Fortran, an implementation may provide an omp_lib.h header, an omp_lib
# module, or both (OpenMP standard, section 3.1).
# Furthermore, !$ is the Fortran equivalent of #ifdef _OPENMP (OpenMP standard, 2.2.2).
# Without the conditional compilation, some compilers (e.g. PGI) might compile OpenMP code
# while not actually enabling OpenMP, building the code sequentially.
set(OpenMP_Fortran_TEST_SOURCE
"
program test
@OpenMP_Fortran_INCLUDE_LINE@
!$ integer :: n
n = omp_get_num_threads()
end program test
"
)
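# Write the test source for the given language into the FindOpenMP work
# directory. Fortran sources are passed through configure_file() so that
# @OpenMP_Fortran_INCLUDE_LINE@ gets substituted.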
function(_OPENMP_WRITE_SOURCE_FILE LANG SRC_FILE_CONTENT_VAR SRC_FILE_NAME SRC_FILE_FULLPATH)
set(WORK_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/FindOpenMP)
if("${LANG}" STREQUAL "C")
set(SRC_FILE "${WORK_DIR}/${SRC_FILE_NAME}.c")
file(WRITE "${SRC_FILE}" "${OpenMP_C_CXX_${SRC_FILE_CONTENT_VAR}}")
elseif("${LANG}" STREQUAL "CXX")
set(SRC_FILE "${WORK_DIR}/${SRC_FILE_NAME}.cpp")
file(WRITE "${SRC_FILE}" "${OpenMP_C_CXX_${SRC_FILE_CONTENT_VAR}}")
elseif("${LANG}" STREQUAL "Fortran")
set(SRC_FILE "${WORK_DIR}/${SRC_FILE_NAME}.f90")
file(WRITE "${SRC_FILE}_in" "${OpenMP_Fortran_${SRC_FILE_CONTENT_VAR}}")
configure_file("${SRC_FILE}_in" "${SRC_FILE}" @ONLY)
endif()
set(${SRC_FILE_FULLPATH} "${SRC_FILE}" PARENT_SCOPE)
endfunction()
include(CMakeParseImplicitLinkInfo)
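# Try to compile the OpenMP test source with each candidate flag. On success,
# the compiler's verbose output is parsed with cmake_parse_implicit_link_info()
# to discover any OpenMP runtime libraries that must be linked explicitly.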
function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR)
_OPENMP_FLAG_CANDIDATES("${LANG}")
_OPENMP_WRITE_SOURCE_FILE("${LANG}" "TEST_SOURCE" OpenMPTryFlag _OPENMP_TEST_SRC)
unset(OpenMP_VERBOSE_COMPILE_OPTIONS)
separate_arguments(OpenMP_VERBOSE_OPTIONS NATIVE_COMMAND "${CMAKE_${LANG}_VERBOSE_FLAG}")
foreach(_VERBOSE_OPTION IN LISTS OpenMP_VERBOSE_OPTIONS)
if(NOT _VERBOSE_OPTION MATCHES "^-Wl,")
list(APPEND OpenMP_VERBOSE_COMPILE_OPTIONS ${_VERBOSE_OPTION})
endif()
endforeach()
foreach(OPENMP_FLAG IN LISTS OpenMP_${LANG}_FLAG_CANDIDATES)
set(OPENMP_FLAGS_TEST "${OPENMP_FLAG}")
if(OpenMP_VERBOSE_COMPILE_OPTIONS)
string(APPEND OPENMP_FLAGS_TEST " ${OpenMP_VERBOSE_COMPILE_OPTIONS}")
endif()
string(REGEX REPLACE "[-/=+]" "" OPENMP_PLAIN_FLAG "${OPENMP_FLAG}")
try_compile( OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG} ${CMAKE_BINARY_DIR} ${_OPENMP_TEST_SRC}
CMAKE_FLAGS "-DCOMPILE_DEFINITIONS:STRING=${OPENMP_FLAGS_TEST}"
LINK_LIBRARIES ${CMAKE_${LANG}_VERBOSE_FLAG}
OUTPUT_VARIABLE OpenMP_TRY_COMPILE_OUTPUT
)
if(OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG})
set("${OPENMP_FLAG_VAR}" "${OPENMP_FLAG}" PARENT_SCOPE)
if(CMAKE_${LANG}_VERBOSE_FLAG)
unset(OpenMP_${LANG}_IMPLICIT_LIBRARIES)
unset(OpenMP_${LANG}_IMPLICIT_LINK_DIRS)
unset(OpenMP_${LANG}_IMPLICIT_FWK_DIRS)
unset(OpenMP_${LANG}_LOG_VAR)
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
"Detecting ${LANG} OpenMP compiler ABI info compiled with the following output:\n${OpenMP_TRY_COMPILE_OUTPUT}\n\n")
cmake_parse_implicit_link_info("${OpenMP_TRY_COMPILE_OUTPUT}"
OpenMP_${LANG}_IMPLICIT_LIBRARIES
OpenMP_${LANG}_IMPLICIT_LINK_DIRS
OpenMP_${LANG}_IMPLICIT_FWK_DIRS
OpenMP_${LANG}_LOG_VAR
"${CMAKE_${LANG}_IMPLICIT_OBJECT_REGEX}"
)
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
"Parsed ${LANG} OpenMP implicit link information from above output:\n${OpenMP_${LANG}_LOG_VAR}\n\n")
unset(_OPENMP_LIB_NAMES)
foreach(_OPENMP_IMPLICIT_LIB IN LISTS OpenMP_${LANG}_IMPLICIT_LIBRARIES)
get_filename_component(_OPENMP_IMPLICIT_LIB_DIR "${_OPENMP_IMPLICIT_LIB}" DIRECTORY)
get_filename_component(_OPENMP_IMPLICIT_LIB_NAME "${_OPENMP_IMPLICIT_LIB}" NAME)
get_filename_component(_OPENMP_IMPLICIT_LIB_PLAIN "${_OPENMP_IMPLICIT_LIB}" NAME_WE)
string(REGEX REPLACE "([][+.*?()^$])" "\\\\\\1" _OPENMP_IMPLICIT_LIB_PLAIN_ESC "${_OPENMP_IMPLICIT_LIB_PLAIN}")
string(REGEX REPLACE "([][+.*?()^$])" "\\\\\\1" _OPENMP_IMPLICIT_LIB_PATH_ESC "${_OPENMP_IMPLICIT_LIB}")
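# Skip libraries that are already part of the default ${LANG} link line
# (implicit or standard libraries); only the additional libraries pulled in
# by the OpenMP flag are recorded.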
if(NOT ( "${_OPENMP_IMPLICIT_LIB}" IN_LIST CMAKE_${LANG}_IMPLICIT_LINK_LIBRARIES
OR "${CMAKE_${LANG}_STANDARD_LIBRARIES}" MATCHES "(^| )(-Wl,)?(-l)?(${_OPENMP_IMPLICIT_LIB_PLAIN_ESC}|${_OPENMP_IMPLICIT_LIB_PATH_ESC})( |$)"
OR "${CMAKE_${LANG}_LINK_EXECUTABLE}" MATCHES "(^| )(-Wl,)?(-l)?(${_OPENMP_IMPLICIT_LIB_PLAIN_ESC}|${_OPENMP_IMPLICIT_LIB_PATH_ESC})( |$)" ) )
if(_OPENMP_IMPLICIT_LIB_DIR)
set(OpenMP_${_OPENMP_IMPLICIT_LIB_PLAIN}_LIBRARY "${_OPENMP_IMPLICIT_LIB}" CACHE FILEPATH
"Path to the ${_OPENMP_IMPLICIT_LIB_PLAIN} library for OpenMP")
else()
find_library(OpenMP_${_OPENMP_IMPLICIT_LIB_PLAIN}_LIBRARY
NAMES "${_OPENMP_IMPLICIT_LIB_NAME}"
DOC "Path to the ${_OPENMP_IMPLICIT_LIB_PLAIN} library for OpenMP"
HINTS ${OpenMP_${LANG}_IMPLICIT_LINK_DIRS}
CMAKE_FIND_ROOT_PATH_BOTH
NO_DEFAULT_PATH
)
endif()
mark_as_advanced(OpenMP_${_OPENMP_IMPLICIT_LIB_PLAIN}_LIBRARY)
list(APPEND _OPENMP_LIB_NAMES ${_OPENMP_IMPLICIT_LIB_PLAIN})
endif()
endforeach()
set("${OPENMP_LIB_NAMES_VAR}" "${_OPENMP_LIB_NAMES}" PARENT_SCOPE)
else()
# We do not know how to extract implicit OpenMP libraries for this compiler.
# Assume that it handles them automatically, e.g. the Intel Compiler on
# Windows should put the dependency in its object files.
set("${OPENMP_LIB_NAMES_VAR}" "" PARENT_SCOPE)
endif()
break()
elseif(CMAKE_${LANG}_COMPILER_ID STREQUAL "AppleClang"
AND CMAKE_${LANG}_COMPILER_VERSION VERSION_GREATER_EQUAL "7.0")
# Check for separate OpenMP library on AppleClang 7+
find_library(OpenMP_libomp_LIBRARY
NAMES omp gomp iomp5
HINTS ${CMAKE_${LANG}_IMPLICIT_LINK_DIRECTORIES}
)
mark_as_advanced(OpenMP_libomp_LIBRARY)
if(OpenMP_libomp_LIBRARY)
try_compile( OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG} ${CMAKE_BINARY_DIR} ${_OPENMP_TEST_SRC}
CMAKE_FLAGS "-DCOMPILE_DEFINITIONS:STRING=${OPENMP_FLAGS_TEST}"
LINK_LIBRARIES ${CMAKE_${LANG}_VERBOSE_FLAG} ${OpenMP_libomp_LIBRARY}
OUTPUT_VARIABLE OpenMP_TRY_COMPILE_OUTPUT
)
if(OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG})
set("${OPENMP_FLAG_VAR}" "${OPENMP_FLAG}" PARENT_SCOPE)
set("${OPENMP_LIB_NAMES_VAR}" "libomp" PARENT_SCOPE)
break()
endif()
endif()
endif()
set("${OPENMP_LIB_NAMES_VAR}" "NOTFOUND" PARENT_SCOPE)
set("${OPENMP_FLAG_VAR}" "NOTFOUND" PARENT_SCOPE)
endforeach()
unset(OpenMP_VERBOSE_COMPILE_OPTIONS)
endfunction()
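# The version-check sources below embed the value of _OPENMP (or
# openmp_version in Fortran) as a readable "INFO:OpenMP-date[...]" string in
# the compiled binary so that the specification date can be extracted with
# file(STRINGS) without running the test program.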
set(OpenMP_C_CXX_CHECK_VERSION_SOURCE
"
#include <stdio.h>
#include <omp.h>
const char ompver_str[] = { 'I', 'N', 'F', 'O', ':', 'O', 'p', 'e', 'n', 'M',
'P', '-', 'd', 'a', 't', 'e', '[',
('0' + ((_OPENMP/100000)%10)),
('0' + ((_OPENMP/10000)%10)),
('0' + ((_OPENMP/1000)%10)),
('0' + ((_OPENMP/100)%10)),
('0' + ((_OPENMP/10)%10)),
('0' + ((_OPENMP/1)%10)),
']', '\\0' };
int main()
{
puts(ompver_str);
return 0;
}
")
set(OpenMP_Fortran_CHECK_VERSION_SOURCE
"
program omp_ver
@OpenMP_Fortran_INCLUDE_LINE@
integer, parameter :: zero = ichar('0')
integer, parameter :: ompv = openmp_version
character, dimension(24), parameter :: ompver_str =&
(/ 'I', 'N', 'F', 'O', ':', 'O', 'p', 'e', 'n', 'M', 'P', '-',&
'd', 'a', 't', 'e', '[',&
char(zero + mod(ompv/100000, 10)),&
char(zero + mod(ompv/10000, 10)),&
char(zero + mod(ompv/1000, 10)),&
char(zero + mod(ompv/100, 10)),&
char(zero + mod(ompv/10, 10)),&
char(zero + mod(ompv/1, 10)), ']' /)
print *, ompver_str
end program omp_ver
")
function(_OPENMP_GET_SPEC_DATE LANG SPEC_DATE)
_OPENMP_WRITE_SOURCE_FILE("${LANG}" "CHECK_VERSION_SOURCE" OpenMPCheckVersion _OPENMP_TEST_SRC)
set(BIN_FILE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/FindOpenMP/ompver_${LANG}.bin")
string(REGEX REPLACE "[-/=+]" "" OPENMP_PLAIN_FLAG "${OPENMP_FLAG}")
try_compile(OpenMP_SPECTEST_${LANG}_${OPENMP_PLAIN_FLAG} "${CMAKE_BINARY_DIR}" "${_OPENMP_TEST_SRC}"
CMAKE_FLAGS "-DCOMPILE_DEFINITIONS:STRING=${OpenMP_${LANG}_FLAGS}"
COPY_FILE ${BIN_FILE})
if(${OpenMP_SPECTEST_${LANG}_${OPENMP_PLAIN_FLAG}})
file(STRINGS ${BIN_FILE} specstr LIMIT_COUNT 1 REGEX "INFO:OpenMP-date")
set(regex_spec_date ".*INFO:OpenMP-date\\[0*([^]]*)\\].*")
if("${specstr}" MATCHES "${regex_spec_date}")
set(${SPEC_DATE} "${CMAKE_MATCH_1}" PARENT_SCOPE)
endif()
endif()
endfunction()
macro(_OPENMP_SET_VERSION_BY_SPEC_DATE LANG)
set(OpenMP_SPEC_DATE_MAP
# Preview versions
"201611=5.0" # OpenMP 5.0 preview 1
# Combined versions, 2.5 onwards
"201511=4.5"
"201307=4.0"
"201107=3.1"
"200805=3.0"
"200505=2.5"
# C/C++ version 2.0
"200203=2.0"
# Fortran version 2.0
"200011=2.0"
# Fortran version 1.1
"199911=1.1"
# C/C++ version 1.0 (there's no 1.1 for C/C++)
"199810=1.0"
# Fortran version 1.0
"199710=1.0"
)
if(OpenMP_${LANG}_SPEC_DATE)
string(REGEX MATCHALL "${OpenMP_${LANG}_SPEC_DATE}=([0-9]+)\\.([0-9]+)" _version_match "${OpenMP_SPEC_DATE_MAP}")
else()
set(_version_match "")
endif()
if(NOT _version_match STREQUAL "")
set(OpenMP_${LANG}_VERSION_MAJOR ${CMAKE_MATCH_1})
set(OpenMP_${LANG}_VERSION_MINOR ${CMAKE_MATCH_2})
set(OpenMP_${LANG}_VERSION "${OpenMP_${LANG}_VERSION_MAJOR}.${OpenMP_${LANG}_VERSION_MINOR}")
else()
unset(OpenMP_${LANG}_VERSION_MAJOR)
unset(OpenMP_${LANG}_VERSION_MINOR)
unset(OpenMP_${LANG}_VERSION)
endif()
unset(_version_match)
unset(OpenMP_SPEC_DATE_MAP)
endmacro()
foreach(LANG IN ITEMS C CXX)
if(CMAKE_${LANG}_COMPILER_LOADED)
if(NOT DEFINED OpenMP_${LANG}_FLAGS OR "${OpenMP_${LANG}_FLAGS}" STREQUAL "NOTFOUND"
OR NOT DEFINED OpenMP_${LANG}_LIB_NAMES OR "${OpenMP_${LANG}_LIB_NAMES}" STREQUAL "NOTFOUND")
_OPENMP_GET_FLAGS("${LANG}" "${LANG}" OpenMP_${LANG}_FLAGS_WORK OpenMP_${LANG}_LIB_NAMES_WORK)
endif()
set(OpenMP_${LANG}_FLAGS "${OpenMP_${LANG}_FLAGS_WORK}"
CACHE STRING "${LANG} compiler flags for OpenMP parallelization")
set(OpenMP_${LANG}_LIB_NAMES "${OpenMP_${LANG}_LIB_NAMES_WORK}"
CACHE STRING "${LANG} compiler libraries for OpenMP parallelization")
mark_as_advanced(OpenMP_${LANG}_FLAGS OpenMP_${LANG}_LIB_NAMES)
endif()
endforeach()
if(CMAKE_Fortran_COMPILER_LOADED)
if(NOT DEFINED OpenMP_Fortran_FLAGS OR "${OpenMP_Fortran_FLAGS}" STREQUAL "NOTFOUND"
OR NOT DEFINED OpenMP_Fortran_LIB_NAMES OR "${OpenMP_Fortran_LIB_NAMES}" STREQUAL "NOTFOUND"
OR NOT DEFINED OpenMP_Fortran_HAVE_OMPLIB_MODULE)
set(OpenMP_Fortran_INCLUDE_LINE "use omp_lib\n implicit none")
_OPENMP_GET_FLAGS("Fortran" "FortranHeader" OpenMP_Fortran_FLAGS_WORK OpenMP_Fortran_LIB_NAMES_WORK)
if(OpenMP_Fortran_FLAGS_WORK)
set(OpenMP_Fortran_HAVE_OMPLIB_MODULE TRUE CACHE BOOL INTERNAL "")
endif()
set(OpenMP_Fortran_FLAGS "${OpenMP_Fortran_FLAGS_WORK}"
CACHE STRING "Fortran compiler flags for OpenMP parallelization")
set(OpenMP_Fortran_LIB_NAMES "${OpenMP_Fortran_LIB_NAMES_WORK}"
CACHE STRING "Fortran compiler libraries for OpenMP parallelization")
mark_as_advanced(OpenMP_Fortran_FLAGS OpenMP_Fortran_LIB_NAMES)
endif()
if(NOT DEFINED OpenMP_Fortran_FLAGS OR "${OpenMP_Fortran_FLAGS}" STREQUAL "NOTFOUND"
OR NOT DEFINED OpenMP_Fortran_LIB_NAMES OR "${OpenMP_Fortran_LIB_NAMES}" STREQUAL "NOTFOUND"
OR NOT DEFINED OpenMP_Fortran_HAVE_OMPLIB_HEADER)
set(OpenMP_Fortran_INCLUDE_LINE "implicit none\n include 'omp_lib.h'")
_OPENMP_GET_FLAGS("Fortran" "FortranModule" OpenMP_Fortran_FLAGS_WORK OpenMP_Fortran_LIB_NAMES_WORK)
if(OpenMP_Fortran_FLAGS_WORK)
set(OpenMP_Fortran_HAVE_OMPLIB_HEADER TRUE CACHE BOOL INTERNAL "")
endif()
set(OpenMP_Fortran_FLAGS "${OpenMP_Fortran_FLAGS_WORK}"
CACHE STRING "Fortran compiler flags for OpenMP parallelization")
set(OpenMP_Fortran_LIB_NAMES "${OpenMP_Fortran_LIB_NAMES}"
CACHE STRING "Fortran compiler libraries for OpenMP parallelization")
endif()
if(OpenMP_Fortran_HAVE_OMPLIB_MODULE)
set(OpenMP_Fortran_INCLUDE_LINE "use omp_lib\n implicit none")
else()
set(OpenMP_Fortran_INCLUDE_LINE "implicit none\n include 'omp_lib.h'")
endif()
endif()
if(NOT OpenMP_FIND_COMPONENTS)
set(OpenMP_FINDLIST C CXX Fortran)
else()
set(OpenMP_FINDLIST ${OpenMP_FIND_COMPONENTS})
endif()
unset(_OpenMP_MIN_VERSION)
include(FindPackageHandleStandardArgs)
foreach(LANG IN LISTS OpenMP_FINDLIST)
if(CMAKE_${LANG}_COMPILER_LOADED)
if (NOT OpenMP_${LANG}_SPEC_DATE AND OpenMP_${LANG}_FLAGS)
_OPENMP_GET_SPEC_DATE("${LANG}" OpenMP_${LANG}_SPEC_DATE_INTERNAL)
set(OpenMP_${LANG}_SPEC_DATE "${OpenMP_${LANG}_SPEC_DATE_INTERNAL}" CACHE
INTERNAL "${LANG} compiler's OpenMP specification date")
_OPENMP_SET_VERSION_BY_SPEC_DATE("${LANG}")
endif()
set(OpenMP_${LANG}_FIND_QUIETLY ${OpenMP_FIND_QUIETLY})
set(OpenMP_${LANG}_FIND_REQUIRED ${OpenMP_FIND_REQUIRED})
set(OpenMP_${LANG}_FIND_VERSION ${OpenMP_FIND_VERSION})
set(OpenMP_${LANG}_FIND_VERSION_EXACT ${OpenMP_FIND_VERSION_EXACT})
set(_OPENMP_${LANG}_REQUIRED_VARS OpenMP_${LANG}_FLAGS)
if("${OpenMP_${LANG}_LIB_NAMES}" STREQUAL "NOTFOUND")
set(_OPENMP_${LANG}_REQUIRED_LIB_VARS OpenMP_${LANG}_LIB_NAMES)
else()
foreach(_OPENMP_IMPLICIT_LIB IN LISTS OpenMP_${LANG}_LIB_NAMES)
list(APPEND _OPENMP_${LANG}_REQUIRED_LIB_VARS OpenMP_${_OPENMP_IMPLICIT_LIB}_LIBRARY)
endforeach()
endif()
find_package_handle_standard_args(OpenMP_${LANG}
REQUIRED_VARS OpenMP_${LANG}_FLAGS ${_OPENMP_${LANG}_REQUIRED_LIB_VARS}
VERSION_VAR OpenMP_${LANG}_VERSION
)
if(OpenMP_${LANG}_FOUND)
if(DEFINED OpenMP_${LANG}_VERSION)
if(NOT _OpenMP_MIN_VERSION OR _OpenMP_MIN_VERSION VERSION_GREATER OpenMP_${LANG}_VERSION)
set(_OpenMP_MIN_VERSION OpenMP_${LANG}_VERSION)
endif()
endif()
set(OpenMP_${LANG}_LIBRARIES "")
foreach(_OPENMP_IMPLICIT_LIB IN LISTS OpenMP_${LANG}_LIB_NAMES)
list(APPEND OpenMP_${LANG}_LIBRARIES "${OpenMP_${_OPENMP_IMPLICIT_LIB}_LIBRARY}")
endforeach()
if(NOT TARGET OpenMP::OpenMP_${LANG})
add_library(OpenMP::OpenMP_${LANG} INTERFACE IMPORTED)
endif()
if(OpenMP_${LANG}_FLAGS)
separate_arguments(_OpenMP_${LANG}_OPTIONS NATIVE_COMMAND "${OpenMP_${LANG}_FLAGS}")
set_property(TARGET OpenMP::OpenMP_${LANG} PROPERTY
INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:${LANG}>:${_OpenMP_${LANG}_OPTIONS}>")
unset(_OpenMP_${LANG}_OPTIONS)
endif()
if(OpenMP_${LANG}_LIBRARIES)
set_property(TARGET OpenMP::OpenMP_${LANG} PROPERTY
INTERFACE_LINK_LIBRARIES "${OpenMP_${LANG}_LIBRARIES}")
endif()
endif()
endif()
endforeach()
unset(_OpenMP_REQ_VARS)
foreach(LANG IN ITEMS C CXX Fortran)
if((NOT OpenMP_FIND_COMPONENTS AND CMAKE_${LANG}_COMPILER_LOADED) OR LANG IN_LIST OpenMP_FIND_COMPONENTS)
list(APPEND _OpenMP_REQ_VARS "OpenMP_${LANG}_FOUND")
endif()
endforeach()
find_package_handle_standard_args(OpenMP
REQUIRED_VARS ${_OpenMP_REQ_VARS}
VERSION_VAR ${_OpenMP_MIN_VERSION}
HANDLE_COMPONENTS)
set(OPENMP_FOUND ${OpenMP_FOUND})
if(CMAKE_Fortran_COMPILER_LOADED AND OpenMP_Fortran_FOUND)
if(NOT DEFINED OpenMP_Fortran_HAVE_OMPLIB_MODULE)
set(OpenMP_Fortran_HAVE_OMPLIB_MODULE FALSE CACHE BOOL INTERNAL "")
endif()
if(NOT DEFINED OpenMP_Fortran_HAVE_OMPLIB_HEADER)
set(OpenMP_Fortran_HAVE_OMPLIB_HEADER FALSE CACHE BOOL INTERNAL "")
endif()
endif()
if(NOT ( CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED OR CMAKE_Fortran_COMPILER_LOADED ))
message(SEND_ERROR "FindOpenMP requires the C, CXX or Fortran languages to be enabled")
endif()
unset(OpenMP_C_CXX_TEST_SOURCE)
unset(OpenMP_Fortran_TEST_SOURCE)
unset(OpenMP_C_CXX_CHECK_VERSION_SOURCE)
unset(OpenMP_Fortran_CHECK_VERSION_SOURCE)
unset(OpenMP_Fortran_INCLUDE_LINE)
cmake_policy(POP)

@ -27,6 +27,9 @@
# vtkm::tbb Target that contains tbb related link information
# implicitly linked to by `vtkm_cont` if tbb is enabled
#
# vtkm::openmp Target that contains openmp related link information
# implicitly linked to by `vtkm_cont` if openmp is enabled
#
# vtkm::cuda Target that contains cuda related link information
# implicitly linked to by `vtkm_cont` if cuda is enabled
#
@ -41,6 +44,7 @@
# VTKm_BUILD_SHARED_LIBS Will be enabled if VTK-m was built shared/dynamic
# VTKm_ENABLE_CUDA Will be enabled if VTK-m was built with CUDA support
# VTKm_ENABLE_TBB Will be enabled if VTK-m was built with TBB support
# VTKm_ENABLE_OPENMP Will be enabled if VTK-m was built with OpenMP support
# VTKm_ENABLE_MPI Will be enabled if VTK-m was built with MPI support
# VTKm_ENABLE_RENDERING Will be enabled if VTK-m was built with rendering support
# VTKm_ENABLE_GL_CONTEXT Will be enabled if VTK-m rendering was built with a GL context
@ -63,6 +67,7 @@ set(VTKm_VERSION "@VTKm_VERSION@")
set(VTKm_BUILD_SHARED_LIBS "@VTKm_BUILD_SHARED_LIBS@")
set(VTKm_ENABLE_CUDA "@VTKm_ENABLE_CUDA@")
set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@")
set(VTKm_ENABLE_OPENMP "@VTKm_ENABLE_OPENMP@")
set(VTKm_ENABLE_RENDERING "@VTKm_ENABLE_RENDERING@")
set(VTKm_ENABLE_GL_CONTEXT "@VTKm_ENABLE_GL_CONTEXT@")
set(VTKm_ENABLE_OSMESA_CONTEXT "@VTKm_ENABLE_OSMESA_CONTEXT@")

@ -47,6 +47,19 @@ if(VTKm_ENABLE_TBB AND NOT TARGET vtkm::tbb)
endif()
endif()
if(VTKm_ENABLE_OPENMP AND NOT TARGET vtkm::openmp)
find_package(OpenMP 4.5 REQUIRED COMPONENTS CXX QUIET)
add_library(vtkm::openmp INTERFACE IMPORTED GLOBAL)
if(OpenMP_CXX_FLAGS)
set_target_properties(vtkm::openmp PROPERTIES
INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:${OpenMP_CXX_FLAGS}>")
endif()
if(OpenMP_CXX_LIBRARIES)
set_target_properties(vtkm::openmp PROPERTIES
INTERFACE_LINK_LIBRARIES "${OpenMP_CXX_LIBRARIES}")
endif()
endif()
if(VTKm_ENABLE_CUDA AND NOT TARGET vtkm::cuda)
cmake_minimum_required(VERSION 3.9 FATAL_ERROR)

@ -86,6 +86,7 @@ endmacro ()
# Configurable Options
vtkm_option(VTKm_ENABLE_CUDA "Enable Cuda support" OFF)
vtkm_option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
vtkm_option(VTKm_ENABLE_OPENMP "Enable OpenMP support" OFF)
vtkm_option(VTKm_ENABLE_RENDERING "Enable rendering library" ON)
vtkm_option(VTKm_ENABLE_TESTING "Enable VTKm Testing" ON)
vtkm_option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF)
@ -266,6 +267,7 @@ if(NOT VTKm_INSTALL_ONLY_LIBRARIES)
FILES
${VTKm_SOURCE_DIR}/CMake/FindTBB.cmake
${VTKm_SOURCE_DIR}/CMake/FindOpenGL.cmake
${VTKm_SOURCE_DIR}/CMake/FindOpenMP.cmake
DESTINATION ${VTKm_INSTALL_CMAKE_MODULE_DIR}
)

@ -73,6 +73,8 @@ Optional dependencies are:
+ [Cuda Toolkit 7+](https://developer.nvidia.com/cuda-toolkit)
+ TBB Device Adapter
+ [TBB](https://www.threadingbuildingblocks.org/)
+ OpenMP Device Adapter
+ Requires a compiler that supports OpenMP >= 4.5.
+ OpenGL Rendering
+ The rendering module contains multiple rendering implementations
including standalone rendering code. The rendering module also

@ -46,7 +46,9 @@
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
#include <tbb/task_scheduler_init.h>
#endif // TBB
#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP
#include <omp.h>
#endif
// This benchmark has a number of command-line options to customize its behavior.
// See the BenchDevAlgoConfig documentation for details.
@ -748,6 +750,7 @@ private:
const vtkm::Id N_VALID;
const vtkm::Id PERCENT_VALID;
ValueArrayHandle ValueHandle;
IndexArrayHandle IndexHandle;
VTKM_CONT
BenchStableSortIndicesUnique(vtkm::Id percent_valid)
@ -759,12 +762,14 @@ private:
FillModuloTestValueKernel<Value>(
N_VALID, this->ValueHandle.PrepareForOutput(arraySize, DeviceAdapterTag())),
arraySize);
this->IndexHandle = SSI::Sort(this->ValueHandle);
}
VTKM_CONT
vtkm::Float64 operator()()
{
IndexArrayHandle indices = SSI::Sort(this->ValueHandle);
IndexArrayHandle indices;
Algorithm::Copy(this->IndexHandle, indices);
Timer timer;
SSI::Unique(this->ValueHandle, indices);
return timer.GetElapsedTime();
@ -1190,6 +1195,8 @@ int main(int argc, char* argv[])
{
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
int numThreads = tbb::task_scheduler_init::automatic;
#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP
int numThreads = omp_get_max_threads();
#endif // TBB
vtkm::benchmarking::BenchDevAlgoConfig& config = vtkm::benchmarking::Config;
@ -1320,8 +1327,12 @@ int main(int argc, char* argv[])
std::istringstream parse(argv[i]);
parse >> numThreads;
std::cout << "Selected " << numThreads << " TBB threads." << std::endl;
#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP
std::istringstream parse(argv[i]);
parse >> numThreads;
std::cout << "Selected " << numThreads << " OpenMP threads." << std::endl;
#else
std::cerr << "NumThreads valid only on TBB. Ignoring." << std::endl;
std::cerr << "NumThreads not valid on this device. Ignoring." << std::endl;
#endif // TBB
}
else
@ -1334,6 +1345,8 @@ int main(int argc, char* argv[])
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
// Must not be destroyed as long as benchmarks are running:
tbb::task_scheduler_init init(numThreads);
#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP
omp_set_num_threads(numThreads);
#endif // TBB
if (config.BenchmarkFlags == 0)

@ -30,6 +30,12 @@ function(add_benchmark name files)
target_compile_definitions(${name}_TBB PRIVATE "VTKM_DEVICE_ADAPTER=VTKM_DEVICE_ADAPTER_TBB")
endif()
if (TARGET vtkm::openmp)
add_executable(${name}_OPENMP ${files})
list(APPEND benchmarks ${name}_OPENMP)
target_compile_definitions(${name}_OPENMP PRIVATE "VTKM_DEVICE_ADAPTER=VTKM_DEVICE_ADAPTER_OPENMP")
endif()
if (TARGET vtkm::cuda)
get_filename_component(fname "${name}" NAME_WE)
get_filename_component(fullpath "${name}.cxx" ABSOLUTE)
@ -70,6 +76,9 @@ if(TARGET vtkm_rendering)
if(TARGET BenchmarkRayTracing_TBB)
target_link_libraries(BenchmarkRayTracing_TBB PRIVATE vtkm_rendering)
endif()
if(TARGET BenchmarkRayTracing_OPENMP)
target_link_libraries(BenchmarkRayTracing_OPENMP PRIVATE vtkm_rendering)
endif()
if(TARGET BenchmarkRayTracing_CUDA)
target_link_libraries(BenchmarkRayTracing_CUDA PRIVATE vtkm_rendering)
endif()

@ -0,0 +1,5 @@
# OpenMP Device Adapter
A device adapter that leverages OpenMP 4.5 for threading is now available. The
new adapter is enabled using the CMake option `VTKm_ENABLE_OPENMP` and its
performance is comparable to the TBB device adapter.
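As a rough sketch of how the new backend can be exercised (the file below is
illustrative and not part of this change; it assumes VTK-m was configured with
`VTKm_ENABLE_OPENMP=ON`), a program can request the OpenMP device at compile
time through the existing `VTKM_DEVICE_ADAPTER` macro and then call the device
adapter algorithms as usual:

```cpp
// Hypothetical example: select the OpenMP device adapter and sort an array.
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_OPENMP

#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/DeviceAdapter.h>

#include <vector>

int main()
{
  std::vector<vtkm::Id> data{ 5, 3, 1, 4, 2 };

  // Wrap the std::vector in a VTK-m array handle.
  vtkm::cont::ArrayHandle<vtkm::Id> handle = vtkm::cont::make_ArrayHandle(data);

  // VTKM_DEFAULT_DEVICE_ADAPTER_TAG resolves to DeviceAdapterTagOpenMP here.
  using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG>;
  Algorithm::Sort(handle);

  return 0;
}
```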

@ -151,6 +151,7 @@ add_subdirectory(arg)
add_subdirectory(diy)
add_subdirectory(serial)
add_subdirectory(tbb)
add_subdirectory(openmp)
add_subdirectory(cuda)
set(backends )
@ -160,6 +161,9 @@ endif()
if(TARGET vtkm::cuda)
list(APPEND backends vtkm::cuda)
endif()
if(TARGET vtkm::openmp)
list(APPEND backends vtkm::openmp)
endif()
target_link_libraries(vtkm_cont PUBLIC vtkm_compiler_flags ${backends})
if(TARGET vtkm_diy)
# This will become a required dependency eventually.

@ -27,6 +27,7 @@
#include <vtkm/ListTag.h>
#include <vtkm/cont/cuda/DeviceAdapterCuda.h>
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/serial/DeviceAdapterSerial.h>
#include <vtkm/cont/tbb/DeviceAdapterTBB.h>
@ -37,6 +38,7 @@ namespace cont
struct DeviceAdapterListTagCommon : vtkm::ListTagBase<vtkm::cont::DeviceAdapterTagCuda,
vtkm::cont::DeviceAdapterTagTBB,
vtkm::cont::DeviceAdapterTagOpenMP,
vtkm::cont::DeviceAdapterTagSerial>
{
};

@ -38,6 +38,9 @@ set(headers
DynamicTransform.h
FunctorsGeneral.h
IteratorFromArrayPortal.h
KXSort.h
ParallelRadixSort.h
ParallelRadixSortInterface.h
SimplePolymorphicContainer.h
StorageError.h
VirtualObjectTransfer.h

@ -36,7 +36,9 @@
// Unfortunately, VTKM_ENABLE_TBB does not guarantee that TBB is (or isn't)
// available, but there is no way to check for sure in a header library.
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_TBB
#else // !VTKM_CUDA && !VTKM_ENABLE_TBB
#elif defined(VTKM_ENABLE_OPENMP)
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_OPENMP
#else // !VTKM_CUDA && !VTKM_ENABLE_TBB && !VTKM_ENABLE_OPENMP
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_SERIAL
#endif // !VTKM_CUDA && !VTKM_ENABLE_TBB
#endif // VTKM_DEVICE_ADAPTER
@ -68,6 +70,14 @@
#include <vtkm/cont/tbb/internal/DeviceAdapterTagTBB.h>
#define VTKM_DEFAULT_DEVICE_ADAPTER_TAG ::vtkm::cont::DeviceAdapterTagTBB
// OpenMP:
#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP
#include <vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h>
#include <vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h>
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#define VTKM_DEFAULT_DEVICE_ADAPTER_TAG ::vtkm::cont::DeviceAdapterTagOpenMP
// Error:
#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_ERROR

@ -32,6 +32,7 @@
#define VTKM_DEVICE_ADAPTER_SERIAL 1
#define VTKM_DEVICE_ADAPTER_CUDA 2
#define VTKM_DEVICE_ADAPTER_TBB 3
#define VTKM_DEVICE_ADAPTER_OPENMP 4
namespace vtkm
{

@ -1,3 +1,25 @@
//=============================================================================
//
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//
//=============================================================================
/* The MIT License
Copyright (c) 2016 Dinghua Li <voutcn@gmail.com>

(File diff suppressed because it is too large.)

@ -0,0 +1,166 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2017 UT-Battelle, LLC.
// Copyright 2017 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_internal_ParallelRadixSortInterface_h
#define vtk_m_cont_internal_ParallelRadixSortInterface_h
#include <vtkm/BinaryPredicates.h>
#include <vtkm/cont/ArrayHandle.h>
#include <functional>
#include <type_traits>
namespace vtkm
{
namespace cont
{
namespace internal
{
namespace radix
{
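// Heuristic data-size thresholds (in bytes) used by the parallel radix sort
// implementations to decide when parallel execution is worthwhile and when to
// use the maximum amount of parallelism.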
const size_t MIN_BYTES_FOR_PARALLEL = 400000;
const size_t BYTES_FOR_MAX_PARALLELISM = 4000000;
struct RadixSortTag
{
};
struct PSortTag
{
};
// Detect supported functors for radix sort:
template <typename T>
struct is_valid_compare_type : std::integral_constant<bool, false>
{
};
template <typename T>
struct is_valid_compare_type<std::less<T>> : std::integral_constant<bool, true>
{
};
template <typename T>
struct is_valid_compare_type<std::greater<T>> : std::integral_constant<bool, true>
{
};
template <>
struct is_valid_compare_type<vtkm::SortLess> : std::integral_constant<bool, true>
{
};
template <>
struct is_valid_compare_type<vtkm::SortGreater> : std::integral_constant<bool, true>
{
};
// Convert vtkm::Sort[Less|Greater] to the std:: equivalents:
template <typename BComp, typename T>
BComp&& get_std_compare(BComp&& b, T&&)
{
return std::forward<BComp>(b);
}
template <typename T>
std::less<T> get_std_compare(vtkm::SortLess, T&&)
{
return std::less<T>{};
}
template <typename T>
std::greater<T> get_std_compare(vtkm::SortGreater, T&&)
{
return std::greater<T>{};
}
// Determine if radix sort can be used for a given ValueType, StorageType, and
// comparison functor.
template <typename T, typename StorageTag, typename BinaryCompare>
struct sort_tag_type
{
using type = PSortTag;
};
template <typename T, typename BinaryCompare>
struct sort_tag_type<T, vtkm::cont::StorageTagBasic, BinaryCompare>
{
using PrimT = std::is_arithmetic<T>;
using LongDT = std::is_same<T, long double>;
using BComp = is_valid_compare_type<BinaryCompare>;
using type = typename std::conditional<PrimT::value && BComp::value && !LongDT::value,
RadixSortTag,
PSortTag>::type;
};
template <typename KeyType,
typename ValueType,
typename KeyStorageTagType,
typename ValueStorageTagType,
class BinaryCompare>
struct sortbykey_tag_type
{
using type = PSortTag;
};
template <typename KeyType, typename ValueType, class BinaryCompare>
struct sortbykey_tag_type<KeyType,
ValueType,
vtkm::cont::StorageTagBasic,
vtkm::cont::StorageTagBasic,
BinaryCompare>
{
using PrimKey = std::is_arithmetic<KeyType>;
using PrimValue = std::is_arithmetic<ValueType>;
using LongDKey = std::is_same<KeyType, long double>;
using BComp = is_valid_compare_type<BinaryCompare>;
using type = typename std::conditional<PrimKey::value && PrimValue::value && BComp::value &&
!LongDKey::value,
RadixSortTag,
PSortTag>::type;
};
#define VTKM_INTERNAL_RADIX_SORT_DECLARE(key_type) \
VTKM_CONT_EXPORT void parallel_radix_sort( \
key_type* data, size_t num_elems, const std::greater<key_type>& comp); \
VTKM_CONT_EXPORT void parallel_radix_sort( \
key_type* data, size_t num_elems, const std::less<key_type>& comp); \
VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \
key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater<key_type>& comp); \
VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \
key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less<key_type>& comp);
// Generate radix sort interfaces for key and key value sorts.
#define VTKM_DECLARE_RADIX_SORT() \
VTKM_INTERNAL_RADIX_SORT_DECLARE(short int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned short int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(long int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned long int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(long long int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned long long int) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(unsigned char) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(signed char) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(char) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(char16_t) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(char32_t) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(wchar_t) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(float) \
VTKM_INTERNAL_RADIX_SORT_DECLARE(double)
}
}
}
} // end vtkm::cont::internal::radix
#endif // vtk_m_cont_internal_ParallelRadixSortInterface_h

@ -0,0 +1,32 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2018 UT-Battelle, LLC.
## Copyright 2018 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
##
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##============================================================================
set(headers
DeviceAdapterOpenMP.h
)
add_subdirectory(internal)
vtkm_declare_headers(${headers} TESTABLE ${VTKm_ENABLE_OPENMP})
#-----------------------------------------------------------------------------
if (TARGET vtkm::openmp)
add_subdirectory(testing)
endif()

@ -0,0 +1,33 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_DeviceAdapterOpenMP_h
#define vtk_m_cont_openmp_DeviceAdapterOpenMP_h
#include <vtkm/cont/openmp/internal/DeviceAdapterRuntimeDetectorOpenMP.h>
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#ifdef VTKM_ENABLE_OPENMP
#include <vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h>
#include <vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h>
#include <vtkm/cont/openmp/internal/VirtualObjectTransferOpenMP.h>
#endif
#endif //vtk_m_cont_openmp_DeviceAdapterOpenMP_h

@ -0,0 +1,42 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_cxx
#include <vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h>
namespace vtkm
{
namespace cont
{
namespace internal
{
ExecutionArrayInterfaceBasic<DeviceAdapterTagOpenMP>::ExecutionArrayInterfaceBasic(
StorageBasicBase& storage)
: Superclass(storage)
{
}
} // end namespace internal
VTKM_INSTANTIATE_ARRAYHANDLES_FOR_DEVICE_ADAPTER(DeviceAdapterTagOpenMP)
}
} // end vtkm::cont

@ -0,0 +1,109 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_h
#define vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_h
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#include <vtkm/cont/internal/ArrayExportMacros.h>
#include <vtkm/cont/internal/ArrayManagerExecution.h>
#include <vtkm/cont/internal/ArrayManagerExecutionShareWithControl.h>
namespace vtkm
{
namespace cont
{
namespace internal
{
template <typename T, class StorageTag>
class ArrayManagerExecution<T, StorageTag, vtkm::cont::DeviceAdapterTagOpenMP>
: public vtkm::cont::internal::ArrayManagerExecutionShareWithControl<T, StorageTag>
{
public:
using Superclass = vtkm::cont::internal::ArrayManagerExecutionShareWithControl<T, StorageTag>;
using ValueType = typename Superclass::ValueType;
using PortalType = typename Superclass::PortalType;
using PortalConstType = typename Superclass::PortalConstType;
using StorageType = typename Superclass::StorageType;
VTKM_CONT
ArrayManagerExecution(StorageType* storage)
: Superclass(storage)
{
}
VTKM_CONT
PortalConstType PrepareForInput(bool updateData)
{
return this->Superclass::PrepareForInput(updateData);
}
VTKM_CONT
PortalType PrepareForInPlace(bool updateData)
{
return this->Superclass::PrepareForInPlace(updateData);
}
VTKM_CONT
PortalType PrepareForOutput(vtkm::Id numberOfValues)
{
return this->Superclass::PrepareForOutput(numberOfValues);
}
};
template <typename T>
struct ExecutionPortalFactoryBasic<T, DeviceAdapterTagOpenMP>
: public ExecutionPortalFactoryBasicShareWithControl<T>
{
using Superclass = ExecutionPortalFactoryBasicShareWithControl<T>;
using typename Superclass::ValueType;
using typename Superclass::PortalType;
using typename Superclass::PortalConstType;
using Superclass::CreatePortal;
using Superclass::CreatePortalConst;
};
template <>
struct VTKM_CONT_EXPORT ExecutionArrayInterfaceBasic<DeviceAdapterTagOpenMP>
: public ExecutionArrayInterfaceBasicShareWithControl
{
using Superclass = ExecutionArrayInterfaceBasicShareWithControl;
VTKM_CONT
ExecutionArrayInterfaceBasic(StorageBasicBase& storage);
VTKM_CONT
DeviceAdapterId GetDeviceId() const final { return VTKM_DEVICE_ADAPTER_OPENMP; }
};
} // namespace internal
#ifndef vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_cxx
VTKM_EXPORT_ARRAYHANDLES_FOR_DEVICE_ADAPTER(DeviceAdapterTagOpenMP)
#endif // !vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_cxx
}
} // namespace vtkm::cont
#endif // vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_h

@ -0,0 +1,49 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2018 UT-Battelle, LLC.
## Copyright 2018 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
##
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##============================================================================
set(headers
ArrayManagerExecutionOpenMP.h
DeviceAdapterAlgorithmOpenMP.h
DeviceAdapterRuntimeDetectorOpenMP.h
DeviceAdapterTagOpenMP.h
FunctorsOpenMP.h
ParallelQuickSortOpenMP.h
ParallelRadixSortOpenMP.h
ParallelScanOpenMP.h
ParallelSortOpenMP.h
VirtualObjectTransferOpenMP.h
)
vtkm_declare_headers(${headers} TESTABLE ${VTKm_ENABLE_OPENMP})
target_sources(vtkm_cont PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterRuntimeDetectorOpenMP.cxx
)
#-----------------------------------------------------------------------------
if (NOT VTKm_ENABLE_OPENMP)
return()
endif()
target_sources(vtkm_cont PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionOpenMP.cxx
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmOpenMP.cxx
${CMAKE_CURRENT_SOURCE_DIR}/ParallelRadixSortOpenMP.cxx
)

@ -0,0 +1,140 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h>
#include <vtkm/cont/openmp/internal/FunctorsOpenMP.h>
#include <vtkm/cont/ErrorExecution.h>
#include <omp.h>
namespace vtkm
{
namespace cont
{
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagOpenMP>::ScheduleTask(
vtkm::exec::openmp::internal::TaskTiling1D& functor,
vtkm::Id size)
{
static constexpr vtkm::Id MESSAGE_SIZE = 1024;
char errorString[MESSAGE_SIZE];
errorString[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(errorString, MESSAGE_SIZE);
functor.SetErrorMessageBuffer(errorMessage);
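// Process the input range in fixed-size chunks: each loop iteration hands a
// contiguous [i, end) block to the functor, and the guided schedule lets
// OpenMP balance chunks of uneven cost across threads.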
static constexpr vtkm::Id CHUNK_SIZE = 1024;
VTKM_OPENMP_DIRECTIVE(parallel for
schedule(guided))
for (vtkm::Id i = 0; i < size; i += CHUNK_SIZE)
{
const vtkm::Id end = std::min(i + CHUNK_SIZE, size);
functor(i, end);
}
if (errorMessage.IsErrorRaised())
{
throw vtkm::cont::ErrorExecution(errorString);
}
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagOpenMP>::ScheduleTask(
vtkm::exec::openmp::internal::TaskTiling3D& functor,
vtkm::Id3 size)
{
static constexpr vtkm::Id MESSAGE_SIZE = 1024;
char errorString[MESSAGE_SIZE];
errorString[0] = '\0';
vtkm::exec::internal::ErrorMessageBuffer errorMessage(errorString, MESSAGE_SIZE);
functor.SetErrorMessageBuffer(errorMessage);
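// Choose a chunk shape based on the x extent: larger x extents get chunks
// that are long in x and thin in y/z, while smaller x extents are split more
// finely in y and z.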
vtkm::Id3 chunkDims;
if (size[0] > 512)
{
chunkDims = { 1024, 4, 1 };
}
else if (size[0] > 256)
{
chunkDims = { 512, 4, 2 };
}
else if (size[0] > 128)
{
chunkDims = { 256, 4, 4 };
}
else if (size[0] > 64)
{
chunkDims = { 128, 8, 4 };
}
else if (size[0] > 32)
{
chunkDims = { 64, 8, 8 };
}
else if (size[0] > 16)
{
chunkDims = { 32, 16, 8 };
}
else
{
chunkDims = { 16, 16, 16 };
}
const vtkm::Id3 numChunks{ openmp::CeilDivide(size[0], chunkDims[0]),
openmp::CeilDivide(size[1], chunkDims[1]),
openmp::CeilDivide(size[2], chunkDims[2]) };
const vtkm::Id chunkCount = numChunks[0] * numChunks[1] * numChunks[2];
// Lambda to convert chunkIdx into a start/end {i, j, k}:
auto computeIJK = [&](const vtkm::Id& chunkIdx, vtkm::Id3& start, vtkm::Id3& end) {
start[0] = chunkIdx % numChunks[0];
start[1] = (chunkIdx / numChunks[0]) % numChunks[1];
start[2] = (chunkIdx / (numChunks[0] * numChunks[1]));
start *= chunkDims; // c-wise mult
end[0] = std::min(start[0] + chunkDims[0], size[0]);
end[1] = std::min(start[1] + chunkDims[1], size[1]);
end[2] = std::min(start[2] + chunkDims[2], size[2]);
};
// Iterate through each chunk, converting the chunkIdx into an ijk range:
VTKM_OPENMP_DIRECTIVE(parallel for
schedule(guided))
for (vtkm::Id chunkIdx = 0; chunkIdx < chunkCount; ++chunkIdx)
{
vtkm::Id3 startIJK;
vtkm::Id3 endIJK;
computeIJK(chunkIdx, startIJK, endIJK);
for (vtkm::Id k = startIJK[2]; k < endIJK[2]; ++k)
{
for (vtkm::Id j = startIJK[1]; j < endIJK[1]; ++j)
{
functor(startIJK[0], endIJK[0], j, k);
}
}
}
if (errorMessage.IsErrorRaised())
{
throw vtkm::cont::ErrorExecution(errorString);
}
}
}
} // end namespace vtkm::cont

@ -0,0 +1,385 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h
#define vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/Error.h>
#include <vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h>
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#include <vtkm/cont/openmp/internal/FunctorsOpenMP.h>
#include <vtkm/cont/openmp/internal/ParallelScanOpenMP.h>
#include <vtkm/cont/openmp/internal/ParallelSortOpenMP.h>
#include <vtkm/exec/openmp/internal/TaskTilingOpenMP.h>
// For serial fallback:
#include <vtkm/cont/serial/DeviceAdapterSerial.h>
#include <omp.h>
#include <algorithm>
#include <type_traits>
namespace vtkm
{
namespace cont
{
template <>
struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagOpenMP>
: vtkm::cont::internal::DeviceAdapterAlgorithmGeneral<
DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagOpenMP>,
vtkm::cont::DeviceAdapterTagOpenMP>
{
using DevTag = DeviceAdapterTagOpenMP;
using SerialAlgo = DeviceAdapterAlgorithm<DeviceAdapterTagSerial>;
public:
template <typename T, typename U, class CIn, class COut>
VTKM_CONT static void Copy(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<U, COut>& output)
{
using namespace vtkm::cont::openmp;
const vtkm::Id inSize = input.GetNumberOfValues();
if (inSize == 0)
{
output.Allocate(0);
return;
}
auto inputPortal = input.PrepareForInput(DevTag());
auto outputPortal = output.PrepareForOutput(inSize, DevTag());
CopyHelper(inputPortal, outputPortal, 0, 0, inSize);
}
template <typename T, typename U, class CIn, class CStencil, class COut>
VTKM_CONT static void CopyIf(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<U, CStencil>& stencil,
vtkm::cont::ArrayHandle<T, COut>& output)
{
::vtkm::NotZeroInitialized unary_predicate;
CopyIf(input, stencil, output, unary_predicate);
}
template <typename T, typename U, class CIn, class CStencil, class COut, class UnaryPredicate>
VTKM_CONT static void CopyIf(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<U, CStencil>& stencil,
vtkm::cont::ArrayHandle<T, COut>& output,
UnaryPredicate unary_predicate)
{
using namespace vtkm::cont::openmp;
vtkm::Id inSize = input.GetNumberOfValues();
if (inSize == 0)
{
output.Allocate(0);
return;
}
auto inputPortal = input.PrepareForInput(DevTag());
auto stencilPortal = stencil.PrepareForInput(DevTag());
auto outputPortal = output.PrepareForOutput(inSize, DevTag());
auto inIter = vtkm::cont::ArrayPortalToIteratorBegin(inputPortal);
auto stencilIter = vtkm::cont::ArrayPortalToIteratorBegin(stencilPortal);
auto outIter = vtkm::cont::ArrayPortalToIteratorBegin(outputPortal);
CopyIfHelper helper;
VTKM_OPENMP_DIRECTIVE(parallel default(shared))
{
VTKM_OPENMP_DIRECTIVE(single)
{
// Calls omp_get_num_threads, thus must be used inside a parallel section.
helper.Initialize(inSize, sizeof(T));
}
VTKM_OPENMP_DIRECTIVE(for schedule(static))
for (vtkm::Id i = 0; i < helper.NumChunks; ++i)
{
helper.CopyIf(inIter, stencilIter, outIter, unary_predicate, i);
}
}
vtkm::Id numValues = helper.Reduce(outIter);
output.Shrink(numValues);
}
template <typename T, typename U, class CIn, class COut>
VTKM_CONT static bool CopySubRange(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::Id inputStartIndex,
vtkm::Id numberOfValuesToCopy,
vtkm::cont::ArrayHandle<U, COut>& output,
vtkm::Id outputIndex = 0)
{
using namespace vtkm::cont::openmp;
const vtkm::Id inSize = input.GetNumberOfValues();
// Check if the ranges overlap and fail if they do.
if (input == output &&
((outputIndex >= inputStartIndex && outputIndex < inputStartIndex + numberOfValuesToCopy) ||
(inputStartIndex >= outputIndex && inputStartIndex < outputIndex + numberOfValuesToCopy)))
{
return false;
}
if (inputStartIndex < 0 || numberOfValuesToCopy < 0 || outputIndex < 0 ||
inputStartIndex >= inSize)
{ //invalid parameters
return false;
}
//determine if numberOfValuesToCopy needs to be reduced
if (inSize < (inputStartIndex + numberOfValuesToCopy))
{ //adjust the size
numberOfValuesToCopy = (inSize - inputStartIndex);
}
const vtkm::Id outSize = output.GetNumberOfValues();
const vtkm::Id copyOutEnd = outputIndex + numberOfValuesToCopy;
if (outSize < copyOutEnd)
{ //output is not large enough
if (outSize == 0)
{ //since output has nothing, just need to allocate to correct length
output.Allocate(copyOutEnd);
}
else
{ //we currently have data in this array, so preserve it in the new
//resized array
vtkm::cont::ArrayHandle<U, COut> temp;
temp.Allocate(copyOutEnd);
CopySubRange(output, 0, outSize, temp);
output = temp;
}
}
auto inputPortal = input.PrepareForInput(DevTag());
auto outputPortal = output.PrepareForInPlace(DevTag());
CopyHelper(inputPortal, outputPortal, inputStartIndex, outputIndex, numberOfValuesToCopy);
return true;
}
template <typename T, typename U, class CIn>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input, U initialValue)
{
return Reduce(input, initialValue, vtkm::Add());
}
template <typename T, typename U, class CIn, class BinaryFunctor>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input,
U initialValue,
BinaryFunctor binary_functor)
{
using namespace vtkm::cont::openmp;
auto portal = input.PrepareForInput(DevTag());
const OpenMPReductionSupported<typename std::decay<U>::type> fastPath;
return ReduceHelper::Execute(portal, initialValue, binary_functor, fastPath);
}
template <typename T,
typename U,
class CKeyIn,
class CValIn,
class CKeyOut,
class CValOut,
class BinaryFunctor>
VTKM_CONT static void ReduceByKey(const vtkm::cont::ArrayHandle<T, CKeyIn>& keys,
const vtkm::cont::ArrayHandle<U, CValIn>& values,
vtkm::cont::ArrayHandle<T, CKeyOut>& keys_output,
vtkm::cont::ArrayHandle<U, CValOut>& values_output,
BinaryFunctor func)
{
openmp::ReduceByKeyHelper(keys, values, keys_output, values_output, func);
}
template <typename T, class CIn, class COut>
VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output)
{
return ScanInclusive(input, output, vtkm::Add());
}
template <typename T, class CIn, class COut, class BinaryFunctor>
VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output,
BinaryFunctor binaryFunctor)
{
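// Inputs no larger than openmp::PAGE_SIZE bytes are handled by the serial
// algorithm, since such small scans are unlikely to benefit from
// parallelization.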
if (input.GetNumberOfValues() * sizeof(T) <= openmp::PAGE_SIZE)
{
return SerialAlgo::ScanInclusive(input, output, binaryFunctor);
}
using InPortalT = decltype(input.PrepareForInput(DevTag()));
using OutPortalT = decltype(output.PrepareForOutput(0, DevTag()));
using Impl = openmp::ScanInclusiveHelper<InPortalT, OutPortalT, BinaryFunctor>;
vtkm::Id numVals = input.GetNumberOfValues();
Impl impl(
input.PrepareForInput(DevTag()), output.PrepareForOutput(numVals, DevTag()), binaryFunctor);
return impl.Execute(vtkm::Id2(0, numVals));
}
template <typename T, class CIn, class COut>
VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output)
{
return ScanExclusive(input, output, vtkm::Add(), vtkm::TypeTraits<T>::ZeroInitialization());
}
template <typename T, class CIn, class COut, class BinaryFunctor>
VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output,
BinaryFunctor binaryFunctor,
const T& initialValue)
{
if (input.GetNumberOfValues() * sizeof(T) <= openmp::PAGE_SIZE)
{
return SerialAlgo::ScanExclusive(input, output, binaryFunctor, initialValue);
}
using InPortalT = decltype(input.PrepareForInput(DevTag()));
using OutPortalT = decltype(output.PrepareForOutput(0, DevTag()));
using Impl = openmp::ScanExclusiveHelper<InPortalT, OutPortalT, BinaryFunctor>;
vtkm::Id numVals = input.GetNumberOfValues();
Impl impl(input.PrepareForInput(DevTag()),
output.PrepareForOutput(numVals, DevTag()),
binaryFunctor,
initialValue);
return impl.Execute(vtkm::Id2(0, numVals));
}
/// \brief Unstable ascending sort of input array.
///
/// Sorts the contents of \c values so that they are in ascending order.
/// Stability is not guaranteed.
///
template <typename T, class Storage>
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
{
Sort(values, vtkm::SortLess());
}
template <typename T, class Storage, class BinaryCompare>
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values,
BinaryCompare binary_compare)
{
openmp::sort::parallel_sort(values, binary_compare);
}
template <typename T, typename U, class StorageT, class StorageU>
VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values)
{
SortByKey(keys, values, std::less<T>());
}
template <typename T, typename U, class StorageT, class StorageU, class BinaryCompare>
VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare)
{
openmp::sort::parallel_sort_bykey(keys, values, binary_compare);
}
template <typename T, class Storage>
VTKM_CONT static void Unique(vtkm::cont::ArrayHandle<T, Storage>& values)
{
Unique(values, std::equal_to<T>());
}
template <typename T, class Storage, class BinaryCompare>
VTKM_CONT static void Unique(vtkm::cont::ArrayHandle<T, Storage>& values,
BinaryCompare binary_compare)
{
auto portal = values.PrepareForInPlace(DevTag());
auto iter = vtkm::cont::ArrayPortalToIteratorBegin(portal);
using IterT = typename std::decay<decltype(iter)>::type;
using Uniqifier = openmp::UniqueHelper<IterT, BinaryCompare>;
Uniqifier uniquifier(iter, portal.GetNumberOfValues(), binary_compare);
vtkm::Id outSize = uniquifier.Execute();
values.Shrink(outSize);
}
VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::openmp::internal::TaskTiling1D& functor,
vtkm::Id size);
VTKM_CONT_EXPORT static void ScheduleTask(vtkm::exec::openmp::internal::TaskTiling3D& functor,
vtkm::Id3 size);
template <class FunctorType>
VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id numInstances)
{
vtkm::exec::openmp::internal::TaskTiling1D kernel(functor);
ScheduleTask(kernel, numInstances);
}
template <class FunctorType>
VTKM_CONT static inline void Schedule(FunctorType functor, vtkm::Id3 rangeMax)
{
vtkm::exec::openmp::internal::TaskTiling3D kernel(functor);
ScheduleTask(kernel, rangeMax);
}
VTKM_CONT static void Synchronize()
{
// Nothing to do. This device schedules all of its operations using a
// split/join paradigm. This means that if the control thread is
// calling this method, then nothing should be running in the execution
// environment.
}
};
template <>
class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagOpenMP>
{
public:
template <typename WorkletType, typename InvocationType>
  static vtkm::exec::openmp::internal::TaskTiling1D MakeTask(const WorkletType& worklet,
const InvocationType& invocation,
vtkm::Id,
vtkm::Id globalIndexOffset = 0)
{
return vtkm::exec::openmp::internal::TaskTiling1D(worklet, invocation, globalIndexOffset);
}
template <typename WorkletType, typename InvocationType>
  static vtkm::exec::openmp::internal::TaskTiling3D MakeTask(const WorkletType& worklet,
const InvocationType& invocation,
vtkm::Id3,
vtkm::Id globalIndexOffset = 0)
{
return vtkm::exec::openmp::internal::TaskTiling3D(worklet, invocation, globalIndexOffset);
}
};
}
} // namespace vtkm::cont
#endif //vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h

@ -0,0 +1,32 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/openmp/internal/DeviceAdapterRuntimeDetectorOpenMP.h>
namespace vtkm
{
namespace cont
{
VTKM_CONT bool DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagOpenMP>::Exists() const
{
using DeviceAdapterTraits = vtkm::cont::DeviceAdapterTraits<vtkm::cont::DeviceAdapterTagOpenMP>;
return DeviceAdapterTraits::Valid;
}
}
}

@ -0,0 +1,47 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_internal_DeviceAdapterRuntimeDetectorOpenMP_h
#define vtk_m_cont_openmp_internal_DeviceAdapterRuntimeDetectorOpenMP_h
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#include <vtkm/cont/vtkm_cont_export.h>
namespace vtkm
{
namespace cont
{
template <class DeviceAdapterTag>
class DeviceAdapterRuntimeDetector;
/// Determine if this machine supports the OpenMP backend.
///
template <>
class VTKM_CONT_EXPORT DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagOpenMP>
{
public:
/// Returns true if the given device adapter is supported on the current
/// machine.
VTKM_CONT bool Exists() const;
};
}
}
#endif

@ -0,0 +1,32 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_internal_DeviceAdapterTagOpenMP_h
#define vtk_m_cont_openmp_internal_DeviceAdapterTagOpenMP_h
#include <vtkm/cont/internal/DeviceAdapterTag.h>
#ifdef VTKM_ENABLE_OPENMP
VTKM_VALID_DEVICE_ADAPTER(OpenMP, VTKM_DEVICE_ADAPTER_OPENMP)
#else
VTKM_INVALID_DEVICE_ADAPTER(OpenMP, VTKM_DEVICE_ADAPTER_OPENMP)
#endif
#endif // vtk_m_cont_openmp_internal_DeviceAdapterTagOpenMP_h

@ -0,0 +1,674 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_internal_FunctorsOpenMP_h
#define vtk_m_cont_openmp_internal_FunctorsOpenMP_h
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#include <vtkm/cont/internal/FunctorsGeneral.h>
#include <vtkm/BinaryOperators.h>
#include <vtkm/BinaryPredicates.h>
#include <vtkm/Pair.h>
#include <vtkm/Types.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ErrorExecution.h>
#include <omp.h>
#include <algorithm>
#include <type_traits>
#include <vector>
// Wrap all '#pragma omp ...' calls in this macro so we can disable them in
// non-OpenMP builds and avoid a multitude of 'ignoring pragma...' warnings.
#ifdef _OPENMP
#define _VTKM_OPENMP_DIRECTIVE_IMPL(fullDir) _Pragma(#fullDir)
#define VTKM_OPENMP_DIRECTIVE(dir) _VTKM_OPENMP_DIRECTIVE_IMPL(omp dir)
#else // _OPENMP
#define VTKM_OPENMP_DIRECTIVE(directive)
#endif // _OPENMP
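// For example, in an OpenMP-enabled build,
//   VTKM_OPENMP_DIRECTIVE(parallel for schedule(static))
// expands to
//   _Pragma("omp parallel for schedule(static)")
// while in a non-OpenMP build it expands to nothing.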
// When defined, supported type / operator combinations will use the OpenMP
// reduction(...) clause. Otherwise, all reductions use the general
// implementation with a manual reduction once the threads complete.
// Somewhat surprisingly, the benchmarks currently perform better without these
// specializations, so they are left disabled by default.
//#define VTKM_OPENMP_USE_NATIVE_REDUCTION
namespace vtkm
{
namespace cont
{
namespace openmp
{
constexpr static vtkm::Id CACHE_LINE_SIZE = 64;
constexpr static vtkm::Id PAGE_SIZE = 4096;
// Returns ceil(num/den) for integral types
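// For example, CeilDivide(10, 4) == 3 and CeilDivide(8, 4) == 2.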
template <typename T>
static constexpr T CeilDivide(const T& numerator, const T& denominator)
{
return (numerator + denominator - 1) / denominator;
}
// Computes the number of values per chunk. Note that numChunks * valuesPerChunk
// may exceed numVals, so be sure to check upper limits.
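// For example (illustrative numbers only): with numVals = 1,000,000 four-byte
// values, numThreads = 8, and chunksPerThread = 8, we get bytesIn = 4,000,000,
// pagesIn = 977, numChunks = 64, pagesPerChunk = 16, and valuesPerChunk = 16,384,
// so numChunks * valuesPerChunk = 1,048,576, slightly more than numVals.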
static void ComputeChunkSize(const vtkm::Id numVals,
const vtkm::Id numThreads,
const vtkm::Id chunksPerThread,
const vtkm::Id bytesPerValue,
vtkm::Id& numChunks,
vtkm::Id& valuesPerChunk)
{
// try to evenly distribute pages across chunks:
const vtkm::Id bytesIn = numVals * bytesPerValue;
const vtkm::Id pagesIn = CeilDivide(bytesIn, PAGE_SIZE);
// If we don't have enough pages to honor chunksPerThread, ignore it:
numChunks = (pagesIn > numThreads * chunksPerThread) ? numThreads * chunksPerThread : numThreads;
const vtkm::Id pagesPerChunk = CeilDivide(pagesIn, numChunks);
valuesPerChunk = CeilDivide(pagesPerChunk * PAGE_SIZE, bytesPerValue);
}
template <typename T, typename U>
VTKM_EXEC_CONT static void DoCopy(T src, U dst, vtkm::Id numVals, std::true_type)
{
if (numVals)
{
std::copy(src, src + numVals, dst);
}
}
// Don't use std::copy when a type conversion is required, since MSVC warns about it.
template <typename InIterT, typename OutIterT>
VTKM_EXEC_CONT static void DoCopy(InIterT inIter,
OutIterT outIter,
vtkm::Id numVals,
std::false_type)
{
using ValueType = typename std::iterator_traits<OutIterT>::value_type;
for (vtkm::Id i = 0; i < numVals; ++i)
{
*(outIter++) = static_cast<ValueType>(*(inIter++));
}
}
template <typename InIterT, typename OutIterT>
VTKM_EXEC_CONT static void DoCopy(InIterT inIter, OutIterT outIter, vtkm::Id numVals)
{
using InValueType = typename std::iterator_traits<InIterT>::value_type;
using OutValueType = typename std::iterator_traits<OutIterT>::value_type;
DoCopy(inIter, outIter, numVals, std::is_same<InValueType, OutValueType>());
}
template <typename InPortalT, typename OutPortalT>
VTKM_EXEC_CONT static void CopyHelper(InPortalT inPortal,
OutPortalT outPortal,
vtkm::Id inStart,
vtkm::Id outStart,
vtkm::Id numVals)
{
using InValueT = typename InPortalT::ValueType;
using OutValueT = typename OutPortalT::ValueType;
constexpr auto isSame = std::is_same<InValueT, OutValueT>();
auto inIter = vtkm::cont::ArrayPortalToIteratorBegin(inPortal) + inStart;
auto outIter = vtkm::cont::ArrayPortalToIteratorBegin(outPortal) + outStart;
vtkm::Id valuesPerChunk;
VTKM_OPENMP_DIRECTIVE(parallel default(none) shared(inIter, outIter, valuesPerChunk, numVals))
{
VTKM_OPENMP_DIRECTIVE(single)
{
// Evenly distribute full pages to all threads. We manually chunk the
// data here so that we can exploit std::copy's memmove optimizations.
vtkm::Id numChunks;
ComputeChunkSize(
numVals, omp_get_num_threads(), 8, sizeof(InValueT), numChunks, valuesPerChunk);
}
VTKM_OPENMP_DIRECTIVE(for schedule(static))
for (vtkm::Id i = 0; i < numVals; i += valuesPerChunk)
{
vtkm::Id chunkSize = std::min(numVals - i, valuesPerChunk);
DoCopy(inIter + i, outIter + i, chunkSize, isSame);
}
}
}
struct CopyIfHelper
{
vtkm::Id NumValues;
vtkm::Id NumThreads;
vtkm::Id ValueSize;
vtkm::Id NumChunks;
vtkm::Id ChunkSize;
std::vector<vtkm::Id> EndIds;
CopyIfHelper() = default;
void Initialize(vtkm::Id numValues, vtkm::Id valueSize)
{
this->NumValues = numValues;
this->NumThreads = omp_get_num_threads();
this->ValueSize = valueSize;
// Evenly distribute pages across the threads. We manually chunk the
// data here so that we can exploit std::copy's memmove optimizations.
ComputeChunkSize(
this->NumValues, this->NumThreads, 8, valueSize, this->NumChunks, this->ChunkSize);
this->EndIds.resize(this->NumChunks);
}
template <typename InIterT, typename StencilIterT, typename OutIterT, typename PredicateT>
void CopyIf(InIterT inIter,
StencilIterT stencilIter,
OutIterT outIter,
PredicateT pred,
vtkm::Id chunk)
{
vtkm::Id startPos = std::min(chunk * this->ChunkSize, this->NumValues);
vtkm::Id endPos = std::min((chunk + 1) * this->ChunkSize, this->NumValues);
vtkm::Id outPos = startPos;
for (vtkm::Id inPos = startPos; inPos < endPos; ++inPos)
{
if (pred(stencilIter[inPos]))
{
outIter[outPos++] = inIter[inPos];
}
}
this->EndIds[chunk] = outPos;
}
template <typename OutIterT>
vtkm::Id Reduce(OutIterT data)
{
vtkm::Id endPos = this->EndIds.front();
for (vtkm::Id i = 1; i < this->NumChunks; ++i)
{
vtkm::Id chunkStart = std::min(i * this->ChunkSize, this->NumValues);
vtkm::Id chunkEnd = this->EndIds[i];
vtkm::Id numValuesToCopy = chunkEnd - chunkStart;
if (numValuesToCopy > 0 && chunkStart != endPos)
{
std::copy(data + chunkStart, data + chunkEnd, data + endPos);
}
endPos += numValuesToCopy;
}
return endPos;
}
};
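// A minimal usage sketch for CopyIfHelper (illustrative only; inIter,
// stencilIter, outIter, pred, numValues, and ValueType are assumed to come
// from the caller, and the device adapter's actual call site may differ):
//
//   CopyIfHelper helper;
//   helper.Initialize(numValues, static_cast<vtkm::Id>(sizeof(ValueType)));
//   VTKM_OPENMP_DIRECTIVE(parallel for schedule(static))
//   for (vtkm::Id chunk = 0; chunk < helper.NumChunks; ++chunk)
//   {
//     helper.CopyIf(inIter, stencilIter, outIter, pred, chunk);
//   }
//   vtkm::Id numPassed = helper.Reduce(outIter);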
#ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION
// OpenMP only declares reduction operations for primitive types. This utility
// detects if a type T is supported.
template <typename T>
struct OpenMPReductionSupported : std::false_type
{
};
template <>
struct OpenMPReductionSupported<Int8> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<UInt8> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<Int16> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<UInt16> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<Int32> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<UInt32> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<Int64> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<UInt64> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<Float32> : std::true_type
{
};
template <>
struct OpenMPReductionSupported<Float64> : std::true_type
{
};
#else
template <typename T>
using OpenMPReductionSupported = std::false_type;
#endif // VTKM_OPENMP_USE_NATIVE_REDUCTION
struct ReduceHelper
{
// Generic implementation:
template <typename PortalT, typename ReturnType, typename Functor>
static ReturnType Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
{
internal::WrappedBinaryOperator<ReturnType, Functor> f(functorIn);
const vtkm::Id numVals = portal.GetNumberOfValues();
auto data = vtkm::cont::ArrayPortalToIteratorBegin(portal);
bool doParallel = false;
std::vector<ReturnType> threadData;
VTKM_OPENMP_DIRECTIVE(parallel default(none) firstprivate(f)
shared(data, threadData, doParallel))
{
int tid = omp_get_thread_num();
int numThreads = omp_get_num_threads();
VTKM_OPENMP_DIRECTIVE(single)
{
if (numVals >= numThreads * 2)
{
doParallel = true;
threadData.resize(numThreads);
}
}
if (doParallel)
{
// Use the first (numThreads*2) values for initializing:
ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
// Assign each thread chunks of the remaining values for local reduction
VTKM_OPENMP_DIRECTIVE(for schedule(static))
for (vtkm::Id i = numThreads * 2; i < numVals; i++)
{
accum = f(accum, data[i]);
}
threadData[tid] = accum;
}
} // end parallel
if (doParallel)
{
// do the final reduction serially:
for (size_t i = 0; i < threadData.size(); ++i)
{
init = f(init, threadData[i]);
}
}
else
{
// Not enough threads. Do the entire reduction in serial:
for (vtkm::Id i = 0; i < numVals; ++i)
{
init = f(init, data[i]);
}
}
return init;
}
#ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION
// Specialize for vtkm functors with OpenMP special cases:
#define VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString) \
template <typename PortalT, typename ReturnType> \
static ReturnType Execute( \
PortalT portal, ReturnType value, FunctorType functorIn, std::true_type) \
{ \
const vtkm::Id numValues = portal.GetNumberOfValues(); \
internal::WrappedBinaryOperator<ReturnType, FunctorType> f(functorIn); \
_Pragma(#PragmaString) for (vtkm::Id i = 0; i < numValues; ++i) \
{ \
value = f(value, portal.Get(i)); \
} \
return value; \
}
// Constructing the pragma string inside the _Pragma call doesn't work so
// we jump through a hoop:
#define VTKM_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) \
VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, "omp parallel for reduction(" #Operator ":value)")
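// For example, VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Add, +) generates an
// Execute(...) overload whose for-loop is preceded by
//   #pragma omp parallel for reduction(+:value)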
// + (Add, Sum)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Add, +)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Sum, +)
// * (Multiply, Product)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Multiply, *)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Product, *)
// - (Subtract)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Subtract, -)
// & (BitwiseAnd)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::BitwiseAnd, &)
// | (BitwiseOr)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::BitwiseOr, |)
// ^ (BitwiseXor)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::BitwiseXor, ^)
// && (LogicalAnd)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::LogicalAnd, &&)
// || (LogicalOr)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::LogicalOr, ||)
// min (Minimum)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Minimum, min)
// max (Maximum)
VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Maximum, max)
#undef VTKM_OPENMP_SPECIALIZE_REDUCE
#undef VTKM_OPENMP_SPECIALIZE_REDUCE1
#endif // VTKM_OPENMP_USE_NATIVE_REDUCTION
};
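// Reduces runs of adjacent values that share a key. For example (illustrative),
// with keys = { 1, 1, 2, 3, 3, 3 }, values = { 1, 2, 3, 4, 5, 6 }, and
// vtkm::Add(), the outputs are keys_output = { 1, 2, 3 } and
// values_output = { 3, 3, 15 }.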
template <typename KeysInArray,
typename ValuesInArray,
typename KeysOutArray,
typename ValuesOutArray,
typename BinaryFunctor>
void ReduceByKeyHelper(KeysInArray keysInArray,
ValuesInArray valuesInArray,
KeysOutArray keysOutArray,
ValuesOutArray valuesOutArray,
BinaryFunctor functor)
{
using KeyType = typename KeysInArray::ValueType;
using ValueType = typename ValuesInArray::ValueType;
const vtkm::Id numValues = keysInArray.GetNumberOfValues();
auto keysInPortal = keysInArray.PrepareForInput(DeviceAdapterTagOpenMP());
auto valuesInPortal = valuesInArray.PrepareForInput(DeviceAdapterTagOpenMP());
auto keysIn = vtkm::cont::ArrayPortalToIteratorBegin(keysInPortal);
auto valuesIn = vtkm::cont::ArrayPortalToIteratorBegin(valuesInPortal);
auto keysOutPortal = keysOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP());
auto valuesOutPortal = valuesOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP());
auto keysOut = vtkm::cont::ArrayPortalToIteratorBegin(keysOutPortal);
auto valuesOut = vtkm::cont::ArrayPortalToIteratorBegin(valuesOutPortal);
internal::WrappedBinaryOperator<ValueType, BinaryFunctor> f(functor);
vtkm::Id outIdx = 0;
VTKM_OPENMP_DIRECTIVE(parallel default(none) firstprivate(keysIn, valuesIn, keysOut, valuesOut, f)
shared(outIdx))
{
int tid = omp_get_thread_num();
int numThreads = omp_get_num_threads();
// Determine bounds for this thread's scan operation:
vtkm::Id chunkSize = (numValues + numThreads - 1) / numThreads;
vtkm::Id scanIdx = std::min(tid * chunkSize, numValues);
vtkm::Id scanEnd = std::min(scanIdx + chunkSize, numValues);
auto threadKeysBegin = keysOut + scanIdx;
auto threadValuesBegin = valuesOut + scanIdx;
auto threadKey = threadKeysBegin;
auto threadValue = threadValuesBegin;
// Reduce each thread's partition:
KeyType rangeKey;
ValueType rangeValue;
for (;;)
{
if (scanIdx < scanEnd)
{
rangeKey = keysIn[scanIdx];
rangeValue = valuesIn[scanIdx];
++scanIdx;
// Locate end of current range:
while (scanIdx < scanEnd && static_cast<KeyType>(keysIn[scanIdx]) == rangeKey)
{
rangeValue = f(rangeValue, valuesIn[scanIdx]);
++scanIdx;
}
*threadKey = rangeKey;
*threadValue = rangeValue;
++threadKey;
++threadValue;
}
else
{
break;
}
}
if (tid == 0)
{
outIdx = static_cast<vtkm::Id>(threadKey - threadKeysBegin);
}
// Combine the reduction results. Skip tid == 0, since it's already in
// the correct location:
for (int i = 1; i < numThreads; ++i)
{
// This barrier ensures that:
// 1) Threads remain synchronized through this final reduction loop.
// 2) The outIdx variable is initialized by thread 0.
// 3) All threads have reduced their partitions.
VTKM_OPENMP_DIRECTIVE(barrier)
if (tid == i)
{
// Check if the previous thread's last key matches our first:
if (outIdx > 0 && threadKeysBegin < threadKey && keysOut[outIdx - 1] == *threadKeysBegin)
{
valuesOut[outIdx - 1] = f(valuesOut[outIdx - 1], *threadValuesBegin);
++threadKeysBegin;
++threadValuesBegin;
}
// Copy reduced partition to final location (if needed)
if (threadKeysBegin < threadKey && threadKeysBegin != keysOut + outIdx)
{
std::copy(threadKeysBegin, threadKey, keysOut + outIdx);
std::copy(threadValuesBegin, threadValue, valuesOut + outIdx);
}
outIdx += static_cast<vtkm::Id>(threadKey - threadKeysBegin);
} // end tid == i
} // end combine reduction
} // end parallel
keysOutArray.Shrink(outIdx);
valuesOutArray.Shrink(outIdx);
}
template <typename IterT, typename RawPredicateT>
struct UniqueHelper
{
using ValueType = typename std::iterator_traits<IterT>::value_type;
using PredicateT = internal::WrappedBinaryOperator<bool, RawPredicateT>;
struct Node
{
vtkm::Id2 InputRange{ -1, -1 };
vtkm::Id2 OutputRange{ -1, -1 };
// Pad the node out to the size of a cache line to prevent false sharing:
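// For example, assuming 64-bit vtkm::Id and the 64-byte CACHE_LINE_SIZE above:
// DataSize is 32, NumCacheLines is 1, and PaddingSize is 32, so sizeof(Node)
// comes out to one full cache line (64 bytes).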
static constexpr size_t DataSize = 2 * sizeof(vtkm::Id2);
static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, CACHE_LINE_SIZE);
static constexpr size_t PaddingSize = NumCacheLines * CACHE_LINE_SIZE - DataSize;
unsigned char Padding[PaddingSize];
};
IterT Data;
vtkm::Id NumValues;
PredicateT Predicate;
vtkm::Id LeafSize;
std::vector<Node> Nodes;
size_t NextNode;
UniqueHelper(IterT iter, vtkm::Id numValues, RawPredicateT pred)
: Data(iter)
, NumValues(numValues)
, Predicate(pred)
, LeafSize(0)
, NextNode(0)
{
}
vtkm::Id Execute()
{
vtkm::Id outSize = 0;
VTKM_OPENMP_DIRECTIVE(parallel default(shared))
{
VTKM_OPENMP_DIRECTIVE(single)
{
this->Prepare();
// Kick off task-based divide-and-conquer uniquification:
Node* rootNode = this->AllocNode();
rootNode->InputRange = vtkm::Id2(0, this->NumValues);
this->Uniquify(rootNode);
outSize = rootNode->OutputRange[1] - rootNode->OutputRange[0];
}
}
return outSize;
}
private:
void Prepare()
{
// Figure out how many values each thread should handle:
int numThreads = omp_get_num_threads();
vtkm::Id chunksPerThread = 8;
vtkm::Id numChunks;
ComputeChunkSize(
this->NumValues, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize);
// Compute an upper-bound of the number of nodes in the tree:
size_t numNodes = numChunks;
while (numChunks > 1)
{
numChunks = (numChunks + 1) / 2;
numNodes += numChunks;
}
this->Nodes.resize(numNodes);
this->NextNode = 0;
}
Node* AllocNode()
{
size_t nodeIdx;
// GCC emits a false positive "value computed but not used" for this block:
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-value"
VTKM_OPENMP_DIRECTIVE(atomic capture)
{
nodeIdx = this->NextNode;
++this->NextNode;
}
#pragma GCC diagnostic pop
VTKM_ASSERT(nodeIdx < this->Nodes.size());
return &this->Nodes[nodeIdx];
}
bool IsLeaf(const vtkm::Id2& range) { return (range[1] - range[0]) <= this->LeafSize; }
// Not a strict midpoint, but ensures that the size of the first range is
// always a multiple of the leaf size.
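// For example (illustrative), with range = { 0, 1000 } and LeafSize = 128, the
// midpoint is CeilDivide(500, 128) * 128 = 512.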
vtkm::Id ComputeMidpoint(const vtkm::Id2& range)
{
const vtkm::Id n = range[1] - range[0];
const vtkm::Id np = this->LeafSize;
return CeilDivide(n / 2, np) * np + range[0];
}
void Uniquify(Node* node)
{
if (!this->IsLeaf(node->InputRange))
{
vtkm::Id midpoint = this->ComputeMidpoint(node->InputRange);
Node* right = this->AllocNode();
Node* left = this->AllocNode();
right->InputRange = vtkm::Id2(midpoint, node->InputRange[1]);
// Intel compilers seem to have trouble following the 'this' pointer
// when launching tasks, resulting in a corrupt task environment.
// Explicitly copying the pointer into a local variable seems to fix this.
auto explicitThis = this;
VTKM_OPENMP_DIRECTIVE(taskgroup)
{
VTKM_OPENMP_DIRECTIVE(task) { explicitThis->Uniquify(right); }
left->InputRange = vtkm::Id2(node->InputRange[0], midpoint);
this->Uniquify(left);
} // end taskgroup. Both sides of the tree will be completed here.
// Merge the right child's unique values onto the end of the left child's
// output, dropping a duplicate at the boundary if needed:
if (this->Predicate(this->Data[left->OutputRange[1] - 1], this->Data[right->OutputRange[0]]))
{
++right->OutputRange[0];
}
vtkm::Id numVals = right->OutputRange[1] - right->OutputRange[0];
DoCopy(this->Data + right->OutputRange[0], this->Data + left->OutputRange[1], numVals);
node->OutputRange[0] = left->OutputRange[0];
node->OutputRange[1] = left->OutputRange[1] + numVals;
}
else
{
auto start = this->Data + node->InputRange[0];
auto end = this->Data + node->InputRange[1];
end = std::unique(start, end, this->Predicate);
node->OutputRange[0] = node->InputRange[0];
node->OutputRange[1] = node->InputRange[0] + static_cast<vtkm::Id>(end - start);
}
}
};
}
}
} // end namespace vtkm::cont::openmp
#endif // vtk_m_cont_openmp_internal_FunctorsOpenMP_h

@ -0,0 +1,271 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#include <vtkm/cont/openmp/internal/FunctorsOpenMP.h>
#include <vtkm/cont/internal/FunctorsGeneral.h>
#include <vtkm/Types.h>
#include <vtkm/cont/ArrayHandle.h>
#include <omp.h>
#include <iterator>
namespace vtkm
{
namespace cont
{
namespace openmp
{
namespace sort
{
namespace quick
{
template <typename IterType, typename RawBinaryCompare>
struct QuickSorter
{
using BinaryCompare = vtkm::cont::internal::WrappedBinaryOperator<bool, RawBinaryCompare>;
using ValueType = typename std::iterator_traits<IterType>::value_type;
IterType Data;
BinaryCompare Compare;
vtkm::Id SerialSize;
QuickSorter(IterType iter, RawBinaryCompare comp)
: Data(iter)
, Compare(comp)
, SerialSize(0)
{
}
void Execute(const vtkm::Id2 range)
{
VTKM_OPENMP_DIRECTIVE(parallel default(shared))
{
VTKM_OPENMP_DIRECTIVE(single)
{
this->Prepare(range);
this->Sort(range);
}
}
}
private:
void Prepare(const vtkm::Id2 /*range*/)
{
// Rough benchmarking on a 4-core + 4HT processor shows that this sort is
// most efficient (within 5% of the TBB sort) when we switch to a serial
// implementation once a partition contains fewer than 32K keys.
this->SerialSize = 32768;
}
vtkm::Pair<vtkm::Id, ValueType> MedianOf3(const vtkm::Pair<vtkm::Id, ValueType>& v1,
const vtkm::Pair<vtkm::Id, ValueType>& v2,
const vtkm::Pair<vtkm::Id, ValueType>& v3) const
{
if (this->Compare(v1.second, v2.second))
{ // v1 < v2
if (this->Compare(v1.second, v3.second))
{ // v1 < v3
if (this->Compare(v2.second, v3.second))
{ // v1 < v2 < v3
return v2;
}
else // v3 < v2
{ // v1 < v3 < v2
return v3;
}
}
else // v3 < v1
{ // v3 < v1 < v2
return v1;
}
}
else
{ // v2 < v1
if (this->Compare(v2.second, v3.second))
{ // v2 < v3
if (this->Compare(v1.second, v3.second))
{ // v2 < v1 < v3
return v1;
}
else
{ // v2 < v3 < v1
return v3;
}
}
else
{ // v3 < v2 < v1
return v2;
}
}
}
vtkm::Pair<vtkm::Id, ValueType> MedianOf3(const vtkm::Id ids[3]) const
{
return this->MedianOf3(vtkm::Pair<vtkm::Id, ValueType>(ids[0], this->Data[ids[0]]),
vtkm::Pair<vtkm::Id, ValueType>(ids[1], this->Data[ids[1]]),
vtkm::Pair<vtkm::Id, ValueType>(ids[2], this->Data[ids[2]]));
}
vtkm::Pair<vtkm::Id, ValueType> PseudoMedianOf9(const vtkm::Id ids[9]) const
{
return this->MedianOf3(
this->MedianOf3(ids), this->MedianOf3(ids + 3), this->MedianOf3(ids + 6));
}
// Approximate the median of the range and return its index and value.
vtkm::Pair<vtkm::Id, ValueType> SelectPivot(const vtkm::Id2 range) const
{
const vtkm::Id numVals = range[1] - range[0];
assert(numVals >= 9);
// Pseudorandomize the pivot locations to avoid issues with periodic data
// (evenly sampling inputs with periodic values tends to cause the same
// value to be obtained for all samples)
const vtkm::Id seed = range[0] * 3 / 2 + range[1] * 11 / 3 + numVals * 10 / 7;
const vtkm::Id delta = (numVals / 9) * 4 / 3;
vtkm::Id sampleLocations[9] = {
range[0] + ((seed + 0 * delta) % numVals), range[0] + ((seed + 1 * delta) % numVals),
range[0] + ((seed + 2 * delta) % numVals), range[0] + ((seed + 3 * delta) % numVals),
range[0] + ((seed + 4 * delta) % numVals), range[0] + ((seed + 5 * delta) % numVals),
range[0] + ((seed + 6 * delta) % numVals), range[0] + ((seed + 7 * delta) % numVals),
range[0] + ((seed + 8 * delta) % numVals)
};
return this->PseudoMedianOf9(sampleLocations);
}
// Select a pivot and partition data with it, returning the final location of
// the pivot element(s). We use Bentley-McIlroy three-way partitioning to
// improve handling of duplicate keys, so the pivot "location" is actually
// a range of identical keys, hence the vtkm::Id2 return type, which marks
// the [begin, end) of the pivot range.
vtkm::Id2 PartitionData(const vtkm::Id2 range)
{
using namespace std; // For ADL swap
const vtkm::Pair<vtkm::Id, ValueType> pivotData = this->SelectPivot(range);
const vtkm::Id& origPivotIdx = pivotData.first;
const ValueType& pivotVal = pivotData.second;
// Move the pivot to the end of the block while we partition the rest:
swap(this->Data[origPivotIdx], this->Data[range[1] - 1]);
// Indices of the last partitioned keys:
vtkm::Id2 dataCursors(range[0] - 1, range[1] - 1);
// Indices of the start/end of the keys equal to the pivot:
vtkm::Id2 pivotCursors(dataCursors);
for (;;)
{
// Advance the data cursors past all keys that are already partitioned:
while (this->Compare(this->Data[++dataCursors[0]], pivotVal))
;
while (this->Compare(pivotVal, this->Data[--dataCursors[1]]) && dataCursors[1] > range[0])
;
// The range is partitioned once the cursors have crossed:
if (dataCursors[0] >= dataCursors[1])
{
break;
}
// Both dataCursors are pointing at incorrectly partitioned keys. Swap
// them to place them in the proper partitions:
swap(this->Data[dataCursors[0]], this->Data[dataCursors[1]]);
// If the elements we just swapped are actually equivalent to the pivot
// value, move them to the pivot storage locations:
if (!this->Compare(this->Data[dataCursors[0]], pivotVal))
{
++pivotCursors[0];
swap(this->Data[pivotCursors[0]], this->Data[dataCursors[0]]);
}
if (!this->Compare(pivotVal, this->Data[dataCursors[1]]))
{
--pivotCursors[1];
swap(this->Data[pivotCursors[1]], this->Data[dataCursors[1]]);
}
}
// Data is now partitioned as:
// | Equal | Less | Greater | Equal |
// Move the equal keys to the middle for the final partitioning:
// | Less | Equal | Greater |
// First the original pivot value at the end:
swap(this->Data[range[1] - 1], this->Data[dataCursors[0]]);
// Update the cursors to either side of the pivot:
dataCursors = vtkm::Id2(dataCursors[0] - 1, dataCursors[0] + 1);
for (vtkm::Id i = range[0]; i < pivotCursors[0]; ++i, --dataCursors[0])
{
swap(this->Data[i], this->Data[dataCursors[0]]);
}
for (vtkm::Id i = range[1] - 2; i > pivotCursors[1]; --i, ++dataCursors[1])
{
swap(this->Data[i], this->Data[dataCursors[1]]);
}
// Adjust the cursors so we can use them to construct the regions for the
// recursive call:
++dataCursors[0];
return dataCursors;
}
void Sort(const vtkm::Id2 range)
{
const vtkm::Id numVals = range[1] - range[0];
if (numVals <= this->SerialSize)
{
std::sort(this->Data + range[0], this->Data + range[1], this->Compare);
return;
}
const vtkm::Id2 pivots = this->PartitionData(range);
const vtkm::Id2 lhRange = vtkm::Id2(range[0], pivots[0]);
const vtkm::Id2 rhRange = vtkm::Id2(pivots[1], range[1]);
// Intel compilers seem to have trouble following the 'this' pointer
// when launching tasks, resulting in a corrupt task environment.
// Explicitly copying the pointer into a local variable seems to fix this.
auto explicitThis = this;
VTKM_OPENMP_DIRECTIVE(task default(none) firstprivate(rhRange, explicitThis))
{
explicitThis->Sort(rhRange);
}
this->Sort(lhRange);
}
};
}
} // end namespace sort::quick
}
}
} // end namespace vtkm::cont::openmp

@ -0,0 +1,88 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/internal/ParallelRadixSort.h>
#include <omp.h>
namespace vtkm
{
namespace cont
{
namespace openmp
{
namespace sort
{
namespace radix
{
struct RadixThreaderOpenMP
{
size_t GetAvailableCores() const
{
size_t result;
if (omp_in_parallel())
{
result = static_cast<size_t>(omp_get_num_threads());
}
else
{
#pragma omp parallel
{
result = static_cast<size_t>(omp_get_num_threads());
}
}
return result;
}
template <typename TaskType>
void RunParentTask(TaskType task)
{
assert(!omp_in_parallel());
#pragma omp parallel default(none) shared(task)
{
#pragma omp single
{
task();
}
} // Implied barrier ensures that child tasks will finish.
}
template <typename TaskType, typename ThreadData>
void RunChildTasks(ThreadData, TaskType left, TaskType right)
{
assert(omp_in_parallel());
#pragma omp task default(none) firstprivate(right)
{
right();
}
// Execute the left task in the existing thread.
left();
}
};
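// Illustrative sketch of how a divide-and-conquer sorter might drive this
// threader (SortTask, numValues, threadData, and the half ranges are
// placeholders, not part of VTK-m):
//
//   struct SortTask
//   {
//     vtkm::Id2 Range;
//     void operator()() const { /* sort Range, recursing via RunChildTasks */ }
//   };
//
//   RadixThreaderOpenMP threader;
//   threader.RunParentTask(SortTask{ vtkm::Id2(0, numValues) });
//   // ...and when a task splits its range in two:
//   //   threader.RunChildTasks(threadData, SortTask{ leftHalf }, SortTask{ rightHalf });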
VTKM_INSTANTIATE_RADIX_SORT_FOR_THREADER(RadixThreaderOpenMP)
}
} // end namespace sort::radix
}
}
} // end namespace vtkm::cont::openmp

@ -0,0 +1,44 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_internal_ParallelRadixSortOpenMP_h
#define vtk_m_cont_openmp_internal_ParallelRadixSortOpenMP_h
#include <vtkm/cont/internal/ParallelRadixSortInterface.h>
namespace vtkm
{
namespace cont
{
namespace openmp
{
namespace sort
{
namespace radix
{
VTKM_DECLARE_RADIX_SORT()
}
}
}
}
} // end namespace vtkm::cont::openmp::sort::radix
#endif // vtk_m_cont_openmp_internal_ParallelRadixSortOpenMP_h

@ -0,0 +1,489 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
#include <vtkm/cont/openmp/internal/FunctorsOpenMP.h>
#include <vtkm/cont/internal/FunctorsGeneral.h>
#include <vtkm/Types.h>
#include <vtkm/cont/ArrayHandle.h>
#include <omp.h>
namespace vtkm
{
namespace cont
{
namespace openmp
{
namespace scan
{
enum class ChildType
{
Left,
Right
};
// Generic implementation of a modified Ladner & Fischer (1977) "adder"
// algorithm, used as the backbone of the exclusive/inclusive scans. The
// language in the comments is specific to computing a sum, but the
// implementation should be generic enough for any scan operation.
//
// The basic idea is that a tree structure is used to partition the input into
// sets of LeafSize. Each leaf of the tree is processed in two stages: First,
// the sum of each leaf is computed, and this information is pushed up the tree
// to compute the sum of each node's child leaves. Then the partial sum at the
// start of each node is computed and pushed down the tree (the "carry"
// values). In the second pass through each leaf's data, these partial sums are
// used to compute the final output from the carry value and the input data.
//
// The passes will likely overlap due to the "leftEdge" optimizations, which
// allow each leaf to start the second pass as soon as the first pass of all
// previous leaves is completed. Additionally, the first leaf in the data will
// combine both passes into one, computing the final output data while
// generating its sum for the communication stage.
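// For example (illustrative), an inclusive Add-scan of { 1, 2, 3, 4, 5, 6, 7, 8 }
// with a leaf size of 2 forms leaves with sums 3, 7, 11, 15. The first pass
// pushes those sums up the tree; the second pass pushes the carries 0, 3, 10, 21
// back down, and the leaves then write { 1, 3 }, { 6, 10 }, { 15, 21 }, { 28, 36 }.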
template <typename ScanBody>
struct Adder : public ScanBody
{
template <typename NodeImpl>
struct NodeWrapper : public NodeImpl
{
// Range of IDs this node represents
vtkm::Id2 Range{ -1, -1 };
// Connections:
NodeWrapper* Parent{ nullptr };
NodeWrapper* Left{ nullptr };
NodeWrapper* Right{ nullptr };
// Special flag to mark nodes on the far left edge of the tree. This allows
// various optimizations that start the second pass sooner on some ranges.
bool LeftEdge{ false };
// Pad the node out to the size of a cache line to prevent false sharing:
static constexpr size_t DataSize =
sizeof(NodeImpl) + sizeof(vtkm::Id2) + 3 * sizeof(NodeWrapper*) + sizeof(bool);
static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, CACHE_LINE_SIZE);
static constexpr size_t PaddingSize = NumCacheLines * CACHE_LINE_SIZE - DataSize;
unsigned char Padding[PaddingSize];
};
using Node = NodeWrapper<typename ScanBody::Node>;
using ValueType = typename ScanBody::ValueType;
vtkm::Id LeafSize;
std::vector<Node> Nodes;
size_t NextNode;
// Use ScanBody's ctor:
using ScanBody::ScanBody;
// Returns the total array sum:
ValueType Execute(const vtkm::Id2& range)
{
Node* rootNode = nullptr;
VTKM_OPENMP_DIRECTIVE(parallel default(shared))
{
VTKM_OPENMP_DIRECTIVE(single)
{
// Allocate nodes, prep metadata:
this->Prepare(range);
// Compute the partition and node sums:
rootNode = this->AllocNode();
rootNode->Range = range;
rootNode->LeftEdge = true;
ScanBody::InitializeRootNode(rootNode);
this->Scan(rootNode);
} // end single
} // end parallel
return rootNode ? ScanBody::GetFinalResult(rootNode)
: vtkm::TypeTraits<ValueType>::ZeroInitialization();
}
private:
// Returns the next available node in a thread-safe manner.
Node* AllocNode()
{
size_t nodeIdx;
// GCC emits a false positive "value computed but not used" for this block:
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-value"
VTKM_OPENMP_DIRECTIVE(atomic capture)
{
nodeIdx = this->NextNode;
++this->NextNode;
}
#pragma GCC diagnostic pop
VTKM_ASSERT(nodeIdx < this->Nodes.size());
return &this->Nodes[nodeIdx];
}
// Does the range represent a leaf node?
bool IsLeaf(const vtkm::Id2& range) const { return (range[1] - range[0]) <= this->LeafSize; }
// Used to split ranges. Ensures that the size of the first range is always a
// multiple of LeafSize, when possible.
vtkm::Id ComputeMidpoint(const vtkm::Id2& range) const
{
const vtkm::Id n = range[1] - range[0];
const vtkm::Id np = this->LeafSize;
return (((n / 2) + (np - 1)) / np) * np + range[0];
}
void Prepare(const vtkm::Id2& range)
{
// Figure out how many values each thread should handle:
vtkm::Id numVals = range[1] - range[0];
int numThreads = omp_get_num_threads();
vtkm::Id chunksPerThread = 8;
vtkm::Id numChunks;
ComputeChunkSize(
numVals, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize);
// Compute an upper-bound of the number of nodes in the tree:
size_t numNodes = numChunks;
while (numChunks > 1)
{
numChunks = (numChunks + 1) / 2;
numNodes += numChunks;
}
this->Nodes.resize(numNodes);
this->NextNode = 0;
}
// Build the tree and compute the sums:
void Scan(Node* node)
{
if (!this->IsLeaf(node->Range))
{ // split range:
vtkm::Id midpoint = this->ComputeMidpoint(node->Range);
Node* right = this->AllocNode();
right->Parent = node;
node->Right = right;
right->Range = vtkm::Id2(midpoint, node->Range[1]);
ScanBody::InitializeChildNode(right, node, ChildType::Right, false);
// Intel compilers seem to have trouble following the 'this' pointer
// when launching tasks, resulting in a corrupt task environment.
// Explicitly copying the pointer into a local variable seems to fix this.
auto explicitThis = this;
VTKM_OPENMP_DIRECTIVE(taskgroup)
{
VTKM_OPENMP_DIRECTIVE(task) { explicitThis->Scan(right); } // end right task
Node* left = this->AllocNode();
left->Parent = node;
node->Left = left;
left->Range = vtkm::Id2(node->Range[0], midpoint);
left->LeftEdge = node->LeftEdge;
ScanBody::InitializeChildNode(left, node, ChildType::Left, left->LeftEdge);
this->Scan(left);
} // end task group. Both l/r sums will be finished here.
ScanBody::CombineSummaries(node, node->Left, node->Right);
if (node->LeftEdge)
{
this->UpdateOutput(node);
}
}
else
{ // Compute sums:
ScanBody::ComputeSummary(node, node->Range, node->LeftEdge);
}
}
void UpdateOutput(Node* node)
{
if (node->Left != nullptr)
{
assert(node->Right != nullptr);
ScanBody::PropagateSummaries(node, node->Left, node->Right, node->LeftEdge);
// if this node is on the left edge, we know that the left child's
// output is already updated, so only descend to the right:
if (node->LeftEdge)
{
this->UpdateOutput(node->Right);
}
else // Otherwise descend into both:
{
// Intel compilers seem to have trouble following the 'this' pointer
// when launching tasks, resulting in a corrupt task environment.
// Explicitly copying the pointer into a local variable seems to fix
// this.
auto explicitThis = this;
// no taskgroup/sync needed other than the final barrier of the parallel
// section.
VTKM_OPENMP_DIRECTIVE(task) { explicitThis->UpdateOutput(node->Right); } // end task
this->UpdateOutput(node->Left);
}
}
else
{
ScanBody::UpdateOutput(node, node->Range, node->LeftEdge);
}
}
};
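// ScanBody for exclusive scans: output[i] combines the initial value with all
// inputs before index i. For example (illustrative), an exclusive Add-scan of
// { 1, 2, 3, 4 } with initial value 0 writes { 0, 1, 3, 6 } and the scan as a
// whole reduces to 10.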
template <typename InPortalT, typename OutPortalT, typename RawFunctorT>
struct ScanExclusiveBody
{
using ValueType = typename InPortalT::ValueType;
using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>;
InPortalT InPortal;
OutPortalT OutPortal;
FunctorType Functor;
ValueType InitialValue;
struct Node
{
// Sum of all values in range
ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
// The sum of all elements prior to this node's range
ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
};
ScanExclusiveBody(const InPortalT& inPortal,
const OutPortalT& outPortal,
const RawFunctorT& functor,
const ValueType& init)
: InPortal(inPortal)
, OutPortal(outPortal)
, Functor(functor)
, InitialValue(init)
{
}
// Initialize the root of the node tree
void InitializeRootNode(Node* /*root*/) {}
void InitializeChildNode(Node* /*node*/,
const Node* /*parent*/,
ChildType /*type*/,
bool /*leftEdge*/)
{
}
void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge)
{
auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
node->Sum = input[range[0]];
// If this block is on the left edge, we can update the output while we
// compute the sum:
if (leftEdge)
{
// Set leftEdge arg to false to force the update:
node->Sum = UpdateOutputImpl(node, range, false, true);
}
else // Otherwise, only compute the sum and update the output in pass 2.
{
for (vtkm::Id i = range[0] + 1; i < range[1]; ++i)
{
node->Sum = this->Functor(node->Sum, input[i]);
}
}
}
void CombineSummaries(Node* parent, const Node* left, const Node* right)
{
parent->Sum = this->Functor(left->Sum, right->Sum);
}
void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge)
{
left->Carry = parent->Carry;
right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum);
}
void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge)
{
this->UpdateOutputImpl(node, range, leftEdge, false);
}
ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useInit)
{
if (skip)
{
// Do nothing; this was already done in ComputeSummary.
return vtkm::TypeTraits<ValueType>::ZeroInitialization();
}
auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal);
// Be careful with the order input/output are modified. They might be
// pointing at the same data:
ValueType carry = useInit ? this->InitialValue : node->Carry;
vtkm::Id end = range[1];
for (vtkm::Id i = range[0]; i < end; ++i)
{
output[i] = this->Functor(carry, input[i]);
using std::swap; // Enable ADL
swap(output[i], carry);
}
return carry;
}
// Compute the final sum from the node's metadata:
ValueType GetFinalResult(const Node* node) const { return this->Functor(node->Sum, node->Carry); }
};
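// ScanBody for inclusive scans: output[i] combines all inputs up to and
// including index i. For example (illustrative), an inclusive Add-scan of
// { 1, 2, 3, 4 } writes { 1, 3, 6, 10 } and returns 10.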
template <typename InPortalT, typename OutPortalT, typename RawFunctorT>
struct ScanInclusiveBody
{
using ValueType = typename InPortalT::ValueType;
using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>;
InPortalT InPortal;
OutPortalT OutPortal;
FunctorType Functor;
struct Node
{
// Sum of all values in range
ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
// The sum of all elements prior to this node's range
ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
};
ScanInclusiveBody(const InPortalT& inPortal,
const OutPortalT& outPortal,
const RawFunctorT& functor)
: InPortal(inPortal)
, OutPortal(outPortal)
, Functor(functor)
{
}
// Initialize the root of the node tree
void InitializeRootNode(Node*)
{
// no-op
}
void InitializeChildNode(Node*, const Node*, ChildType, bool)
{
// no-op
}
void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge)
{
// If this block is on the left edge, we can update the output while we
// compute the sum:
if (leftEdge)
{
node->Sum = UpdateOutputImpl(node, range, false, false);
}
else // Otherwise, only compute the sum and update the output in pass 2.
{
auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
node->Sum = input[range[0]];
for (vtkm::Id i = range[0] + 1; i < range[1]; ++i)
{
node->Sum = this->Functor(node->Sum, input[i]);
}
}
}
void CombineSummaries(Node* parent, const Node* left, const Node* right)
{
parent->Sum = this->Functor(left->Sum, right->Sum);
}
void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge)
{
left->Carry = parent->Carry;
right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum);
}
void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge)
{
UpdateOutputImpl(node, range, leftEdge, true);
}
ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useCarry)
{
if (skip)
{
// Do nothing; this was already done in ComputeSummary.
return vtkm::TypeTraits<ValueType>::ZeroInitialization();
}
auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal);
vtkm::Id start = range[0];
vtkm::Id end = range[1];
ValueType carry = node->Carry;
// Initialize with the first value if this is the first range:
if (!useCarry && start < end)
{
carry = input[start];
output[start] = carry;
++start;
}
for (vtkm::Id i = start; i < end; ++i)
{
output[i] = this->Functor(carry, input[i]);
carry = output[i];
}
return output[end - 1];
}
// Compute the final sum from the node's metadata:
ValueType GetFinalResult(const Node* node) const { return node->Sum; }
};
} // end namespace scan
template <typename InPortalT, typename OutPortalT, typename FunctorT>
using ScanExclusiveHelper = scan::Adder<scan::ScanExclusiveBody<InPortalT, OutPortalT, FunctorT>>;
template <typename InPortalT, typename OutPortalT, typename FunctorT>
using ScanInclusiveHelper = scan::Adder<scan::ScanInclusiveBody<InPortalT, OutPortalT, FunctorT>>;
}
}
} // end namespace vtkm::cont::openmp

@ -0,0 +1,251 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h>
#include <vtkm/cont/openmp/internal/FunctorsOpenMP.h>
#include <vtkm/cont/openmp/internal/ParallelQuickSortOpenMP.h>
#include <vtkm/cont/openmp/internal/ParallelRadixSortOpenMP.h>
#include <vtkm/BinaryPredicates.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ArrayHandleIndex.h>
#include <vtkm/cont/ArrayHandleZip.h>
#include <omp.h>
namespace vtkm
{
namespace cont
{
namespace openmp
{
namespace sort
{
// Forward declare entry points (See stack overflow discussion 7255281 --
// templated overloads of template functions are not specializations, and will
// be resolved during the first phase of two-phase lookup).
template <typename T, typename Container, class BinaryCompare>
void parallel_sort(vtkm::cont::ArrayHandle<T, Container>&, BinaryCompare);
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>&,
vtkm::cont::ArrayHandle<U, StorageU>&,
BinaryCompare);
// Quicksort values:
template <typename HandleType, class BinaryCompare>
void parallel_sort(HandleType& values,
BinaryCompare binary_compare,
vtkm::cont::internal::radix::PSortTag)
{
auto portal = values.PrepareForInPlace(DeviceAdapterTagOpenMP());
auto iter = vtkm::cont::ArrayPortalToIteratorBegin(portal);
vtkm::Id2 range(0, values.GetNumberOfValues());
using IterType = typename std::decay<decltype(iter)>::type;
using Sorter = quick::QuickSorter<IterType, BinaryCompare>;
Sorter sorter(iter, binary_compare);
sorter.Execute(range);
}
// Radix sort values:
template <typename T, typename StorageT, class BinaryCompare>
void parallel_sort(vtkm::cont::ArrayHandle<T, StorageT>& values,
BinaryCompare binary_compare,
vtkm::cont::internal::radix::RadixSortTag)
{
auto c = vtkm::cont::internal::radix::get_std_compare(binary_compare, T{});
radix::parallel_radix_sort(
values.GetStorage().GetArray(), static_cast<std::size_t>(values.GetNumberOfValues()), c);
}
// Value sort -- static switch between quicksort & radix sort
template <typename T, typename Container, class BinaryCompare>
void parallel_sort(vtkm::cont::ArrayHandle<T, Container>& values, BinaryCompare binary_compare)
{
using namespace vtkm::cont::internal::radix;
using SortAlgorithmTag = typename sort_tag_type<T, Container, BinaryCompare>::type;
parallel_sort(values, binary_compare, SortAlgorithmTag{});
}
// Quicksort by key:
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare,
vtkm::cont::internal::radix::PSortTag)
{
using KeyType = vtkm::cont::ArrayHandle<T, StorageT>;
constexpr bool larger_than_64bits = sizeof(U) > sizeof(vtkm::Int64);
if (larger_than_64bits)
{
/// More efficient sort:
/// Move value indices while sorting, then reorder the value array at the end.
using ValueType = vtkm::cont::ArrayHandle<U, StorageU>;
using IndexType = vtkm::cont::ArrayHandle<vtkm::Id>;
using ZipHandleType = vtkm::cont::ArrayHandleZip<KeyType, IndexType>;
IndexType indexArray;
ValueType valuesScattered;
const vtkm::Id size = values.GetNumberOfValues();
// Generate an in-memory index array:
{
auto handle = ArrayHandleIndex(keys.GetNumberOfValues());
auto inputPortal = handle.PrepareForInput(DeviceAdapterTagOpenMP());
auto outputPortal =
indexArray.PrepareForOutput(keys.GetNumberOfValues(), DeviceAdapterTagOpenMP());
openmp::CopyHelper(inputPortal, outputPortal, 0, 0, keys.GetNumberOfValues());
}
// Sort the keys and indices:
ZipHandleType zipHandle = vtkm::cont::make_ArrayHandleZip(keys, indexArray);
parallel_sort(zipHandle,
vtkm::cont::internal::KeyCompare<T, vtkm::Id, BinaryCompare>(binary_compare),
vtkm::cont::internal::radix::PSortTag());
// Permute the values to their sorted locations:
{
auto valuesInPortal = values.PrepareForInput(DeviceAdapterTagOpenMP());
auto indexPortal = indexArray.PrepareForInput(DeviceAdapterTagOpenMP());
auto valuesOutPortal = valuesScattered.PrepareForOutput(size, DeviceAdapterTagOpenMP());
VTKM_OPENMP_DIRECTIVE(parallel for
default(none)
firstprivate(valuesInPortal, indexPortal, valuesOutPortal)
schedule(static))
for (vtkm::Id i = 0; i < size; ++i)
{
valuesOutPortal.Set(i, valuesInPortal.Get(indexPortal.Get(i)));
}
}
// Copy the values back into the input array:
{
auto inputPortal = valuesScattered.PrepareForInput(DeviceAdapterTagOpenMP());
auto outputPortal =
values.PrepareForOutput(valuesScattered.GetNumberOfValues(), DeviceAdapterTagOpenMP());
openmp::CopyHelper(inputPortal, outputPortal, 0, 0, size);
}
}
else
{
using ValueType = vtkm::cont::ArrayHandle<U, StorageU>;
using ZipHandleType = vtkm::cont::ArrayHandleZip<KeyType, ValueType>;
ZipHandleType zipHandle = vtkm::cont::make_ArrayHandleZip(keys, values);
parallel_sort(zipHandle,
vtkm::cont::internal::KeyCompare<T, U, BinaryCompare>(binary_compare),
vtkm::cont::internal::radix::PSortTag{});
}
}
// Radix sort by key:
template <typename T, typename StorageT, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<vtkm::Id, StorageU>& values,
BinaryCompare binary_compare,
vtkm::cont::internal::radix::RadixSortTag)
{
using namespace vtkm::cont::internal::radix;
auto c = get_std_compare(binary_compare, T{});
radix::parallel_radix_sort_key_values(keys.GetStorage().GetArray(),
values.GetStorage().GetArray(),
static_cast<std::size_t>(keys.GetNumberOfValues()),
c);
}
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare,
vtkm::cont::internal::radix::RadixSortTag)
{
using KeyType = vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>;
using ValueType = vtkm::cont::ArrayHandle<U, vtkm::cont::StorageTagBasic>;
using IndexType = vtkm::cont::ArrayHandle<vtkm::Id, vtkm::cont::StorageTagBasic>;
using ZipHandleType = vtkm::cont::ArrayHandleZip<KeyType, IndexType>;
IndexType indexArray;
ValueType valuesScattered;
const vtkm::Id size = values.GetNumberOfValues();
{
auto handle = ArrayHandleIndex(keys.GetNumberOfValues());
auto inputPortal = handle.PrepareForInput(DeviceAdapterTagOpenMP());
auto outputPortal =
indexArray.PrepareForOutput(keys.GetNumberOfValues(), DeviceAdapterTagOpenMP());
openmp::CopyHelper(inputPortal, outputPortal, 0, 0, keys.GetNumberOfValues());
}
const vtkm::Id valuesBytes = static_cast<vtkm::Id>(sizeof(T)) * keys.GetNumberOfValues();
if (valuesBytes > static_cast<vtkm::Id>(vtkm::cont::internal::radix::MIN_BYTES_FOR_PARALLEL))
{
parallel_sort_bykey(keys, indexArray, binary_compare);
}
else
{
ZipHandleType zipHandle = vtkm::cont::make_ArrayHandleZip(keys, indexArray);
parallel_sort(zipHandle,
vtkm::cont::internal::KeyCompare<T, vtkm::Id, BinaryCompare>(binary_compare),
vtkm::cont::internal::radix::PSortTag());
}
// Permute the values to their sorted locations:
{
auto valuesInPortal = values.PrepareForInput(DeviceAdapterTagOpenMP());
auto indexPortal = indexArray.PrepareForInput(DeviceAdapterTagOpenMP());
auto valuesOutPortal = valuesScattered.PrepareForOutput(size, DeviceAdapterTagOpenMP());
VTKM_OPENMP_DIRECTIVE(parallel for
default(none)
firstprivate(valuesInPortal, indexPortal, valuesOutPortal)
schedule(static))
for (vtkm::Id i = 0; i < size; ++i)
{
valuesOutPortal.Set(i, valuesInPortal.Get(indexPortal.Get(i)));
}
}
{
auto inputPortal = valuesScattered.PrepareForInput(DeviceAdapterTagOpenMP());
auto outputPortal =
values.PrepareForOutput(valuesScattered.GetNumberOfValues(), DeviceAdapterTagOpenMP());
openmp::CopyHelper(inputPortal, outputPortal, 0, 0, valuesScattered.GetNumberOfValues());
}
}
// Sort by key -- static switch between radix and quick sort:
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare)
{
using namespace vtkm::cont::internal::radix;
using SortAlgorithmTag =
typename sortbykey_tag_type<T, U, StorageT, StorageU, BinaryCompare>::type;
parallel_sort_bykey(keys, values, binary_compare, SortAlgorithmTag{});
}
}
}
}
} // end namespace vtkm::cont::openmp::sort
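The value-sort entry point above dispatches at compile time between the radix and quicksort paths. As a minimal usage sketch (not part of this commit; the include path for the new OpenMP sort header is assumed):

#include <functional>
#include <vtkm/cont/ArrayHandle.h>
// Assumed path for the sort header added by this commit:
#include <vtkm/cont/openmp/internal/ParallelSortOpenMP.h>

void ExampleSorts(vtkm::cont::ArrayHandle<vtkm::Float32>& values)
{
  // Arithmetic value type + basic storage + std::less selects RadixSortTag,
  // so this call is expected to reach radix::parallel_radix_sort().
  vtkm::cont::openmp::sort::parallel_sort(values, std::less<vtkm::Float32>{});

  // An arbitrary comparator is not a recognized radix compare type, so the
  // same entry point falls back to the parallel quicksort (PSortTag).
  auto descending = [](vtkm::Float32 a, vtkm::Float32 b) { return b < a; };
  vtkm::cont::openmp::sort::parallel_sort(values, descending);
}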

@ -0,0 +1,50 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_openmp_internal_VirtualObjectTransferOpenMP_h
#define vtk_m_cont_openmp_internal_VirtualObjectTransferOpenMP_h
#include <vtkm/cont/internal/VirtualObjectTransfer.h>
#include <vtkm/cont/internal/VirtualObjectTransferShareWithControl.h>
#include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
namespace vtkm
{
namespace cont
{
namespace internal
{
template <typename VirtualDerivedType>
struct VirtualObjectTransfer<VirtualDerivedType, vtkm::cont::DeviceAdapterTagOpenMP>
: VirtualObjectTransferShareWithControl<VirtualDerivedType>
{
VTKM_CONT VirtualObjectTransfer(const VirtualDerivedType* virtualObject)
: VirtualObjectTransferShareWithControl<VirtualDerivedType>(virtualObject)
{
}
};
}
}
} // vtkm::cont::internal
#endif // vtk_m_cont_openmp_internal_VirtualObjectTransferOpenMP_h

@ -0,0 +1,33 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2018 UT-Battelle, LLC.
## Copyright 2018 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
##
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##============================================================================
set(unit_tests
UnitTestOpenMPArrayHandle.cxx
UnitTestOpenMPArrayHandleFancy.cxx
UnitTestOpenMPCellLocatorTwoLevelUniformGrid.cxx
UnitTestOpenMPComputeRange.cxx
UnitTestOpenMPDataSetExplicit.cxx
UnitTestOpenMPDataSetSingleType.cxx
UnitTestOpenMPDeviceAdapter.cxx
UnitTestOpenMPImplicitFunction.cxx
UnitTestOpenMPPointLocatorUniformGrid.cxx
UnitTestOpenMPVirtualObjectHandle.cxx
)
vtkm_unit_tests(OpenMP SOURCES ${unit_tests})

@ -0,0 +1,31 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingArrayHandles.h>
int UnitTestOpenMPArrayHandle(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::TestingArrayHandles<vtkm::cont::DeviceAdapterTagOpenMP>::Run();
}

@ -0,0 +1,31 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingFancyArrayHandles.h>
int UnitTestOpenMPArrayHandleFancy(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::TestingFancyArrayHandles<vtkm::cont::DeviceAdapterTagOpenMP>::Run();
}

@ -0,0 +1,32 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingCellLocatorTwoLevelUniformGrid.h>
int UnitTestOpenMPCellLocatorTwoLevelUniformGrid(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::Testing::Run(
TestingCellLocatorTwoLevelUniformGrid<vtkm::cont::DeviceAdapterTagOpenMP>);
}

@ -0,0 +1,31 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingComputeRange.h>
int UnitTestOpenMPComputeRange(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::TestingComputeRange<vtkm::cont::DeviceAdapterTagOpenMP>::Run();
}

@ -0,0 +1,31 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingDataSetExplicit.h>
int UnitTestOpenMPDataSetExplicit(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::TestingDataSetExplicit<vtkm::cont::DeviceAdapterTagOpenMP>::Run();
}

@ -0,0 +1,31 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingDataSetSingleType.h>
int UnitTestOpenMPDataSetSingleType(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::TestingDataSetSingleType<vtkm::cont::DeviceAdapterTagOpenMP>::Run();
}

@ -0,0 +1,32 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingDeviceAdapter.h>
int UnitTestOpenMPDeviceAdapter(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::TestingDeviceAdapter<vtkm::cont::DeviceAdapterTagOpenMP>::Run();
}

@ -0,0 +1,43 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingImplicitFunction.h>
namespace
{
void TestImplicitFunctions()
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
vtkm::cont::testing::TestingImplicitFunction testing;
testing.Run(vtkm::cont::DeviceAdapterTagOpenMP());
}
} // anonymous namespace
int UnitTestOpenMPImplicitFunction(int, char* [])
{
return vtkm::cont::testing::Testing::Run(TestImplicitFunctions);
}

@ -0,0 +1,33 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingPointLocatorUniformGrid.h>
int UnitTestOpenMPPointLocatorUniformGrid(int, char* [])
{
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
return vtkm::cont::testing::Testing::Run(
TestingPointLocatorUniformGrid<vtkm::cont::DeviceAdapterTagOpenMP>());
}

@ -0,0 +1,49 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/cont/testing/TestingVirtualObjectHandle.h>
namespace
{
void TestVirtualObjectHandle()
{
using DeviceAdapterList = vtkm::ListTagBase<vtkm::cont::DeviceAdapterTagOpenMP>;
using DeviceAdapterList2 =
vtkm::ListTagBase<vtkm::cont::DeviceAdapterTagSerial, vtkm::cont::DeviceAdapterTagOpenMP>;
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(vtkm::cont::DeviceAdapterTagOpenMP{});
vtkm::cont::testing::TestingVirtualObjectHandle<DeviceAdapterList>::Run();
tracker.Reset();
vtkm::cont::testing::TestingVirtualObjectHandle<DeviceAdapterList2>::Run();
}
} // anonymous namespace
int UnitTestOpenMPVirtualObjectHandle(int, char* [])
{
return vtkm::cont::testing::Testing::Run(TestVirtualObjectHandle);
}

@ -251,21 +251,21 @@ public:
{
//this is required to get sort to work with zip handles
std::less<T> lessOp;
vtkm::cont::tbb::internal::parallel_sort(values, lessOp);
vtkm::cont::tbb::sort::parallel_sort(values, lessOp);
}
template <typename T, class Container, class BinaryCompare>
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Container>& values,
BinaryCompare binary_compare)
{
vtkm::cont::tbb::internal::parallel_sort(values, binary_compare);
vtkm::cont::tbb::sort::parallel_sort(values, binary_compare);
}
template <typename T, typename U, class StorageT, class StorageU>
VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values)
{
vtkm::cont::tbb::internal::parallel_sort_bykey(keys, values, std::less<T>());
vtkm::cont::tbb::sort::parallel_sort_bykey(keys, values, std::less<T>());
}
template <typename T, typename U, class StorageT, class StorageU, class BinaryCompare>
@ -273,7 +273,7 @@ public:
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare)
{
vtkm::cont::tbb::internal::parallel_sort_bykey(keys, values, binary_compare);
vtkm::cont::tbb::sort::parallel_sort_bykey(keys, values, binary_compare);
}
template <typename T, class Storage>

@ -49,36 +49,7 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Modifications of Takuya Akiba's original GitHub source code for inclusion
// in VTK-m:
//
// - Changed parallel threading from OpenMP to TBB tasks
// - Added a minimum size threshold for parallelism; smaller inputs invoke the serial radix sort (kxsort) instead
// - Added std::greater<T> and std::less<T> to interface for descending order sorts
// - Added linear scaling of threads used by the algorithm for more stable performance
// on machines with lots of available threads (KNL and Haswell)
//
// This file contains an implementation of the Satish et al. parallel radix sort
// as documented in the following citation:
//
// Fast sort on CPUs and GPUs: a case for bandwidth oblivious SIMD sort.
// N. Satish, C. Kim, J. Chhugani, A. D. Nguyen, V. W. Lee, D. Kim, and P. Dubey.
// In Proc. SIGMOD, pages 351-362, 2010.
#include <algorithm>
#include <cassert>
#include <climits>
#include <cmath>
#include <cstring>
#include <functional>
#include <stdint.h>
#include <utility>
#include <vtkm/Types.h>
#include <vtkm/cont/tbb/internal/ParallelSortTBB.h>
VTKM_THIRDPARTY_PRE_INCLUDE
#include <vtkm/cont/tbb/internal/kxsort.h>
#include <vtkm/cont/internal/ParallelRadixSort.h>
#if defined(VTKM_MSVC)
@ -107,852 +78,57 @@ namespace cont
{
namespace tbb
{
namespace internal
namespace sort
{
const size_t MIN_BYTES_FOR_PARALLEL = 400000;
const size_t BYTES_FOR_MAX_PARALLELISM = 4000000;
const size_t MAX_CORES = ::tbb::tbb_thread::hardware_concurrency();
const double CORES_PER_BYTE =
double(MAX_CORES - 1) / double(BYTES_FOR_MAX_PARALLELISM - MIN_BYTES_FOR_PARALLEL);
const double Y_INTERCEPT = 1.0 - CORES_PER_BYTE * MIN_BYTES_FOR_PARALLEL;
namespace utility
// Simple TBB task wrapper around a generic functor.
template <typename FunctorType>
struct TaskWrapper : public ::tbb::task
{
// Return the number of threads to use in parallel regions
inline size_t GetMaxThreads(size_t num_bytes)
{
size_t num_cores = (size_t)(CORES_PER_BYTE * double(num_bytes) + Y_INTERCEPT);
if (num_cores < 1)
{
return 1;
}
if (num_cores > MAX_CORES)
{
return MAX_CORES;
}
return num_cores;
}
} // namespace utility
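// Illustration (not part of the original file): with the constants above and a
// hypothetical 16-core machine,
//   CORES_PER_BYTE = (16 - 1) / (4000000 - 400000) ~= 4.17e-6
//   Y_INTERCEPT    = 1 - CORES_PER_BYTE * 400000   ~= -0.67
// so GetMaxThreads() ramps linearly from 1 thread at 400 KB of key data up to
// all 16 threads at 4 MB and beyond; a 2.2 MB input, for example, yields
// 4.17e-6 * 2.2e6 - 0.67 ~= 8.5, which truncates to 8 threads.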
FunctorType Functor;
namespace internal
{
// Size of the software managed buffer
const size_t kOutBufferSize = 32;
// For ascending-order sorts, the radix digit reversal is a no-op
template <typename PlainType,
typename UnsignedType,
typename CompareType,
typename ValueManager,
unsigned int Base>
struct ParallelRadixCompareInternal
{
inline static void reverse(UnsignedType& t) { (void)t; }
};
// Handle descending order radix sort
template <typename PlainType, typename UnsignedType, typename ValueManager, unsigned int Base>
struct ParallelRadixCompareInternal<PlainType,
UnsignedType,
std::greater<PlainType>,
ValueManager,
Base>
{
inline static void reverse(UnsignedType& t) { t = ((1 << Base) - 1) - t; }
};
// The algorithm is implemented in this internal class
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
class ParallelRadixSortInternal
{
public:
using CompareInternal =
ParallelRadixCompareInternal<PlainType, UnsignedType, CompareType, ValueManager, Base>;
ParallelRadixSortInternal();
~ParallelRadixSortInternal();
void Init(PlainType* data, size_t num_elems);
PlainType* Sort(PlainType* data, ValueManager* value_manager);
static void InitAndSort(PlainType* data, size_t num_elems, ValueManager* value_manager);
private:
CompareInternal compare_internal_;
size_t num_elems_;
size_t num_threads_;
UnsignedType* tmp_;
size_t** histo_;
UnsignedType*** out_buf_;
size_t** out_buf_n_;
size_t *pos_bgn_, *pos_end_;
ValueManager* value_manager_;
void DeleteAll();
UnsignedType* SortInternal(UnsignedType* data, ValueManager* value_manager);
// Compute |pos_bgn_| and |pos_end_| (the range assigned to each thread)
void ComputeRanges();
// First step of each iteration of sorting
// Compute the histogram of |src| using bits in [b, b + Base)
void ComputeHistogram(unsigned int b, UnsignedType* src);
// Second step of each iteration of sorting
// Scatter elements of |src| to |dst| using the histogram
void Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst);
};
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
ParallelRadixSortInternal()
: num_elems_(0)
, num_threads_(0)
, tmp_(NULL)
, histo_(NULL)
, out_buf_(NULL)
, out_buf_n_(NULL)
, pos_bgn_(NULL)
, pos_end_(NULL)
{
assert(sizeof(PlainType) == sizeof(UnsignedType));
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
~ParallelRadixSortInternal()
{
DeleteAll();
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
void ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
DeleteAll()
{
delete[] tmp_;
tmp_ = NULL;
for (size_t i = 0; i < num_threads_; ++i)
delete[] histo_[i];
delete[] histo_;
histo_ = NULL;
for (size_t i = 0; i < num_threads_; ++i)
{
for (size_t j = 0; j < 1 << Base; ++j)
{
delete[] out_buf_[i][j];
}
delete[] out_buf_n_[i];
delete[] out_buf_[i];
}
delete[] out_buf_;
delete[] out_buf_n_;
out_buf_ = NULL;
out_buf_n_ = NULL;
delete[] pos_bgn_;
delete[] pos_end_;
pos_bgn_ = pos_end_ = NULL;
num_elems_ = 0;
num_threads_ = 0;
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
void ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
Init(PlainType* data, size_t num_elems)
{
(void)data;
DeleteAll();
num_elems_ = num_elems;
num_threads_ = utility::GetMaxThreads(num_elems_ * sizeof(PlainType));
tmp_ = new UnsignedType[num_elems_];
histo_ = new size_t*[num_threads_];
for (size_t i = 0; i < num_threads_; ++i)
{
histo_[i] = new size_t[1 << Base];
}
out_buf_ = new UnsignedType**[num_threads_];
out_buf_n_ = new size_t*[num_threads_];
for (size_t i = 0; i < num_threads_; ++i)
{
out_buf_[i] = new UnsignedType*[1 << Base];
out_buf_n_[i] = new size_t[1 << Base];
for (size_t j = 0; j < 1 << Base; ++j)
{
out_buf_[i][j] = new UnsignedType[kOutBufferSize];
}
}
pos_bgn_ = new size_t[num_threads_];
pos_end_ = new size_t[num_threads_];
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
PlainType*
ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::Sort(
PlainType* data,
ValueManager* value_manager)
{
UnsignedType* src = reinterpret_cast<UnsignedType*>(data);
UnsignedType* res = SortInternal(src, value_manager);
return reinterpret_cast<PlainType*>(res);
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
void ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
InitAndSort(PlainType* data, size_t num_elems, ValueManager* value_manager)
{
ParallelRadixSortInternal prs;
prs.Init(data, num_elems);
const PlainType* res = prs.Sort(data, value_manager);
if (res != data)
{
for (size_t i = 0; i < num_elems; ++i)
data[i] = res[i];
}
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
UnsignedType*
ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
SortInternal(UnsignedType* data, ValueManager* value_manager)
{
value_manager_ = value_manager;
// Compute |pos_bgn_| and |pos_end_|
ComputeRanges();
// Iterate from lower bits to higher bits
const size_t bits = CHAR_BIT * sizeof(UnsignedType);
UnsignedType *src = data, *dst = tmp_;
for (unsigned int b = 0; b < bits; b += Base)
{
ComputeHistogram(b, src);
Scatter(b, src, dst);
std::swap(src, dst);
value_manager->Next();
}
return src;
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
void ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
ComputeRanges()
{
pos_bgn_[0] = 0;
for (size_t i = 0; i < num_threads_ - 1; ++i)
{
const size_t t = (num_elems_ - pos_bgn_[i]) / (num_threads_ - i);
pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t;
}
pos_end_[num_threads_ - 1] = num_elems_;
}
template <typename PlainType,
typename UnsignedType,
typename Encoder,
unsigned int Base,
typename Function>
class RunTask : public ::tbb::task
{
public:
RunTask(size_t binary_tree_height,
size_t binary_tree_position,
Function f,
size_t num_elems,
size_t num_threads)
: binary_tree_height_(binary_tree_height)
, binary_tree_position_(binary_tree_position)
, f_(f)
, num_elems_(num_elems)
, num_threads_(num_threads)
TaskWrapper(FunctorType f)
: Functor(f)
{
}
::tbb::task* execute()
{
size_t num_nodes_at_current_height = (size_t)pow(2, (double)binary_tree_height_);
if (num_threads_ <= num_nodes_at_current_height)
{
const size_t my_id = binary_tree_position_ - num_nodes_at_current_height;
if (my_id < num_threads_)
{
f_(my_id);
}
return NULL;
}
else
{
::tbb::empty_task& p = *new (task::allocate_continuation())::tbb::empty_task();
RunTask& left = *new (p.allocate_child()) RunTask(
binary_tree_height_ + 1, 2 * binary_tree_position_, f_, num_elems_, num_threads_);
RunTask& right = *new (p.allocate_child()) RunTask(
binary_tree_height_ + 1, 2 * binary_tree_position_ + 1, f_, num_elems_, num_threads_);
p.set_ref_count(2);
task::spawn(left);
task::spawn(right);
return NULL;
}
}
private:
size_t binary_tree_height_;
size_t binary_tree_position_;
Function f_;
size_t num_elems_;
size_t num_threads_;
};
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
void ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
ComputeHistogram(unsigned int b, UnsignedType* src)
{
// Compute local histogram
auto lambda = [=](const size_t my_id) {
const size_t my_bgn = pos_bgn_[my_id];
const size_t my_end = pos_end_[my_id];
size_t* my_histo = histo_[my_id];
memset(my_histo, 0, sizeof(size_t) * (1 << Base));
for (size_t i = my_bgn; i < my_end; ++i)
{
const UnsignedType s = Encoder::encode(src[i]);
UnsignedType t = (s >> b) & ((1 << Base) - 1);
compare_internal_.reverse(t);
++my_histo[t];
}
};
using RunTaskType = RunTask<PlainType, UnsignedType, Encoder, Base, std::function<void(size_t)>>;
RunTaskType& root =
*new (::tbb::task::allocate_root()) RunTaskType(0, 1, lambda, num_elems_, num_threads_);
::tbb::task::spawn_root_and_wait(root);
// Compute global histogram
size_t s = 0;
for (size_t i = 0; i < 1 << Base; ++i)
{
for (size_t j = 0; j < num_threads_; ++j)
{
const size_t t = s + histo_[j][i];
histo_[j][i] = s;
s = t;
}
}
}
template <typename PlainType,
typename CompareType,
typename UnsignedType,
typename Encoder,
typename ValueManager,
unsigned int Base>
void ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>::
Scatter(unsigned int b, UnsignedType* src, UnsignedType* dst)
{
auto lambda = [=](const size_t my_id) {
const size_t my_bgn = pos_bgn_[my_id];
const size_t my_end = pos_end_[my_id];
size_t* my_histo = histo_[my_id];
UnsignedType** my_buf = out_buf_[my_id];
size_t* my_buf_n = out_buf_n_[my_id];
memset(my_buf_n, 0, sizeof(size_t) * (1 << Base));
for (size_t i = my_bgn; i < my_end; ++i)
{
const UnsignedType s = Encoder::encode(src[i]);
UnsignedType t = (s >> b) & ((1 << Base) - 1);
compare_internal_.reverse(t);
my_buf[t][my_buf_n[t]] = src[i];
value_manager_->Push(my_id, t, my_buf_n[t], i);
++my_buf_n[t];
if (my_buf_n[t] == kOutBufferSize)
{
size_t p = my_histo[t];
for (size_t j = 0; j < kOutBufferSize; ++j)
{
size_t tp = p++;
dst[tp] = my_buf[t][j];
}
value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]);
my_histo[t] += kOutBufferSize;
my_buf_n[t] = 0;
}
}
// Flush everything
for (size_t i = 0; i < 1 << Base; ++i)
{
size_t p = my_histo[i];
for (size_t j = 0; j < my_buf_n[i]; ++j)
{
size_t tp = p++;
dst[tp] = my_buf[i][j];
}
value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]);
}
};
using RunTaskType = RunTask<PlainType, UnsignedType, Encoder, Base, std::function<void(size_t)>>;
RunTaskType& root =
*new (::tbb::task::allocate_root()) RunTaskType(0, 1, lambda, num_elems_, num_threads_);
::tbb::task::spawn_root_and_wait(root);
}
} // namespace internal
// Encoders encode signed/unsigned integers and floating point numbers
// to correctly ordered unsigned integers
namespace encoder
{
class EncoderDummy
{
};
class EncoderUnsigned
{
public:
template <typename UnsignedType>
inline static UnsignedType encode(UnsignedType x)
{
return x;
this->Functor(this);
return nullptr;
}
};
class EncoderSigned
struct RadixThreaderTBB
{
public:
template <typename UnsignedType>
inline static UnsignedType encode(UnsignedType x)
size_t GetAvailableCores() const { return MAX_CORES; }
template <typename TaskType>
void RunParentTask(TaskType task)
{
return x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1));
using Task = TaskWrapper<TaskType>;
Task& root = *new (::tbb::task::allocate_root()) Task(task);
::tbb::task::spawn_root_and_wait(root);
}
template <typename TaskType>
void RunChildTasks(TaskWrapper<TaskType>* wrapper, TaskType left, TaskType right)
{
using Task = TaskWrapper<TaskType>;
::tbb::empty_task& p = *new (wrapper->allocate_continuation())::tbb::empty_task();
Task& lchild = *new (p.allocate_child()) Task(left);
Task& rchild = *new (p.allocate_child()) Task(right);
p.set_ref_count(2);
::tbb::task::spawn(lchild);
::tbb::task::spawn(rchild);
}
};
class EncoderDecimal
{
public:
template <typename UnsignedType>
inline static UnsignedType encode(UnsignedType x)
{
static const size_t bits = CHAR_BIT * sizeof(UnsignedType);
const UnsignedType a = x >> (bits - 1);
const UnsignedType b = (-static_cast<int>(a)) | (UnsignedType(1) << (bits - 1));
return x ^ b;
}
};
} // namespace encoder
// Value managers are used to generalize the sorting algorithm
// to sorting of keys and sorting of pairs
namespace value_manager
{
class DummyValueManager
{
public:
inline void Push(int thread, size_t bucket, size_t num, size_t from_pos)
{
(void)thread;
(void)bucket;
(void)num;
(void)from_pos;
}
inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos)
{
(void)thread;
(void)bucket;
(void)num;
(void)to_pos;
}
void Next() {}
};
template <typename PlainType, typename ValueType, int Base>
class PairValueManager
{
public:
PairValueManager()
: max_elems_(0)
, max_threads_(0)
, original_(NULL)
, tmp_(NULL)
, src_(NULL)
, dst_(NULL)
, out_buf_(NULL)
{
}
~PairValueManager() { DeleteAll(); }
void Init(size_t max_elems);
void Start(ValueType* original, size_t num_elems)
{
assert(num_elems <= max_elems_);
src_ = original_ = original;
dst_ = tmp_;
}
inline void Push(int thread, size_t bucket, size_t num, size_t from_pos)
{
out_buf_[thread][bucket][num] = src_[from_pos];
}
inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos)
{
for (size_t i = 0; i < num; ++i)
{
dst_[to_pos++] = out_buf_[thread][bucket][i];
}
}
void Next() { std::swap(src_, dst_); }
ValueType* GetResult() { return src_; }
private:
size_t max_elems_;
int max_threads_;
static constexpr size_t kOutBufferSize = internal::kOutBufferSize;
ValueType *original_, *tmp_;
ValueType *src_, *dst_;
ValueType*** out_buf_;
void DeleteAll();
};
template <typename PlainType, typename ValueType, int Base>
void PairValueManager<PlainType, ValueType, Base>::Init(size_t max_elems)
{
DeleteAll();
max_elems_ = max_elems;
max_threads_ = utility::GetMaxThreads(max_elems_ * sizeof(PlainType));
tmp_ = new ValueType[max_elems];
out_buf_ = new ValueType**[max_threads_];
for (int i = 0; i < max_threads_; ++i)
{
out_buf_[i] = new ValueType*[1 << Base];
for (size_t j = 0; j < 1 << Base; ++j)
{
out_buf_[i][j] = new ValueType[kOutBufferSize];
}
}
}
template <typename PlainType, typename ValueType, int Base>
void PairValueManager<PlainType, ValueType, Base>::DeleteAll()
{
delete[] tmp_;
tmp_ = NULL;
for (int i = 0; i < max_threads_; ++i)
{
for (size_t j = 0; j < 1 << Base; ++j)
{
delete[] out_buf_[i][j];
}
delete[] out_buf_[i];
}
delete[] out_buf_;
out_buf_ = NULL;
max_elems_ = 0;
max_threads_ = 0;
}
} // namespace value_manager
// Frontend class for sorting keys
template <typename PlainType,
typename CompareType,
typename UnsignedType = PlainType,
typename Encoder = encoder::EncoderDummy,
unsigned int Base = 8>
class KeySort
{
using DummyValueManager = value_manager::DummyValueManager;
using Internal = internal::ParallelRadixSortInternal<PlainType,
CompareType,
UnsignedType,
Encoder,
DummyValueManager,
Base>;
public:
void InitAndSort(PlainType* data, size_t num_elems, const CompareType& comp)
{
(void)comp;
DummyValueManager dvm;
Internal::InitAndSort(data, num_elems, &dvm);
}
};
// Frontend class for sorting pairs
template <typename PlainType,
typename ValueType,
typename CompareType,
typename UnsignedType = PlainType,
typename Encoder = encoder::EncoderDummy,
int Base = 8>
class PairSort
{
using ValueManager = value_manager::PairValueManager<PlainType, ValueType, Base>;
using Internal = internal::
ParallelRadixSortInternal<PlainType, CompareType, UnsignedType, Encoder, ValueManager, Base>;
public:
void InitAndSort(PlainType* keys, ValueType* vals, size_t num_elems, const CompareType& comp)
{
(void)comp;
ValueManager vm;
vm.Init(num_elems);
vm.Start(vals, num_elems);
Internal::InitAndSort(keys, num_elems, &vm);
ValueType* res_vals = vm.GetResult();
if (res_vals != vals)
{
for (size_t i = 0; i < num_elems; ++i)
{
vals[i] = res_vals[i];
}
}
}
private:
};
#define KEY_SORT_CASE(plain_type, compare_type, unsigned_type, encoder_type) \
template <> \
class KeySort<plain_type, compare_type> \
: public KeySort<plain_type, compare_type, unsigned_type, encoder::Encoder##encoder_type> \
{ \
}; \
template <typename V> \
class PairSort<plain_type, V, compare_type> \
: public PairSort<plain_type, V, compare_type, unsigned_type, encoder::Encoder##encoder_type> \
{ \
};
// Unsigned integers
KEY_SORT_CASE(unsigned int, std::less<unsigned int>, unsigned int, Unsigned);
KEY_SORT_CASE(unsigned int, std::greater<unsigned int>, unsigned int, Unsigned);
KEY_SORT_CASE(unsigned short int, std::less<unsigned short int>, unsigned short int, Unsigned);
KEY_SORT_CASE(unsigned short int, std::greater<unsigned short int>, unsigned short int, Unsigned);
KEY_SORT_CASE(unsigned long int, std::less<unsigned long int>, unsigned long int, Unsigned);
KEY_SORT_CASE(unsigned long int, std::greater<unsigned long int>, unsigned long int, Unsigned);
KEY_SORT_CASE(unsigned long long int,
std::less<unsigned long long int>,
unsigned long long int,
Unsigned);
KEY_SORT_CASE(unsigned long long int,
std::greater<unsigned long long int>,
unsigned long long int,
Unsigned);
// Unsigned char
KEY_SORT_CASE(unsigned char, std::less<unsigned char>, unsigned char, Unsigned);
KEY_SORT_CASE(unsigned char, std::greater<unsigned char>, unsigned char, Unsigned);
KEY_SORT_CASE(char16_t, std::less<char16_t>, uint16_t, Unsigned);
KEY_SORT_CASE(char16_t, std::greater<char16_t>, uint16_t, Unsigned);
KEY_SORT_CASE(char32_t, std::less<char32_t>, uint32_t, Unsigned);
KEY_SORT_CASE(char32_t, std::greater<char32_t>, uint32_t, Unsigned);
KEY_SORT_CASE(wchar_t, std::less<wchar_t>, uint32_t, Unsigned);
KEY_SORT_CASE(wchar_t, std::greater<wchar_t>, uint32_t, Unsigned);
// Signed integers
KEY_SORT_CASE(char, std::less<char>, unsigned char, Signed);
KEY_SORT_CASE(char, std::greater<char>, unsigned char, Signed);
KEY_SORT_CASE(short, std::less<short>, unsigned short, Signed);
KEY_SORT_CASE(short, std::greater<short>, unsigned short, Signed);
KEY_SORT_CASE(int, std::less<int>, unsigned int, Signed);
KEY_SORT_CASE(int, std::greater<int>, unsigned int, Signed);
KEY_SORT_CASE(long, std::less<long>, unsigned long, Signed);
KEY_SORT_CASE(long, std::greater<long>, unsigned long, Signed);
KEY_SORT_CASE(long long, std::less<long long>, unsigned long long, Signed);
KEY_SORT_CASE(long long, std::greater<long long>, unsigned long long, Signed);
// |signed char| and |char| are treated as different types
KEY_SORT_CASE(signed char, std::less<signed char>, unsigned char, Signed);
KEY_SORT_CASE(signed char, std::greater<signed char>, unsigned char, Signed);
// Floating point numbers
KEY_SORT_CASE(float, std::less<float>, uint32_t, Decimal);
KEY_SORT_CASE(float, std::greater<float>, uint32_t, Decimal);
KEY_SORT_CASE(double, std::less<double>, uint64_t, Decimal);
KEY_SORT_CASE(double, std::greater<double>, uint64_t, Decimal);
#undef KEY_SORT_CASE
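// For example, KEY_SORT_CASE(float, std::less<float>, uint32_t, Decimal) above
// means float keys are reinterpreted as uint32_t and run through
// EncoderDecimal, whose XOR trick (flip all bits of negative values, flip only
// the sign bit of non-negative ones) makes unsigned integer order match the
// IEEE 754 floating-point order. (Explanatory note, not part of this commit.)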
template <typename T, typename CompareType>
struct run_kx_radix_sort_keys
{
static void run(T* data, size_t num_elems, const CompareType& comp)
{
std::sort(data, data + num_elems, comp);
}
};
#define KX_SORT_KEYS(key_type) \
template <> \
struct run_kx_radix_sort_keys<key_type, std::less<key_type>> \
{ \
static void run(key_type* data, size_t num_elems, const std::less<key_type>& comp) \
{ \
(void)comp; \
kx::radix_sort(data, data + num_elems); \
} \
};
KX_SORT_KEYS(unsigned short int);
KX_SORT_KEYS(int);
KX_SORT_KEYS(unsigned int);
KX_SORT_KEYS(long int);
KX_SORT_KEYS(unsigned long int);
KX_SORT_KEYS(long long int);
KX_SORT_KEYS(unsigned long long int);
KX_SORT_KEYS(unsigned char);
#undef KX_SORT_KEYS
template <typename T, typename CompareType>
bool use_serial_sort_keys(T* data, size_t num_elems, const CompareType& comp)
{
size_t total_bytes = (num_elems) * sizeof(T);
if (total_bytes < MIN_BYTES_FOR_PARALLEL)
{
run_kx_radix_sort_keys<T, CompareType>::run(data, num_elems, comp);
return true;
}
return false;
}
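// Illustration (not part of the original file): total_bytes is compared
// against MIN_BYTES_FOR_PARALLEL (400000 above), so for 4-byte int keys
// anything under 100,000 elements stays on the serial path (kx::radix_sort for
// the specialized key types, std::sort otherwise) and only larger inputs use
// the parallel radix sort.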
// Generate radix sort interfaces for key and key value sorts.
#define VTKM_TBB_SORT_EXPORT(key_type) \
void parallel_radix_sort_key_values( \
key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater<key_type>& comp) \
{ \
PairSort<key_type, vtkm::Id, std::greater<key_type>> ps; \
ps.InitAndSort(keys, vals, num_elems, comp); \
} \
void parallel_radix_sort_key_values( \
key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less<key_type>& comp) \
{ \
PairSort<key_type, vtkm::Id, std::less<key_type>> ps; \
ps.InitAndSort(keys, vals, num_elems, comp); \
} \
void parallel_radix_sort(key_type* data, size_t num_elems, const std::greater<key_type>& comp) \
{ \
if (!use_serial_sort_keys(data, num_elems, comp)) \
{ \
KeySort<key_type, std::greater<key_type>> ks; \
ks.InitAndSort(data, num_elems, comp); \
} \
} \
void parallel_radix_sort(key_type* data, size_t num_elems, const std::less<key_type>& comp) \
{ \
if (!use_serial_sort_keys(data, num_elems, comp)) \
{ \
KeySort<key_type, std::less<key_type>> ks; \
ks.InitAndSort(data, num_elems, comp); \
} \
}
VTKM_TBB_SORT_EXPORT(short int);
VTKM_TBB_SORT_EXPORT(unsigned short int);
VTKM_TBB_SORT_EXPORT(int);
VTKM_TBB_SORT_EXPORT(unsigned int);
VTKM_TBB_SORT_EXPORT(long int);
VTKM_TBB_SORT_EXPORT(unsigned long int);
VTKM_TBB_SORT_EXPORT(long long int);
VTKM_TBB_SORT_EXPORT(unsigned long long int);
VTKM_TBB_SORT_EXPORT(unsigned char);
VTKM_TBB_SORT_EXPORT(signed char);
VTKM_TBB_SORT_EXPORT(char);
VTKM_TBB_SORT_EXPORT(char16_t);
VTKM_TBB_SORT_EXPORT(char32_t);
VTKM_TBB_SORT_EXPORT(wchar_t);
VTKM_TBB_SORT_EXPORT(float);
VTKM_TBB_SORT_EXPORT(double);
#undef VTKM_TBB_SORT_EXPORT
VTKM_THIRDPARTY_POST_INCLUDE
}
VTKM_INSTANTIATE_RADIX_SORT_FOR_THREADER(RadixThreaderTBB)
}
}
}
} // vtkm::cont::tbb::sort
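RadixThreaderTBB above supplies the TBB-specific task spawning for the generalized radix sort: the shared implementation (now included from vtkm/cont/internal/ParallelRadixSort.h) only needs an object that reports the available cores, runs a root task, and spawns two child tasks from a running task. A hypothetical serial threader obeying that informal contract might look like the sketch below; the exact signatures expected by VTKM_INSTANTIATE_RADIX_SORT_FOR_THREADER are defined in ParallelRadixSort.h, which this diff does not show.

#include <cstddef>

// Hypothetical illustration only -- not part of this commit.
struct RadixThreaderSerial
{
  std::size_t GetAvailableCores() const { return 1; }

  // Run the root task immediately on the calling thread. The task receives a
  // null "wrapper" pointer because there is no TBB task object to hand back.
  template <typename TaskType>
  void RunParentTask(TaskType task) const
  {
    task(nullptr);
  }

  // Children are executed in order on the calling thread instead of being
  // spawned as separate tasks.
  template <typename TaskType>
  void RunChildTasks(void* /*wrapper*/, TaskType left, TaskType right) const
  {
    left(nullptr);
    right(nullptr);
  }
};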

@ -24,6 +24,7 @@
#include <vtkm/BinaryPredicates.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ArrayHandleZip.h>
#include <vtkm/cont/internal/ParallelRadixSortInterface.h>
#include <vtkm/cont/tbb/internal/ArrayManagerExecutionTBB.h>
#include <vtkm/cont/tbb/internal/DeviceAdapterTagTBB.h>
@ -38,128 +39,27 @@ namespace cont
{
namespace tbb
{
namespace internal
namespace sort
{
struct RadixSortTag
{
};
struct PSortTag
{
};
template <typename T>
struct is_valid_compare_type : std::integral_constant<bool, false>
{
};
template <typename T>
struct is_valid_compare_type<std::less<T>> : std::integral_constant<bool, true>
{
};
template <typename T>
struct is_valid_compare_type<std::greater<T>> : std::integral_constant<bool, true>
{
};
template <>
struct is_valid_compare_type<vtkm::SortLess> : std::integral_constant<bool, true>
{
};
template <>
struct is_valid_compare_type<vtkm::SortGreater> : std::integral_constant<bool, true>
{
};
template <typename BComp, typename T>
BComp&& get_std_compare(BComp&& b, T&&)
{
return std::forward<BComp>(b);
}
template <typename T>
std::less<T> get_std_compare(vtkm::SortLess, T&&)
{
return std::less<T>{};
}
template <typename T>
std::greater<T> get_std_compare(vtkm::SortGreater, T&&)
{
return std::greater<T>{};
}
template <typename T, typename StorageTag, typename BinaryCompare>
struct sort_tag_type
{
using type = PSortTag;
};
template <typename T, typename BinaryCompare>
struct sort_tag_type<T, vtkm::cont::StorageTagBasic, BinaryCompare>
{
using PrimT = std::is_arithmetic<T>;
using LongDT = std::is_same<T, long double>;
using BComp = is_valid_compare_type<BinaryCompare>;
using type = typename std::conditional<PrimT::value && BComp::value && !LongDT::value,
RadixSortTag,
PSortTag>::type;
};
template <typename T, typename U, typename StorageTagT, typename StorageTagU, class BinaryCompare>
struct sortbykey_tag_type
{
using type = PSortTag;
};
template <typename T, typename U, typename BinaryCompare>
struct sortbykey_tag_type<T,
U,
vtkm::cont::StorageTagBasic,
vtkm::cont::StorageTagBasic,
BinaryCompare>
{
using PrimT = std::is_arithmetic<T>;
using PrimU = std::is_arithmetic<U>;
using LongDT = std::is_same<T, long double>;
using BComp = is_valid_compare_type<BinaryCompare>;
using type =
typename std::conditional<PrimT::value && PrimU::value && BComp::value && !LongDT::value,
RadixSortTag,
PSortTag>::type;
};
#define VTKM_TBB_SORT_EXPORT(key_type) \
VTKM_CONT_EXPORT void parallel_radix_sort( \
key_type* data, size_t num_elems, const std::greater<key_type>& comp); \
VTKM_CONT_EXPORT void parallel_radix_sort( \
key_type* data, size_t num_elems, const std::less<key_type>& comp); \
VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \
key_type* keys, vtkm::Id* vals, size_t num_elems, const std::greater<key_type>& comp); \
VTKM_CONT_EXPORT void parallel_radix_sort_key_values( \
key_type* keys, vtkm::Id* vals, size_t num_elems, const std::less<key_type>& comp);
// Generate radix sort interfaces for key and key value sorts.
VTKM_TBB_SORT_EXPORT(short int);
VTKM_TBB_SORT_EXPORT(unsigned short int);
VTKM_TBB_SORT_EXPORT(int);
VTKM_TBB_SORT_EXPORT(unsigned int);
VTKM_TBB_SORT_EXPORT(long int);
VTKM_TBB_SORT_EXPORT(unsigned long int);
VTKM_TBB_SORT_EXPORT(long long int);
VTKM_TBB_SORT_EXPORT(unsigned long long int);
VTKM_TBB_SORT_EXPORT(unsigned char);
VTKM_TBB_SORT_EXPORT(signed char);
VTKM_TBB_SORT_EXPORT(char);
VTKM_TBB_SORT_EXPORT(char16_t);
VTKM_TBB_SORT_EXPORT(char32_t);
VTKM_TBB_SORT_EXPORT(wchar_t);
VTKM_TBB_SORT_EXPORT(float);
VTKM_TBB_SORT_EXPORT(double);
#undef VTKM_TBB_SORT_EXPORT
// Declare the compiled radix sort specializations:
VTKM_DECLARE_RADIX_SORT()
// Forward declare entry points (see Stack Overflow discussion 7255281 --
// templated overloads of template functions are not specializations, and will
// be resolved during the first phase of two-phase lookup).
template <typename T, typename Container, class BinaryCompare>
void parallel_sort(vtkm::cont::ArrayHandle<T, Container>& values, BinaryCompare binary_compare)
{
using SortAlgorithmTag = typename sort_tag_type<T, Container, BinaryCompare>::type;
parallel_sort(values, binary_compare, SortAlgorithmTag{});
}
void parallel_sort(vtkm::cont::ArrayHandle<T, Container>&, BinaryCompare);
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>&,
vtkm::cont::ArrayHandle<U, StorageU>&,
BinaryCompare);
// Quicksort values:
template <typename HandleType, class BinaryCompare>
void parallel_sort(HandleType& values, BinaryCompare binary_compare, PSortTag)
void parallel_sort(HandleType& values,
BinaryCompare binary_compare,
vtkm::cont::internal::radix::PSortTag)
{
auto arrayPortal = values.PrepareForInPlace(vtkm::cont::DeviceAdapterTagTBB());
@ -169,31 +69,37 @@ void parallel_sort(HandleType& values, BinaryCompare binary_compare, PSortTag)
internal::WrappedBinaryOperator<bool, BinaryCompare> wrappedCompare(binary_compare);
::tbb::parallel_sort(iterators.GetBegin(), iterators.GetEnd(), wrappedCompare);
}
// Radix sort values:
template <typename T, typename StorageT, class BinaryCompare>
void parallel_sort(vtkm::cont::ArrayHandle<T, StorageT>& values,
BinaryCompare binary_compare,
RadixSortTag)
vtkm::cont::internal::radix::RadixSortTag)
{
using namespace vtkm::cont::internal::radix;
auto c = get_std_compare(binary_compare, T{});
parallel_radix_sort(
values.GetStorage().GetArray(), static_cast<std::size_t>(values.GetNumberOfValues()), c);
}
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare)
// Value sort -- static switch between quicksort and radix sort
template <typename T, typename Container, class BinaryCompare>
void parallel_sort(vtkm::cont::ArrayHandle<T, Container>& values, BinaryCompare binary_compare)
{
using SortAlgorithmTag =
typename sortbykey_tag_type<T, U, StorageT, StorageU, BinaryCompare>::type;
parallel_sort_bykey(keys, values, binary_compare, SortAlgorithmTag{});
using namespace vtkm::cont::internal::radix;
using SortAlgorithmTag = typename sort_tag_type<T, Container, BinaryCompare>::type;
parallel_sort(values, binary_compare, SortAlgorithmTag{});
}
// Quicksort by key
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare,
PSortTag)
vtkm::cont::internal::radix::PSortTag)
{
using namespace vtkm::cont::internal::radix;
using KeyType = vtkm::cont::ArrayHandle<T, StorageT>;
constexpr bool larger_than_64bits = sizeof(U) > sizeof(vtkm::Int64);
if (larger_than_64bits)
@ -243,23 +149,28 @@ void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
zipHandle, vtkm::cont::internal::KeyCompare<T, U, BinaryCompare>(binary_compare), PSortTag{});
}
}
// Radix sort by key -- Specialize for vtkm::Id values:
template <typename T, typename StorageT, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<vtkm::Id, StorageU>& values,
BinaryCompare binary_compare,
RadixSortTag)
vtkm::cont::internal::radix::RadixSortTag)
{
using namespace vtkm::cont::internal::radix;
auto c = get_std_compare(binary_compare, T{});
parallel_radix_sort_key_values(keys.GetStorage().GetArray(),
values.GetStorage().GetArray(),
static_cast<std::size_t>(keys.GetNumberOfValues()),
c);
}
// Radix sort by key -- Generic impl:
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare,
RadixSortTag)
vtkm::cont::internal::radix::RadixSortTag)
{
using KeyType = vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>;
using ValueType = vtkm::cont::ArrayHandle<U, vtkm::cont::StorageTagBasic>;
@ -287,7 +198,7 @@ void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
ZipHandleType zipHandle = vtkm::cont::make_ArrayHandleZip(keys, indexArray);
parallel_sort(zipHandle,
vtkm::cont::internal::KeyCompare<T, vtkm::Id, BinaryCompare>(binary_compare),
PSortTag{});
vtkm::cont::internal::radix::PSortTag{});
}
tbb::ScatterPortal(values.PrepareForInput(vtkm::cont::DeviceAdapterTagTBB()),
@ -301,9 +212,21 @@ void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
tbb::CopyPortals(inputPortal, outputPortal, 0, 0, valuesScattered.GetNumberOfValues());
}
}
// Sort by key -- static switch between radix and quick sort:
template <typename T, typename StorageT, typename U, typename StorageU, class BinaryCompare>
void parallel_sort_bykey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare)
{
using namespace vtkm::cont::internal::radix;
using SortAlgorithmTag =
typename sortbykey_tag_type<T, U, StorageT, StorageU, BinaryCompare>::type;
parallel_sort_bykey(keys, values, binary_compare, SortAlgorithmTag{});
}
}
}
}
} // end namespace vtkm::cont::tbb::sort
#endif // vtk_m_cont_tbb_internal_ParallelSort_h

@ -113,23 +113,31 @@ public:
IdPortalType OutputArray;
};
struct ClearArrayKernel
template <typename PortalType>
struct GenericClearArrayKernel
{
using ValueType = typename PortalType::ValueType;
VTKM_CONT
ClearArrayKernel(const IdPortalType& array)
GenericClearArrayKernel(const PortalType& array,
const ValueType& fillValue = static_cast<ValueType>(OFFSET))
: Array(array)
, Dims()
, FillValue(fillValue)
{
}
VTKM_CONT
ClearArrayKernel(const IdPortalType& array, const vtkm::Id3& dims)
GenericClearArrayKernel(const PortalType& array,
const vtkm::Id3& dims,
const ValueType& fillValue = static_cast<ValueType>(OFFSET))
: Array(array)
, Dims(dims)
, FillValue(fillValue)
{
}
VTKM_EXEC void operator()(vtkm::Id index) const { this->Array.Set(index, OFFSET); }
VTKM_EXEC void operator()(vtkm::Id index) const { this->Array.Set(index, this->FillValue); }
VTKM_EXEC void operator()(vtkm::Id3 index) const
{
@ -140,10 +148,13 @@ public:
VTKM_CONT void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer&) {}
IdPortalType Array;
PortalType Array;
vtkm::Id3 Dims;
ValueType FillValue;
};
using ClearArrayKernel = GenericClearArrayKernel<IdPortalType>;
struct ClearArrayMapKernel //: public vtkm::exec::WorkletMapField
{
@ -191,6 +202,57 @@ public:
vtkm::Id3 Dims;
};
// Checks that each scheduled index is only visited once:
struct OverlapKernel
{
using ArrayType = ArrayHandle<bool>;
using PortalType = typename ArrayType::template ExecutionTypes<DeviceAdapterTag>::Portal;
PortalType TrackerPortal;
PortalType ValidPortal;
vtkm::Id3 Dims;
VTKM_CONT
OverlapKernel(const PortalType& trackerPortal,
const PortalType& validPortal,
const vtkm::Id3& dims)
: TrackerPortal(trackerPortal)
, ValidPortal(validPortal)
, Dims(dims)
{
}
VTKM_CONT
OverlapKernel(const PortalType& trackerPortal, const PortalType& validPortal)
: TrackerPortal(trackerPortal)
, ValidPortal(validPortal)
, Dims()
{
}
VTKM_EXEC void operator()(vtkm::Id index) const
{
if (this->TrackerPortal.Get(index))
{ // this index has already been visited, that's an error
this->ValidPortal.Set(index, false);
}
else
{
this->TrackerPortal.Set(index, true);
this->ValidPortal.Set(index, true);
}
}
VTKM_EXEC void operator()(vtkm::Id3 index) const
{
// Convert from Id3 to a flat Id:
vtkm::Id flatIndex = index[0] + this->Dims[0] * (index[1] + this->Dims[1] * index[2]);
this->operator()(flatIndex);
}
VTKM_CONT void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer&) {}
};
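As a sanity check on the row-major flattening used by OverlapKernel::operator()(vtkm::Id3), the following small standalone snippet (plain C++ with hypothetical dims) confirms that i + Dims[0] * (j + Dims[1] * k) visits each flat index exactly once:
#include <cassert>
int main()
{
  const int dims[3] = { 4, 5, 6 }; // hypothetical grid extents
  int expectedFlat = 0;
  for (int k = 0; k < dims[2]; ++k)
    for (int j = 0; j < dims[1]; ++j)
      for (int i = 0; i < dims[0]; ++i)
      {
        // Same mapping as OverlapKernel::operator()(vtkm::Id3):
        assert(i + dims[0] * (j + dims[1] * k) == expectedFlat);
        ++expectedFlat;
      }
  return 0;
}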
struct OneErrorKernel
{
VTKM_EXEC void operator()(vtkm::Id index) const
@ -642,6 +704,78 @@ private:
VTKM_TEST_ASSERT(value == index + OFFSET, "Got bad value for scheduled vtkm::Id3 kernels.");
}
} //release memory
// Ensure that each element is only visited once:
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Testing Schedule for overlap" << std::endl;
{
using BoolArray = ArrayHandle<bool>;
using BoolPortal = typename BoolArray::template ExecutionTypes<DeviceAdapterTag>::Portal;
BoolArray tracker;
BoolArray valid;
// Initialize tracker with 'false' values
std::cout << "Allocating and initializing memory" << std::endl;
Algorithm::Schedule(GenericClearArrayKernel<BoolPortal>(
tracker.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag()), false),
ARRAY_SIZE);
Algorithm::Schedule(GenericClearArrayKernel<BoolPortal>(
valid.PrepareForOutput(ARRAY_SIZE, DeviceAdapterTag()), false),
ARRAY_SIZE);
std::cout << "Running Overlap kernel." << std::endl;
Algorithm::Schedule(OverlapKernel(tracker.PrepareForInPlace(DeviceAdapterTag()),
valid.PrepareForInPlace(DeviceAdapterTag())),
ARRAY_SIZE);
std::cout << "Checking results." << std::endl;
auto vPortal = valid.GetPortalConstControl();
for (vtkm::Id i = 0; i < ARRAY_SIZE; i++)
{
bool isValid = vPortal.Get(i);
VTKM_TEST_ASSERT(isValid, "Schedule executed some elements more than once.");
}
} // release memory
// Ensure that each element is only visited once:
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Testing Schedule for overlap with vtkm::Id3" << std::endl;
{
static constexpr vtkm::Id numElems{ DIM_SIZE * DIM_SIZE * DIM_SIZE };
static const vtkm::Id3 dims{ DIM_SIZE, DIM_SIZE, DIM_SIZE };
using BoolArray = ArrayHandle<bool>;
using BoolPortal = typename BoolArray::template ExecutionTypes<DeviceAdapterTag>::Portal;
BoolArray tracker;
BoolArray valid;
// Initialize tracker with 'false' values
std::cout << "Allocating and initializing memory" << std::endl;
Algorithm::Schedule(GenericClearArrayKernel<BoolPortal>(
tracker.PrepareForOutput(numElems, DeviceAdapterTag()), dims, false),
numElems);
Algorithm::Schedule(GenericClearArrayKernel<BoolPortal>(
valid.PrepareForOutput(numElems, DeviceAdapterTag()), dims, false),
numElems);
std::cout << "Running Overlap kernel." << std::endl;
Algorithm::Schedule(OverlapKernel(tracker.PrepareForInPlace(DeviceAdapterTag()),
valid.PrepareForInPlace(DeviceAdapterTag()),
dims),
dims);
std::cout << "Checking results." << std::endl;
auto vPortal = valid.GetPortalConstControl();
for (vtkm::Id i = 0; i < numElems; i++)
{
bool isValid = vPortal.Get(i);
VTKM_TEST_ASSERT(isValid, "Id3 Schedule executed some elements more than once.");
}
} // release memory
}
static VTKM_CONT void TestCopyIf()

@ -52,6 +52,7 @@ vtkm_declare_headers(${header_impls} TESTABLE OFF)
#-----------------------------------------------------------------------------
add_subdirectory(serial)
add_subdirectory(tbb)
add_subdirectory(openmp)
add_subdirectory(cuda)
#-----------------------------------------------------------------------------

@ -0,0 +1,22 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2018 UT-Battelle, LLC.
## Copyright 2018 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
##
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##============================================================================
#-----------------------------------------------------------------------------
add_subdirectory(internal)

@ -0,0 +1,30 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2018 UT-Battelle, LLC.
## Copyright 2018 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
##
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##============================================================================
set(headers
TaskTilingOpenMP.h
)
vtkm_declare_headers(${headers})
#-----------------------------------------------------------------------------
if (VTKm_ENABLE_OPENMP)
add_subdirectory(testing)
endif()

@ -0,0 +1,41 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_exec_openmp_internal_TaskTilingOpenMP_h
#define vtk_m_exec_openmp_internal_TaskTilingOpenMP_h
#include <vtkm/exec/serial/internal/TaskTiling.h>
namespace vtkm
{
namespace exec
{
namespace openmp
{
namespace internal
{
using TaskTiling1D = vtkm::exec::serial::internal::TaskTiling1D;
using TaskTiling3D = vtkm::exec::serial::internal::TaskTiling3D;
}
}
}
} // namespace vtkm::exec::openmp::internal
#endif //vtk_m_exec_openmp_internal_TaskTilingOpenMP_h

@ -0,0 +1,27 @@
##=============================================================================
##
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
##
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2018 UT-Battelle, LLC.
## Copyright 2018 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##
##=============================================================================
set(unit_tests
UnitTestTaskTilingOpenMP.cxx
)
vtkm_unit_tests(SOURCES ${unit_tests})

@ -0,0 +1,30 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2018 UT-Battelle, LLC.
// Copyright 2018 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR
#include <vtkm/cont/openmp/DeviceAdapterOpenMP.h>
#include <vtkm/exec/internal/testing/TestingTaskTiling.h>
int UnitTestTaskTilingOpenMP(int, char* [])
{
return vtkm::cont::testing::Testing::Run(
vtkm::exec::internal::testing::TestTaskTiling<vtkm::cont::DeviceAdapterTagOpenMP>);
}

@ -101,6 +101,26 @@ void swap(vtkm::internal::ArrayPortalValueReference<T> a,
{
a.Swap(b);
}
template <typename T>
void swap(vtkm::internal::ArrayPortalValueReference<T> a,
typename vtkm::internal::ArrayPortalValueReference<T>::ValueType& b)
{
using ValueType = typename vtkm::internal::ArrayPortalValueReference<T>::ValueType;
const ValueType tmp = a;
a = b;
b = tmp;
}
template <typename T>
void swap(typename vtkm::internal::ArrayPortalValueReference<T>::ValueType& a,
vtkm::internal::ArrayPortalValueReference<T> b)
{
using ValueType = typename vtkm::internal::ArrayPortalValueReference<T>::ValueType;
const ValueType tmp = b;
b = a;
a = tmp;
}
}
} // namespace vtkm::internal
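The mixed reference/value swap overloads above exist because generic algorithms can end up swapping a proxy reference with a plain temporary, a combination std::swap cannot deduce. A minimal sketch with an illustrative ProxyRef class (not VTK-m's ArrayPortalValueReference) showing the same pattern:
#include <cstddef>
#include <iostream>
#include <vector>
// Illustrative proxy reference into a container (not the VTK-m class).
template <typename Container>
class ProxyRef
{
public:
  using ValueType = typename Container::value_type;
  ProxyRef(Container& container, std::size_t index)
    : Data(container)
    , Index(index)
  {
  }
  operator ValueType() const { return this->Data[this->Index]; }
  ProxyRef& operator=(const ValueType& value)
  {
    this->Data[this->Index] = value;
    return *this;
  }
private:
  Container& Data;
  std::size_t Index;
};
// Mixed swap, mirroring the reference/value overloads added above.
template <typename Container>
void swap(ProxyRef<Container> a, typename Container::value_type& b)
{
  using ValueType = typename Container::value_type;
  const ValueType tmp = a;
  a = b;
  b = tmp;
}
int main()
{
  std::vector<int> data{ 1, 2, 3 };
  int temp = 42;
  swap(ProxyRef<std::vector<int>>(data, 0), temp); // data[0] == 42, temp == 1
  std::cout << data[0] << " " << temp << "\n";
  return 0;
}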

@ -29,6 +29,7 @@ set(VTKM_USE_64BIT_IDS ${VTKm_USE_64BIT_IDS})
set(VTKM_ENABLE_CUDA ${VTKm_ENABLE_CUDA})
set(VTKM_ENABLE_TBB ${VTKm_ENABLE_TBB})
set(VTKM_ENABLE_OPENMP ${VTKm_ENABLE_OPENMP})
set(VTKM_ENABLE_MPI ${VTKm_ENABLE_MPI})
if(VTKM_ENABLE_CUDA)

@ -229,6 +229,10 @@
#ifndef VTKM_ENABLE_TBB
#cmakedefine VTKM_ENABLE_TBB
#endif
//Mark if we are building with OpenMP enabled
#ifndef VTKM_ENABLE_OPENMP
#cmakedefine VTKM_ENABLE_OPENMP
#endif
//Mark if we are building with MPI enabled.
#cmakedefine VTKM_ENABLE_MPI

@ -905,6 +905,10 @@ template VTKM_RENDERING_EXPORT void LinearBVH::ConstructOnDevice<
template VTKM_RENDERING_EXPORT void LinearBVH::ConstructOnDevice<vtkm::cont::DeviceAdapterTagTBB>(
vtkm::cont::DeviceAdapterTagTBB);
#endif
#ifdef VTKM_ENABLE_OPENMP
template VTKM_RENDERING_EXPORT void LinearBVH::ConstructOnDevice<vtkm::cont::DeviceAdapterTagOpenMP>(
vtkm::cont::DeviceAdapterTagOpenMP);
#endif
#ifdef VTKM_ENABLE_CUDA
template VTKM_RENDERING_EXPORT void LinearBVH::ConstructOnDevice<vtkm::cont::DeviceAdapterTagCuda>(
vtkm::cont::DeviceAdapterTagCuda);
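For context, this follows the usual explicit-instantiation pattern for heavy device-templated methods: the library's translation unit instantiates the method once per enabled backend so clients only need to link against it. A generic sketch with illustrative names (ENABLE_OPENMP and ENABLE_CUDA stand in for the VTKM_ENABLE_* defines set by the build system):
// builder.h -- declaration only; the definition lives in the library.
struct OpenMPTag {};
struct CudaTag {};
class Builder
{
public:
  template <typename DeviceTag>
  void ConstructOnDevice(DeviceTag);
};
// builder.cxx -- definition plus per-backend explicit instantiations.
template <typename DeviceTag>
void Builder::ConstructOnDevice(DeviceTag)
{
  // device-specific construction work would go here
}
#ifdef ENABLE_OPENMP
template void Builder::ConstructOnDevice<OpenMPTag>(OpenMPTag);
#endif
#ifdef ENABLE_CUDA
template void Builder::ConstructOnDevice<CudaTag>(CudaTag);
#endif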

@ -87,6 +87,13 @@ inline std::string GetDeviceString<vtkm::cont::DeviceAdapterTagTBB>(vtkm::cont::
return "tbb";
}
template <>
inline std::string GetDeviceString<vtkm::cont::DeviceAdapterTagOpenMP>(
vtkm::cont::DeviceAdapterTagOpenMP)
{
return "openmp";
}
template <>
inline std::string GetDeviceString<vtkm::cont::DeviceAdapterTagCuda>(
vtkm::cont::DeviceAdapterTagCuda)