Merge branch 'master' into vtk-m-cmake_refactor

Includes updates to clean up the benchmark code and handle the new MPI option
Robert Maynard 2017-12-28 14:23:21 -05:00
commit 24e57556e6
119 changed files with 16076 additions and 1342 deletions

2
.gitattributes vendored

@ -13,3 +13,5 @@ data/* filter=lfs diff=lfs merge=lfs -text
*.md whitespace=tab-in-indent conflict-marker-size=79 -whitespace
*.rst whitespace=tab-in-indent conflict-marker-size=79
*.txt whitespace=tab-in-indent
diy/** -format.clang-format -whitespace

@ -39,6 +39,9 @@ set(FILES_TO_CHECK
set(EXCEPTIONS
LICENSE.txt
README.txt
diy/include/diy
diy/LEGAL.txt
diy/LICENSE.txt
)
if (NOT VTKm_SOURCE_DIR)

@ -62,6 +62,7 @@ set(VTKm_ENABLE_CUDA "@VTKm_ENABLE_CUDA@")
set(VTKm_ENABLE_TBB "@VTKm_ENABLE_TBB@")
set(VTKm_ENABLE_RENDERING "@VTKm_ENABLE_RENDERING@")
set(VTKm_RENDERING_BACKEND "@VTKm_RENDERING_BACKEND@")
set(VTKm_ENABLE_MPI "@VTKm_ENABLE_MPI@")
# Load the library exports, but only if not compiling VTK-m itself
set_and_check(VTKm_CONFIG_DIR "@PACKAGE_VTKm_INSTALL_CONFIG_DIR@")

@ -1,833 +0,0 @@
##============================================================================
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2014 UT-Battelle, LLC.
## Copyright 2014 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
##
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##============================================================================
include(CMakeParseArguments)
# Utility to build a kit name from the current directory.
function(vtkm_get_kit_name kitvar)
# Will this always work? It should if ${CMAKE_CURRENT_SOURCE_DIR} is
# built from ${VTKm_SOURCE_DIR}.
string(REPLACE "${VTKm_SOURCE_DIR}/" "" dir_prefix ${CMAKE_CURRENT_SOURCE_DIR})
string(REPLACE "/" "_" kit "${dir_prefix}")
set(${kitvar} "${kit}" PARENT_SCOPE)
# Optional second argument to get dir_prefix.
if (${ARGC} GREATER 1)
set(${ARGV1} "${dir_prefix}" PARENT_SCOPE)
endif (${ARGC} GREATER 1)
endfunction(vtkm_get_kit_name)
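# Example (hypothetical usage): when called from vtkm/cont inside the VTK-m
# source tree, the following sets kit to "vtkm_cont" and dir_prefix to
# "vtkm/cont":
#   vtkm_get_kit_name(kit dir_prefix)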
#Utility to set up nvcc flags so that we properly work around issues inside FindCUDA.
#If we are generating .cu files we need to set up three things:
#1. Explicitly set the cuda device adapter as a define. This is currently
#   done as a work around since the cuda executable ignores compile
#   definitions.
#2. Disable unused function warnings.
#   The FindCUDA module and helper methods don't read target level
#   properties so we have to modify CUDA_NVCC_FLAGS instead of using
#   target and source level COMPILE_FLAGS and COMPILE_DEFINITIONS
#3. Set the compile option /bigobj when using VisualStudio generators.
#   While we have specified this as a target compile flag, those aren't
#   currently looked at by FindCUDA, so we have to manually add it ourselves
function(vtkm_setup_nvcc_flags old_nvcc_flags old_cxx_flags )
set(${old_nvcc_flags} ${CUDA_NVCC_FLAGS} PARENT_SCOPE)
set(${old_cxx_flags} ${CMAKE_CXX_FLAGS} PARENT_SCOPE)
set(new_nvcc_flags ${CUDA_NVCC_FLAGS})
set(new_cxx_flags ${CMAKE_CXX_FLAGS})
list(APPEND new_nvcc_flags "-DVTKM_DEVICE_ADAPTER=VTKM_DEVICE_ADAPTER_CUDA")
list(APPEND new_nvcc_flags "-w")
if(MSVC)
list(APPEND new_nvcc_flags "--compiler-options;/bigobj")
# The MSVC compiler gives a warning about having two incompatible warning
# flags in the command line. So, ironically, adding -w above to remove
# warnings makes MSVC give a warning. To get around that, remove all
# warning flags from the standard CXX arguments (which are typically passed
# to the CUDA compiler).
string(REGEX REPLACE "[-/]W[1-4]" "" new_cxx_flags "${new_cxx_flags}")
string(REGEX REPLACE "[-/]Wall" "" new_cxx_flags "${new_cxx_flags}")
endif()
set(CUDA_NVCC_FLAGS ${new_nvcc_flags} PARENT_SCOPE)
set(CMAKE_CXX_FLAGS ${new_cxx_flags} PARENT_SCOPE)
endfunction(vtkm_setup_nvcc_flags)
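# A minimal sketch of the intended save/restore pattern (my_cuda_lib and
# srcs are illustrative names, not from this file):
#   vtkm_setup_nvcc_flags(old_nvcc_flags old_cxx_flags)
#   cuda_add_library(my_cuda_lib STATIC ${srcs})
#   set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
#   set(CMAKE_CXX_FLAGS ${old_cxx_flags})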
#Utility to set MSVC only COMPILE_DEFINITIONS and COMPILE_FLAGS needed to
#reduce number of warnings and compile issues with Visual Studio
function(vtkm_setup_msvc_properties target )
if(NOT MSVC)
return()
endif()
#disable MSVC CRT and SCL warnings as they recommend using non-standard
#C++ extensions
target_compile_definitions(${target} PRIVATE "_SCL_SECURE_NO_WARNINGS"
"_CRT_SECURE_NO_WARNINGS")
#C4702 Generates numerous false positives with template code about
# unreachable code
#C4505 Generates numerous warnings about unused functions being
# removed when doing header test builds.
target_compile_options(${target} PRIVATE -wd4702 -wd4505)
# In VS2013 the C4127 warning has a bug in the implementation and
# generates false positive warnings for lots of template code
if(MSVC_VERSION LESS 1900)
target_compile_options(${target} PRIVATE -wd4127 )
endif()
endfunction(vtkm_setup_msvc_properties)
# vtkm_target_name(<name>)
#
# This function does some basic checking for library naming, and also adds a suffix
# to the output name with the VTKm version by default. Setting the variable
# VTKm_CUSTOM_LIBRARY_SUFFIX will override the suffix.
function(vtkm_target_name _name)
get_property(_type TARGET ${_name} PROPERTY TYPE)
if(NOT "${_type}" STREQUAL EXECUTABLE)
set_property(TARGET ${_name} PROPERTY VERSION 1)
set_property(TARGET ${_name} PROPERTY SOVERSION 1)
endif()
if("${_name}" MATCHES "^[Vv][Tt][Kk][Mm]")
set(_vtkm "")
else()
set(_vtkm "vtkm")
#message(AUTHOR_WARNING "Target [${_name}] does not start in 'vtkm'.")
endif()
# Support custom library suffix names, for other projects wanting to inject
# their own version numbers etc.
if(DEFINED VTKm_CUSTOM_LIBRARY_SUFFIX)
set(_lib_suffix "${VTKm_CUSTOM_LIBRARY_SUFFIX}")
else()
set(_lib_suffix "-${VTKm_VERSION_MAJOR}.${VTKm_VERSION_MINOR}")
endif()
set_property(TARGET ${_name} PROPERTY OUTPUT_NAME ${_vtkm}${_name}${_lib_suffix})
endfunction()
function(vtkm_target _name)
vtkm_target_name(${_name})
endfunction()
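# Example (hypothetical, assuming a VTK-m 1.1 build): the "vtkm" prefix is
# detected so only the version suffix is appended, giving the output name
# "vtkm_rendering-1.1":
#   add_library(vtkm_rendering ${srcs})
#   vtkm_target(vtkm_rendering)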
# Builds a source file and an executable that does nothing other than
# compile the given header files.
function(vtkm_add_header_build_test name dir_prefix use_cuda)
set(hfiles ${ARGN})
if (use_cuda)
set(suffix ".cu")
else (use_cuda)
set(suffix ".cxx")
endif (use_cuda)
set(cxxfiles)
foreach (header ${ARGN})
get_source_file_property(cant_be_tested ${header} VTKm_CANT_BE_HEADER_TESTED)
if( NOT cant_be_tested )
string(REPLACE "${CMAKE_CURRENT_BINARY_DIR}" "" header "${header}")
get_filename_component(headername ${header} NAME_WE)
set(src ${CMAKE_CURRENT_BINARY_DIR}/TB_${headername}${suffix})
configure_file(${VTKm_SOURCE_DIR}/CMake/TestBuild.cxx.in ${src} @ONLY)
list(APPEND cxxfiles ${src})
endif()
endforeach (header)
#only attempt to add a test build executable if we have any headers to
#test. This might not happen when everything depends on thrust.
list(LENGTH cxxfiles cxxfiles_len)
if (use_cuda AND ${cxxfiles_len} GREATER 0)
vtkm_setup_nvcc_flags( old_nvcc_flags old_cxx_flags )
# Cuda compiles do not respect target_include_directories
# and we want system includes so we have to hijack cuda
# to do it
foreach(dir ${VTKm_INCLUDE_DIRS})
#this internal variable has changed names depending on the CMake ver
list(APPEND CUDA_NVCC_INCLUDE_ARGS_USER -isystem ${dir})
list(APPEND CUDA_NVCC_INCLUDE_DIRS_USER -isystem ${dir})
endforeach()
cuda_include_directories(${VTKm_SOURCE_DIR}
${VTKm_BINARY_INCLUDE_DIR}
)
cuda_add_library(TestBuild_${name} STATIC ${cxxfiles} ${hfiles})
set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
set(CMAKE_CXX_FLAGS ${old_cxx_flags})
elseif (${cxxfiles_len} GREATER 0)
add_library(TestBuild_${name} STATIC ${cxxfiles} ${hfiles})
target_include_directories(TestBuild_${name} PRIVATE vtkm ${VTKm_INCLUDE_DIRS})
endif ()
#if everything was skipped (e.g. all headers depend on thrust), no target
#exists and there is nothing left to set up
if (NOT TARGET TestBuild_${name})
return()
endif()
target_link_libraries(TestBuild_${name} PRIVATE vtkm_cont ${VTKm_LIBRARIES})
set_source_files_properties(${hfiles}
PROPERTIES HEADER_FILE_ONLY TRUE
)
vtkm_setup_msvc_properties(TestBuild_${name})
# Send the libraries created for test builds to their own directory so as to
# not pollute the directory with useful libraries.
set_target_properties(TestBuild_${name} PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}/testbuilds
LIBRARY_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}/testbuilds
RUNTIME_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}/testbuilds
)
endfunction(vtkm_add_header_build_test)
function(vtkm_install_headers dir_prefix)
set(hfiles ${ARGN})
install(FILES ${hfiles}
DESTINATION ${VTKm_INSTALL_INCLUDE_DIR}/${dir_prefix}
)
endfunction(vtkm_install_headers)
function(vtkm_install_template_sources)
vtkm_get_kit_name(name dir_prefix)
set(hfiles ${ARGN})
vtkm_install_headers("${dir_prefix}" ${hfiles})
# CMake does not add installed files as project files, and template sources
# are not declared as source files anywhere, so add a fake target here to let
# an IDE know that these sources exist.
add_custom_target(${name}_template_srcs SOURCES ${hfiles})
endfunction(vtkm_install_template_sources)
# Declare a list of headers that require thrust to be enabled
# for them to be header tested. For thrust version 1.5 or less
# we have to make sure OpenMP is enabled, otherwise we are okay.
function(vtkm_requires_thrust_to_test)
#determine the state of thrust and testing
set(cant_be_tested FALSE)
if(NOT VTKm_ENABLE_THRUST)
#mark as not valid
set(cant_be_tested TRUE)
elseif(NOT VTKm_ENABLE_OPENMP)
#mark also as not valid
set(cant_be_tested TRUE)
endif()
foreach(header ${ARGN})
#set a property on the file that marks if we can header test it
set_source_files_properties( ${header}
PROPERTIES VTKm_CANT_BE_HEADER_TESTED ${cant_be_tested} )
endforeach(header)
endfunction(vtkm_requires_thrust_to_test)
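# Example (hypothetical header name): skip header testing for a
# Thrust-dependent header when Thrust/OpenMP are unavailable:
#   vtkm_requires_thrust_to_test(DeviceAdapterAlgorithmThrust.h)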
# Declare a list of header files. Will make sure the header files get
# compiled and show up in an IDE.
function(vtkm_declare_headers)
set(options CUDA)
set(oneValueArgs TESTABLE)
set(multiValueArgs)
cmake_parse_arguments(VTKm_DH "${options}"
"${oneValueArgs}" "${multiValueArgs}"
${ARGN}
)
#The testable keyword allows the caller to turn off the header testing,
#mainly used so that backends can be installed even when they can't be
#built on the machine.
#Since this is an optional property, not setting it means you do want testing
if(NOT DEFINED VTKm_DH_TESTABLE)
set(VTKm_DH_TESTABLE ON)
endif()
set(hfiles ${VTKm_DH_UNPARSED_ARGUMENTS})
vtkm_get_kit_name(name dir_prefix)
#only do header testing if enable testing is turned on
if (VTKm_ENABLE_TESTING AND VTKm_DH_TESTABLE)
vtkm_add_header_build_test(
"${name}" "${dir_prefix}" "${VTKm_DH_CUDA}" ${hfiles})
endif()
#always install headers
vtkm_install_headers("${dir_prefix}" ${hfiles})
endfunction(vtkm_declare_headers)
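# Example (hypothetical header names): declare headers for the current kit
# and turn off the header-compile test when the CUDA backend cannot be built
# on this machine:
#   vtkm_declare_headers(CUDA TESTABLE ${VTKm_ENABLE_CUDA}
#     ArrayHandle.h
#     DeviceAdapter.h
#     )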
# Declare a list of worklet files.
function(vtkm_declare_worklets)
# Currently worklets are just really header files.
vtkm_declare_headers(${ARGN})
endfunction(vtkm_declare_worklets)
function(vtkm_pyexpander_generated_file generated_file_name)
# If pyexpander is available, add targets to build and check
if(PYEXPANDER_FOUND AND PYTHONINTERP_FOUND)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${generated_file_name}.checked
COMMAND ${CMAKE_COMMAND}
-DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}
-DPYEXPANDER_COMMAND=${PYEXPANDER_COMMAND}
-DSOURCE_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${generated_file_name}
-DGENERATED_FILE=${CMAKE_CURRENT_BINARY_DIR}/${generated_file_name}
-P ${VTKm_CMAKE_MODULE_PATH}/VTKmCheckPyexpander.cmake
MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/${generated_file_name}.in
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${generated_file_name}
COMMENT "Checking validity of ${generated_file_name}"
)
add_custom_target(check_${generated_file_name} ALL
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${generated_file_name}.checked
)
endif()
endfunction(vtkm_pyexpander_generated_file)
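# Example (hypothetical file name): given MyArrayImpl.cxx generated from
# MyArrayImpl.cxx.in by pyexpander, add a target that verifies the committed
# file is up to date:
#   vtkm_pyexpander_generated_file(MyArrayImpl.cxx)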
# Declare unit tests, which should be in the same directory as a kit
# (package, module, whatever you call it). Usage:
#
# vtkm_unit_tests(
# SOURCES <source_list>
# LIBRARIES <dependent_library_list>
# TEST_ARGS <argument_list>
# )
function(vtkm_unit_tests)
set(options CUDA)
set(oneValueArgs)
set(multiValueArgs SOURCES LIBRARIES TEST_ARGS)
cmake_parse_arguments(VTKm_UT
"${options}" "${oneValueArgs}" "${multiValueArgs}"
${ARGN}
)
if (VTKm_ENABLE_TESTING)
vtkm_get_kit_name(kit)
#we use UnitTests_ so that it is a unique key to exclude from coverage
set(test_prog UnitTests_${kit})
create_test_sourcelist(TestSources ${test_prog}.cxx ${VTKm_UT_SOURCES})
#determine the timeout for all the tests based on the backend. CUDA tests
#generally require more time because of kernel generation.
set(timeout 180)
if (VTKm_UT_CUDA)
set(timeout 1500)
endif()
if (VTKm_UT_CUDA)
vtkm_setup_nvcc_flags( old_nvcc_flags old_cxx_flags )
# Cuda compiles do not respect target_include_directories
cuda_include_directories(${VTKm_SOURCE_DIR}
${VTKm_BINARY_INCLUDE_DIR}
${VTKm_INCLUDE_DIRS}
)
cuda_add_executable(${test_prog} ${TestSources})
set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
set(CMAKE_CXX_FLAGS ${old_cxx_flags})
else (VTKm_UT_CUDA)
add_executable(${test_prog} ${TestSources})
endif (VTKm_UT_CUDA)
set_target_properties(${test_prog} PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
LIBRARY_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${VTKm_EXECUTABLE_OUTPUT_PATH}
)
#do it as a property value so we don't pollute the include_directories
#for any other targets
target_include_directories(${test_prog} PRIVATE ${VTKm_INCLUDE_DIRS})
target_link_libraries(${test_prog} PRIVATE vtkm_cont ${VTKm_LIBRARIES})
target_compile_options(${test_prog} PRIVATE ${VTKm_COMPILE_OPTIONS})
vtkm_setup_msvc_properties(${test_prog})
foreach (test ${VTKm_UT_SOURCES})
get_filename_component(tname ${test} NAME_WE)
add_test(NAME ${tname}
COMMAND ${test_prog} ${tname} ${VTKm_UT_TEST_ARGS}
)
set_tests_properties("${tname}" PROPERTIES TIMEOUT ${timeout})
endforeach (test)
endif (VTKm_ENABLE_TESTING)
endfunction(vtkm_unit_tests)
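# Example (hypothetical sources): builds UnitTests_<kit> and registers one
# CTest test per source file:
#   vtkm_unit_tests(
#     SOURCES UnitTestArrayHandle.cxx UnitTestDeviceAdapter.cxx
#     LIBRARIES vtkm_cont
#     )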
# Save the worklets to test with each device adapter
# Usage:
#
# vtkm_save_worklet_unit_tests( sources )
#
# notes: will save the sources' absolute paths in the
# vtkm_worklet_unit_tests_sources global property
function(vtkm_save_worklet_unit_tests )
#create the test driver when we are called, since
#the test driver expects the test files to be in the same
#directory as the test driver
create_test_sourcelist(test_sources WorkletTestDriver.cxx ${ARGN})
#store the absolute path for the test driver and all the test
#files
set(driver ${CMAKE_CURRENT_BINARY_DIR}/WorkletTestDriver.cxx)
set(cxx_sources)
set(cu_sources)
#we need to store the absolute source for the file so that
#we can properly compile it into the test driver. At
#the same time we want to configure each file into the build
#directory as a .cu file so that we can compile it with cuda
#if needed
foreach(fname ${ARGN})
set(absPath)
get_filename_component(absPath ${fname} ABSOLUTE)
get_filename_component(file_name_only ${fname} NAME_WE)
set(cuda_file_name "${CMAKE_CURRENT_BINARY_DIR}/${file_name_only}.cu")
configure_file("${absPath}"
"${cuda_file_name}"
COPYONLY)
list(APPEND cxx_sources ${absPath})
list(APPEND cu_sources ${cuda_file_name})
endforeach()
#we create a property that holds all the worklets to test,
#but don't actually attempt to create a unit test with them yet.
#That is done by each device adapter
set_property( GLOBAL APPEND
PROPERTY vtkm_worklet_unit_tests_sources ${cxx_sources})
set_property( GLOBAL APPEND
PROPERTY vtkm_worklet_unit_tests_cu_sources ${cu_sources})
set_property( GLOBAL APPEND
PROPERTY vtkm_worklet_unit_tests_drivers ${driver})
endfunction(vtkm_save_worklet_unit_tests)
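# Example (hypothetical sources): record the worklet tests once; the actual
# executables are created later, per device adapter:
#   vtkm_save_worklet_unit_tests(
#     UnitTestCellAverage.cxx
#     UnitTestThreshold.cxx
#     )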
# Call each worklet test for the given device adapter
# Usage:
#
# vtkm_worklet_unit_tests( device_adapter )
#
# notes: will look for the vtkm_worklet_unit_tests_sources global
# property to find the worklet unit tests that need to be
# compiled for the given device adapter
function(vtkm_worklet_unit_tests device_adapter)
set(unit_test_srcs)
get_property(unit_test_srcs GLOBAL
PROPERTY vtkm_worklet_unit_tests_sources )
set(unit_test_drivers)
get_property(unit_test_drivers GLOBAL
PROPERTY vtkm_worklet_unit_tests_drivers )
#detect if we are generating .cu files
set(is_cuda FALSE)
if("${device_adapter}" STREQUAL "VTKM_DEVICE_ADAPTER_CUDA")
set(is_cuda TRUE)
endif()
#determine the timeout for all the tests based on the backend. The first CUDA
#worklet test requires way more time because of the overhead to allow the
#driver to convert the kernel code from virtual arch to actual arch.
#
set(timeout 180)
if(is_cuda)
set(timeout 1500)
endif()
if(VTKm_ENABLE_TESTING)
string(REPLACE "VTKM_DEVICE_ADAPTER_" "" device_type ${device_adapter})
vtkm_get_kit_name(kit)
#inject the device adapter into the test program name so each one is unique
set(test_prog WorkletTests_${device_type})
if(is_cuda)
get_property(unit_test_srcs GLOBAL PROPERTY vtkm_worklet_unit_tests_cu_sources )
vtkm_setup_nvcc_flags( old_nvcc_flags old_cxx_flags )
# Cuda compiles do not respect target_include_directories
cuda_include_directories(${VTKm_SOURCE_DIR}
${VTKm_BINARY_INCLUDE_DIR}
${VTKm_INCLUDE_DIRS}
)
cuda_add_executable(${test_prog} ${unit_test_drivers} ${unit_test_srcs})
set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
set(CMAKE_CXX_FLAGS ${old_cxx_flags})
else()
add_executable(${test_prog} ${unit_test_drivers} ${unit_test_srcs})
endif()
set_target_properties(${test_prog} PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
LIBRARY_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${VTKm_EXECUTABLE_OUTPUT_PATH}
)
target_include_directories(${test_prog} PRIVATE ${VTKm_INCLUDE_DIRS})
target_link_libraries(${test_prog} PRIVATE vtkm_cont ${VTKm_LIBRARIES})
#add the specific compile options for this executable
target_compile_options(${test_prog} PRIVATE ${VTKm_COMPILE_OPTIONS})
#add a test for each worklet test file. We will inject the device
#adapter type into the test name so that it is easier to see what
#exact device a test is failing on.
foreach (test ${unit_test_srcs})
get_filename_component(tname ${test} NAME_WE)
add_test(NAME "${tname}${device_type}"
COMMAND ${test_prog} ${tname}
)
set_tests_properties("${tname}${device_type}" PROPERTIES TIMEOUT ${timeout})
endforeach (test)
vtkm_setup_msvc_properties(${test_prog})
#set the device adapter on the executable
target_compile_definitions(${test_prog} PRIVATE "VTKM_DEVICE_ADAPTER=${device_adapter}")
endif()
endfunction(vtkm_worklet_unit_tests)
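# Example: instantiate the saved worklet tests for two backends, producing
# the executables WorkletTests_SERIAL and WorkletTests_CUDA:
#   vtkm_worklet_unit_tests(VTKM_DEVICE_ADAPTER_SERIAL)
#   vtkm_worklet_unit_tests(VTKM_DEVICE_ADAPTER_CUDA)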
# Save the benchmarks to run with each device adapter
# This is based on vtkm_save_worklet_unit_tests
# Usage:
#
# vtkm_save_benchmarks( <sources> [HEADERS <headers>] )
#
#
# Each benchmark source file needs to implement main(int argc, char *argv[])
#
# notes: will save the sources' absolute paths in the
# vtkm_benchmarks_sources global property
function(vtkm_save_benchmarks)
#store the absolute path for all the test files
set(cxx_sources)
set(cu_sources)
cmake_parse_arguments(save_benchmarks "" "" "HEADERS" ${ARGN})
#we need to store the absolute source for the file so that
#we can properly compile it into the benchmark driver. At
#the same time we want to configure each file into the build
#directory as a .cu file so that we can compile it with cuda
#if needed
foreach(fname ${save_benchmarks_UNPARSED_ARGUMENTS})
set(absPath)
get_filename_component(absPath ${fname} ABSOLUTE)
get_filename_component(file_name_only ${fname} NAME_WE)
set(cuda_file_name "${CMAKE_CURRENT_BINARY_DIR}/${file_name_only}.cu")
configure_file("${absPath}"
"${cuda_file_name}"
COPYONLY)
list(APPEND cxx_sources ${absPath})
list(APPEND cu_sources ${cuda_file_name})
endforeach()
#we create a property that holds all the benchmarks to run,
#but don't actually attempt to create an executable for them yet.
#That is done by each device adapter
set_property( GLOBAL APPEND
PROPERTY vtkm_benchmarks_sources ${cxx_sources})
set_property( GLOBAL APPEND
PROPERTY vtkm_benchmarks_cu_sources ${cu_sources})
set_property( GLOBAL APPEND
PROPERTY vtkm_benchmarks_headers ${save_benchmarks_HEADERS})
endfunction(vtkm_save_benchmarks)
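# Example (hypothetical file names): register benchmark sources along with a
# shared header:
#   vtkm_save_benchmarks(
#     BenchmarkDeviceAdapter.cxx
#     HEADERS Benchmarker.h
#     )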
# Call each benchmark for the given device adapter
# Usage:
#
# vtkm_benchmark( device_adapter )
#
# notes: will look for the vtkm_benchmarks_sources global
# property to find the benchmarks that need to be
# compiled for the given device adapter
function(vtkm_benchmarks device_adapter)
set(benchmark_srcs)
get_property(benchmark_srcs GLOBAL
PROPERTY vtkm_benchmarks_sources )
set(benchmark_headers)
get_property(benchmark_headers GLOBAL
PROPERTY vtkm_benchmarks_headers )
#detect if we are generating .cu files
set(is_cuda FALSE)
set(old_nvcc_flags ${CUDA_NVCC_FLAGS})
set(old_cxx_flags ${CMAKE_CXX_FLAGS})
if("${device_adapter}" STREQUAL "VTKM_DEVICE_ADAPTER_CUDA")
set(is_cuda TRUE)
endif()
if(VTKm_ENABLE_BENCHMARKS)
string(REPLACE "VTKM_DEVICE_ADAPTER_" "" device_type ${device_adapter})
if(is_cuda)
vtkm_setup_nvcc_flags( old_nvcc_flags old_cxx_flags )
get_property(benchmark_srcs GLOBAL PROPERTY vtkm_benchmarks_cu_sources )
endif()
foreach( file ${benchmark_srcs})
#inject the device adapter into the benchmark program name so each one is unique
get_filename_component(benchmark_prog ${file} NAME_WE)
set(benchmark_prog "${benchmark_prog}_${device_type}")
if(is_cuda)
# Cuda compiles do not respect target_include_directories
cuda_include_directories(${VTKm_SOURCE_DIR}
${VTKm_BINARY_INCLUDE_DIR}
${VTKm_BACKEND_INCLUDE_DIRS}
)
cuda_add_executable(${benchmark_prog} ${file} ${benchmark_headers})
else()
add_executable(${benchmark_prog} ${file} ${benchmark_headers})
endif()
set_target_properties(${benchmark_prog} PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
LIBRARY_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${VTKm_EXECUTABLE_OUTPUT_PATH}
)
set_source_files_properties(${benchmark_headers}
PROPERTIES HEADER_FILE_ONLY TRUE)
target_include_directories(${benchmark_prog} PRIVATE ${VTKm_BACKEND_INCLUDE_DIRS})
target_link_libraries(${benchmark_prog} PRIVATE vtkm_cont ${VTKm_BACKEND_LIBRARIES})
vtkm_setup_msvc_properties(${benchmark_prog})
#add the specific compile options for this executable
target_compile_options(${benchmark_prog} PRIVATE ${VTKm_COMPILE_OPTIONS})
#set the device adapter on the executable
target_compile_definitions(${benchmark_prog} PRIVATE "VTKM_DEVICE_ADAPTER=${device_adapter}")
endforeach()
if(is_cuda)
set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
set(CMAKE_CXX_FLAGS ${old_cxx_flags})
endif()
endif()
endfunction(vtkm_benchmarks)
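# Example: build every saved benchmark for the TBB backend; a source named
# BenchmarkDeviceAdapter.cxx would yield BenchmarkDeviceAdapter_TBB:
#   vtkm_benchmarks(VTKM_DEVICE_ADAPTER_TBB)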
# Given a list of *.cxx source files that during configure time are determined
# to have CUDA code, wrap the sources in *.cu files so that they get compiled
# with nvcc.
function(vtkm_wrap_sources_for_cuda cuda_source_list_var)
set(original_sources ${ARGN})
set(cuda_sources)
foreach(source_file ${original_sources})
get_filename_component(source_name ${source_file} NAME_WE)
get_filename_component(source_file_path ${source_file} ABSOLUTE)
set(wrapped_file ${CMAKE_CURRENT_BINARY_DIR}/${source_name}.cu)
configure_file(
${VTKm_SOURCE_DIR}/CMake/WrapCUDASource.cu.in
${wrapped_file}
@ONLY)
list(APPEND cuda_sources ${wrapped_file})
endforeach(source_file)
# Set original sources as header files (which they basically are) so that
# we can add them to the file list and they will show up in IDE but they will
# not be compiled separately.
set_source_files_properties(${original_sources}
PROPERTIES HEADER_FILE_ONLY TRUE
)
set(${cuda_source_list_var} ${cuda_sources} ${original_sources} PARENT_SCOPE)
endfunction(vtkm_wrap_sources_for_cuda)
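# Example (hypothetical sources): wrap two .cxx files so nvcc compiles them,
# then build the returned list (wrapped .cu files plus the originals, which
# are marked header-only):
#   vtkm_wrap_sources_for_cuda(cuda_srcs FilterA.cxx FilterB.cxx)
#   cuda_add_library(my_filters ${cuda_srcs})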
# Add a VTK-m library. The name of the library will match the "kit" name
# (e.g. vtkm_rendering) unless the NAME argument is given.
#
# vtkm_library(
# [NAME <name>]
# SOURCES <source_list>
# [HEADERS <headers_list>]
# [CUDA]
# [WRAP_FOR_CUDA <source_list>]
# [LIBRARIES <dependent_library_list>]
# )
function(vtkm_library)
set(options CUDA)
set(oneValueArgs NAME)
set(multiValueArgs SOURCES HEADERS WRAP_FOR_CUDA)
cmake_parse_arguments(VTKm_LIB
"${options}" "${oneValueArgs}" "${multiValueArgs}"
${ARGN}
)
vtkm_get_kit_name(kit dir_prefix)
if(VTKm_LIB_NAME)
set(lib_name ${VTKm_LIB_NAME})
else()
set(lib_name ${kit})
endif()
list(APPEND VTKm_LIB_SOURCES ${VTKm_LIB_HEADERS})
set_source_files_properties(${VTKm_LIB_HEADERS}
PROPERTIES HEADER_FILE_ONLY TRUE
)
if(VTKm_LIB_WRAP_FOR_CUDA)
if(VTKm_ENABLE_CUDA)
# If we have some sources marked as WRAP_FOR_CUDA and we support CUDA,
# then we need to turn on CUDA, wrap those sources, and add the wrapped
# code to the sources list.
set(VTKm_LIB_CUDA TRUE)
vtkm_wrap_sources_for_cuda(cuda_sources ${VTKm_LIB_WRAP_FOR_CUDA})
list(APPEND VTKm_LIB_SOURCES ${cuda_sources})
else()
# If we have some sources marked as WRAP_FOR_CUDA but we do not support
# CUDA, then just compile these sources normally by adding them to the
# sources list.
list(APPEND VTKm_LIB_SOURCES ${VTKm_LIB_WRAP_FOR_CUDA})
endif()
endif()
if(VTKm_LIB_CUDA)
vtkm_setup_nvcc_flags(old_nvcc_flags old_cxx_flags)
# Cuda compiles do not respect target_include_directories
cuda_include_directories(${VTKm_SOURCE_DIR}
${VTKm_BINARY_INCLUDE_DIR}
${VTKm_BACKEND_INCLUDE_DIRS}
)
if(BUILD_SHARED_LIBS AND NOT WIN32)
set(compile_options -Xcompiler=${CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY}hidden)
endif()
cuda_add_library(${lib_name} ${VTKm_LIB_SOURCES}
OPTIONS "${compile_options}")
set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
set(CMAKE_CXX_FLAGS ${old_cxx_flags})
else()
add_library(${lib_name} ${VTKm_LIB_SOURCES})
endif()
vtkm_target(${lib_name})
target_link_libraries(${lib_name} PUBLIC vtkm)
target_link_libraries(${lib_name} PRIVATE
${VTKm_BACKEND_LIBRARIES}
${VTKm_LIB_LIBRARIES}
)
set(cxx_args ${VTKm_COMPILE_OPTIONS})
separate_arguments(cxx_args)
target_compile_options(${lib_name} PRIVATE ${cxx_args})
# Make sure libraries go to lib directory and dll go to bin directory.
# Mostly important on Windows.
set_target_properties(${lib_name} PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
LIBRARY_OUTPUT_DIRECTORY ${VTKm_LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${VTKm_EXECUTABLE_OUTPUT_PATH}
)
vtkm_setup_msvc_properties(${lib_name})
if(VTKm_EXTRA_COMPILER_WARNINGS)
set(cxx_args ${CMAKE_CXX_FLAGS_WARN_EXTRA})
separate_arguments(cxx_args)
target_compile_options(${lib_name}
PRIVATE ${cxx_args}
)
endif(VTKm_EXTRA_COMPILER_WARNINGS)
#Now generate a header that holds the macros needed to easily export
#template classes.
string(TOUPPER ${lib_name} BASE_NAME_UPPER)
set(EXPORT_MACRO_NAME "${BASE_NAME_UPPER}")
set(EXPORT_IS_BUILT_STATIC 0)
get_target_property(is_static ${lib_name} TYPE)
if(${is_static} STREQUAL "STATIC_LIBRARY")
#If we are building statically set the define symbol
set(EXPORT_IS_BUILT_STATIC 1)
endif()
unset(is_static)
get_target_property(EXPORT_IMPORT_CONDITION ${lib_name} DEFINE_SYMBOL)
if(NOT EXPORT_IMPORT_CONDITION)
#set EXPORT_IMPORT_CONDITION to what the DEFINE_SYMBOL would be when
#building shared
set(EXPORT_IMPORT_CONDITION ${lib_name}_EXPORTS)
endif()
configure_file(
${VTKm_SOURCE_DIR}/CMake/VTKmExportHeaderTemplate.h.in
${VTKm_BINARY_INCLUDE_DIR}/${dir_prefix}/${lib_name}_export.h
@ONLY)
unset(EXPORT_MACRO_NAME)
unset(EXPORT_IS_BUILT_STATIC)
unset(EXPORT_IMPORT_CONDITION)
install(TARGETS ${lib_name}
EXPORT ${VTKm_EXPORT_NAME}
ARCHIVE DESTINATION ${VTKm_INSTALL_LIB_DIR}
LIBRARY DESTINATION ${VTKm_INSTALL_LIB_DIR}
RUNTIME DESTINATION ${VTKm_INSTALL_BIN_DIR}
)
vtkm_install_headers("${dir_prefix}"
${VTKm_BINARY_INCLUDE_DIR}/${dir_prefix}/${lib_name}_export.h
${VTKm_LIB_HEADERS}
)
endfunction(vtkm_library)
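# Example (hypothetical sources): build a library named after the current
# kit, wrapping one source for CUDA when that backend is enabled:
#   vtkm_library(
#     SOURCES FilterImpl.cxx
#     HEADERS Filter.h
#     WRAP_FOR_CUDA FilterDeviceImpl.cxx
#     )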
# The Thrust project is not as careful as the VTKm project in avoiding warnings
# on shadow variables and unused arguments. With a real GCC compiler, you
# can disable these warnings inline, but with something like nvcc, those
# pragmas cause errors. Thus, this macro will disable the compiler warnings.
macro(vtkm_disable_troublesome_thrust_warnings)
vtkm_disable_troublesome_thrust_warnings_var(CMAKE_CXX_FLAGS_DEBUG)
vtkm_disable_troublesome_thrust_warnings_var(CMAKE_CXX_FLAGS_MINSIZEREL)
vtkm_disable_troublesome_thrust_warnings_var(CMAKE_CXX_FLAGS_RELEASE)
vtkm_disable_troublesome_thrust_warnings_var(CMAKE_CXX_FLAGS_RELWITHDEBINFO)
endmacro(vtkm_disable_troublesome_thrust_warnings)
macro(vtkm_disable_troublesome_thrust_warnings_var flags_var)
set(old_flags "${${flags_var}}")
string(REPLACE "-Wshadow" "" new_flags "${old_flags}")
string(REPLACE "-Wunused-parameter" "" new_flags "${new_flags}")
string(REPLACE "-Wunused" "" new_flags "${new_flags}")
string(REPLACE "-Wextra" "" new_flags "${new_flags}")
string(REPLACE "-Wall" "" new_flags "${new_flags}")
set(${flags_var} "${new_flags}")
endmacro(vtkm_disable_troublesome_thrust_warnings_var)
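# Example: strip the warning flags from every build configuration before
# adding Thrust-heavy targets:
#   vtkm_disable_troublesome_thrust_warnings()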
include(VTKmConfigureComponents)

@ -141,7 +141,7 @@ function(vtkm_library)
endif()
set(lib_name ${VTKm_LIB_NAME})
if(VTKm_ENABLE_CUDA)
if(TARGET vtkm::cuda)
set_source_files_properties(${VTKm_LIB_WRAP_FOR_CUDA} PROPERTIES LANGUAGE "CUDA")
endif()
@ -176,23 +176,37 @@ endfunction(vtkm_library)
# Declare unit tests, which should be in the same directory as a kit
# (package, module, whatever you call it). Usage:
#
# [CUDA]: mark all source files as being compiled with the CUDA compiler
# vtkm_unit_tests(
# NAME
# SOURCES <source_list>
# BACKEND <type>
# LIBRARIES <dependent_library_list>
# TEST_ARGS <argument_list>
# <options>
# )
#
# [BACKEND]: mark all source files as being compiled with the proper defines
# to make this backend the default backend.
# If the backend is specified as CUDA it will also imply all
# sources should be treated as CUDA sources.
# The backend name will also be added to the executable name
# so you can test multiple backends easily.
# vtkm_unit_tests(
# NAME
# CUDA
# SOURCES <source_list>
# BACKEND <type>
# LIBRARIES <dependent_library_list>
# TEST_ARGS <argument_list>
# )
#
# [LIBRARIES] : extra libraries that this set of tests needs to link against
#
# [TEST_ARGS] : arguments that should be passed on the command line to the
# test executable
#
# Supported <options> are documented below. These can be specified for
# all tests or for individual tests. When specifying these for individual tests,
# simply add them after the test name in the <source_list> separated by a comma.
# e.g. `UnitTestMultiBlock,MPI`.
#
# Supported <options> are
# * MPI : the test(s) will be executed using `mpirun`.
#
function(vtkm_unit_tests)
set(options CUDA NO_TESTS)
set(options MPI)
set(oneValueArgs BACKEND NAME)
set(multiValueArgs SOURCES LIBRARIES TEST_ARGS)
cmake_parse_arguments(VTKm_UT
@ -204,12 +218,11 @@ function(vtkm_unit_tests)
return()
endif()
vtkm_parse_test_options(VTKm_UT_SOURCES "${options}" ${VTKm_UT_SOURCES})
set(backend )
if(VTKm_UT_BACKEND)
set(backend "_${VTKm_UT_BACKEND}")
if(VTKm_UT_BACKEND STREQUAL "CUDA")
set(VTKm_UT_CUDA "TRUE")
endif()
endif()
vtkm_get_kit_name(kit)
@ -219,6 +232,9 @@ function(vtkm_unit_tests)
set(test_prog "${VTKm_UT_NAME}${backend}")
endif()
if(VTKm_UT_BACKEND STREQUAL "CUDA")
set_source_files_properties(${VTKm_UT_SOURCES} PROPERTIES LANGUAGE "CUDA")
endif()
create_test_sourcelist(TestSources ${test_prog}.cxx ${VTKm_UT_SOURCES})
@ -238,7 +254,7 @@ function(vtkm_unit_tests)
#determine the timeout for all the tests based on the backend. CUDA tests
#generally require more time because of kernel generation.
set(timeout 180)
if(VTKm_UT_CUDA)
if(VTKm_UT_BACKEND STREQUAL "CUDA")
set(timeout 1500)
endif()
foreach (test ${VTKm_UT_SOURCES})
@ -251,20 +267,36 @@ function(vtkm_unit_tests)
endfunction(vtkm_unit_tests)
#-----------------------------------------------------------------------------
# Declare benchmarks, which use all the same infrastructure as tests but
# don't actually do the add_test at the end
# -----------------------------------------------------------------------------
# vtkm_parse_test_options(varname options)
# INTERNAL: Parse options specified for individual tests.
#
# [BACKEND]: mark all source files as being compiled with the proper defines
# to make this backend the default backend.
# If the backend is specified as CUDA it will also imply all
# sources should be treated as CUDA sources.
# The backend name will also be added to the executable name
# so you can test multiple backends easily.
# vtkm_benchmarks(
# SOURCES <source_list>
# BACKEND <type>
# LIBRARIES <dependent_library_list>
# )
function(vtkm_benchmarks)
vtkm_unit_tests(NAME Benchmarks NO_TESTS ${ARGN})
endfunction(vtkm_benchmarks)
# Parses the arguments to separate out options specified after the test name
# separated by a comma e.g.
#
# TestName,Option1,Option2
#
# For every option in options, this will set _TestName_Option1,
# _TestName_Option2, etc in the parent scope.
#
function(vtkm_parse_test_options varname options)
set(names)
foreach(arg IN LISTS ARGN)
set(test_name ${arg})
set(test_options)
if(test_name AND "x${test_name}" MATCHES "^x([^,]*),(.*)$")
set(test_name "${CMAKE_MATCH_1}")
string(REPLACE "," ";" test_options "${CMAKE_MATCH_2}")
endif()
foreach(opt IN LISTS test_options)
list(FIND options "${opt}" index)
if(index EQUAL -1)
message(WARNING "Unknown option '${opt}' specified for test '${test_name}'")
else()
set(_${test_name}_${opt} TRUE PARENT_SCOPE)
endif()
endforeach()
list(APPEND names ${test_name})
endforeach()
set(${varname} ${names} PARENT_SCOPE)
endfunction()
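# Example (hypothetical test names): given the sources
# "UnitTestMultiBlock,MPI;UnitTestArray", the call
#   vtkm_parse_test_options(VTKm_UT_SOURCES "MPI" ${VTKm_UT_SOURCES})
# sets _UnitTestMultiBlock_MPI to TRUE in the caller's scope and returns the
# cleaned list "UnitTestMultiBlock;UnitTestArray" in VTKm_UT_SOURCES.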

@ -75,6 +75,7 @@ option(VTKm_ENABLE_TBB "Enable TBB support" OFF)
option(VTKm_ENABLE_RENDERING "Enable rendering library" ON)
option(VTKm_ENABLE_TESTING "Enable VTKm Testing" ON)
option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF)
option(VTKm_ENABLE_MPI "Enable MPI support" OFF)
option(VTKm_ENABLE_DOCUMENTATION "Build Doxygen documentation" OFF)
option(VTKm_ENABLE_EXAMPLES "Build examples" OFF)
@ -181,6 +182,11 @@ find_package(Pyexpander)
#-----------------------------------------------------------------------------
# Add subdirectories
if(VTKm_ENABLE_MPI)
# This `if` is temporary and will be removed once `diy` supports building
# without MPI.
add_subdirectory(diy)
endif()
add_subdirectory(vtkm)
#-----------------------------------------------------------------------------

65
diy/CMakeLists.txt Normal file

@ -0,0 +1,65 @@
##=============================================================================
##
## Copyright (c) Kitware, Inc.
## All rights reserved.
## See LICENSE.txt for details.
##
## This software is distributed WITHOUT ANY WARRANTY; without even
## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
## PURPOSE. See the above copyright notice for more information.
##
## Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
## Copyright 2017 UT-Battelle, LLC.
## Copyright 2017 Los Alamos National Security.
##
## Under the terms of Contract DE-NA0003525 with NTESS,
## the U.S. Government retains certain rights in this software.
## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
## Laboratory (LANL), the U.S. Government retains certain rights in
## this software.
##
##=============================================================================
#==============================================================================
# See License.txt
#==============================================================================
add_library(diy INTERFACE)
# diy needs C++11
target_compile_features(diy INTERFACE cxx_auto_type)
target_include_directories(diy INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:${VTKm_INSTALL_INCLUDE_DIR}>)
# Presently, this dependency is required. Make it optional in the future.
set(arg)
foreach(apath IN LISTS MPI_C_INCLUDE_PATH MPI_CXX_INCLUDE_PATH)
list(APPEND arg $<BUILD_INTERFACE:${apath}>)
endforeach()
target_include_directories(diy INTERFACE ${arg})
target_link_libraries(diy INTERFACE
$<BUILD_INTERFACE:${MPI_C_LIBRARIES}>
$<BUILD_INTERFACE:${MPI_CXX_LIBRARIES}>)
if(MPI_C_COMPILE_DEFINITIONS)
target_compile_definitions(diy INTERFACE
$<$<COMPILE_LANGUAGE:C>:${MPI_C_COMPILE_DEFINITIONS}>)
endif()
if(MPI_CXX_COMPILE_DEFINITIONS)
target_compile_definitions(diy INTERFACE
$<$<COMPILE_LANGUAGE:CXX>:${MPI_CXX_COMPILE_DEFINITIONS}>)
endif()
install(TARGETS diy
EXPORT ${VTKm_EXPORT_NAME})
# Install headers
install(DIRECTORY include/diy
DESTINATION ${VTKm_INSTALL_INCLUDE_DIR})
# Install other files.
install(FILES LEGAL.txt LICENSE.txt
DESTINATION ${VTKm_INSTALL_INCLUDE_DIR}/diy
)
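# Example (hypothetical consumer): linking the interface target is enough to
# pick up the diy headers and, for now, the MPI usage requirements:
#   target_link_libraries(vtkm_cont PRIVATE diy)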

19
diy/LEGAL.txt Normal file

@ -0,0 +1,19 @@
Copyright Notice
DIY2, Copyright (c) 2015, The Regents of the University of California, through
Lawrence Berkeley National Laboratory (subject to receipt of any required
approvals from the U.S. Dept. of Energy). All rights reserved.
If you have questions about your rights to use or distribute this software,
please contact Berkeley Lab's Technology Transfer Department at TTD@lbl.gov.
NOTICE. This software is owned by the U.S. Department of Energy. As such, the
U.S. Government has been granted for itself and others acting on its behalf a
paid-up, nonexclusive, irrevocable, worldwide license in the Software to
reproduce, prepare derivative works, and perform publicly and display publicly.
Beginning five (5) years after the date permission to assert copyright is
obtained from the U.S. Department of Energy, and subject to any subsequent five
(5) year renewals, the U.S. Government is granted for itself and others acting
on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
Software to reproduce, prepare derivative works, distribute copies to the
public, perform publicly and display publicly, and to permit others to do so.

41
diy/LICENSE.txt Normal file

@ -0,0 +1,41 @@
License Agreement
"DIY2, Copyright (c) 2015, The Regents of the University of California, through
Lawrence Berkeley National Laboratory (subject to receipt of any required
approvals from the U.S. Dept. of Energy). All rights reserved."
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
(1) Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
(2) Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
(3) Neither the name of the University of California, Lawrence Berkeley National
Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used
to endorse or promote products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You are under no obligation whatsoever to provide any bug fixes, patches, or
upgrades to the features, functionality or performance of the source code
("Enhancements") to anyone; however, if you choose to make your Enhancements
available either publicly, or directly to Lawrence Berkeley National Laboratory,
without imposing a separate written license agreement for such Enhancements,
then you hereby grant the following license: a non-exclusive, royalty-free
perpetual license to install, use, modify, prepare derivative works, incorporate
into other computer software, distribute, and sublicense such enhancements or
derivative works thereof, in binary and source code form.

@ -0,0 +1,191 @@
#ifndef DIY_ALGORITHMS_HPP
#define DIY_ALGORITHMS_HPP
#include <vector>
#include "master.hpp"
#include "assigner.hpp"
#include "reduce.hpp"
#include "reduce-operations.hpp"
#include "partners/swap.hpp"
#include "detail/algorithms/sort.hpp"
#include "detail/algorithms/kdtree.hpp"
#include "detail/algorithms/kdtree-sampling.hpp"
#include "log.hpp"
namespace diy
{
/**
* \ingroup Algorithms
* \brief sample sort `values` of each block, store the boundaries between blocks in `samples`
*/
template<class Block, class T, class Cmp>
void sort(Master& master, //!< master object
const Assigner& assigner, //!< assigner object
std::vector<T> Block::* values, //!< all values to sort
std::vector<T> Block::* samples, //!< (output) boundaries of blocks
size_t num_samples, //!< desired number of samples
const Cmp& cmp, //!< comparison function
int k = 2, //!< k-ary reduction will be used
bool samples_only = false) //!< false: results will be all_to_all exchanged; true: only sort but don't exchange results
{
bool immediate = master.immediate();
master.set_immediate(false);
// NB: although sorter will go out of scope, its member functions sample()
// and exchange() will return functors whose copies get saved inside reduce
detail::SampleSort<Block,T,Cmp> sorter(values, samples, cmp, num_samples);
// swap-reduce to all-gather samples
RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()), assigner.nblocks());
RegularSwapPartners partners(decomposer, k);
reduce(master, assigner, partners, sorter.sample(), detail::SkipIntermediate(partners.rounds()));
// all_to_all to exchange the values
if (!samples_only)
all_to_all(master, assigner, sorter.exchange(), k);
master.set_immediate(immediate);
}
/**
* \ingroup Algorithms
* \brief sample sort `values` of each block, store the boundaries between blocks in `samples`
* shorter version of above sort algorithm with the default less-than comparator used for T
* and all_to_all exchange included
*/
template<class Block, class T>
void sort(Master& master, //!< master object
const Assigner& assigner, //!< assigner object
std::vector<T> Block::* values, //!< all values to sort
std::vector<T> Block::* samples, //!< (output) boundaries of blocks
size_t num_samples, //!< desired number of samples
int k = 2) //!< k-ary reduction will be used
{
sort(master, assigner, values, samples, num_samples, std::less<T>(), k);
}
/**
* \ingroup Algorithms
* \brief build a kd-tree and sort a set of points into it (use histograms to determine split values)
*/
template<class Block, class Point>
void kdtree(Master& master, //!< master object
const Assigner& assigner, //!< assigner object
int dim, //!< dimensionality
const ContinuousBounds& domain, //!< global data extents
std::vector<Point> Block::* points, //!< input points to sort into kd-tree
size_t bins, //!< number of histogram bins for splitting a dimension
bool wrap = false)//!< periodic boundaries in all dimensions
{
if (assigner.nblocks() & (assigner.nblocks() - 1))
throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
typedef diy::RegularContinuousLink RCLink;
for (size_t i = 0; i < master.size(); ++i)
{
RCLink* link = static_cast<RCLink*>(master.link(i));
*link = RCLink(dim, domain, domain);
if (wrap) // set up the links to self
{
diy::BlockID self = { master.gid(i), master.communicator().rank() };
for (int j = 0; j < dim; ++j)
{
diy::Direction dir, wrap_dir;
// left
dir[j] = -1; wrap_dir[j] = -1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
// right
dir[j] = 1; wrap_dir[j] = 1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
}
}
}
detail::KDTreePartition<Block,Point> kdtree_partition(dim, points, bins);
detail::KDTreePartners partners(dim, assigner.nblocks(), wrap, domain);
reduce(master, assigner, partners, kdtree_partition);
// update master.expected to match the links
int expected = 0;
for (size_t i = 0; i < master.size(); ++i)
expected += master.link(i)->size_unique();
master.set_expected(expected);
}
/**
* \ingroup Algorithms
* \brief build a kd-tree and sort a set of points into it (use sampling to determine split values)
*/
template<class Block, class Point>
void kdtree_sampling
(Master& master, //!< master object
const Assigner& assigner, //!< assigner object
int dim, //!< dimensionality
const ContinuousBounds& domain, //!< global data extents
std::vector<Point> Block::* points, //!< input points to sort into kd-tree
size_t samples, //!< number of samples to take in each block
bool wrap = false)//!< periodic boundaries in all dimensions
{
if (assigner.nblocks() & (assigner.nblocks() - 1))
throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
typedef diy::RegularContinuousLink RCLink;
for (size_t i = 0; i < master.size(); ++i)
{
RCLink* link = static_cast<RCLink*>(master.link(i));
*link = RCLink(dim, domain, domain);
if (wrap) // set up the links to self
{
diy::BlockID self = { master.gid(i), master.communicator().rank() };
for (int j = 0; j < dim; ++j)
{
diy::Direction dir, wrap_dir;
// left
dir[j] = -1; wrap_dir[j] = -1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
// right
dir[j] = 1; wrap_dir[j] = 1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
}
}
}
detail::KDTreeSamplingPartition<Block,Point> kdtree_partition(dim, points, samples);
detail::KDTreePartners partners(dim, assigner.nblocks(), wrap, domain);
reduce(master, assigner, partners, kdtree_partition);
// update master.expected to match the links
int expected = 0;
for (size_t i = 0; i < master.size(); ++i)
expected += master.link(i)->size_unique();
master.set_expected(expected);
}
}
#endif

@ -0,0 +1,126 @@
#ifndef DIY_ASSIGNER_HPP
#define DIY_ASSIGNER_HPP
#include <vector>
namespace diy
{
// Derived types should define
// int rank(int gid) const
// that converts a global block id to a rank that it's assigned to.
class Assigner
{
public:
/**
* \ingroup Assignment
* \brief Manages how blocks are assigned to processes
*/
Assigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
):
size_(size), nblocks_(nblocks) {}
//! returns the total number of process ranks
int size() const { return size_; }
//! returns the total number of global blocks
int nblocks() const { return nblocks_; }
//! sets the total number of global blocks
void set_nblocks(int nblocks) { nblocks_ = nblocks; }
//! gets the local gids for a given process rank
virtual void local_gids(int rank, std::vector<int>& gids) const =0;
//! returns the process rank of the block with global id gid (need not be local)
virtual int rank(int gid) const =0;
private:
int size_; // total number of ranks
int nblocks_; // total number of blocks
};
class ContiguousAssigner: public Assigner
{
public:
/**
* \ingroup Assignment
* \brief Assigns blocks to processes in contiguous gid (block global id) order
*/
ContiguousAssigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
):
Assigner(size, nblocks) {}
using Assigner::size;
using Assigner::nblocks;
int rank(int gid) const override
{
int div = nblocks() / size();
int mod = nblocks() % size();
int r = gid / (div + 1);
if (r < mod)
{
return r;
} else
{
return mod + (gid - (div + 1)*mod)/div;
}
}
inline
void local_gids(int rank, std::vector<int>& gids) const override;
};
class RoundRobinAssigner: public Assigner
{
public:
/**
* \ingroup Assignment
* \brief Assigns blocks to processes in cyclic or round-robin gid (block global id) order
*/
RoundRobinAssigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
):
Assigner(size, nblocks) {}
using Assigner::size;
using Assigner::nblocks;
int rank(int gid) const override { return gid % size(); }
inline
void local_gids(int rank, std::vector<int>& gids) const override;
};
}
void
diy::ContiguousAssigner::
local_gids(int rank, std::vector<int>& gids) const
{
int div = nblocks() / size();
int mod = nblocks() % size();
int from, to;
if (rank < mod)
from = rank * (div + 1);
else
from = mod * (div + 1) + (rank - mod) * div;
if (rank + 1 < mod)
to = (rank + 1) * (div + 1);
else
to = mod * (div + 1) + (rank + 1 - mod) * div;
for (int gid = from; gid < to; ++gid)
gids.push_back(gid);
}
void
diy::RoundRobinAssigner::
local_gids(int rank, std::vector<int>& gids) const
{
int cur = rank;
while (cur < nblocks())
{
gids.push_back(cur);
cur += size();
}
}
#endif

@ -0,0 +1,121 @@
#ifndef DIY_COLLECTION_HPP
#define DIY_COLLECTION_HPP
#include <vector>
#include "serialization.hpp"
#include "storage.hpp"
#include "thread.hpp"
namespace diy
{
class Collection
{
public:
typedef void* Element;
typedef std::vector<Element> Elements;
typedef critical_resource<int, recursive_mutex> CInt;
typedef void* (*Create)();
typedef void (*Destroy)(void*);
typedef detail::Save Save;
typedef detail::Load Load;
public:
Collection(Create create,
Destroy destroy,
ExternalStorage* storage,
Save save,
Load load):
create_(create),
destroy_(destroy),
storage_(storage),
save_(save),
load_(load),
in_memory_(0) {}
size_t size() const { return elements_.size(); }
const CInt& in_memory() const { return in_memory_; }
inline void clear();
int add(Element e) { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return elements_.size() - 1; }
void* release(int i) { void* e = get(i); elements_[i] = 0; return e; }
void* find(int i) const { return elements_[i]; } // possibly returns 0, if the element is unloaded
void* get(int i) { if (!find(i)) load(i); return find(i); } // loads the element first, and then returns its address
int available() const { int i = 0; for (; i < (int)size(); ++i) if (find(i) != 0) break; return i; }
inline void load(int i);
inline void unload(int i);
Create creator() const { return create_; }
Destroy destroyer() const { return destroy_; }
Load loader() const { return load_; }
Save saver() const { return save_; }
void* create() const { return create_(); }
void destroy(int i) { if (find(i)) { destroy_(find(i)); elements_[i] = 0; } else if (external_[i] != -1) storage_->destroy(external_[i]); }
bool own() const { return destroy_ != 0; }
ExternalStorage* storage() const { return storage_; }
private:
Create create_;
Destroy destroy_;
ExternalStorage* storage_;
Save save_;
Load load_;
Elements elements_;
std::vector<int> external_;
CInt in_memory_;
};
}
void
diy::Collection::
clear()
{
if (own())
for (size_t i = 0; i < size(); ++i)
destroy(i);
elements_.clear();
external_.clear();
*in_memory_.access() = 0;
}
void
diy::Collection::
unload(int i)
{
//BinaryBuffer bb;
void* e = find(i);
//save_(e, bb);
//external_[i] = storage_->put(bb);
external_[i] = storage_->put(e, save_);
destroy_(e);
elements_[i] = 0;
--(*in_memory_.access());
}
void
diy::Collection::
load(int i)
{
//BinaryBuffer bb;
//storage_->get(external_[i], bb);
void* e = create_();
//load_(e, bb);
storage_->get(external_[i], e, load_);
elements_[i] = e;
external_[i] = -1;
++(*in_memory_.access());
}
#endif

@ -0,0 +1,13 @@
#ifndef DIY_COMMUNICATOR_HPP
#define DIY_COMMUNICATOR_HPP
#warning "diy::Communicator (in diy/communicator.hpp) is deprecated, use diy::mpi::communicator directly"
#include "mpi.hpp"
namespace diy
{
typedef mpi::communicator Communicator;
}
#endif

@ -0,0 +1,22 @@
#ifndef DIY_CONSTANTS_H
#define DIY_CONSTANTS_H
// Default DIY_MAX_DIM to 4, unless provided by the user
// (used for static min/max size in various Bounds)
#ifndef DIY_MAX_DIM
#define DIY_MAX_DIM 4
#endif
enum
{
DIY_X0 = 0x01, /* minimum-side x (left) neighbor */
DIY_X1 = 0x02, /* maximum-side x (right) neighbor */
DIY_Y0 = 0x04, /* minimum-side y (bottom) neighbor */
DIY_Y1 = 0x08, /* maximum-side y (top) neighbor */
DIY_Z0 = 0x10, /* minimum-side z (back) neighbor */
DIY_Z1 = 0x20, /* maximum-side z (front) neighbor */
DIY_T0 = 0x40, /* minimum-side t (earlier) neighbor */
DIY_T1 = 0x80 /* maximum-side t (later) neighbor */
};
#endif

@ -0,0 +1,53 @@
#ifndef DIY_CRITICAL_RESOURCE_HPP
#define DIY_CRITICAL_RESOURCE_HPP
namespace diy
{
// TODO: when not running under C++11, i.e., when lock_guard is TinyThread's
// lock_guard, and not C++11's unique_lock, this implementation might
// be buggy since the copy constructor is invoked when
// critical_resource::access() returns an instance of this class. Once
// the temporary is destroyed the mutex is unlocked. I'm not 100%
// certain of this because I'd expect a deadlock on copy constructor,
// but it's clearly not happening -- so I may be missing something.
// (This issue will take care of itself in DIY3 once we switch to C++11 completely.)
template<class T, class Mutex>
class resource_accessor
{
public:
resource_accessor(T& x, Mutex& m):
x_(x), lock_(m) {}
T& operator*() { return x_; }
T* operator->() { return &x_; }
const T& operator*() const { return x_; }
const T* operator->() const { return &x_; }
private:
T& x_;
lock_guard<Mutex> lock_;
};
template<class T, class Mutex = fast_mutex>
class critical_resource
{
public:
typedef resource_accessor<T, Mutex> accessor;
typedef resource_accessor<const T, Mutex> const_accessor; // eventually, try shared locking
public:
critical_resource() {}
critical_resource(const T& x):
x_(x) {}
accessor access() { return accessor(x_, m_); }
const_accessor const_access() const { return const_accessor(x_, m_); }
private:
T x_;
mutable Mutex m_;
};
}
#endif

@ -0,0 +1,716 @@
#ifndef DIY_DECOMPOSITION_HPP
#define DIY_DECOMPOSITION_HPP
#include <vector>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <sstream>
#include <stdexcept>
#include "link.hpp"
#include "assigner.hpp"
#include "master.hpp"
namespace diy
{
namespace detail
{
template<class Bounds_, class Enable = void>
struct BoundsHelper;
// discrete bounds
template<class Bounds>
struct BoundsHelper<Bounds, typename std::enable_if<std::is_integral<typename Bounds::Coordinate>::value>::type>
{
using Coordinate = typename Bounds::Coordinate;
static Coordinate from(int i, int n, Coordinate min, Coordinate max, bool) { return min + (max - min + 1)/n * i; }
static Coordinate to (int i, int n, Coordinate min, Coordinate max, bool shared_face)
{
if (i == n - 1)
return max;
else
return from(i+1, n, min, max, shared_face) - (shared_face ? 0 : 1);
}
static int lower(Coordinate x, int n, Coordinate min, Coordinate max, bool shared)
{
Coordinate width = (max - min + 1)/n;
Coordinate res = (x - min)/width;
if (res >= n) res = n - 1;
if (shared && x == from(res, n, min, max, shared))
--res;
return res;
}
static int upper(Coordinate x, int n, Coordinate min, Coordinate max, bool shared)
{
Coordinate width = (max - min + 1)/n;
Coordinate res = (x - min)/width + 1;
if (shared && x == from(res, n, min, max, shared))
++res;
return res;
}
};
// continuous bounds
template<class Bounds>
struct BoundsHelper<Bounds, typename std::enable_if<std::is_floating_point<typename Bounds::Coordinate>::value>::type>
{
using Coordinate = typename Bounds::Coordinate;
static Coordinate from(int i, int n, Coordinate min, Coordinate max, bool) { return min + (max - min)/n * i; }
static Coordinate to (int i, int n, Coordinate min, Coordinate max, bool) { return min + (max - min)/n * (i+1); }
static int lower(Coordinate x, int n, Coordinate min, Coordinate max, bool) { Coordinate width = (max - min)/n; Coordinate res = std::floor((x - min)/width); if (min + res*width == x) return (res - 1); else return res; }
static int upper(Coordinate x, int n, Coordinate min, Coordinate max, bool) { Coordinate width = (max - min)/n; Coordinate res = std::ceil ((x - min)/width); if (min + res*width == x) return (res + 1); else return res; }
};
}
//! \ingroup Decomposition
//! Decomposes a regular (discrete or continuous) domain into even blocks;
//! creates Links with Bounds along the way.
template<class Bounds_>
struct RegularDecomposer
{
typedef Bounds_ Bounds;
typedef typename BoundsValue<Bounds>::type Coordinate;
typedef typename RegularLinkSelector<Bounds>::type Link;
using Creator = std::function<void(int, Bounds, Bounds, Bounds, Link)>;
using Updater = std::function<void(int, int, Bounds, Bounds, Bounds, Link)>;
typedef std::vector<bool> BoolVector;
typedef std::vector<Coordinate> CoordinateVector;
typedef std::vector<int> DivisionsVector;
/// @param dim: dimensionality of the decomposition
/// @param domain: bounds of global domain
/// @param nblocks: total number of global blocks
/// @param share_face: indicates dimensions on which to share block faces
/// @param wrap: indicates dimensions on which to wrap the boundary
/// @param ghosts: indicates how many ghosts to use in each dimension
/// @param divisions: indicates how many cuts to make along each dimension
/// (0 means "no constraint," i.e., leave it up to the algorithm)
RegularDecomposer(int dim_,
const Bounds& domain_,
int nblocks_,
BoolVector share_face_ = BoolVector(),
BoolVector wrap_ = BoolVector(),
CoordinateVector ghosts_ = CoordinateVector(),
DivisionsVector divisions_ = DivisionsVector()):
dim(dim_), domain(domain_), nblocks(nblocks_),
share_face(share_face_),
wrap(wrap_), ghosts(ghosts_), divisions(divisions_)
{
if ((int) share_face.size() < dim) share_face.resize(dim);
if ((int) wrap.size() < dim) wrap.resize(dim);
if ((int) ghosts.size() < dim) ghosts.resize(dim);
if ((int) divisions.size() < dim) divisions.resize(dim);
fill_divisions(divisions);
}
// Calls create(int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link)
void decompose(int rank, const Assigner& assigner, const Creator& create);
void decompose(int rank, const Assigner& assigner, Master& master, const Updater& update);
void decompose(int rank, const Assigner& assigner, Master& master);
// find lowest gid that owns a particular point
template<class Point>
int lowest_gid(const Point& p) const;
void gid_to_coords(int gid, DivisionsVector& coords) const { gid_to_coords(gid, coords, divisions); }
int coords_to_gid(const DivisionsVector& coords) const { return coords_to_gid(coords, divisions); }
void fill_divisions(std::vector<int>& divisions) const;
void fill_bounds(Bounds& bounds, const DivisionsVector& coords, bool add_ghosts = false) const;
void fill_bounds(Bounds& bounds, int gid, bool add_ghosts = false) const;
static bool all(const std::vector<int>& v, int x);
static void gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions);
static int coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions);
static void factor(std::vector<unsigned>& factors, int n);
// Point to GIDs functions
template<class Point>
void point_to_gids(std::vector<int>& gids, const Point& p) const;
//! returns gid of a block that contains the point; ignores ghosts
template<class Point>
int point_to_gid(const Point& p) const;
template<class Point>
int num_gids(const Point& p) const;
template<class Point>
void top_bottom(int& top, int& bottom, const Point& p, int axis) const;
int dim;
Bounds domain;
int nblocks;
BoolVector share_face;
BoolVector wrap;
CoordinateVector ghosts;
DivisionsVector divisions;
};
/**
* \ingroup Decomposition
* \brief Decomposes the domain into a prescribed pattern of blocks.
*
 * @param dim dimension of the domain
 * @param rank local rank
 * @param domain bounds of the global domain
 * @param assigner decides how processors are assigned to blocks (maps a gid to a rank);
 *     also communicates the total number of blocks
 * @param create the callback functor
 * @param share_face indicates dimensions on which to share block faces
 * @param wrap indicates dimensions on which to wrap the boundary
* @param ghosts indicates how many ghosts to use in each dimension
* @param divs indicates how many cuts to make along each dimension
* (0 means "no constraint," i.e., leave it up to the algorithm)
*
 * `create(...)` is called once for each block assigned to the local rank. See [decomposition example](#decomposition-example).
*/
template<class Bounds>
void decompose(int dim,
int rank,
const Bounds& domain,
const Assigner& assigner,
const typename RegularDecomposer<Bounds>::Creator& create,
typename RegularDecomposer<Bounds>::BoolVector share_face = typename RegularDecomposer<Bounds>::BoolVector(),
typename RegularDecomposer<Bounds>::BoolVector wrap = typename RegularDecomposer<Bounds>::BoolVector(),
typename RegularDecomposer<Bounds>::CoordinateVector ghosts = typename RegularDecomposer<Bounds>::CoordinateVector(),
typename RegularDecomposer<Bounds>::DivisionsVector divs = typename RegularDecomposer<Bounds>::DivisionsVector())
{
RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, create);
}
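/*
 * A hypothetical usage sketch for the overload above; the communicator setup,
 * assigner choice, block count, and bounds values are illustrative assumptions,
 * not prescribed by this header:
 *
 *   diy::mpi::communicator world;
 *   diy::ContinuousBounds domain;
 *   domain.min[0] = domain.min[1] = domain.min[2] = 0.f;
 *   domain.max[0] = domain.max[1] = domain.max[2] = 1.f;
 *   diy::RoundRobinAssigner assigner(world.size(), 64);    // 64 blocks in total
 *   diy::decompose(3, world.rank(), domain, assigner,
 *                  [](int gid, const diy::ContinuousBounds& core,
 *                     const diy::ContinuousBounds& bounds,
 *                     const diy::ContinuousBounds& dom,
 *                     const diy::RegularContinuousLink& link)
 *                  {
 *                    // construct the local block for gid here (e.g., add it to a Master)
 *                  });
 */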
/**
* \ingroup Decomposition
* \brief Decomposes the domain into a prescribed pattern of blocks.
*
 * @param dim dimension of the domain
 * @param rank local rank
 * @param domain bounds of the global domain
 * @param assigner decides how processors are assigned to blocks (maps a gid to a rank);
 *     also communicates the total number of blocks
 * @param master gets the blocks once this function returns
 * @param share_face indicates dimensions on which to share block faces
 * @param wrap indicates dimensions on which to wrap the boundary
* @param ghosts indicates how many ghosts to use in each dimension
* @param divs indicates how many cuts to make along each dimension
* (0 means "no constraint," i.e., leave it up to the algorithm)
*
* `master` must have been supplied a create function in order for this function to work.
*/
template<class Bounds>
void decompose(int dim,
int rank,
const Bounds& domain,
const Assigner& assigner,
Master& master,
typename RegularDecomposer<Bounds>::BoolVector share_face = typename RegularDecomposer<Bounds>::BoolVector(),
typename RegularDecomposer<Bounds>::BoolVector wrap = typename RegularDecomposer<Bounds>::BoolVector(),
typename RegularDecomposer<Bounds>::CoordinateVector ghosts = typename RegularDecomposer<Bounds>::CoordinateVector(),
typename RegularDecomposer<Bounds>::DivisionsVector divs = typename RegularDecomposer<Bounds>::DivisionsVector())
{
RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, master);
}
/**
* \ingroup Decomposition
 * \brief A "null" decomposition that simply creates the blocks and adds them to the master
*
* @param rank local rank
* @param assigner decides how processors are assigned to blocks (maps a gid to a rank)
* also communicates the total number of blocks
* @param master gets the blocks once this function returns
*/
inline
void decompose(int rank,
const Assigner& assigner,
Master& master)
{
std::vector<int> local_gids;
assigner.local_gids(rank, local_gids);
for (size_t i = 0; i < local_gids.size(); ++i)
master.add(local_gids[i], master.create(), new diy::Link);
}
/**
* \ingroup Decomposition
 * \brief Adds a decomposition (modifies links) for an existing set of blocks that
 * were previously added to the master
 *
 * @param rank local rank
 * @param assigner decides how processors are assigned to blocks (maps a gid to a rank);
 *     also communicates the total number of blocks
 * @param master contains the blocks whose links are replaced
 * @param update the callback functor, called for each local block after its new link is installed
*/
template<class Bounds>
void decompose(int dim,
int rank,
const Bounds& domain,
const Assigner& assigner,
Master& master,
const typename RegularDecomposer<Bounds>::Updater& update,
typename RegularDecomposer<Bounds>::BoolVector share_face =
typename RegularDecomposer<Bounds>::BoolVector(),
typename RegularDecomposer<Bounds>::BoolVector wrap =
typename RegularDecomposer<Bounds>::BoolVector(),
typename RegularDecomposer<Bounds>::CoordinateVector ghosts =
typename RegularDecomposer<Bounds>::CoordinateVector(),
typename RegularDecomposer<Bounds>::DivisionsVector divs =
typename RegularDecomposer<Bounds>::DivisionsVector())
{
RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).
decompose(rank, assigner, master, update);
}
//! Decomposition example: \example decomposition/test-decomposition.cpp
//! Direct master insertion example: \example decomposition/test-direct-master.cpp
}
// decomposes domain and adds blocks to the master
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
decompose(int rank, const Assigner& assigner, Master& master)
{
decompose(rank, assigner, [&master](int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link)
{
void* b = master.create();
Link* l = new Link(link);
master.add(gid, b, l);
});
}
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
decompose(int rank, const Assigner& assigner, const Creator& create)
{
std::vector<int> gids;
assigner.local_gids(rank, gids);
for (int i = 0; i < (int)gids.size(); ++i)
{
int gid = gids[i];
DivisionsVector coords;
gid_to_coords(gid, coords);
Bounds core, bounds;
fill_bounds(core, coords);
fill_bounds(bounds, coords, true);
// Fill link with all the neighbors
Link link(dim, core, bounds);
std::vector<int> offsets(dim, -1);
offsets[0] = -2;
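// start one step before (-1, ..., -1) so the first "next offset" increment
// below lands exactly on the first neighbor offset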
while (!all(offsets, 1))
{
// next offset
int i;
for (i = 0; i < dim; ++i)
if (offsets[i] == 1)
offsets[i] = -1;
else
break;
++offsets[i];
if (all(offsets, 0)) continue; // skip ourselves
DivisionsVector nhbr_coords(dim);
Direction dir, wrap_dir;
bool inbounds = true;
for (int i = 0; i < dim; ++i)
{
nhbr_coords[i] = coords[i] + offsets[i];
// wrap
if (nhbr_coords[i] < 0)
{
if (wrap[i])
{
nhbr_coords[i] = divisions[i] - 1;
wrap_dir[i] = -1;
}
else
inbounds = false;
}
if (nhbr_coords[i] >= divisions[i])
{
if (wrap[i])
{
nhbr_coords[i] = 0;
wrap_dir[i] = 1;
}
else
inbounds = false;
}
// NB: this needs to match the addressing scheme in dir_t (in constants.h)
if (offsets[i] == -1 || offsets[i] == 1)
dir[i] = offsets[i];
}
if (!inbounds) continue;
int nhbr_gid = coords_to_gid(nhbr_coords);
BlockID bid; bid.gid = nhbr_gid; bid.proc = assigner.rank(nhbr_gid);
link.add_neighbor(bid);
Bounds nhbr_bounds;
fill_bounds(nhbr_bounds, nhbr_coords);
link.add_bounds(nhbr_bounds);
link.add_direction(dir);
link.add_wrap(wrap_dir);
}
create(gid, core, bounds, domain, link);
}
}
// decomposes domain but does not add blocks to master, assumes they were added already
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
decompose(int rank, const Assigner& assigner, Master& master, const Updater& update)
{
decompose(rank, assigner, [&master,&update](int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link)
{
int lid = master.lid(gid);
Link* l = new Link(link);
master.replace_link(lid, l);
update(gid, lid, core, bounds, domain, *l);
});
}
template<class Bounds>
bool
diy::RegularDecomposer<Bounds>::
all(const std::vector<int>& v, int x)
{
for (unsigned i = 0; i < v.size(); ++i)
if (v[i] != x)
return false;
return true;
}
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions)
{
int dim = divisions.size();
for (int i = 0; i < dim; ++i)
{
coords.push_back(gid % divisions[i]);
gid /= divisions[i];
}
}
template<class Bounds>
int
diy::RegularDecomposer<Bounds>::
coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions)
{
int gid = 0;
for (int i = coords.size() - 1; i >= 0; --i)
{
gid *= divisions[i];
gid += coords[i];
}
return gid;
}
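// Worked example (values illustrative): with divisions = {4, 3, 2}, gid 17
// unpacks fastest-dimension-first: 17 % 4 = 1, 17/4 = 4; 4 % 3 = 1, 4/3 = 1;
// 1 % 2 = 1, giving coords = (1, 1, 1). coords_to_gid inverts this:
// ((1)*3 + 1)*4 + 1 = 17.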
//! \ingroup Decomposition
//! Gets the bounds, with or without ghosts, for a block specified by its block coordinates
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
fill_bounds(Bounds& bounds, //!< (output) bounds
const DivisionsVector& coords, //!< coordinates of the block in the decomposition
bool add_ghosts) //!< whether to include ghosts in the output bounds
const
{
for (int i = 0; i < dim; ++i)
{
bounds.min[i] = detail::BoundsHelper<Bounds>::from(coords[i], divisions[i], domain.min[i], domain.max[i], share_face[i]);
bounds.max[i] = detail::BoundsHelper<Bounds>::to (coords[i], divisions[i], domain.min[i], domain.max[i], share_face[i]);
}
for (int i = dim; i < DIY_MAX_DIM; ++i) // set the unused dimension to 0
{
bounds.min[i] = 0;
bounds.max[i] = 0;
}
if (!add_ghosts)
return;
for (int i = 0; i < dim; ++i)
{
if (wrap[i])
{
bounds.min[i] -= ghosts[i];
bounds.max[i] += ghosts[i];
} else
{
bounds.min[i] = std::max(domain.min[i], bounds.min[i] - ghosts[i]);
bounds.max[i] = std::min(domain.max[i], bounds.max[i] + ghosts[i]);
}
}
}
//! \ingroup Decomposition
//! Gets the bounds, with or without ghosts, for a block specified by its gid
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
fill_bounds(Bounds& bounds, //!< (output) bounds
int gid, //!< global id of the block
bool add_ghosts) //!< whether to include ghosts in the output bounds
const
{
DivisionsVector coords;
gid_to_coords(gid, coords);
if (add_ghosts)
fill_bounds(bounds, coords, true);
else
fill_bounds(bounds, coords);
}
namespace diy { namespace detail {
// current state of division in one dimension used in fill_divisions below
template<class Coordinate>
struct Div
{
int dim; // 0, 1, 2, etc. e.g. for x, y, z etc.
int nb; // number of blocks so far in this dimension
Coordinate b_size; // block size so far in this dimension
// sort on descending block size unless tied, in which case
// sort on ascending num blocks in current dim unless tied, in which case
// sort on ascending dimension
bool operator<(Div rhs) const
{
// descending block size; ties broken by ascending number of blocks, then by ascending dimension
if (b_size == rhs.b_size)
{
if (nb == rhs.nb)
return(dim < rhs.dim);
return(nb < rhs.nb);
}
return(b_size > rhs.b_size);
}
};
} }
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
fill_divisions(std::vector<int>& divisions) const
{
// prod = product of the user-constrained divisions; c = number of constrained dimensions
int prod = 1; int c = 0;
for (int i = 0; i < dim; ++i)
if (divisions[i] != 0)
{
prod *= divisions[i];
++c;
}
if (nblocks % prod != 0)
throw std::runtime_error("Total number of blocks cannot be factored into provided divs");
if (c == (int) divisions.size()) // nothing to do; user provided all divs
return;
// factor number of blocks left in unconstrained dimensions
// factorization is sorted from smallest to largest factors
std::vector<unsigned> factors;
factor(factors, nblocks/prod);
using detail::Div;
std::vector< Div<Coordinate> > missing_divs; // pairs consisting of (dim, #divs)
// init missing_divs
for (int i = 0; i < dim; i++)
{
if (divisions[i] == 0)
{
Div<Coordinate> div;
div.dim = i;
div.nb = 1;
div.b_size = domain.max[i] - domain.min[i];
missing_divs.push_back(div);
}
}
// iterate over factorization of number of blocks (factors are sorted smallest to largest)
// NB: using int instead of size_t because the index must go negative to terminate the loop
for (int i = factors.size() - 1; i >= 0; --i)
{
// fill in missing divs by dividing the dimension w/ the largest block size,
// except when this would be illegal (resulting in bounds.max < bounds.min;
// only a problem for discrete bounds)
// sort on decreasing block size
std::sort(missing_divs.begin(), missing_divs.end());
// split the dimension with the largest block size (first element in vector)
Coordinate min =
detail::BoundsHelper<Bounds>::from(0,
missing_divs[0].nb * factors[i],
domain.min[missing_divs[0].dim],
domain.max[missing_divs[0].dim],
share_face[missing_divs[0].dim]);
Coordinate max =
detail::BoundsHelper<Bounds>::to(0,
missing_divs[0].nb * factors[i],
domain.min[missing_divs[0].dim],
domain.max[missing_divs[0].dim],
share_face[missing_divs[0].dim]);
if (max >= min)
{
missing_divs[0].nb *= factors[i];
missing_divs[0].b_size = max - min;
}
else
{
std::ostringstream oss;
oss << "Unable to decompose domain into " << nblocks << " blocks: " << min << " " << max;
throw std::runtime_error(oss.str());
}
}
// assign the divisions
for (size_t i = 0; i < missing_divs.size(); i++)
divisions[missing_divs[i].dim] = missing_divs[i].nb;
}
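// Worked example (values illustrative): with nblocks = 12, dim = 2, and
// user-supplied divisions = {0, 3}, prod = 3, so the remaining 12/3 = 4 blocks
// are factored into {2, 2}; both factors land on dimension 0 (the only
// unconstrained one), yielding divisions = {4, 3}.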
template<class Bounds>
void
diy::RegularDecomposer<Bounds>::
factor(std::vector<unsigned>& factors, int n)
{
while (n != 1)
for (int i = 2; i <= n; ++i)
{
if (n % i == 0)
{
factors.push_back(i);
n /= i;
break;
}
}
}
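// E.g., factor(factors, 12) yields {2, 2, 3}: trial division always removes the
// smallest remaining divisor, so factors come out in nondecreasing order.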
// Point to GIDs
// TODO: deal with wrap correctly
// TODO: add an optional ghosts argument to ignore ghosts (if we want to find the true owners, or something like that)
template<class Bounds>
template<class Point>
void
diy::RegularDecomposer<Bounds>::
point_to_gids(std::vector<int>& gids, const Point& p) const
{
std::vector< std::pair<int, int> > ranges(dim);
for (int i = 0; i < dim; ++i)
top_bottom(ranges[i].second, ranges[i].first, p, i);
// look up gids for all combinations
DivisionsVector coords(dim), location(dim);
while(location.back() < ranges.back().second - ranges.back().first)
{
for (int i = 0; i < dim; ++i)
coords[i] = ranges[i].first + location[i];
gids.push_back(coords_to_gid(coords, divisions));
location[0]++;
unsigned i = 0;
while (i < dim-1 && location[i] == ranges[i].second - ranges[i].first)
{
location[i] = 0;
++i;
location[i]++;
}
}
}
template<class Bounds>
template<class Point>
int
diy::RegularDecomposer<Bounds>::
point_to_gid(const Point& p) const
{
int gid = 0;
for (int axis = dim - 1; axis >= 0; --axis)
{
int bottom = detail::BoundsHelper<Bounds>::lower(p[axis], divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
bottom = std::max(0, bottom);
// coupled with coords_to_gid
gid *= divisions[axis];
gid += bottom;
}
return gid;
}
template<class Bounds>
template<class Point>
int
diy::RegularDecomposer<Bounds>::
num_gids(const Point& p) const
{
int res = 1;
for (int i = 0; i < dim; ++i)
{
int top, bottom;
top_bottom(top, bottom, p, i);
res *= top - bottom;
}
return res;
}
template<class Bounds>
template<class Point>
void
diy::RegularDecomposer<Bounds>::
top_bottom(int& top, int& bottom, const Point& p, int axis) const
{
Coordinate l = p[axis] - ghosts[axis];
Coordinate r = p[axis] + ghosts[axis];
top = detail::BoundsHelper<Bounds>::upper(r, divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
bottom = detail::BoundsHelper<Bounds>::lower(l, divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
if (!wrap[axis])
{
bottom = std::max(0, bottom);
top = std::min(divisions[axis], top);
}
}
// find lowest gid that owns a particular point
template<class Bounds>
template<class Point>
int
diy::RegularDecomposer<Bounds>::
lowest_gid(const Point& p) const
{
// TODO: optimize - no need to compute all gids
std::vector<int> gids;
point_to_gids(gids, p);
std::sort(gids.begin(), gids.end());
return gids[0];
}
#endif

@ -0,0 +1,450 @@
#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
#define DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
#include <vector>
#include <cassert>
#include "../../partners/all-reduce.hpp"
#include "../../log.hpp"
// TODO: technically, what's done now is not a perfect subsample:
// we take the same number of samples from every block; in reality, each block's
// count should be drawn at random so that the total adds up to samples*nblocks
//
// NB: random samples are chosen using rand(), which is assumed to be seeded
// externally. Once we switch to C++11, we should use its more advanced
// random number generators (and take a generator as an external parameter)
// (TODO)
namespace diy
{
namespace detail
{
template<class Block, class Point>
struct KDTreeSamplingPartition
{
typedef diy::RegularContinuousLink RCLink;
typedef diy::ContinuousBounds Bounds;
typedef std::vector<float> Samples;
KDTreeSamplingPartition(int dim,
std::vector<Point> Block::* points,
size_t samples):
dim_(dim), points_(points), samples_(samples) {}
void operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
int divide_gid(int gid, bool lower, int round, int rounds) const;
void update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
void split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
diy::Direction
find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
void compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const;
void add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const;
void receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const;
void forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const;
void enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const;
void dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const;
void update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const;
bool intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const;
float find_split(const Bounds& changed, const Bounds& original) const;
int dim_;
std::vector<Point> Block::* points_;
size_t samples_;
};
}
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
{
int dim;
if (srp.round() < partners.rounds())
dim = partners.dim(srp.round());
else
dim = partners.dim(srp.round() - 1);
if (srp.round() == partners.rounds())
update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0) // link round
{
dequeue_exchange(b, srp, dim); // from the swap round
split_to_neighbors(b, srp, dim);
}
else if (partners.swap_round(srp.round()))
{
Samples samples;
receive_samples(b, srp, samples);
enqueue_exchange(b, srp, dim, samples);
} else if (partners.sub_round(srp.round()) == 0)
{
if (srp.round() > 0)
{
int prev_dim = dim - 1;
if (prev_dim < 0)
prev_dim += dim_;
update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
}
compute_local_samples(b, srp, dim);
} else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2) // we are reusing partners class, so really we are talking about the samples rounds here
{
Samples samples;
add_samples(b, srp, samples);
srp.enqueue(srp.out_link().target(0), samples);
} else
{
Samples samples;
add_samples(b, srp, samples);
if (samples.size() != 1)
{
// pick the median
std::nth_element(samples.begin(), samples.begin() + samples.size()/2, samples.end());
std::swap(samples[0], samples[samples.size()/2]);
//std::sort(samples.begin(), samples.end());
//samples[0] = (samples[samples.size()/2] + samples[samples.size()/2 + 1])/2;
samples.resize(1);
}
forward_samples(b, srp, samples);
}
}
template<class Block, class Point>
int
diy::detail::KDTreeSamplingPartition<Block,Point>::
divide_gid(int gid, bool lower, int round, int rounds) const
{
if (lower)
gid &= ~(1 << (rounds - 1 - round));
else
gid |= (1 << (rounds - 1 - round));
return gid;
}
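// Worked example (values illustrative): with rounds = 3, round = 0 toggles the
// highest of the three gid bits, so for gid = 5 (binary 101):
//   divide_gid(5, /*lower=*/true,  0, 3) == 1   (binary 001)
//   divide_gid(5, /*lower=*/false, 0, 3) == 5   (binary 101)
// A block and its dual therefore differ only in the bit of the current round.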
// round here is the outer iteration of the algorithm
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
{
auto log = get_logger();
int gid = srp.gid();
int lid = srp.master()->lid(gid);
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
// (gid, dir) -> i
std::map<std::pair<int,diy::Direction>, int> link_map;
for (int i = 0; i < link->size(); ++i)
link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i;
// NB: srp.enqueue(..., ...) should match the link
std::vector<float> splits(link->size());
for (int i = 0; i < link->size(); ++i)
{
float split; diy::Direction dir;
int in_gid = link->target(i).gid;
while(srp.incoming(in_gid))
{
srp.dequeue(in_gid, split);
srp.dequeue(in_gid, dir);
// reverse dir
for (int j = 0; j < dim_; ++j)
dir[j] = -dir[j];
int k = link_map[std::make_pair(in_gid, dir)];
log->trace("{} {} {} -> {}", in_gid, dir, split, k);
splits[k] = split;
}
}
RCLink new_link(dim_, link->core(), link->core());
bool lower = !(gid & (1 << (rounds - 1 - round)));
// fill out the new link
for (int i = 0; i < link->size(); ++i)
{
diy::Direction dir = link->direction(i);
//diy::Direction wrap_dir = link->wrap(i); // we don't use existing wrap, but restore it from scratch
if (dir[dim] != 0)
{
if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))
{
int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);
diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
new_link.add_neighbor(nbr);
new_link.add_direction(dir);
Bounds bounds = link->bounds(i);
update_neighbor_bounds(bounds, splits[i], dim, !lower);
new_link.add_bounds(bounds);
if (wrap)
new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
else
new_link.add_wrap(diy::Direction());
}
} else // non-aligned side
{
for (int j = 0; j < 2; ++j)
{
int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds);
Bounds bounds = link->bounds(i);
update_neighbor_bounds(bounds, splits[i], dim, j == 0);
if (intersects(bounds, new_link.bounds(), dim, wrap, domain))
{
diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
new_link.add_neighbor(nbr);
new_link.add_direction(dir);
new_link.add_bounds(bounds);
if (wrap)
new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
else
new_link.add_wrap(diy::Direction());
}
}
}
}
// add link to the dual block
int dual_gid = divide_gid(gid, !lower, round, rounds);
diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) };
new_link.add_neighbor(dual);
Bounds nbr_bounds = link->bounds(); // old block bounds
update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
new_link.add_bounds(nbr_bounds);
new_link.add_wrap(diy::Direction()); // dual block cannot be wrapped
if (lower)
{
diy::Direction right;
right[dim] = 1;
new_link.add_direction(right);
} else
{
diy::Direction left;
left[dim] = -1;
new_link.add_direction(left);
}
// update the link; notice that this won't conflict with anything since
// reduce is using its own notion of the link constructed through the
// partners
link->swap(new_link);
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
{
int lid = srp.master()->lid(srp.gid());
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
// determine split
float split = find_split(link->core(), link->bounds());
for (int i = 0; i < link->size(); ++i)
{
srp.enqueue(link->target(i), split);
srp.enqueue(link->target(i), link->direction(i));
}
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const
{
// compute and enqueue local samples
Samples samples;
size_t points_size = (b->*points_).size();
size_t n = std::min(points_size, samples_);
samples.reserve(n);
for (size_t i = 0; i < n; ++i)
{
float x = (b->*points_)[rand() % points_size][dim];
samples.push_back(x);
}
srp.enqueue(srp.out_link().target(0), samples);
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
{
// dequeue and combine the samples
for (int i = 0; i < srp.in_link().size(); ++i)
{
int nbr_gid = srp.in_link().target(i).gid;
Samples smpls;
srp.dequeue(nbr_gid, smpls);
for (size_t i = 0; i < smpls.size(); ++i)
samples.push_back(smpls[i]);
}
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
{
srp.dequeue(srp.in_link().target(0).gid, samples);
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const
{
for (int i = 0; i < srp.out_link().size(); ++i)
srp.enqueue(srp.out_link().target(i), samples);
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const
{
int lid = srp.master()->lid(srp.gid());
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
int k = srp.out_link().size();
if (k == 0) // final round; nothing needs to be sent; this is actually redundant
return;
// pick split points
float split = samples[0];
// subset and enqueue
std::vector< std::vector<Point> > out_points(srp.out_link().size());
for (size_t i = 0; i < (b->*points_).size(); ++i)
{
float x = (b->*points_)[i][dim];
int loc = x < split ? 0 : 1;
out_points[loc].push_back((b->*points_)[i]);
}
int pos = -1;
for (int i = 0; i < k; ++i)
{
if (srp.out_link().target(i).gid == srp.gid())
{
(b->*points_).swap(out_points[i]);
pos = i;
}
else
srp.enqueue(srp.out_link().target(i), out_points[i]);
}
if (pos == 0)
link->core().max[dim] = split;
else
link->core().min[dim] = split;
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
{
int lid = srp.master()->lid(srp.gid());
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
for (int i = 0; i < srp.in_link().size(); ++i)
{
int nbr_gid = srp.in_link().target(i).gid;
if (nbr_gid == srp.gid())
continue;
std::vector<Point> in_points;
srp.dequeue(nbr_gid, in_points);
for (size_t j = 0; j < in_points.size(); ++j)
{
if (in_points[j][dim] < link->core().min[dim] || in_points[j][dim] > link->core().max[dim])
throw std::runtime_error(fmt::format("Dequeued {} outside [{},{}] ({})",
in_points[j][dim], link->core().min[dim], link->core().max[dim], dim));
(b->*points_).push_back(in_points[j]);
}
}
}
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const
{
if (lower)
bounds.max[dim] = split;
else
bounds.min[dim] = split;
}
template<class Block, class Point>
bool
diy::detail::KDTreeSamplingPartition<Block,Point>::
intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const
{
if (wrap)
{
if (x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim])
return true;
if (y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim])
return true;
}
return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim];
}
template<class Block, class Point>
float
diy::detail::KDTreeSamplingPartition<Block,Point>::
find_split(const Bounds& changed, const Bounds& original) const
{
for (int i = 0; i < dim_; ++i)
{
if (changed.min[i] != original.min[i])
return changed.min[i];
if (changed.max[i] != original.max[i])
return changed.max[i];
}
assert(0);
return -1;
}
template<class Block, class Point>
diy::Direction
diy::detail::KDTreeSamplingPartition<Block,Point>::
find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
{
diy::Direction wrap;
for (int i = 0; i < dim_; ++i)
{
if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i])
wrap[i] = -1;
if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i])
wrap[i] = 1;
}
return wrap;
}
#endif

@ -0,0 +1,569 @@
#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_HPP
#define DIY_DETAIL_ALGORITHMS_KDTREE_HPP
#include <vector>
#include <cassert>
#include "../../partners/all-reduce.hpp"
#include "../../log.hpp"
namespace diy
{
namespace detail
{
struct KDTreePartners;
template<class Block, class Point>
struct KDTreePartition
{
typedef diy::RegularContinuousLink RCLink;
typedef diy::ContinuousBounds Bounds;
typedef std::vector<size_t> Histogram;
KDTreePartition(int dim,
std::vector<Point> Block::* points,
size_t bins):
dim_(dim), points_(points), bins_(bins) {}
void operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
int divide_gid(int gid, bool lower, int round, int rounds) const;
void update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
void split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
diy::Direction
find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
void compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const;
void add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const;
void receive_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const;
void forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const;
void enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram& histogram) const;
void dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const;
void update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const;
bool intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const;
float find_split(const Bounds& changed, const Bounds& original) const;
int dim_;
std::vector<Point> Block::* points_;
size_t bins_;
};
}
}
struct diy::detail::KDTreePartners
{
// bool = are we in a swap (vs histogram) round
// int = round within that partner
typedef std::pair<bool, int> RoundType;
typedef diy::ContinuousBounds Bounds;
KDTreePartners(int dim, int nblocks, bool wrap_, const Bounds& domain_):
decomposer(1, interval(0,nblocks-1), nblocks),
histogram(decomposer, 2),
swap(decomposer, 2, false),
wrap(wrap_),
domain(domain_)
{
for (unsigned i = 0; i < swap.rounds(); ++i)
{
// fill histogram rounds
for (unsigned j = 0; j < histogram.rounds(); ++j)
{
rounds_.push_back(std::make_pair(false, j));
dim_.push_back(i % dim);
if (j == histogram.rounds() / 2 - 1 - i)
j += 2*i;
}
// fill swap round
rounds_.push_back(std::make_pair(true, i));
dim_.push_back(i % dim);
// fill link round
rounds_.push_back(std::make_pair(true, -1)); // (true, -1) signals link round
dim_.push_back(i % dim);
}
}
size_t rounds() const { return rounds_.size(); }
size_t swap_rounds() const { return swap.rounds(); }
int dim(int round) const { return dim_[round]; }
bool swap_round(int round) const { return rounds_[round].first; }
int sub_round(int round) const { return rounds_[round].second; }
inline bool active(int round, int gid, const diy::Master& m) const
{
if (round == (int) rounds())
return true;
else if (swap_round(round) && sub_round(round) < 0) // link round
return true;
else if (swap_round(round))
return swap.active(sub_round(round), gid, m);
else
return histogram.active(sub_round(round), gid, m);
}
inline void incoming(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
{
if (round == (int) rounds())
link_neighbors(-1, gid, partners, m);
else if (swap_round(round) && sub_round(round) < 0) // link round
swap.incoming(sub_round(round - 1) + 1, gid, partners, m);
else if (swap_round(round))
histogram.incoming(histogram.rounds(), gid, partners, m);
else
{
if (round > 0 && sub_round(round) == 0)
link_neighbors(-1, gid, partners, m);
else if (round > 0 && sub_round(round - 1) != sub_round(round) - 1) // jump through the histogram rounds
histogram.incoming(sub_round(round - 1) + 1, gid, partners, m);
else
histogram.incoming(sub_round(round), gid, partners, m);
}
}
inline void outgoing(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
{
if (round == (int) rounds())
swap.outgoing(sub_round(round-1) + 1, gid, partners, m);
else if (swap_round(round) && sub_round(round) < 0) // link round
link_neighbors(-1, gid, partners, m);
else if (swap_round(round))
swap.outgoing(sub_round(round), gid, partners, m);
else
histogram.outgoing(sub_round(round), gid, partners, m);
}
inline void link_neighbors(int, int gid, std::vector<int>& partners, const diy::Master& m) const
{
int lid = m.lid(gid);
diy::Link* link = m.link(lid);
std::set<int> result; // partners must be unique
for (int i = 0; i < link->size(); ++i)
result.insert(link->target(i).gid);
for (std::set<int>::const_iterator it = result.begin(); it != result.end(); ++it)
partners.push_back(*it);
}
// 1-D domain to feed into histogram and swap
diy::RegularDecomposer<diy::DiscreteBounds> decomposer;
diy::RegularAllReducePartners histogram;
diy::RegularSwapPartners swap;
std::vector<RoundType> rounds_;
std::vector<int> dim_;
bool wrap;
Bounds domain;
};
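// Round schedule sketch (restating the constructor above): each of the
// swap.rounds() outer iterations, working along dimension (i % dim), contributes
// a run of histogram sub-rounds (false, j), then one swap sub-round (true, i),
// then one link sub-round encoded as (true, -1); active(), incoming(), and
// outgoing() dispatch on this encoding.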
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
{
int dim;
if (srp.round() < partners.rounds())
dim = partners.dim(srp.round());
else
dim = partners.dim(srp.round() - 1);
if (srp.round() == partners.rounds())
update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0) // link round
{
dequeue_exchange(b, srp, dim); // from the swap round
split_to_neighbors(b, srp, dim);
}
else if (partners.swap_round(srp.round()))
{
Histogram histogram;
receive_histogram(b, srp, histogram);
enqueue_exchange(b, srp, dim, histogram);
} else if (partners.sub_round(srp.round()) == 0)
{
if (srp.round() > 0)
{
int prev_dim = dim - 1;
if (prev_dim < 0)
prev_dim += dim_;
update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
}
compute_local_histogram(b, srp, dim);
} else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2)
{
Histogram histogram(bins_);
add_histogram(b, srp, histogram);
srp.enqueue(srp.out_link().target(0), histogram);
}
else
{
Histogram histogram(bins_);
add_histogram(b, srp, histogram);
forward_histogram(b, srp, histogram);
}
}
template<class Block, class Point>
int
diy::detail::KDTreePartition<Block,Point>::
divide_gid(int gid, bool lower, int round, int rounds) const
{
if (lower)
gid &= ~(1 << (rounds - 1 - round));
else
gid |= (1 << (rounds - 1 - round));
return gid;
}
// round here is the outer iteration of the algorithm
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
{
int gid = srp.gid();
int lid = srp.master()->lid(gid);
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
// (gid, dir) -> i
std::map<std::pair<int,diy::Direction>, int> link_map;
for (int i = 0; i < link->size(); ++i)
link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i;
// NB: srp.enqueue(..., ...) should match the link
std::vector<float> splits(link->size());
for (int i = 0; i < link->size(); ++i)
{
float split; diy::Direction dir;
int in_gid = link->target(i).gid;
while(srp.incoming(in_gid))
{
srp.dequeue(in_gid, split);
srp.dequeue(in_gid, dir);
// reverse dir
for (int j = 0; j < dim_; ++j)
dir[j] = -dir[j];
int k = link_map[std::make_pair(in_gid, dir)];
splits[k] = split;
}
}
RCLink new_link(dim_, link->core(), link->core());
bool lower = !(gid & (1 << (rounds - 1 - round)));
// fill out the new link
for (int i = 0; i < link->size(); ++i)
{
diy::Direction dir = link->direction(i);
//diy::Direction wrap_dir = link->wrap(i); // we don't use existing wrap, but restore it from scratch
if (dir[dim] != 0)
{
if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))
{
int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);
diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
new_link.add_neighbor(nbr);
new_link.add_direction(dir);
Bounds bounds = link->bounds(i);
update_neighbor_bounds(bounds, splits[i], dim, !lower);
new_link.add_bounds(bounds);
if (wrap)
new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
else
new_link.add_wrap(diy::Direction());
}
} else // non-aligned side
{
for (int j = 0; j < 2; ++j)
{
int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds);
Bounds bounds = link->bounds(i);
update_neighbor_bounds(bounds, splits[i], dim, j == 0);
if (intersects(bounds, new_link.bounds(), dim, wrap, domain))
{
diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
new_link.add_neighbor(nbr);
new_link.add_direction(dir);
new_link.add_bounds(bounds);
if (wrap)
new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
else
new_link.add_wrap(diy::Direction());
}
}
}
}
// add link to the dual block
int dual_gid = divide_gid(gid, !lower, round, rounds);
diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) };
new_link.add_neighbor(dual);
Bounds nbr_bounds = link->bounds(); // old block bounds
update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
new_link.add_bounds(nbr_bounds);
new_link.add_wrap(diy::Direction()); // dual block cannot be wrapped
if (lower)
{
diy::Direction right;
right[dim] = 1;
new_link.add_direction(right);
} else
{
diy::Direction left;
left[dim] = -1;
new_link.add_direction(left);
}
// update the link; notice that this won't conflict with anything since
// reduce is using its own notion of the link constructed through the
// partners
link->swap(new_link);
}
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
{
int lid = srp.master()->lid(srp.gid());
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
// determine split
float split = find_split(link->core(), link->bounds());
for (int i = 0; i < link->size(); ++i)
{
srp.enqueue(link->target(i), split);
srp.enqueue(link->target(i), link->direction(i));
}
}
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const
{
int lid = srp.master()->lid(srp.gid());
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
// compute and enqueue local histogram
Histogram histogram(bins_);
float width = (link->core().max[dim] - link->core().min[dim])/bins_;
for (size_t i = 0; i < (b->*points_).size(); ++i)
{
float x = (b->*points_)[i][dim];
int loc = (x - link->core().min[dim]) / width;
if (loc < 0)
throw std::runtime_error(fmt::format("{} {} {}", loc, x, link->core().min[dim]));
if (loc >= (int) bins_)
loc = bins_ - 1;
++(histogram[loc]);
}
srp.enqueue(srp.out_link().target(0), histogram);
}
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
{
// dequeue and add up the histograms
for (int i = 0; i < srp.in_link().size(); ++i)
{
int nbr_gid = srp.in_link().target(i).gid;
Histogram hist;
srp.dequeue(nbr_gid, hist);
for (size_t i = 0; i < hist.size(); ++i)
histogram[i] += hist[i];
}
}
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
receive_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
{
srp.dequeue(srp.in_link().target(0).gid, histogram);
}
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const
{
for (int i = 0; i < srp.out_link().size(); ++i)
srp.enqueue(srp.out_link().target(i), histogram);
}
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram& histogram) const
{
auto log = get_logger();
int lid = srp.master()->lid(srp.gid());
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
int k = srp.out_link().size();
if (k == 0) // final round; nothing needs to be sent; this is actually redundant
return;
// pick split points
size_t total = 0;
for (size_t i = 0; i < histogram.size(); ++i)
total += histogram[i];
log->trace("Histogram total: {}", total);
size_t cur = 0;
float width = (link->core().max[dim] - link->core().min[dim])/bins_;
float split = 0;
for (size_t i = 0; i < histogram.size(); ++i)
{
if (cur + histogram[i] > total/2)
{
split = link->core().min[dim] + width*i;
break;
}
cur += histogram[i];
}
log->trace("Found split: {} (dim={}) in {} - {}", split, dim, link->core().min[dim], link->core().max[dim]);
// subset and enqueue
std::vector< std::vector<Point> > out_points(srp.out_link().size());
for (size_t i = 0; i < (b->*points_).size(); ++i)
{
float x = (b->*points_)[i][dim];
int loc = x < split ? 0 : 1;
out_points[loc].push_back((b->*points_)[i]);
}
int pos = -1;
for (int i = 0; i < k; ++i)
{
if (srp.out_link().target(i).gid == srp.gid())
{
(b->*points_).swap(out_points[i]);
pos = i;
}
else
srp.enqueue(srp.out_link().target(i), out_points[i]);
}
if (pos == 0)
link->core().max[dim] = split;
else
link->core().min[dim] = split;
}
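// Worked example (values illustrative): with bins_ = 4, core [0, 8) along dim,
// and histogram {3, 5, 1, 1} (total 10), width = 2 and the scan stops at i = 1
// because 3 + 5 > 10/2, so split = 0 + 2*1 = 2; points with x < 2 stay on the
// lower side and the rest go to the upper side.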
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
{
int lid = srp.master()->lid(srp.gid());
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
for (int i = 0; i < srp.in_link().size(); ++i)
{
int nbr_gid = srp.in_link().target(i).gid;
if (nbr_gid == srp.gid())
continue;
std::vector<Point> in_points;
srp.dequeue(nbr_gid, in_points);
for (size_t j = 0; j < in_points.size(); ++j)
{
if (in_points[j][dim] < link->core().min[dim] || in_points[j][dim] > link->core().max[dim])
throw std::runtime_error(fmt::format("Dequeued {} outside [{},{}] ({})",
in_points[j][dim], link->core().min[dim], link->core().max[dim], dim));
(b->*points_).push_back(in_points[j]);
}
}
}
template<class Block, class Point>
void
diy::detail::KDTreePartition<Block,Point>::
update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const
{
if (lower)
bounds.max[dim] = split;
else
bounds.min[dim] = split;
}
template<class Block, class Point>
bool
diy::detail::KDTreePartition<Block,Point>::
intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const
{
if (wrap)
{
if (x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim])
return true;
if (y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim])
return true;
}
return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim];
}
template<class Block, class Point>
float
diy::detail::KDTreePartition<Block,Point>::
find_split(const Bounds& changed, const Bounds& original) const
{
for (int i = 0; i < dim_; ++i)
{
if (changed.min[i] != original.min[i])
return changed.min[i];
if (changed.max[i] != original.max[i])
return changed.max[i];
}
assert(0);
return -1;
}
template<class Block, class Point>
diy::Direction
diy::detail::KDTreePartition<Block,Point>::
find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
{
diy::Direction wrap;
for (int i = 0; i < dim_; ++i)
{
if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i])
wrap[i] = -1;
if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i])
wrap[i] = 1;
}
return wrap;
}
#endif

@ -0,0 +1,162 @@
#ifndef DIY_DETAIL_ALGORITHMS_SORT_HPP
#define DIY_DETAIL_ALGORITHMS_SORT_HPP
#include <functional>
#include <algorithm>
namespace diy
{
namespace detail
{
template<class Block, class T, class Cmp>
struct SampleSort
{
typedef std::vector<T> Block::*ValuesVector;
struct Sampler;
struct Exchanger;
SampleSort(ValuesVector values_, ValuesVector samples_, const Cmp& cmp_, size_t num_samples_):
values(values_), samples(samples_),
cmp(cmp_), num_samples(num_samples_) {}
Sampler sample() const { return Sampler(values, samples, cmp, num_samples); }
Exchanger exchange() const { return Exchanger(values, samples, cmp); }
static void dequeue_values(std::vector<T>& v, const ReduceProxy& rp, bool skip_self = true)
{
auto log = get_logger();
int k_in = rp.in_link().size();
log->trace("dequeue_values(): gid={}, round={}; v.size()={}", rp.gid(), rp.round(), v.size());
if (detail::is_default< Serialization<T> >::value)
{
// add up sizes
size_t sz = 0;
size_t end = v.size();
for (int i = 0; i < k_in; ++i)
{
  if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
  MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
  sz += in.size() / sizeof(T);
  log->trace(" incoming size from {}: {}", rp.in_link().target(i).gid, in.size() / sizeof(T));
}
log->trace(" incoming size: {}", sz);
v.resize(end + sz);
for (int i = 0; i < k_in; ++i)
{
if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
size_t sz = in.size() / sizeof(T);
T* bg = (T*) &in.buffer[0];
std::copy(bg, bg + sz, &v[end]);
end += sz;
}
} else
{
for (int i = 0; i < k_in; ++i)
{
if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
while(in)
{
T x;
diy::load(in, x);
v.emplace_back(std::move(x));
}
}
}
log->trace(" v.size()={}", v.size());
}
ValuesVector values;
ValuesVector samples;
Cmp cmp;
size_t num_samples;
};
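// Flow sketch: Sampler runs as a reduction -- blocks with no incoming link draw
// num_samples random values, intermediate rounds pool them, and a block with no
// outgoing link sorts the pooled samples and keeps nblocks-1 quantile dividers
// in `samples`. Exchanger then runs as an all-to-all: round 0 routes each value
// (via lower_bound over the dividers) to the block owning its interval, and the
// final round sorts whatever arrived.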
template<class Block, class T, class Cmp>
struct SampleSort<Block,T,Cmp>::Sampler
{
Sampler(ValuesVector values_, ValuesVector dividers_, const Cmp& cmp_, size_t num_samples_):
values(values_), dividers(dividers_), cmp(cmp_), num_samples(num_samples_) {}
void operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
{
int k_in = srp.in_link().size();
int k_out = srp.out_link().size();
std::vector<T> samples;
if (k_in == 0)
{
// draw random samples
for (size_t i = 0; i < num_samples; ++i)
samples.push_back((b->*values)[std::rand() % (b->*values).size()]);
} else
dequeue_values(samples, srp, false);
if (k_out == 0)
{
// pick subsamples that separate quantiles
std::sort(samples.begin(), samples.end(), cmp);
std::vector<T> subsamples(srp.nblocks() - 1);
int step = samples.size() / srp.nblocks(); // NB: srp.nblocks() == subsamples.size() + 1
for (size_t i = 0; i < subsamples.size(); ++i)
subsamples[i] = samples[(i+1)*step];
(b->*dividers).swap(subsamples);
}
else
{
for (int i = 0; i < k_out; ++i)
{
MemoryBuffer& out = srp.outgoing(srp.out_link().target(i));
save(out, &samples[0], samples.size());
}
}
}
ValuesVector values;
ValuesVector dividers;
Cmp cmp;
size_t num_samples;
};
template<class Block, class T, class Cmp>
struct SampleSort<Block,T,Cmp>::Exchanger
{
Exchanger(ValuesVector values_, ValuesVector samples_, const Cmp& cmp_):
values(values_), samples(samples_), cmp(cmp_) {}
void operator()(Block* b, const ReduceProxy& rp) const
{
if (rp.round() == 0)
{
// enqueue values to the correct locations
for (size_t i = 0; i < (b->*values).size(); ++i)
{
int to = std::lower_bound((b->*samples).begin(), (b->*samples).end(), (b->*values)[i], cmp) - (b->*samples).begin();
rp.enqueue(rp.out_link().target(to), (b->*values)[i]);
}
(b->*values).clear();
} else
{
dequeue_values((b->*values), rp, false);
std::sort((b->*values).begin(), (b->*values).end(), cmp);
}
}
ValuesVector values;
ValuesVector samples;
Cmp cmp;
};
}
}
#endif

@ -0,0 +1,31 @@
#ifndef DIY_BLOCK_TRAITS_HPP
#define DIY_BLOCK_TRAITS_HPP
#include "traits.hpp"
namespace diy
{
namespace detail
{
template<class F>
struct block_traits
{
typedef typename std::remove_pointer<typename function_traits<F>::template arg<0>::type>::type type;
};
// matches block member functions
template<class Block, class R, class... Args>
struct block_traits<R(Block::*)(Args...)>
{
typedef Block type;
};
template<class Block, class R, class... Args>
struct block_traits<R(Block::*)(Args...) const>
{
typedef Block type;
};
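// A minimal sketch (types illustrative) of what block_traits deduces:
//
//   struct MyBlock { void work(int); };
//   static_assert(std::is_same<block_traits<void (*)(MyBlock*, int)>::type,
//                              MyBlock>::value, "free function: pointee of arg 0");
//   static_assert(std::is_same<block_traits<void (MyBlock::*)(int)>::type,
//                              MyBlock>::value, "member function: owning class");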
}
}
#endif

@ -0,0 +1,54 @@
#ifndef DIY_COLLECTIVES_HPP
#define DIY_COLLECTIVES_HPP
namespace diy
{
namespace detail
{
struct CollectiveOp
{
virtual void init() =0;
virtual void update(const CollectiveOp& other) =0;
virtual void global(const mpi::communicator& comm) =0;
virtual void copy_from(const CollectiveOp& other) =0;
virtual void result_out(void* dest) const =0;
virtual ~CollectiveOp() {}
};
template<class T, class Op>
struct AllReduceOp: public CollectiveOp
{
AllReduceOp(const T& x, Op op):
in_(x), op_(op) {}
void init() { out_ = in_; }
void update(const CollectiveOp& other) { out_ = op_(out_, static_cast<const AllReduceOp&>(other).in_); }
void global(const mpi::communicator& comm) { T res; mpi::all_reduce(comm, out_, res, op_); out_ = res; }
void copy_from(const CollectiveOp& other) { out_ = static_cast<const AllReduceOp&>(other).out_; }
void result_out(void* dest) const { *reinterpret_cast<T*>(dest) = out_; }
private:
T in_, out_;
Op op_;
};
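// A usage sketch (values illustrative): each participant contributes a local
// value in_, update() folds local contributions, global() folds across ranks:
//
//   AllReduceOp<int, std::plus<int>> sum(5, std::plus<int>());
//   sum.init();                  // out_ = 5
//   AllReduceOp<int, std::plus<int>> other(7, std::plus<int>());
//   sum.update(other);           // out_ = 5 + 7 = 12
//   int result;
//   sum.result_out(&result);     // result == 12 (before the MPI step in global())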
template<class T>
struct Scratch: public CollectiveOp
{
Scratch(const T& x):
x_(x) {}
void init() {}
void update(const CollectiveOp& other) {}
void global(const mpi::communicator& comm) {}
void copy_from(const CollectiveOp& other) {}
void result_out(void* dest) const { *reinterpret_cast<T*>(dest) = x_; }
private:
T x_;
};
}
}
#endif

@ -0,0 +1,169 @@
#ifndef DIY_DETAIL_ALL_TO_ALL_HPP
#define DIY_DETAIL_ALL_TO_ALL_HPP
#include "../block_traits.hpp"
namespace diy
{
namespace detail
{
template<class Op>
struct AllToAllReduce
{
using Block = typename block_traits<Op>::type;
AllToAllReduce(const Op& op_, const Assigner& assigner):
op(op_)
{
for (int gid = 0; gid < assigner.nblocks(); ++gid)
{
BlockID nbr = { gid, assigner.rank(gid) };
all_neighbors_link.add_neighbor(nbr);
}
}
void operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
{
int k_in = srp.in_link().size();
int k_out = srp.out_link().size();
if (k_in == 0 && k_out == 0) // special case of a single block
{
ReduceProxy all_srp_out(srp, srp.block(), 0, srp.assigner(), empty_link, all_neighbors_link);
ReduceProxy all_srp_in (srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
op(b, all_srp_out);
MemoryBuffer& in_queue = all_srp_in.incoming(all_srp_in.in_link().target(0).gid);
in_queue.swap(all_srp_out.outgoing(all_srp_out.out_link().target(0)));
in_queue.reset();
op(b, all_srp_in);
return;
}
if (k_in == 0) // initial round
{
ReduceProxy all_srp(srp, srp.block(), 0, srp.assigner(), empty_link, all_neighbors_link);
op(b, all_srp);
Master::OutgoingQueues all_queues;
all_queues.swap(*all_srp.outgoing()); // clears out the queues and stores them locally
// enqueue outgoing
int group = all_srp.out_link().size() / k_out;
for (int i = 0; i < k_out; ++i)
{
std::pair<int,int> range(i*group, (i+1)*group);
srp.enqueue(srp.out_link().target(i), range);
for (int j = i*group; j < (i+1)*group; ++j)
{
int from = srp.gid();
int to = all_srp.out_link().target(j).gid;
srp.enqueue(srp.out_link().target(i), std::make_pair(from, to));
srp.enqueue(srp.out_link().target(i), all_queues[all_srp.out_link().target(j)]);
}
}
} else if (k_out == 0) // final round
{
// dequeue incoming + reorder into the correct order
ReduceProxy all_srp(srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
Master::IncomingQueues all_incoming;
all_incoming.swap(*srp.incoming());
std::pair<int, int> range; // all the ranges should be the same
for (int i = 0; i < k_in; ++i)
{
int gid_in = srp.in_link().target(i).gid;
MemoryBuffer& in = all_incoming[gid_in];
load(in, range);
while(in)
{
std::pair<int, int> from_to;
load(in, from_to);
load(in, all_srp.incoming(from_to.first));
all_srp.incoming(from_to.first).reset();
}
}
op(b, all_srp);
} else // intermediate round: reshuffle queues
{
// add up buffer sizes
std::vector<size_t> sizes_out(k_out, sizeof(std::pair<int,int>));
std::pair<int, int> range; // all the ranges should be the same
for (int i = 0; i < k_in; ++i)
{
MemoryBuffer& in = srp.incoming(srp.in_link().target(i).gid);
load(in, range);
int group = (range.second - range.first)/k_out;
std::pair<int, int> from_to;
size_t s;
while(in)
{
diy::load(in, from_to);
diy::load(in, s);
int j = (from_to.second - range.first) / group;
sizes_out[j] += s + sizeof(size_t) + sizeof(std::pair<int,int>);
in.skip(s);
}
in.reset();
}
// reserve outgoing buffers of correct size
int group = (range.second - range.first)/k_out;
for (int i = 0; i < k_out; ++i)
{
MemoryBuffer& out = srp.outgoing(srp.out_link().target(i));
out.reserve(sizes_out[i]);
std::pair<int, int> out_range;
out_range.first = range.first + group*i;
out_range.second = range.first + group*(i+1);
save(out, out_range);
}
// re-direct the queues
for (int i = 0; i < k_in; ++i)
{
MemoryBuffer& in = srp.incoming(srp.in_link().target(i).gid);
std::pair<int, int> range;
load(in, range);
std::pair<int, int> from_to;
while(in)
{
load(in, from_to);
int j = (from_to.second - range.first) / group;
MemoryBuffer& out = srp.outgoing(srp.out_link().target(j));
save(out, from_to);
MemoryBuffer::copy(in, out);
}
}
}
}
const Op& op;
Link all_neighbors_link, empty_link;
};
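// Routing sketch for AllToAllReduce above (numbers illustrative): with
// nblocks = 8 and k = 2, the initial round owns destinations (0, 8) and splits
// them into k_out = 2 groups of group = 4; an intermediate round holding range
// (0, 4) re-splits it into groups of 2, and so on, until the final round hands
// each (from, to) queue to its destination block.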
struct SkipIntermediate
{
SkipIntermediate(size_t rounds_):
rounds(rounds_) {}
bool operator()(int round, int, const Master&) const { if (round == 0 || round == (int) rounds) return false; return true; }
size_t rounds;
};
}
}
#endif

@ -0,0 +1,318 @@
//--------------------------------------
// utils/traits: Additional type traits
//--------------------------------------
//
// Copyright kennytm (auraHT Ltd.) 2011.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file doc/LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
/**
``<utils/traits.hpp>`` --- Additional type traits
=================================================
This module provides additional type traits and related functions missing from
the standard library.
*/
#ifndef DIY_UTILS_TRAITS_HPP
#define DIY_UTILS_TRAITS_HPP
#include <cstdlib>
#include <tuple>
#include <functional>
#include <type_traits>
namespace diy
{
namespace detail {
/**
.. macro:: DECLARE_HAS_TYPE_MEMBER(member_name)
This macro declares a template ``has_member_name`` which will check whether
a type member ``member_name`` exists in a particular type.
Example::
DECLARE_HAS_TYPE_MEMBER(result_type)
...
printf("%d\n", has_result_type< std::plus<int> >::value);
// ^ prints '1' (true)
printf("%d\n", has_result_type< double(*)() >::value);
// ^ prints '0' (false)
*/
#define DECLARE_HAS_TYPE_MEMBER(member_name) \
template <typename, typename = void> \
struct has_##member_name \
{ enum { value = false }; }; \
template <typename T> \
struct has_##member_name<T, typename std::enable_if<sizeof(typename T::member_name)||true>::type> \
{ enum { value = true }; };
/**
.. type:: struct utils::function_traits<F>
Obtain compile-time information about a function object *F*.
This template currently supports the following types:
* Normal function types (``R(T...)``), function pointers (``R(*)(T...)``)
and function references (``R(&)(T...)`` and ``R(&&)(T...)``).
* Member functions (``R(C::*)(T...)``)
* ``std::function<F>``
* Types of lambda functions, and any other type that has a unique
``operator()``.
* Type of ``std::mem_fn`` (only for GCC's libstdc++ and LLVM's libc++).
Following the C++ spec, the first argument will be a raw pointer.
*/
template <typename T>
struct function_traits
: public function_traits<decltype(&T::operator())>
{};
namespace xx_impl
{
template <typename C, typename R, typename... A>
struct memfn_type
{
typedef typename std::conditional<
std::is_const<C>::value,
typename std::conditional<
std::is_volatile<C>::value,
R (C::*)(A...) const volatile,
R (C::*)(A...) const
>::type,
typename std::conditional<
std::is_volatile<C>::value,
R (C::*)(A...) volatile,
R (C::*)(A...)
>::type
>::type type;
};
}
template <typename ReturnType, typename... Args>
struct function_traits<ReturnType(Args...)>
{
/**
.. type:: type result_type
The type returned by calling an instance of the function object type *F*.
*/
typedef ReturnType result_type;
/**
.. type:: type function_type
The function type (``R(T...)``).
*/
typedef ReturnType function_type(Args...);
/**
.. type:: type member_function_type<OwnerType>
The member function type for an *OwnerType* (``R(OwnerType::*)(T...)``).
*/
template <typename OwnerType>
using member_function_type = typename xx_impl::memfn_type<
typename std::remove_pointer<typename std::remove_reference<OwnerType>::type>::type,
ReturnType, Args...
>::type;
/**
.. data:: static const size_t arity
Number of arguments the function object will take.
*/
enum { arity = sizeof...(Args) };
/**
.. type:: type arg<n>::type
The type of the *n*-th argument.
*/
template <size_t i>
struct arg
{
typedef typename std::tuple_element<i, std::tuple<Args...>>::type type;
};
};
template <typename ReturnType, typename... Args>
struct function_traits<ReturnType(*)(Args...)>
: public function_traits<ReturnType(Args...)>
{};
template <typename ClassType, typename ReturnType, typename... Args>
struct function_traits<ReturnType(ClassType::*)(Args...)>
: public function_traits<ReturnType(Args...)>
{
typedef ClassType& owner_type;
};
template <typename ClassType, typename ReturnType, typename... Args>
struct function_traits<ReturnType(ClassType::*)(Args...) const>
: public function_traits<ReturnType(Args...)>
{
typedef const ClassType& owner_type;
};
template <typename ClassType, typename ReturnType, typename... Args>
struct function_traits<ReturnType(ClassType::*)(Args...) volatile>
: public function_traits<ReturnType(Args...)>
{
typedef volatile ClassType& owner_type;
};
template <typename ClassType, typename ReturnType, typename... Args>
struct function_traits<ReturnType(ClassType::*)(Args...) const volatile>
: public function_traits<ReturnType(Args...)>
{
typedef const volatile ClassType& owner_type;
};
template <typename FunctionType>
struct function_traits<std::function<FunctionType>>
: public function_traits<FunctionType>
{};
#if defined(_GLIBCXX_FUNCTIONAL)
#define MEM_FN_SYMBOL_XX0SL7G4Z0J std::_Mem_fn
#elif defined(_LIBCPP_FUNCTIONAL)
#define MEM_FN_SYMBOL_XX0SL7G4Z0J std::__mem_fn
#endif
#ifdef MEM_FN_SYMBOL_XX0SL7G4Z0J
template <typename R, typename C>
struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R C::*>>
: public function_traits<R(C*)>
{};
template <typename R, typename C, typename... A>
struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...)>>
: public function_traits<R(C*, A...)>
{};
template <typename R, typename C, typename... A>
struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) const>>
: public function_traits<R(const C*, A...)>
{};
template <typename R, typename C, typename... A>
struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) volatile>>
: public function_traits<R(volatile C*, A...)>
{};
template <typename R, typename C, typename... A>
struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) const volatile>>
: public function_traits<R(const volatile C*, A...)>
{};
#undef MEM_FN_SYMBOL_XX0SL7G4Z0J
#endif
template <typename T>
struct function_traits<T&> : public function_traits<T> {};
template <typename T>
struct function_traits<const T&> : public function_traits<T> {};
template <typename T>
struct function_traits<volatile T&> : public function_traits<T> {};
template <typename T>
struct function_traits<const volatile T&> : public function_traits<T> {};
template <typename T>
struct function_traits<T&&> : public function_traits<T> {};
template <typename T>
struct function_traits<const T&&> : public function_traits<T> {};
template <typename T>
struct function_traits<volatile T&&> : public function_traits<T> {};
template <typename T>
struct function_traits<const volatile T&&> : public function_traits<T> {};
#define FORWARD_RES_8QR485JMSBT \
typename std::conditional< \
std::is_lvalue_reference<R>::value, \
T&, \
typename std::remove_reference<T>::type&& \
>::type
/**
.. function:: auto utils::forward_like<Like, T>(T&& t) noexcept
Forward the reference *t* like the type of *Like*. That means, if *Like* is
an lvalue (reference), this function will return an lvalue reference of *t*.
Otherwise, if *Like* is an rvalue, this function will return an rvalue
reference of *t*.
This is mainly used to propagate the expression category (lvalue/rvalue) of
a member of *Like*, generalizing ``std::forward``.
*/
template <typename R, typename T>
FORWARD_RES_8QR485JMSBT forward_like(T&& input) noexcept
{
return static_cast<FORWARD_RES_8QR485JMSBT>(input);
}
#undef FORWARD_RES_8QR485JMSBT
/**
.. type:: struct utils::copy_cv<From, To>
Copy the CV qualifier between the two types. For example,
``utils::copy_cv<const int, double>::type`` will become ``const double``.
*/
template <typename From, typename To>
struct copy_cv
{
private:
typedef typename std::remove_cv<To>::type raw_To;
typedef typename std::conditional<std::is_const<From>::value,
const raw_To, raw_To>::type const_raw_To;
public:
/**
.. type:: type type
Result of cv-copying.
*/
typedef typename std::conditional<std::is_volatile<From>::value,
volatile const_raw_To, const_raw_To>::type type;
};
/**
.. type:: struct utils::pointee<T>
Returns the type obtained by dereferencing an instance of *T*. This is a
generalization of ``std::remove_pointer`` in that it also works with iterators.
*/
template <typename T>
struct pointee
{
/**
.. type:: type type
Result of dereferencing.
*/
typedef typename std::remove_reference<decltype(*std::declval<T>())>::type type;
};
/**
.. function:: std::add_rvalue_reference<T>::type utils::rt_val<T>() noexcept
Returns a value of type *T*. It is guaranteed to do nothing and will not
throw a compile-time error, but using the returned result will cause
undefined behavior.
*/
template <typename T>
typename std::add_rvalue_reference<T>::type rt_val() noexcept
{
return std::move(*static_cast<T*>(nullptr));
}
}
}
#endif
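A compile-time sanity check makes the traits above concrete; this sketch is illustrative only, with `hypot2` invented for the example:
#include <type_traits>
double hypot2(double x, double y) { return x*x + y*y; }
typedef diy::detail::function_traits<decltype(hypot2)> FT;
static_assert(FT::arity == 2, "two arguments");
static_assert(std::is_same<FT::result_type, double>::value, "returns double");
static_assert(std::is_same<FT::arg<0>::type, double>::value, "first argument is double");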

@ -0,0 +1,935 @@
/*
Formatting library for C++
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "format.h"
#include <string.h>
#include <cctype>
#include <cerrno>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef> // for std::ptrdiff_t
#if defined(_WIN32) && defined(__MINGW32__)
# include <cstring>
#endif
#if FMT_USE_WINDOWS_H
# if defined(NOMINMAX) || defined(FMT_WIN_MINMAX)
# include <windows.h>
# else
# define NOMINMAX
# include <windows.h>
# undef NOMINMAX
# endif
#endif
using fmt::internal::Arg;
#if FMT_EXCEPTIONS
# define FMT_TRY try
# define FMT_CATCH(x) catch (x)
#else
# define FMT_TRY if (true)
# define FMT_CATCH(x) if (false)
#endif
#ifdef _MSC_VER
# pragma warning(push)
# pragma warning(disable: 4127) // conditional expression is constant
# pragma warning(disable: 4702) // unreachable code
// Disable deprecation warning for strerror. The latter is not called but
// MSVC fails to detect it.
# pragma warning(disable: 4996)
#endif
// Dummy implementations of strerror_r and strerror_s called if corresponding
// system functions are not available.
static inline fmt::internal::Null<> strerror_r(int, char *, ...) {
return fmt::internal::Null<>();
}
static inline fmt::internal::Null<> strerror_s(char *, std::size_t, ...) {
return fmt::internal::Null<>();
}
namespace fmt {
namespace {
#ifndef _MSC_VER
# define FMT_SNPRINTF snprintf
#else // _MSC_VER
inline int fmt_snprintf(char *buffer, size_t size, const char *format, ...) {
va_list args;
va_start(args, format);
int result = vsnprintf_s(buffer, size, _TRUNCATE, format, args);
va_end(args);
return result;
}
# define FMT_SNPRINTF fmt_snprintf
#endif // _MSC_VER
#if defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
# define FMT_SWPRINTF snwprintf
#else
# define FMT_SWPRINTF swprintf
#endif // defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
// Checks if a value fits in int - used to avoid warnings about comparing
// signed and unsigned integers.
template <bool IsSigned>
struct IntChecker {
template <typename T>
static bool fits_in_int(T value) {
unsigned max = INT_MAX;
return value <= max;
}
static bool fits_in_int(bool) { return true; }
};
template <>
struct IntChecker<true> {
template <typename T>
static bool fits_in_int(T value) {
return value >= INT_MIN && value <= INT_MAX;
}
static bool fits_in_int(int) { return true; }
};
const char RESET_COLOR[] = "\x1b[0m";
typedef void (*FormatFunc)(Writer &, int, StringRef);
// Portable thread-safe version of strerror.
// Sets buffer to point to a string describing the error code.
// This can be either a pointer to a string stored in buffer,
// or a pointer to some static immutable string.
// Returns one of the following values:
// 0 - success
// ERANGE - buffer is not large enough to store the error message
// other - failure
// Buffer should be at least of size 1.
int safe_strerror(
int error_code, char *&buffer, std::size_t buffer_size) FMT_NOEXCEPT {
FMT_ASSERT(buffer != 0 && buffer_size != 0, "invalid buffer");
class StrError {
private:
int error_code_;
char *&buffer_;
std::size_t buffer_size_;
// A noop assignment operator to avoid bogus warnings.
void operator=(const StrError &) {}
// Handle the result of XSI-compliant version of strerror_r.
int handle(int result) {
// glibc versions before 2.13 return result in errno.
return result == -1 ? errno : result;
}
// Handle the result of GNU-specific version of strerror_r.
int handle(char *message) {
// If the buffer is full then the message is probably truncated.
if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
return ERANGE;
buffer_ = message;
return 0;
}
// Handle the case when strerror_r is not available.
int handle(internal::Null<>) {
return fallback(strerror_s(buffer_, buffer_size_, error_code_));
}
// Fallback to strerror_s when strerror_r is not available.
int fallback(int result) {
// If the buffer is full then the message is probably truncated.
return result == 0 && strlen(buffer_) == buffer_size_ - 1 ?
ERANGE : result;
}
// Fallback to strerror if strerror_r and strerror_s are not available.
int fallback(internal::Null<>) {
errno = 0;
buffer_ = strerror(error_code_);
return errno;
}
public:
StrError(int err_code, char *&buf, std::size_t buf_size)
: error_code_(err_code), buffer_(buf), buffer_size_(buf_size) {}
int run() {
strerror_r(0, 0, ""); // Suppress a warning about unused strerror_r.
return handle(strerror_r(error_code_, buffer_, buffer_size_));
}
};
return StrError(error_code, buffer, buffer_size).run();
}
void format_error_code(Writer &out, int error_code,
StringRef message) FMT_NOEXCEPT {
// Report error code making sure that the output fits into
// INLINE_BUFFER_SIZE to avoid dynamic memory allocation and potential
// bad_alloc.
out.clear();
static const char SEP[] = ": ";
static const char ERROR_STR[] = "error ";
// Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
std::size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
typedef internal::IntTraits<int>::MainType MainType;
MainType abs_value = static_cast<MainType>(error_code);
if (internal::is_negative(error_code)) {
abs_value = 0 - abs_value;
++error_code_size;
}
error_code_size += internal::count_digits(abs_value);
if (message.size() <= internal::INLINE_BUFFER_SIZE - error_code_size)
out << message << SEP;
out << ERROR_STR << error_code;
assert(out.size() <= internal::INLINE_BUFFER_SIZE);
}
void report_error(FormatFunc func, int error_code,
StringRef message) FMT_NOEXCEPT {
MemoryWriter full_message;
func(full_message, error_code, message);
// Use Writer::data instead of Writer::c_str to avoid potential memory
// allocation.
std::fwrite(full_message.data(), full_message.size(), 1, stderr);
std::fputc('\n', stderr);
}
// IsZeroInt::visit(arg) returns true iff arg is a zero integer.
class IsZeroInt : public ArgVisitor<IsZeroInt, bool> {
public:
template <typename T>
bool visit_any_int(T value) { return value == 0; }
};
// Checks if an argument is a valid printf width specifier and sets
// left alignment if it is negative.
class WidthHandler : public ArgVisitor<WidthHandler, unsigned> {
private:
FormatSpec &spec_;
FMT_DISALLOW_COPY_AND_ASSIGN(WidthHandler);
public:
explicit WidthHandler(FormatSpec &spec) : spec_(spec) {}
void report_unhandled_arg() {
FMT_THROW(FormatError("width is not integer"));
}
template <typename T>
unsigned visit_any_int(T value) {
typedef typename internal::IntTraits<T>::MainType UnsignedType;
UnsignedType width = static_cast<UnsignedType>(value);
if (internal::is_negative(value)) {
spec_.align_ = ALIGN_LEFT;
width = 0 - width;
}
if (width > INT_MAX)
FMT_THROW(FormatError("number is too big"));
return static_cast<unsigned>(width);
}
};
class PrecisionHandler : public ArgVisitor<PrecisionHandler, int> {
public:
void report_unhandled_arg() {
FMT_THROW(FormatError("precision is not integer"));
}
template <typename T>
int visit_any_int(T value) {
if (!IntChecker<std::numeric_limits<T>::is_signed>::fits_in_int(value))
FMT_THROW(FormatError("number is too big"));
return static_cast<int>(value);
}
};
template <typename T, typename U>
struct is_same {
enum { value = 0 };
};
template <typename T>
struct is_same<T, T> {
enum { value = 1 };
};
// An argument visitor that converts an integer argument to T for printf,
// if T is an integral type. If T is void, the argument is converted to
// corresponding signed or unsigned type depending on the type specifier:
// 'd' and 'i' - signed, other - unsigned)
template <typename T = void>
class ArgConverter : public ArgVisitor<ArgConverter<T>, void> {
private:
internal::Arg &arg_;
wchar_t type_;
FMT_DISALLOW_COPY_AND_ASSIGN(ArgConverter);
public:
ArgConverter(internal::Arg &arg, wchar_t type)
: arg_(arg), type_(type) {}
void visit_bool(bool value) {
if (type_ != 's')
visit_any_int(value);
}
template <typename U>
void visit_any_int(U value) {
bool is_signed = type_ == 'd' || type_ == 'i';
using internal::Arg;
typedef typename internal::Conditional<
is_same<T, void>::value, U, T>::type TargetType;
if (sizeof(TargetType) <= sizeof(int)) {
// Extra casts are used to silence warnings.
if (is_signed) {
arg_.type = Arg::INT;
arg_.int_value = static_cast<int>(static_cast<TargetType>(value));
} else {
arg_.type = Arg::UINT;
typedef typename internal::MakeUnsigned<TargetType>::Type Unsigned;
arg_.uint_value = static_cast<unsigned>(static_cast<Unsigned>(value));
}
} else {
if (is_signed) {
arg_.type = Arg::LONG_LONG;
// glibc's printf doesn't sign extend arguments of smaller types:
// std::printf("%lld", -42); // prints "4294967254"
// but we don't have to do the same because it's UB.
arg_.long_long_value = static_cast<LongLong>(value);
} else {
arg_.type = Arg::ULONG_LONG;
arg_.ulong_long_value =
static_cast<typename internal::MakeUnsigned<U>::Type>(value);
}
}
}
};
// Converts an integer argument to char for printf.
class CharConverter : public ArgVisitor<CharConverter, void> {
private:
internal::Arg &arg_;
FMT_DISALLOW_COPY_AND_ASSIGN(CharConverter);
public:
explicit CharConverter(internal::Arg &arg) : arg_(arg) {}
template <typename T>
void visit_any_int(T value) {
arg_.type = internal::Arg::CHAR;
arg_.int_value = static_cast<char>(value);
}
};
} // namespace
namespace internal {
template <typename Char>
class PrintfArgFormatter :
public ArgFormatterBase<PrintfArgFormatter<Char>, Char> {
void write_null_pointer() {
this->spec().type_ = 0;
this->write("(nil)");
}
typedef ArgFormatterBase<PrintfArgFormatter<Char>, Char> Base;
public:
PrintfArgFormatter(BasicWriter<Char> &w, FormatSpec &s)
: ArgFormatterBase<PrintfArgFormatter<Char>, Char>(w, s) {}
void visit_bool(bool value) {
FormatSpec &fmt_spec = this->spec();
if (fmt_spec.type_ != 's')
return this->visit_any_int(value);
fmt_spec.type_ = 0;
this->write(value);
}
void visit_char(int value) {
const FormatSpec &fmt_spec = this->spec();
BasicWriter<Char> &w = this->writer();
if (fmt_spec.type_ && fmt_spec.type_ != 'c')
w.write_int(value, fmt_spec);
typedef typename BasicWriter<Char>::CharPtr CharPtr;
CharPtr out = CharPtr();
if (fmt_spec.width_ > 1) {
Char fill = ' ';
out = w.grow_buffer(fmt_spec.width_);
if (fmt_spec.align_ != ALIGN_LEFT) {
std::fill_n(out, fmt_spec.width_ - 1, fill);
out += fmt_spec.width_ - 1;
} else {
std::fill_n(out + 1, fmt_spec.width_ - 1, fill);
}
} else {
out = w.grow_buffer(1);
}
*out = static_cast<Char>(value);
}
void visit_cstring(const char *value) {
if (value)
Base::visit_cstring(value);
else if (this->spec().type_ == 'p')
write_null_pointer();
else
this->write("(null)");
}
void visit_pointer(const void *value) {
if (value)
return Base::visit_pointer(value);
this->spec().type_ = 0;
write_null_pointer();
}
void visit_custom(Arg::CustomValue c) {
BasicFormatter<Char> formatter(ArgList(), this->writer());
const Char format_str[] = {'}', 0};
const Char *format = format_str;
c.format(&formatter, c.value, &format);
}
};
} // namespace internal
} // namespace fmt
FMT_FUNC void fmt::SystemError::init(
int err_code, CStringRef format_str, ArgList args) {
error_code_ = err_code;
MemoryWriter w;
internal::format_system_error(w, err_code, format(format_str, args));
std::runtime_error &base = *this;
base = std::runtime_error(w.str());
}
template <typename T>
int fmt::internal::CharTraits<char>::format_float(
char *buffer, std::size_t size, const char *format,
unsigned width, int precision, T value) {
if (width == 0) {
return precision < 0 ?
FMT_SNPRINTF(buffer, size, format, value) :
FMT_SNPRINTF(buffer, size, format, precision, value);
}
return precision < 0 ?
FMT_SNPRINTF(buffer, size, format, width, value) :
FMT_SNPRINTF(buffer, size, format, width, precision, value);
}
template <typename T>
int fmt::internal::CharTraits<wchar_t>::format_float(
wchar_t *buffer, std::size_t size, const wchar_t *format,
unsigned width, int precision, T value) {
if (width == 0) {
return precision < 0 ?
FMT_SWPRINTF(buffer, size, format, value) :
FMT_SWPRINTF(buffer, size, format, precision, value);
}
return precision < 0 ?
FMT_SWPRINTF(buffer, size, format, width, value) :
FMT_SWPRINTF(buffer, size, format, width, precision, value);
}
template <typename T>
const char fmt::internal::BasicData<T>::DIGITS[] =
"0001020304050607080910111213141516171819"
"2021222324252627282930313233343536373839"
"4041424344454647484950515253545556575859"
"6061626364656667686970717273747576777879"
"8081828384858687888990919293949596979899";
#define FMT_POWERS_OF_10(factor) \
factor * 10, \
factor * 100, \
factor * 1000, \
factor * 10000, \
factor * 100000, \
factor * 1000000, \
factor * 10000000, \
factor * 100000000, \
factor * 1000000000
template <typename T>
const uint32_t fmt::internal::BasicData<T>::POWERS_OF_10_32[] = {
0, FMT_POWERS_OF_10(1)
};
template <typename T>
const uint64_t fmt::internal::BasicData<T>::POWERS_OF_10_64[] = {
0,
FMT_POWERS_OF_10(1),
FMT_POWERS_OF_10(fmt::ULongLong(1000000000)),
// Multiply several constants instead of using a single long long constant
// to avoid warnings about C++98 not supporting long long.
fmt::ULongLong(1000000000) * fmt::ULongLong(1000000000) * 10
};
FMT_FUNC void fmt::internal::report_unknown_type(char code, const char *type) {
(void)type;
if (std::isprint(static_cast<unsigned char>(code))) {
FMT_THROW(fmt::FormatError(
fmt::format("unknown format code '{}' for {}", code, type)));
}
FMT_THROW(fmt::FormatError(
fmt::format("unknown format code '\\x{:02x}' for {}",
static_cast<unsigned>(code), type)));
}
#if FMT_USE_WINDOWS_H
FMT_FUNC fmt::internal::UTF8ToUTF16::UTF8ToUTF16(fmt::StringRef s) {
static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
if (s.size() > INT_MAX)
FMT_THROW(WindowsError(ERROR_INVALID_PARAMETER, ERROR_MSG));
int s_size = static_cast<int>(s.size());
int length = MultiByteToWideChar(
CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, 0, 0);
if (length == 0)
FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
buffer_.resize(length + 1);
length = MultiByteToWideChar(
CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, &buffer_[0], length);
if (length == 0)
FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
buffer_[length] = 0;
}
FMT_FUNC fmt::internal::UTF16ToUTF8::UTF16ToUTF8(fmt::WStringRef s) {
if (int error_code = convert(s)) {
FMT_THROW(WindowsError(error_code,
"cannot convert string from UTF-16 to UTF-8"));
}
}
FMT_FUNC int fmt::internal::UTF16ToUTF8::convert(fmt::WStringRef s) {
if (s.size() > INT_MAX)
return ERROR_INVALID_PARAMETER;
int s_size = static_cast<int>(s.size());
int length = WideCharToMultiByte(CP_UTF8, 0, s.data(), s_size, 0, 0, 0, 0);
if (length == 0)
return GetLastError();
buffer_.resize(length + 1);
length = WideCharToMultiByte(
CP_UTF8, 0, s.data(), s_size, &buffer_[0], length, 0, 0);
if (length == 0)
return GetLastError();
buffer_[length] = 0;
return 0;
}
FMT_FUNC void fmt::WindowsError::init(
int err_code, CStringRef format_str, ArgList args) {
error_code_ = err_code;
MemoryWriter w;
internal::format_windows_error(w, err_code, format(format_str, args));
std::runtime_error &base = *this;
base = std::runtime_error(w.str());
}
FMT_FUNC void fmt::internal::format_windows_error(
fmt::Writer &out, int error_code,
fmt::StringRef message) FMT_NOEXCEPT {
FMT_TRY {
MemoryBuffer<wchar_t, INLINE_BUFFER_SIZE> buffer;
buffer.resize(INLINE_BUFFER_SIZE);
for (;;) {
wchar_t *system_message = &buffer[0];
int result = FormatMessageW(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
0, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
system_message, static_cast<uint32_t>(buffer.size()), 0);
if (result != 0) {
UTF16ToUTF8 utf8_message;
if (utf8_message.convert(system_message) == ERROR_SUCCESS) {
out << message << ": " << utf8_message;
return;
}
break;
}
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
break; // Can't get error message, report error code instead.
buffer.resize(buffer.size() * 2);
}
} FMT_CATCH(...) {}
fmt::format_error_code(out, error_code, message); // 'fmt::' is for bcc32.
}
#endif // FMT_USE_WINDOWS_H
FMT_FUNC void fmt::internal::format_system_error(
fmt::Writer &out, int error_code,
fmt::StringRef message) FMT_NOEXCEPT {
FMT_TRY {
MemoryBuffer<char, INLINE_BUFFER_SIZE> buffer;
buffer.resize(INLINE_BUFFER_SIZE);
for (;;) {
char *system_message = &buffer[0];
int result = safe_strerror(error_code, system_message, buffer.size());
if (result == 0) {
out << message << ": " << system_message;
return;
}
if (result != ERANGE)
break; // Can't get error message, report error code instead.
buffer.resize(buffer.size() * 2);
}
} FMT_CATCH(...) {}
fmt::format_error_code(out, error_code, message); // 'fmt::' is for bcc32.
}
template <typename Char>
void fmt::internal::ArgMap<Char>::init(const ArgList &args) {
if (!map_.empty())
return;
typedef internal::NamedArg<Char> NamedArg;
const NamedArg *named_arg = 0;
bool use_values =
args.type(ArgList::MAX_PACKED_ARGS - 1) == internal::Arg::NONE;
if (use_values) {
for (unsigned i = 0;/*nothing*/; ++i) {
internal::Arg::Type arg_type = args.type(i);
switch (arg_type) {
case internal::Arg::NONE:
return;
case internal::Arg::NAMED_ARG:
named_arg = static_cast<const NamedArg*>(args.values_[i].pointer);
map_.push_back(Pair(named_arg->name, *named_arg));
break;
default:
/*nothing*/;
}
}
return;
}
for (unsigned i = 0; i != ArgList::MAX_PACKED_ARGS; ++i) {
internal::Arg::Type arg_type = args.type(i);
if (arg_type == internal::Arg::NAMED_ARG) {
named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
map_.push_back(Pair(named_arg->name, *named_arg));
}
}
for (unsigned i = ArgList::MAX_PACKED_ARGS;/*nothing*/; ++i) {
switch (args.args_[i].type) {
case internal::Arg::NONE:
return;
case internal::Arg::NAMED_ARG:
named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
map_.push_back(Pair(named_arg->name, *named_arg));
break;
default:
/*nothing*/;
}
}
}
template <typename Char>
void fmt::internal::FixedBuffer<Char>::grow(std::size_t) {
FMT_THROW(std::runtime_error("buffer overflow"));
}
FMT_FUNC Arg fmt::internal::FormatterBase::do_get_arg(
unsigned arg_index, const char *&error) {
Arg arg = args_[arg_index];
switch (arg.type) {
case Arg::NONE:
error = "argument index out of range";
break;
case Arg::NAMED_ARG:
arg = *static_cast<const internal::Arg*>(arg.pointer);
break;
default:
/*nothing*/;
}
return arg;
}
template <typename Char>
void fmt::internal::PrintfFormatter<Char>::parse_flags(
FormatSpec &spec, const Char *&s) {
for (;;) {
switch (*s++) {
case '-':
spec.align_ = ALIGN_LEFT;
break;
case '+':
spec.flags_ |= SIGN_FLAG | PLUS_FLAG;
break;
case '0':
spec.fill_ = '0';
break;
case ' ':
spec.flags_ |= SIGN_FLAG;
break;
case '#':
spec.flags_ |= HASH_FLAG;
break;
default:
--s;
return;
}
}
}
template <typename Char>
Arg fmt::internal::PrintfFormatter<Char>::get_arg(
const Char *s, unsigned arg_index) {
(void)s;
const char *error = 0;
Arg arg = arg_index == UINT_MAX ?
next_arg(error) : FormatterBase::get_arg(arg_index - 1, error);
if (error)
FMT_THROW(FormatError(!*s ? "invalid format string" : error));
return arg;
}
template <typename Char>
unsigned fmt::internal::PrintfFormatter<Char>::parse_header(
const Char *&s, FormatSpec &spec) {
unsigned arg_index = UINT_MAX;
Char c = *s;
if (c >= '0' && c <= '9') {
// Parse an argument index (if followed by '$') or a width possibly
// preceded with '0' flag(s).
unsigned value = parse_nonnegative_int(s);
if (*s == '$') { // value is an argument index
++s;
arg_index = value;
} else {
if (c == '0')
spec.fill_ = '0';
if (value != 0) {
// Nonzero value means that we parsed width and don't need to
// parse it or flags again, so return now.
spec.width_ = value;
return arg_index;
}
}
}
parse_flags(spec, s);
// Parse width.
if (*s >= '0' && *s <= '9') {
spec.width_ = parse_nonnegative_int(s);
} else if (*s == '*') {
++s;
spec.width_ = WidthHandler(spec).visit(get_arg(s));
}
return arg_index;
}
template <typename Char>
void fmt::internal::PrintfFormatter<Char>::format(
BasicWriter<Char> &writer, BasicCStringRef<Char> format_str) {
const Char *start = format_str.c_str();
const Char *s = start;
while (*s) {
Char c = *s++;
if (c != '%') continue;
if (*s == c) {
write(writer, start, s);
start = ++s;
continue;
}
write(writer, start, s - 1);
FormatSpec spec;
spec.align_ = ALIGN_RIGHT;
// Parse argument index, flags and width.
unsigned arg_index = parse_header(s, spec);
// Parse precision.
if (*s == '.') {
++s;
if ('0' <= *s && *s <= '9') {
spec.precision_ = static_cast<int>(parse_nonnegative_int(s));
} else if (*s == '*') {
++s;
spec.precision_ = PrecisionHandler().visit(get_arg(s));
}
}
Arg arg = get_arg(s, arg_index);
if (spec.flag(HASH_FLAG) && IsZeroInt().visit(arg))
spec.flags_ &= ~to_unsigned<int>(HASH_FLAG);
if (spec.fill_ == '0') {
if (arg.type <= Arg::LAST_NUMERIC_TYPE)
spec.align_ = ALIGN_NUMERIC;
else
spec.fill_ = ' '; // Ignore '0' flag for non-numeric types.
}
// Parse length and convert the argument to the required type.
switch (*s++) {
case 'h':
if (*s == 'h')
ArgConverter<signed char>(arg, *++s).visit(arg);
else
ArgConverter<short>(arg, *s).visit(arg);
break;
case 'l':
if (*s == 'l')
ArgConverter<fmt::LongLong>(arg, *++s).visit(arg);
else
ArgConverter<long>(arg, *s).visit(arg);
break;
case 'j':
ArgConverter<intmax_t>(arg, *s).visit(arg);
break;
case 'z':
ArgConverter<std::size_t>(arg, *s).visit(arg);
break;
case 't':
ArgConverter<std::ptrdiff_t>(arg, *s).visit(arg);
break;
case 'L':
// printf produces garbage when 'L' is omitted for long double, no
// need to do the same.
break;
default:
--s;
ArgConverter<void>(arg, *s).visit(arg);
}
// Parse type.
if (!*s)
FMT_THROW(FormatError("invalid format string"));
spec.type_ = static_cast<char>(*s++);
if (arg.type <= Arg::LAST_INTEGER_TYPE) {
// Normalize type.
switch (spec.type_) {
case 'i': case 'u':
spec.type_ = 'd';
break;
case 'c':
// TODO: handle wchar_t
CharConverter(arg).visit(arg);
break;
}
}
start = s;
// Format argument.
internal::PrintfArgFormatter<Char>(writer, spec).visit(arg);
}
write(writer, start, s);
}
FMT_FUNC void fmt::report_system_error(
int error_code, fmt::StringRef message) FMT_NOEXCEPT {
// 'fmt::' is for bcc32.
fmt::report_error(internal::format_system_error, error_code, message);
}
#if FMT_USE_WINDOWS_H
FMT_FUNC void fmt::report_windows_error(
int error_code, fmt::StringRef message) FMT_NOEXCEPT {
// 'fmt::' is for bcc32.
fmt::report_error(internal::format_windows_error, error_code, message);
}
#endif
FMT_FUNC void fmt::print(std::FILE *f, CStringRef format_str, ArgList args) {
MemoryWriter w;
w.write(format_str, args);
std::fwrite(w.data(), 1, w.size(), f);
}
FMT_FUNC void fmt::print(CStringRef format_str, ArgList args) {
print(stdout, format_str, args);
}
FMT_FUNC void fmt::print_colored(Color c, CStringRef format, ArgList args) {
char escape[] = "\x1b[30m";
escape[3] = static_cast<char>('0' + c);
std::fputs(escape, stdout);
print(format, args);
std::fputs(RESET_COLOR, stdout);
}
FMT_FUNC int fmt::fprintf(std::FILE *f, CStringRef format, ArgList args) {
MemoryWriter w;
printf(w, format, args);
std::size_t size = w.size();
return std::fwrite(w.data(), 1, size, f) < size ? -1 : static_cast<int>(size);
}
#ifndef FMT_HEADER_ONLY
template struct fmt::internal::BasicData<void>;
// Explicit instantiations for char.
template void fmt::internal::FixedBuffer<char>::grow(std::size_t);
template void fmt::internal::ArgMap<char>::init(const fmt::ArgList &args);
template void fmt::internal::PrintfFormatter<char>::format(
BasicWriter<char> &writer, CStringRef format);
template int fmt::internal::CharTraits<char>::format_float(
char *buffer, std::size_t size, const char *format,
unsigned width, int precision, double value);
template int fmt::internal::CharTraits<char>::format_float(
char *buffer, std::size_t size, const char *format,
unsigned width, int precision, long double value);
// Explicit instantiations for wchar_t.
template void fmt::internal::FixedBuffer<wchar_t>::grow(std::size_t);
template void fmt::internal::ArgMap<wchar_t>::init(const fmt::ArgList &args);
template void fmt::internal::PrintfFormatter<wchar_t>::format(
BasicWriter<wchar_t> &writer, WCStringRef format);
template int fmt::internal::CharTraits<wchar_t>::format_float(
wchar_t *buffer, std::size_t size, const wchar_t *format,
unsigned width, int precision, double value);
template int fmt::internal::CharTraits<wchar_t>::format_float(
wchar_t *buffer, std::size_t size, const wchar_t *format,
unsigned width, int precision, long double value);
#endif // FMT_HEADER_ONLY
#ifdef _MSC_VER
# pragma warning(pop)
#endif
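To ground the implementation above, here is a hedged sketch of the fmt 3.x calling conventions this file backs; the strings and values are invented:
#include "format.h"
int main()
{
  fmt::MemoryWriter w;
  w.write("The answer is {}.", 42);      // formats into an in-memory buffer
  fmt::print("{}\n", w.str());           // prints: The answer is 42.
  fmt::print_colored(fmt::RED, "warning: {}\n", "low disk space");
  return 0;
}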

3834
diy/include/diy/fmt/format.h Normal file

File diff suppressed because it is too large

@ -0,0 +1,61 @@
/*
Formatting library for C++ - std::ostream support
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ostream.h"
namespace fmt {
namespace {
// Write the content of w to os.
void write(std::ostream &os, Writer &w) {
const char *data = w.data();
typedef internal::MakeUnsigned<std::streamsize>::Type UnsignedStreamSize;
UnsignedStreamSize size = w.size();
UnsignedStreamSize max_size =
internal::to_unsigned((std::numeric_limits<std::streamsize>::max)());
do {
UnsignedStreamSize n = size <= max_size ? size : max_size;
os.write(data, static_cast<std::streamsize>(n));
data += n;
size -= n;
} while (size != 0);
}
}
FMT_FUNC void print(std::ostream &os, CStringRef format_str, ArgList args) {
MemoryWriter w;
w.write(format_str, args);
write(os, w);
}
FMT_FUNC int fprintf(std::ostream &os, CStringRef format, ArgList args) {
MemoryWriter w;
printf(w, format, args);
write(os, w);
return static_cast<int>(w.size());
}
} // namespace fmt

@ -0,0 +1,133 @@
/*
Formatting library for C++ - std::ostream support
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FMT_OSTREAM_H_
#define FMT_OSTREAM_H_
#include "format.h"
#include <ostream>
namespace fmt {
namespace internal {
template <class Char>
class FormatBuf : public std::basic_streambuf<Char> {
private:
typedef typename std::basic_streambuf<Char>::int_type int_type;
typedef typename std::basic_streambuf<Char>::traits_type traits_type;
Buffer<Char> &buffer_;
Char *start_;
public:
FormatBuf(Buffer<Char> &buffer) : buffer_(buffer), start_(&buffer[0]) {
this->setp(start_, start_ + buffer_.capacity());
}
int_type overflow(int_type ch = traits_type::eof()) {
if (!traits_type::eq_int_type(ch, traits_type::eof())) {
size_t buf_size = size();
buffer_.resize(buf_size);
buffer_.reserve(buf_size * 2);
start_ = &buffer_[0];
start_[buf_size] = traits_type::to_char_type(ch);
this->setp(start_+ buf_size + 1, start_ + buf_size * 2);
}
return ch;
}
size_t size() const {
return to_unsigned(this->pptr() - start_);
}
};
Yes &convert(std::ostream &);
struct DummyStream : std::ostream {
DummyStream(); // Suppress a bogus warning in MSVC.
// Hide all operator<< overloads from std::ostream.
void operator<<(Null<>);
};
No &operator<<(std::ostream &, int);
template<typename T>
struct ConvertToIntImpl<T, true> {
// Convert to int only if T doesn't have an overloaded operator<<.
enum {
value = sizeof(convert(get<DummyStream>() << get<T>())) == sizeof(No)
};
};
} // namespace internal
// Formats a value.
template <typename Char, typename ArgFormatter_, typename T>
void format(BasicFormatter<Char, ArgFormatter_> &f,
const Char *&format_str, const T &value) {
internal::MemoryBuffer<Char, internal::INLINE_BUFFER_SIZE> buffer;
internal::FormatBuf<Char> format_buf(buffer);
std::basic_ostream<Char> output(&format_buf);
output << value;
BasicStringRef<Char> str(&buffer[0], format_buf.size());
typedef internal::MakeArg< BasicFormatter<Char> > MakeArg;
format_str = f.format(format_str, MakeArg(str));
}
/**
\rst
Prints formatted data to the stream *os*.
**Example**::
print(cerr, "Don't {}!", "panic");
\endrst
*/
FMT_API void print(std::ostream &os, CStringRef format_str, ArgList args);
FMT_VARIADIC(void, print, std::ostream &, CStringRef)
/**
\rst
Prints formatted data to the stream *os*.
**Example**::
fprintf(cerr, "Don't %s!", "panic");
\endrst
*/
FMT_API int fprintf(std::ostream &os, CStringRef format_str, ArgList args);
FMT_VARIADIC(int, fprintf, std::ostream &, CStringRef)
} // namespace fmt
#ifdef FMT_HEADER_ONLY
# include "ostream.cc"
#endif
#endif // FMT_OSTREAM_H_
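A short usage sketch for the ostream bridge declared above; the `Vec` type and values are invented for illustration:
#include "ostream.h"
#include <iostream>
struct Vec { double x, y; };
std::ostream& operator<<(std::ostream& os, const Vec& v)
{
  return os << '(' << v.x << ", " << v.y << ')';
}
int main()
{
  Vec v = { 1.0, 2.0 };
  fmt::print(std::cerr, "v = {}\n", v);  // any type with operator<< formats via {}
  return 0;
}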

153
diy/include/diy/grid.hpp Normal file

@ -0,0 +1,153 @@
#ifndef DIY_GRID_HPP
#define DIY_GRID_HPP
#include "point.hpp"
namespace diy
{
template<class C, unsigned D>
struct Grid;
template<class C, unsigned D>
struct GridRef
{
public:
typedef C Value;
typedef Point<int, D> Vertex;
typedef size_t Index;
public:
template<class Int>
GridRef(C* data, const Point<Int,D>& shape, bool c_order = true):
data_(data), shape_(shape), c_order_(c_order) { set_stride(); }
GridRef(Grid<C,D>& g):
data_(g.data()), shape_(g.shape()),
c_order_(g.c_order()) { set_stride(); }
template<class Int>
C operator()(const Point<Int, D>& v) const { return (*this)(index(v)); }
template<class Int>
C& operator()(const Point<Int, D>& v) { return (*this)(index(v)); }
C operator()(Index i) const { return data_[i]; }
C& operator()(Index i) { return data_[i]; }
const Vertex&
shape() const { return shape_; }
const C*
data() const { return data_; }
C* data() { return data_; }
// Set every element to the given value
GridRef& operator=(C value) { Index s = size(); for (Index i = 0; i < s; ++i) data_[i] = value; return *this; }
GridRef& operator/=(C value) { Index s = size(); for (Index i = 0; i < s; ++i) data_[i] /= value; return *this; }
Vertex vertex(Index idx) const { Vertex v; for (unsigned i = 0; i < D; ++i) { v[i] = idx / stride_[i]; idx %= stride_[i]; } return v; }
Index index(const Vertex& v) const { Index idx = 0; for (unsigned i = 0; i < D; ++i) { idx += ((Index) v[i]) * ((Index) stride_[i]); } return idx; }
Index size() const { return size(shape()); }
void swap(GridRef& other) { std::swap(data_, other.data_); std::swap(shape_, other.shape_); std::swap(stride_, other.stride_); std::swap(c_order_, other.c_order_); }
bool c_order() const { return c_order_; }
static constexpr
unsigned dimension() { return D; }
protected:
static Index
size(const Vertex& v) { Index res = 1; for (unsigned i = 0; i < D; ++i) res *= v[i]; return res; }
void set_stride()
{
Index cur = 1;
if (c_order_)
for (unsigned i = D; i > 0; --i) { stride_[i-1] = cur; cur *= shape_[i-1]; }
else
for (unsigned i = 0; i < D; ++i) { stride_[i] = cur; cur *= shape_[i]; }
}
void set_shape(const Vertex& v) { shape_ = v; set_stride(); }
void set_data(C* data) { data_ = data; }
void set_c_order(bool order) { c_order_ = order; }
private:
C* data_;
Vertex shape_;
Vertex stride_;
bool c_order_;
};
template<class C, unsigned D>
struct Grid: public GridRef<C,D>
{
public:
typedef GridRef<C,D> Parent;
typedef typename Parent::Value Value;
typedef typename Parent::Index Index;
typedef typename Parent::Vertex Vertex;
typedef Parent Reference;
template<class U>
struct rebind { typedef Grid<U,D> type; };
public:
Grid():
Parent(new C[0], Vertex::zero()) {}
template<class Int>
Grid(const Point<Int, D>& shape, bool c_order = true):
Parent(new C[size(shape)], shape, c_order)
{}
Grid(Grid&& g): Grid() { Parent::swap(g); }
Grid(const Parent& g):
Parent(new C[size(g.shape())], g.shape(),
g.c_order()) { copy_data(g.data()); }
template<class OtherGrid>
Grid(const OtherGrid& g):
Parent(new C[size(g.shape())],
g.shape(),
g.c_order()) { copy_data(g.data()); }
~Grid() { delete[] Parent::data(); }
template<class OC>
Grid& operator=(const GridRef<OC, D>& other)
{
delete[] Parent::data();
Parent::set_c_order(other.c_order()); // NB: order needs to be set before the shape, to set the stride correctly
Parent::set_shape(other.shape());
Index s = size(shape());
Parent::set_data(new C[s]);
copy_data(other.data());
return *this;
}
Grid& operator=(Grid&& g) { Parent::swap(g); return *this; }
using Parent::data;
using Parent::shape;
using Parent::operator();
using Parent::operator=;
using Parent::size;
private:
template<class OC>
void copy_data(const OC* data)
{
Index s = size(shape());
for (Index i = 0; i < s; ++i)
Parent::data()[i] = data[i];
}
};
}
#endif
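A small usage sketch for the grid above, using only members visible in this header (`Vertex::zero()`, element access through vertices, `size()`); the shape and values are invented, and `Point`'s `operator[]` is inferred from its use in `set_stride()`:
#include "grid.hpp"
int main()
{
  typedef diy::Grid<float, 2> Grid2D;
  Grid2D::Vertex shape = Grid2D::Vertex::zero();
  shape[0] = 3; shape[1] = 4;            // 3x4 grid, C order by default
  Grid2D g(shape);
  g = 0.f;                               // GridRef::operator=(C) fills every element
  g(Grid2D::Vertex::zero()) = 1.f;       // write through a vertex
  return (int) g.size();                 // 12
}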

@ -0,0 +1,396 @@
#ifndef DIY_IO_BLOCK_HPP
#define DIY_IO_BLOCK_HPP
#include <string>
#include <algorithm>
#include <stdexcept>
#include <unistd.h>
#include <sys/stat.h>
#include <dirent.h>
#include "../mpi.hpp"
#include "../assigner.hpp"
#include "../master.hpp"
#include "../storage.hpp"
#include "../log.hpp"
// Read and write collections of blocks using MPI-IO
namespace diy
{
namespace io
{
namespace detail
{
typedef mpi::io::offset offset_t;
struct GidOffsetCount
{
GidOffsetCount(): // need to initialize a vector of given size
gid(-1), offset(0), count(0) {}
GidOffsetCount(int gid_, offset_t offset_, offset_t count_):
gid(gid_), offset(offset_), count(count_) {}
bool operator<(const GidOffsetCount& other) const { return gid < other.gid; }
int gid;
offset_t offset;
offset_t count;
};
}
}
// Serialize GidOffsetCount explicitly, to avoid alignment and uninitialized data issues
// (to get identical output files given the same block input)
template<>
struct Serialization<io::detail::GidOffsetCount>
{
typedef io::detail::GidOffsetCount GidOffsetCount;
static void save(BinaryBuffer& bb, const GidOffsetCount& x)
{
diy::save(bb, x.gid);
diy::save(bb, x.offset);
diy::save(bb, x.count);
}
static void load(BinaryBuffer& bb, GidOffsetCount& x)
{
diy::load(bb, x.gid);
diy::load(bb, x.offset);
diy::load(bb, x.count);
}
};
namespace io
{
/**
* \ingroup IO
* \brief Write blocks to storage collectively in one shared file
*/
inline
void
write_blocks(const std::string& outfilename, //!< output file name
const mpi::communicator& comm, //!< communicator
Master& master, //!< master object
const MemoryBuffer& extra = MemoryBuffer(),//!< user-defined metadata for file header; meaningful only on rank == 0
Master::SaveBlock save = 0) //!< block save function in case different than or undefined in the master
{
if (!save) save = master.saver(); // save is likely to be different from master.save()
typedef detail::offset_t offset_t;
typedef detail::GidOffsetCount GidOffsetCount;
unsigned size = master.size(),
max_size, min_size;
mpi::all_reduce(comm, size, max_size, mpi::maximum<unsigned>());
mpi::all_reduce(comm, size, min_size, mpi::minimum<unsigned>());
// truncate the file
if (comm.rank() == 0)
truncate(outfilename.c_str(), 0);
mpi::io::file f(comm, outfilename, mpi::io::file::wronly | mpi::io::file::create);
offset_t start = 0, shift;
std::vector<GidOffsetCount> offset_counts;
unsigned i;
for (i = 0; i < max_size; ++i)
{
offset_t count = 0,
offset;
if (i < size)
{
// get the block from master and serialize it
const void* block = master.get(i);
MemoryBuffer bb;
LinkFactory::save(bb, master.link(i));
save(block, bb);
count = bb.buffer.size();
mpi::scan(comm, count, offset, std::plus<offset_t>());
offset += start - count;
mpi::all_reduce(comm, count, shift, std::plus<offset_t>());
start += shift;
if (i < min_size) // up to min_size, we can do collective IO
f.write_at_all(offset, bb.buffer);
else
f.write_at(offset, bb.buffer);
offset_counts.push_back(GidOffsetCount(master.gid(i), offset, count));
} else
{
// matching global operations
mpi::scan(comm, count, offset, std::plus<offset_t>());
mpi::all_reduce(comm, count, shift, std::plus<offset_t>());
// -1 indicates that there is no block written here from this rank
offset_counts.push_back(GidOffsetCount(-1, offset, count));
}
}
if (comm.rank() == 0)
{
// Round-about way of gathering a vector of vectors of GidOffsetCount, to avoid registering a new MPI datatype
std::vector< std::vector<char> > gathered_offset_count_buffers;
MemoryBuffer oc_buffer; diy::save(oc_buffer, offset_counts);
mpi::gather(comm, oc_buffer.buffer, gathered_offset_count_buffers, 0);
std::vector<GidOffsetCount> all_offset_counts;
for (unsigned i = 0; i < gathered_offset_count_buffers.size(); ++i)
{
MemoryBuffer oc_buffer; oc_buffer.buffer.swap(gathered_offset_count_buffers[i]);
std::vector<GidOffsetCount> offset_counts;
diy::load(oc_buffer, offset_counts);
for (unsigned j = 0; j < offset_counts.size(); ++j)
if (offset_counts[j].gid != -1)
all_offset_counts.push_back(offset_counts[j]);
}
std::sort(all_offset_counts.begin(), all_offset_counts.end()); // sorts by gid
MemoryBuffer bb;
diy::save(bb, all_offset_counts);
diy::save(bb, extra);
size_t footer_size = bb.size();
diy::save(bb, footer_size);
// find footer_offset as the max of (offset + count)
offset_t footer_offset = 0;
for (unsigned i = 0; i < all_offset_counts.size(); ++i)
{
offset_t end = all_offset_counts[i].offset + all_offset_counts[i].count;
if (end > footer_offset)
footer_offset = end;
}
f.write_at(footer_offset, bb.buffer);
} else
{
MemoryBuffer oc_buffer; diy::save(oc_buffer, offset_counts);
mpi::gather(comm, oc_buffer.buffer, 0);
}
}
/**
* \ingroup IO
* \brief Read blocks from storage collectively from one shared file
*/
inline
void
read_blocks(const std::string& infilename, //!< input file name
const mpi::communicator& comm, //!< communicator
Assigner& assigner, //!< assigner object
Master& master, //!< master object
MemoryBuffer& extra, //!< user-defined metadata in file header
Master::LoadBlock load = 0) //!< load block function in case different from or undefined in the master
{
if (!load) load = master.loader(); // load is likely to be different from master.load()
typedef detail::offset_t offset_t;
typedef detail::GidOffsetCount GidOffsetCount;
mpi::io::file f(comm, infilename, mpi::io::file::rdonly);
offset_t footer_offset = f.size() - sizeof(size_t);
size_t footer_size;
// Read the size
f.read_at_all(footer_offset, (char*) &footer_size, sizeof(footer_size));
// Read all_offset_counts
footer_offset -= footer_size;
MemoryBuffer footer;
footer.buffer.resize(footer_size);
f.read_at_all(footer_offset, footer.buffer);
std::vector<GidOffsetCount> all_offset_counts;
diy::load(footer, all_offset_counts);
diy::load(footer, extra);
extra.reset();
// Get local gids from assigner
size_t size = all_offset_counts.size();
assigner.set_nblocks(size);
std::vector<int> gids;
assigner.local_gids(comm.rank(), gids);
for (unsigned i = 0; i < gids.size(); ++i)
{
if (gids[i] != all_offset_counts[gids[i]].gid)
get_logger()->warn("gids don't match in diy::io::read_blocks(), {} vs {}",
gids[i], all_offset_counts[gids[i]].gid);
offset_t offset = all_offset_counts[gids[i]].offset,
count = all_offset_counts[gids[i]].count;
MemoryBuffer bb;
bb.buffer.resize(count);
f.read_at(offset, bb.buffer);
Link* l = LinkFactory::load(bb);
l->fix(assigner);
void* b = master.create();
load(b, bb);
master.add(gids[i], b, l);
}
}
// Functions without the extra buffer, for compatibility with the old code
inline
void
write_blocks(const std::string& outfilename,
const mpi::communicator& comm,
Master& master,
Master::SaveBlock save)
{
MemoryBuffer extra;
write_blocks(outfilename, comm, master, extra, save);
}
inline
void
read_blocks(const std::string& infilename,
const mpi::communicator& comm,
Assigner& assigner,
Master& master,
Master::LoadBlock load = 0)
{
MemoryBuffer extra; // dummy
read_blocks(infilename, comm, assigner, master, extra, load);
}
namespace split
{
/**
* \ingroup IO
* \brief Write blocks to storage independently in one file per process
*/
inline
void
write_blocks(const std::string& outfilename, //!< output file name
const mpi::communicator& comm, //!< communicator
Master& master, //!< master object
const MemoryBuffer& extra = MemoryBuffer(),//!< user-defined metadata for file header; meaningful only on rank == 0
Master::SaveBlock save = 0) //!< block save function in case different than or undefined in master
{
if (!save) save = master.saver(); // save is likely to be different from master.save()
bool proceed = false;
size_t size = 0;
if (comm.rank() == 0)
{
struct stat s;
if (stat(outfilename.c_str(), &s) == 0)
{
if (S_ISDIR(s.st_mode))
proceed = true;
} else if (mkdir(outfilename.c_str(), 0755) == 0)
proceed = true;
mpi::broadcast(comm, proceed, 0);
mpi::reduce(comm, (size_t) master.size(), size, 0, std::plus<size_t>());
} else
{
mpi::broadcast(comm, proceed, 0);
mpi::reduce(comm, (size_t) master.size(), 0, std::plus<size_t>());
}
if (!proceed)
throw std::runtime_error("Cannot access or create directory: " + outfilename);
for (int i = 0; i < (int)master.size(); ++i)
{
const void* block = master.get(i);
std::string filename = fmt::format("{}/{}", outfilename, master.gid(i));
::diy::detail::FileBuffer bb(fopen(filename.c_str(), "w"));
LinkFactory::save(bb, master.link(i));
save(block, bb);
fclose(bb.file);
}
if (comm.rank() == 0)
{
// save the extra buffer
std::string filename = outfilename + "/extra";
::diy::detail::FileBuffer bb(fopen(filename.c_str(), "w"));
::diy::save(bb, size);
::diy::save(bb, extra);
fclose(bb.file);
}
}
/**
* \ingroup IO
* \brief Read blocks from storage independently from one file per process
*/
inline
void
read_blocks(const std::string& infilename, //!< input file name
const mpi::communicator& comm, //!< communicator
Assigner& assigner, //!< assigner object
Master& master, //!< master object
MemoryBuffer& extra, //!< user-defined metadata in file header
Master::LoadBlock load = 0) //!< block load function in case different than or undefined in master
{
if (!load) load = master.loader(); // load is likely to be different from master.load()
// load the extra buffer and size
size_t size;
std::string filename = infilename + "/extra";
::diy::detail::FileBuffer bb(fopen(filename.c_str(), "r"));
::diy::load(bb, size);
::diy::load(bb, extra);
extra.reset();
fclose(bb.file);
// Get local gids from assigner
assigner.set_nblocks(size);
std::vector<int> gids;
assigner.local_gids(comm.rank(), gids);
// Read our blocks;
for (unsigned i = 0; i < gids.size(); ++i)
{
std::string filename = fmt::format("{}/{}", infilename, gids[i]);
::diy::detail::FileBuffer bb(fopen(filename.c_str(), "r"));
Link* l = LinkFactory::load(bb);
l->fix(assigner);
void* b = master.create();
load(b, bb);
master.add(gids[i], b, l);
fclose(bb.file);
}
}
// Functions without the extra buffer, for compatibility with the old code
inline
void
write_blocks(const std::string& outfilename,
const mpi::communicator& comm,
Master& master,
Master::SaveBlock save)
{
MemoryBuffer extra;
write_blocks(outfilename, comm, master, extra, save);
}
inline
void
read_blocks(const std::string& infilename,
const mpi::communicator& comm,
Assigner& assigner,
Master& master,
Master::LoadBlock load = 0)
{
MemoryBuffer extra; // dummy
read_blocks(infilename, comm, assigner, master, extra, load);
}
} // split
} // io
} // diy
#endif
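An end-to-end sketch of the collective block I/O above. `Block` and its save/load callbacks are placeholders, and the `Master`/`ContiguousAssigner` setup is left as comments because their constructors are not part of this diff; the compatibility overloads without the extra buffer are used:
#include "io/block.hpp"
struct Block { std::vector<float> values; };
void save_block(const void* b, diy::BinaryBuffer& bb)
{ diy::save(bb, static_cast<const Block*>(b)->values); }
void load_block(void* b, diy::BinaryBuffer& bb)
{ diy::load(bb, static_cast<Block*>(b)->values); }
// Given diy::mpi::communicator world, a populated diy::Master master, and
// a diy::ContiguousAssigner assigner:
//
//   diy::io::write_blocks("blocks.out", world, master, &save_block);
//   diy::io::read_blocks ("blocks.out", world, assigner, master, &load_block);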

171
diy/include/diy/io/bov.hpp Normal file

@ -0,0 +1,171 @@
#ifndef DIY_IO_BOV_HPP
#define DIY_IO_BOV_HPP
#include <vector>
#include <algorithm>
#include <numeric>
#include "../types.hpp"
#include "../mpi.hpp"
namespace diy
{
namespace io
{
// Reads and writes subsets of a block of values into specified block bounds
class BOV
{
public:
typedef std::vector<int> Shape;
public:
BOV(mpi::io::file& f):
f_(f), offset_(0) {}
template<class S>
BOV(mpi::io::file& f,
const S& shape = S(),
mpi::io::offset offset = 0):
f_(f), offset_(offset) { set_shape(shape); }
void set_offset(mpi::io::offset offset) { offset_ = offset; }
template<class S>
void set_shape(const S& shape)
{
shape_.clear();
stride_.clear();
for (unsigned i = 0; i < shape.size(); ++i)
{
shape_.push_back(shape[i]);
stride_.push_back(1);
}
for (int i = shape_.size() - 2; i >= 0; --i)
stride_[i] = stride_[i+1] * shape_[i+1];
}
const Shape& shape() const { return shape_; }
template<class T>
void read(const DiscreteBounds& bounds, T* buffer, bool collective = false, int chunk = 1) const;
template<class T>
void write(const DiscreteBounds& bounds, const T* buffer, bool collective = false, int chunk = 1);
template<class T>
void write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, bool collective = false, int chunk = 1);
protected:
mpi::io::file& file() { return f_; }
private:
mpi::io::file& f_;
Shape shape_;
std::vector<size_t> stride_;
size_t offset_;
};
}
}
template<class T>
void
diy::io::BOV::
read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const
{
int dim = shape_.size();
int total = 1;
std::vector<int> subsizes;
for (int i = 0; i < dim; ++i)
{
subsizes.push_back(bounds.max[i] - bounds.min[i] + 1);
total *= subsizes.back();
}
MPI_Datatype T_type;
if (chunk == 1)
T_type = mpi::detail::get_mpi_datatype<T>();
else
{
// create an MPI struct of size chunk to read the data in those chunks
// (this lets us work around MPI-IO weirdness where crucial quantities
// are ints, which are too narrow a type)
int array_of_blocklengths[] = { chunk };
MPI_Aint array_of_displacements[] = { 0 };
MPI_Datatype array_of_types[] = { mpi::detail::get_mpi_datatype<T>() };
MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
MPI_Type_commit(&T_type);
}
MPI_Datatype fileblk;
MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
MPI_Type_commit(&fileblk);
MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
mpi::status s;
if (!collective)
MPI_File_read(f_.handle(), buffer, total, T_type, &s.s);
else
MPI_File_read_all(f_.handle(), buffer, total, T_type, &s.s);
if (chunk != 1)
MPI_Type_free(&T_type);
MPI_Type_free(&fileblk);
}
template<class T>
void
diy::io::BOV::
write(const DiscreteBounds& bounds, const T* buffer, bool collective, int chunk)
{
write(bounds, buffer, bounds, collective, chunk);
}
template<class T>
void
diy::io::BOV::
write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, bool collective, int chunk)
{
int dim = shape_.size();
std::vector<int> subsizes;
std::vector<int> buffer_shape, buffer_start;
for (int i = 0; i < dim; ++i)
{
buffer_shape.push_back(bounds.max[i] - bounds.min[i] + 1);
buffer_start.push_back(core.min[i] - bounds.min[i]);
subsizes.push_back(core.max[i] - core.min[i] + 1);
}
MPI_Datatype T_type;
if (chunk == 1)
T_type = mpi::detail::get_mpi_datatype<T>();
else
{
// assume T is a binary block and create an MPI struct of appropriate size
int array_of_blocklengths[] = { chunk };
MPI_Aint array_of_displacements[] = { 0 };
MPI_Datatype array_of_types[] = { mpi::detail::get_mpi_datatype<T>() };
MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
MPI_Type_commit(&T_type);
}
MPI_Datatype fileblk, subbuffer;
MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
MPI_Type_create_subarray(dim, (int*) &buffer_shape[0], &subsizes[0], (int*) &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer);
MPI_Type_commit(&fileblk);
MPI_Type_commit(&subbuffer);
MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
mpi::status s;
if (!collective)
MPI_File_write(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
else
MPI_File_write_all(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
if (chunk != 1)
MPI_Type_free(&T_type);
MPI_Type_free(&fileblk);
MPI_Type_free(&subbuffer);
}
#endif
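A minimal read sketch of the BOV interface, assuming the vendored headers are reachable as <diy/...>, that data.bin is a hypothetical raw 4x4 row-major float file, and that DiscreteBounds is the POD from types.hpp:

#include <diy/mpi.hpp>       // include paths are assumptions
#include <diy/io/bov.hpp>
#include <vector>

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);     // MPI_Init / MPI_Finalize
  diy::mpi::communicator world;
  diy::mpi::io::file in(world, "data.bin", diy::mpi::io::file::rdonly);

  std::vector<int> shape(2, 4);              // 4x4 global array
  diy::io::BOV reader(in, shape);

  diy::DiscreteBounds box;                   // the 2x2 upper-left corner
  box.min[0] = box.min[1] = 0;
  box.max[0] = box.max[1] = 1;

  std::vector<float> values(4);
  reader.read(box, &values[0], true);        // collective read
  return 0;
}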

213
diy/include/diy/io/numpy.hpp Normal file

@ -0,0 +1,213 @@
#ifndef DIY_IO_NMPY_HPP
#define DIY_IO_NMPY_HPP
#include <sstream>
#include <complex>
#include <stdexcept>
#include "../serialization.hpp"
#include "bov.hpp"
namespace diy
{
namespace io
{
class NumPy: public BOV
{
public:
NumPy(mpi::io::file& f):
BOV(f) {}
unsigned word_size() const { return word_size_; }
unsigned read_header()
{
BOV::Shape shape;
bool fortran;
size_t offset = parse_npy_header(shape, fortran);
if (fortran)
throw std::runtime_error("diy::io::NumPy cannot read data in fortran order");
BOV::set_offset(offset);
BOV::set_shape(shape);
return word_size_;
}
template<class T>
void write_header(int dim, const DiscreteBounds& bounds);
template<class T, class S>
void write_header(const S& shape);
private:
inline size_t parse_npy_header(BOV::Shape& shape, bool& fortran_order);
void save(diy::BinaryBuffer& bb, const std::string& s) { bb.save_binary(s.c_str(), s.size()); }
template<class T>
inline void convert_and_save(diy::BinaryBuffer& bb, const T& x)
{
std::ostringstream oss;
oss << x;
save(bb, oss.str());
}
private:
unsigned word_size_;
};
namespace detail
{
inline char big_endian();
template<class T>
char map_numpy_type();
}
}
}
// Modified from: https://github.com/rogersce/cnpy
// Copyright (C) 2011 Carl Rogers
// Released under MIT License
// license available at http://www.opensource.org/licenses/mit-license.php
size_t
diy::io::NumPy::
parse_npy_header(BOV::Shape& shape, bool& fortran_order)
{
char buffer[256];
file().read_at_all(0, buffer, 256);
std::string header(buffer, buffer + 256);
size_t nl = header.find('\n');
if (nl == std::string::npos)
throw std::runtime_error("parse_npy_header: failed to read the header");
header = header.substr(11, nl - 11 + 1);
size_t header_size = nl + 1;
int loc1, loc2;
//fortran order
loc1 = header.find("fortran_order")+16;
fortran_order = (header.substr(loc1,4) == "True" ? true : false);
//shape
unsigned ndims;
loc1 = header.find("(");
loc2 = header.find(")");
std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
if(str_shape[str_shape.size()-1] == ',') ndims = 1;
else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
shape.resize(ndims);
for(unsigned int i = 0;i < ndims;i++) {
loc1 = str_shape.find(",");
shape[i] = atoi(str_shape.substr(0,loc1).c_str());
str_shape = str_shape.substr(loc1+1);
}
//endian, word size, data type
//byte order code: '|' stands for not applicable
//(not sure when this applies except for byte arrays)
loc1 = header.find("descr")+9;
//bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
//assert(littleEndian);
//char type = header[loc1+1];
//assert(type == map_type(T));
std::string str_ws = header.substr(loc1+2);
loc2 = str_ws.find("'");
word_size_ = atoi(str_ws.substr(0,loc2).c_str());
return header_size;
}
template<class T>
void
diy::io::NumPy::
write_header(int dim, const DiscreteBounds& bounds)
{
std::vector<int> shape;
for (int i = 0; i < dim; ++i)
shape.push_back(bounds.max[i] - bounds.min[i] + 1);
write_header< T, std::vector<int> >(shape);
}
template<class T, class S>
void
diy::io::NumPy::
write_header(const S& shape)
{
BOV::set_shape(shape);
diy::MemoryBuffer dict;
save(dict, "{'descr': '");
diy::save(dict, detail::big_endian());
diy::save(dict, detail::map_numpy_type<T>());
convert_and_save(dict, sizeof(T));
save(dict, "', 'fortran_order': False, 'shape': (");
convert_and_save(dict, shape[0]);
for (int i = 1; i < (int) shape.size(); i++)
{
save(dict, ", ");
convert_and_save(dict, shape[i]);
}
if(shape.size() == 1) save(dict, ",");
save(dict, "), }");
//pad with spaces so that preamble+dict is a multiple of 16 bytes. preamble is 10 bytes. dict needs to end with \n
int remainder = 16 - (10 + dict.position) % 16;
for (int i = 0; i < remainder - 1; ++i)
diy::save(dict, ' ');
diy::save(dict, '\n');
diy::MemoryBuffer header;
diy::save(header, (char) 0x93);
save(header, "NUMPY");
diy::save(header, (char) 0x01); // major version of numpy format
diy::save(header, (char) 0x00); // minor version of numpy format
diy::save(header, (unsigned short) dict.position);
header.save_binary(&dict.buffer[0], dict.buffer.size());
BOV::set_offset(header.position);
if (file().comm().rank() == 0)
file().write_at(0, &header.buffer[0], header.buffer.size());
}
char
diy::io::detail::big_endian()
{
unsigned char x[] = {1,0};
void* x_void = x;
short y = *static_cast<short*>(x_void);
return y == 1 ? '<' : '>';
}
namespace diy
{
namespace io
{
namespace detail
{
template<> inline char map_numpy_type<float>() { return 'f'; }
template<> inline char map_numpy_type<double>() { return 'f'; }
template<> inline char map_numpy_type<long double>() { return 'f'; }
template<> inline char map_numpy_type<int>() { return 'i'; }
template<> inline char map_numpy_type<char>() { return 'i'; }
template<> inline char map_numpy_type<short>() { return 'i'; }
template<> inline char map_numpy_type<long>() { return 'i'; }
template<> inline char map_numpy_type<long long>() { return 'i'; }
template<> inline char map_numpy_type<unsigned int>() { return 'u'; }
template<> inline char map_numpy_type<unsigned char>() { return 'u'; }
template<> inline char map_numpy_type<unsigned short>() { return 'u'; }
template<> inline char map_numpy_type<unsigned long>() { return 'u'; }
template<> inline char map_numpy_type<unsigned long long>() { return 'u'; }
template<> inline char map_numpy_type<bool>() { return 'b'; }
template<> inline char map_numpy_type< std::complex<float> >() { return 'c'; }
template<> inline char map_numpy_type< std::complex<double> >() { return 'c'; }
template<> inline char map_numpy_type< std::complex<long double> >() { return 'c'; }
}
}
}
#endif
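A write-side sketch; the file name out.npy and the header path are assumptions. write_header() has rank 0 emit the .npy preamble and positions the BOV offset past it, after which plain BOV writes land in the data section:

#include <diy/mpi.hpp>
#include <diy/io/numpy.hpp>   // header path is an assumption
#include <vector>

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;
  diy::mpi::io::file out(world, "out.npy",
                         diy::mpi::io::file::wronly | diy::mpi::io::file::create);
  diy::io::NumPy writer(out);

  std::vector<int> shape(1, 8);            // 1-D array of 8 doubles
  writer.write_header<double>(shape);      // rank 0 writes the preamble

  if (world.rank() == 0)
  {
    diy::DiscreteBounds box;
    box.min[0] = 0; box.max[0] = 7;
    std::vector<double> values(8, 1.0);
    writer.write(box, &values[0]);         // independent write of the data
  }
  return 0;
}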

219
diy/include/diy/link.hpp Normal file

@ -0,0 +1,219 @@
#ifndef DIY_COVER_HPP
#define DIY_COVER_HPP
#include <vector>
#include <map>
#include <algorithm>
#include "types.hpp"
#include "serialization.hpp"
#include "assigner.hpp"
namespace diy
{
// Local view of a distributed representation of a cover, a completely unstructured link
class Link
{
public:
virtual ~Link() {} // need to be able to delete derived classes
int size() const { return neighbors_.size(); }
inline
int size_unique() const;
BlockID target(int i) const { return neighbors_[i]; }
BlockID& target(int i) { return neighbors_[i]; }
inline
int find(int gid) const;
void add_neighbor(const BlockID& block) { neighbors_.push_back(block); }
void fix(const Assigner& assigner) { for (unsigned i = 0; i < neighbors_.size(); ++i) { neighbors_[i].proc = assigner.rank(neighbors_[i].gid); } }
void swap(Link& other) { neighbors_.swap(other.neighbors_); }
virtual void save(BinaryBuffer& bb) const { diy::save(bb, neighbors_); }
virtual void load(BinaryBuffer& bb) { diy::load(bb, neighbors_); }
virtual size_t id() const { return 0; }
private:
std::vector<BlockID> neighbors_;
};
template<class Bounds_>
class RegularLink;
typedef RegularLink<DiscreteBounds> RegularGridLink;
typedef RegularLink<ContinuousBounds> RegularContinuousLink;
// Selector between regular discrete and continuous links given bounds type
template<class Bounds_>
struct RegularLinkSelector;
template<>
struct RegularLinkSelector<DiscreteBounds>
{
typedef RegularGridLink type;
static const size_t id = 1;
};
template<>
struct RegularLinkSelector<ContinuousBounds>
{
typedef RegularContinuousLink type;
static const size_t id = 2;
};
// for a regular decomposition, it makes sense to address the neighbors by direction
// and store local and neighbor bounds
template<class Bounds_>
class RegularLink: public Link
{
public:
typedef Bounds_ Bounds;
typedef std::map<Direction, int> DirMap;
typedef std::vector<Direction> DirVec;
public:
RegularLink(int dim, const Bounds& core, const Bounds& bounds):
dim_(dim), core_(core), bounds_(bounds) {}
// dimension
int dimension() const { return dim_; }
// direction
int direction(Direction dir) const; // convert direction to a neighbor (-1 if no neighbor)
Direction direction(int i) const { return dir_vec_[i]; }
void add_direction(Direction dir) { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); }
// wrap
void add_wrap(Direction dir) { wrap_.push_back(dir); }
Direction wrap(int i) const { return wrap_[i]; }
Direction& wrap(int i) { return wrap_[i]; }
// bounds
const Bounds& core() const { return core_; }
Bounds& core() { return core_; }
const Bounds& bounds() const { return bounds_; }
Bounds& bounds() { return bounds_; }
const Bounds& bounds(int i) const { return nbr_bounds_[i]; }
void add_bounds(const Bounds& bounds) { nbr_bounds_.push_back(bounds); }
void swap(RegularLink& other) { Link::swap(other); dir_map_.swap(other.dir_map_); dir_vec_.swap(other.dir_vec_); nbr_bounds_.swap(other.nbr_bounds_); std::swap(dim_, other.dim_); wrap_.swap(other.wrap_); std::swap(core_, other.core_); std::swap(bounds_, other.bounds_); }
void save(BinaryBuffer& bb) const
{
Link::save(bb);
diy::save(bb, dim_);
diy::save(bb, dir_map_);
diy::save(bb, dir_vec_);
diy::save(bb, core_);
diy::save(bb, bounds_);
diy::save(bb, nbr_bounds_);
diy::save(bb, wrap_);
}
void load(BinaryBuffer& bb)
{
Link::load(bb);
diy::load(bb, dim_);
diy::load(bb, dir_map_);
diy::load(bb, dir_vec_);
diy::load(bb, core_);
diy::load(bb, bounds_);
diy::load(bb, nbr_bounds_);
diy::load(bb, wrap_);
}
virtual size_t id() const { return RegularLinkSelector<Bounds>::id; }
private:
int dim_;
DirMap dir_map_;
DirVec dir_vec_;
Bounds core_;
Bounds bounds_;
std::vector<Bounds> nbr_bounds_;
std::vector<Direction> wrap_;
};
// Other cover candidates: KDTreeLink, AMRGridLink
struct LinkFactory
{
public:
static Link* create(size_t id)
{
// not pretty, but will do for now
if (id == 0)
return new Link;
else if (id == 1)
return new RegularGridLink(0, DiscreteBounds(), DiscreteBounds());
else if (id == 2)
return new RegularContinuousLink(0, ContinuousBounds(), ContinuousBounds());
else
return 0;
}
inline static void save(BinaryBuffer& bb, const Link* l);
inline static Link* load(BinaryBuffer& bb);
};
}
void
diy::LinkFactory::
save(BinaryBuffer& bb, const Link* l)
{
diy::save(bb, l->id());
l->save(bb);
}
diy::Link*
diy::LinkFactory::
load(BinaryBuffer& bb)
{
size_t id;
diy::load(bb, id);
Link* l = create(id);
l->load(bb);
return l;
}
int
diy::Link::
find(int gid) const
{
for (unsigned i = 0; i < (unsigned)size(); ++i)
{
if (target(i).gid == gid)
return i;
}
return -1;
}
int
diy::Link::
size_unique() const
{
std::vector<BlockID> tmp(neighbors_.begin(), neighbors_.end());
std::sort(tmp.begin(), tmp.end());
return std::unique(tmp.begin(), tmp.end()) - tmp.begin();
}
template<class Bounds>
int
diy::RegularLink<Bounds>::
direction(Direction dir) const
{
DirMap::const_iterator it = dir_map_.find(dir);
if (it == dir_map_.end())
return -1;
else
return it->second;
}
#endif
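A small self-contained sketch of the Link API and the factory round trip (include path is an assumption); LinkFactory::save records the id so load can reconstruct the right derived type:

#include <diy/link.hpp>   // include path is an assumption
#include <iostream>

int main()
{
  diy::Link link;
  diy::BlockID a = { 1, 0 };    // gid 1 on rank 0
  diy::BlockID b = { 2, 0 };
  link.add_neighbor(a);
  link.add_neighbor(b);
  link.add_neighbor(a);         // deliberate duplicate

  std::cout << link.size()        << " "    // 3
            << link.size_unique() << " "    // 2
            << link.find(2)       << "\n";  // index of gid 2, here 1

  diy::MemoryBuffer bb;                     // polymorphic round trip
  diy::LinkFactory::save(bb, &link);
  bb.position = 0;                          // rewind before loading
  diy::Link* copy = diy::LinkFactory::load(bb);
  delete copy;
  return 0;
}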

103
diy/include/diy/log.hpp Normal file

@ -0,0 +1,103 @@
#ifndef DIY_LOG_HPP
#define DIY_LOG_HPP
#ifndef DIY_USE_SPDLOG
#include <memory>
#include "fmt/format.h"
#include "fmt/ostream.h"
namespace diy
{
namespace spd
{
struct logger
{
// logger.info(cppformat_string, arg1, arg2, arg3, ...) call style
template <typename... Args> void trace(const char* fmt, const Args&... args) {}
template <typename... Args> void debug(const char* fmt, const Args&... args) {}
template <typename... Args> void info(const char* fmt, const Args&... args) {}
template <typename... Args> void warn(const char* fmt, const Args&... args) {}
template <typename... Args> void error(const char* fmt, const Args&... args) {}
template <typename... Args> void critical(const char* fmt, const Args&... args) {}
};
}
inline
std::shared_ptr<spd::logger>
get_logger()
{
return std::make_shared<spd::logger>();
}
inline
std::shared_ptr<spd::logger>
create_logger(std::string)
{
return std::make_shared<spd::logger>();
}
template<class... Args>
std::shared_ptr<spd::logger>
set_logger(Args... args)
{
return std::make_shared<spd::logger>();
}
} // diy
#else // DIY_USE_SPDLOG
#include <string>
#include <spdlog/spdlog.h>
#include <spdlog/sinks/null_sink.h>
#include <spdlog/fmt/bundled/format.h>
#include <spdlog/fmt/bundled/ostream.h>
namespace diy
{
namespace spd = ::spdlog;
inline
std::shared_ptr<spd::logger>
get_logger()
{
auto log = spd::get("diy");
if (!log)
{
auto null_sink = std::make_shared<spd::sinks::null_sink_mt> ();
log = std::make_shared<spd::logger>("null_logger", null_sink);
}
return log;
}
inline
std::shared_ptr<spd::logger>
create_logger(std::string log_level)
{
auto log = spd::stderr_logger_mt("diy");
int lvl;
for (lvl = spd::level::trace; lvl < spd::level::off; ++lvl)
if (spd::level::level_names[lvl] == log_level)
break;
log->set_level(static_cast<spd::level::level_enum>(lvl));
return log;
}
template<class... Args>
std::shared_ptr<spd::logger>
set_logger(Args... args)
{
auto log = std::make_shared<spdlog::logger>("diy", args...);
return log;
}
} // diy
#endif
#endif // DIY_LOG_HPP
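Both branches expose the same call style, so a quick sketch works either way; with DIY_USE_SPDLOG undefined the stub compiles the calls away (include path is an assumption):

#include <diy/log.hpp>   // include path is an assumption

int main()
{
  std::shared_ptr<diy::spd::logger> log = diy::get_logger();
  log->info("processed {} blocks in {} rounds", 64, 3);   // fmt-style placeholders
  log->warn("queue size {}", 128);
  return 0;
}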

1205
diy/include/diy/master.hpp Normal file

File diff suppressed because it is too large

32
diy/include/diy/mpi.hpp Normal file

@ -0,0 +1,32 @@
#ifndef DIY_MPI_HPP
#define DIY_MPI_HPP
#include <mpi.h>
#include "mpi/constants.hpp"
#include "mpi/datatypes.hpp"
#include "mpi/optional.hpp"
#include "mpi/status.hpp"
#include "mpi/request.hpp"
#include "mpi/point-to-point.hpp"
#include "mpi/communicator.hpp"
#include "mpi/collectives.hpp"
#include "mpi/io.hpp"
namespace diy
{
namespace mpi
{
//! \ingroup MPI
struct environment
{
environment() { int argc = 0; char** argv = 0; MPI_Init(&argc, &argv); } // initialize argv before handing it to MPI_Init
environment(int argc, char* argv[]) { MPI_Init(&argc, &argv); }
~environment() { MPI_Finalize(); }
};
}
}
#endif
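The environment struct reduces the canonical MPI skeleton to two declarations; a minimal sketch (include path is an assumption):

#include <diy/mpi.hpp>   // include path is an assumption
#include <iostream>

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);   // MPI_Init here, MPI_Finalize in dtor
  diy::mpi::communicator world;            // defaults to MPI_COMM_WORLD
  std::cout << "rank " << world.rank() << " of " << world.size() << "\n";
  world.barrier();
  return 0;
}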

328
diy/include/diy/mpi/collectives.hpp Normal file

@ -0,0 +1,328 @@
#include <vector>
#include "operations.hpp"
namespace diy
{
namespace mpi
{
//!\addtogroup MPI
//!@{
template<class T, class Op>
struct Collectives
{
typedef detail::mpi_datatype<T> Datatype;
static void broadcast(const communicator& comm, T& x, int root)
{
MPI_Bcast(Datatype::address(x),
Datatype::count(x),
Datatype::datatype(), root, comm);
}
static void broadcast(const communicator& comm, std::vector<T>& x, int root)
{
size_t sz = x.size();
Collectives<size_t, void*>::broadcast(comm, sz, root);
if (comm.rank() != root)
x.resize(sz);
MPI_Bcast(Datatype::address(x[0]),
x.size(),
Datatype::datatype(), root, comm);
}
static request ibroadcast(const communicator& comm, T& x, int root)
{
request r;
MPI_Ibcast(Datatype::address(x),
Datatype::count(x),
Datatype::datatype(), root, comm, &r.r);
return r;
}
static void gather(const communicator& comm, const T& in, std::vector<T>& out, int root)
{
size_t s = comm.size();
s *= Datatype::count(in);
out.resize(s);
MPI_Gather(Datatype::address(const_cast<T&>(in)),
Datatype::count(in),
Datatype::datatype(),
Datatype::address(out[0]),
Datatype::count(in),
Datatype::datatype(),
root, comm);
}
static void gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out, int root)
{
std::vector<int> counts(comm.size());
Collectives<int,void*>::gather(comm, (int) in.size(), counts, root);
std::vector<int> offsets(comm.size(), 0);
for (unsigned i = 1; i < offsets.size(); ++i)
offsets[i] = offsets[i-1] + counts[i-1];
std::vector<T> buffer(offsets.back() + counts.back());
MPI_Gatherv(Datatype::address(const_cast<T&>(in[0])),
in.size(),
Datatype::datatype(),
Datatype::address(buffer[0]),
&counts[0],
&offsets[0],
Datatype::datatype(),
root, comm);
out.resize(comm.size());
size_t cur = 0;
for (unsigned i = 0; i < (unsigned)comm.size(); ++i)
{
out[i].reserve(counts[i]);
for (unsigned j = 0; j < (unsigned)counts[i]; ++j)
out[i].push_back(buffer[cur++]);
}
}
static void gather(const communicator& comm, const T& in, int root)
{
MPI_Gather(Datatype::address(const_cast<T&>(in)),
Datatype::count(in),
Datatype::datatype(),
Datatype::address(const_cast<T&>(in)),
Datatype::count(in),
Datatype::datatype(),
root, comm);
}
static void gather(const communicator& comm, const std::vector<T>& in, int root)
{
Collectives<int,void*>::gather(comm, (int) in.size(), root);
MPI_Gatherv(Datatype::address(const_cast<T&>(in[0])),
in.size(),
Datatype::datatype(),
0, 0, 0,
Datatype::datatype(),
root, comm);
}
static void all_gather(const communicator& comm, const T& in, std::vector<T>& out)
{
size_t s = comm.size();
s *= Datatype::count(in);
out.resize(s);
MPI_Allgather(Datatype::address(const_cast<T&>(in)),
Datatype::count(in),
Datatype::datatype(),
Datatype::address(out[0]),
Datatype::count(in),
Datatype::datatype(),
comm);
}
static void all_gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out)
{
std::vector<int> counts(comm.size());
Collectives<int,void*>::all_gather(comm, (int) in.size(), counts);
std::vector<int> offsets(comm.size(), 0);
for (unsigned i = 1; i < offsets.size(); ++i)
offsets[i] = offsets[i-1] + counts[i-1];
std::vector<T> buffer(offsets.back() + counts.back());
MPI_Allgatherv(Datatype::address(const_cast<T&>(in[0])),
in.size(),
Datatype::datatype(),
Datatype::address(buffer[0]),
&counts[0],
&offsets[0],
Datatype::datatype(),
comm);
out.resize(comm.size());
size_t cur = 0;
for (int i = 0; i < comm.size(); ++i)
{
out[i].reserve(counts[i]);
for (int j = 0; j < counts[i]; ++j)
out[i].push_back(buffer[cur++]);
}
}
static void reduce(const communicator& comm, const T& in, T& out, int root, const Op& op)
{
MPI_Reduce(Datatype::address(const_cast<T&>(in)),
Datatype::address(out),
Datatype::count(in),
Datatype::datatype(),
detail::mpi_op<Op>::get(op),
root, comm);
}
static void reduce(const communicator& comm, const T& in, int root, const Op& op)
{
MPI_Reduce(Datatype::address(const_cast<T&>(in)),
Datatype::address(const_cast<T&>(in)),
Datatype::count(in),
Datatype::datatype(),
detail::mpi_op<Op>::get(op),
root, comm);
}
static void all_reduce(const communicator& comm, const T& in, T& out, const Op& op)
{
MPI_Allreduce(Datatype::address(const_cast<T&>(in)),
Datatype::address(out),
Datatype::count(in),
Datatype::datatype(),
detail::mpi_op<Op>::get(op),
comm);
}
static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op& op)
{
out.resize(in.size());
MPI_Allreduce(Datatype::address(const_cast<T&>(in[0])),
Datatype::address(out[0]),
in.size(),
Datatype::datatype(),
detail::mpi_op<Op>::get(op),
comm);
}
static void scan(const communicator& comm, const T& in, T& out, const Op& op)
{
MPI_Scan(Datatype::address(const_cast<T&>(in)),
Datatype::address(out),
Datatype::count(in),
Datatype::datatype(),
detail::mpi_op<Op>::get(op),
comm);
}
static void all_to_all(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, int n = 1)
{
// NB: this will fail if T is a vector
MPI_Alltoall(Datatype::address(const_cast<T&>(in[0])), n,
Datatype::datatype(),
Datatype::address(out[0]), n,
Datatype::datatype(),
comm);
}
};
//! Broadcast to all processes in `comm`.
template<class T>
void broadcast(const communicator& comm, T& x, int root)
{
Collectives<T,void*>::broadcast(comm, x, root);
}
//! Broadcast for vectors
template<class T>
void broadcast(const communicator& comm, std::vector<T>& x, int root)
{
Collectives<T,void*>::broadcast(comm, x, root);
}
//! iBroadcast to all processes in `comm`.
template<class T>
request ibroadcast(const communicator& comm, T& x, int root)
{
return Collectives<T,void*>::ibroadcast(comm, x, root);
}
//! Gather from all processes in `comm`.
//! On `root` process, `out` is resized to `comm.size()` and filled with
//! elements from the respective ranks.
template<class T>
void gather(const communicator& comm, const T& in, std::vector<T>& out, int root)
{
Collectives<T,void*>::gather(comm, in, out, root);
}
//! Same as above, but for vectors.
template<class T>
void gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out, int root)
{
Collectives<T,void*>::gather(comm, in, out, root);
}
//! Simplified version (without `out`) for use on non-root processes.
template<class T>
void gather(const communicator& comm, const T& in, int root)
{
Collectives<T,void*>::gather(comm, in, root);
}
//! Simplified version (without `out`) for use on non-root processes.
template<class T>
void gather(const communicator& comm, const std::vector<T>& in, int root)
{
Collectives<T,void*>::gather(comm, in, root);
}
//! all_gather from all processes in `comm`.
//! `out` is resized to `comm.size()` and filled with
//! elements from the respective ranks.
template<class T>
void all_gather(const communicator& comm, const T& in, std::vector<T>& out)
{
Collectives<T,void*>::all_gather(comm, in, out);
}
//! Same as above, but for vectors.
template<class T>
void all_gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out)
{
Collectives<T,void*>::all_gather(comm, in, out);
}
//! reduce
template<class T, class Op>
void reduce(const communicator& comm, const T& in, T& out, int root, const Op& op)
{
Collectives<T, Op>::reduce(comm, in, out, root, op);
}
//! Simplified version (without `out`) for use on non-root processes.
template<class T, class Op>
void reduce(const communicator& comm, const T& in, int root, const Op& op)
{
Collectives<T, Op>::reduce(comm, in, root, op);
}
//! all_reduce
template<class T, class Op>
void all_reduce(const communicator& comm, const T& in, T& out, const Op& op)
{
Collectives<T, Op>::all_reduce(comm, in, out, op);
}
//! Same as above, but for vectors.
template<class T, class Op>
void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op& op)
{
Collectives<T, Op>::all_reduce(comm, in, out, op);
}
//! scan
template<class T, class Op>
void scan(const communicator& comm, const T& in, T& out, const Op& op)
{
Collectives<T, Op>::scan(comm, in, out, op);
}
//! all_to_all
template<class T>
void all_to_all(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, int n = 1)
{
Collectives<T, void*>::all_to_all(comm, in, out, n);
}
//!@}
}
}
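A sketch exercising the free-function layer; run under mpirun, with the include path an assumption:

#include <diy/mpi.hpp>   // include path is an assumption
#include <vector>

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;

  int x = world.rank();
  diy::mpi::broadcast(world, x, 0);     // every rank now holds rank 0's value

  int mx;
  diy::mpi::all_reduce(world, world.rank(), mx, diy::mpi::maximum<int>());

  std::vector<int> all;
  diy::mpi::all_gather(world, world.rank(), all);   // all.size() == world.size()
  return 0;
}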

72
diy/include/diy/mpi/communicator.hpp Normal file

@ -0,0 +1,72 @@
namespace diy
{
namespace mpi
{
//! \ingroup MPI
//! Simple wrapper around `MPI_Comm`.
class communicator
{
public:
communicator(MPI_Comm comm = MPI_COMM_WORLD):
comm_(comm), rank_(0), size_(1) { if (comm != MPI_COMM_NULL) { MPI_Comm_rank(comm_, &rank_); MPI_Comm_size(comm_, &size_); } }
int rank() const { return rank_; }
int size() const { return size_; }
//void send(int dest,
// int tag,
// const void* buf,
// MPI_Datatype datatype) const { }
//! Send `x` to processor `dest` using `tag` (blocking).
template<class T>
void send(int dest, int tag, const T& x) const { detail::send<T>()(comm_, dest, tag, x); }
//! Receive `x` from `dest` using `tag` (blocking).
//! If `T` is an `std::vector<...>`, `recv` will resize it to fit exactly the sent number of values.
template<class T>
status recv(int source, int tag, T& x) const { return detail::recv<T>()(comm_, source, tag, x); }
//! Non-blocking version of `send()`.
template<class T>
request isend(int dest, int tag, const T& x) const { return detail::isend<T>()(comm_, dest, tag, x); }
//! Non-blocking version of `recv()`.
//! If `T` is an `std::vector<...>`, its size must be big enough to accommodate the sent values.
template<class T>
request irecv(int source, int tag, T& x) const { return detail::irecv<T>()(comm_, source, tag, x); }
//! probe
status probe(int source, int tag) const { status s; MPI_Probe(source, tag, comm_, &s.s); return s; }
//! iprobe
inline
optional<status>
iprobe(int source, int tag) const;
//! barrier
void barrier() const { MPI_Barrier(comm_); }
operator MPI_Comm() const { return comm_; }
private:
MPI_Comm comm_;
int rank_;
int size_;
};
}
}
diy::mpi::optional<diy::mpi::status>
diy::mpi::communicator::
iprobe(int source, int tag) const
{
status s;
int flag;
MPI_Iprobe(source, tag, comm_, &flag, &s.s);
if (flag)
return s;
return optional<status>();
}
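A two-rank sketch of the blocking path (include path is an assumption); the vector recv sizes itself via MPI_Probe, as shown in point-to-point.hpp below:

#include <diy/mpi.hpp>   // include path is an assumption
#include <vector>

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;
  if (world.size() < 2) return 0;

  if (world.rank() == 0)
  {
    std::vector<int> data(16, 7);
    world.send(1, 0, data);                      // blocking send of a vector
  }
  else if (world.rank() == 1)
  {
    std::vector<int> data;                       // recv resizes it to 16
    diy::mpi::status s = world.recv(0, 0, data);
    (void) s.source();                           // == 0
  }
  return 0;
}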

13
diy/include/diy/mpi/constants.hpp Normal file

@ -0,0 +1,13 @@
#ifndef DIY_MPI_CONSTANTS_HPP
#define DIY_MPI_CONSTANTS_HPP
namespace diy
{
namespace mpi
{
const int any_source = MPI_ANY_SOURCE;
const int any_tag = MPI_ANY_TAG;
}
}
#endif

63
diy/include/diy/mpi/datatypes.hpp Normal file

@ -0,0 +1,63 @@
#ifndef DIY_MPI_DATATYPES_HPP
#define DIY_MPI_DATATYPES_HPP
#include <vector>
namespace diy
{
namespace mpi
{
namespace detail
{
template<class T> MPI_Datatype get_mpi_datatype();
struct true_type {};
struct false_type {};
/* is_mpi_datatype */
template<class T>
struct is_mpi_datatype { typedef false_type type; };
#define DIY_MPI_DATATYPE_MAP(cpp_type, mpi_type) \
template<> inline MPI_Datatype get_mpi_datatype<cpp_type>() { return mpi_type; } \
template<> struct is_mpi_datatype<cpp_type> { typedef true_type type; }; \
template<> struct is_mpi_datatype< std::vector<cpp_type> > { typedef true_type type; };
DIY_MPI_DATATYPE_MAP(char, MPI_BYTE);
DIY_MPI_DATATYPE_MAP(unsigned char, MPI_BYTE);
DIY_MPI_DATATYPE_MAP(bool, MPI_BYTE);
DIY_MPI_DATATYPE_MAP(int, MPI_INT);
DIY_MPI_DATATYPE_MAP(unsigned, MPI_UNSIGNED);
DIY_MPI_DATATYPE_MAP(long, MPI_LONG);
DIY_MPI_DATATYPE_MAP(unsigned long, MPI_UNSIGNED_LONG);
DIY_MPI_DATATYPE_MAP(long long, MPI_LONG_LONG_INT);
DIY_MPI_DATATYPE_MAP(unsigned long long, MPI_UNSIGNED_LONG_LONG);
DIY_MPI_DATATYPE_MAP(float, MPI_FLOAT);
DIY_MPI_DATATYPE_MAP(double, MPI_DOUBLE);
/* mpi_datatype: helper routines, specialized for std::vector<...> */
template<class T>
struct mpi_datatype
{
static MPI_Datatype datatype() { return get_mpi_datatype<T>(); }
static const void* address(const T& x) { return &x; }
static void* address(T& x) { return &x; }
static int count(const T& x) { return 1; }
};
template<class U>
struct mpi_datatype< std::vector<U> >
{
typedef std::vector<U> VecU;
static MPI_Datatype datatype() { return get_mpi_datatype<U>(); }
static const void* address(const VecU& x) { return &x[0]; }
static void* address(VecU& x) { return &x[0]; }
static int count(const VecU& x) { return x.size(); }
};
}
}
}
#endif
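The table above stops short of, for example, short; extending it is one macro invocation inside the detail namespace. A hedged sketch, assuming MPI_SHORT as the standard MPI handle and the usual include path:

#include <diy/mpi.hpp>   // include path is an assumption

namespace diy { namespace mpi { namespace detail {
DIY_MPI_DATATYPE_MAP(short, MPI_SHORT);   // adds get_mpi_datatype<short> etc.
} } }

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;
  short x = static_cast<short>(world.rank());
  diy::mpi::broadcast(world, x, 0);         // now legal for short
  return 0;
}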

137
diy/include/diy/mpi/io.hpp Normal file

@ -0,0 +1,137 @@
#ifndef DIY_MPI_IO_HPP
#define DIY_MPI_IO_HPP
#include <vector>
#include <string>
namespace diy
{
namespace mpi
{
namespace io
{
typedef MPI_Offset offset;
//! Wraps MPI file IO. \ingroup MPI
class file
{
public:
enum
{
rdonly = MPI_MODE_RDONLY,
rdwr = MPI_MODE_RDWR,
wronly = MPI_MODE_WRONLY,
create = MPI_MODE_CREATE,
exclusive = MPI_MODE_EXCL,
delete_on_close = MPI_MODE_DELETE_ON_CLOSE,
unique_open = MPI_MODE_UNIQUE_OPEN,
sequential = MPI_MODE_SEQUENTIAL,
append = MPI_MODE_APPEND
};
public:
file(const communicator& comm,
const std::string& filename,
int mode):
comm_(comm) { MPI_File_open(comm, const_cast<char*>(filename.c_str()), mode, MPI_INFO_NULL, &fh); }
~file() { close(); }
void close() { if (fh != MPI_FILE_NULL) MPI_File_close(&fh); }
offset size() const { offset sz; MPI_File_get_size(fh, &sz); return sz; }
void resize(offset size) { MPI_File_set_size(fh, size); }
inline void read_at(offset o, char* buffer, size_t size);
inline void read_at_all(offset o, char* buffer, size_t size);
inline void write_at(offset o, const char* buffer, size_t size);
inline void write_at_all(offset o, const char* buffer, size_t size);
template<class T>
inline void read_at(offset o, std::vector<T>& data);
template<class T>
inline void read_at_all(offset o, std::vector<T>& data);
template<class T>
inline void write_at(offset o, const std::vector<T>& data);
template<class T>
inline void write_at_all(offset o, const std::vector<T>& data);
const communicator&
comm() const { return comm_; }
MPI_File& handle() { return fh; }
private:
const communicator& comm_;
MPI_File fh;
};
}
}
}
void
diy::mpi::io::file::
read_at(offset o, char* buffer, size_t size)
{
status s;
MPI_File_read_at(fh, o, buffer, size, detail::get_mpi_datatype<char>(), &s.s);
}
template<class T>
void
diy::mpi::io::file::
read_at(offset o, std::vector<T>& data)
{
read_at(o, &data[0], data.size()*sizeof(T));
}
void
diy::mpi::io::file::
read_at_all(offset o, char* buffer, size_t size)
{
status s;
MPI_File_read_at_all(fh, o, buffer, size, detail::get_mpi_datatype<char>(), &s.s);
}
template<class T>
void
diy::mpi::io::file::
read_at_all(offset o, std::vector<T>& data)
{
read_at_all(o, (char*) &data[0], data.size()*sizeof(T));
}
void
diy::mpi::io::file::
write_at(offset o, const char* buffer, size_t size)
{
status s;
MPI_File_write_at(fh, o, (void *)buffer, size, detail::get_mpi_datatype<char>(), &s.s);
}
template<class T>
void
diy::mpi::io::file::
write_at(offset o, const std::vector<T>& data)
{
write_at(o, (const char*) &data[0], data.size()*sizeof(T));
}
void
diy::mpi::io::file::
write_at_all(offset o, const char* buffer, size_t size)
{
status s;
MPI_File_write_at_all(fh, o, (void *)buffer, size, detail::get_mpi_datatype<char>(), &s.s);
}
template<class T>
void
diy::mpi::io::file::
write_at_all(offset o, const std::vector<T>& data)
{
write_at_all(o, &data[0], data.size()*sizeof(T));
}
#endif
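A sketch of the raw offset-based interface; the file name scratch.bin is a placeholder and the include path an assumption. The vector overloads compute byte counts from sizeof(T):

#include <diy/mpi.hpp>   // include path is an assumption
#include <vector>

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;

  diy::mpi::io::file f(world, "scratch.bin",
                       diy::mpi::io::file::rdwr | diy::mpi::io::file::create);

  std::vector<char> payload(64, 'x');
  f.write_at_all(0, payload);        // collective write at byte offset 0

  std::vector<char> back(64);        // caller allocates; read fills it
  f.read_at_all(0, back);
  return 0;
}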

26
diy/include/diy/mpi/operations.hpp Normal file

@ -0,0 +1,26 @@
#include <functional>
namespace diy
{
namespace mpi
{
//! \addtogroup MPI
//!@{
template<class U>
struct maximum { const U& operator()(const U& x, const U& y) const { return std::max(x,y); } };
template<class U>
struct minimum { const U& operator()(const U& x, const U& y) const { return std::min(x,y); } };
//!@}
namespace detail
{
template<class T> struct mpi_op { static MPI_Op get(const T&); };
template<class U> struct mpi_op< maximum<U> > { static MPI_Op get(const maximum<U>&) { return MPI_MAX; } };
template<class U> struct mpi_op< minimum<U> > { static MPI_Op get(const minimum<U>&) { return MPI_MIN; } };
template<class U> struct mpi_op< std::plus<U> > { static MPI_Op get(const std::plus<U>&) { return MPI_SUM; } };
template<class U> struct mpi_op< std::multiplies<U> > { static MPI_Op get(const std::multiplies<U>&) { return MPI_PROD; } };
template<class U> struct mpi_op< std::logical_and<U> > { static MPI_Op get(const std::logical_and<U>&) { return MPI_LAND; } };
template<class U> struct mpi_op< std::logical_or<U> > { static MPI_Op get(const std::logical_or<U>&) { return MPI_LOR; } };
}
}
}

55
diy/include/diy/mpi/optional.hpp Normal file

@ -0,0 +1,55 @@
namespace diy
{
namespace mpi
{
template<class T>
struct optional
{
optional():
init_(false) {}
optional(const T& v):
init_(true) { new(buf_) T(v); }
optional(const optional& o):
init_(o.init_) { if (init_) new(buf_) T(*o); }
~optional() { if (init_) clear(); }
inline
optional& operator=(const optional& o);
operator bool() const { return init_; }
T& operator*() { return *static_cast<T*>(address()); }
const T& operator*() const { return *static_cast<const T*>(address()); }
T* operator->() { return &(operator*()); }
const T* operator->() const { return &(operator*()); }
private:
void clear() { static_cast<T*>(address())->~T(); }
void* address() { return buf_; }
const void* address() const { return buf_; }
private:
bool init_;
char buf_[sizeof(T)];
};
}
}
template<class T>
diy::mpi::optional<T>&
diy::mpi::optional<T>::
operator=(const optional& o)
{
if (init_)
clear();
init_ = o.init_;
if (init_)
new (buf_) T(*o);
return *this;
}
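The payoff is call sites like iprobe: a sketch that distinguishes "no message" from a real status without sentinel values (include path is an assumption):

#include <diy/mpi.hpp>   // include path is an assumption

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;

  diy::mpi::optional<diy::mpi::status> s =
      world.iprobe(diy::mpi::any_source, diy::mpi::any_tag);
  if (s)                            // engaged only if a message is pending
  {
    int n = s->count<int>();        // status accessed through operator->
    (void) n;
  }
  return 0;
}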

98
diy/include/diy/mpi/point-to-point.hpp Normal file

@ -0,0 +1,98 @@
#include <vector>
namespace diy
{
namespace mpi
{
namespace detail
{
// send
template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
struct send;
template<class T>
struct send<T, true_type>
{
void operator()(MPI_Comm comm, int dest, int tag, const T& x) const
{
typedef mpi_datatype<T> Datatype;
MPI_Send((void*) Datatype::address(x),
Datatype::count(x),
Datatype::datatype(),
dest, tag, comm);
}
};
// recv
template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
struct recv;
template<class T>
struct recv<T, true_type>
{
status operator()(MPI_Comm comm, int source, int tag, T& x) const
{
typedef mpi_datatype<T> Datatype;
status s;
MPI_Recv((void*) Datatype::address(x),
Datatype::count(x),
Datatype::datatype(),
source, tag, comm, &s.s);
return s;
}
};
template<class U>
struct recv<std::vector<U>, true_type>
{
status operator()(MPI_Comm comm, int source, int tag, std::vector<U>& x) const
{
status s;
MPI_Probe(source, tag, comm, &s.s);
x.resize(s.count<U>());
MPI_Recv(&x[0], x.size(), get_mpi_datatype<U>(), source, tag, comm, &s.s);
return s;
}
};
// isend
template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
struct isend;
template<class T>
struct isend<T, true_type>
{
request operator()(MPI_Comm comm, int dest, int tag, const T& x) const
{
request r;
typedef mpi_datatype<T> Datatype;
MPI_Isend((void*) Datatype::address(x),
Datatype::count(x),
Datatype::datatype(),
dest, tag, comm, &r.r);
return r;
}
};
// irecv
template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
struct irecv;
template<class T>
struct irecv<T, true_type>
{
request operator()(MPI_Comm comm, int source, int tag, T& x) const
{
request r;
typedef mpi_datatype<T> Datatype;
MPI_Irecv(Datatype::address(x),
Datatype::count(x),
Datatype::datatype(),
source, tag, comm, &r.r);
return r;
}
};
}
}
}

26
diy/include/diy/mpi/request.hpp Normal file

@ -0,0 +1,26 @@
namespace diy
{
namespace mpi
{
struct request
{
status wait() { status s; MPI_Wait(&r, &s.s); return s; }
inline
optional<status> test();
void cancel() { MPI_Cancel(&r); }
MPI_Request r;
};
}
}
diy::mpi::optional<diy::mpi::status>
diy::mpi::request::test()
{
status s;
int flag;
MPI_Test(&r, &flag, &s.s);
if (flag)
return s;
return optional<status>();
}
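A sketch pairing isend/irecv with wait() and test(); two ranks and the include path are assumptions:

#include <diy/mpi.hpp>   // include path is an assumption

int main(int argc, char** argv)
{
  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;
  if (world.size() < 2) return 0;

  if (world.rank() == 0)
  {
    int x = 42;
    diy::mpi::request r = world.isend(1, 0, x);
    r.wait();                        // block until the send completes
  }
  else if (world.rank() == 1)
  {
    int x;
    diy::mpi::request r = world.irecv(0, 0, x);
    while (!r.test()) {}             // poll; test() returns optional<status>
  }
  return 0;
}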

30
diy/include/diy/mpi/status.hpp Normal file

@ -0,0 +1,30 @@
namespace diy
{
namespace mpi
{
struct status
{
int source() const { return s.MPI_SOURCE; }
int tag() const { return s.MPI_TAG; }
int error() const { return s.MPI_ERROR; }
bool cancelled() const { int flag; MPI_Test_cancelled(const_cast<MPI_Status*>(&s), &flag); return flag; }
template<class T>
int count() const;
operator MPI_Status&() { return s; }
operator const MPI_Status&() const { return s; }
MPI_Status s;
};
}
}
template<class T>
int
diy::mpi::status::count() const
{
int c;
MPI_Get_count(const_cast<MPI_Status*>(&s), detail::get_mpi_datatype<T>(), &c);
return c;
}

38
diy/include/diy/no-thread.hpp Normal file

@ -0,0 +1,38 @@
#ifndef DIY_NO_THREAD_HPP
#define DIY_NO_THREAD_HPP
// replicates only the parts of the threading interface that we use
// executes everything in a single thread
namespace diy
{
struct thread
{
thread(void (*f)(void *), void* args):
f_(f), args_(args) {}
void join() { f_(args_); }
static unsigned hardware_concurrency() { return 1; }
void (*f_)(void*);
void* args_;
};
struct mutex {};
struct fast_mutex {};
struct recursive_mutex {};
template<class T>
struct lock_guard
{
lock_guard(T&) {}
};
namespace this_thread
{
inline unsigned long int get_id() { return 0; }
}
}
#endif
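Since join() simply invokes the stored function, the shim serializes "threads" into the caller; a minimal sketch (include path is an assumption):

#include <diy/no-thread.hpp>   // include path is an assumption
#include <iostream>

void work(void* args)
{
  int* n = static_cast<int*>(args);
  std::cout << "working on " << *n << "\n";
}

int main()
{
  int n = 5;
  diy::thread t(&work, &n);   // nothing runs yet
  t.join();                   // executes work(&n) in the calling thread
  return 0;
}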

72
diy/include/diy/partners/all-reduce.hpp Normal file

@ -0,0 +1,72 @@
#ifndef DIY_PARTNERS_ALL_REDUCE_HPP
#define DIY_PARTNERS_ALL_REDUCE_HPP
#include "merge.hpp"
namespace diy
{
class Master;
//! Allreduce (reduction with results broadcasted to all blocks) is
//! implemented as two merge reductions, with incoming and outgoing items swapped in the second one.
//! I.e., it follows the merge reduction up and down the merge tree.
/**
* \ingroup Communication
* \brief Partners for all-reduce
*
*/
struct RegularAllReducePartners: public RegularMergePartners
{
typedef RegularMergePartners Parent; //!< base class merge reduction
//! contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
//! contiguous is useful when data needs to be united;
//! round-robin is useful for vector-"halving"
template<class Decomposer>
RegularAllReducePartners(const Decomposer& decomposer, //!< domain decomposition
int k, //!< target k value
bool contiguous = true //!< distance doubling (true) or halving (false)
):
Parent(decomposer, k, contiguous) {}
RegularAllReducePartners(const DivisionVector& divs,//!< explicit division vector
const KVSVector& kvs, //!< explicit k vector
bool contiguous = true //!< distance doubling (true) or halving (false)
):
Parent(divs, kvs, contiguous) {}
//! returns total number of rounds
size_t rounds() const { return 2*Parent::rounds(); }
//! returns size of a group of partners in a given round
int size(int round) const { return Parent::size(parent_round(round)); }
//! returns dimension (direction of partners in a regular grid) in a given round
int dim(int round) const { return Parent::dim(parent_round(round)); }
//! returns whether a given block in a given round has dropped out of the merge yet or not
inline bool active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); }
//! returns what the current round would be in the first or second parent merge reduction
int parent_round(int round) const { return round < (int) Parent::rounds() ? round : rounds() - round; }
// incoming is only valid for an active gid; it will only be called with an active gid
inline void incoming(int round, int gid, std::vector<int>& partners, const Master& m) const
{
if (round <= (int) Parent::rounds())
Parent::incoming(round, gid, partners, m);
else
Parent::outgoing(parent_round(round), gid, partners, m);
}
inline void outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const
{
if (round < (int) Parent::rounds())
Parent::outgoing(round, gid, partners, m);
else
Parent::incoming(parent_round(round), gid, partners, m);
}
};
} // diy
#endif
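To make the round folding concrete: with Parent::rounds() == 2, rounds() returns 4, and parent_round maps rounds 0, 1, 2, 3, 4 to 0, 1, 2, 1, 0, so the second half of the schedule replays the merge rounds in reverse order with incoming and outgoing swapped, which is the broadcast back down the tree.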

62
diy/include/diy/partners/broadcast.hpp Normal file

@ -0,0 +1,62 @@
#ifndef DIY_PARTNERS_BROADCAST_HPP
#define DIY_PARTNERS_BROADCAST_HPP
#include "merge.hpp"
namespace diy
{
class Master;
/**
* \ingroup Communication
* \brief Partners for broadcast
*
*/
struct RegularBroadcastPartners: public RegularMergePartners
{
typedef RegularMergePartners Parent; //!< base class merge reduction
//! contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
//! contiguous is useful when data needs to be united;
//! round-robin is useful for vector-"halving"
template<class Decomposer>
RegularBroadcastPartners(const Decomposer& decomposer, //!< domain decomposition
int k, //!< target k value
bool contiguous = true //!< distance doubling (true) or halving (false)
):
Parent(decomposer, k, contiguous) {}
RegularBroadcastPartners(const DivisionVector& divs,//!< explicit division vector
const KVSVector& kvs, //!< explicit k vector
bool contiguous = true //!< distance doubling (true) or halving (false)
):
Parent(divs, kvs, contiguous) {}
//! returns total number of rounds
size_t rounds() const { return Parent::rounds(); }
//! returns size of a group of partners in a given round
int size(int round) const { return Parent::size(parent_round(round)); }
//! returns dimension (direction of partners in a regular grid) in a given round
int dim(int round) const { return Parent::dim(parent_round(round)); }
//! returns whether a given block in a given round has dropped out of the merge yet or not
inline bool active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); }
//! returns what the current round would be in the first or second parent merge reduction
int parent_round(int round) const { return rounds() - round; }
// incoming is only valid for an active gid; it will only be called with an active gid
inline void incoming(int round, int gid, std::vector<int>& partners, const Master& m) const
{
Parent::outgoing(parent_round(round), gid, partners, m);
}
inline void outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const
{
Parent::incoming(parent_round(round), gid, partners, m);
}
};
} // diy
#endif

204
diy/include/diy/partners/common.hpp Normal file

@ -0,0 +1,204 @@
#ifndef DIY_PARTNERS_COMMON_HPP
#define DIY_PARTNERS_COMMON_HPP
#include "../decomposition.hpp"
#include "../types.hpp"
namespace diy
{
struct RegularPartners
{
// The record of group size per round in a dimension
struct DimK
{
DimK(int dim_, int k_):
dim(dim_), size(k_) {}
int dim;
int size; // group size
};
typedef std::vector<int> CoordVector;
typedef std::vector<int> DivisionVector;
typedef std::vector<DimK> KVSVector;
// The part of RegularDecomposer that we need works the same with either Bounds (so we fix them arbitrarily)
typedef DiscreteBounds Bounds;
typedef RegularDecomposer<Bounds> Decomposer;
template<class Decomposer_>
RegularPartners(const Decomposer_& decomposer, int k, bool contiguous = true):
divisions_(decomposer.divisions),
contiguous_(contiguous) { factor(k, divisions_, kvs_); fill_steps(); }
RegularPartners(const DivisionVector& divs,
const KVSVector& kvs,
bool contiguous = true):
divisions_(divs), kvs_(kvs),
contiguous_(contiguous) { fill_steps(); }
size_t rounds() const { return kvs_.size(); }
int size(int round) const { return kvs_[round].size; }
int dim(int round) const { return kvs_[round].dim; }
int step(int round) const { return steps_[round]; }
const DivisionVector& divisions() const { return divisions_; }
const KVSVector& kvs() const { return kvs_; }
bool contiguous() const { return contiguous_; }
static
inline void factor(int k, const DivisionVector& divisions, KVSVector& kvs);
inline void fill(int round, int gid, std::vector<int>& partners) const;
inline int group_position(int round, int c, int step) const;
private:
inline void fill_steps();
static
inline void factor(int k, int tot_b, std::vector<int>& kvs);
DivisionVector divisions_;
KVSVector kvs_;
bool contiguous_;
std::vector<int> steps_;
};
}
void
diy::RegularPartners::
fill_steps()
{
if (contiguous_)
{
std::vector<int> cur_steps(divisions().size(), 1);
for (size_t r = 0; r < rounds(); ++r)
{
steps_.push_back(cur_steps[kvs_[r].dim]);
cur_steps[kvs_[r].dim] *= kvs_[r].size;
}
} else
{
std::vector<int> cur_steps(divisions().begin(), divisions().end());
for (size_t r = 0; r < rounds(); ++r)
{
cur_steps[kvs_[r].dim] /= kvs_[r].size;
steps_.push_back(cur_steps[kvs_[r].dim]);
}
}
}
void
diy::RegularPartners::
fill(int round, int gid, std::vector<int>& partners) const
{
const DimK& kv = kvs_[round];
partners.reserve(kv.size);
int step = this->step(round); // gids jump by this much in the current round
CoordVector coords;
Decomposer::gid_to_coords(gid, coords, divisions_);
int c = coords[kv.dim];
int pos = group_position(round, c, step);
int partner = c - pos * step;
coords[kv.dim] = partner;
int partner_gid = Decomposer::coords_to_gid(coords, divisions_);
partners.push_back(partner_gid);
for (int k = 1; k < kv.size; ++k)
{
partner += step;
coords[kv.dim] = partner;
int partner_gid = Decomposer::coords_to_gid(coords, divisions_);
partners.push_back(partner_gid);
}
}
// Tom's GetGrpPos
int
diy::RegularPartners::
group_position(int round, int c, int step) const
{
// the second term in the following expression does not simplify to
// (gid - start_b) / kv[r]
// because the division gid / (step * kv[r]) is integer and truncates
// this is exactly what we want
int g = c % step + c / (step * kvs_[round].size) * step;
int p = c / step % kvs_[round].size;
static_cast<void>(g); // shut up the compiler
// g: group number (output)
// p: position number within the group (output)
return p;
}
void
diy::RegularPartners::
factor(int k, const DivisionVector& divisions, KVSVector& kvs)
{
// factor in each dimension
std::vector< std::vector<int> > tmp_kvs(divisions.size());
for (unsigned i = 0; i < divisions.size(); ++i)
factor(k, divisions[i], tmp_kvs[i]);
// interleave the dimensions
std::vector<int> round_per_dim(divisions.size(), 0);
while(true)
{
// TODO: not the most efficient way to do this
bool changed = false;
for (unsigned i = 0; i < divisions.size(); ++i)
{
if (round_per_dim[i] == (int) tmp_kvs[i].size())
continue;
kvs.push_back(DimK(i, tmp_kvs[i][round_per_dim[i]++]));
changed = true;
}
if (!changed)
break;
}
}
// Tom's FactorK
void
diy::RegularPartners::
factor(int k, int tot_b, std::vector<int>& kv)
{
int rem = tot_b; // unfactored remaining portion of tot_b
int j;
while (rem > 1)
{
// remainder is divisible by k
if (rem % k == 0)
{
kv.push_back(k);
rem /= k;
}
// if not, start at k - 1 and linearly look for smaller factors down to 2
else
{
for (j = k - 1; j > 1; j--)
{
if (rem % j == 0)
{
kv.push_back(j);
rem /= j; // divide by the factor j that was actually found
break;
}
}
if (j == 1)
{
kv.push_back(rem);
rem = 1;
}
} // else
} // while
}
#endif
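A quick trace of the one-dimensional factorization: factor(4, 24, kv) pushes 4 (rem 6); 6 % 4 != 0, so the scan finds j = 3 (rem 2), then j = 2 (rem 1), giving kv = {4, 3, 2} with product 24. The public overload interleaves the per-dimension factorizations; a sketch, with the include path an assumption:

#include <diy/partners/common.hpp>   // include path is an assumption
#include <iostream>

int main()
{
  diy::RegularPartners::DivisionVector divs(2, 4);   // 4x4 grid of blocks
  diy::RegularPartners::KVSVector kvs;
  diy::RegularPartners::factor(2, divs, kvs);        // k-ary rounds, k = 2

  // each dimension factors as {2, 2}; rounds alternate dims:
  // (dim 0, k 2), (dim 1, k 2), (dim 0, k 2), (dim 1, k 2)
  for (unsigned i = 0; i < kvs.size(); ++i)
    std::cout << "round " << i << ": dim " << kvs[i].dim
              << ", k " << kvs[i].size << "\n";
  return 0;
}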

60
diy/include/diy/partners/merge.hpp Normal file

@ -0,0 +1,60 @@
#ifndef DIY_PARTNERS_MERGE_HPP
#define DIY_PARTNERS_MERGE_HPP
#include "common.hpp"
namespace diy
{
class Master;
/**
* \ingroup Communication
* \brief Partners for merge-reduce
*
*/
struct RegularMergePartners: public RegularPartners
{
typedef RegularPartners Parent;
// contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
// contiguous is useful when data needs to be united;
// round-robin is useful for vector-"halving"
template<class Decomposer>
RegularMergePartners(const Decomposer& decomposer, //!< domain decomposition
int k, //!< target k value
bool contiguous = true //!< distance doubling (true) or halving (false)
):
Parent(decomposer, k, contiguous) {}
RegularMergePartners(const DivisionVector& divs, //!< explicit division vector
const KVSVector& kvs, //!< explicit k vector
bool contiguous = true //!< distance doubling (true) or halving (false)
):
Parent(divs, kvs, contiguous) {}
inline bool active(int round, int gid, const Master&) const;
// incoming is only valid for an active gid; it will only be called with an active gid
inline void incoming(int round, int gid, std::vector<int>& partners, const Master&) const { Parent::fill(round - 1, gid, partners); }
// this is a lazy implementation of outgoing, but it reuses the existing code
inline void outgoing(int round, int gid, std::vector<int>& partners, const Master&) const { std::vector<int> tmp; Parent::fill(round, gid, tmp); partners.push_back(tmp[0]); }
};
} // diy
bool
diy::RegularMergePartners::
active(int round, int gid, const Master&) const
{
CoordVector coords;
Decomposer::gid_to_coords(gid, coords, divisions());
for (int r = 0; r < round; ++r)
if (Parent::group_position(r, coords[kvs()[r].dim], step(r)) != 0)
return false;
return true;
}
#endif

43
diy/include/diy/partners/swap.hpp Normal file

@ -0,0 +1,43 @@
#ifndef DIY_PARTNERS_SWAP_HPP
#define DIY_PARTNERS_SWAP_HPP
#include "common.hpp"
namespace diy
{
class Master;
/**
* \ingroup Communication
* \brief Partners for swap-reduce
*
*/
struct RegularSwapPartners: public RegularPartners
{
typedef RegularPartners Parent;
// contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
// contiguous is useful when data needs to be united;
// round-robin is useful for vector-"halving"
template<class Decomposer>
RegularSwapPartners(const Decomposer& decomposer, //!< domain decomposition
int k, //!< target k value
bool contiguous = true //!< distance halving (true) or doubling (false)
):
Parent(decomposer, k, contiguous) {}
RegularSwapPartners(const DivisionVector& divs, //!< explicit division vector
const KVSVector& kvs, //!< explicit k vector
bool contiguous = true //!< distance halving (true) or doubling (false)
):
Parent(divs, kvs, contiguous) {}
bool active(int round, int gid, const Master&) const { return true; } // in swap-reduce every block is always active
void incoming(int round, int gid, std::vector<int>& partners, const Master&) const { Parent::fill(round - 1, gid, partners); }
void outgoing(int round, int gid, std::vector<int>& partners, const Master&) const { Parent::fill(round, gid, partners); }
};
} // diy
#endif

137
diy/include/diy/pick.hpp Normal file

@ -0,0 +1,137 @@
#ifndef DIY_PICK_HPP
#define DIY_PICK_HPP
#include "link.hpp"
namespace diy
{
template<class Bounds, class Point, class OutIter>
void near(const RegularLink<Bounds>& link, const Point& p, float r, OutIter out,
const Bounds& domain);
template<class Bounds, class Point, class OutIter>
void in(const RegularLink<Bounds>& link, const Point& p, OutIter out, const Bounds& domain);
template<class Point, class Bounds>
float distance(int dim, const Bounds& bounds, const Point& p);
template<class Bounds>
inline
float distance(int dim, const Bounds& bounds1, const Bounds& bounds2);
template<class Bounds>
void wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim);
}
//! Finds the neighbors within radius r of a target point.
template<class Bounds, class Point, class OutIter>
void
diy::
near(const RegularLink<Bounds>& link, //!< neighbors
const Point& p, //!< target point (must be in current block)
float r, //!< target radius (>= 0.0)
OutIter out, //!< insert iterator for output set of neighbors
const Bounds& domain) //!< global domain bounds
{
Bounds neigh_bounds; // neighbor block bounds
// for all neighbors of this block
for (int n = 0; n < link.size(); n++)
{
// wrap neighbor bounds, if necessary, otherwise bounds will be unchanged
neigh_bounds = link.bounds(n);
wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension());
if (distance(link.dimension(), neigh_bounds, p) <= r)
*out++ = n;
} // for all neighbors
}
//! Find the distance between point `p` and box `bounds`.
template<class Point, class Bounds>
float
diy::
distance(int dim, const Bounds& bounds, const Point& p)
{
float res = 0;
for (int i = 0; i < dim; ++i)
{
// avoids all the annoying case logic by finding
// diff = max(bounds.min[i] - p[i], 0, p[i] - bounds.max[i])
float diff = 0, d;
d = bounds.min[i] - p[i];
if (d > diff) diff = d;
d = p[i] - bounds.max[i];
if (d > diff) diff = d;
res += diff*diff;
}
return sqrt(res);
}
template<class Bounds>
float
diy::
distance(int dim, const Bounds& bounds1, const Bounds& bounds2)
{
float res = 0;
for (int i = 0; i < dim; ++i)
{
float diff = 0, d;
float d1 = bounds1.max[i] - bounds2.min[i];
float d2 = bounds2.max[i] - bounds1.min[i];
if (d1 > 0 && d2 > 0)
diff = 0;
else if (d1 <= 0)
diff = -d1;
else if (d2 <= 0)
diff = -d2;
res += diff*diff;
}
return sqrt(res);
}
//! Finds the neighbor(s) containing the target point.
template<class Bounds, class Point, class OutIter>
void
diy::
in(const RegularLink<Bounds>& link, //!< neighbors
const Point& p, //!< target point
OutIter out, //!< insert iterator for output set of neighbors
const Bounds& domain) //!< global domain bounds
{
Bounds neigh_bounds; // neighbor block bounds
// for all neighbors of this block
for (int n = 0; n < link.size(); n++)
{
// wrap neighbor bounds, if necessary, otherwise bounds will be unchanged
neigh_bounds = link.bounds(n);
wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension());
if (distance(link.dimension(), neigh_bounds, p) == 0)
*out++ = n;
} // for all neighbors
}
// wraps block bounds
// wrap dir is the wrapping direction from original block to wrapped neighbor block
// overall domain bounds and dimensionality are also needed
template<class Bounds>
void
diy::
wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim)
{
for (int i = 0; i < dim; ++i)
{
bounds.min[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]);
bounds.max[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]);
}
}
#endif
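distance() only requires min/max members indexable by dimension, so a local stand-in box (Box here is hypothetical, not a diy type) demonstrates it without the rest of diy; the closest point of the unit square to (4, 5) is (1, 1), giving a distance of 5:

#include <diy/pick.hpp>   // include path is an assumption
#include <iostream>

struct Box { float min[2]; float max[2]; };    // local stand-in for Bounds

int main()
{
  Box b = { { 0.f, 0.f }, { 1.f, 1.f } };      // unit square
  float p[2] = { 4.f, 5.f };
  std::cout << diy::distance(2, b, p) << "\n"; // sqrt(3*3 + 4*4) = 5
  return 0;
}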

120
diy/include/diy/point.hpp Normal file

@ -0,0 +1,120 @@
#ifndef DIY_POINT_HPP
#define DIY_POINT_HPP
#include <iostream>
#include <vector>
#include <string>
#include <sstream>
#include <array>
namespace diy
{
template<class Coordinate_, unsigned D>
class Point: public std::array<Coordinate_, D>
{
public:
typedef Coordinate_ Coordinate;
typedef std::array<Coordinate, D> ArrayParent;
typedef Point<Coordinate, D-1> LPoint;
typedef Point<Coordinate, D+1> UPoint;
template<class U>
struct rebind { typedef Point<U,D> type; };
public:
Point() { for (unsigned i = 0; i < D; ++i) (*this)[i] = 0; }
Point(const ArrayParent& a):
ArrayParent(a) {}
template<class T> Point(const Point<T, D>& p) { for (size_t i = 0; i < D; ++i) (*this)[i] = p[i]; }
template<class T> Point(const T* a) { for (unsigned i = 0; i < D; ++i) (*this)[i] = a[i]; }
template<class T> Point(const std::vector<T>& a) { for (unsigned i = 0; i < D; ++i) (*this)[i] = a[i]; }
Point(std::initializer_list<Coordinate> lst) { unsigned i = 0; for (Coordinate x : lst) (*this)[i++] = x; }
Point(Point&&) =default;
Point(const Point&) =default;
Point& operator=(const Point&) =default;
static constexpr
unsigned dimension() { return D; }
static Point zero() { return Point(); }
static Point one() { Point p; for (unsigned i = 0; i < D; ++i) p[i] = 1; return p; }
LPoint drop(int dim) const { LPoint p; unsigned c = 0; for (unsigned i = 0; i < D; ++i) { if (i == dim) continue; p[c++] = (*this)[i]; } return p; }
UPoint lift(int dim, Coordinate x) const { UPoint p; for (unsigned i = 0; i < D+1; ++i) { if (i < dim) p[i] = (*this)[i]; else if (i == dim) p[i] = x; else if (i > dim) p[i] = (*this)[i-1]; } return p; }
using ArrayParent::operator[];
Point& operator+=(const Point& y) { for (unsigned i = 0; i < D; ++i) (*this)[i] += y[i]; return *this; }
Point& operator-=(const Point& y) { for (unsigned i = 0; i < D; ++i) (*this)[i] -= y[i]; return *this; }
Point& operator*=(Coordinate a) { for (unsigned i = 0; i < D; ++i) (*this)[i] *= a; return *this; }
Point& operator/=(Coordinate a) { for (unsigned i = 0; i < D; ++i) (*this)[i] /= a; return *this; }
Coordinate norm() const { return (*this)*(*this); }
std::ostream& operator<<(std::ostream& out) const { out << (*this)[0]; for (unsigned i = 1; i < D; ++i) out << " " << (*this)[i]; return out; }
std::istream& operator>>(std::istream& in);
friend
Point operator+(Point x, const Point& y) { x += y; return x; }
friend
Point operator-(Point x, const Point& y) { x -= y; return x; }
friend
Point operator/(Point x, Coordinate y) { x /= y; return x; }
friend
Point operator*(Point x, Coordinate y) { x *= y; return x; }
friend
Point operator*(Coordinate y, Point x) { x *= y; return x; }
friend
Coordinate operator*(const Point& x, const Point& y) { Coordinate n = 0; for (size_t i = 0; i < D; ++i) n += x[i] * y[i]; return n; }
template<class T>
friend
Coordinate operator*(const Point<T,D>& x, const Point& y) { Coordinate n = 0; for (size_t i = 0; i < D; ++i) n += x[i] * y[i]; return n; }
};
template<class C, unsigned D>
std::istream&
Point<C,D>::
operator>>(std::istream& in)
{
std::string point_str;
in >> point_str; // read until ' '
std::stringstream ps(point_str);
char x;
for (unsigned i = 0; i < dimension(); ++i)
{
ps >> (*this)[i];
ps >> x;
}
return in;
}
template<class Coordinate, unsigned D>
Coordinate norm2(const Point<Coordinate,D>& p)
{ Coordinate res = 0; for (unsigned i = 0; i < D; ++i) res += p[i]*p[i]; return res; }
template<class C, unsigned D>
std::ostream&
operator<<(std::ostream& out, const Point<C,D>& p)
{ return p.operator<<(out); }
template<class C, unsigned D>
std::istream&
operator>>(std::istream& in, Point<C,D>& p)
{ return p.operator>>(in); }
}
#endif // DIY_POINT_HPP
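point.hpp is self-contained, so a sketch is straightforward (include path is an assumption); operator* between points is the dot product, and norm() is the squared length:

#include <diy/point.hpp>   // include path is an assumption
#include <iostream>

int main()
{
  typedef diy::Point<double, 3> Pt;
  Pt p { 1., 2., 2. };
  Pt q = Pt::one();

  std::cout << (p + q)   << "\n";   // 2 3 3
  std::cout << (p * q)   << "\n";   // dot product: 5
  std::cout << p.norm()  << "\n";   // p*p = 9
  std::cout << p.drop(0) << "\n";   // 2D point: 2 2
  return 0;
}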

228
diy/include/diy/proxy.hpp Normal file

@ -0,0 +1,228 @@
#ifndef DIY_PROXY_HPP
#define DIY_PROXY_HPP
namespace diy
{
//! Communication proxy, used for enqueueing and dequeueing items for future exchange.
struct Master::Proxy
{
template <class T>
struct EnqueueIterator;
Proxy(Master* master, int gid):
gid_(gid),
master_(master),
incoming_(&master->incoming(gid)),
outgoing_(&master->outgoing(gid)),
collectives_(&master->collectives(gid)) {}
int gid() const { return gid_; }
//! Enqueue data whose size can be determined automatically, e.g., an STL vector.
template<class T>
void enqueue(const BlockID& to, //!< target block (gid,proc)
const T& x, //!< data (eg. STL vector)
void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function
) const
{ OutgoingQueues& out = *outgoing_; save(out[to], x); }
//! Enqueue data whose size is given explicitly by the user, e.g., an array.
template<class T>
void enqueue(const BlockID& to, //!< target block (gid,proc)
const T* x, //!< pointer to the data (eg. address of start of vector)
size_t n, //!< size in data elements (eg. ints)
void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function
) const;
//! Dequeue data whose size can be determined automatically (e.g., STL vector) and that was
//! previously enqueued so that diy knows its size when it is received.
//! In this case, diy will allocate the receive buffer; the user does not need to do so.
template<class T>
void dequeue(int from, //!< target block gid
T& x, //!< data (eg. STL vector)
void (*load)(BinaryBuffer&, T&) = &::diy::load<T> //!< optional serialization function
) const
{ IncomingQueues& in = *incoming_; load(in[from], x); }
//! Dequeue an array of data whose size is given explicitly by the user.
//! In this case, the user needs to allocate the receive buffer prior to calling dequeue.
template<class T>
void dequeue(int from, //!< target block gid
T* x, //!< pointer to the data (eg. address of start of vector)
size_t n, //!< size in data elements (eg. ints)
void (*load)(BinaryBuffer&, T&) = &::diy::load<T> //!< optional serialization function
) const;
template<class T>
EnqueueIterator<T> enqueuer(const T& x,
void (*save)(BinaryBuffer&, const T&) = &::diy::save<T>) const
{ return EnqueueIterator<T>(this, x, save); }
IncomingQueues* incoming() const { return incoming_; }
MemoryBuffer& incoming(int from) const { return (*incoming_)[from]; }
inline void incoming(std::vector<int>& v) const; // fill v with every gid from which we have a message
OutgoingQueues* outgoing() const { return outgoing_; }
MemoryBuffer& outgoing(const BlockID& to) const { return (*outgoing_)[to]; }
/**
* \ingroup Communication
* \brief Post an all-reduce collective using an existing communication proxy.
* Available operators are:
* maximum<T>, minimum<T>, std::plus<T>, std::multiplies<T>, std::logical_and<T>, and
* std::logical_or<T>.
*/
template<class T, class Op>
inline void all_reduce(const T& in, //!< local value being reduced
Op op //!< operator
) const;
/**
* \ingroup Communication
 * \brief Return the result of a proxy collective without popping it off the collectives list (repeated calls return the same result). The list can be cleared with collectives()->clear().
*/
template<class T>
inline T read() const;
/**
* \ingroup Communication
* \brief Return the result of a proxy collective; result is popped off the collectives list.
*/
template<class T>
inline T get() const;
template<class T>
inline void scratch(const T& in) const;
/**
* \ingroup Communication
* \brief Return the list of proxy collectives (values and operations)
*/
CollectivesList* collectives() const { return collectives_; }
Master* master() const { return master_; }
private:
int gid_;
Master* master_;
IncomingQueues* incoming_;
OutgoingQueues* outgoing_;
CollectivesList* collectives_;
};
template<class T>
struct Master::Proxy::EnqueueIterator:
public std::iterator<std::output_iterator_tag, void, void, void, void>
{
typedef void (*SaveT)(BinaryBuffer&, const T&);
EnqueueIterator(const Proxy* proxy, const T& x,
SaveT save = &::diy::save<T>):
proxy_(proxy), x_(x), save_(save) {}
EnqueueIterator& operator=(const BlockID& to) { proxy_->enqueue(to, x_, save_); return *this; }
EnqueueIterator& operator*() { return *this; }
EnqueueIterator& operator++() { return *this; }
EnqueueIterator& operator++(int) { return *this; }
private:
const Proxy* proxy_;
const T& x_;
SaveT save_;
};
struct Master::ProxyWithLink: public Master::Proxy
{
ProxyWithLink(const Proxy& proxy,
void* block,
Link* link):
Proxy(proxy),
block_(block),
link_(link) {}
Link* link() const { return link_; }
void* block() const { return block_; }
private:
void* block_;
Link* link_;
};
}
void
diy::Master::Proxy::
incoming(std::vector<int>& v) const
{
for (IncomingQueues::const_iterator it = incoming_->begin(); it != incoming_->end(); ++it)
v.push_back(it->first);
}
template<class T, class Op>
void
diy::Master::Proxy::
all_reduce(const T& in, Op op) const
{
collectives_->push_back(Collective(new detail::AllReduceOp<T,Op>(in, op)));
}
template<class T>
T
diy::Master::Proxy::
read() const
{
T res;
collectives_->front().result_out(&res);
return res;
}
template<class T>
T
diy::Master::Proxy::
get() const
{
T res = read<T>();
collectives_->pop_front();
return res;
}
template<class T>
void
diy::Master::Proxy::
scratch(const T& in) const
{
collectives_->push_back(Collective(new detail::Scratch<T>(in)));
}
template<class T>
void
diy::Master::Proxy::
enqueue(const BlockID& to, const T* x, size_t n,
void (*save)(BinaryBuffer&, const T&)) const
{
OutgoingQueues& out = *outgoing_;
BinaryBuffer& bb = out[to];
if (save == (void (*)(BinaryBuffer&, const T&)) &::diy::save<T>)
diy::save(bb, x, n); // optimized for unspecialized types
else
for (size_t i = 0; i < n; ++i)
save(bb, x[i]);
}
template<class T>
void
diy::Master::Proxy::
dequeue(int from, T* x, size_t n,
void (*load)(BinaryBuffer&, T&)) const
{
IncomingQueues& in = *incoming_;
BinaryBuffer& bb = in[from];
if (load == (void (*)(BinaryBuffer&, T&)) &::diy::load<T>)
diy::load(bb, x, n); // optimized for unspecialized types
else
for (size_t i = 0; i < n; ++i)
load(bb, x[i]);
}
#endif
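A hedged sketch of how this proxy is typically driven from Master::foreach (the Block struct, the callback names, and the exchange step are illustrative assumptions, not part of this header):
struct Block { std::vector<double> values; };

void send(Block* b, const diy::Master::ProxyWithLink& cp)
{
  for (int i = 0; i < cp.link()->size(); ++i)
    cp.enqueue(cp.link()->target(i), b->values);   // sized type: diy records the length
}

void recv(Block*, const diy::Master::ProxyWithLink& cp)
{
  std::vector<int> in;
  cp.incoming(in);                                 // gids that sent us something
  for (size_t i = 0; i < in.size(); ++i)
  {
    std::vector<double> v;
    cp.dequeue(in[i], v);                          // diy allocates the receive buffer
  }
}
// typical driver: master.foreach(&send); master.exchange(); master.foreach(&recv);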

@ -0,0 +1,32 @@
#ifndef DIY_REDUCE_OPERATIONS_HPP
#define DIY_REDUCE_OPERATIONS_HPP
#include "reduce.hpp"
#include "partners/swap.hpp"
#include "detail/reduce/all-to-all.hpp"
namespace diy
{
/**
* \ingroup Communication
 * \brief all-to-all reduction
*
*/
template<class Op>
void
all_to_all(Master& master, //!< block owner
const Assigner& assigner, //!< global block locator (maps gid to proc)
const Op& op, //!< user-defined operation called to enqueue and dequeue items
int k = 2 //!< reduction fanout
)
{
auto scoped = master.prof.scoped("all_to_all");
RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()-1), assigner.nblocks());
RegularSwapPartners partners(decomposer, k, false);
reduce(master, assigner, partners, detail::AllToAllReduce<Op>(op, assigner), detail::SkipIntermediate(partners.rounds()));
}
}
#endif
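A sketch of the calling convention (Block and its members are assumed; testing in_link().size() == 0 to separate the enqueue round from the dequeue round follows diy's documented all_to_all usage):
diy::all_to_all(master, assigner,
    [](Block* b, const diy::ReduceProxy& rp)
    {
      if (rp.in_link().size() == 0)                         // first round: scatter
        for (int i = 0; i < rp.out_link().size(); ++i)
          rp.enqueue(rp.out_link().target(i), b->values);
      else                                                  // last round: gather
        for (int i = 0; i < rp.in_link().size(); ++i)
        {
          std::vector<double> v;
          rp.dequeue(rp.in_link().target(i).gid, v);
        }
    });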

216
diy/include/diy/reduce.hpp Normal file

@ -0,0 +1,216 @@
#ifndef DIY_REDUCE_HPP
#define DIY_REDUCE_HPP
#include <vector>
#include "master.hpp"
#include "assigner.hpp"
#include "detail/block_traits.hpp"
#include "log.hpp"
namespace diy
{
//! Enables communication within a group during a reduction.
//! DIY creates the ReduceProxy for you in diy::reduce()
//! and provides a reference to it each time the user's reduction function is called.
struct ReduceProxy: public Master::Proxy
{
typedef std::vector<int> GIDVector;
ReduceProxy(const Master::Proxy& proxy, //!< parent proxy
void* block, //!< diy block
unsigned round, //!< current round
const Assigner& assigner, //!< assigner
const GIDVector& incoming_gids, //!< incoming gids in this group
const GIDVector& outgoing_gids): //!< outgoing gids in this group
Master::Proxy(proxy),
block_(block),
round_(round),
assigner_(assigner)
{
// setup in_link
for (unsigned i = 0; i < incoming_gids.size(); ++i)
{
BlockID nbr;
nbr.gid = incoming_gids[i];
nbr.proc = assigner.rank(nbr.gid);
in_link_.add_neighbor(nbr);
}
// setup out_link
for (unsigned i = 0; i < outgoing_gids.size(); ++i)
{
BlockID nbr;
nbr.gid = outgoing_gids[i];
nbr.proc = assigner.rank(nbr.gid);
out_link_.add_neighbor(nbr);
}
}
ReduceProxy(const Master::Proxy& proxy, //!< parent proxy
void* block, //!< diy block
unsigned round, //!< current round
const Assigner& assigner,
const Link& in_link,
const Link& out_link):
Master::Proxy(proxy),
block_(block),
round_(round),
assigner_(assigner),
in_link_(in_link),
out_link_(out_link)
{}
//! returns pointer to block
void* block() const { return block_; }
//! returns current round number
unsigned round() const { return round_; }
//! returns incoming link
const Link& in_link() const { return in_link_; }
//! returns outgoing link
const Link& out_link() const { return out_link_; }
//! returns total number of blocks
int nblocks() const { return assigner_.nblocks(); }
//! returns the assigner
const Assigner& assigner() const { return assigner_; }
//! advanced: change current round number
void set_round(unsigned r) { round_ = r; }
private:
void* block_;
unsigned round_;
const Assigner& assigner_;
Link in_link_;
Link out_link_;
};
namespace detail
{
template<class Block, class Partners>
struct ReductionFunctor;
template<class Partners, class Skip>
struct SkipInactiveOr;
struct ReduceNeverSkip
{
bool operator()(int round, int lid, const Master& master) const { return false; }
};
}
/**
* \ingroup Communication
* \brief Implementation of the reduce communication pattern (includes
* swap-reduce, merge-reduce, and any other global communication).
*
*/
template<class Reduce, class Partners, class Skip>
void reduce(Master& master, //!< master object
const Assigner& assigner, //!< assigner object
const Partners& partners, //!< partners object
const Reduce& reduce, //!< reduction callback function
const Skip& skip) //!< object determining whether a block should be skipped
{
auto log = get_logger();
int original_expected = master.expected();
using Block = typename detail::block_traits<Reduce>::type;
unsigned round;
for (round = 0; round < partners.rounds(); ++round)
{
log->debug("Round {}", round);
master.foreach(detail::ReductionFunctor<Block,Partners>(round, reduce, partners, assigner),
detail::SkipInactiveOr<Partners,Skip>(round, partners, skip));
master.execute();
int expected = 0;
for (unsigned i = 0; i < master.size(); ++i)
{
if (partners.active(round + 1, master.gid(i), master))
{
std::vector<int> incoming_gids;
partners.incoming(round + 1, master.gid(i), incoming_gids, master);
expected += incoming_gids.size();
master.incoming(master.gid(i)).clear();
}
}
master.set_expected(expected);
master.flush();
}
// final round
log->debug("Round {}", round);
master.foreach(detail::ReductionFunctor<Block,Partners>(round, reduce, partners, assigner),
detail::SkipInactiveOr<Partners,Skip>(round, partners, skip));
master.set_expected(original_expected);
}
/**
* \ingroup Communication
* \brief Implementation of the reduce communication pattern (includes
* swap-reduce, merge-reduce, and any other global communication).
*
*/
template<class Reduce, class Partners>
void reduce(Master& master, //!< master object
const Assigner& assigner, //!< assigner object
const Partners& partners, //!< partners object
const Reduce& reducer) //!< reduction callback function
{
reduce(master, assigner, partners, reducer, detail::ReduceNeverSkip());
}
namespace detail
{
template<class Block, class Partners>
struct ReductionFunctor
{
using Callback = std::function<void(Block*, const ReduceProxy&, const Partners&)>;
ReductionFunctor(unsigned round_, const Callback& reduce_, const Partners& partners_, const Assigner& assigner_):
round(round_), reduce(reduce_), partners(partners_), assigner(assigner_) {}
void operator()(Block* b, const Master::ProxyWithLink& cp) const
{
if (!partners.active(round, cp.gid(), *cp.master())) return;
std::vector<int> incoming_gids, outgoing_gids;
if (round > 0)
partners.incoming(round, cp.gid(), incoming_gids, *cp.master()); // receive from the previous round
if (round < partners.rounds())
partners.outgoing(round, cp.gid(), outgoing_gids, *cp.master()); // send to the next round
ReduceProxy rp(cp, b, round, assigner, incoming_gids, outgoing_gids);
reduce(b, rp, partners);
// touch the outgoing queues to make sure they exist
Master::OutgoingQueues& outgoing = *cp.outgoing();
if (outgoing.size() < (size_t) rp.out_link().size())
for (int j = 0; j < rp.out_link().size(); ++j)
outgoing[rp.out_link().target(j)]; // touch the outgoing queue, creating it if necessary
}
unsigned round;
Callback reduce;
Partners partners;
const Assigner& assigner;
};
template<class Partners, class Skip>
struct SkipInactiveOr
{
SkipInactiveOr(int round_, const Partners& partners_, const Skip& skip_):
round(round_), partners(partners_), skip(skip_) {}
bool operator()(int i, const Master& master) const { return !partners.active(round, master.gid(i), master) || skip(round, i, master); }
int round;
const Partners& partners;
Skip skip;
};
}
} // diy
#endif // DIY_REDUCE_HPP
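For reference, a sketch of driving this entry point with merge partners (RegularMergePartners lives in partners/merge.hpp, which is not shown in this hunk; master, assigner, decomposer, and Block are assumed to exist). The three-argument callback matches the Callback signature in ReductionFunctor above:
diy::RegularMergePartners partners(decomposer, /*k=*/2, /*contiguous=*/true);
diy::reduce(master, assigner, partners,
    [](Block* b, const diy::ReduceProxy& rp, const diy::RegularMergePartners&)
    {
      for (int i = 0; i < rp.in_link().size(); ++i)   // dequeue and combine
      { /* rp.dequeue(rp.in_link().target(i).gid, ...) */ }
      for (int i = 0; i < rp.out_link().size(); ++i)  // enqueue the partial result
      { /* rp.enqueue(rp.out_link().target(i), ...) */ }
    });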

@ -0,0 +1,456 @@
#ifndef DIY_SERIALIZATION_HPP
#define DIY_SERIALIZATION_HPP
#include <vector>
#include <valarray>
#include <map>
#include <set>
#include <string>
#include <fstream>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <type_traits> // this is used for a safety check for default serialization
namespace diy
{
//! A serialization buffer. \ingroup Serialization
struct BinaryBuffer
{
virtual void save_binary(const char* x, size_t count) =0; //!< copy `count` bytes from `x` into the buffer
virtual void load_binary(char* x, size_t count) =0; //!< copy `count` bytes into `x` from the buffer
virtual void load_binary_back(char* x, size_t count) =0; //!< copy `count` bytes into `x` from the back of the buffer
};
struct MemoryBuffer: public BinaryBuffer
{
MemoryBuffer(size_t position_ = 0):
position(position_) {}
virtual inline void save_binary(const char* x, size_t count) override; //!< copy `count` bytes from `x` into the buffer
virtual inline void load_binary(char* x, size_t count) override; //!< copy `count` bytes into `x` from the buffer
virtual inline void load_binary_back(char* x, size_t count) override; //!< copy `count` bytes into `x` from the back of the buffer
void clear() { buffer.clear(); reset(); }
void wipe() { std::vector<char>().swap(buffer); reset(); }
void reset() { position = 0; }
void skip(size_t s) { position += s; }
void swap(MemoryBuffer& o) { std::swap(position, o.position); buffer.swap(o.buffer); }
bool empty() const { return buffer.empty(); }
size_t size() const { return buffer.size(); }
void reserve(size_t s) { buffer.reserve(s); }
operator bool() const { return position < buffer.size(); }
    //! copy a memory buffer from one buffer to another without first making a temporary copy
inline static void copy(MemoryBuffer& from, MemoryBuffer& to);
//! multiplier used for the geometric growth of the container
static float growth_multiplier() { return 1.5; }
// simple file IO
void write(const std::string& fn) const { std::ofstream out(fn.c_str()); out.write(&buffer[0], size()); }
void read(const std::string& fn)
{
std::ifstream in(fn.c_str(), std::ios::binary | std::ios::ate);
buffer.resize(in.tellg());
in.seekg(0);
in.read(&buffer[0], size());
position = 0;
}
size_t position;
std::vector<char> buffer;
};
namespace detail
{
struct Default {};
}
//!\addtogroup Serialization
//!@{
/**
* \brief Main interface to serialization, meant to be specialized for the
* types that require special handling. `diy::save()` and `diy::load()` call
* the static member functions of this class.
*
* The default (unspecialized) version copies
* `sizeof(T)` bytes from `&x` to or from `bb` via
* its `diy::BinaryBuffer::save_binary()` and `diy::BinaryBuffer::load_binary()`
* functions. This works out perfectly for plain old data (e.g., simple structs).
* To save a more complicated type, one has to specialize
* `diy::Serialization<T>` for that type. Specializations are already provided for
* `std::vector<T>`, `std::map<K,V>`, and `std::pair<T,U>`.
 * As a result, one can quickly add a specialization of one's own.
*
*/
template<class T>
struct Serialization: public detail::Default
{
#if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5)
static_assert(std::is_trivially_copyable<T>::value, "Default serialization works only for trivially copyable types");
#endif
static void save(BinaryBuffer& bb, const T& x) { bb.save_binary((const char*) &x, sizeof(T)); }
static void load(BinaryBuffer& bb, T& x) { bb.load_binary((char*) &x, sizeof(T)); }
};
//! Saves `x` to `bb` by calling `diy::Serialization<T>::save(bb,x)`.
template<class T>
void save(BinaryBuffer& bb, const T& x) { Serialization<T>::save(bb, x); }
//! Loads `x` from `bb` by calling `diy::Serialization<T>::load(bb,x)`.
template<class T>
void load(BinaryBuffer& bb, T& x) { Serialization<T>::load(bb, x); }
//! Optimization for arrays. If `diy::Serialization` is not specialized for `T`,
//! the array will be copied all at once. Otherwise, it's copied element by element.
template<class T>
void save(BinaryBuffer& bb, const T* x, size_t n);
//! Optimization for arrays. If `diy::Serialization` is not specialized for `T`,
//! the array will be filled all at once. Otherwise, it's filled element by element.
template<class T>
void load(BinaryBuffer& bb, T* x, size_t n);
//! Supports only binary data copying (meant for simple footers).
template<class T>
void load_back(BinaryBuffer& bb, T& x) { bb.load_binary_back((char*) &x, sizeof(T)); }
//@}
namespace detail
{
template<typename T>
struct is_default
{
typedef char yes;
typedef int no;
static yes test(Default*);
static no test(...);
enum { value = (sizeof(test((T*) 0)) == sizeof(yes)) };
};
}
template<class T>
void save(BinaryBuffer& bb, const T* x, size_t n)
{
if (!detail::is_default< Serialization<T> >::value)
for (size_t i = 0; i < n; ++i)
diy::save(bb, x[i]);
else // if Serialization is not specialized for U, just save the binary data
bb.save_binary((const char*) &x[0], sizeof(T)*n);
}
template<class T>
void load(BinaryBuffer& bb, T* x, size_t n)
{
if (!detail::is_default< Serialization<T> >::value)
for (size_t i = 0; i < n; ++i)
diy::load(bb, x[i]);
else // if Serialization is not specialized for U, just load the binary data
bb.load_binary((char*) &x[0], sizeof(T)*n);
}
// save/load for MemoryBuffer
template<>
struct Serialization< MemoryBuffer >
{
static void save(BinaryBuffer& bb, const MemoryBuffer& x)
{
diy::save(bb, x.position);
diy::save(bb, &x.buffer[0], x.position);
}
static void load(BinaryBuffer& bb, MemoryBuffer& x)
{
diy::load(bb, x.position);
x.buffer.resize(x.position);
diy::load(bb, &x.buffer[0], x.position);
}
};
// save/load for std::vector<U>
template<class U>
struct Serialization< std::vector<U> >
{
typedef std::vector<U> Vector;
static void save(BinaryBuffer& bb, const Vector& v)
{
size_t s = v.size();
diy::save(bb, s);
diy::save(bb, &v[0], v.size());
}
static void load(BinaryBuffer& bb, Vector& v)
{
size_t s;
diy::load(bb, s);
v.resize(s);
diy::load(bb, &v[0], s);
}
};
template<class U>
struct Serialization< std::valarray<U> >
{
typedef std::valarray<U> ValArray;
static void save(BinaryBuffer& bb, const ValArray& v)
{
size_t s = v.size();
diy::save(bb, s);
diy::save(bb, &v[0], v.size());
}
static void load(BinaryBuffer& bb, ValArray& v)
{
size_t s;
diy::load(bb, s);
v.resize(s);
diy::load(bb, &v[0], s);
}
};
// save/load for std::string
template<>
struct Serialization< std::string >
{
typedef std::string String;
static void save(BinaryBuffer& bb, const String& s)
{
size_t sz = s.size();
diy::save(bb, sz);
diy::save(bb, s.c_str(), sz);
}
static void load(BinaryBuffer& bb, String& s)
{
size_t sz;
diy::load(bb, sz);
s.resize(sz);
for (size_t i = 0; i < sz; ++i)
{
char c;
diy::load(bb, c);
s[i] = c;
}
}
};
// save/load for std::pair<X,Y>
template<class X, class Y>
struct Serialization< std::pair<X,Y> >
{
typedef std::pair<X,Y> Pair;
static void save(BinaryBuffer& bb, const Pair& p)
{
diy::save(bb, p.first);
diy::save(bb, p.second);
}
static void load(BinaryBuffer& bb, Pair& p)
{
diy::load(bb, p.first);
diy::load(bb, p.second);
}
};
// save/load for std::map<K,V>
template<class K, class V>
struct Serialization< std::map<K,V> >
{
typedef std::map<K,V> Map;
static void save(BinaryBuffer& bb, const Map& m)
{
size_t s = m.size();
diy::save(bb, s);
for (typename std::map<K,V>::const_iterator it = m.begin(); it != m.end(); ++it)
diy::save(bb, *it);
}
static void load(BinaryBuffer& bb, Map& m)
{
size_t s;
diy::load(bb, s);
for (size_t i = 0; i < s; ++i)
{
K k;
diy::load(bb, k);
diy::load(bb, m[k]);
}
}
};
// save/load for std::set<T>
template<class T>
struct Serialization< std::set<T> >
{
typedef std::set<T> Set;
static void save(BinaryBuffer& bb, const Set& m)
{
size_t s = m.size();
diy::save(bb, s);
for (typename std::set<T>::const_iterator it = m.begin(); it != m.end(); ++it)
diy::save(bb, *it);
}
static void load(BinaryBuffer& bb, Set& m)
{
size_t s;
diy::load(bb, s);
for (size_t i = 0; i < s; ++i)
{
T p;
diy::load(bb, p);
m.insert(p);
}
}
};
// save/load for std::unordered_map<K,V,H,E,A>
template<class K, class V, class H, class E, class A>
struct Serialization< std::unordered_map<K,V,H,E,A> >
{
typedef std::unordered_map<K,V,H,E,A> Map;
static void save(BinaryBuffer& bb, const Map& m)
{
size_t s = m.size();
diy::save(bb, s);
for (auto& x : m)
diy::save(bb, x);
}
static void load(BinaryBuffer& bb, Map& m)
{
size_t s;
diy::load(bb, s);
for (size_t i = 0; i < s; ++i)
{
std::pair<K,V> p;
diy::load(bb, p);
m.emplace(std::move(p));
}
}
};
// save/load for std::unordered_set<T,H,E,A>
template<class T, class H, class E, class A>
struct Serialization< std::unordered_set<T,H,E,A> >
{
typedef std::unordered_set<T,H,E,A> Set;
static void save(BinaryBuffer& bb, const Set& m)
{
size_t s = m.size();
diy::save(bb, s);
for (auto& x : m)
diy::save(bb, x);
}
static void load(BinaryBuffer& bb, Set& m)
{
size_t s;
diy::load(bb, s);
for (size_t i = 0; i < s; ++i)
{
T p;
diy::load(bb, p);
m.emplace(std::move(p));
}
}
};
// save/load for std::tuple<...>
// TODO: this ought to be default (copying) serialization
// if all arguments are default
template<class... Args>
struct Serialization< std::tuple<Args...> >
{
typedef std::tuple<Args...> Tuple;
static void save(BinaryBuffer& bb, const Tuple& t) { save<0>(bb, t); }
template<std::size_t I = 0>
static
typename std::enable_if<I == sizeof...(Args), void>::type
save(BinaryBuffer&, const Tuple&) {}
template<std::size_t I = 0>
static
typename std::enable_if<I < sizeof...(Args), void>::type
save(BinaryBuffer& bb, const Tuple& t) { diy::save(bb, std::get<I>(t)); save<I+1>(bb, t); }
static void load(BinaryBuffer& bb, Tuple& t) { load<0>(bb, t); }
template<std::size_t I = 0>
static
typename std::enable_if<I == sizeof...(Args), void>::type
load(BinaryBuffer&, Tuple&) {}
template<std::size_t I = 0>
static
typename std::enable_if<I < sizeof...(Args), void>::type
load(BinaryBuffer& bb, Tuple& t) { diy::load(bb, std::get<I>(t)); load<I+1>(bb, t); }
};
}
void
diy::MemoryBuffer::
save_binary(const char* x, size_t count)
{
if (position + count > buffer.capacity())
buffer.reserve((position + count) * growth_multiplier()); // if we have to grow, grow geometrically
if (position + count > buffer.size())
buffer.resize(position + count);
std::copy(x, x + count, &buffer[position]);
position += count;
}
void
diy::MemoryBuffer::
load_binary(char* x, size_t count)
{
std::copy(&buffer[position], &buffer[position + count], x);
position += count;
}
void
diy::MemoryBuffer::
load_binary_back(char* x, size_t count)
{
std::copy(&buffer[buffer.size() - count], &buffer[buffer.size()], x);
buffer.resize(buffer.size() - count);
}
void
diy::MemoryBuffer::
copy(MemoryBuffer& from, MemoryBuffer& to)
{
size_t sz;
diy::load(from, sz);
from.position -= sizeof(size_t);
size_t total = sizeof(size_t) + sz;
to.buffer.resize(to.position + total);
std::copy(&from.buffer[from.position], &from.buffer[from.position + total], &to.buffer[to.position]);
to.position += total;
from.position += total;
}
#endif
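The doc comment above says non-trivially-copyable types need their own specialization; a minimal sketch (the Particle type is illustrative):
struct Particle { int id; std::vector<float> history; };

namespace diy
{
  template<>
  struct Serialization<Particle>
  {
    static void save(BinaryBuffer& bb, const Particle& p)
    { diy::save(bb, p.id); diy::save(bb, p.history); }     // reuse existing handlers
    static void load(BinaryBuffer& bb, Particle& p)
    { diy::load(bb, p.id); diy::load(bb, p.history); }     // same order as save
  };
}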

120
diy/include/diy/stats.hpp Normal file

@ -0,0 +1,120 @@
#ifndef DIY_STATS_HPP
#define DIY_STATS_HPP
#include <chrono>
#include <string>
#include <vector>
#include "log.hpp" // need this for format
#define DIY_PROFILE 1
namespace diy
{
namespace stats
{
#if defined(DIY_PROFILE)
struct Profiler
{
using Clock = std::chrono::high_resolution_clock;
using Time = Clock::time_point;
struct Event
{
Event(const std::string& name_, bool begin_):
name(name_),
begin(begin_),
stamp(Clock::now())
{}
std::string name;
bool begin;
Time stamp;
};
using EventsVector = std::vector<Event>;
struct Scoped
{
Scoped(Profiler& prof_, std::string name_):
prof(prof_), name(name_), active(true) { prof << name; }
~Scoped() { if (active) prof >> name; }
Scoped(Scoped&& other):
prof(other.prof),
name(other.name),
active(other.active) { other.active = false; }
Scoped&
operator=(Scoped&& other) = delete;
Scoped(const Scoped&) = delete;
Scoped&
operator=(const Scoped&) = delete;
Profiler& prof;
std::string name;
bool active;
};
Profiler() { reset_time(); }
void reset_time() { start = Clock::now(); }
void operator<<(std::string name) { enter(name); }
void operator>>(std::string name) { exit(name); }
void enter(std::string name) { events.push_back(Event(name, true)); }
void exit(std::string name) { events.push_back(Event(name, false)); }
void output(std::ostream& out)
{
for (size_t i = 0; i < events.size(); ++i)
{
const Event& e = events[i];
auto time = std::chrono::duration_cast<std::chrono::microseconds>(e.stamp - start).count();
fmt::print(out, "{} {} {}\n",
time / 1000000.,
(e.begin ? '<' : '>'),
e.name);
/*
fmt::print(out, "{:02d}:{:02d}:{:02d}.{:06d} {}{}\n",
time/1000000/60/60,
time/1000000/60 % 60,
time/1000000 % 60,
time % 1000000,
(e.begin ? '<' : '>'),
e.name);
*/
}
}
Scoped scoped(std::string name) { return Scoped(*this, name); }
void clear() { events.clear(); }
private:
Time start;
EventsVector events;
};
#else
struct Profiler
{
struct Scoped {};
void reset_time() {}
void operator<<(std::string) {}
void operator>>(std::string) {}
void enter(const std::string&) {}
void exit(const std::string&) {}
void output(std::ostream&) {}
void clear() {}
Scoped scoped(std::string) { return Scoped(); }
};
#endif
}
}
#endif
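A minimal sketch of the profiler above; the Scoped helper emits the '<' event on construction and the matching '>' on scope exit:
diy::stats::Profiler prof;
{
  auto s = prof.scoped("exchange");   // '<' event recorded here
  // ... timed work ...
}                                     // '>' event recorded here
prof.output(std::cout);               // one "seconds <|> name" line per event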

228
diy/include/diy/storage.hpp Normal file

@ -0,0 +1,228 @@
#ifndef DIY_STORAGE_HPP
#define DIY_STORAGE_HPP
#include <string>
#include <map>
#include <fstream>
#include <unistd.h> // mkstemp() on Mac
#include <cstdlib> // mkstemp() on Linux
#include <cstdio> // remove()
#include <fcntl.h>
#include "serialization.hpp"
#include "thread.hpp"
#include "log.hpp"
namespace diy
{
namespace detail
{
typedef void (*Save)(const void*, BinaryBuffer& buf);
typedef void (*Load)(void*, BinaryBuffer& buf);
struct FileBuffer: public BinaryBuffer
{
FileBuffer(FILE* file_): file(file_), head(0), tail(0) {}
// TODO: add error checking
virtual inline void save_binary(const char* x, size_t count) override { fwrite(x, 1, count, file); head += count; }
virtual inline void load_binary(char* x, size_t count) override { fread(x, 1, count, file); }
virtual inline void load_binary_back(char* x, size_t count) override { fseek(file, tail, SEEK_END); fread(x, 1, count, file); tail += count; fseek(file, head, SEEK_SET); }
size_t size() const { return head; }
FILE* file;
size_t head, tail; // tail is used to support reading from the back;
// the mechanism is a little awkward and unused, but should work if needed
};
}
class ExternalStorage
{
public:
virtual int put(MemoryBuffer& bb) =0;
virtual int put(const void* x, detail::Save save) =0;
virtual void get(int i, MemoryBuffer& bb, size_t extra = 0) =0;
virtual void get(int i, void* x, detail::Load load) =0;
virtual void destroy(int i) =0;
};
class FileStorage: public ExternalStorage
{
private:
struct FileRecord
{
size_t size;
std::string name;
};
public:
FileStorage(const std::string& filename_template = "/tmp/DIY.XXXXXX"):
filename_templates_(1, filename_template),
count_(0), current_size_(0), max_size_(0) {}
FileStorage(const std::vector<std::string>& filename_templates):
filename_templates_(filename_templates),
count_(0), current_size_(0), max_size_(0) {}
virtual int put(MemoryBuffer& bb) override
{
auto log = get_logger();
std::string filename;
int fh = open_random(filename);
log->debug("FileStorage::put(): {}; buffer size: {}", filename, bb.size());
size_t sz = bb.buffer.size();
size_t written = write(fh, &bb.buffer[0], sz);
if (written < sz || written == (size_t)-1)
log->warn("Could not write the full buffer to {}: written = {}; size = {}", filename, written, sz);
fsync(fh);
close(fh);
bb.wipe();
#if 0 // double-check the written file size: only for extreme debugging
FILE* fp = fopen(filename.c_str(), "r");
fseek(fp, 0L, SEEK_END);
int fsz = ftell(fp);
if (fsz != sz)
log->warn("file size doesn't match the buffer size, {} vs {}", fsz, sz);
fclose(fp);
#endif
return make_file_record(filename, sz);
}
virtual int put(const void* x, detail::Save save) override
{
std::string filename;
int fh = open_random(filename);
detail::FileBuffer fb(fdopen(fh, "w"));
save(x, fb);
size_t sz = fb.size();
fclose(fb.file);
fsync(fh);
return make_file_record(filename, sz);
}
virtual void get(int i, MemoryBuffer& bb, size_t extra) override
{
FileRecord fr = extract_file_record(i);
get_logger()->debug("FileStorage::get(): {}", fr.name);
bb.buffer.reserve(fr.size + extra);
bb.buffer.resize(fr.size);
int fh = open(fr.name.c_str(), O_RDONLY | O_SYNC, 0600);
read(fh, &bb.buffer[0], fr.size);
close(fh);
remove_file(fr);
}
virtual void get(int i, void* x, detail::Load load) override
{
FileRecord fr = extract_file_record(i);
//int fh = open(fr.name.c_str(), O_RDONLY | O_SYNC, 0600);
int fh = open(fr.name.c_str(), O_RDONLY, 0600);
detail::FileBuffer fb(fdopen(fh, "r"));
load(x, fb);
fclose(fb.file);
remove_file(fr);
}
virtual void destroy(int i) override
{
FileRecord fr;
{
CriticalMapAccessor accessor = filenames_.access();
fr = (*accessor)[i];
accessor->erase(i);
}
remove(fr.name.c_str());
(*current_size_.access()) -= fr.size;
}
int count() const { return (*count_.const_access()); }
size_t current_size() const { return (*current_size_.const_access()); }
size_t max_size() const { return (*max_size_.const_access()); }
~FileStorage()
{
for (FileRecordMap::const_iterator it = filenames_.const_access()->begin();
it != filenames_.const_access()->end();
++it)
{
remove(it->second.name.c_str());
}
}
private:
int open_random(std::string& filename) const
{
if (filename_templates_.size() == 1)
filename = filename_templates_[0].c_str();
else
{
// pick a template at random (very basic load balancing mechanism)
filename = filename_templates_[std::rand() % filename_templates_.size()].c_str();
}
#ifdef __MACH__
// TODO: figure out how to open with O_SYNC
int fh = mkstemp(const_cast<char*>(filename.c_str()));
#else
int fh = mkostemp(const_cast<char*>(filename.c_str()), O_WRONLY | O_SYNC);
#endif
return fh;
}
int make_file_record(const std::string& filename, size_t sz)
{
int res = (*count_.access())++;
FileRecord fr = { sz, filename };
(*filenames_.access())[res] = fr;
// keep track of sizes
critical_resource<size_t>::accessor cur = current_size_.access();
*cur += sz;
critical_resource<size_t>::accessor max = max_size_.access();
if (*cur > *max)
*max = *cur;
return res;
}
FileRecord extract_file_record(int i)
{
CriticalMapAccessor accessor = filenames_.access();
FileRecord fr = (*accessor)[i];
accessor->erase(i);
return fr;
}
void remove_file(const FileRecord& fr)
{
remove(fr.name.c_str());
(*current_size_.access()) -= fr.size;
}
private:
typedef std::map<int, FileRecord> FileRecordMap;
typedef critical_resource<FileRecordMap> CriticalMap;
typedef CriticalMap::accessor CriticalMapAccessor;
private:
std::vector<std::string> filename_templates_;
CriticalMap filenames_;
critical_resource<int> count_;
critical_resource<size_t> current_size_, max_size_;
};
}
#endif
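A round-trip sketch for the storage interface above (note that put() wipes the source buffer, and FileStorage::get() does not repeat the base class's default argument for extra, so it is passed explicitly):
diy::FileStorage storage("/tmp/DIY.XXXXXX");
diy::MemoryBuffer bb;
std::vector<int> v(16, 7);
diy::save(bb, v);
int id = storage.put(bb);        // spills to a mkstemp file; bb is wiped
diy::MemoryBuffer back;
storage.get(id, back, 0);        // reads the file back and removes it
std::vector<int> w;
diy::load(back, w);              // w now equals v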

@ -0,0 +1,31 @@
#ifndef DIY_THREAD_H
#define DIY_THREAD_H
#ifdef DIY_NO_THREADS
#include "no-thread.hpp"
#else
#include "thread/fast_mutex.h"
#include <thread>
#include <mutex>
namespace diy
{
using std::thread;
using std::mutex;
using std::recursive_mutex;
namespace this_thread = std::this_thread;
// TODO: replace with our own implementation using std::atomic_flag
using fast_mutex = tthread::fast_mutex;
template<class Mutex>
using lock_guard = std::unique_lock<Mutex>;
}
#endif
#include "critical-resource.hpp"
#endif
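The aliases drop in for the standard names; a minimal sketch:
diy::fast_mutex m;
{
  diy::lock_guard<diy::fast_mutex> lock(m);   // std::unique_lock under the hood
  // critical section
}                                             // unlocked on scope exit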

@ -0,0 +1,248 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
Copyright (c) 2010-2012 Marcus Geelnard
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
#ifndef _FAST_MUTEX_H_
#define _FAST_MUTEX_H_
/// @file
// Which platform are we on?
#if !defined(_TTHREAD_PLATFORM_DEFINED_)
#if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
#define _TTHREAD_WIN32_
#else
#define _TTHREAD_POSIX_
#endif
#define _TTHREAD_PLATFORM_DEFINED_
#endif
// Check if we can support the assembly language level implementation (otherwise
// revert to the system API)
#if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
(defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || \
(defined(__GNUC__) && (defined(__ppc__)))
#define _FAST_MUTEX_ASM_
#else
#define _FAST_MUTEX_SYS_
#endif
#if defined(_TTHREAD_WIN32_)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#define __UNDEF_LEAN_AND_MEAN
#endif
#include <windows.h>
#ifdef __UNDEF_LEAN_AND_MEAN
#undef WIN32_LEAN_AND_MEAN
#undef __UNDEF_LEAN_AND_MEAN
#endif
#else
#ifdef _FAST_MUTEX_ASM_
#include <sched.h>
#else
#include <pthread.h>
#endif
#endif
namespace tthread {
/// Fast mutex class.
/// This is a mutual exclusion object for synchronizing access to shared
/// memory areas for several threads. It is similar to the tthread::mutex class,
/// but instead of using system level functions, it is implemented as an atomic
/// spin lock with very low CPU overhead.
///
/// The \c fast_mutex class is NOT compatible with the \c condition_variable
/// class (however, it IS compatible with the \c lock_guard class). It should
/// also be noted that the \c fast_mutex class typically does not provide
/// as accurate thread scheduling as the standard \c mutex class does.
///
/// Because of the limitations of the class, it should only be used in
/// situations where the mutex needs to be locked/unlocked very frequently.
///
/// @note The "fast" version of this class relies on inline assembler language,
/// which is currently only supported for 32/64-bit Intel x86/AMD64 and
/// PowerPC architectures on a limited number of compilers (GNU g++ and MS
/// Visual C++).
/// For other architectures/compilers, system functions are used instead.
class fast_mutex {
public:
/// Constructor.
#if defined(_FAST_MUTEX_ASM_)
fast_mutex() : mLock(0) {}
#else
fast_mutex()
{
#if defined(_TTHREAD_WIN32_)
InitializeCriticalSection(&mHandle);
#elif defined(_TTHREAD_POSIX_)
pthread_mutex_init(&mHandle, NULL);
#endif
}
#endif
#if !defined(_FAST_MUTEX_ASM_)
/// Destructor.
~fast_mutex()
{
#if defined(_TTHREAD_WIN32_)
DeleteCriticalSection(&mHandle);
#elif defined(_TTHREAD_POSIX_)
pthread_mutex_destroy(&mHandle);
#endif
}
#endif
/// Lock the mutex.
/// The method will block the calling thread until a lock on the mutex can
/// be obtained. The mutex remains locked until \c unlock() is called.
/// @see lock_guard
inline void lock()
{
#if defined(_FAST_MUTEX_ASM_)
bool gotLock;
do {
gotLock = try_lock();
if(!gotLock)
{
#if defined(_TTHREAD_WIN32_)
Sleep(0);
#elif defined(_TTHREAD_POSIX_)
sched_yield();
#endif
}
} while(!gotLock);
#else
#if defined(_TTHREAD_WIN32_)
EnterCriticalSection(&mHandle);
#elif defined(_TTHREAD_POSIX_)
pthread_mutex_lock(&mHandle);
#endif
#endif
}
/// Try to lock the mutex.
/// The method will try to lock the mutex. If it fails, the function will
/// return immediately (non-blocking).
/// @return \c true if the lock was acquired, or \c false if the lock could
/// not be acquired.
inline bool try_lock()
{
#if defined(_FAST_MUTEX_ASM_)
int oldLock;
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
asm volatile (
"movl $1,%%eax\n\t"
"xchg %%eax,%0\n\t"
"movl %%eax,%1\n\t"
: "=m" (mLock), "=m" (oldLock)
:
: "%eax", "memory"
);
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
int *ptrLock = &mLock;
__asm {
mov eax,1
mov ecx,ptrLock
xchg eax,[ecx]
mov oldLock,eax
}
#elif defined(__GNUC__) && (defined(__ppc__))
int newLock = 1;
asm volatile (
"\n1:\n\t"
"lwarx %0,0,%1\n\t"
"cmpwi 0,%0,0\n\t"
"bne- 2f\n\t"
"stwcx. %2,0,%1\n\t"
"bne- 1b\n\t"
"isync\n"
"2:\n\t"
: "=&r" (oldLock)
: "r" (&mLock), "r" (newLock)
: "cr0", "memory"
);
#endif
return (oldLock == 0);
#else
#if defined(_TTHREAD_WIN32_)
return TryEnterCriticalSection(&mHandle) ? true : false;
#elif defined(_TTHREAD_POSIX_)
return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
#endif
#endif
}
/// Unlock the mutex.
/// If any threads are waiting for the lock on this mutex, one of them will
/// be unblocked.
inline void unlock()
{
#if defined(_FAST_MUTEX_ASM_)
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
asm volatile (
"movl $0,%%eax\n\t"
"xchg %%eax,%0\n\t"
: "=m" (mLock)
:
: "%eax", "memory"
);
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
int *ptrLock = &mLock;
__asm {
mov eax,0
mov ecx,ptrLock
xchg eax,[ecx]
}
#elif defined(__GNUC__) && (defined(__ppc__))
asm volatile (
"sync\n\t" // Replace with lwsync where possible?
: : : "memory"
);
mLock = 0;
#endif
#else
#if defined(_TTHREAD_WIN32_)
LeaveCriticalSection(&mHandle);
#elif defined(_TTHREAD_POSIX_)
pthread_mutex_unlock(&mHandle);
#endif
#endif
}
private:
#if defined(_FAST_MUTEX_ASM_)
int mLock;
#else
#if defined(_TTHREAD_WIN32_)
CRITICAL_SECTION mHandle;
#elif defined(_TTHREAD_POSIX_)
pthread_mutex_t mHandle;
#endif
#endif
};
}
#endif // _FAST_MUTEX_H_

33
diy/include/diy/time.hpp Normal file

@ -0,0 +1,33 @@
#ifndef DIY_TIME_HPP
#define DIY_TIME_HPP
#include <sys/time.h>
#ifdef __MACH__
#include <mach/clock.h>
#include <mach/mach.h>
#endif
namespace diy
{
typedef unsigned long time_type;
inline time_type get_time()
{
#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
clock_serv_t cclock;
mach_timespec_t ts;
host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
clock_get_time(cclock, &ts);
mach_port_deallocate(mach_task_self(), cclock);
#else
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
#endif
return ts.tv_sec*1000 + ts.tv_nsec/1000000;
}
}
#endif
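Usage is a simple difference of two samples; note that the return value is in milliseconds:
diy::time_type t0 = diy::get_time();
// ... work ...
diy::time_type elapsed_ms = diy::get_time() - t0;   // wall-clock milliseconds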

85
diy/include/diy/types.hpp Normal file

@ -0,0 +1,85 @@
#ifndef DIY_TYPES_HPP
#define DIY_TYPES_HPP
#include <iostream>
#include "constants.h"
#include "point.hpp"
namespace diy
{
struct BlockID
{
int gid, proc;
};
template<class Coordinate_>
struct Bounds
{
using Coordinate = Coordinate_;
Point<Coordinate, DIY_MAX_DIM> min, max;
};
using DiscreteBounds = Bounds<int>;
using ContinuousBounds = Bounds<float>;
//! Helper to create a 1-dimensional discrete domain with the specified extents
inline
diy::DiscreteBounds
interval(int from, int to) { DiscreteBounds domain; domain.min[0] = from; domain.max[0] = to; return domain; }
struct Direction: public Point<int,DIY_MAX_DIM>
{
Direction() { for (int i = 0; i < DIY_MAX_DIM; ++i) (*this)[i] = 0; }
Direction(int dir)
{
for (int i = 0; i < DIY_MAX_DIM; ++i) (*this)[i] = 0;
if (dir & DIY_X0) (*this)[0] -= 1;
if (dir & DIY_X1) (*this)[0] += 1;
if (dir & DIY_Y0) (*this)[1] -= 1;
if (dir & DIY_Y1) (*this)[1] += 1;
if (dir & DIY_Z0) (*this)[2] -= 1;
if (dir & DIY_Z1) (*this)[2] += 1;
if (dir & DIY_T0) (*this)[3] -= 1;
if (dir & DIY_T1) (*this)[3] += 1;
}
bool
operator==(const diy::Direction& y) const
{
for (int i = 0; i < DIY_MAX_DIM; ++i)
if ((*this)[i] != y[i]) return false;
return true;
}
// lexicographic comparison
bool
operator<(const diy::Direction& y) const
{
for (int i = 0; i < DIY_MAX_DIM; ++i)
{
if ((*this)[i] < y[i]) return true;
if ((*this)[i] > y[i]) return false;
}
return false;
}
};
// Selector of bounds value type
template<class Bounds_>
struct BoundsValue
{
using type = typename Bounds_::Coordinate;
};
inline
bool
operator<(const diy::BlockID& x, const diy::BlockID& y)
{ return x.gid < y.gid; }
inline
bool
operator==(const diy::BlockID& x, const diy::BlockID& y)
{ return x.gid == y.gid; }
}
#endif
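A small sketch of the helpers above (the comparisons in the comments follow the operators defined in this file):
diy::DiscreteBounds box = diy::interval(0, 99);  // 1-D domain: min[0] == 0, max[0] == 99
diy::Direction left(DIY_X0);                     // unit step in -x: left[0] == -1
diy::Direction none;                             // zero vector
// lexicographic order: left < none, since -1 < 0 in the first component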

@ -0,0 +1,54 @@
#ifndef DIY_VERTICES_HPP
#define DIY_VERTICES_HPP
#include <iterator>
namespace diy
{
namespace detail
{
template<class Vertex, size_t I>
struct IsLast
{
static constexpr bool value = (Vertex::dimension() - 1 == I);
};
template<class Vertex, class Callback, size_t I, bool P>
struct ForEach
{
void operator()(Vertex& pos, const Vertex& from, const Vertex& to, const Callback& callback) const
{
for (pos[I] = from[I]; pos[I] <= to[I]; ++pos[I])
ForEach<Vertex, Callback, I+1, IsLast<Vertex,I+1>::value>()(pos, from, to, callback);
}
};
template<class Vertex, class Callback, size_t I>
struct ForEach<Vertex,Callback,I,true>
{
void operator()(Vertex& pos, const Vertex& from, const Vertex& to, const Callback& callback) const
{
for (pos[I] = from[I]; pos[I] <= to[I]; ++pos[I])
callback(pos);
}
};
}
template<class Vertex, class Callback>
void for_each(const Vertex& from, const Vertex& to, const Callback& callback)
{
Vertex pos;
detail::ForEach<Vertex, Callback, 0, detail::IsLast<Vertex,0>::value>()(pos, from, to, callback);
}
template<class Vertex, class Callback>
void for_each(const Vertex& shape, const Callback& callback)
{
// specify diy namespace to disambiguate with std::for_each(...)
diy::for_each(Vertex::zero(), shape - Vertex::one(), callback);
}
}
#endif
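A sketch of the traversal, with the visit order implied by the recursion above (dimension 0 is the outermost loop):
diy::Point<int,2> from, to;
from[0] = 0; from[1] = 0;
to[0]   = 1; to[1]   = 2;
diy::for_each(from, to, [](const diy::Point<int,2>& p)
    { /* visits (0,0), (0,1), (0,2), (1,0), (1,1), (1,2) */ });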

@ -357,7 +357,7 @@ int main(int argc, char** argv)
vtkm::cont::DataSetBuilderUniform builder;
vtkm::cont::DataSet data = builder.Create(vtkm::Id2(x, y));
-  vtkm::cont::Field stateField("state", vtkm::cont::Field::ASSOC_POINTS, input_state);
+  auto stateField = vtkm::cont::make_Field("state", vtkm::cont::Field::ASSOC_POINTS, input_state);
data.AddField(stateField);
GameOfLife filter;

@ -34,6 +34,7 @@ set(headers
Bounds.h
CellShape.h
CellTraits.h
Flags.h
Hash.h
ImplicitFunction.h
ListTag.h

33
vtkm/Flags.h Normal file

@ -0,0 +1,33 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_Flags_h
#define vtk_m_Flags_h
namespace vtkm
{
enum class CopyFlag
{
Off = 0,
On = 1
};
}
#endif // vtk_m_Flags_h
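CopyFlag feeds factory helpers such as the make_Field call in the GameOfLife hunk above; the make_ArrayHandle overload below is an assumption used for illustration (verify against the headers in this commit):
std::vector<vtkm::Float32> data(100, 1.0f);
// CopyFlag::On requests a deep copy; CopyFlag::Off would alias the caller's memory
auto handle = vtkm::cont::make_ArrayHandle(data, vtkm::CopyFlag::On);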

@ -94,7 +94,7 @@ VTKM_CONT void ListForEach(Functor&& f, ListTag, Args&&... args)
}
/// Generate a tag that is the cross product of two other tags. The resulting
-// a tag has the form of Tag< std::pair<A1,B1>, std::pair<A1,B2> .... >
+// a tag has the form of Tag< brigand::list<A1,B1>, brigand::list<A1,B2> .... >
///
template <typename ListTag1, typename ListTag2>
struct ListCrossProduct : detail::ListRoot

@ -0,0 +1,113 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2017 UT-Battelle, LLC.
// Copyright 2017 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/benchmarking/Benchmarker.h>
#include <vtkm/TypeTraits.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/testing/MakeTestDataSet.h>
#include <vtkm/rendering/Camera.h>
#include <vtkm/rendering/internal/RunTriangulator.h>
#include <vtkm/rendering/raytracing/Ray.h>
#include <vtkm/rendering/raytracing/RayTracer.h>
#include <vtkm/exec/FunctorBase.h>
#include <sstream>
#include <string>
#include <vector>
using namespace vtkm::benchmarking;
namespace vtkm
{
namespace benchmarking
{
template <typename Precision>
struct BenchRayTracing
{
vtkm::rendering::raytracing::RayTracer Tracer;
vtkm::rendering::raytracing::Camera RayCamera;
vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Id, 4>> Indices;
vtkm::rendering::raytracing::Ray<Precision> Rays;
vtkm::Id NumberOfTriangles;
vtkm::cont::CoordinateSystem Coords;
vtkm::cont::DataSet Data;
VTKM_CONT BenchRayTracing()
{
vtkm::cont::testing::MakeTestDataSet maker;
Data = maker.Make3DUniformDataSet2();
Coords = Data.GetCoordinateSystem();
vtkm::rendering::Camera camera;
vtkm::Bounds bounds = Data.GetCoordinateSystem().GetBounds();
camera.ResetToBounds(bounds);
vtkm::cont::DynamicCellSet cellset = Data.GetCellSet();
vtkm::rendering::internal::RunTriangulator(cellset, Indices, NumberOfTriangles);
vtkm::rendering::CanvasRayTracer canvas(1920, 1080);
RayCamera.SetParameters(camera, canvas);
RayCamera.CreateRays(Rays, Coords);
Rays.Buffers.at(0).InitConst(0.f);
vtkm::cont::Field field = Data.GetField("pointvar");
vtkm::Range range = field.GetRange().GetPortalConstControl().Get(0);
Tracer.SetData(Coords.GetData(), Indices, field, NumberOfTriangles, range, bounds);
vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 4>> colors;
vtkm::rendering::ColorTable("cool2warm").Sample(100, colors);
Tracer.SetColorMap(colors);
Tracer.Render(Rays);
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::Timer<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> timer;
RayCamera.CreateRays(Rays, Coords);
Tracer.Render(Rays);
return timer.GetElapsedTime();
}
VTKM_CONT
std::string Description() const { return "A ray tracing benchmark"; }
};
VTKM_MAKE_BENCHMARK(RayTracing, BenchRayTracing);
}
} // end namespace vtkm::benchmarking
int main(int, char* [])
{
using TestTypes = vtkm::ListTagBase<vtkm::Float32>;
VTKM_RUN_BENCHMARK(RayTracing, vtkm::ListTagBase<vtkm::Float32>());
return 0;
}

@ -25,16 +25,10 @@ set(benchmarks
BenchmarkFieldAlgorithms
BenchmarkTopologyAlgorithms
)
#set(benchmark_files
# BenchmarkArrayTransfer.cxx
# BenchmarkCopySpeeds.cxx
# BenchmarkDeviceAdapter.cxx
# BenchmarkFieldAlgorithms.cxx
# BenchmarkTopologyAlgorithms.cxx
# )
#set(benchmark_headers
# Benchmarker.h
# )
if(TARGET vtkm_rendering)
list(APPEND benchmarks BenchmarkRayTracing)
endif()
function(add_benchmark name files)
add_executable(${name}_SERIAL ${files})

537
vtkm/cont/Algorithm.h Normal file

@ -0,0 +1,537 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_Algorithm_h
#define vtk_m_cont_Algorithm_h
#include <vtkm/Types.h>
#include <vtkm/cont/TryExecute.h>
#include <vtkm/cont/internal/ArrayManagerExecution.h>
#include <vtkm/cont/internal/DeviceAdapterTag.h>
namespace vtkm
{
namespace cont
{
namespace
{
struct CopyFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::Copy(std::forward<Args>(args)...);
return true;
}
};
struct CopyIfFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::CopyIf(std::forward<Args>(args)...);
return true;
}
};
struct CopySubRangeFunctor
{
bool valid;
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
valid = vtkm::cont::DeviceAdapterAlgorithm<Device>::CopySubRange(std::forward<Args>(args)...);
return true;
}
};
struct LowerBoundsFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::LowerBounds(std::forward<Args>(args)...);
return true;
}
};
template <typename U>
struct ReduceFunctor
{
U result;
ReduceFunctor()
: result(U(0))
{
}
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
result = vtkm::cont::DeviceAdapterAlgorithm<Device>::Reduce(std::forward<Args>(args)...);
return true;
}
};
struct ReduceByKeyFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::ReduceByKey(std::forward<Args>(args)...);
return true;
}
};
template <typename T>
struct ScanInclusiveResultFunctor
{
T result;
ScanInclusiveResultFunctor()
: result(T(0))
{
}
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
result = vtkm::cont::DeviceAdapterAlgorithm<Device>::ScanInclusive(std::forward<Args>(args)...);
return true;
}
};
template <typename T>
struct StreamingScanExclusiveFunctor
{
T result;
StreamingScanExclusiveFunctor()
: result(T(0))
{
}
template <typename Device, class CIn, class COut>
VTKM_CONT bool operator()(Device,
const vtkm::Id numBlocks,
const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
result =
vtkm::cont::DeviceAdapterAlgorithm<Device>::StreamingScanExclusive(numBlocks, input, output);
return true;
}
};
struct ScanInclusiveByKeyFunctor
{
ScanInclusiveByKeyFunctor() {}
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::ScanInclusiveByKey(std::forward<Args>(args)...);
return true;
}
};
template <typename T>
struct ScanExclusiveFunctor
{
T result;
ScanExclusiveFunctor()
: result(T(0))
{
}
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
result = vtkm::cont::DeviceAdapterAlgorithm<Device>::ScanExclusive(std::forward<Args>(args)...);
return true;
}
};
struct ScanExclusiveByKeyFunctor
{
ScanExclusiveByKeyFunctor() {}
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::ScanExclusiveByKey(std::forward<Args>(args)...);
return true;
}
};
struct ScheduleFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::Schedule(std::forward<Args>(args)...);
return true;
}
};
struct SortFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::Sort(std::forward<Args>(args)...);
return true;
}
};
struct SortByKeyFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::SortByKey(std::forward<Args>(args)...);
return true;
}
};
struct SynchronizeFunctor
{
template <typename Device>
VTKM_CONT bool operator()(Device)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::Synchronize();
return true;
}
};
struct UniqueFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::Unique(std::forward<Args>(args)...);
return true;
}
};
struct UpperBoundsFunctor
{
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args) const
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
vtkm::cont::DeviceAdapterAlgorithm<Device>::UpperBounds(std::forward<Args>(args)...);
return true;
}
};
} // anonymous namespace
struct Algorithm
{
template <typename T, typename U, class CIn, class COut>
VTKM_CONT static void Copy(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<U, COut>& output)
{
vtkm::cont::TryExecute(CopyFunctor(), input, output);
}
template <typename T, typename U, class CIn, class CStencil, class COut>
VTKM_CONT static void CopyIf(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<U, CStencil>& stencil,
vtkm::cont::ArrayHandle<T, COut>& output)
{
vtkm::cont::TryExecute(CopyIfFunctor(), input, stencil, output);
}
template <typename T, typename U, class CIn, class CStencil, class COut, class UnaryPredicate>
VTKM_CONT static void CopyIf(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<U, CStencil>& stencil,
vtkm::cont::ArrayHandle<T, COut>& output,
UnaryPredicate unary_predicate)
{
vtkm::cont::TryExecute(CopyIfFunctor(), input, stencil, output, unary_predicate);
}
template <typename T, typename U, class CIn, class COut>
VTKM_CONT static bool CopySubRange(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::Id inputStartIndex,
vtkm::Id numberOfElementsToCopy,
vtkm::cont::ArrayHandle<U, COut>& output,
vtkm::Id outputIndex = 0)
{
CopySubRangeFunctor functor;
vtkm::cont::TryExecute(
functor, input, inputStartIndex, numberOfElementsToCopy, output, outputIndex);
return functor.valid;
}
template <typename T, class CIn, class CVal, class COut>
VTKM_CONT static void LowerBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<T, CVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id, COut>& output)
{
vtkm::cont::TryExecute(LowerBoundsFunctor(), input, values, output);
}
template <typename T, class CIn, class CVal, class COut, class BinaryCompare>
VTKM_CONT static void LowerBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<T, CVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id, COut>& output,
BinaryCompare binary_compare)
{
vtkm::cont::TryExecute(LowerBoundsFunctor(), input, values, output, binary_compare);
}
template <class CIn, class COut>
VTKM_CONT static void LowerBounds(const vtkm::cont::ArrayHandle<vtkm::Id, CIn>& input,
vtkm::cont::ArrayHandle<vtkm::Id, COut>& values_output)
{
vtkm::cont::TryExecute(LowerBoundsFunctor(), input, values_output);
}
template <typename T, typename U, class CIn>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input, U initialValue)
{
ReduceFunctor<U> functor;
vtkm::cont::TryExecute(functor, input, initialValue);
return functor.result;
}
template <typename T, typename U, class CIn, class BinaryFunctor>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input,
U initialValue,
BinaryFunctor binary_functor)
{
ReduceFunctor<U> functor;
vtkm::cont::TryExecute(functor, input, initialValue, binary_functor);
return functor.result;
}
template <typename T,
typename U,
class CKeyIn,
class CValIn,
class CKeyOut,
class CValOut,
class BinaryFunctor>
VTKM_CONT static void ReduceByKey(const vtkm::cont::ArrayHandle<T, CKeyIn>& keys,
const vtkm::cont::ArrayHandle<U, CValIn>& values,
vtkm::cont::ArrayHandle<T, CKeyOut>& keys_output,
vtkm::cont::ArrayHandle<U, CValOut>& values_output,
BinaryFunctor binary_functor)
{
vtkm::cont::TryExecute(
ReduceByKeyFunctor(), keys, values, keys_output, values_output, binary_functor);
}
template <typename T, class CIn, class COut>
VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output)
{
ScanInclusiveResultFunctor<T> functor;
vtkm::cont::TryExecute(functor, input, output);
return functor.result;
}
template <typename T, class CIn, class COut>
VTKM_CONT static T StreamingScanExclusive(const vtkm::Id numBlocks,
const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output)
{
StreamingScanExclusiveFunctor<T> functor;
vtkm::cont::TryExecute(functor, numBlocks, input, output);
return functor.result;
}
template <typename T, class CIn, class COut, class BinaryFunctor>
VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output,
BinaryFunctor binary_functor)
{
ScanInclusiveResultFunctor<T> functor;
vtkm::cont::TryExecute(functor, input, output, binary_functor);
return functor.result;
}
template <typename T,
typename U,
typename KIn,
typename VIn,
typename VOut,
typename BinaryFunctor>
VTKM_CONT static void ScanInclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
const vtkm::cont::ArrayHandle<U, VIn>& values,
vtkm::cont::ArrayHandle<U, VOut>& values_output,
BinaryFunctor binary_functor)
{
vtkm::cont::TryExecute(
ScanInclusiveByKeyFunctor(), keys, values, values_output, binary_functor);
}
template <typename T, typename U, typename KIn, typename VIn, typename VOut>
VTKM_CONT static void ScanInclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
const vtkm::cont::ArrayHandle<U, VIn>& values,
vtkm::cont::ArrayHandle<U, VOut>& values_output)
{
vtkm::cont::TryExecute(ScanInclusiveByKeyFunctor(), keys, values, values_output);
}
template <typename T, class CIn, class COut>
VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output)
{
ScanExclusiveFunctor<T> functor;
vtkm::cont::TryExecute(functor, input, output);
return functor.result;
}
template <typename T, class CIn, class COut, class BinaryFunctor>
VTKM_CONT static T ScanExclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output,
BinaryFunctor binaryFunctor,
const T& initialValue)
{
ScanExclusiveFunctor<T> functor;
vtkm::cont::TryExecute(functor, input, output, binaryFunctor, initialValue);
return functor.result;
}
template <typename T, typename U, typename KIn, typename VIn, typename VOut, class BinaryFunctor>
VTKM_CONT static void ScanExclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
const vtkm::cont::ArrayHandle<U, VIn>& values,
vtkm::cont::ArrayHandle<U, VOut>& output,
const U& initialValue,
BinaryFunctor binaryFunctor)
{
vtkm::cont::TryExecute(
ScanExclusiveByKeyFunctor(), keys, values, output, initialValue, binaryFunctor);
}
template <typename T, typename U, class KIn, typename VIn, typename VOut>
VTKM_CONT static void ScanExclusiveByKey(const vtkm::cont::ArrayHandle<T, KIn>& keys,
const vtkm::cont::ArrayHandle<U, VIn>& values,
vtkm::cont::ArrayHandle<U, VOut>& output)
{
vtkm::cont::TryExecute(ScanExclusiveByKeyFunctor(), keys, values, output);
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, vtkm::Id numInstances)
{
vtkm::cont::TryExecute(ScheduleFunctor(), functor, numInstances);
}
template <class Functor>
VTKM_CONT static void Schedule(Functor functor, vtkm::Id3 rangeMax)
{
vtkm::cont::TryExecute(ScheduleFunctor(), functor, rangeMax);
}
template <typename T, class Storage>
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values)
{
vtkm::cont::TryExecute(SortFunctor(), values);
}
template <typename T, class Storage, class BinaryCompare>
VTKM_CONT static void Sort(vtkm::cont::ArrayHandle<T, Storage>& values,
BinaryCompare binary_compare)
{
vtkm::cont::TryExecute(SortFunctor(), values, binary_compare);
}
template <typename T, typename U, class StorageT, class StorageU>
VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values)
{
vtkm::cont::TryExecute(SortByKeyFunctor(), keys, values);
}
template <typename T, typename U, class StorageT, class StorageU, class BinaryCompare>
VTKM_CONT static void SortByKey(vtkm::cont::ArrayHandle<T, StorageT>& keys,
vtkm::cont::ArrayHandle<U, StorageU>& values,
BinaryCompare binary_compare)
{
vtkm::cont::TryExecute(SortByKeyFunctor(), keys, values, binary_compare);
}
VTKM_CONT static void Synchronize() { vtkm::cont::TryExecute(SynchronizeFunctor()); }
template <typename T, class Storage>
VTKM_CONT static void Unique(vtkm::cont::ArrayHandle<T, Storage>& values)
{
vtkm::cont::TryExecute(UniqueFunctor(), values);
}
template <typename T, class Storage, class BinaryCompare>
VTKM_CONT static void Unique(vtkm::cont::ArrayHandle<T, Storage>& values,
BinaryCompare binary_compare)
{
vtkm::cont::TryExecute(UniqueFunctor(), values, binary_compare);
}
template <typename T, class CIn, class CVal, class COut>
VTKM_CONT static void UpperBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<T, CVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id, COut>& output)
{
vtkm::cont::TryExecute(UpperBoundsFunctor(), input, values, output);
}
template <typename T, class CIn, class CVal, class COut, class BinaryCompare>
VTKM_CONT static void UpperBounds(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<T, CVal>& values,
vtkm::cont::ArrayHandle<vtkm::Id, COut>& output,
BinaryCompare binary_compare)
{
vtkm::cont::TryExecute(UpperBoundsFunctor(), input, values, output, binary_compare);
}
template <class CIn, class COut>
VTKM_CONT static void UpperBounds(const vtkm::cont::ArrayHandle<vtkm::Id, CIn>& input,
vtkm::cont::ArrayHandle<vtkm::Id, COut>& values_output)
{
vtkm::cont::TryExecute(UpperBoundsFunctor(), input, values_output);
}
};
}
} // namespace vtkm::cont
#endif //vtk_m_cont_Algorithm_h
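Taken together, these wrappers let application code run device-agnostic algorithms: each call goes through vtkm::cont::TryExecute, which tries the runtime's device adapters in order and uses the first one that succeeds. A minimal usage sketch (assuming a build with at least one enabled device adapter):

#include <vtkm/cont/Algorithm.h>
#include <vtkm/cont/ArrayHandle.h>

void SortAndReduce()
{
  vtkm::Float32 raw[] = { 3.0f, 1.0f, 2.0f };
  // Copy the C array so the handle owns its memory.
  vtkm::cont::ArrayHandle<vtkm::Float32> data =
    vtkm::cont::make_ArrayHandle(raw, 3, vtkm::CopyFlag::On);

  vtkm::cont::Algorithm::Sort(data);                             // { 1, 2, 3 }
  vtkm::Float32 sum = vtkm::cont::Algorithm::Reduce(data, 0.0f); // 6.0f
  (void)sum;
}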

@ -23,6 +23,7 @@
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/Assert.h>
#include <vtkm/Flags.h>
#include <vtkm/Types.h>
#include <vtkm/cont/ArrayPortalToIterators.h>
@ -31,6 +32,7 @@
#include <vtkm/cont/Storage.h>
#include <vtkm/cont/StorageBasic.h>
#include <algorithm>
#include <iterator>
#include <memory>
#include <vector>
@ -503,23 +505,35 @@ public:
/// A convenience function for creating an ArrayHandle from a standard C array.
///
template <typename T>
VTKM_CONT vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> make_ArrayHandle(const T* array,
vtkm::Id length)
VTKM_CONT vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>
make_ArrayHandle(const T* array, vtkm::Id length, vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
using ArrayHandleType = vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>;
using StorageType = vtkm::cont::internal::Storage<T, vtkm::cont::StorageTagBasic>;
return ArrayHandleType(StorageType(array, length));
if (copy == vtkm::CopyFlag::On)
{
ArrayHandleType handle;
handle.Allocate(length);
std::copy(
array, array + length, vtkm::cont::ArrayPortalToIteratorBegin(handle.GetPortalControl()));
return handle;
}
else
{
using StorageType = vtkm::cont::internal::Storage<T, vtkm::cont::StorageTagBasic>;
return ArrayHandleType(StorageType(array, length));
}
}
/// A convenience function for creating an ArrayHandle from an std::vector.
///
template <typename T, typename Allocator>
VTKM_CONT vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> make_ArrayHandle(
const std::vector<T, Allocator>& array)
const std::vector<T, Allocator>& array,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
if (!array.empty())
{
return make_ArrayHandle(&array.front(), static_cast<vtkm::Id>(array.size()));
return make_ArrayHandle(&array.front(), static_cast<vtkm::Id>(array.size()), copy);
}
else
{

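The new vtkm::CopyFlag parameter makes the ownership decision explicit: Off (the default, preserving the old behavior) wraps the caller's memory without copying, while On allocates handle-owned storage and copies into it. A small sketch of why the flag matters for lifetimes:

#include <vtkm/cont/ArrayHandle.h>

#include <vector>

vtkm::cont::ArrayHandle<vtkm::Float64> MakeOwnedHandle()
{
  std::vector<vtkm::Float64> values = { 0.0, 1.0, 2.0 };

  // With CopyFlag::Off the handle would reference `values` and dangle
  // after this function returns; CopyFlag::On copies into owned storage.
  return vtkm::cont::make_ArrayHandle(values, vtkm::CopyFlag::On);
}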
@ -19,6 +19,7 @@
##============================================================================
set(headers
Algorithm.h
ArrayCopy.h
ArrayHandle.h
ArrayHandleCast.h
@ -62,6 +63,7 @@ set(headers
DeviceAdapterListTag.h
DynamicArrayHandle.h
DynamicCellSet.h
EnvironmentTracker.h
Error.h
ErrorBadAllocation.h
ErrorBadType.h
@ -96,7 +98,9 @@ set(sources
CellSetExplicit.cxx
CellSetStructured.cxx
CoordinateSystem.cxx
DataSet.cxx
DynamicArrayHandle.cxx
EnvironmentTracker.cxx
Field.cxx
internal/SimplePolymorphicContainer.cxx
MultiBlock.cxx
@ -140,5 +144,10 @@ vtkm_library( NAME vtkm_cont
WRAP_FOR_CUDA ${device_sources}
)
target_link_libraries(vtkm_cont PUBLIC vtkm_compiler_flags ${backends})
if(VTKm_ENABLE_MPI)
# This will become a required dependency eventually.
target_link_libraries(vtkm_cont PUBLIC diy)
endif()
#-----------------------------------------------------------------------------
add_subdirectory(testing)

@ -110,18 +110,6 @@ public:
{
}
template <typename T>
VTKM_CONT CoordinateSystem(std::string name, const std::vector<T>& data)
: Superclass(name, ASSOC_POINTS, data)
{
}
template <typename T>
VTKM_CONT CoordinateSystem(std::string name, const T* data, vtkm::Id numberOfValues)
: Superclass(name, ASSOC_POINTS, data, numberOfValues)
{
}
/// This constructor of coordinate system sets up a regular grid of points.
///
VTKM_CONT
@ -225,9 +213,27 @@ public:
};
template <typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::CoordinateSystem& coords, const Functor& f, Args&&... args)
void CastAndCall(const vtkm::cont::CoordinateSystem& coords, Functor&& f, Args&&... args)
{
coords.GetData().CastAndCall(f, std::forward<Args>(args)...);
coords.GetData().CastAndCall(std::forward<Functor>(f), std::forward<Args>(args)...);
}
template <typename T>
vtkm::cont::CoordinateSystem make_CoordinateSystem(std::string name,
const std::vector<T>& data,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::CoordinateSystem(name, vtkm::cont::make_ArrayHandle(data, copy));
}
template <typename T>
vtkm::cont::CoordinateSystem make_CoordinateSystem(std::string name,
const T* data,
vtkm::Id numberOfValues,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::CoordinateSystem(name,
vtkm::cont::make_ArrayHandle(data, numberOfValues, copy));
}
namespace internal

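With the std::vector and pointer constructors removed from CoordinateSystem, host data now goes through these make_CoordinateSystem helpers, which reuse make_ArrayHandle and its CopyFlag semantics. A short sketch:

#include <vtkm/cont/CoordinateSystem.h>

#include <vector>

vtkm::cont::CoordinateSystem MakeCoords()
{
  std::vector<vtkm::Vec<vtkm::Float32, 3>> points = { { 0.f, 0.f, 0.f },
                                                      { 1.f, 0.f, 0.f },
                                                      { 0.f, 1.f, 0.f } };
  // Copy so the coordinate system outlives the local vector.
  return vtkm::cont::make_CoordinateSystem("coordinates", points, vtkm::CopyFlag::On);
}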
164
vtkm/cont/DataSet.cxx Normal file

@ -0,0 +1,164 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2015 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2015 UT-Battelle, LLC.
// Copyright 2015 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/DataSet.h>
namespace vtkm
{
namespace cont
{
DataSet::DataSet()
{
}
void DataSet::Clear()
{
this->CoordSystems.clear();
this->Fields.clear();
this->CellSets.clear();
}
const vtkm::cont::Field& DataSet::GetField(vtkm::Id index) const
{
VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfFields()));
return this->Fields[static_cast<std::size_t>(index)];
}
vtkm::Id DataSet::GetFieldIndex(const std::string& name,
vtkm::cont::Field::AssociationEnum assoc) const
{
bool found;
vtkm::Id index = this->FindFieldIndex(name, assoc, found);
if (found)
{
return index;
}
else
{
throw vtkm::cont::ErrorBadValue("No field with requested name: " + name);
}
}
const vtkm::cont::CoordinateSystem& DataSet::GetCoordinateSystem(vtkm::Id index) const
{
VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfCoordinateSystems()));
return this->CoordSystems[static_cast<std::size_t>(index)];
}
vtkm::Id DataSet::GetCoordinateSystemIndex(const std::string& name) const
{
bool found;
vtkm::Id index = this->FindCoordinateSystemIndex(name, found);
if (found)
{
return index;
}
else
{
throw vtkm::cont::ErrorBadValue("No coordinate system with requested name");
}
}
vtkm::Id DataSet::GetCellSetIndex(const std::string& name) const
{
bool found;
vtkm::Id index = this->FindCellSetIndex(name, found);
if (found)
{
return index;
}
else
{
throw vtkm::cont::ErrorBadValue("No cell set with requested name");
}
}
void DataSet::PrintSummary(std::ostream& out) const
{
out << "DataSet:\n";
out << " CoordSystems[" << this->CoordSystems.size() << "]\n";
for (std::size_t index = 0; index < this->CoordSystems.size(); index++)
{
this->CoordSystems[index].PrintSummary(out);
}
out << " CellSets[" << this->GetNumberOfCellSets() << "]\n";
for (vtkm::Id index = 0; index < this->GetNumberOfCellSets(); index++)
{
this->GetCellSet(index).PrintSummary(out);
}
out << " Fields[" << this->GetNumberOfFields() << "]\n";
for (vtkm::Id index = 0; index < this->GetNumberOfFields(); index++)
{
this->GetField(index).PrintSummary(out);
}
}
vtkm::Id DataSet::FindFieldIndex(const std::string& name,
vtkm::cont::Field::AssociationEnum association,
bool& found) const
{
for (std::size_t index = 0; index < this->Fields.size(); ++index)
{
if ((association == vtkm::cont::Field::ASSOC_ANY ||
association == this->Fields[index].GetAssociation()) &&
this->Fields[index].GetName() == name)
{
found = true;
return static_cast<vtkm::Id>(index);
}
}
found = false;
return -1;
}
vtkm::Id DataSet::FindCoordinateSystemIndex(const std::string& name, bool& found) const
{
for (std::size_t index = 0; index < this->CoordSystems.size(); ++index)
{
if (this->CoordSystems[index].GetName() == name)
{
found = true;
return static_cast<vtkm::Id>(index);
}
}
found = false;
return -1;
}
vtkm::Id DataSet::FindCellSetIndex(const std::string& name, bool& found) const
{
for (std::size_t index = 0; index < static_cast<size_t>(this->GetNumberOfCellSets()); ++index)
{
if (this->CellSets[index].GetName() == name)
{
found = true;
return static_cast<vtkm::Id>(index);
}
}
found = false;
return -1;
}
} // namespace cont
} // namespace vtkm
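Note that all three index lookups throw ErrorBadValue on a missing name, so callers that merely probe for a field should test first. A sketch (FindPointField is a hypothetical helper, not part of the API, and assumes HasField accepts an association argument as its declaration above suggests):

#include <vtkm/cont/DataSet.h>

// Hypothetical helper: returns -1 instead of throwing for unknown names.
vtkm::Id FindPointField(const vtkm::cont::DataSet& dataSet, const std::string& name)
{
  if (!dataSet.HasField(name, vtkm::cont::Field::ASSOC_POINTS))
  {
    return -1;
  }
  return dataSet.GetFieldIndex(name, vtkm::cont::Field::ASSOC_POINTS);
}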

@ -20,6 +20,8 @@
#ifndef vtk_m_cont_DataSet_h
#define vtk_m_cont_DataSet_h
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/CoordinateSystem.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
@ -33,29 +35,17 @@ namespace vtkm
namespace cont
{
class DataSet
class VTKM_CONT_EXPORT DataSet
{
public:
VTKM_CONT
DataSet() {}
VTKM_CONT DataSet();
VTKM_CONT void Clear();
VTKM_CONT void AddField(const Field& field) { this->Fields.push_back(field); }
VTKM_CONT
void Clear()
{
this->CoordSystems.clear();
this->Fields.clear();
this->CellSets.clear();
}
VTKM_CONT
void AddField(Field field) { this->Fields.push_back(field); }
VTKM_CONT
const vtkm::cont::Field& GetField(vtkm::Id index) const
{
VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfFields()));
return this->Fields[static_cast<std::size_t>(index)];
}
const vtkm::cont::Field& GetField(vtkm::Id index) const;
VTKM_CONT
bool HasField(const std::string& name,
@ -69,19 +59,7 @@ public:
VTKM_CONT
vtkm::Id GetFieldIndex(
const std::string& name,
vtkm::cont::Field::AssociationEnum assoc = vtkm::cont::Field::ASSOC_ANY) const
{
bool found;
vtkm::Id index = this->FindFieldIndex(name, assoc, found);
if (found)
{
return index;
}
else
{
throw vtkm::cont::ErrorBadValue("No field with requested name: " + name);
}
}
vtkm::cont::Field::AssociationEnum assoc = vtkm::cont::Field::ASSOC_ANY) const;
VTKM_CONT
const vtkm::cont::Field& GetField(
@ -104,14 +82,13 @@ public:
}
VTKM_CONT
void AddCoordinateSystem(vtkm::cont::CoordinateSystem cs) { this->CoordSystems.push_back(cs); }
void AddCoordinateSystem(const vtkm::cont::CoordinateSystem& cs)
{
this->CoordSystems.push_back(cs);
}
VTKM_CONT
const vtkm::cont::CoordinateSystem& GetCoordinateSystem(vtkm::Id index = 0) const
{
VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfCoordinateSystems()));
return this->CoordSystems[static_cast<std::size_t>(index)];
}
const vtkm::cont::CoordinateSystem& GetCoordinateSystem(vtkm::Id index = 0) const;
VTKM_CONT
bool HasCoordinateSystem(const std::string& name) const
@ -122,19 +99,7 @@ public:
}
VTKM_CONT
vtkm::Id GetCoordinateSystemIndex(const std::string& name) const
{
bool found;
vtkm::Id index = this->FindCoordinateSystemIndex(name, found);
if (found)
{
return index;
}
else
{
throw vtkm::cont::ErrorBadValue("No coordinate system with requested name");
}
}
vtkm::Id GetCoordinateSystemIndex(const std::string& name) const;
VTKM_CONT
const vtkm::cont::CoordinateSystem& GetCoordinateSystem(const std::string& name) const
@ -143,7 +108,7 @@ public:
}
VTKM_CONT
void AddCellSet(vtkm::cont::DynamicCellSet cellSet) { this->CellSets.push_back(cellSet); }
void AddCellSet(const vtkm::cont::DynamicCellSet& cellSet) { this->CellSets.push_back(cellSet); }
template <typename CellSetType>
VTKM_CONT void AddCellSet(const CellSetType& cellSet)
@ -168,19 +133,7 @@ public:
}
VTKM_CONT
vtkm::Id GetCellSetIndex(const std::string& name) const
{
bool found;
vtkm::Id index = this->FindCellSetIndex(name, found);
if (found)
{
return index;
}
else
{
throw vtkm::cont::ErrorBadValue("No cell set with requested name");
}
}
vtkm::Id GetCellSetIndex(const std::string& name) const;
VTKM_CONT
vtkm::cont::DynamicCellSet GetCellSet(const std::string& name) const
@ -207,27 +160,7 @@ public:
}
VTKM_CONT
void PrintSummary(std::ostream& out) const
{
out << "DataSet:\n";
out << " CoordSystems[" << this->CoordSystems.size() << "]\n";
for (std::size_t index = 0; index < this->CoordSystems.size(); index++)
{
this->CoordSystems[index].PrintSummary(out);
}
out << " CellSets[" << this->GetNumberOfCellSets() << "]\n";
for (vtkm::Id index = 0; index < this->GetNumberOfCellSets(); index++)
{
this->GetCellSet(index).PrintSummary(out);
}
out << " Fields[" << this->GetNumberOfFields() << "]\n";
for (vtkm::Id index = 0; index < this->GetNumberOfFields(); index++)
{
this->GetField(index).PrintSummary(out);
}
}
void PrintSummary(std::ostream& out) const;
private:
std::vector<vtkm::cont::CoordinateSystem> CoordSystems;
@ -237,51 +170,13 @@ private:
VTKM_CONT
vtkm::Id FindFieldIndex(const std::string& name,
vtkm::cont::Field::AssociationEnum association,
bool& found) const
{
for (std::size_t index = 0; index < this->Fields.size(); ++index)
{
if ((association == vtkm::cont::Field::ASSOC_ANY ||
association == this->Fields[index].GetAssociation()) &&
this->Fields[index].GetName() == name)
{
found = true;
return static_cast<vtkm::Id>(index);
}
}
found = false;
return -1;
}
bool& found) const;
VTKM_CONT
vtkm::Id FindCoordinateSystemIndex(const std::string& name, bool& found) const
{
for (std::size_t index = 0; index < this->CoordSystems.size(); ++index)
{
if (this->CoordSystems[index].GetName() == name)
{
found = true;
return static_cast<vtkm::Id>(index);
}
}
found = false;
return -1;
}
vtkm::Id FindCoordinateSystemIndex(const std::string& name, bool& found) const;
VTKM_CONT
vtkm::Id FindCellSetIndex(const std::string& name, bool& found) const
{
for (std::size_t index = 0; index < static_cast<size_t>(this->GetNumberOfCellSets()); ++index)
{
if (this->CellSets[index].GetName() == name)
{
found = true;
return static_cast<vtkm::Id>(index);
}
}
found = false;
return -1;
}
vtkm::Id FindCellSetIndex(const std::string& name, bool& found) const;
};
} // namespace cont

@ -57,7 +57,8 @@ public:
const std::string& fieldName,
const std::vector<T>& field)
{
dataSet.AddField(Field(fieldName, vtkm::cont::Field::ASSOC_POINTS, field));
dataSet.AddField(
make_Field(fieldName, vtkm::cont::Field::ASSOC_POINTS, field, vtkm::CopyFlag::On));
}
template <typename T>
@ -66,7 +67,8 @@ public:
const T* field,
const vtkm::Id& n)
{
dataSet.AddField(Field(fieldName, vtkm::cont::Field::ASSOC_POINTS, field, n));
dataSet.AddField(
make_Field(fieldName, vtkm::cont::Field::ASSOC_POINTS, field, n, vtkm::CopyFlag::On));
}
//Cell centered field
@ -94,7 +96,8 @@ public:
const std::vector<T>& field,
const std::string& cellSetName)
{
dataSet.AddField(Field(fieldName, vtkm::cont::Field::ASSOC_CELL_SET, cellSetName, field));
dataSet.AddField(make_Field(
fieldName, vtkm::cont::Field::ASSOC_CELL_SET, cellSetName, field, vtkm::CopyFlag::On));
}
template <typename T>
@ -104,7 +107,8 @@ public:
const vtkm::Id& n,
const std::string& cellSetName)
{
dataSet.AddField(Field(fieldName, vtkm::cont::Field::ASSOC_CELL_SET, cellSetName, field, n));
dataSet.AddField(make_Field(
fieldName, vtkm::cont::Field::ASSOC_CELL_SET, cellSetName, field, n, vtkm::CopyFlag::On));
}
VTKM_CONT

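Because these helpers now build the field with vtkm::CopyFlag::On, the caller's buffer can be modified or freed as soon as the call returns. A usage sketch (assuming `nVerts` matches the dataset's point count):

#include <vtkm/cont/DataSetFieldAdd.h>

#include <vector>

void AddTemperature(vtkm::cont::DataSet& dataSet, vtkm::Id nVerts)
{
  std::vector<vtkm::Float32> temperature(static_cast<std::size_t>(nVerts), 21.5f);

  vtkm::cont::DataSetFieldAdd dsf;
  dsf.AddPointField(dataSet, "temperature", temperature);
  // Safe: the field copied the data, so `temperature` may now go away.
}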
@ -223,14 +223,14 @@ struct DeviceAdapterAlgorithm
VTKM_CONT static T ScanInclusive(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output);
/// \brief Streaming version of scan inclusive
/// \brief Streaming version of scan exclusive
///
/// Computes a scan one block at a time.
///
/// \return The total sum.
///
template <typename T, class CIn, class COut>
VTKM_CONT static T StreamingScanInclusive(const vtkm::Id numBlocks,
VTKM_CONT static T StreamingScanExclusive(const vtkm::Id numBlocks,
const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output);
@ -282,18 +282,6 @@ struct DeviceAdapterAlgorithm
const vtkm::cont::ArrayHandle<U, VIn>& values,
vtkm::cont::ArrayHandle<U, VOut>& values_output);
/// \brief Streaming version of scan inclusive
///
/// Computes a scan one block at a time.
///
/// \return The total sum.
///
template <typename T, class CIn, class COut, class BinaryFunctor>
VTKM_CONT static T StreamingScanInclusive(const vtkm::Id numBlocks,
const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<T, COut>& output,
BinaryFunctor binary_functor);
/// \brief Compute an exclusive prefix sum operation on the input ArrayHandle.
///
/// Computes an exclusive prefix sum operation on the \c input ArrayHandle,

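The corrected docs are easier to follow with a concrete pair: for the same input, an inclusive scan includes each element in its own partial sum, while an exclusive scan shifts the result by one. A sketch using the device-agnostic wrappers from the Algorithm header above:

#include <vtkm/cont/Algorithm.h>
#include <vtkm/cont/ArrayHandle.h>

void ScanExample()
{
  vtkm::Id raw[] = { 1, 2, 3, 4 };
  auto input = vtkm::cont::make_ArrayHandle(raw, 4, vtkm::CopyFlag::On);
  vtkm::cont::ArrayHandle<vtkm::Id> output;

  // Inclusive: output is { 1, 3, 6, 10 }; the return value is 10.
  vtkm::Id totalIncl = vtkm::cont::Algorithm::ScanInclusive(input, output);
  // Exclusive: output is { 0, 1, 3, 6 }; the return value is still 10.
  vtkm::Id totalExcl = vtkm::cont::Algorithm::ScanExclusive(input, output);
  (void)totalIncl;
  (void)totalExcl;
}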
@ -358,7 +358,7 @@ public:
/// respectively.
///
template <typename Functor, typename... Args>
VTKM_CONT void CastAndCall(const Functor& f, Args&&...) const;
VTKM_CONT void CastAndCall(Functor&& f, Args&&...) const;
/// \brief Create a new array of the same type as this array.
///
@ -414,15 +414,15 @@ struct DynamicArrayHandleTry
}
template <typename T, typename U, typename... Args>
void operator()(std::pair<T, U>&& p, Args&&... args) const
void operator()(brigand::list<T, U>, Args&&... args) const
{
using storage = vtkm::cont::internal::Storage<T, U>;
using invalid = typename std::is_base_of<vtkm::cont::internal::UndefinedStorage, storage>::type;
this->run(std::forward<decltype(p)>(p), invalid{}, args...);
this->run<T, U>(invalid{}, args...);
}
template <typename T, typename U, typename Functor, typename... Args>
void run(std::pair<T, U>&&, std::false_type, Functor&& f, bool& called, Args&&... args) const
void run(std::false_type, Functor&& f, bool& called, Args&&... args) const
{
if (!called)
{
@ -437,7 +437,7 @@ struct DynamicArrayHandleTry
}
template <typename T, typename U, typename... Args>
void run(std::pair<T, U>&&, std::true_type, Args&&...) const
void run(std::true_type, Args&&...) const
{
}
@ -451,7 +451,7 @@ VTKM_CONT_EXPORT void ThrowCastAndCallException(PolymorphicArrayHandleContainerB
template <typename TypeList, typename StorageList>
template <typename Functor, typename... Args>
VTKM_CONT void DynamicArrayHandleBase<TypeList, StorageList>::CastAndCall(const Functor& f,
VTKM_CONT void DynamicArrayHandleBase<TypeList, StorageList>::CastAndCall(Functor&& f,
Args&&... args) const
{
//For optimizations we should compile once the cross product for the default types
@ -460,8 +460,11 @@ VTKM_CONT void DynamicArrayHandleBase<TypeList, StorageList>::CastAndCall(const
bool called = false;
auto* ptr = this->ArrayContainer.get();
vtkm::ListForEach(
detail::DynamicArrayHandleTry(ptr), crossProduct{}, f, called, std::forward<Args>(args)...);
vtkm::ListForEach(detail::DynamicArrayHandleTry(ptr),
crossProduct{},
std::forward<Functor>(f),
called,
std::forward<Args>(args)...);
if (!called)
{
// throw an exception

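Since CastAndCall now takes Functor&&, a temporary functor can be passed straight through ListForEach without a const-reference copy. A minimal caller sketch (assuming the default type and storage lists cover the stored array):

#include <vtkm/cont/DynamicArrayHandle.h>

#include <iostream>

struct PrintSize
{
  template <typename ArrayHandleType>
  void operator()(const ArrayHandleType& array) const
  {
    std::cout << array.GetNumberOfValues() << std::endl;
  }
};

void Inspect(const vtkm::cont::DynamicArrayHandle& dynamic)
{
  // The rvalue functor is forwarded to the matching concrete ArrayHandle.
  dynamic.CastAndCall(PrintSize{});
}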
@ -228,7 +228,7 @@ public:
/// behavior from \c CastAndCall.
///
template <typename Functor, typename... Args>
VTKM_CONT void CastAndCall(const Functor& f, Args&&...) const;
VTKM_CONT void CastAndCall(Functor&& f, Args&&...) const;
/// \brief Create a new cell set of the same type as this cell set.
///
@ -302,11 +302,12 @@ struct DynamicCellSetTry
template <typename CellSetList>
template <typename Functor, typename... Args>
VTKM_CONT void DynamicCellSetBase<CellSetList>::CastAndCall(const Functor& f, Args&&... args) const
VTKM_CONT void DynamicCellSetBase<CellSetList>::CastAndCall(Functor&& f, Args&&... args) const
{
bool called = false;
detail::DynamicCellSetTry tryCellSet(this->CellSetContainer.get());
vtkm::ListForEach(tryCellSet, CellSetList{}, f, called, std::forward<Args>(args)...);
vtkm::ListForEach(
tryCellSet, CellSetList{}, std::forward<Functor>(f), called, std::forward<Args>(args)...);
if (!called)
{
throw vtkm::cont::ErrorBadValue("Could not find appropriate cast for cell set.");

@ -0,0 +1,67 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/EnvironmentTracker.h>
#if defined(VTKM_ENABLE_MPI)
#include <diy/mpi.hpp>
#else
namespace diy
{
namespace mpi
{
class communicator
{
};
}
}
#endif
namespace vtkm
{
namespace cont
{
#if defined(VTKM_ENABLE_MPI)
namespace internal
{
static diy::mpi::communicator GlobalCommunicator(MPI_COMM_NULL);
}
void EnvironmentTracker::SetCommunicator(const diy::mpi::communicator& comm)
{
vtkm::cont::internal::GlobalCommunicator = comm;
}
const diy::mpi::communicator& EnvironmentTracker::GetCommunicator()
{
return vtkm::cont::internal::GlobalCommunicator;
}
#else
void EnvironmentTracker::SetCommunicator(const diy::mpi::communicator&)
{
}
const diy::mpi::communicator& EnvironmentTracker::GetCommunicator()
{
static diy::mpi::communicator tmp;
return tmp;
}
#endif
} // namespace cont
} // namespace vtkm
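A typical MPI-enabled application hands its communicator to VTK-m once at startup; everything downstream (such as the MultiBlock reductions) reads it back through GetCommunicator(). A minimal sketch, assuming a build with VTKm_ENABLE_MPI and that diy::mpi::communicator wraps a raw MPI_Comm as usual:

#include <mpi.h>

#include <diy/mpi.hpp>
#include <vtkm/cont/EnvironmentTracker.h>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  {
    // Wrap the raw MPI communicator and publish it for VTK-m.
    diy::mpi::communicator world(MPI_COMM_WORLD);
    vtkm::cont::EnvironmentTracker::SetCommunicator(world);

    // ... build data sets, run distributed algorithms ...
  }
  MPI_Finalize();
  return 0;
}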

@ -0,0 +1,53 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_EnvironmentTracker_h
#define vtk_m_cont_EnvironmentTracker_h
#include <vtkm/Types.h>
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/internal/Configure.h>
#include <vtkm/internal/ExportMacros.h>
namespace diy
{
namespace mpi
{
class communicator;
}
}
namespace vtkm
{
namespace cont
{
class VTKM_CONT_EXPORT EnvironmentTracker
{
public:
VTKM_CONT
static void SetCommunicator(const diy::mpi::communicator& comm);
VTKM_CONT
static const diy::mpi::communicator& GetCommunicator();
};
}
}
#endif // vtk_m_cont_EnvironmentTracker_h

@ -102,32 +102,6 @@ public:
VTKM_ASSERT((this->Association == ASSOC_WHOLE_MESH) || (this->Association == ASSOC_POINTS));
}
template <typename T>
VTKM_CONT Field(std::string name, AssociationEnum association, const std::vector<T>& data)
: Name(name)
, Association(association)
, AssocCellSetName()
, AssocLogicalDim(-1)
, Range()
, ModifiedFlag(true)
{
VTKM_ASSERT((this->Association == ASSOC_WHOLE_MESH) || (this->Association == ASSOC_POINTS));
this->CopyData(&data[0], static_cast<vtkm::Id>(data.size()));
}
template <typename T>
VTKM_CONT Field(std::string name, AssociationEnum association, const T* data, vtkm::Id nvals)
: Name(name)
, Association(association)
, AssocCellSetName()
, AssocLogicalDim(-1)
, Range()
, ModifiedFlag(true)
{
VTKM_ASSERT((this->Association == ASSOC_WHOLE_MESH) || (this->Association == ASSOC_POINTS));
this->CopyData(data, nvals);
}
/// constructors for cell set associations
VTKM_CONT
Field(std::string name,
@ -161,39 +135,6 @@ public:
VTKM_ASSERT(this->Association == ASSOC_CELL_SET);
}
template <typename T>
VTKM_CONT Field(std::string name,
AssociationEnum association,
const std::string& cellSetName,
const std::vector<T>& data)
: Name(name)
, Association(association)
, AssocCellSetName(cellSetName)
, AssocLogicalDim(-1)
, Range()
, ModifiedFlag(true)
{
VTKM_ASSERT(this->Association == ASSOC_CELL_SET);
this->CopyData(&data[0], static_cast<vtkm::Id>(data.size()));
}
template <typename T>
VTKM_CONT Field(std::string name,
AssociationEnum association,
const std::string& cellSetName,
const T* data,
vtkm::Id nvals)
: Name(name)
, Association(association)
, AssocCellSetName(cellSetName)
, AssocLogicalDim(-1)
, Range()
, ModifiedFlag(true)
{
VTKM_ASSERT(this->Association == ASSOC_CELL_SET);
this->CopyData(data, nvals);
}
/// constructors for logical dimension associations
VTKM_CONT
Field(std::string name,
@ -226,37 +167,6 @@ public:
VTKM_ASSERT(this->Association == ASSOC_LOGICAL_DIM);
}
template <typename T>
VTKM_CONT Field(std::string name,
AssociationEnum association,
vtkm::IdComponent logicalDim,
const std::vector<T>& data)
: Name(name)
, Association(association)
, AssocLogicalDim(logicalDim)
, Range()
, ModifiedFlag(true)
{
VTKM_ASSERT(this->Association == ASSOC_LOGICAL_DIM);
this->CopyData(&data[0], static_cast<vtkm::Id>(data.size()));
}
template <typename T>
VTKM_CONT Field(std::string name,
AssociationEnum association,
vtkm::IdComponent logicalDim,
const T* data,
vtkm::Id nvals)
: Name(name)
, Association(association)
, AssocLogicalDim(logicalDim)
, Range()
, ModifiedFlag(true)
{
VTKM_ASSERT(this->Association == ASSOC_LOGICAL_DIM);
CopyData(data, nvals);
}
VTKM_CONT
Field()
: Name()
@ -356,17 +266,7 @@ public:
template <typename T>
VTKM_CONT void CopyData(const T* ptr, vtkm::Id nvals)
{
//allocate main memory using an array handle
vtkm::cont::ArrayHandle<T> tmp;
tmp.Allocate(nvals);
//copy into the memory owned by the array handle
std::copy(ptr,
ptr + static_cast<std::size_t>(nvals),
vtkm::cont::ArrayPortalToIteratorBegin(tmp.GetPortalControl()));
//assign to the dynamic array handle
this->Data = tmp;
this->Data = vtkm::cont::make_ArrayHandle(ptr, nvals, vtkm::CopyFlag::On);
this->ModifiedFlag = true;
}
@ -402,11 +302,78 @@ private:
};
template <typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::Field& field, const Functor& f, Args&&... args)
void CastAndCall(const vtkm::cont::Field& field, Functor&& f, Args&&... args)
{
field.GetData().CastAndCall(f, std::forward<Args>(args)...);
field.GetData().CastAndCall(std::forward<Functor>(f), std::forward<Args>(args)...);
}
//@{
/// Convenience functions to build fields from C-style arrays and std::vector.
template <typename T>
vtkm::cont::Field make_Field(std::string name,
Field::AssociationEnum association,
const T* data,
vtkm::Id size,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::Field(name, association, vtkm::cont::make_ArrayHandle(data, size, copy));
}
template <typename T>
vtkm::cont::Field make_Field(std::string name,
Field::AssociationEnum association,
const std::vector<T>& data,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::Field(name, association, vtkm::cont::make_ArrayHandle(data, copy));
}
template <typename T>
vtkm::cont::Field make_Field(std::string name,
Field::AssociationEnum association,
const std::string& cellSetName,
const T* data,
vtkm::Id size,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::Field(
name, association, cellSetName, vtkm::cont::make_ArrayHandle(data, size, copy));
}
template <typename T>
vtkm::cont::Field make_Field(std::string name,
Field::AssociationEnum association,
const std::string& cellSetName,
const std::vector<T>& data,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::Field(
name, association, cellSetName, vtkm::cont::make_ArrayHandle(data, copy));
}
template <typename T>
vtkm::cont::Field make_Field(std::string name,
Field::AssociationEnum association,
vtkm::IdComponent logicalDim,
const T* data,
vtkm::Id size,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::Field(
name, association, logicalDim, vtkm::cont::make_ArrayHandle(data, size, copy));
}
template <typename T>
vtkm::cont::Field make_Field(std::string name,
Field::AssociationEnum association,
vtkm::IdComponent logicalDim,
const std::vector<T>& data,
vtkm::CopyFlag copy = vtkm::CopyFlag::Off)
{
return vtkm::cont::Field(name, association, logicalDim, vtkm::cont::make_ArrayHandle(data, copy));
}
//@}
namespace internal
{

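These helpers replace the six templated Field constructors deleted above; each pairs an association overload with make_ArrayHandle and the CopyFlag. A short sketch covering the point and cell-set variants:

#include <vtkm/cont/Field.h>

#include <vector>

void BuildFields()
{
  std::vector<vtkm::Float32> pointData = { 1.f, 2.f, 3.f };
  vtkm::Float32 cellData[] = { 10.f, 20.f };

  auto pointField = vtkm::cont::make_Field(
    "pointvar", vtkm::cont::Field::ASSOC_POINTS, pointData, vtkm::CopyFlag::On);
  auto cellField = vtkm::cont::make_Field(
    "cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellData, 2, vtkm::CopyFlag::On);
  (void)pointField;
  (void)cellField;
}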
@ -19,13 +19,142 @@
//============================================================================
#include <vtkm/StaticAssert.h>
#include <vtkm/cont/ArrayCopy.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/DynamicArrayHandle.h>
#include <vtkm/cont/EnvironmentTracker.h>
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/Field.h>
#include <vtkm/cont/MultiBlock.h>
#if defined(VTKM_ENABLE_MPI)
#include <diy/master.hpp>
namespace vtkm
{
namespace cont
{
namespace detail
{
template <typename PortalType>
VTKM_CONT std::vector<typename PortalType::ValueType> CopyArrayPortalToVector(
const PortalType& portal)
{
using ValueType = typename PortalType::ValueType;
std::vector<ValueType> result(portal.GetNumberOfValues());
vtkm::cont::ArrayPortalToIterators<PortalType> iterators(portal);
std::copy(iterators.GetBegin(), iterators.GetEnd(), result.begin());
return result;
}
}
}
}
namespace std
{
namespace detail
{
template <typename T, size_t ElementSize = sizeof(T)>
struct MPIPlus
{
MPIPlus()
{
this->OpPtr = std::shared_ptr<MPI_Op>(new MPI_Op(MPI_NO_OP), [](MPI_Op* ptr) {
MPI_Op_free(ptr);
delete ptr;
});
MPI_Op_create(
[](void* a, void* b, int* len, MPI_Datatype*) {
T* ba = reinterpret_cast<T*>(a);
T* bb = reinterpret_cast<T*>(b);
for (int cc = 0; cc < (*len) / ElementSize; ++cc)
{
bb[cc] = ba[cc] + bb[cc];
}
},
1,
this->OpPtr.get());
}
~MPIPlus() {}
operator MPI_Op() const { return *this->OpPtr.get(); }
private:
std::shared_ptr<MPI_Op> OpPtr;
};
} // std::detail
template <>
struct plus<vtkm::Bounds>
{
MPI_Op get_mpi_op() const { return this->Op; }
vtkm::Bounds operator()(const vtkm::Bounds& lhs, const vtkm::Bounds& rhs) const
{
return lhs + rhs;
}
private:
std::detail::MPIPlus<vtkm::Bounds> Op;
};
template <>
struct plus<vtkm::Range>
{
MPI_Op get_mpi_op() const { return this->Op; }
vtkm::Range operator()(const vtkm::Range& lhs, const vtkm::Range& rhs) const { return lhs + rhs; }
private:
std::detail::MPIPlus<vtkm::Range> Op;
};
}
namespace diy
{
namespace mpi
{
namespace detail
{
template <>
struct mpi_datatype<vtkm::Bounds>
{
static MPI_Datatype datatype() { return get_mpi_datatype<vtkm::Float64>(); }
static const void* address(const vtkm::Bounds& x) { return &x; }
static void* address(vtkm::Bounds& x) { return &x; }
static int count(const vtkm::Bounds&) { return 6; }
};
template <>
struct mpi_op<std::plus<vtkm::Bounds>>
{
static MPI_Op get(const std::plus<vtkm::Bounds>& op) { return op.get_mpi_op(); }
};
template <>
struct mpi_datatype<vtkm::Range>
{
static MPI_Datatype datatype() { return get_mpi_datatype<vtkm::Float64>(); }
static const void* address(const vtkm::Range& x) { return &x; }
static void* address(vtkm::Range& x) { return &x; }
static int count(const vtkm::Range&) { return 2; }
};
template <>
struct mpi_op<std::plus<vtkm::Range>>
{
static MPI_Op get(const std::plus<vtkm::Range>& op) { return op.get_mpi_op(); }
};
} // diy::mpi::detail
} // diy::mpi
} // diy
#endif
namespace vtkm
{
namespace cont
@ -34,25 +163,25 @@ namespace cont
VTKM_CONT
MultiBlock::MultiBlock(const vtkm::cont::DataSet& ds)
{
this->blocks.insert(blocks.end(), ds);
this->Blocks.insert(this->Blocks.end(), ds);
}
VTKM_CONT
MultiBlock::MultiBlock(const vtkm::cont::MultiBlock& src)
{
this->blocks = src.GetBlocks();
this->Blocks = src.GetBlocks();
}
VTKM_CONT
MultiBlock::MultiBlock(const std::vector<vtkm::cont::DataSet>& mblocks)
{
this->blocks = mblocks;
this->Blocks = mblocks;
}
VTKM_CONT
MultiBlock::MultiBlock(vtkm::Id size)
{
this->blocks.reserve(static_cast<std::size_t>(size));
this->Blocks.reserve(static_cast<std::size_t>(size));
}
VTKM_CONT
@ -68,7 +197,7 @@ MultiBlock::~MultiBlock()
VTKM_CONT
MultiBlock& MultiBlock::operator=(const vtkm::cont::MultiBlock& src)
{
this->blocks = src.GetBlocks();
this->Blocks = src.GetBlocks();
return *this;
}
@ -76,46 +205,68 @@ VTKM_CONT
vtkm::cont::Field MultiBlock::GetField(const std::string& field_name, const int& block_index)
{
assert(block_index >= 0);
assert(static_cast<std::size_t>(block_index) < blocks.size());
return blocks[static_cast<std::size_t>(block_index)].GetField(field_name);
assert(static_cast<std::size_t>(block_index) < this->Blocks.size());
return this->Blocks[static_cast<std::size_t>(block_index)].GetField(field_name);
}
VTKM_CONT
vtkm::Id MultiBlock::GetNumberOfBlocks() const
{
return static_cast<vtkm::Id>(this->blocks.size());
return static_cast<vtkm::Id>(this->Blocks.size());
}
VTKM_CONT
vtkm::Id MultiBlock::GetGlobalNumberOfBlocks() const
{
#if defined(VTKM_ENABLE_MPI)
auto world = vtkm::cont::EnvironmentTracker::GetCommunicator();
const auto local_count = this->GetNumberOfBlocks();
diy::Master master(world, 1, -1);
int block_not_used = 1;
master.add(world.rank(), &block_not_used, new diy::Link());
// Empty link since we are only using collectives.
master.foreach ([=](void*, const diy::Master::ProxyWithLink& cp) {
cp.all_reduce(local_count, std::plus<vtkm::Id>());
});
master.process_collectives();
vtkm::Id global_count = master.proxy(0).get<vtkm::Id>();
return global_count;
#else
return this->GetNumberOfBlocks();
#endif
}
VTKM_CONT
const vtkm::cont::DataSet& MultiBlock::GetBlock(vtkm::Id blockId) const
{
return this->blocks[static_cast<std::size_t>(blockId)];
return this->Blocks[static_cast<std::size_t>(blockId)];
}
VTKM_CONT
const std::vector<vtkm::cont::DataSet>& MultiBlock::GetBlocks() const
{
return this->blocks;
return this->Blocks;
}
VTKM_CONT
void MultiBlock::AddBlock(vtkm::cont::DataSet& ds)
{
this->blocks.insert(blocks.end(), ds);
this->Blocks.insert(this->Blocks.end(), ds);
return;
}
void MultiBlock::AddBlocks(std::vector<vtkm::cont::DataSet>& mblocks)
{
this->blocks.insert(blocks.end(), mblocks.begin(), mblocks.end());
this->Blocks.insert(this->Blocks.end(), mblocks.begin(), mblocks.end());
return;
}
VTKM_CONT
void MultiBlock::InsertBlock(vtkm::Id index, vtkm::cont::DataSet& ds)
{
if (index <= static_cast<vtkm::Id>(blocks.size()))
this->blocks.insert(blocks.begin() + index, ds);
if (index <= static_cast<vtkm::Id>(this->Blocks.size()))
this->Blocks.insert(this->Blocks.begin() + index, ds);
else
{
std::string msg = "invalid insert position\n ";
@ -126,8 +277,8 @@ void MultiBlock::InsertBlock(vtkm::Id index, vtkm::cont::DataSet& ds)
VTKM_CONT
void MultiBlock::ReplaceBlock(vtkm::Id index, vtkm::cont::DataSet& ds)
{
if (index < static_cast<vtkm::Id>(blocks.size()))
this->blocks.at(static_cast<std::size_t>(index)) = ds;
if (index < static_cast<vtkm::Id>(this->Blocks.size()))
this->Blocks.at(static_cast<std::size_t>(index)) = ds;
else
{
std::string msg = "invalid replace position\n ";
@ -158,8 +309,32 @@ VTKM_CONT vtkm::Bounds MultiBlock::GetBounds(vtkm::Id coordinate_system_index,
VTKM_IS_LIST_TAG(TypeList);
VTKM_IS_LIST_TAG(StorageList);
#if defined(VTKM_ENABLE_MPI)
auto world = vtkm::cont::EnvironmentTracker::GetCommunicator();
//const auto global_num_blocks = this->GetGlobalNumberOfBlocks();
const auto num_blocks = this->GetNumberOfBlocks();
diy::Master master(world, 1, -1);
for (vtkm::Id cc = 0; cc < num_blocks; ++cc)
{
int gid = cc * world.size() + world.rank();
master.add(gid, const_cast<vtkm::cont::DataSet*>(&this->Blocks[cc]), new diy::Link());
}
master.foreach ([&](const vtkm::cont::DataSet* block, const diy::Master::ProxyWithLink& cp) {
auto coords = block->GetCoordinateSystem(coordinate_system_index);
const vtkm::Bounds bounds = coords.GetBounds(TypeList(), StorageList());
cp.all_reduce(bounds, std::plus<vtkm::Bounds>());
});
master.process_collectives();
auto bounds = master.proxy(0).get<vtkm::Bounds>();
return bounds;
#else
const vtkm::Id index = coordinate_system_index;
const size_t num_blocks = blocks.size();
const size_t num_blocks = this->Blocks.size();
vtkm::Bounds bounds;
for (size_t i = 0; i < num_blocks; ++i)
@ -167,8 +342,8 @@ VTKM_CONT vtkm::Bounds MultiBlock::GetBounds(vtkm::Id coordinate_system_index,
vtkm::Bounds block_bounds = this->GetBlockBounds(i, index, TypeList(), StorageList());
bounds.Include(block_bounds);
}
return bounds;
#endif
}
VTKM_CONT
@ -206,7 +381,7 @@ VTKM_CONT vtkm::Bounds MultiBlock::GetBlockBounds(const std::size_t& block_index
vtkm::cont::CoordinateSystem coords;
try
{
coords = blocks[block_index].GetCoordinateSystem(index);
coords = this->Blocks[block_index].GetCoordinateSystem(index);
}
catch (const vtkm::cont::Error& error)
{
@ -241,8 +416,8 @@ VTKM_CONT vtkm::cont::ArrayHandle<vtkm::Range> MultiBlock::GetGlobalRange(const
VTKM_IS_LIST_TAG(TypeList);
VTKM_IS_LIST_TAG(StorageList);
assert(blocks.size() > 0);
vtkm::cont::Field field = blocks.at(0).GetField(index);
assert(this->Blocks.size() > 0);
vtkm::cont::Field field = this->Blocks.at(0).GetField(index);
std::string field_name = field.GetName();
return this->GetGlobalRange(field_name, TypeList(), StorageList());
}
@ -267,21 +442,86 @@ template <typename TypeList, typename StorageList>
VTKM_CONT vtkm::cont::ArrayHandle<vtkm::Range>
MultiBlock::GetGlobalRange(const std::string& field_name, TypeList, StorageList) const
{
#if defined(VTKM_ENABLE_MPI)
auto world = vtkm::cont::EnvironmentTracker::GetCommunicator();
const auto num_blocks = this->GetNumberOfBlocks();
diy::Master master(world);
for (vtkm::Id cc = 0; cc < num_blocks; ++cc)
{
int gid = cc * world.size() + world.rank();
master.add(gid, const_cast<vtkm::cont::DataSet*>(&this->Blocks[cc]), new diy::Link());
}
// collect info about number of components in the field.
master.foreach ([&](const vtkm::cont::DataSet* dataset, const diy::Master::ProxyWithLink& cp) {
if (dataset->HasField(field_name))
{
auto field = dataset->GetField(field_name);
const vtkm::cont::ArrayHandle<vtkm::Range> range = field.GetRange(TypeList(), StorageList());
vtkm::Id components = range.GetPortalConstControl().GetNumberOfValues();
cp.all_reduce(components, diy::mpi::maximum<vtkm::Id>());
}
});
master.process_collectives();
const vtkm::Id components = master.size() ? master.proxy(0).read<vtkm::Id>() : 0;
// clear all collectives.
master.foreach ([&](const vtkm::cont::DataSet*, const diy::Master::ProxyWithLink& cp) {
cp.collectives()->clear();
});
master.foreach ([&](const vtkm::cont::DataSet* dataset, const diy::Master::ProxyWithLink& cp) {
if (dataset->HasField(field_name))
{
auto field = dataset->GetField(field_name);
const vtkm::cont::ArrayHandle<vtkm::Range> range = field.GetRange(TypeList(), StorageList());
const auto v_range =
vtkm::cont::detail::CopyArrayPortalToVector(range.GetPortalConstControl());
for (const vtkm::Range& r : v_range)
{
cp.all_reduce(r, std::plus<vtkm::Range>());
}
// If the current block has fewer than the max number of components, just add invalid ranges for the rest.
for (vtkm::Id cc = static_cast<vtkm::Id>(v_range.size()); cc < components; ++cc)
{
cp.all_reduce(vtkm::Range(), std::plus<vtkm::Range>());
}
}
});
master.process_collectives();
std::vector<vtkm::Range> ranges(components);
// FIXME: if master.size() == 0, i.e. there are no blocks on the current rank,
// this method won't return a valid range.
if (master.size() > 0)
{
for (vtkm::Id cc = 0; cc < components; ++cc)
{
ranges[cc] = master.proxy(0).get<vtkm::Range>();
}
}
vtkm::cont::ArrayHandle<vtkm::Range> range;
vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandle(ranges), range);
return range;
#else
bool valid_field = true;
const size_t num_blocks = blocks.size();
const size_t num_blocks = this->Blocks.size();
vtkm::cont::ArrayHandle<vtkm::Range> range;
vtkm::Id num_components = 0;
for (size_t i = 0; i < num_blocks; ++i)
{
if (!blocks[i].HasField(field_name))
if (!this->Blocks[i].HasField(field_name))
{
valid_field = false;
break;
}
const vtkm::cont::Field& field = blocks[i].GetField(field_name);
const vtkm::cont::Field& field = this->Blocks[i].GetField(field_name);
vtkm::cont::ArrayHandle<vtkm::Range> sub_range = field.GetRange(TypeList(), StorageList());
vtkm::cont::ArrayHandle<vtkm::Range>::PortalConstControl sub_range_control =
@ -324,6 +564,7 @@ MultiBlock::GetGlobalRange(const std::string& field_name, TypeList, StorageList)
}
return range;
#endif
}
VTKM_CONT
@ -332,10 +573,10 @@ void MultiBlock::PrintSummary(std::ostream& stream) const
stream << "block "
<< "\n";
for (size_t block_index = 0; block_index < blocks.size(); ++block_index)
for (size_t block_index = 0; block_index < this->Blocks.size(); ++block_index)
{
stream << "block " << block_index << "\n";
blocks[block_index].PrintSummary(stream);
this->Blocks[block_index].PrintSummary(stream);
}
}
}
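The net effect is that the distributed and serial paths share one call site: with MPI enabled the global queries all-reduce through diy, otherwise they reduce over the local blocks only. A usage sketch (assuming the overloads that default the coordinate system index and the type/storage lists):

#include <vtkm/cont/MultiBlock.h>

void Summarize(const vtkm::cont::MultiBlock& mb)
{
  vtkm::Id localBlocks = mb.GetNumberOfBlocks();        // this rank only
  vtkm::Id globalBlocks = mb.GetGlobalNumberOfBlocks(); // all-reduced sum

  // Both of these also reduce across ranks when VTKm_ENABLE_MPI is on.
  vtkm::Bounds bounds = mb.GetBounds();
  vtkm::cont::ArrayHandle<vtkm::Range> range = mb.GetGlobalRange("pointvar");
  (void)localBlocks;
  (void)globalBlocks;
  (void)bounds;
  (void)range;
}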

@ -64,6 +64,13 @@ public:
VTKM_CONT
vtkm::Id GetNumberOfBlocks() const;
/// Returns the number of blocks across all ranks. For non-MPI builds, this
/// will be the same as `GetNumberOfBlocks()`.
/// This method is not thread-safe and may involve global communication across
/// all ranks in distributed environments with MPI.
VTKM_CONT
vtkm::Id GetGlobalNumberOfBlocks() const;
VTKM_CONT
const vtkm::cont::DataSet& GetBlock(vtkm::Id blockId) const;
@ -105,7 +112,11 @@ public:
vtkm::Id coordinate_system_index,
TypeList,
StorageList) const;
/// get the unified range of the same feild within all contained DataSet
//@{
/// Get the unified range of the same field across all contained DataSets.
/// These methods are not thread-safe and may involve global communication
/// across all ranks in distributed environments with MPI.
VTKM_CONT
vtkm::cont::ArrayHandle<vtkm::Range> GetGlobalRange(const std::string& field_name) const;
@ -128,12 +139,13 @@ public:
VTKM_CONT vtkm::cont::ArrayHandle<vtkm::Range> GetGlobalRange(const int& index,
TypeList,
StorageList) const;
//@}
VTKM_CONT
void PrintSummary(std::ostream& stream) const;
private:
std::vector<vtkm::cont::DataSet> blocks;
std::vector<vtkm::cont::DataSet> Blocks;
};
}
} // namespace vtkm::cont

@ -33,6 +33,10 @@ static bool IsInitialized = false;
// True if all devices support concurrent pagable managed memory.
static bool ManagedMemorySupported = false;
// Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
// This value should be > 0 or else these functions will error out.
static std::size_t Threshold = 1 << 20;
}
namespace vtkm
@ -94,6 +98,12 @@ bool CudaAllocator::IsManagedPointer(const void* ptr)
void* CudaAllocator::Allocate(std::size_t numBytes)
{
CudaAllocator::Initialize();
// When numBytes is zero, cudaMallocManaged returns an error and the behavior
// of cudaMalloc is not documented. Just return nullptr.
if (numBytes == 0)
{
return nullptr;
}
void* ptr = nullptr;
if (ManagedMemorySupported)
@ -115,7 +125,7 @@ void CudaAllocator::Free(void* ptr)
void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
{
if (IsManagedPointer(ptr))
if (IsManagedPointer(ptr) && numBytes >= Threshold)
{
#if CUDART_VERSION >= 8000
// TODO these hints need to be benchmarked and adjusted once we start
@ -128,7 +138,7 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
{
if (IsManagedPointer(ptr))
if (IsManagedPointer(ptr) && numBytes >= Threshold)
{
#if CUDART_VERSION >= 8000
int dev;
@ -143,7 +153,7 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
{
if (IsManagedPointer(ptr))
if (IsManagedPointer(ptr) && numBytes >= Threshold)
{
#if CUDART_VERSION >= 8000
int dev;
@ -158,7 +168,7 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
{
if (IsManagedPointer(ptr))
if (IsManagedPointer(ptr) && numBytes >= Threshold)
{
#if CUDART_VERSION >= 8000
int dev;

@ -48,28 +48,28 @@ class CellSetPermutation;
/// DynamicObject's CastAndCall, but specializations of this function exist for
/// other classes (e.g. Field, CoordinateSystem, ArrayHandle).
template <typename DynamicObject, typename Functor, typename... Args>
void CastAndCall(const DynamicObject& dynamicObject, const Functor& f, Args&&... args)
void CastAndCall(const DynamicObject& dynamicObject, Functor&& f, Args&&... args)
{
dynamicObject.CastAndCall(f, std::forward<Args>(args)...);
dynamicObject.CastAndCall(std::forward<Functor>(f), std::forward<Args>(args)...);
}
/// A specialization of CastAndCall for basic CoordinateSystem to make
/// it be treated just like any other dynamic object
// actually implemented in vtkm/cont/CoordinateSystem
template <typename Functor, typename... Args>
void CastAndCall(const CoordinateSystem& coords, const Functor& f, Args&&... args);
void CastAndCall(const CoordinateSystem& coords, Functor&& f, Args&&... args);
/// A specialization of CastAndCall for basic Field to make
/// it be treated just like any other dynamic object
// actually implemented in vtkm/cont/Field
template <typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::Field& field, const Functor& f, Args&&... args);
void CastAndCall(const vtkm::cont::Field& field, Functor&& f, Args&&... args);
/// A specialization of CastAndCall for basic ArrayHandle types,
/// Since the type is already known no deduction is needed.
/// This specialization is used to simplify numerous worklet algorithms
template <typename T, typename U, typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::ArrayHandle<T, U>& handle, const Functor& f, Args&&... args)
void CastAndCall(const vtkm::cont::ArrayHandle<T, U>& handle, Functor&& f, Args&&... args)
{
f(handle, std::forward<Args>(args)...);
}
@ -78,9 +78,7 @@ void CastAndCall(const vtkm::cont::ArrayHandle<T, U>& handle, const Functor& f,
/// Since the type is already known no deduction is needed.
/// This specialization is used to simplify numerous worklet algorithms
template <vtkm::IdComponent Dim, typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::CellSetStructured<Dim>& cellset,
const Functor& f,
Args&&... args)
void CastAndCall(const vtkm::cont::CellSetStructured<Dim>& cellset, Functor&& f, Args&&... args)
{
f(cellset, std::forward<Args>(args)...);
}
@ -90,7 +88,7 @@ void CastAndCall(const vtkm::cont::CellSetStructured<Dim>& cellset,
/// This specialization is used to simplify numerous worklet algorithms
template <typename ConnectivityStorageTag, typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::CellSetSingleType<ConnectivityStorageTag>& cellset,
const Functor& f,
Functor&& f,
Args&&... args)
{
f(cellset, std::forward<Args>(args)...);
@ -101,7 +99,7 @@ void CastAndCall(const vtkm::cont::CellSetSingleType<ConnectivityStorageTag>& ce
/// This specialization is used to simplify numerous worklet algorithms
template <typename T, typename S, typename U, typename V, typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::CellSetExplicit<T, S, U, V>& cellset,
const Functor& f,
Functor&& f,
Args&&... args)
{
f(cellset, std::forward<Args>(args)...);
@ -112,7 +110,7 @@ void CastAndCall(const vtkm::cont::CellSetExplicit<T, S, U, V>& cellset,
/// This specialization is used to simplify numerous worklet algorithms
template <typename PermutationType, typename CellSetType, typename Functor, typename... Args>
void CastAndCall(const vtkm::cont::CellSetPermutation<PermutationType, CellSetType>& cellset,
const Functor& f,
Functor&& f,
Args&&... args)
{
f(cellset, std::forward<Args>(args)...);

@ -37,6 +37,7 @@ set(headers
vtkm_declare_headers(${headers})
set(unit_tests
UnitTestAlgorithm.cxx
UnitTestArrayCopy.cxx
UnitTestArrayHandleCartesianProduct.cxx
UnitTestArrayHandleCompositeVector.cxx
@ -65,7 +66,7 @@ set(unit_tests
UnitTestDeviceAdapterAlgorithmGeneral.cxx
UnitTestDynamicArrayHandle.cxx
UnitTestDynamicCellSet.cxx
UnitTestMultiBlock.cxx
UnitTestMultiBlock.cxx,MPI
UnitTestRuntimeDeviceInformation.cxx
UnitTestStorageBasic.cxx
UnitTestStorageImplicit.cxx

@ -53,6 +53,7 @@ public:
// 3D uniform datasets.
vtkm::cont::DataSet Make3DUniformDataSet0();
vtkm::cont::DataSet Make3DUniformDataSet1();
vtkm::cont::DataSet Make3DUniformDataSet2();
vtkm::cont::DataSet Make3DRegularDataSet0();
vtkm::cont::DataSet Make3DRegularDataSet1();
@ -245,6 +246,32 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DUniformDataSet1()
return dataSet;
}
inline vtkm::cont::DataSet MakeTestDataSet::Make3DUniformDataSet2()
{
const vtkm::Id base_size = 256;
vtkm::cont::DataSetBuilderUniform dsb;
vtkm::Id3 dimensions(base_size, base_size, base_size);
vtkm::cont::DataSet dataSet = dsb.Create(dimensions);
vtkm::cont::DataSetFieldAdd dsf;
const vtkm::Id nVerts = base_size * base_size * base_size;
vtkm::Float32* pointvar = new vtkm::Float32[nVerts];
for (vtkm::Int32 z = 0; z < base_size; ++z)
for (vtkm::Int32 y = 0; y < base_size; ++y)
for (vtkm::Int32 x = 0; x < base_size; ++x)
{
vtkm::Int32 index = z * base_size * base_size + y * base_size + x;
pointvar[index] = vtkm::Sqrt(vtkm::Float32(x * x + y * y + z * z));
}
dsf.AddPointField(dataSet, "pointvar", pointvar, nVerts);
delete[] pointvar;
return dataSet;
}
inline vtkm::cont::DataSet MakeTestDataSet::Make2DRectilinearDataSet0()
{
vtkm::cont::DataSetBuilderRectilinear dsb;
@ -287,11 +314,13 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DRegularDataSet0()
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));
//Set point scalar
dataSet.AddField(Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts));
dataSet.AddField(
make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts, vtkm::CopyFlag::On));
//Set cell scalar
vtkm::Float32 cellvar[4] = { 100.1f, 100.2f, 100.3f, 100.4f };
dataSet.AddField(Field("cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 4));
dataSet.AddField(make_Field(
"cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 4, vtkm::CopyFlag::On));
static const vtkm::IdComponent dim = 3;
vtkm::cont::CellSetStructured<dim> cellSet("cells");
@ -312,11 +341,13 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DRegularDataSet1()
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));
//Set point scalar
dataSet.AddField(Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts));
dataSet.AddField(
make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts, vtkm::CopyFlag::On));
//Set cell scalar
vtkm::Float32 cellvar[1] = { 100.1f };
dataSet.AddField(Field("cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 1));
dataSet.AddField(make_Field(
"cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 1, vtkm::CopyFlag::On));
static const vtkm::IdComponent dim = 3;
vtkm::cont::CellSetStructured<dim> cellSet("cells");
@ -556,7 +587,8 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DExplicitDataSet1()
CoordType(2, 2, 0) };
vtkm::Float32 vars[nVerts] = { 10.1f, 20.1f, 30.2f, 40.2f, 50.3f };
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates, nVerts));
dataSet.AddCoordinateSystem(
vtkm::cont::make_CoordinateSystem("coordinates", coordinates, nVerts, vtkm::CopyFlag::On));
vtkm::cont::CellSetExplicit<> cellSet("cells");
cellSet.PrepareToAddCells(2, 7);
cellSet.AddCell(vtkm::CELL_SHAPE_TRIANGLE, 3, make_Vec<vtkm::Id>(0, 1, 2));
@ -565,11 +597,13 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DExplicitDataSet1()
dataSet.AddCellSet(cellSet);
//Set point scalar
dataSet.AddField(Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts));
dataSet.AddField(
make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts, vtkm::CopyFlag::On));
//Set cell scalar
vtkm::Float32 cellvar[2] = { 100.1f, 100.2f };
dataSet.AddField(Field("cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 2));
dataSet.AddField(make_Field(
"cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 2, vtkm::CopyFlag::On));
return dataSet;
}
@ -592,14 +626,17 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DExplicitDataSet2()
};
vtkm::Float32 vars[nVerts] = { 10.1f, 20.1f, 30.2f, 40.2f, 50.3f, 60.2f, 70.2f, 80.3f };
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates, nVerts));
dataSet.AddCoordinateSystem(
vtkm::cont::make_CoordinateSystem("coordinates", coordinates, nVerts, vtkm::CopyFlag::On));
//Set point scalar
dataSet.AddField(Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts));
dataSet.AddField(
make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts, vtkm::CopyFlag::On));
//Set cell scalar
vtkm::Float32 cellvar[1] = { 100.1f };
dataSet.AddField(Field("cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 1));
dataSet.AddField(make_Field(
"cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 1, vtkm::CopyFlag::On));
vtkm::cont::CellSetExplicit<> cellSet("cells");
vtkm::Vec<vtkm::Id, 8> ids;
@ -645,14 +682,17 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DExplicitDataSet4()
vtkm::Float32 vars[nVerts] = { 10.1f, 20.1f, 30.2f, 40.2f, 50.3f, 60.2f,
70.2f, 80.3f, 90.f, 10.f, 11.f, 12.f };
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates, nVerts));
dataSet.AddCoordinateSystem(
vtkm::cont::make_CoordinateSystem("coordinates", coordinates, nVerts, vtkm::CopyFlag::On));
//Set point scalar
dataSet.AddField(Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts));
dataSet.AddField(
make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts, vtkm::CopyFlag::On));
//Set cell scalar
vtkm::Float32 cellvar[2] = { 100.1f, 110.f };
dataSet.AddField(Field("cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 2));
dataSet.AddField(make_Field(
"cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 2, vtkm::CopyFlag::On));
vtkm::cont::CellSetExplicit<> cellSet("cells");
vtkm::Vec<vtkm::Id, 8> ids;
@ -695,14 +735,17 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DExplicitDataSet3()
};
vtkm::Float32 vars[nVerts] = { 10.1f, 10.1f, 10.2f, 30.2f };
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates, nVerts));
dataSet.AddCoordinateSystem(
vtkm::cont::make_CoordinateSystem("coordinates", coordinates, nVerts, vtkm::CopyFlag::On));
//Set point scalar
dataSet.AddField(Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts));
dataSet.AddField(
make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts, vtkm::CopyFlag::On));
//Set cell scalar
vtkm::Float32 cellvar[1] = { 100.1f };
dataSet.AddField(Field("cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 1));
dataSet.AddField(make_Field(
"cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, 1, vtkm::CopyFlag::On));
vtkm::cont::CellSetExplicit<> cellSet("cells");
vtkm::Vec<vtkm::Id, 4> ids;
@ -743,15 +786,18 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DExplicitDataSet5()
vtkm::Float32 vars[nVerts] = { 10.1f, 20.1f, 30.2f, 40.2f, 50.3f, 60.2f,
70.2f, 80.3f, 90.f, 10.f, 11.f };
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates, nVerts));
dataSet.AddCoordinateSystem(
vtkm::cont::make_CoordinateSystem("coordinates", coordinates, nVerts, vtkm::CopyFlag::On));
//Set point scalar
dataSet.AddField(Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts));
dataSet.AddField(
make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, nVerts, vtkm::CopyFlag::On));
//Set cell scalar
const int nCells = 4;
vtkm::Float32 cellvar[nCells] = { 100.1f, 110.f, 120.2f, 130.5f };
dataSet.AddField(Field("cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, nCells));
dataSet.AddField(make_Field(
"cellvar", vtkm::cont::Field::ASSOC_CELL_SET, "cells", cellvar, nCells, vtkm::CopyFlag::On));
vtkm::cont::CellSetExplicit<> cellSet("cells");
vtkm::Vec<vtkm::Id, 8> ids;
@ -982,7 +1028,8 @@ inline vtkm::cont::DataSet MakeTestDataSet::Make3DExplicitDataSetCowNose()
// create DataSet
vtkm::cont::DataSet dataSet;
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates, nVerts));
dataSet.AddCoordinateSystem(
vtkm::cont::make_CoordinateSystem("coordinates", coordinates, nVerts, vtkm::CopyFlag::On));
vtkm::cont::ArrayHandle<vtkm::Id> connectivity;
connectivity.Allocate(connectivitySize);
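The hunks above all apply one migration: the pointer-taking Field and CoordinateSystem constructors are replaced by the make_Field / make_CoordinateSystem helpers, which take an explicit vtkm::CopyFlag. A minimal sketch of the pattern, not part of this diff, using a hypothetical stand-in array:

#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/Field.h>

void AddExampleField(vtkm::cont::DataSet& dataSet)
{
  // Stack-local data, so a deep copy (CopyFlag::On) is required for the
  // field to outlive this function; CopyFlag::Off would only reference it.
  vtkm::Float32 values[3] = { 1.0f, 2.0f, 3.0f };
  dataSet.AddField(vtkm::cont::make_Field(
    "example", vtkm::cont::Field::ASSOC_POINTS, values, 3, vtkm::CopyFlag::On));
}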

@ -60,7 +60,7 @@ private:
const vtkm::Id nvals = 11;
T data[nvals] = { 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0 };
std::random_shuffle(data, data + nvals);
vtkm::cont::Field field("TestField", vtkm::cont::Field::ASSOC_POINTS, data, nvals);
auto field = vtkm::cont::make_Field("TestField", vtkm::cont::Field::ASSOC_POINTS, data, nvals);
vtkm::Range result;
field.GetRange(&result);
@ -84,7 +84,8 @@ private:
fieldData[j][i] = data[j];
}
}
vtkm::cont::Field field("TestField", vtkm::cont::Field::ASSOC_POINTS, fieldData, nvals);
auto field =
vtkm::cont::make_Field("TestField", vtkm::cont::Field::ASSOC_POINTS, fieldData, nvals);
vtkm::Range result[NumberOfComponents];
field.GetRange(result, CustomTypeList(), VTKM_DEFAULT_STORAGE_LIST_TAG());
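The range-computation tests now build their fields through make_Field as well, here using the overload without a CopyFlag argument. A sketch of the pattern under test, not part of this diff:

#include <vtkm/Range.h>
#include <vtkm/cont/Field.h>

void RangeExample()
{
  vtkm::Float32 data[4] = { 1.0f, 3.0f, -2.0f, 5.0f };
  auto field = vtkm::cont::make_Field("TestField", vtkm::cont::Field::ASSOC_POINTS, data, 4);
  vtkm::Range range;
  field.GetRange(&range); // expect range.Min == -2 and range.Max == 5
}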

@ -0,0 +1,185 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2017 UT-Battelle, LLC.
// Copyright 2017 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/Algorithm.h>
#include <vtkm/TypeTraits.h>
#include <vtkm/cont/testing/Testing.h>
namespace
{
// The goal of this unit test is not to verify the correctness
// of the various algorithms. Since Algorithm is a header, we
// need to ensure we instantiate each algorithm in a source
// file to verify compilation.
//
static const vtkm::Id ARRAY_SIZE = 10;
void CopyTest()
{
vtkm::cont::ArrayHandle<vtkm::Id> input;
vtkm::cont::ArrayHandle<vtkm::Id> output;
vtkm::cont::ArrayHandle<vtkm::Id> stencil;
input.Allocate(ARRAY_SIZE);
output.Allocate(ARRAY_SIZE);
stencil.Allocate(ARRAY_SIZE);
vtkm::cont::Algorithm::Copy(input, output);
vtkm::cont::Algorithm::CopyIf(input, stencil, output);
vtkm::cont::Algorithm::CopyIf(input, stencil, output, vtkm::LogicalNot());
vtkm::cont::Algorithm::CopySubRange(input, 2, 1, output);
}
void BoundsTest()
{
vtkm::cont::ArrayHandle<vtkm::Id> input;
vtkm::cont::ArrayHandle<vtkm::Id> output;
vtkm::cont::ArrayHandle<vtkm::Id> values;
input.Allocate(ARRAY_SIZE);
output.Allocate(ARRAY_SIZE);
values.Allocate(ARRAY_SIZE);
vtkm::cont::Algorithm::LowerBounds(input, values, output);
vtkm::cont::Algorithm::LowerBounds(input, values, output, vtkm::Sum());
vtkm::cont::Algorithm::LowerBounds(input, values);
vtkm::cont::Algorithm::UpperBounds(input, values, output);
vtkm::cont::Algorithm::UpperBounds(input, values, output, vtkm::Sum());
vtkm::cont::Algorithm::UpperBounds(input, values);
}
void ReduceTest()
{
vtkm::cont::ArrayHandle<vtkm::Id> input;
vtkm::cont::ArrayHandle<vtkm::Id> keys;
vtkm::cont::ArrayHandle<vtkm::Id> keysOut;
vtkm::cont::ArrayHandle<vtkm::Id> valsOut;
input.Allocate(ARRAY_SIZE);
keys.Allocate(ARRAY_SIZE);
keysOut.Allocate(ARRAY_SIZE);
valsOut.Allocate(ARRAY_SIZE);
vtkm::Id result;
result = vtkm::cont::Algorithm::Reduce(input, vtkm::Id(0));
result = vtkm::cont::Algorithm::Reduce(input, vtkm::Id(0), vtkm::Maximum());
vtkm::cont::Algorithm::ReduceByKey(keys, input, keysOut, valsOut, vtkm::Maximum());
(void)result;
}
void ScanTest()
{
vtkm::cont::ArrayHandle<vtkm::Id> input;
vtkm::cont::ArrayHandle<vtkm::Id> output;
vtkm::cont::ArrayHandle<vtkm::Id> keys;
input.Allocate(ARRAY_SIZE);
output.Allocate(ARRAY_SIZE);
keys.Allocate(ARRAY_SIZE);
vtkm::Id out;
out = vtkm::cont::Algorithm::ScanInclusive(input, output);
out = vtkm::cont::Algorithm::ScanInclusive(input, output, vtkm::Maximum());
out = vtkm::cont::Algorithm::StreamingScanExclusive(1, input, output);
vtkm::cont::Algorithm::ScanInclusiveByKey(keys, input, output, vtkm::Maximum());
vtkm::cont::Algorithm::ScanInclusiveByKey(keys, input, output);
out = vtkm::cont::Algorithm::ScanExclusive(input, output, vtkm::Maximum(), vtkm::Id(0));
vtkm::cont::Algorithm::ScanExclusiveByKey(keys, input, output, vtkm::Id(0), vtkm::Maximum());
vtkm::cont::Algorithm::ScanExclusiveByKey(keys, input, output);
(void)out;
}
struct DummyFunctor : public vtkm::exec::FunctorBase
{
template <typename IdType>
VTKM_EXEC void operator()(IdType) const
{
}
};
void ScheduleTest()
{
vtkm::cont::Algorithm::Schedule(DummyFunctor(), vtkm::Id(1));
vtkm::Id3 id3(1, 1, 1);
vtkm::cont::Algorithm::Schedule(DummyFunctor(), id3);
}
struct CompFunctor
{
template <typename T>
VTKM_EXEC_CONT bool operator()(const T& x, const T& y) const
{
return x < y;
}
};
void SortTest()
{
vtkm::cont::ArrayHandle<vtkm::Id> input;
vtkm::cont::ArrayHandle<vtkm::Id> keys;
input.Allocate(ARRAY_SIZE);
keys.Allocate(ARRAY_SIZE);
vtkm::cont::Algorithm::Sort(input);
vtkm::cont::Algorithm::Sort(input, CompFunctor());
vtkm::cont::Algorithm::SortByKey(keys, input);
vtkm::cont::Algorithm::SortByKey(keys, input, CompFunctor());
}
void SynchronizeTest()
{
vtkm::cont::Algorithm::Synchronize();
}
void UniqueTest()
{
vtkm::cont::ArrayHandle<vtkm::Id> input;
input.Allocate(ARRAY_SIZE);
vtkm::cont::Algorithm::Unique(input);
vtkm::cont::Algorithm::Unique(input, CompFunctor());
}
void TestAll()
{
CopyTest();
BoundsTest();
ReduceTest();
ScanTest();
ScheduleTest();
SortTest();
SynchronizeTest();
UniqueTest();
}
} // anonymous namespace
int UnitTestAlgorithm(int, char* [])
{
return vtkm::cont::testing::Testing::Run(TestAll);
}
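
UnitTestAlgorithm only instantiates each entry point to verify compilation; for reference, a typical call looks like the following. A sketch, not part of this diff:

#include <vtkm/cont/Algorithm.h>
#include <vtkm/cont/ArrayHandle.h>

vtkm::Id SumExample(const vtkm::cont::ArrayHandle<vtkm::Id>& input)
{
  // Algorithm forwards to a device adapter; the second argument is the
  // initial value of the reduction, as in ReduceTest() above.
  return vtkm::cont::Algorithm::Reduce(input, vtkm::Id(0));
}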

@ -27,6 +27,7 @@
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/DataSetFieldAdd.h>
#include <vtkm/cont/DynamicArrayHandle.h>
#include <vtkm/cont/EnvironmentTracker.h>
#include <vtkm/cont/Field.h>
#include <vtkm/cont/MultiBlock.h>
#include <vtkm/cont/serial/DeviceAdapterSerial.h>
@ -34,6 +35,10 @@
#include <vtkm/cont/testing/Testing.h>
#include <vtkm/exec/ConnectivityStructured.h>
#if defined(VTKM_ENABLE_MPI)
#include <diy/master.hpp>
#endif
void DataSet_Compare(vtkm::cont::DataSet& LeftDateSet, vtkm::cont::DataSet& RightDateSet);
static void MultiBlockTest()
{
@ -46,7 +51,14 @@ static void MultiBlockTest()
multiblock.AddBlock(TDset1);
multiblock.AddBlock(TDset2);
int procsize = 1;
#if defined(VTKM_ENABLE_MPI)
procsize = vtkm::cont::EnvironmentTracker::GetCommunicator().size();
#endif
VTKM_TEST_ASSERT(multiblock.GetNumberOfBlocks() == 2, "Incorrect number of blocks");
VTKM_TEST_ASSERT(multiblock.GetGlobalNumberOfBlocks() == 2 * procsize,
"Incorrect number of blocks");
vtkm::cont::DataSet TestDSet = multiblock.GetBlock(0);
VTKM_TEST_ASSERT(TDset1.GetNumberOfFields() == TestDSet.GetNumberOfFields(),
@ -155,7 +167,13 @@ void DataSet_Compare(vtkm::cont::DataSet& LeftDateSet, vtkm::cont::DataSet& Righ
return;
}
int UnitTestMultiBlock(int, char* [])
int UnitTestMultiBlock(int argc, char* argv[])
{
(void)argc;
(void)argv;
#if defined(VTKM_ENABLE_MPI)
diy::mpi::environment env(argc, argv);
vtkm::cont::EnvironmentTracker::SetCommunicator(diy::mpi::communicator(MPI_COMM_WORLD));
#endif
return vtkm::cont::testing::Testing::Run(MultiBlockTest);
}
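
The pattern added here (construct a diy::mpi::environment, then hand a communicator to the EnvironmentTracker before running tests) generalizes to any MPI-enabled VTK-m program. A sketch, not part of this diff, assuming a VTKM_ENABLE_MPI build:

#include <diy/master.hpp>
#include <vtkm/cont/EnvironmentTracker.h>

int main(int argc, char* argv[])
{
  diy::mpi::environment env(argc, argv); // initializes MPI, finalizes on scope exit
  vtkm::cont::EnvironmentTracker::SetCommunicator(diy::mpi::communicator(MPI_COMM_WORLD));
  // All VTK-m code now sees the same communicator through the tracker:
  int ranks = vtkm::cont::EnvironmentTracker::GetCommunicator().size();
  (void)ranks; // e.g. global block count == local blocks * ranks
  return 0;
}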

@ -18,7 +18,6 @@
// this software.
//============================================================================
#include <vector>
#include <vtkm/cont/DataSet.h>
#include <vtkm/worklet/NDimsEntropy.h>
@ -57,11 +56,9 @@ inline VTKM_CONT vtkm::filter::Result NDEntropy::DoExecute(
// Run worklet to calculate multi-variate entropy
vtkm::Float64 entropy = ndEntropy.Run(device);
vtkm::cont::DataSet outputData;
std::vector<vtkm::Float64> entropyHandle;
entropyHandle.push_back(entropy);
outputData.AddField(vtkm::cont::Field("Entropy", vtkm::cont::Field::ASSOC_POINTS, entropyHandle));
outputData.AddField(vtkm::cont::make_Field(
"Entropy", vtkm::cont::Field::ASSOC_POINTS, &entropy, 1, vtkm::CopyFlag::On));
return vtkm::filter::Result(outputData);

@ -59,7 +59,7 @@ void TestFieldTypesPoint()
//verify the field helper works properly
vtkm::Float32 vars[6] = { 10.1f, 20.1f, 30.1f, 40.1f, 50.1f, 60.1f };
vtkm::cont::Field field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, 6);
auto field = vtkm::cont::make_Field("pointvar", vtkm::cont::Field::ASSOC_POINTS, vars, 6);
vtkm::filter::FieldMetadata makeMDFromField(field);
VTKM_TEST_ASSERT(makeMDFromField.IsPointField() == true, "point should be a point field");
VTKM_TEST_ASSERT(makeMDFromField.IsCellField() == false, "point can't be a cell field");
@ -74,7 +74,8 @@ void TestFieldTypesCell()
//verify the field helper works properly
vtkm::Float32 vars[6] = { 10.1f, 20.1f, 30.1f, 40.1f, 50.1f, 60.1f };
vtkm::cont::Field field("pointvar", vtkm::cont::Field::ASSOC_CELL_SET, std::string(), vars, 6);
auto field =
vtkm::cont::make_Field("pointvar", vtkm::cont::Field::ASSOC_CELL_SET, std::string(), vars, 6);
vtkm::filter::FieldMetadata makeMDFromField(field);
VTKM_TEST_ASSERT(makeMDFromField.IsPointField() == false, "cell can't be a point field");
VTKM_TEST_ASSERT(makeMDFromField.IsCellField() == true, "cell should be a cell field");

@ -227,23 +227,28 @@ vtkm::cont::DataSet MakeTestDataSet()
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));
// Set point scalars
dataSet.AddField(
vtkm::cont::Field("p_poisson", vtkm::cont::Field::ASSOC_POINTS, poisson, nVerts));
dataSet.AddField(vtkm::cont::Field("p_normal", vtkm::cont::Field::ASSOC_POINTS, normal, nVerts));
dataSet.AddField(
vtkm::cont::Field("p_chiSquare", vtkm::cont::Field::ASSOC_POINTS, chiSquare, nVerts));
dataSet.AddField(
vtkm::cont::Field("p_uniform", vtkm::cont::Field::ASSOC_POINTS, uniform, nVerts));
dataSet.AddField(vtkm::cont::make_Field(
"p_poisson", vtkm::cont::Field::ASSOC_POINTS, poisson, nVerts, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"p_normal", vtkm::cont::Field::ASSOC_POINTS, normal, nVerts, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"p_chiSquare", vtkm::cont::Field::ASSOC_POINTS, chiSquare, nVerts, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"p_uniform", vtkm::cont::Field::ASSOC_POINTS, uniform, nVerts, vtkm::CopyFlag::On));
// Set cell scalars
dataSet.AddField(
vtkm::cont::Field("c_poisson", vtkm::cont::Field::ASSOC_CELL_SET, "cells", poisson, nCells));
dataSet.AddField(
vtkm::cont::Field("c_normal", vtkm::cont::Field::ASSOC_CELL_SET, "cells", normal, nCells));
dataSet.AddField(vtkm::cont::Field(
"c_chiSquare", vtkm::cont::Field::ASSOC_CELL_SET, "cells", chiSquare, nCells));
dataSet.AddField(
vtkm::cont::Field("c_uniform", vtkm::cont::Field::ASSOC_CELL_SET, "cells", poisson, nCells));
dataSet.AddField(vtkm::cont::make_Field(
"c_poisson", vtkm::cont::Field::ASSOC_CELL_SET, "cells", poisson, nCells, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"c_normal", vtkm::cont::Field::ASSOC_CELL_SET, "cells", normal, nCells, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field("c_chiSquare",
vtkm::cont::Field::ASSOC_CELL_SET,
"cells",
chiSquare,
nCells,
vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"c_uniform", vtkm::cont::Field::ASSOC_CELL_SET, "cells", poisson, nCells, vtkm::CopyFlag::On));
vtkm::cont::CellSetStructured<dimension> cellSet("cells");

@ -173,9 +173,12 @@ vtkm::cont::DataSet MakeTestDataSet()
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));
// Set point scalars
dataSet.AddField(vtkm::cont::Field("fieldA", vtkm::cont::Field::ASSOC_POINTS, fieldA, nVerts));
dataSet.AddField(vtkm::cont::Field("fieldB", vtkm::cont::Field::ASSOC_POINTS, fieldB, nVerts));
dataSet.AddField(vtkm::cont::Field("fieldC", vtkm::cont::Field::ASSOC_POINTS, fieldC, nVerts));
dataSet.AddField(vtkm::cont::make_Field(
"fieldA", vtkm::cont::Field::ASSOC_POINTS, fieldA, nVerts, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"fieldB", vtkm::cont::Field::ASSOC_POINTS, fieldB, nVerts, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"fieldC", vtkm::cont::Field::ASSOC_POINTS, fieldC, nVerts, vtkm::CopyFlag::On));
return dataSet;
}

@ -56,9 +56,12 @@ vtkm::cont::DataSet MakeTestDataSet()
};
// Set point scalars
dataSet.AddField(vtkm::cont::Field("fieldA", vtkm::cont::Field::ASSOC_POINTS, fieldA, nVerts));
dataSet.AddField(vtkm::cont::Field("fieldB", vtkm::cont::Field::ASSOC_POINTS, fieldB, nVerts));
dataSet.AddField(vtkm::cont::Field("fieldC", vtkm::cont::Field::ASSOC_POINTS, fieldC, nVerts));
dataSet.AddField(vtkm::cont::make_Field(
"fieldA", vtkm::cont::Field::ASSOC_POINTS, fieldA, nVerts, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"fieldB", vtkm::cont::Field::ASSOC_POINTS, fieldB, nVerts, vtkm::CopyFlag::On));
dataSet.AddField(vtkm::cont::make_Field(
"fieldC", vtkm::cont::Field::ASSOC_POINTS, fieldC, nVerts, vtkm::CopyFlag::On));
return dataSet;
}

@ -44,7 +44,8 @@ vtkm::cont::DataSet MakePointElevationTestDataSet()
}
vtkm::Id numCells = (dim - 1) * (dim - 1);
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));
dataSet.AddCoordinateSystem(
vtkm::cont::make_CoordinateSystem("coordinates", coordinates, vtkm::CopyFlag::On));
vtkm::cont::CellSetExplicit<> cellSet("cells");
cellSet.PrepareToAddCells(numCells, numCells * 4);
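
The same migration applies to coordinate systems built from std::vector data, where no explicit length is needed because the vector carries its size. A sketch, not part of this diff, assuming a vector of Float32 points like the one in this test:

#include <vector>
#include <vtkm/cont/CoordinateSystem.h>
#include <vtkm/cont/DataSet.h>

void AddExampleCoordinates(vtkm::cont::DataSet& dataSet,
                           const std::vector<vtkm::Vec<vtkm::Float32, 3>>& coordinates)
{
  // CopyFlag::On deep-copies the vector's contents into VTK-m storage.
  dataSet.AddCoordinateSystem(
    vtkm::cont::make_CoordinateSystem("coordinates", coordinates, vtkm::CopyFlag::On));
}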

@ -28,6 +28,7 @@ set(VTKM_USE_64BIT_IDS ${VTKm_USE_64BIT_IDS})
set(VTKM_ENABLE_CUDA ${VTKm_ENABLE_CUDA})
set(VTKM_ENABLE_TBB ${VTKm_ENABLE_TBB})
set(VTKM_ENABLE_MPI ${VTKm_ENABLE_MPI})
vtkm_get_kit_name(kit_name kit_dir)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Configure.h.in

@ -263,6 +263,9 @@
#cmakedefine VTKM_ENABLE_TBB
#endif
//Mark if we are building with MPI enabled.
#cmakedefine VTKM_ENABLE_MPI
#if __cplusplus >= 201103L || \
( defined(VTKM_MSVC) && _MSC_VER >= 1800 ) || \
( defined(VTKM_ICC) && defined(__INTEL_CXX11_MODE__) )
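
With the new #cmakedefine in place, downstream code can key off VTKM_ENABLE_MPI after including the configured header. A sketch, not part of this diff; the include path is an assumption:

#include <vtkm/internal/Configure.h>

#if defined(VTKM_ENABLE_MPI)
// MPI build: e.g. scale expectations by the communicator size.
#else
// Serial build: fall back to single-process behavior.
#endif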
