From 3eb8294b24344704a37aacaa5c613abe2a12b488 Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Tue, 19 Dec 2017 15:14:30 -0500
Subject: [PATCH 01/24] Build vtkm::cont::DataSet into the vtkm_cont library.

---
 vtkm/cont/CMakeLists.txt |   1 +
 vtkm/cont/DataSet.cxx    | 164 +++++++++++++++++++++++++++++++++++++++
 vtkm/cont/DataSet.h      | 149 ++++++-----------------------------
 3 files changed, 187 insertions(+), 127 deletions(-)
 create mode 100644 vtkm/cont/DataSet.cxx

diff --git a/vtkm/cont/CMakeLists.txt b/vtkm/cont/CMakeLists.txt
index 397c3ef3a..f9f43103a 100644
--- a/vtkm/cont/CMakeLists.txt
+++ b/vtkm/cont/CMakeLists.txt
@@ -98,6 +98,7 @@ set(sources
   CellSetExplicit.cxx
   CellSetStructured.cxx
   CoordinateSystem.cxx
+  DataSet.cxx
   DynamicArrayHandle.cxx
   EnvironmentTracker.cxx
   Field.cxx
diff --git a/vtkm/cont/DataSet.cxx b/vtkm/cont/DataSet.cxx
new file mode 100644
index 000000000..3cfc36982
--- /dev/null
+++ b/vtkm/cont/DataSet.cxx
@@ -0,0 +1,164 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2015 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2015 UT-Battelle, LLC.
+//  Copyright 2015 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#include <vtkm/cont/DataSet.h>
+
+namespace vtkm
+{
+namespace cont
+{
+
+DataSet::DataSet()
+{
+}
+
+void DataSet::Clear()
+{
+  this->CoordSystems.clear();
+  this->Fields.clear();
+  this->CellSets.clear();
+}
+
+const vtkm::cont::Field& DataSet::GetField(vtkm::Id index) const
+{
+  VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfFields()));
+  return this->Fields[static_cast<std::size_t>(index)];
+}
+
+vtkm::Id DataSet::GetFieldIndex(const std::string& name,
+                                vtkm::cont::Field::AssociationEnum assoc) const
+{
+  bool found;
+  vtkm::Id index = this->FindFieldIndex(name, assoc, found);
+  if (found)
+  {
+    return index;
+  }
+  else
+  {
+    throw vtkm::cont::ErrorBadValue("No field with requested name: " + name);
+  }
+}
+
+const vtkm::cont::CoordinateSystem& DataSet::GetCoordinateSystem(vtkm::Id index) const
+{
+  VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfCoordinateSystems()));
+  return this->CoordSystems[static_cast<std::size_t>(index)];
+}
+
+vtkm::Id DataSet::GetCoordinateSystemIndex(const std::string& name) const
+{
+  bool found;
+  vtkm::Id index = this->FindCoordinateSystemIndex(name, found);
+  if (found)
+  {
+    return index;
+  }
+  else
+  {
+    throw vtkm::cont::ErrorBadValue("No coordinate system with requested name");
+  }
+}
+
+vtkm::Id DataSet::GetCellSetIndex(const std::string& name) const
+{
+  bool found;
+  vtkm::Id index = this->FindCellSetIndex(name, found);
+  if (found)
+  {
+    return index;
+  }
+  else
+  {
+    throw vtkm::cont::ErrorBadValue("No cell set with requested name");
+  }
+}
+
+void DataSet::PrintSummary(std::ostream& out) const
+{
+  out << "DataSet:\n";
+  out << "  CoordSystems[" << this->CoordSystems.size() << "]\n";
+  for (std::size_t index = 0; index < this->CoordSystems.size(); index++)
+  {
+    this->CoordSystems[index].PrintSummary(out);
+  }
+
+  out << "  CellSets[" << this->GetNumberOfCellSets() << "]\n";
+  for (vtkm::Id index = 0; index < this->GetNumberOfCellSets(); index++)
+  {
+    this->GetCellSet(index).PrintSummary(out);
+  }
+
+  out << "  Fields[" << this->GetNumberOfFields() << "]\n";
+  for (vtkm::Id index = 0; index < this->GetNumberOfFields(); index++)
+  {
+    this->GetField(index).PrintSummary(out);
+  }
+}
+
+vtkm::Id DataSet::FindFieldIndex(const std::string& name,
+                                 vtkm::cont::Field::AssociationEnum association,
+                                 bool& found) const
+{
+  for (std::size_t index = 0; index < this->Fields.size(); ++index)
+  {
+    if ((association == vtkm::cont::Field::ASSOC_ANY ||
+         association == this->Fields[index].GetAssociation()) &&
+        this->Fields[index].GetName() == name)
+    {
+      found = true;
+      return static_cast<vtkm::Id>(index);
+    }
+  }
+  found = false;
+  return -1;
+}
+
+
+vtkm::Id DataSet::FindCoordinateSystemIndex(const std::string& name, bool& found) const
+{
+  for (std::size_t index = 0; index < this->CoordSystems.size(); ++index)
+  {
+    if (this->CoordSystems[index].GetName() == name)
+    {
+      found = true;
+      return static_cast<vtkm::Id>(index);
+    }
+  }
+  found = false;
+  return -1;
+}
+
+vtkm::Id DataSet::FindCellSetIndex(const std::string& name, bool& found) const
+{
+  for (std::size_t index = 0; index < static_cast<size_t>(this->GetNumberOfCellSets()); ++index)
+  {
+    if (this->CellSets[index].GetName() == name)
+    {
+      found = true;
+      return static_cast<vtkm::Id>(index);
+    }
+  }
+  found = false;
+  return -1;
+}
+
+} // namespace cont
+} // namespace vtkm
diff --git a/vtkm/cont/DataSet.h b/vtkm/cont/DataSet.h
index 71e3a6da6..f4854b07d 100644
--- a/vtkm/cont/DataSet.h
+++ b/vtkm/cont/DataSet.h
@@ -20,6 +20,8 @@
 #ifndef vtk_m_cont_DataSet_h
 #define vtk_m_cont_DataSet_h
 
+#include <vtkm/cont/vtkm_cont_export.h>
+
 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/CoordinateSystem.h>
 #include <vtkm/cont/DeviceAdapterAlgorithm.h>
@@ -33,29 +35,17 @@ namespace vtkm
 namespace cont
 {
 
-class DataSet
+class VTKM_CONT_EXPORT DataSet
 {
 public:
-  VTKM_CONT
-  DataSet() {}
+  VTKM_CONT DataSet();
+
+  VTKM_CONT void Clear();
+
+  VTKM_CONT void AddField(const Field& field) { this->Fields.push_back(field); }
 
   VTKM_CONT
-  void Clear()
-  {
-    this->CoordSystems.clear();
-    this->Fields.clear();
-    this->CellSets.clear();
-  }
-
-  VTKM_CONT
-  void AddField(Field field) { this->Fields.push_back(field); }
-
-  VTKM_CONT
-  const vtkm::cont::Field& GetField(vtkm::Id index) const
-  {
-    VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfFields()));
-    return this->Fields[static_cast<std::size_t>(index)];
-  }
+  const vtkm::cont::Field& GetField(vtkm::Id index) const;
 
   VTKM_CONT
   bool HasField(const std::string& name,
@@ -69,19 +59,7 @@ public:
   VTKM_CONT
   vtkm::Id GetFieldIndex(
     const std::string& name,
-    vtkm::cont::Field::AssociationEnum assoc = vtkm::cont::Field::ASSOC_ANY) const
-  {
-    bool found;
-    vtkm::Id index = this->FindFieldIndex(name, assoc, found);
-    if (found)
-    {
-      return index;
-    }
-    else
-    {
-      throw vtkm::cont::ErrorBadValue("No field with requested name: " + name);
-    }
-  }
+    vtkm::cont::Field::AssociationEnum assoc = vtkm::cont::Field::ASSOC_ANY) const;
 
   VTKM_CONT
   const vtkm::cont::Field& GetField(
@@ -104,14 +82,13 @@ public:
   }
 
   VTKM_CONT
-  void AddCoordinateSystem(vtkm::cont::CoordinateSystem cs) { this->CoordSystems.push_back(cs); }
+  void AddCoordinateSystem(const vtkm::cont::CoordinateSystem& cs)
+  {
+    this->CoordSystems.push_back(cs);
+  }
 
   VTKM_CONT
-  const vtkm::cont::CoordinateSystem& GetCoordinateSystem(vtkm::Id index = 0) const
-  {
-    VTKM_ASSERT((index >= 0) && (index < this->GetNumberOfCoordinateSystems()));
-    return this->CoordSystems[static_cast<std::size_t>(index)];
-  }
+  const vtkm::cont::CoordinateSystem& GetCoordinateSystem(vtkm::Id index = 0) const;
 
   VTKM_CONT
   bool HasCoordinateSystem(const std::string& name) const
@@ -122,19 +99,7 @@ public:
   }
 
   VTKM_CONT
-  vtkm::Id GetCoordinateSystemIndex(const std::string& name) const
-  {
-    bool found;
-    vtkm::Id index = this->FindCoordinateSystemIndex(name, found);
-    if (found)
-    {
-      return index;
-    }
-    else
-    {
-      throw vtkm::cont::ErrorBadValue("No coordinate system with requested name");
-    }
-  }
+  vtkm::Id GetCoordinateSystemIndex(const std::string& name) const;
 
   VTKM_CONT
   const vtkm::cont::CoordinateSystem& GetCoordinateSystem(const std::string& name) const
@@ -143,7 +108,7 @@ public:
   }
 
   VTKM_CONT
-  void AddCellSet(vtkm::cont::DynamicCellSet cellSet) { this->CellSets.push_back(cellSet); }
+  void AddCellSet(const vtkm::cont::DynamicCellSet& cellSet) { this->CellSets.push_back(cellSet); }
 
   template <typename CellSetType>
   VTKM_CONT void AddCellSet(const CellSetType& cellSet)
@@ -168,19 +133,7 @@ public:
   }
 
   VTKM_CONT
-  vtkm::Id GetCellSetIndex(const std::string& name) const
-  {
-    bool found;
-    vtkm::Id index = this->FindCellSetIndex(name, found);
-    if (found)
-    {
-      return index;
-    }
-    else
-    {
-      throw vtkm::cont::ErrorBadValue("No cell set with requested name");
-    }
-  }
+  vtkm::Id GetCellSetIndex(const std::string& name) const;
 
   VTKM_CONT
   vtkm::cont::DynamicCellSet GetCellSet(const std::string& name) const
@@ -207,27 +160,7 @@ public:
   }
 
   VTKM_CONT
-  void PrintSummary(std::ostream& out) const
-  {
-    out << "DataSet:\n";
-    out << "  CoordSystems[" << this->CoordSystems.size() << "]\n";
-    for (std::size_t index = 0; index < this->CoordSystems.size(); index++)
-    {
-      this->CoordSystems[index].PrintSummary(out);
-    }
-
-    out << "  CellSets[" << this->GetNumberOfCellSets() << "]\n";
-    for (vtkm::Id index = 0; index < this->GetNumberOfCellSets(); index++)
-    {
-      this->GetCellSet(index).PrintSummary(out);
-    }
-
-    out << "  Fields[" << this->GetNumberOfFields() << "]\n";
-    for (vtkm::Id index = 0; index < this->GetNumberOfFields(); index++)
-    {
-      this->GetField(index).PrintSummary(out);
-    }
-  }
+  void PrintSummary(std::ostream& out) const;
 
 private:
   std::vector<vtkm::cont::CoordinateSystem> CoordSystems;
@@ -237,51 +170,13 @@ private:
   VTKM_CONT
   vtkm::Id FindFieldIndex(const std::string& name,
                           vtkm::cont::Field::AssociationEnum association,
-                          bool& found) const
-  {
-    for (std::size_t index = 0; index < this->Fields.size(); ++index)
-    {
-      if ((association == vtkm::cont::Field::ASSOC_ANY ||
-           association == this->Fields[index].GetAssociation()) &&
-          this->Fields[index].GetName() == name)
-      {
-        found = true;
-        return static_cast<vtkm::Id>(index);
-      }
-    }
-    found = false;
-    return -1;
-  }
+                          bool& found) const;
 
   VTKM_CONT
-  vtkm::Id FindCoordinateSystemIndex(const std::string& name, bool& found) const
-  {
-    for (std::size_t index = 0; index < this->CoordSystems.size(); ++index)
-    {
-      if (this->CoordSystems[index].GetName() == name)
-      {
-        found = true;
-        return static_cast<vtkm::Id>(index);
-      }
-    }
-    found = false;
-    return -1;
-  }
+  vtkm::Id FindCoordinateSystemIndex(const std::string& name, bool& found) const;
 
   VTKM_CONT
-  vtkm::Id FindCellSetIndex(const std::string& name, bool& found) const
-  {
-    for (std::size_t index = 0; index < static_cast<size_t>(this->GetNumberOfCellSets()); ++index)
-    {
-      if (this->CellSets[index].GetName() == name)
-      {
-        found = true;
-        return static_cast<vtkm::Id>(index);
-      }
-    }
-    found = false;
-    return -1;
-  }
+  vtkm::Id FindCellSetIndex(const std::string& name, bool& found) const;
 };
 
 } // namespace cont

From 2537a1cf56ad53c1d5a6a7a02353696ff3a86c59 Mon Sep 17 00:00:00 2001
From: Dave Pugmire <dpugmire@gmail.com>
Date: Fri, 22 Dec 2017 15:20:20 -0500
Subject: [PATCH 02/24] Worklets for cross and dot product.

---
 vtkm/worklet/CMakeLists.txt                   |   2 +
 vtkm/worklet/CrossProduct.h                   |  47 +++++++
 vtkm/worklet/DotProduct.h                     |  47 +++++++
 vtkm/worklet/testing/CMakeLists.txt           |   2 +
 vtkm/worklet/testing/UnitTestCrossProduct.cxx | 130 ++++++++++++++++++
 vtkm/worklet/testing/UnitTestDotProduct.cxx   | 115 ++++++++++++++++
 6 files changed, 343 insertions(+)
 create mode 100644 vtkm/worklet/CrossProduct.h
 create mode 100644 vtkm/worklet/DotProduct.h
 create mode 100644 vtkm/worklet/testing/UnitTestCrossProduct.cxx
 create mode 100644 vtkm/worklet/testing/UnitTestDotProduct.cxx

diff --git a/vtkm/worklet/CMakeLists.txt b/vtkm/worklet/CMakeLists.txt
index 4a70203bf..e442d55b3 100644
--- a/vtkm/worklet/CMakeLists.txt
+++ b/vtkm/worklet/CMakeLists.txt
@@ -25,11 +25,13 @@ set(headers
   Clip.h
   ContourTreeUniform.h
   CosmoTools.h
+  CrossProduct.h
   DispatcherMapField.h
   DispatcherMapTopology.h
   DispatcherPointNeighborhood.h
   DispatcherReduceByKey.h
   DispatcherStreamingMapField.h
+  DotProduct.h
   ExternalFaces.h
   ExtractGeometry.h
   ExtractPoints.h
diff --git a/vtkm/worklet/CrossProduct.h b/vtkm/worklet/CrossProduct.h
new file mode 100644
index 000000000..4f1c80d7d
--- /dev/null
+++ b/vtkm/worklet/CrossProduct.h
@@ -0,0 +1,47 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_worklet_CrossProduct_h
+#define vtk_m_worklet_CrossProduct_h
+
+#include <vtkm/worklet/WorkletMapField.h>
+
+#include <vtkm/VectorAnalysis.h>
+
+namespace vtkm
+{
+namespace worklet
+{
+
+class CrossProduct : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<VecAll>, FieldIn<VecAll>, FieldOut<VecAll>);
+  typedef void ExecutionSignature(_1, _2, _3);
+
+  template <typename T, typename T2>
+  VTKM_EXEC void operator()(const T& vec1, const T& vec2, T2& outVec) const
+  {
+    outVec = vtkm::Cross(vec1, vec2);
+  }
+};
+}
+} // namespace vtkm::worklet
+
+#endif // vtk_m_worklet_CrossProduct_h
diff --git a/vtkm/worklet/DotProduct.h b/vtkm/worklet/DotProduct.h
new file mode 100644
index 000000000..54c2c074f
--- /dev/null
+++ b/vtkm/worklet/DotProduct.h
@@ -0,0 +1,47 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_worklet_DotProduct_h
+#define vtk_m_worklet_DotProduct_h
+
+#include <vtkm/worklet/WorkletMapField.h>
+
+#include <vtkm/VectorAnalysis.h>
+
+namespace vtkm
+{
+namespace worklet
+{
+
+class DotProduct : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<VecAll>, FieldIn<VecAll>, FieldOut<Scalar>);
+  typedef void ExecutionSignature(_1, _2, _3);
+
+  template <typename T, typename T2>
+  VTKM_EXEC void operator()(const T& v1, const T& v2, T2& outValue) const
+  {
+    outValue = vtkm::dot(v1, v2);
+  }
+};
+}
+} // namespace vtkm::worklet
+
+#endif // vtk_m_worklet_DotProduct_h
diff --git a/vtkm/worklet/testing/CMakeLists.txt b/vtkm/worklet/testing/CMakeLists.txt
index e92865442..70ce7dc59 100644
--- a/vtkm/worklet/testing/CMakeLists.txt
+++ b/vtkm/worklet/testing/CMakeLists.txt
@@ -26,6 +26,8 @@ set(unit_tests
   UnitTestClipping.cxx
   UnitTestContourTreeUniform.cxx
   UnitTestCosmoTools.cxx
+  UnitTestCrossProduct.cxx
+  UnitTestDotProduct.cxx
   UnitTestExternalFaces.cxx
   UnitTestExtractGeometry.cxx
   UnitTestExtractPoints.cxx
diff --git a/vtkm/worklet/testing/UnitTestCrossProduct.cxx b/vtkm/worklet/testing/UnitTestCrossProduct.cxx
new file mode 100644
index 000000000..694d1b7e0
--- /dev/null
+++ b/vtkm/worklet/testing/UnitTestCrossProduct.cxx
@@ -0,0 +1,130 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#include <vtkm/worklet/CrossProduct.h>
+#include <vtkm/worklet/DispatcherMapField.h>
+
+#include <random>
+#include <vtkm/cont/testing/Testing.h>
+
+namespace
+{
+std::mt19937 randGenerator;
+
+template <typename T>
+void createVectors(std::vector<vtkm::Vec<T, 3>>& vecs1, std::vector<vtkm::Vec<T, 3>>& vecs2)
+{
+  // First, test the standard directions.
+  // X x Y
+  vecs1.push_back(vtkm::make_Vec(1, 0, 0));
+  vecs2.push_back(vtkm::make_Vec(0, 1, 0));
+
+  // Y x Z
+  vecs1.push_back(vtkm::make_Vec(0, 1, 0));
+  vecs2.push_back(vtkm::make_Vec(0, 0, 1));
+
+  // Z x X
+  vecs1.push_back(vtkm::make_Vec(0, 0, 1));
+  vecs2.push_back(vtkm::make_Vec(1, 0, 0));
+
+  // Y x X
+  vecs1.push_back(vtkm::make_Vec(0, 1, 0));
+  vecs2.push_back(vtkm::make_Vec(1, 0, 0));
+
+  // Z x Y
+  vecs1.push_back(vtkm::make_Vec(0, 0, 1));
+  vecs2.push_back(vtkm::make_Vec(0, 1, 0));
+
+  // X x Z
+  vecs1.push_back(vtkm::make_Vec(1, 0, 0));
+  vecs2.push_back(vtkm::make_Vec(0, 0, 1));
+
+  //Test some other vector combinations
+  std::uniform_real_distribution<vtkm::Float64> randomDist(-10.0, 10.0);
+  randomDist(randGenerator);
+
+  for (int i = 0; i < 100; i++)
+  {
+    vecs1.push_back(vtkm::make_Vec(
+      randomDist(randGenerator), randomDist(randGenerator), randomDist(randGenerator)));
+    vecs2.push_back(vtkm::make_Vec(
+      randomDist(randGenerator), randomDist(randGenerator), randomDist(randGenerator)));
+  }
+}
+
+template <typename T>
+void TestCrossProduct()
+{
+  std::vector<vtkm::Vec<T, 3>> inputVecs1, inputVecs2;
+  createVectors(inputVecs1, inputVecs2);
+
+  vtkm::cont::ArrayHandle<vtkm::Vec<T, 3>> inputArray1, inputArray2;
+  vtkm::cont::ArrayHandle<vtkm::Vec<T, 3>> outputArray;
+  inputArray1 = vtkm::cont::make_ArrayHandle(inputVecs1);
+  inputArray2 = vtkm::cont::make_ArrayHandle(inputVecs2);
+
+  vtkm::worklet::CrossProduct crossProductWorklet;
+  vtkm::worklet::DispatcherMapField<vtkm::worklet::CrossProduct> dispatcherCrossProduct(
+    crossProductWorklet);
+  dispatcherCrossProduct.Invoke(inputArray1, inputArray2, outputArray);
+
+  VTKM_TEST_ASSERT(outputArray.GetNumberOfValues() == inputArray1.GetNumberOfValues(),
+                   "Wrong number of results for CrossProduct worklet");
+
+  //Test the canonical cases (the six axis pairs pushed first by createVectors).
+  VTKM_TEST_ASSERT(
+    test_equal(outputArray.GetPortalConstControl().Get(0), vtkm::make_Vec(0, 0, 1)) &&
+      test_equal(outputArray.GetPortalConstControl().Get(1), vtkm::make_Vec(1, 0, 0)) &&
+      test_equal(outputArray.GetPortalConstControl().Get(2), vtkm::make_Vec(0, 1, 0)) &&
+      test_equal(outputArray.GetPortalConstControl().Get(3), vtkm::make_Vec(0, 0, -1)) &&
+      test_equal(outputArray.GetPortalConstControl().Get(4), vtkm::make_Vec(-1, 0, 0)) &&
+      test_equal(outputArray.GetPortalConstControl().Get(5), vtkm::make_Vec(0, -1, 0)),
+    "Wrong result for CrossProduct worklet");
+
+  for (vtkm::Id i = 0; i < inputArray1.GetNumberOfValues(); i++)
+  {
+    vtkm::Vec<T, 3> v1 = inputArray1.GetPortalConstControl().Get(i);
+    vtkm::Vec<T, 3> v2 = inputArray2.GetPortalConstControl().Get(i);
+    vtkm::Vec<T, 3> res = outputArray.GetPortalConstControl().Get(i);
+
+    //Make sure result is orthogonal to each input vector. Need to normalize to compare with zero.
+    vtkm::Vec<T, 3> v1N(vtkm::Normal(v1)), v2N(vtkm::Normal(v2)), resN(vtkm::Normal(res));
+    VTKM_TEST_ASSERT(test_equal(vtkm::dot(resN, v1N), T(0.0)), "Wrong result for cross product");
+    VTKM_TEST_ASSERT(test_equal(vtkm::dot(resN, v2N), T(0.0)), "Wrong result for cross product");
+
+    T sinAngle = vtkm::Magnitude(res) * vtkm::RMagnitude(v1) * vtkm::RMagnitude(v2);
+    T cosAngle = vtkm::dot(v1, v2) * vtkm::RMagnitude(v1) * vtkm::RMagnitude(v2);
+    VTKM_TEST_ASSERT(test_equal(sinAngle * sinAngle + cosAngle * cosAngle, T(1.0)),
+                     "Bad cross product length.");
+  }
+}
+
+void TestCrossProductWorklets()
+{
+  std::cout << "Testing CrossProduct Worklet" << std::endl;
+  TestCrossProduct<vtkm::Float32>();
+  TestCrossProduct<vtkm::Float64>();
+}
+}
+
+int UnitTestCrossProduct(int, char* [])
+{
+  return vtkm::cont::testing::Testing::Run(TestCrossProductWorklets);
+}
diff --git a/vtkm/worklet/testing/UnitTestDotProduct.cxx b/vtkm/worklet/testing/UnitTestDotProduct.cxx
new file mode 100644
index 000000000..04f739d70
--- /dev/null
+++ b/vtkm/worklet/testing/UnitTestDotProduct.cxx
@@ -0,0 +1,115 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#include <vtkm/worklet/DispatcherMapField.h>
+#include <vtkm/worklet/DotProduct.h>
+
+#include <vtkm/cont/testing/Testing.h>
+
+namespace
+{
+
+template <typename T>
+T normalizedVector(T v)
+{
+  T vN = vtkm::Normal(v);
+  return vN;
+}
+
+template <typename T>
+void createVectors(std::vector<vtkm::Vec<T, 3>>& vecs1,
+                   std::vector<vtkm::Vec<T, 3>>& vecs2,
+                   std::vector<T>& result)
+{
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(1), T(0), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(1), T(0), T(0))));
+  result.push_back(1);
+
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(1), T(0), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(-1), T(0), T(0))));
+  result.push_back(-1);
+
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(1), T(0), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(0), T(1), T(0))));
+  result.push_back(0);
+
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(1), T(0), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(0), T(-1), T(0))));
+  result.push_back(0);
+
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(1), T(0), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(1), T(1), T(0))));
+  result.push_back(T(1.0 / vtkm::Sqrt(2.0)));
+
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(1), T(1), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(1), T(0), T(0))));
+  result.push_back(T(1.0 / vtkm::Sqrt(2.0)));
+
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(-1), T(0), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(1), T(1), T(0))));
+  result.push_back(-T(1.0 / vtkm::Sqrt(2.0)));
+
+  vecs1.push_back(normalizedVector(vtkm::make_Vec(T(0), T(1), T(0))));
+  vecs2.push_back(normalizedVector(vtkm::make_Vec(T(1), T(1), T(0))));
+  result.push_back(T(1.0 / vtkm::Sqrt(2.0)));
+}
+
+template <typename T>
+void TestDotProduct()
+{
+  std::vector<vtkm::Vec<T, 3>> inputVecs1, inputVecs2;
+  std::vector<T> answer;
+  createVectors(inputVecs1, inputVecs2, answer);
+
+  vtkm::cont::ArrayHandle<vtkm::Vec<T, 3>> inputArray1, inputArray2;
+  vtkm::cont::ArrayHandle<T> outputArray;
+  inputArray1 = vtkm::cont::make_ArrayHandle(inputVecs1);
+  inputArray2 = vtkm::cont::make_ArrayHandle(inputVecs2);
+
+  vtkm::worklet::DotProduct dotProductWorklet;
+  vtkm::worklet::DispatcherMapField<vtkm::worklet::DotProduct> dispatcherDotProduct(
+    dotProductWorklet);
+  dispatcherDotProduct.Invoke(inputArray1, inputArray2, outputArray);
+
+  VTKM_TEST_ASSERT(outputArray.GetNumberOfValues() == inputArray1.GetNumberOfValues(),
+                   "Wrong number of results for DotProduct worklet");
+
+  for (vtkm::Id i = 0; i < inputArray1.GetNumberOfValues(); i++)
+  {
+    // Compare the worklet's output, not a host-side recomputation, against the expected value.
+    T res = outputArray.GetPortalConstControl().Get(i);
+    T ans = answer[static_cast<std::size_t>(i)];
+
+    VTKM_TEST_ASSERT(test_equal(ans, res), "Wrong result for dot product");
+  }
+}
+
+void TestDotProductWorklets()
+{
+  std::cout << "Testing DotProduct Worklet" << std::endl;
+  TestDotProduct<vtkm::Float32>();
+  //  TestDotProduct<vtkm::Float64>();
+}
+}
+
+int UnitTestDotProduct(int, char* [])
+{
+  return vtkm::cont::testing::Testing::Run(TestDotProductWorklets);
+}

From 053dccac245ec40b939dc934e27adaa7b4e6e49d Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Thu, 28 Dec 2017 16:44:48 -0500
Subject: [PATCH 03/24] Properly propagate down the VTKM_NO_ASSERT flag to
 configure file.

---
 vtkm/internal/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vtkm/internal/CMakeLists.txt b/vtkm/internal/CMakeLists.txt
index 99ed7c6b7..2b47aed81 100755
--- a/vtkm/internal/CMakeLists.txt
+++ b/vtkm/internal/CMakeLists.txt
@@ -21,8 +21,9 @@
 #-----------------------------------------------------------------------------
 # Build the configure file.
 # need to set numerous VTKm cmake properties to the naming convention
-# that we exepect for our C++ defines.
+# that we expect for our C++ defines.
 
+set(VTKM_NO_ASSERT ${VTKm_NO_ASSERT})
 set(VTKM_USE_DOUBLE_PRECISION ${VTKm_USE_DOUBLE_PRECISION})
 set(VTKM_USE_64BIT_IDS ${VTKm_USE_64BIT_IDS})
 

From 18ece6dce9c8e5c2522f7c3b3e05832ef1a18881 Mon Sep 17 00:00:00 2001
From: Dave Pugmire <dpugmire@gmail.com>
Date: Fri, 29 Dec 2017 06:48:39 -0500
Subject: [PATCH 04/24] Better type signatures.

---
 vtkm/worklet/CrossProduct.h | 6 ++++--
 vtkm/worklet/DotProduct.h   | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/vtkm/worklet/CrossProduct.h b/vtkm/worklet/CrossProduct.h
index 4f1c80d7d..e0406b3e8 100644
--- a/vtkm/worklet/CrossProduct.h
+++ b/vtkm/worklet/CrossProduct.h
@@ -35,8 +35,10 @@ public:
   typedef void ControlSignature(FieldIn<VecAll>, FieldIn<VecAll>, FieldOut<VecAll>);
   typedef void ExecutionSignature(_1, _2, _3);
 
-  template <typename T, typename T2>
-  VTKM_EXEC void operator()(const T& vec1, const T& vec2, T2& outVec) const
+  template <typename T>
+  VTKM_EXEC void operator()(const vtkm::Vec<T, 3>& vec1,
+                            const vtkm::Vec<T, 3>& vec2,
+                            vtkm::Vec<T, 3>& outVec) const
   {
     outVec = vtkm::Cross(vec1, vec2);
   }
diff --git a/vtkm/worklet/DotProduct.h b/vtkm/worklet/DotProduct.h
index 54c2c074f..829ea80b0 100644
--- a/vtkm/worklet/DotProduct.h
+++ b/vtkm/worklet/DotProduct.h
@@ -35,8 +35,10 @@ public:
   typedef void ControlSignature(FieldIn<VecAll>, FieldIn<VecAll>, FieldOut<Scalar>);
   typedef void ExecutionSignature(_1, _2, _3);
 
-  template <typename T, typename T2>
-  VTKM_EXEC void operator()(const T& v1, const T& v2, T2& outValue) const
+  template <typename T, vtkm::IdComponent Size>
+  VTKM_EXEC void operator()(const vtkm::Vec<T, Size>& v1,
+                            const vtkm::Vec<T, Size>& v2,
+                            T& outValue) const
   {
     outValue = vtkm::dot(v1, v2);
   }

From 3e10b504e62ac13ade8c178e9159a34652ec660d Mon Sep 17 00:00:00 2001
From: Sujin Philip <sujin.philip@kitware.com>
Date: Fri, 22 Dec 2017 13:25:35 -0500
Subject: [PATCH 05/24] Replace ExecutionWholeArray with WholeArray

---
 vtkm/exec/ExecutionWholeArray.h               | 14 ++---
 vtkm/rendering/CanvasRayTracer.cxx            | 42 ++++++-------
 vtkm/rendering/Triangulator.h                 | 19 +++---
 vtkm/rendering/Wireframer.h                   | 20 +++---
 .../raytracing/BoundingVolumeHierarchy.cxx    | 37 +++++------
 .../raytracing/MeshConnectivityBuilder.h      | 41 ++++++-------
 vtkm/worklet/KernelSplatter.h                 | 16 ++---
 vtkm/worklet/StreamLineUniformGrid.h          | 28 ++++-----
 .../UnitTestWorkletMapFieldExecArg.cxx        | 61 +++++++++----------
 9 files changed, 127 insertions(+), 151 deletions(-)

diff --git a/vtkm/exec/ExecutionWholeArray.h b/vtkm/exec/ExecutionWholeArray.h
index 099bb0470..7b22c802c 100644
--- a/vtkm/exec/ExecutionWholeArray.h
+++ b/vtkm/exec/ExecutionWholeArray.h
@@ -29,14 +29,16 @@ namespace vtkm
 namespace exec
 {
 
+/// The following classes have been deprecated and are meant to be used
+/// internally only. Please use the \c WholeArrayIn, \c WholeArrayOut, and
+/// \c WholeArrayInOut \c ControlSignature tags instead.
+
 /// \c ExecutionWholeArray is an execution object that allows an array handle
 /// content to be a parameter in an execution environment
 /// function. This can be used to allow worklets to have a shared search
-/// structure
+/// structure.
 ///
-template <typename T,
-          typename StorageTag = VTKM_DEFAULT_STORAGE_TAG,
-          typename DeviceAdapterTag = VTKM_DEFAULT_DEVICE_ADAPTER_TAG>
+template <typename T, typename StorageTag, typename DeviceAdapterTag>
 class ExecutionWholeArray : public vtkm::exec::ExecutionObjectBase
 {
 public:
@@ -86,9 +88,7 @@ private:
 /// function. This can be used to allow worklets to have a shared search
 /// structure
 ///
-template <typename T,
-          typename StorageTag = VTKM_DEFAULT_STORAGE_TAG,
-          typename DeviceAdapterTag = VTKM_DEFAULT_DEVICE_ADAPTER_TAG>
+template <typename T, typename StorageTag, typename DeviceAdapterTag>
 class ExecutionWholeArrayConst : public vtkm::exec::ExecutionObjectBase
 {
 public:
diff --git a/vtkm/rendering/CanvasRayTracer.cxx b/vtkm/rendering/CanvasRayTracer.cxx
index e8763b237..dc1640cc3 100644
--- a/vtkm/rendering/CanvasRayTracer.cxx
+++ b/vtkm/rendering/CanvasRayTracer.cxx
@@ -21,7 +21,6 @@
 #include <vtkm/rendering/CanvasRayTracer.h>
 
 #include <vtkm/cont/TryExecute.h>
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/rendering/Canvas.h>
 #include <vtkm/rendering/Color.h>
 #include <vtkm/rendering/raytracing/Ray.h>
@@ -51,19 +50,21 @@ public:
                                 FieldIn<>,
                                 FieldIn<>,
                                 FieldIn<>,
-                                ExecObject,
-                                ExecObject);
+                                WholeArrayOut<vtkm::ListTagBase<vtkm::Float32>>,
+                                WholeArrayOut<vtkm::ListTagBase<vtkm::Vec<vtkm::Float32, 4>>>);
   typedef void ExecutionSignature(_1, _2, _3, _4, _5, _6, _7, WorkIndex);
-  template <typename Precision, typename ColorPortalType>
-  VTKM_EXEC void operator()(
-    const vtkm::Id& pixelIndex,
-    ColorPortalType& colorBufferIn,
-    const Precision& inDepth,
-    const vtkm::Vec<Precision, 3>& origin,
-    const vtkm::Vec<Precision, 3>& dir,
-    vtkm::exec::ExecutionWholeArray<vtkm::Float32>& depthBuffer,
-    vtkm::exec::ExecutionWholeArray<vtkm::Vec<vtkm::Float32, 4>>& colorBuffer,
-    const vtkm::Id& index) const
+  template <typename Precision,
+            typename ColorPortalType,
+            typename DepthBufferPortalType,
+            typename ColorBufferPortalType>
+  VTKM_EXEC void operator()(const vtkm::Id& pixelIndex,
+                            ColorPortalType& colorBufferIn,
+                            const Precision& inDepth,
+                            const vtkm::Vec<Precision, 3>& origin,
+                            const vtkm::Vec<Precision, 3>& dir,
+                            DepthBufferPortalType& depthBuffer,
+                            ColorBufferPortalType& colorBuffer,
+                            const vtkm::Id& index) const
   {
     vtkm::Vec<Precision, 3> intersection = origin + inDepth * dir;
     vtkm::Vec<vtkm::Float32, 4> point;
@@ -140,14 +141,13 @@ public:
   {
     VTKM_IS_DEVICE_ADAPTER_TAG(Device);
     vtkm::worklet::DispatcherMapField<SurfaceConverter, Device>(SurfaceConverter(ViewProjMat))
-      .Invoke(
-        Rays.PixelIdx,
-        Colors,
-        Rays.Distance,
-        Rays.Origin,
-        Rays.Dir,
-        vtkm::exec::ExecutionWholeArray<vtkm::Float32>(Canvas->GetDepthBuffer()),
-        vtkm::exec::ExecutionWholeArray<vtkm::Vec<vtkm::Float32, 4>>(Canvas->GetColorBuffer()));
+      .Invoke(Rays.PixelIdx,
+              Colors,
+              Rays.Distance,
+              Rays.Origin,
+              Rays.Dir,
+              Canvas->GetDepthBuffer(),
+              Canvas->GetColorBuffer());
     return true;
   }
 };
diff --git a/vtkm/rendering/Triangulator.h b/vtkm/rendering/Triangulator.h
index e181c8f44..cb3fa7376 100644
--- a/vtkm/rendering/Triangulator.h
+++ b/vtkm/rendering/Triangulator.h
@@ -24,7 +24,6 @@
 #include <vtkm/cont/ArrayHandleCounting.h>
 #include <vtkm/cont/CellSetPermutation.h>
 #include <vtkm/cont/DataSet.h>
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/rendering/raytracing/MeshConnectivityBuilder.h>
 #include <vtkm/worklet/DispatcherMapField.h>
 #include <vtkm/worklet/DispatcherMapTopology.h>
@@ -276,17 +275,21 @@ public:
   public:
     VTKM_CONT
     UniqueTriangles() {}
-    typedef void ControlSignature(ExecObject, ExecObject);
+
+    typedef void ControlSignature(WholeArrayIn<vtkm::ListTagBase<vtkm::Vec<vtkm::Id, 4>>>,
+                                  WholeArrayOut<vtkm::ListTagBase<vtkm::UInt8>>);
     typedef void ExecutionSignature(_1, _2, WorkIndex);
+
     VTKM_EXEC
     bool IsTwin(const vtkm::Vec<vtkm::Id, 4>& a, const vtkm::Vec<vtkm::Id, 4>& b) const
     {
       return (a[1] == b[1] && a[2] == b[2] && a[3] == b[3]);
     }
-    VTKM_EXEC
-    void operator()(vtkm::exec::ExecutionWholeArrayConst<vtkm::Vec<vtkm::Id, 4>>& indices,
-                    vtkm::exec::ExecutionWholeArray<vtkm::UInt8>& outputFlags,
-                    const vtkm::Id& index) const
+
+    template <typename IndicesPortalType, typename OutputFlagsPortalType>
+    VTKM_EXEC void operator()(const IndicesPortalType& indices,
+                              OutputFlagsPortalType& outputFlags,
+                              const vtkm::Id& index) const
     {
       if (index == 0)
         return;
@@ -612,9 +615,7 @@ public:
     flags.Allocate(outputTriangles);
     vtkm::worklet::DispatcherMapField<MemSet<vtkm::UInt8>>(MemSet<vtkm::UInt8>(1)).Invoke(flags);
     //Unique triangles will have a flag = 1
-    vtkm::worklet::DispatcherMapField<UniqueTriangles>().Invoke(
-      vtkm::exec::ExecutionWholeArrayConst<vtkm::Vec<vtkm::Id, 4>>(outputIndices),
-      vtkm::exec::ExecutionWholeArray<vtkm::UInt8>(flags));
+    vtkm::worklet::DispatcherMapField<UniqueTriangles>().Invoke(outputIndices, flags);
 
     vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Id, 4>> subset;
     vtkm::cont::DeviceAdapterAlgorithm<Device>::CopyIf(outputIndices, flags, subset);
diff --git a/vtkm/rendering/Wireframer.h b/vtkm/rendering/Wireframer.h
index 6eae48dd8..4dff3be70 100644
--- a/vtkm/rendering/Wireframer.h
+++ b/vtkm/rendering/Wireframer.h
@@ -157,7 +157,7 @@ class EdgePlotter : public vtkm::worklet::WorkletMapField
 public:
   using AtomicPackedFrameBufferHandle = vtkm::exec::AtomicArray<vtkm::Int64, DeviceTag>;
 
-  typedef void ControlSignature(FieldIn<>, WholeArrayIn<>, WholeArrayIn<Scalar>);
+  typedef void ControlSignature(FieldIn<Id2Type>, WholeArrayIn<Vec3>, WholeArrayIn<Scalar>);
   typedef void ExecutionSignature(_1, _2, _3);
   using InputDomain = _1;
 
@@ -393,14 +393,16 @@ public:
   VTKM_CONT
   BufferConverter() {}
 
-  typedef void ControlSignature(FieldIn<>, ExecObject, ExecObject);
+  typedef void ControlSignature(FieldIn<>,
+                                WholeArrayOut<vtkm::ListTagBase<vtkm::Float32>>,
+                                WholeArrayOut<vtkm::ListTagBase<vtkm::Vec<vtkm::Float32, 4>>>);
   typedef void ExecutionSignature(_1, _2, _3, WorkIndex);
 
-  VTKM_EXEC
-  void operator()(const vtkm::Int64& packedValue,
-                  vtkm::exec::ExecutionWholeArray<vtkm::Float32>& depthBuffer,
-                  vtkm::exec::ExecutionWholeArray<vtkm::Vec<vtkm::Float32, 4>>& colorBuffer,
-                  const vtkm::Id& index) const
+  template <typename DepthBufferPortalType, typename ColorBufferPortalType>
+  VTKM_EXEC void operator()(const vtkm::Int64& packedValue,
+                            DepthBufferPortalType& depthBuffer,
+                            ColorBufferPortalType& colorBuffer,
+                            const vtkm::Id& index) const
   {
     PackedValue packed;
     packed.Raw = packedValue;
@@ -551,9 +553,7 @@ private:
 
     BufferConverter converter;
     vtkm::worklet::DispatcherMapField<BufferConverter, DeviceTag>(converter).Invoke(
-      FrameBuffer,
-      vtkm::exec::ExecutionWholeArray<vtkm::Float32>(Canvas->GetDepthBuffer()),
-      vtkm::exec::ExecutionWholeArray<vtkm::Vec<vtkm::Float32, 4>>(Canvas->GetColorBuffer()));
+      FrameBuffer, Canvas->GetDepthBuffer(), Canvas->GetColorBuffer());
   }
 
   VTKM_CONT
diff --git a/vtkm/rendering/raytracing/BoundingVolumeHierarchy.cxx b/vtkm/rendering/raytracing/BoundingVolumeHierarchy.cxx
index f710d4cfe..1e86ffc36 100644
--- a/vtkm/rendering/raytracing/BoundingVolumeHierarchy.cxx
+++ b/vtkm/rendering/raytracing/BoundingVolumeHierarchy.cxx
@@ -316,22 +316,22 @@ public:
   {
     this->FlatBVH = flatBVH.PrepareForOutput((LeafCount - 1) * 4, Device());
   }
-  typedef void ControlSignature(ExecObject,
-                                ExecObject,
-                                ExecObject,
-                                ExecObject,
-                                ExecObject,
-                                ExecObject);
+  typedef void ControlSignature(WholeArrayIn<Scalar>,
+                                WholeArrayIn<Scalar>,
+                                WholeArrayIn<Scalar>,
+                                WholeArrayIn<Scalar>,
+                                WholeArrayIn<Scalar>,
+                                WholeArrayIn<Scalar>);
   typedef void ExecutionSignature(WorkIndex, _1, _2, _3, _4, _5, _6);
-  template <typename StrorageType>
-  VTKM_EXEC_CONT void operator()(
-    const vtkm::Id workIndex,
-    const vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32, StrorageType>& xmin,
-    const vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32, StrorageType>& ymin,
-    const vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32, StrorageType>& zmin,
-    const vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32, StrorageType>& xmax,
-    const vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32, StrorageType>& ymax,
-    const vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32, StrorageType>& zmax) const
+
+  template <typename InputPortalType>
+  VTKM_EXEC_CONT void operator()(const vtkm::Id workIndex,
+                                 const InputPortalType& xmin,
+                                 const InputPortalType& ymin,
+                                 const InputPortalType& zmin,
+                                 const InputPortalType& xmax,
+                                 const InputPortalType& ymax,
+                                 const InputPortalType& zmax) const
   {
     //move up into the inner nodes
     vtkm::Id currentNode = LeafCount - 1 + workIndex;
@@ -780,12 +780,7 @@ VTKM_CONT void LinearBVHBuilder::RunOnDevice(LinearBVH& linearBVH, Device device
   vtkm::worklet::DispatcherMapField<PropagateAABBs<Device>, Device>(
     PropagateAABBs<Device>(
       bvh.parent, bvh.leftChild, bvh.rightChild, primitiveCount, linearBVH.FlatBVH, atomicCounters))
-    .Invoke(vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32>(*bvh.xmins),
-            vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32>(*bvh.ymins),
-            vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32>(*bvh.zmins),
-            vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32>(*bvh.xmaxs),
-            vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32>(*bvh.ymaxs),
-            vtkm::exec::ExecutionWholeArrayConst<vtkm::Float32>(*bvh.zmaxs));
+    .Invoke(*bvh.xmins, *bvh.ymins, *bvh.zmins, *bvh.xmaxs, *bvh.ymaxs, *bvh.zmaxs);
 
   time = timer.GetElapsedTime();
   logger->AddLogData("propagate_aabbs", time);
diff --git a/vtkm/rendering/raytracing/MeshConnectivityBuilder.h b/vtkm/rendering/raytracing/MeshConnectivityBuilder.h
index 9560be688..b63885002 100644
--- a/vtkm/rendering/raytracing/MeshConnectivityBuilder.h
+++ b/vtkm/rendering/raytracing/MeshConnectivityBuilder.h
@@ -95,7 +95,7 @@ public:
   VTKM_CONT
   MortonNeighbor() {}
   typedef void ControlSignature(WholeArrayIn<>,
-                                ExecObject,
+                                WholeArrayInOut<Id3Type>,
                                 WholeArrayIn<>,
                                 WholeArrayIn<>,
                                 WholeArrayIn<>,
@@ -146,18 +146,18 @@ public:
 
 
   template <typename MortonPortalType,
+            typename FaceIdPairsPortalType,
             typename ConnPortalType,
             typename ShapePortalType,
             typename OffsetPortalType,
             typename ExternalFaceFlagType>
-  VTKM_EXEC inline void operator()(
-    const MortonPortalType& mortonCodes,
-    vtkm::exec::ExecutionWholeArray<vtkm::Vec<vtkm::Id, 3>>& faceIdPairs,
-    const vtkm::Id& index,
-    const ConnPortalType& connectivity,
-    const ShapePortalType& shapes,
-    const OffsetPortalType& offsets,
-    ExternalFaceFlagType& flags) const
+  VTKM_EXEC inline void operator()(const MortonPortalType& mortonCodes,
+                                   FaceIdPairsPortalType& faceIdPairs,
+                                   const vtkm::Id& index,
+                                   const ConnPortalType& connectivity,
+                                   const ShapePortalType& shapes,
+                                   const OffsetPortalType& offsets,
+                                   ExternalFaceFlagType& flags) const
   {
     if (index == 0)
     {
@@ -330,14 +330,16 @@ public:
 class WriteFaceConn : public vtkm::worklet::WorkletMapField
 {
 public:
+  typedef void ControlSignature(FieldIn<>, WholeArrayIn<>, WholeArrayOut<IdType>);
+  typedef void ExecutionSignature(_1, _2, _3);
+
   VTKM_CONT
   WriteFaceConn() {}
-  typedef void ControlSignature(FieldIn<>, WholeArrayIn<>, ExecObject);
-  typedef void ExecutionSignature(_1, _2, _3);
-  template <typename FaceOffsetsPortalType>
+
+  template <typename FaceOffsetsPortalType, typename FaceConnectivityPortalType>
   VTKM_EXEC inline void operator()(const vtkm::Vec<vtkm::Id, 3>& faceIdPair,
                                    const FaceOffsetsPortalType& faceOffsets,
-                                   vtkm::exec::ExecutionWholeArray<vtkm::Id>& faceConn) const
+                                   FaceConnectivityPortalType& faceConn) const
   {
     vtkm::Id cellId = faceIdPair[0];
     BOUNDS_CHECK(faceOffsets, cellId);
@@ -570,8 +572,7 @@ public:
 
     // scatter the coonectivity into the original order
     vtkm::worklet::DispatcherMapField<WriteFaceConn>(WriteFaceConn())
-      .Invoke(
-        cellFaceId, this->FaceOffsets, vtkm::exec::ExecutionWholeArray<vtkm::Id>(faceConnectivity));
+      .Invoke(cellFaceId, this->FaceOffsets, faceConnectivity);
 
 
     FaceConnectivity = faceConnectivity;
@@ -628,8 +629,7 @@ public:
 
     // scatter the coonectivity into the original order
     vtkm::worklet::DispatcherMapField<WriteFaceConn>(WriteFaceConn())
-      .Invoke(
-        cellFaceId, this->FaceOffsets, vtkm::exec::ExecutionWholeArray<vtkm::Id>(faceConnectivity));
+      .Invoke(cellFaceId, this->FaceOffsets, faceConnectivity);
 
     FaceConnectivity = faceConnectivity;
     OutsideTriangles = externalTriangles;
@@ -754,12 +754,7 @@ protected:
       .Invoke(faceConnectivity);
 
     vtkm::worklet::DispatcherMapField<MortonNeighbor, DeviceAdapter>(MortonNeighbor())
-      .Invoke(faceMortonCodes,
-              vtkm::exec::ExecutionWholeArray<vtkm::Vec<vtkm::Id, 3>>(cellFaceId),
-              conn,
-              shapes,
-              shapeOffsets,
-              faceConnectivity);
+      .Invoke(faceMortonCodes, cellFaceId, conn, shapes, shapeOffsets, faceConnectivity);
 
     vtkm::Float64 time = timer.GetElapsedTime();
     Logger::GetInstance()->AddLogData("gen_face_conn", time);
diff --git a/vtkm/worklet/KernelSplatter.h b/vtkm/worklet/KernelSplatter.h
index 2bfb2d98f..f8ab96e00 100644
--- a/vtkm/worklet/KernelSplatter.h
+++ b/vtkm/worklet/KernelSplatter.h
@@ -22,8 +22,6 @@
 
 #include <vtkm/Math.h>
 
-#include <vtkm/exec/ExecutionWholeArray.h>
-
 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/ArrayHandleCounting.h>
 #include <vtkm/cont/ArrayHandlePermutation.h>
@@ -363,16 +361,16 @@ struct KernelSplatterFilterUniformGrid
   class UpdateVoxelSplats : public vtkm::worklet::WorkletMapField
   {
   public:
-    typedef void ControlSignature(FieldIn<>, FieldIn<>, ExecObject);
+    typedef void ControlSignature(FieldIn<>, FieldIn<>, WholeArrayOut<Scalar>);
     typedef void ExecutionSignature(_1, _2, _3);
 
     VTKM_CONT
     UpdateVoxelSplats() {}
 
-    VTKM_EXEC_CONT
-    void operator()(const vtkm::Id& voxelIndex,
-                    const vtkm::Float64& splatValue,
-                    vtkm::exec::ExecutionWholeArray<vtkm::Float32>& execArg) const
+    template <typename ExecArgPortalType>
+    VTKM_EXEC_CONT void operator()(const vtkm::Id& voxelIndex,
+                                   const vtkm::Float64& splatValue,
+                                   ExecArgPortalType& execArg) const
     {
       execArg.Set(voxelIndex, static_cast<vtkm::Float32>(splatValue));
     }
@@ -594,9 +592,7 @@ struct KernelSplatterFilterUniformGrid
     vtkm::worklet::DispatcherMapField<UpdateVoxelSplats> scatterDispatcher;
 
     START_TIMER_BLOCK(UpdateVoxelSplats)
-    scatterDispatcher.Invoke(uniqueVoxelIds,
-                             voxelSplatSums,
-                             vtkm::exec::ExecutionWholeArray<vtkm::Float32>(scalarSplatOutput));
+    scatterDispatcher.Invoke(uniqueVoxelIds, voxelSplatSums, scalarSplatOutput);
     END_TIMER_BLOCK(UpdateVoxelSplats)
     debug::OutputArrayDebug(scalarSplatOutput, "scalarSplatOutput");
     //
diff --git a/vtkm/worklet/StreamLineUniformGrid.h b/vtkm/worklet/StreamLineUniformGrid.h
index ad8a7f4fc..17f75eff4 100644
--- a/vtkm/worklet/StreamLineUniformGrid.h
+++ b/vtkm/worklet/StreamLineUniformGrid.h
@@ -33,8 +33,6 @@
 #include <vtkm/worklet/ScatterUniform.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
-#include <vtkm/exec/ExecutionWholeArray.h>
-
 namespace vtkm
 {
 
@@ -166,9 +164,9 @@ public:
   public:
     typedef void ControlSignature(FieldIn<IdType> seedId,
                                   FieldIn<> position,
-                                  ExecObject numIndices,
-                                  ExecObject validPoint,
-                                  ExecObject streamLines);
+                                  WholeArrayOut<IdComponentType> numIndices,
+                                  WholeArrayOut<IdComponentType> validPoint,
+                                  WholeArrayOut<Vec3> streamLines);
     typedef void ExecutionSignature(_1, _2, _3, _4, _5, VisitIndex);
     typedef _1 InputDomain;
 
@@ -200,13 +198,13 @@ public:
     {
     }
 
-    VTKM_EXEC
-    void operator()(vtkm::Id& seedId,
-                    vtkm::Vec<FieldType, 3>& seedPos,
-                    vtkm::exec::ExecutionWholeArray<vtkm::IdComponent>& numIndices,
-                    vtkm::exec::ExecutionWholeArray<vtkm::IdComponent>& validPoint,
-                    vtkm::exec::ExecutionWholeArray<vtkm::Vec<FieldType, 3>>& slLists,
-                    vtkm::IdComponent visitIndex) const
+    template <typename IdComponentPortalType, typename FieldVec3PortalType>
+    VTKM_EXEC void operator()(vtkm::Id& seedId,
+                              vtkm::Vec<FieldType, 3>& seedPos,
+                              IdComponentPortalType& numIndices,
+                              IdComponentPortalType& validPoint,
+                              FieldVec3PortalType& slLists,
+                              vtkm::IdComponent visitIndex) const
     {
       // Set initial offset into the output streams array
       vtkm::Vec<FieldType, 3> pos = seedPos;
@@ -403,11 +401,7 @@ public:
     typedef typename vtkm::worklet::DispatcherMapField<MakeStreamLines> MakeStreamLinesDispatcher;
     MakeStreamLinesDispatcher makeStreamLinesDispatcher(makeStreamLines);
     makeStreamLinesDispatcher.Invoke(
-      seedIdArray,
-      seedPosArray,
-      vtkm::exec::ExecutionWholeArray<vtkm::IdComponent>(numIndices, numCells),
-      vtkm::exec::ExecutionWholeArray<vtkm::IdComponent>(validPoint, maxConnectivityLen),
-      vtkm::exec::ExecutionWholeArray<vtkm::Vec<FieldType, 3>>(streamArray, maxConnectivityLen));
+      seedIdArray, seedPosArray, numIndices, validPoint, streamArray);
 
     // Size of connectivity based on size of returned streamlines
     vtkm::cont::ArrayHandle<vtkm::IdComponent> numIndicesOut;
diff --git a/vtkm/worklet/testing/UnitTestWorkletMapFieldExecArg.cxx b/vtkm/worklet/testing/UnitTestWorkletMapFieldExecArg.cxx
index 61c2a8c74..3b35923c6 100644
--- a/vtkm/worklet/testing/UnitTestWorkletMapFieldExecArg.cxx
+++ b/vtkm/worklet/testing/UnitTestWorkletMapFieldExecArg.cxx
@@ -24,35 +24,34 @@
 #include <vtkm/worklet/DispatcherMapField.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
-#include <vtkm/exec/ExecutionWholeArray.h>
-
 #include <vtkm/cont/testing/Testing.h>
 
-class TestExecObjectWorklet : public vtkm::worklet::WorkletMapField
+struct TestExecObjectWorklet
 {
-public:
-  typedef void ControlSignature(FieldIn<>, ExecObject, ExecObject, FieldOut<>);
-  typedef void ExecutionSignature(_1, _2, _3, _4);
-
-  template <typename T, typename StorageTag>
-  VTKM_EXEC void operator()(const vtkm::Id& index,
-                            const vtkm::exec::ExecutionWholeArrayConst<T, StorageTag>& execIn,
-                            vtkm::exec::ExecutionWholeArray<T, StorageTag>& execOut,
-                            T& out) const
+  template <typename T>
+  class Worklet : public vtkm::worklet::WorkletMapField
   {
-    if (!test_equal(execIn.Get(index), TestValue(index, T()) + T(100)))
+  public:
+    typedef void ControlSignature(FieldIn<IdType>,
+                                  WholeArrayIn<vtkm::ListTagBase<T>>,
+                                  WholeArrayOut<vtkm::ListTagBase<T>>,
+                                  FieldOut<vtkm::ListTagBase<T>>);
+    typedef void ExecutionSignature(_1, _2, _3, _4);
+
+    template <typename InPortalType, typename OutPortalType>
+    VTKM_EXEC void operator()(const vtkm::Id& index,
+                              const InPortalType& execIn,
+                              OutPortalType& execOut,
+                              T& out) const
     {
-      this->RaiseError("Got wrong input value.");
+      if (!test_equal(execIn.Get(index), TestValue(index, T()) + T(100)))
+      {
+        this->RaiseError("Got wrong input value.");
+      }
+      out = execIn.Get(index) - T(100);
+      execOut.Set(index, out);
     }
-    out = execIn.Get(index) - T(100);
-    execOut.Set(index, out);
-  }
-
-  template <typename T1, typename T2, typename T3>
-  VTKM_EXEC void operator()(const vtkm::Id&, const T1&, const T2&, const T3&) const
-  {
-    this->RaiseError("Cannot call this worklet with different types.");
-  }
+  };
 };
 
 namespace map_exec_field
@@ -78,13 +77,11 @@ struct DoTestWorklet
     vtkm::cont::ArrayHandle<T> inputHandle = vtkm::cont::make_ArrayHandle(inputArray, ARRAY_SIZE);
     vtkm::cont::ArrayHandle<T> outputHandle;
     vtkm::cont::ArrayHandle<T> outputFieldArray;
+    outputHandle.Allocate(ARRAY_SIZE);
 
     std::cout << "Create and run dispatcher." << std::endl;
-    vtkm::worklet::DispatcherMapField<WorkletType> dispatcher;
-    dispatcher.Invoke(counting,
-                      vtkm::exec::ExecutionWholeArrayConst<T>(inputHandle),
-                      vtkm::exec::ExecutionWholeArray<T>(outputHandle, ARRAY_SIZE),
-                      outputFieldArray);
+    vtkm::worklet::DispatcherMapField<typename WorkletType::template Worklet<T>> dispatcher;
+    dispatcher.Invoke(counting, inputHandle, outputHandle, outputFieldArray);
 
     std::cout << "Check result." << std::endl;
     CheckPortal(outputHandle.GetPortalConstControl());
@@ -94,12 +91,10 @@ struct DoTestWorklet
     // Clear out output arrays.
     outputFieldArray = vtkm::cont::ArrayHandle<T>();
     outputHandle = vtkm::cont::ArrayHandle<T>();
+    outputHandle.Allocate(ARRAY_SIZE);
 
     vtkm::cont::DynamicArrayHandle outputFieldDynamic(outputFieldArray);
-    dispatcher.Invoke(counting,
-                      vtkm::exec::ExecutionWholeArrayConst<T>(inputHandle),
-                      vtkm::exec::ExecutionWholeArray<T>(outputHandle, ARRAY_SIZE),
-                      outputFieldDynamic);
+    dispatcher.Invoke(counting, inputHandle, outputHandle, outputFieldDynamic);
 
     std::cout << "Check dynamic array result." << std::endl;
     CheckPortal(outputHandle.GetPortalConstControl());
@@ -110,7 +105,7 @@ struct DoTestWorklet
 void TestWorkletMapFieldExecArg()
 {
   typedef vtkm::cont::DeviceAdapterTraits<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> DeviceAdapterTraits;
-  std::cout << "Testing Worklet with ExecutionWholeArray on device adapter: "
+  std::cout << "Testing Worklet with WholeArray on device adapter: "
             << DeviceAdapterTraits::GetName() << std::endl;
 
   std::cout << "--- Worklet accepting all types." << std::endl;

From 6b190312007fd9eebaefb8f96e58283fddb65cec Mon Sep 17 00:00:00 2001
From: Sujin Philip <sujin.philip@kitware.com>
Date: Fri, 22 Dec 2017 14:22:56 -0500
Subject: [PATCH 06/24] Clean up includes of ExecutionWholeArray.h

---
 vtkm/rendering/raytracing/TriangleIntersector.h              | 1 -
 vtkm/worklet/Clip.h                                          | 1 -
 vtkm/worklet/contourtree/ActiveEdgeTransferrer.h             | 1 -
 vtkm/worklet/contourtree/ChainDoubler.h                      | 1 -
 vtkm/worklet/contourtree/CopyJoinSplit.h                     | 1 -
 vtkm/worklet/contourtree/CopyNeighbors.h                     | 1 -
 vtkm/worklet/contourtree/CopySupernodes.h                    | 1 -
 vtkm/worklet/contourtree/DegreeDelta.h                       | 1 -
 vtkm/worklet/contourtree/DegreeSubrangeOffset.h              | 1 -
 vtkm/worklet/contourtree/FillSupernodes.h                    | 1 -
 vtkm/worklet/contourtree/FindLeaves.h                        | 1 -
 vtkm/worklet/contourtree/GoverningSaddleFinder.h             | 1 -
 vtkm/worklet/contourtree/JoinArcConnector.h                  | 1 -
 vtkm/worklet/contourtree/JoinSuperArcFinder.h                | 1 -
 vtkm/worklet/contourtree/JoinTreeTransferrer.h               | 1 -
 vtkm/worklet/contourtree/Mesh2D_DEM_SaddleStarter.h          | 1 -
 vtkm/worklet/contourtree/Mesh2D_DEM_VertexOutdegreeStarter.h | 1 -
 vtkm/worklet/contourtree/Mesh2D_DEM_VertexStarter.h          | 1 -
 vtkm/worklet/contourtree/Mesh3D_DEM_SaddleStarter.h          | 1 -
 vtkm/worklet/contourtree/Mesh3D_DEM_VertexOutdegreeStarter.h | 1 -
 vtkm/worklet/contourtree/Mesh3D_DEM_VertexStarter.h          | 1 -
 vtkm/worklet/contourtree/RegularPointTransferrer.h           | 1 -
 vtkm/worklet/contourtree/RegularToCandidate.h                | 1 -
 vtkm/worklet/contourtree/RegularToCriticalDown.h             | 1 -
 vtkm/worklet/contourtree/RegularToCriticalUp.h               | 1 -
 vtkm/worklet/contourtree/ResetDegrees.h                      | 1 -
 vtkm/worklet/contourtree/SaddleAscentFunctor.h               | 1 -
 vtkm/worklet/contourtree/SaddleAscentTransferrer.h           | 1 -
 vtkm/worklet/contourtree/SetJoinAndSplitArcs.h               | 1 -
 vtkm/worklet/contourtree/SetSupernodeInward.h                | 1 -
 vtkm/worklet/contourtree/SkipVertex.h                        | 1 -
 vtkm/worklet/contourtree/SubrangeOffset.h                    | 1 -
 vtkm/worklet/contourtree/TrunkBuilder.h                      | 1 -
 vtkm/worklet/contourtree/UpdateOutbound.h                    | 1 -
 vtkm/worklet/contourtree/VertexDegreeUpdater.h               | 1 -
 vtkm/worklet/cosmotools/ComputeBinIndices.h                  | 1 -
 vtkm/worklet/cosmotools/ComputeBinRange.h                    | 1 -
 vtkm/worklet/cosmotools/ComputeNeighborBins.h                | 1 -
 vtkm/worklet/cosmotools/ComputePotential.h                   | 1 -
 vtkm/worklet/cosmotools/ComputePotentialBin.h                | 1 -
 vtkm/worklet/cosmotools/ComputePotentialMxN.h                | 1 -
 vtkm/worklet/cosmotools/ComputePotentialNeighbors.h          | 1 -
 vtkm/worklet/cosmotools/ComputePotentialNxN.h                | 1 -
 vtkm/worklet/cosmotools/ComputePotentialOnCandidates.h       | 1 -
 vtkm/worklet/cosmotools/EqualsMinimumPotential.h             | 1 -
 vtkm/worklet/cosmotools/GraftParticles.h                     | 1 -
 vtkm/worklet/cosmotools/IsStar.h                             | 1 -
 vtkm/worklet/cosmotools/MarkActiveNeighbors.h                | 1 -
 vtkm/worklet/cosmotools/PointerJump.h                        | 1 -
 vtkm/worklet/cosmotools/SetCandidateParticles.h              | 1 -
 vtkm/worklet/cosmotools/ValidHalo.h                          | 1 -
 vtkm/worklet/testing/UnitTestParticleAdvection.cxx           | 1 -
 52 files changed, 52 deletions(-)

diff --git a/vtkm/rendering/raytracing/TriangleIntersector.h b/vtkm/rendering/raytracing/TriangleIntersector.h
index b612a4cf2..d5b2e7ca4 100644
--- a/vtkm/rendering/raytracing/TriangleIntersector.h
+++ b/vtkm/rendering/raytracing/TriangleIntersector.h
@@ -22,7 +22,6 @@
 #include <cstring>
 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/ArrayHandleCompositeVector.h>
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/rendering/raytracing/BoundingVolumeHierarchy.h>
 #include <vtkm/rendering/raytracing/Ray.h>
 #include <vtkm/rendering/raytracing/RayOperations.h>
diff --git a/vtkm/worklet/Clip.h b/vtkm/worklet/Clip.h
index d43a7531f..025abcf7b 100644
--- a/vtkm/worklet/Clip.h
+++ b/vtkm/worklet/Clip.h
@@ -33,7 +33,6 @@
 #include <vtkm/cont/ImplicitFunctionHandle.h>
 #include <vtkm/cont/Timer.h>
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/exec/FunctorBase.h>
 
 #if defined(THRUST_MAJOR_VERSION) && THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION == 8 &&     \
diff --git a/vtkm/worklet/contourtree/ActiveEdgeTransferrer.h b/vtkm/worklet/contourtree/ActiveEdgeTransferrer.h
index ff5d3e026..0a211a092 100644
--- a/vtkm/worklet/contourtree/ActiveEdgeTransferrer.h
+++ b/vtkm/worklet/contourtree/ActiveEdgeTransferrer.h
@@ -84,7 +84,6 @@
 #ifndef vtk_m_worklet_contourtree_active_edge_transferrer_h
 #define vtk_m_worklet_contourtree_active_edge_transferrer_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/ChainDoubler.h b/vtkm/worklet/contourtree/ChainDoubler.h
index 3d51c1a4e..741cb64d1 100644
--- a/vtkm/worklet/contourtree/ChainDoubler.h
+++ b/vtkm/worklet/contourtree/ChainDoubler.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_chain_doubler_h
 #define vtkm_worklet_contourtree_chain_doubler_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/CopyJoinSplit.h b/vtkm/worklet/contourtree/CopyJoinSplit.h
index 89bafe209..b088a61fa 100644
--- a/vtkm/worklet/contourtree/CopyJoinSplit.h
+++ b/vtkm/worklet/contourtree/CopyJoinSplit.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_copy_join_split_h
 #define vtkm_worklet_contourtree_copy_join_split_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/CopyNeighbors.h b/vtkm/worklet/contourtree/CopyNeighbors.h
index 14d94c5a0..44ac688cf 100644
--- a/vtkm/worklet/contourtree/CopyNeighbors.h
+++ b/vtkm/worklet/contourtree/CopyNeighbors.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_copy_neighbors_h
 #define vtkm_worklet_contourtree_copy_neighbors_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/CopySupernodes.h b/vtkm/worklet/contourtree/CopySupernodes.h
index 27bfd0077..c0e6376a1 100644
--- a/vtkm/worklet/contourtree/CopySupernodes.h
+++ b/vtkm/worklet/contourtree/CopySupernodes.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_copy_supernodes_h
 #define vtkm_worklet_contourtree_copy_supernodes_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/DegreeDelta.h b/vtkm/worklet/contourtree/DegreeDelta.h
index dec5739b9..d6382224a 100644
--- a/vtkm/worklet/contourtree/DegreeDelta.h
+++ b/vtkm/worklet/contourtree/DegreeDelta.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_degree_delta_h
 #define vtkm_worklet_contourtree_degree_delta_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/DegreeSubrangeOffset.h b/vtkm/worklet/contourtree/DegreeSubrangeOffset.h
index 57cc8c076..46244ea57 100644
--- a/vtkm/worklet/contourtree/DegreeSubrangeOffset.h
+++ b/vtkm/worklet/contourtree/DegreeSubrangeOffset.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_degree_subrange_offset_h
 #define vtkm_worklet_contourtree_degree_subrange_offset_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/FillSupernodes.h b/vtkm/worklet/contourtree/FillSupernodes.h
index cad5479d8..dc6cc6947 100644
--- a/vtkm/worklet/contourtree/FillSupernodes.h
+++ b/vtkm/worklet/contourtree/FillSupernodes.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_fill_supernodes_h
 #define vtkm_worklet_contourtree_fill_supernodes_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/FindLeaves.h b/vtkm/worklet/contourtree/FindLeaves.h
index c5472959a..b86d12c8c 100644
--- a/vtkm/worklet/contourtree/FindLeaves.h
+++ b/vtkm/worklet/contourtree/FindLeaves.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_find_leaves_h
 #define vtkm_worklet_contourtree_find_leaves_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/GoverningSaddleFinder.h b/vtkm/worklet/contourtree/GoverningSaddleFinder.h
index 760e54397..6226b76ea 100644
--- a/vtkm/worklet/contourtree/GoverningSaddleFinder.h
+++ b/vtkm/worklet/contourtree/GoverningSaddleFinder.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_governing_saddle_finder_h
 #define vtkm_worklet_contourtree_governing_saddle_finder_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/JoinArcConnector.h b/vtkm/worklet/contourtree/JoinArcConnector.h
index 73a3d2751..e8582fe1c 100644
--- a/vtkm/worklet/contourtree/JoinArcConnector.h
+++ b/vtkm/worklet/contourtree/JoinArcConnector.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_join_arc_connector_h
 #define vtkm_worklet_contourtree_join_arc_connector_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/JoinSuperArcFinder.h b/vtkm/worklet/contourtree/JoinSuperArcFinder.h
index e9be9d990..5f14f7ef9 100644
--- a/vtkm/worklet/contourtree/JoinSuperArcFinder.h
+++ b/vtkm/worklet/contourtree/JoinSuperArcFinder.h
@@ -89,7 +89,6 @@
 
 #include "vtkm/worklet/contourtree/Types.h"
 #include "vtkm/worklet/contourtree/VertexValueComparator.h"
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/JoinTreeTransferrer.h b/vtkm/worklet/contourtree/JoinTreeTransferrer.h
index 4c4878c3e..4e51856f8 100644
--- a/vtkm/worklet/contourtree/JoinTreeTransferrer.h
+++ b/vtkm/worklet/contourtree/JoinTreeTransferrer.h
@@ -84,7 +84,6 @@
 #ifndef vtkm_worklet_contourtree_join_tree_transferrer_h
 #define vtkm_worklet_contourtree_join_tree_transferrer_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/Mesh2D_DEM_SaddleStarter.h b/vtkm/worklet/contourtree/Mesh2D_DEM_SaddleStarter.h
index d1b8457d2..f2931b94b 100644
--- a/vtkm/worklet/contourtree/Mesh2D_DEM_SaddleStarter.h
+++ b/vtkm/worklet/contourtree/Mesh2D_DEM_SaddleStarter.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_mesh2d_dem_saddle_starter_h
 #define vtkm_worklet_contourtree_mesh2d_dem_saddle_starter_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Mesh2D_DEM_Triangulation_Macros.h>
 
diff --git a/vtkm/worklet/contourtree/Mesh2D_DEM_VertexOutdegreeStarter.h b/vtkm/worklet/contourtree/Mesh2D_DEM_VertexOutdegreeStarter.h
index 1853946bd..87bb6c874 100644
--- a/vtkm/worklet/contourtree/Mesh2D_DEM_VertexOutdegreeStarter.h
+++ b/vtkm/worklet/contourtree/Mesh2D_DEM_VertexOutdegreeStarter.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_mesh2d_dem_vertex_outdegree_starter_h
 #define vtkm_worklet_contourtree_mesh2d_dem_vertex_outdegree_starter_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Mesh2D_DEM_Triangulation_Macros.h>
 
diff --git a/vtkm/worklet/contourtree/Mesh2D_DEM_VertexStarter.h b/vtkm/worklet/contourtree/Mesh2D_DEM_VertexStarter.h
index 88e4aa8cf..678a630be 100644
--- a/vtkm/worklet/contourtree/Mesh2D_DEM_VertexStarter.h
+++ b/vtkm/worklet/contourtree/Mesh2D_DEM_VertexStarter.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_mesh2d_dem_vertex_starter_h
 #define vtkm_worklet_contourtree_mesh2d_dem_vertex_starter_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Mesh2D_DEM_Triangulation_Macros.h>
 #include <vtkm/worklet/contourtree/VertexValueComparator.h>
diff --git a/vtkm/worklet/contourtree/Mesh3D_DEM_SaddleStarter.h b/vtkm/worklet/contourtree/Mesh3D_DEM_SaddleStarter.h
index a0ee4dbd1..46365560f 100644
--- a/vtkm/worklet/contourtree/Mesh3D_DEM_SaddleStarter.h
+++ b/vtkm/worklet/contourtree/Mesh3D_DEM_SaddleStarter.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_mesh3d_dem_saddle_starter_h
 #define vtkm_worklet_contourtree_mesh3d_dem_saddle_starter_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/LinkComponentCaseTable3D.h>
 #include <vtkm/worklet/contourtree/Mesh3D_DEM_Triangulation_Macros.h>
diff --git a/vtkm/worklet/contourtree/Mesh3D_DEM_VertexOutdegreeStarter.h b/vtkm/worklet/contourtree/Mesh3D_DEM_VertexOutdegreeStarter.h
index 65f139339..c994279f1 100644
--- a/vtkm/worklet/contourtree/Mesh3D_DEM_VertexOutdegreeStarter.h
+++ b/vtkm/worklet/contourtree/Mesh3D_DEM_VertexOutdegreeStarter.h
@@ -83,7 +83,6 @@
 #ifndef vtkm_worklet_contourtree_mesh3d_dem_vertex_outdegree_starter_h
 #define vtkm_worklet_contourtree_mesh3d_dem_vertex_outdegree_starter_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/LinkComponentCaseTable3D.h>
 #include <vtkm/worklet/contourtree/Mesh3D_DEM_Triangulation_Macros.h>
diff --git a/vtkm/worklet/contourtree/Mesh3D_DEM_VertexStarter.h b/vtkm/worklet/contourtree/Mesh3D_DEM_VertexStarter.h
index d254c07ac..93f22ef4a 100644
--- a/vtkm/worklet/contourtree/Mesh3D_DEM_VertexStarter.h
+++ b/vtkm/worklet/contourtree/Mesh3D_DEM_VertexStarter.h
@@ -84,7 +84,6 @@
 #define vtkm_worklet_contourtree_mesh3d_dem_vertex_starter_h
 
 #include <iostream>
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Mesh3D_DEM_Triangulation_Macros.h>
 #include <vtkm/worklet/contourtree/VertexValueComparator.h>
diff --git a/vtkm/worklet/contourtree/RegularPointTransferrer.h b/vtkm/worklet/contourtree/RegularPointTransferrer.h
index a357cbe8c..8038f1dbd 100644
--- a/vtkm/worklet/contourtree/RegularPointTransferrer.h
+++ b/vtkm/worklet/contourtree/RegularPointTransferrer.h
@@ -86,7 +86,6 @@
 #include "vtkm/worklet/contourtree/Mesh2D_DEM_Triangulation_Macros.h"
 #include "vtkm/worklet/contourtree/Types.h"
 #include "vtkm/worklet/contourtree/VertexValueComparator.h"
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/RegularToCandidate.h b/vtkm/worklet/contourtree/RegularToCandidate.h
index dfbbb84d1..a389a79ff 100644
--- a/vtkm/worklet/contourtree/RegularToCandidate.h
+++ b/vtkm/worklet/contourtree/RegularToCandidate.h
@@ -68,7 +68,6 @@
 #define vtkm_worklet_contourtree_regular_to_candidate_h
 
 #include "vtkm/worklet/contourtree/Types.h"
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/RegularToCriticalDown.h b/vtkm/worklet/contourtree/RegularToCriticalDown.h
index cfff438ac..1dd0c38e9 100644
--- a/vtkm/worklet/contourtree/RegularToCriticalDown.h
+++ b/vtkm/worklet/contourtree/RegularToCriticalDown.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_regular_to_critical_down_h
 #define vtkm_worklet_contourtree_regular_to_critical_down_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/RegularToCriticalUp.h b/vtkm/worklet/contourtree/RegularToCriticalUp.h
index 724d3d8d7..dce16af7c 100644
--- a/vtkm/worklet/contourtree/RegularToCriticalUp.h
+++ b/vtkm/worklet/contourtree/RegularToCriticalUp.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_regular_to_critical_up_h
 #define vtkm_worklet_contourtree_regular_to_critical_up_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/ResetDegrees.h b/vtkm/worklet/contourtree/ResetDegrees.h
index 0fcdb841a..11f61b075 100644
--- a/vtkm/worklet/contourtree/ResetDegrees.h
+++ b/vtkm/worklet/contourtree/ResetDegrees.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_reset_degrees_h
 #define vtkm_worklet_contourtree_reset_degrees_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/SaddleAscentFunctor.h b/vtkm/worklet/contourtree/SaddleAscentFunctor.h
index 130cc585a..60c46a8a0 100644
--- a/vtkm/worklet/contourtree/SaddleAscentFunctor.h
+++ b/vtkm/worklet/contourtree/SaddleAscentFunctor.h
@@ -80,7 +80,6 @@
 #ifndef vtkm_worklet_contourtree_saddle_ascent_functor_h
 #define vtkm_worklet_contourtree_saddle_ascent_functor_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/SaddleAscentTransferrer.h b/vtkm/worklet/contourtree/SaddleAscentTransferrer.h
index 2c56badde..8a5d9d1a2 100644
--- a/vtkm/worklet/contourtree/SaddleAscentTransferrer.h
+++ b/vtkm/worklet/contourtree/SaddleAscentTransferrer.h
@@ -80,7 +80,6 @@
 #ifndef vtkm_worklet_contourtree_saddle_ascent_transferrer_h
 #define vtkm_worklet_contourtree_saddle_ascent_transferrer_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/SetJoinAndSplitArcs.h b/vtkm/worklet/contourtree/SetJoinAndSplitArcs.h
index 46f456e3d..fe61ef500 100644
--- a/vtkm/worklet/contourtree/SetJoinAndSplitArcs.h
+++ b/vtkm/worklet/contourtree/SetJoinAndSplitArcs.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_set_join_and_split_arcs_h
 #define vtkm_worklet_contourtree_set_join_and_split_arcs_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/SetSupernodeInward.h b/vtkm/worklet/contourtree/SetSupernodeInward.h
index 4964e24e0..8602372c2 100644
--- a/vtkm/worklet/contourtree/SetSupernodeInward.h
+++ b/vtkm/worklet/contourtree/SetSupernodeInward.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_set_supernode_inward_h
 #define vtkm_worklet_contourtree_set_supernode_inward_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/SkipVertex.h b/vtkm/worklet/contourtree/SkipVertex.h
index bf0d27a4f..d65b33d72 100644
--- a/vtkm/worklet/contourtree/SkipVertex.h
+++ b/vtkm/worklet/contourtree/SkipVertex.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_skip_vertex_h
 #define vtkm_worklet_contourtree_skip_vertex_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/SubrangeOffset.h b/vtkm/worklet/contourtree/SubrangeOffset.h
index 4702d2878..4b568bb94 100644
--- a/vtkm/worklet/contourtree/SubrangeOffset.h
+++ b/vtkm/worklet/contourtree/SubrangeOffset.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_subrange_offset_h
 #define vtkm_worklet_contourtree_subrange_offset_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/contourtree/TrunkBuilder.h b/vtkm/worklet/contourtree/TrunkBuilder.h
index 206dcb841..c6842c69b 100644
--- a/vtkm/worklet/contourtree/TrunkBuilder.h
+++ b/vtkm/worklet/contourtree/TrunkBuilder.h
@@ -84,7 +84,6 @@
 #ifndef vtkm_worklet_contourtree_trunk_builder_h
 #define vtkm_worklet_contourtree_trunk_builder_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/UpdateOutbound.h b/vtkm/worklet/contourtree/UpdateOutbound.h
index c5867c990..565e42959 100644
--- a/vtkm/worklet/contourtree/UpdateOutbound.h
+++ b/vtkm/worklet/contourtree/UpdateOutbound.h
@@ -67,7 +67,6 @@
 #ifndef vtkm_worklet_contourtree_update_outbound_h
 #define vtkm_worklet_contourtree_update_outbound_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/contourtree/Types.h>
 
diff --git a/vtkm/worklet/contourtree/VertexDegreeUpdater.h b/vtkm/worklet/contourtree/VertexDegreeUpdater.h
index 60bb0457e..5bcd17de2 100644
--- a/vtkm/worklet/contourtree/VertexDegreeUpdater.h
+++ b/vtkm/worklet/contourtree/VertexDegreeUpdater.h
@@ -84,7 +84,6 @@
 #ifndef vtkm_worklet_contourtree_vertex_degree_updater_h
 #define vtkm_worklet_contourtree_vertex_degree_updater_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputeBinIndices.h b/vtkm/worklet/cosmotools/ComputeBinIndices.h
index ecdc9c451..ef67de382 100644
--- a/vtkm/worklet/cosmotools/ComputeBinIndices.h
+++ b/vtkm/worklet/cosmotools/ComputeBinIndices.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_bin_indices_h
 #define vtkm_worklet_cosmotools_compute_bin_indices_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputeBinRange.h b/vtkm/worklet/cosmotools/ComputeBinRange.h
index d46689667..e04367f2b 100644
--- a/vtkm/worklet/cosmotools/ComputeBinRange.h
+++ b/vtkm/worklet/cosmotools/ComputeBinRange.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_bin_range_h
 #define vtkm_worklet_cosmotools_compute_bin_range_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputeNeighborBins.h b/vtkm/worklet/cosmotools/ComputeNeighborBins.h
index df7eefa27..f6679d4df 100644
--- a/vtkm/worklet/cosmotools/ComputeNeighborBins.h
+++ b/vtkm/worklet/cosmotools/ComputeNeighborBins.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_neighbor_bins_h
 #define vtkm_worklet_cosmotools_compute_neighbor_bins_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputePotential.h b/vtkm/worklet/cosmotools/ComputePotential.h
index cfe697284..16dbf16d5 100644
--- a/vtkm/worklet/cosmotools/ComputePotential.h
+++ b/vtkm/worklet/cosmotools/ComputePotential.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_potential_h
 #define vtkm_worklet_cosmotools_compute_potential_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputePotentialBin.h b/vtkm/worklet/cosmotools/ComputePotentialBin.h
index 3989d8ea4..428705bd0 100644
--- a/vtkm/worklet/cosmotools/ComputePotentialBin.h
+++ b/vtkm/worklet/cosmotools/ComputePotentialBin.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_potential_bin_h
 #define vtkm_worklet_cosmotools_compute_potential_bin_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputePotentialMxN.h b/vtkm/worklet/cosmotools/ComputePotentialMxN.h
index d0dd06b86..185a25abf 100644
--- a/vtkm/worklet/cosmotools/ComputePotentialMxN.h
+++ b/vtkm/worklet/cosmotools/ComputePotentialMxN.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_potential_mxn_h
 #define vtkm_worklet_cosmotools_compute_potential_mxn_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputePotentialNeighbors.h b/vtkm/worklet/cosmotools/ComputePotentialNeighbors.h
index e8ef02c82..ffeb75e9b 100644
--- a/vtkm/worklet/cosmotools/ComputePotentialNeighbors.h
+++ b/vtkm/worklet/cosmotools/ComputePotentialNeighbors.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_potential_neighbors_h
 #define vtkm_worklet_cosmotools_compute_potential_neighbors_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputePotentialNxN.h b/vtkm/worklet/cosmotools/ComputePotentialNxN.h
index f07c7b36b..81b0350be 100644
--- a/vtkm/worklet/cosmotools/ComputePotentialNxN.h
+++ b/vtkm/worklet/cosmotools/ComputePotentialNxN.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_potential_nxn_h
 #define vtkm_worklet_cosmotools_compute_potential_nxn_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ComputePotentialOnCandidates.h b/vtkm/worklet/cosmotools/ComputePotentialOnCandidates.h
index 537556dc0..c14930ae6 100644
--- a/vtkm/worklet/cosmotools/ComputePotentialOnCandidates.h
+++ b/vtkm/worklet/cosmotools/ComputePotentialOnCandidates.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_compute_potential_on_candidates_h
 #define vtkm_worklet_cosmotools_compute_potential_on_candidates_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/EqualsMinimumPotential.h b/vtkm/worklet/cosmotools/EqualsMinimumPotential.h
index 934e36490..054ebf117 100644
--- a/vtkm/worklet/cosmotools/EqualsMinimumPotential.h
+++ b/vtkm/worklet/cosmotools/EqualsMinimumPotential.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_equals_minimum_potential_h
 #define vtkm_worklet_cosmotools_equals_minimum_potential_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/GraftParticles.h b/vtkm/worklet/cosmotools/GraftParticles.h
index ee5d7d8fa..036ce1730 100644
--- a/vtkm/worklet/cosmotools/GraftParticles.h
+++ b/vtkm/worklet/cosmotools/GraftParticles.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_graft_particle_h
 #define vtkm_worklet_cosmotools_graft_particle_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/cosmotools/TagTypes.h>
 
diff --git a/vtkm/worklet/cosmotools/IsStar.h b/vtkm/worklet/cosmotools/IsStar.h
index e20f444a4..79673cb89 100644
--- a/vtkm/worklet/cosmotools/IsStar.h
+++ b/vtkm/worklet/cosmotools/IsStar.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_is_star_h
 #define vtkm_worklet_cosmotools_is_star_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/MarkActiveNeighbors.h b/vtkm/worklet/cosmotools/MarkActiveNeighbors.h
index 27ca56901..83a15df43 100644
--- a/vtkm/worklet/cosmotools/MarkActiveNeighbors.h
+++ b/vtkm/worklet/cosmotools/MarkActiveNeighbors.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_mark_active_neighbors_h
 #define vtkm_worklet_cosmotools_mark_active_neighbors_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/cosmotools/TagTypes.h>
 
diff --git a/vtkm/worklet/cosmotools/PointerJump.h b/vtkm/worklet/cosmotools/PointerJump.h
index 83506f052..66b10920e 100644
--- a/vtkm/worklet/cosmotools/PointerJump.h
+++ b/vtkm/worklet/cosmotools/PointerJump.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_pointer_jump_h
 #define vtkm_worklet_cosmotools_pointer_jump_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/SetCandidateParticles.h b/vtkm/worklet/cosmotools/SetCandidateParticles.h
index 17e007a4a..da575d165 100644
--- a/vtkm/worklet/cosmotools/SetCandidateParticles.h
+++ b/vtkm/worklet/cosmotools/SetCandidateParticles.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_set_candidate_particles_h
 #define vtkm_worklet_cosmotools_set_candidate_particles_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/cosmotools/ValidHalo.h b/vtkm/worklet/cosmotools/ValidHalo.h
index 2190064aa..28f4686f0 100644
--- a/vtkm/worklet/cosmotools/ValidHalo.h
+++ b/vtkm/worklet/cosmotools/ValidHalo.h
@@ -61,7 +61,6 @@
 #ifndef vtkm_worklet_cosmotools_valid_halo_h
 #define vtkm_worklet_cosmotools_valid_halo_h
 
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/WorkletMapField.h>
 
 namespace vtkm
diff --git a/vtkm/worklet/testing/UnitTestParticleAdvection.cxx b/vtkm/worklet/testing/UnitTestParticleAdvection.cxx
index b239f1622..58bb17fec 100644
--- a/vtkm/worklet/testing/UnitTestParticleAdvection.cxx
+++ b/vtkm/worklet/testing/UnitTestParticleAdvection.cxx
@@ -25,7 +25,6 @@
 #include <vtkm/cont/DataSetBuilderUniform.h>
 #include <vtkm/cont/DeviceAdapter.h>
 #include <vtkm/cont/testing/Testing.h>
-#include <vtkm/exec/ExecutionWholeArray.h>
 #include <vtkm/worklet/ParticleAdvection.h>
 #include <vtkm/worklet/particleadvection/GridEvaluators.h>
 #include <vtkm/worklet/particleadvection/Integrators.h>

From 93bc0198fea50b05e18380e7d18b60191c9407dd Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Thu, 28 Dec 2017 16:41:13 -0500
Subject: [PATCH 07/24] Suppress false positive warnings about calling host
 device functions.

---
 examples/cosmotools/CMakeLists.txt            |  8 ++++
 examples/demo/CMakeLists.txt                  |  6 +++
 examples/game_of_life/CMakeLists.txt          |  9 ++++
 vtkm/Math.h                                   | 14 +++++-
 vtkm/benchmarking/BenchmarkCopySpeeds.cxx     |  4 +-
 vtkm/benchmarking/BenchmarkDeviceAdapter.cxx  | 46 +++++++++----------
 vtkm/benchmarking/BenchmarkRayTracing.cxx     |  1 -
 vtkm/cont/CellLocatorTwoLevelUniformGrid.h    |  1 +
 .../internal/DeviceAdapterAlgorithmSerial.h   |  1 +
 vtkm/cont/tbb/internal/FunctorsTBB.h          |  3 +-
 .../testing/UnitTestTaskSingularCuda.cu       |  7 +--
 vtkm/internal/FunctionInterfaceDetailPre.h    | 22 +++++++++
 vtkm/internal/FunctionInterfaceDetailPre.h.in |  2 +
 vtkm/rendering/Wireframer.h                   |  2 +
 vtkm/worklet/Keys.h                           |  2 +-
 vtkm/worklet/internal/DispatcherBase.h        | 14 ++++++
 .../spatialstructure/KdTree3DNNSearch.h       | 43 ++++++++++-------
 vtkm/worklet/splatkernels/Gaussian.h          |  1 +
 vtkm/worklet/splatkernels/Spline3rdOrder.h    |  6 +--
 .../worklet/testing/UnitTestMarchingCubes.cxx |  5 +-
 20 files changed, 140 insertions(+), 57 deletions(-)

diff --git a/examples/cosmotools/CMakeLists.txt b/examples/cosmotools/CMakeLists.txt
index 087bf21a8..d6a8c710b 100644
--- a/examples/cosmotools/CMakeLists.txt
+++ b/examples/cosmotools/CMakeLists.txt
@@ -35,6 +35,11 @@ target_link_libraries(CosmoHaloFinder_SERIAL PRIVATE ${VTKm_LIBRARIES})
 target_compile_options(CosmoHaloFinder_SERIAL PRIVATE ${VTKm_COMPILE_OPTIONS})
 
 if(VTKm_CUDA_FOUND)
+  set(old_nvcc_flags ${CUDA_NVCC_FLAGS})
+  set(old_cxx_flags ${CMAKE_CXX_FLAGS})
+  vtkm_setup_nvcc_flags( old_nvcc_flags old_cxx_flags)
+  vtkm_disable_troublesome_thrust_warnings()
+
   # Cuda compiles do not respect target_include_directories
   cuda_include_directories(${VTKm_INCLUDE_DIRS})
 
@@ -47,6 +52,9 @@ if(VTKm_CUDA_FOUND)
   target_include_directories(CosmoHaloFinder_CUDA PRIVATE ${VTKm_INCLUDE_DIRS})
   target_link_libraries(CosmoHaloFinder_CUDA PRIVATE ${VTKm_LIBRARIES})
   target_compile_options(CosmoHaloFinder_CUDA PRIVATE ${VTKm_COMPILE_OPTIONS})
+
+  set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
+  set(CMAKE_CXX_FLAGS ${old_cxx_flags})
 endif()
 
 if(VTKm_TBB_FOUND)
diff --git a/examples/demo/CMakeLists.txt b/examples/demo/CMakeLists.txt
index c2d745c1b..34f555e42 100644
--- a/examples/demo/CMakeLists.txt
+++ b/examples/demo/CMakeLists.txt
@@ -28,6 +28,12 @@ find_package(VTKm QUIET REQUIRED
 
 if(VTKm_OSMesa_FOUND AND VTKm_Rendering_FOUND)
   if(VTKm_CUDA_FOUND)
+
+    set(old_nvcc_flags ${CUDA_NVCC_FLAGS})
+    set(old_cxx_flags ${CMAKE_CXX_FLAGS})
+    vtkm_setup_nvcc_flags( old_nvcc_flags old_cxx_flags)
+    vtkm_disable_troublesome_thrust_warnings()
+
     # Cuda compiles do not respect target_include_directories
     cuda_include_directories(${VTKm_INCLUDE_DIRS})
     cuda_add_executable(Demo Demo.cu)
diff --git a/examples/game_of_life/CMakeLists.txt b/examples/game_of_life/CMakeLists.txt
index 7b67e0e5c..9756994f0 100644
--- a/examples/game_of_life/CMakeLists.txt
+++ b/examples/game_of_life/CMakeLists.txt
@@ -29,7 +29,16 @@ find_package(VTKm REQUIRED
   )
 
 if(VTKm_CUDA_FOUND)
+
+  set(old_nvcc_flags ${CUDA_NVCC_FLAGS})
+  set(old_cxx_flags ${CMAKE_CXX_FLAGS})
+  vtkm_setup_nvcc_flags( old_nvcc_flags old_cxx_flags)
+  vtkm_disable_troublesome_thrust_warnings()
+
   cuda_add_executable(GameOfLife GameOfLife.cu LoadShaders.h)
+
+  set(CUDA_NVCC_FLAGS ${old_nvcc_flags})
+  set(CMAKE_CXX_FLAGS ${old_cxx_flags})
 else()
   add_executable(GameOfLife GameOfLife.cxx LoadShaders.h)
 endif()
diff --git a/vtkm/Math.h b/vtkm/Math.h
index f35c9628b..881e99a3c 100644
--- a/vtkm/Math.h
+++ b/vtkm/Math.h
@@ -2347,7 +2347,12 @@ static inline VTKM_EXEC_CONT vtkm::Float32 RemainderQuotient(vtkm::Float32 numer
                                                              QType& quotient)
 {
   int iQuotient;
-  vtkm::Float32 result = std::remquo(numerator, denominator, &iQuotient);
+#ifdef VTKM_CUDA
+  const vtkm::Float32 result =
+    VTKM_CUDA_MATH_FUNCTION_32(remquo)(numerator, denominator, &iQuotient);
+#else
+  const vtkm::Float32 result = std::remquo(numerator, denominator, &iQuotient);
+#endif
   quotient = iQuotient;
   return result;
 }
@@ -2357,7 +2362,12 @@ static inline VTKM_EXEC_CONT vtkm::Float64 RemainderQuotient(vtkm::Float64 numer
                                                              QType& quotient)
 {
   int iQuotient;
-  vtkm::Float64 result = std::remquo(numerator, denominator, &iQuotient);
+#ifdef VTKM_CUDA
+  const vtkm::Float64 result =
+    VTKM_CUDA_MATH_FUNCTION_64(remquo)(numerator, denominator, &iQuotient);
+#else
+  const vtkm::Float64 result = std::remquo(numerator, denominator, &iQuotient);
+#endif
   quotient = iQuotient;
   return result;
 }
diff --git a/vtkm/benchmarking/BenchmarkCopySpeeds.cxx b/vtkm/benchmarking/BenchmarkCopySpeeds.cxx
index c219b5845..a4fe1e33a 100644
--- a/vtkm/benchmarking/BenchmarkCopySpeeds.cxx
+++ b/vtkm/benchmarking/BenchmarkCopySpeeds.cxx
@@ -80,8 +80,8 @@ struct MeasureCopySpeed
 
   VTKM_CONT std::string Description() const
   {
-    vtkm::UInt64 actualSize =
-      static_cast<vtkm::UInt64>(this->Source.GetNumberOfValues() * sizeof(ValueType));
+    vtkm::UInt64 actualSize = sizeof(ValueType);
+    actualSize *= static_cast<vtkm::UInt64>(this->Source.GetNumberOfValues());
     std::ostringstream out;
     out << "Copying " << HumanSize(this->NumBytes) << " (actual=" << HumanSize(actualSize)
         << ") of " << vtkm::testing::TypeName<ValueType>::Name() << "\n";
diff --git a/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx b/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx
index 00cd77c83..5a5715aad 100644
--- a/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx
+++ b/vtkm/benchmarking/BenchmarkDeviceAdapter.cxx
@@ -108,7 +108,7 @@ struct BenchDevAlgoConfig
   /// @note FixBytes and FixSizes are not mutually exclusive. If both are
   /// specified, both will run.
   bool TestArraySizeValues{ false };
-  vtkm::Id ArraySizeValues{ 1 << 21 };
+  vtkm::UInt64 ArraySizeValues{ 1 << 21 };
 
   /// If true, operations like "Unique" will test with a wider range of unique
   /// values (5%, 10%, 15%, 20%, 25%, 30%, 35%, 40%, 45%, 50%, 75%, 100%
@@ -126,7 +126,7 @@ struct BenchDevAlgoConfig
   {
     return this->DoByteSizes
       ? static_cast<vtkm::Id>(this->ArraySizeBytes / static_cast<vtkm::UInt64>(sizeof(T)))
-      : this->ArraySizeValues;
+      : static_cast<vtkm::Id>(this->ArraySizeValues);
   }
 };
 
@@ -291,8 +291,8 @@ private:
     {
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
-      description << "Copy " << arraySize << " values (" << HumanSize(arraySize * sizeof(Value))
-                  << ")";
+      description << "Copy " << arraySize << " values ("
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ")";
       return description.str();
     }
   };
@@ -337,8 +337,8 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "CopyIf on " << arraySize << " values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ") with " << PERCENT_VALID
-                  << "% valid values";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ") with "
+                  << PERCENT_VALID << "% valid values";
       return description.str();
     }
   };
@@ -393,8 +393,8 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "LowerBounds on " << arraySize << " input values ("
-                  << "(" << HumanSize(arraySize * sizeof(Value)) << ") (" << PERCENT_VALUES
-                  << "% configuration)";
+                  << "(" << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ") ("
+                  << PERCENT_VALUES << "% configuration)";
       return description.str();
     }
   };
@@ -451,7 +451,7 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "Reduce on " << arraySize << " values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ")";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ")";
       return description.str();
     }
   };
@@ -496,8 +496,8 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "ReduceByKey on " << arraySize << " values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ") with " << N_KEYS << " ("
-                  << PERCENT_KEYS << "%) distinct vtkm::Id keys";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ") with "
+                  << N_KEYS << " (" << PERCENT_KEYS << "%) distinct vtkm::Id keys";
       return description.str();
     }
   };
@@ -543,7 +543,7 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "ScanInclusive on " << arraySize << " values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ")";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ")";
       return description.str();
     }
   };
@@ -579,7 +579,7 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "ScanExclusive on " << arraySize << " values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ")";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ")";
       return description.str();
     }
   };
@@ -621,7 +621,7 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "Sort on " << arraySize << " random values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ")";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ")";
       return description.str();
     }
   };
@@ -674,8 +674,8 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "SortByKey on " << arraySize << " random values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ") with " << N_KEYS << " ("
-                  << PERCENT_KEYS << "%) different vtkm::Id keys";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ") with "
+                  << N_KEYS << " (" << PERCENT_KEYS << "%) different vtkm::Id keys";
       return description.str();
     }
   };
@@ -731,7 +731,7 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "StableSortIndices::Sort on " << arraySize << " random values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ")";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ")";
       return description.str();
     }
   };
@@ -775,8 +775,8 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "StableSortIndices::Unique on " << arraySize << " values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ") with " << this->N_VALID << " ("
-                  << PERCENT_VALID << "%) valid values";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ") with "
+                  << this->N_VALID << " (" << PERCENT_VALID << "%) valid values";
       return description.str();
     }
   };
@@ -831,8 +831,8 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "Unique on " << arraySize << " values ("
-                  << HumanSize(arraySize * sizeof(Value)) << ") with " << N_VALID << " ("
-                  << PERCENT_VALID << "%) valid values";
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ") with "
+                  << N_VALID << " (" << PERCENT_VALID << "%) valid values";
       return description.str();
     }
   };
@@ -887,8 +887,8 @@ private:
       vtkm::Id arraySize = Config.ComputeSize<Value>();
       std::stringstream description;
       description << "UpperBounds on " << arraySize << " input and " << N_VALS << " ("
-                  << PERCENT_VALS
-                  << "%) values (input array size: " << HumanSize(arraySize * sizeof(Value)) << ")";
+                  << PERCENT_VALS << "%) values (input array size: "
+                  << HumanSize(static_cast<vtkm::UInt64>(arraySize) * sizeof(Value)) << ")";
       return description.str();
     }
   };
diff --git a/vtkm/benchmarking/BenchmarkRayTracing.cxx b/vtkm/benchmarking/BenchmarkRayTracing.cxx
index c32eb137f..7d404e441 100644
--- a/vtkm/benchmarking/BenchmarkRayTracing.cxx
+++ b/vtkm/benchmarking/BenchmarkRayTracing.cxx
@@ -107,7 +107,6 @@ VTKM_MAKE_BENCHMARK(RayTracing, BenchRayTracing);
 
 int main(int, char* [])
 {
-  using TestTypes = vtkm::ListTagBase<vtkm::Float32>;
   VTKM_RUN_BENCHMARK(RayTracing, vtkm::ListTagBase<vtkm::Float32>());
   return 0;
 }
diff --git a/vtkm/cont/CellLocatorTwoLevelUniformGrid.h b/vtkm/cont/CellLocatorTwoLevelUniformGrid.h
index 286d538e4..01a764a52 100644
--- a/vtkm/cont/CellLocatorTwoLevelUniformGrid.h
+++ b/vtkm/cont/CellLocatorTwoLevelUniformGrid.h
@@ -107,6 +107,7 @@ private:
     DimVec3 Min;
     DimVec3 Max;
 
+    VTKM_EXEC
     bool Empty() const
     {
       return (this->Max[0] < this->Min[0]) || (this->Max[1] < this->Min[1]) ||
diff --git a/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h b/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h
index cd110a105..1354d25e0 100644
--- a/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h
+++ b/vtkm/cont/serial/internal/DeviceAdapterAlgorithmSerial.h
@@ -68,6 +68,7 @@ private:
     }
   }
 
+  VTKM_SUPPRESS_EXEC_WARNINGS
   template <typename InIter, typename OutIter>
   VTKM_EXEC static void DoCopy(InIter src, InIter srcEnd, OutIter dst, std::true_type)
   {
diff --git a/vtkm/cont/tbb/internal/FunctorsTBB.h b/vtkm/cont/tbb/internal/FunctorsTBB.h
index 10bb2f2b1..03dade15f 100644
--- a/vtkm/cont/tbb/internal/FunctorsTBB.h
+++ b/vtkm/cont/tbb/internal/FunctorsTBB.h
@@ -98,7 +98,6 @@ struct CopyBody
   vtkm::Id InputOffset;
   vtkm::Id OutputOffset;
 
-  VTKM_EXEC_CONT
   CopyBody(const InputPortalType& inPortal,
            const OutputPortalType& outPortal,
            vtkm::Id inOffset,
@@ -127,12 +126,14 @@ struct CopyBody
     }
   }
 
+  VTKM_SUPPRESS_EXEC_WARNINGS
   template <typename InIter, typename OutIter>
   VTKM_EXEC void DoCopy(InIter src, InIter srcEnd, OutIter dst, std::true_type) const
   {
     std::copy(src, srcEnd, dst);
   }
 
+  VTKM_SUPPRESS_EXEC_WARNINGS
   VTKM_EXEC
   void operator()(const ::tbb::blocked_range<vtkm::Id>& range) const
   {
diff --git a/vtkm/exec/cuda/internal/testing/UnitTestTaskSingularCuda.cu b/vtkm/exec/cuda/internal/testing/UnitTestTaskSingularCuda.cu
index 0ce3d8561..f0fd9de82 100644
--- a/vtkm/exec/cuda/internal/testing/UnitTestTaskSingularCuda.cu
+++ b/vtkm/exec/cuda/internal/testing/UnitTestTaskSingularCuda.cu
@@ -37,12 +37,6 @@ namespace
 
 struct TestExecObject
 {
-  VTKM_EXEC_CONT
-  TestExecObject()
-    : Portal()
-  {
-  }
-
   VTKM_EXEC_CONT
   TestExecObject(vtkm::exec::cuda::internal::ArrayPortalFromThrust<vtkm::Id> portal)
     : Portal(portal)
@@ -62,6 +56,7 @@ struct MyOutputToInputMapPortal
 struct MyVisitArrayPortal
 {
   using ValueType = vtkm::IdComponent;
+  VTKM_EXEC_CONT
   vtkm::IdComponent Get(vtkm::Id) const { return 1; }
 };
 
diff --git a/vtkm/internal/FunctionInterfaceDetailPre.h b/vtkm/internal/FunctionInterfaceDetailPre.h
index 0698d60b3..a241da8ef 100644
--- a/vtkm/internal/FunctionInterfaceDetailPre.h
+++ b/vtkm/internal/FunctionInterfaceDetailPre.h
@@ -76,12 +76,16 @@ struct ParameterContainer;
 template <typename R>
 struct ParameterContainer<R()>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
 };
 
 template <typename R,
           typename P1>
 struct ParameterContainer<R(P1)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
 };
 
@@ -90,6 +94,8 @@ template <typename R,
           typename P2>
 struct ParameterContainer<R(P1, P2)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
 };
@@ -100,6 +106,8 @@ template <typename R,
           typename P3>
 struct ParameterContainer<R(P1, P2, P3)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
@@ -112,6 +120,8 @@ template <typename R,
           typename P4>
 struct ParameterContainer<R(P1, P2, P3, P4)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
@@ -126,6 +136,8 @@ template <typename R,
           typename P5>
 struct ParameterContainer<R(P1, P2, P3, P4, P5)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
@@ -142,6 +154,8 @@ template <typename R,
           typename P6>
 struct ParameterContainer<R(P1, P2, P3, P4, P5, P6)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
@@ -160,6 +174,8 @@ template <typename R,
           typename P7>
 struct ParameterContainer<R(P1, P2, P3, P4, P5, P6, P7)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
@@ -180,6 +196,8 @@ template <typename R,
           typename P8>
 struct ParameterContainer<R(P1, P2, P3, P4, P5, P6, P7, P8)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
@@ -202,6 +220,8 @@ template <typename R,
           typename P9>
 struct ParameterContainer<R(P1, P2, P3, P4, P5, P6, P7, P8, P9)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
@@ -226,6 +246,8 @@ template <typename R,
           typename P10>
 struct ParameterContainer<R(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
   P1 Parameter1;
   P2 Parameter2;
   P3 Parameter3;
diff --git a/vtkm/internal/FunctionInterfaceDetailPre.h.in b/vtkm/internal/FunctionInterfaceDetailPre.h.in
index 911cc5b78..7aa8c7977 100644
--- a/vtkm/internal/FunctionInterfaceDetailPre.h.in
+++ b/vtkm/internal/FunctionInterfaceDetailPre.h.in
@@ -123,6 +123,8 @@ $for(num_params in range(0, max_parameters+1))\
 template <$template_params(num_params)>
 struct ParameterContainer<$signature(num_params)>
 {
+  VTKM_SUPPRESS_EXEC_WARNINGS
+  ~ParameterContainer() = default;
 $for(param_index in range(1, num_params+1))\
   $ptype(param_index) Parameter$(param_index);
 $endfor\
diff --git a/vtkm/rendering/Wireframer.h b/vtkm/rendering/Wireframer.h
index 6eae48dd8..1382c8159 100644
--- a/vtkm/rendering/Wireframer.h
+++ b/vtkm/rendering/Wireframer.h
@@ -74,6 +74,7 @@ vtkm::UInt32 ScaleColorComponent(vtkm::Float32 c)
   return vtkm::UInt32(t < 0 ? 0 : (t > 255 ? 255 : t));
 }
 
+VTKM_EXEC_CONT
 vtkm::UInt32 PackColor(vtkm::Float32 r, vtkm::Float32 g, vtkm::Float32 b, vtkm::Float32 a);
 
 VTKM_EXEC_CONT
@@ -92,6 +93,7 @@ vtkm::UInt32 PackColor(vtkm::Float32 r, vtkm::Float32 g, vtkm::Float32 b, vtkm::
   return packed;
 }
 
+VTKM_EXEC_CONT
 void UnpackColor(vtkm::UInt32 color,
                  vtkm::Float32& r,
                  vtkm::Float32& g,
diff --git a/vtkm/worklet/Keys.h b/vtkm/worklet/Keys.h
index c96a9f82d..7fcfe5343 100644
--- a/vtkm/worklet/Keys.h
+++ b/vtkm/worklet/Keys.h
@@ -79,7 +79,7 @@ public:
   };
 
   VTKM_CONT
-  Keys() {}
+  Keys() = default;
 
   /// \b Construct a Keys class from an array of keys.
   ///
diff --git a/vtkm/worklet/internal/DispatcherBase.h b/vtkm/worklet/internal/DispatcherBase.h
index 363909b16..3caba1ec0 100644
--- a/vtkm/worklet/internal/DispatcherBase.h
+++ b/vtkm/worklet/internal/DispatcherBase.h
@@ -401,8 +401,22 @@ private:
     static_assert(isAllValid::value == expectedLen::value,
                   "All arguments failed the TypeCheck pass");
 
+#if defined __NVCC__
+// Disable warning "calling a __host__ function from a __host__ __device__"
+// In some cases nv_exec_check_disable doesn't work and therefore you need
+// to use the following suppressions
+// These have been found by Eigen:
+// https://github.com/RLovelett/eigen/blame/master/Eigen/src/Core/util/DisableStupidWarnings.h
+#pragma push
+#pragma diag_suppress 2737
+#pragma diag_suppress 2739
+#endif
     auto fi =
       vtkm::internal::make_FunctionInterface<void, typename std::decay<Args>::type...>(args...);
+#if defined __NVCC__
+#pragma pop
+#endif
+
     auto ivc = vtkm::internal::Invocation<ParameterInterface,
                                           ControlInterface,
                                           ExecutionInterface,
diff --git a/vtkm/worklet/spatialstructure/KdTree3DNNSearch.h b/vtkm/worklet/spatialstructure/KdTree3DNNSearch.h
index 9b5085d1a..e44be3d12 100644
--- a/vtkm/worklet/spatialstructure/KdTree3DNNSearch.h
+++ b/vtkm/worklet/spatialstructure/KdTree3DNNSearch.h
@@ -21,15 +21,13 @@
 #ifndef vtk_m_worklet_KdTree3DNNSearch_h
 #define vtk_m_worklet_KdTree3DNNSearch_h
 
+#include <vtkm/cont/DeviceAdapterAlgorithm.h>
+
 #include <vtkm/Math.h>
 #include <vtkm/cont/ArrayHandle.h>
 #include <vtkm/cont/ArrayHandleCounting.h>
 #include <vtkm/cont/ArrayHandleReverse.h>
-#include <vtkm/cont/DeviceAdapter.h>
-#include <vtkm/cont/DeviceAdapterAlgorithm.h>
-#include <vtkm/cont/arg/ControlSignatureTagBase.h>
-#include <vtkm/cont/serial/DeviceAdapterSerial.h>
-#include <vtkm/cont/testing/Testing.h>
+
 #include <vtkm/worklet/DispatcherMapField.h>
 #include <vtkm/worklet/WorkletMapField.h>
 #include <vtkm/worklet/internal/DispatcherBase.h>
@@ -53,7 +51,7 @@ public:
                                   WholeArrayIn<> treeSplitIdIn,
                                   WholeArrayIn<> treeCoordiIn,
                                   FieldOut<> nnIdOut,
-                                  FieldOut<> nnDisOut);
+                                  FieldInOut<> nnDisOut);
     typedef void ExecutionSignature(_1, _2, _3, _4, _5, _6);
 
     VTKM_CONT
@@ -175,8 +173,6 @@ public:
                               IdType& nnId,
                               CoordiType& nnDis) const
     {
-      nnDis = std::numeric_limits<CoordiType>::max();
-
       NearestNeighborSearch3D(qc,
                               nnDis,
                               nnId,
@@ -207,13 +203,24 @@ public:
            const vtkm::cont::ArrayHandle<vtkm::Vec<CoordType, 3>, CoordStorageTag2>& qc_Handle,
            vtkm::cont::ArrayHandle<vtkm::Id>& nnId_Handle,
            vtkm::cont::ArrayHandle<CoordType>& nnDis_Handle,
-           DeviceAdapter vtkmNotUsed(device))
+           DeviceAdapter)
   {
-#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_CUDA
-    //set up stack size for cuda envinroment
-    size_t stackSizeBackup;
-    cudaDeviceGetLimit(&stackSizeBackup, cudaLimitStackSize);
-    cudaDeviceSetLimit(cudaLimitStackSize, 1024 * 16);
+    //fill the nnDis_Handle handle array with max values before running
+    auto intialValue = std::numeric_limits<CoordType>::max();
+    vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>::Copy(
+      vtkm::cont::make_ArrayHandleConstant(intialValue, qc_Handle.GetNumberOfValues()),
+      nnDis_Handle);
+
+//set up stack size for cuda environment
+#ifdef VTKM_CUDA
+    using DeviceAdapterTraits = vtkm::cont::DeviceAdapterTraits<DeviceAdapter>;
+    std::size_t stackSizeBackup;
+    (void)stackSizeBackup;
+    if (DeviceAdapterTraits::GetId() == VTKM_DEVICE_ADAPTER_CUDA)
+    {
+      cudaDeviceGetLimit(&stackSizeBackup, cudaLimitStackSize);
+      cudaDeviceSetLimit(cudaLimitStackSize, 1024 * 16);
+    }
 #endif
 
     NearestNeighborSearch3DWorklet nns3dWorklet;
@@ -221,8 +228,12 @@ public:
       nns3DDispatcher(nns3dWorklet);
     nns3DDispatcher.Invoke(
       qc_Handle, pointId_Handle, splitId_Handle, coordi_Handle, nnId_Handle, nnDis_Handle);
-#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_CUDA
-    cudaDeviceSetLimit(cudaLimitStackSize, stackSizeBackup);
+
+#ifdef VTKM_CUDA
+    if (DeviceAdapterTraits::GetId() == VTKM_DEVICE_ADAPTER_CUDA)
+    {
+      cudaDeviceSetLimit(cudaLimitStackSize, stackSizeBackup);
+    }
 #endif
   }
 };
diff --git a/vtkm/worklet/splatkernels/Gaussian.h b/vtkm/worklet/splatkernels/Gaussian.h
index e5cdcfca5..e9639175c 100644
--- a/vtkm/worklet/splatkernels/Gaussian.h
+++ b/vtkm/worklet/splatkernels/Gaussian.h
@@ -58,6 +58,7 @@ struct Gaussian : public KernelBase<Gaussian<Dimensions>>
 
   //---------------------------------------------------------------------
   // return the multiplier between smoothing length and max cutoff distance
+  VTKM_EXEC_CONT
   VTKM_CONSTEXPR double getDilationFactor() const { return 5.0; }
 
   //---------------------------------------------------------------------
diff --git a/vtkm/worklet/splatkernels/Spline3rdOrder.h b/vtkm/worklet/splatkernels/Spline3rdOrder.h
index e0990fd98..fdf5f9a6b 100644
--- a/vtkm/worklet/splatkernels/Spline3rdOrder.h
+++ b/vtkm/worklet/splatkernels/Spline3rdOrder.h
@@ -39,13 +39,13 @@ struct default_norm_value;
 template <>
 struct default_norm_value<2>
 {
-  double value() const { return 10.0 / (7.0 * M_PI); }
+  const double value = 10.0 / (7.0 * M_PI);
 };
 
 template <>
 struct default_norm_value<3>
 {
-  double value() const { return 1.0 / M_PI; }
+  const double value = 1.0 / M_PI;
 };
 
 
@@ -65,7 +65,7 @@ struct Spline3rdOrder : public KernelBase<Spline3rdOrder<Dimensions>>
     maxRadius_ = 2.0 * smoothingLength;
     maxRadius2_ = maxRadius_ * maxRadius_;
     //
-    norm_ = default_norm_value<Dimensions>().value();
+    norm_ = default_norm_value<Dimensions>().value;
 
     scale_W_ = norm_ * PowerExpansion<Dimensions>(Hinverse_);
     scale_GradW_ = norm_ * PowerExpansion<Dimensions + 1>(Hinverse_);
diff --git a/vtkm/worklet/testing/UnitTestMarchingCubes.cxx b/vtkm/worklet/testing/UnitTestMarchingCubes.cxx
index 1fc86dce4..02457d4d7 100644
--- a/vtkm/worklet/testing/UnitTestMarchingCubes.cxx
+++ b/vtkm/worklet/testing/UnitTestMarchingCubes.cxx
@@ -411,6 +411,7 @@ void TestMarchingCubesExplicit()
 
 int UnitTestMarchingCubes(int, char* [])
 {
-  return vtkm::cont::testing::Testing::Run(TestMarchingCubesUniformGrid);
-  return vtkm::cont::testing::Testing::Run(TestMarchingCubesExplicit);
+  int result1 = vtkm::cont::testing::Testing::Run(TestMarchingCubesUniformGrid);
+  int result2 = vtkm::cont::testing::Testing::Run(TestMarchingCubesExplicit);
+  return (result1 != 0 || result2 != 0) ? 1 : 0; // exit code 0 only when both runs pass
 }

From e349dd0d1c2e054918196d53398db21f55e82280 Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Thu, 21 Dec 2017 13:29:18 -0500
Subject: [PATCH 08/24] Use default copy constructor.

vtkm::Bounds and vtkm::Range now use the default copy constructor and
assignment operator. That way `std::is_trivially_copyable` succeeds for
these basic types.
---
 vtkm/Bounds.h | 11 ++++-------
 vtkm/Range.h  | 10 ++++------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/vtkm/Bounds.h b/vtkm/Bounds.h
index dc57816cb..9e8546e68 100644
--- a/vtkm/Bounds.h
+++ b/vtkm/Bounds.h
@@ -45,6 +45,9 @@ struct Bounds
   VTKM_EXEC_CONT
   Bounds() {}
 
+  VTKM_EXEC_CONT
+  Bounds(const Bounds&) = default;
+
   VTKM_EXEC_CONT
   Bounds(const vtkm::Range& xRange, const vtkm::Range& yRange, const vtkm::Range& zRange)
     : X(xRange)
@@ -89,13 +92,7 @@ struct Bounds
   }
 
   VTKM_EXEC_CONT
-  const vtkm::Bounds& operator=(const vtkm::Bounds& src)
-  {
-    this->X = src.X;
-    this->Y = src.Y;
-    this->Z = src.Z;
-    return *this;
-  }
+  vtkm::Bounds& operator=(const vtkm::Bounds& src) = default;
 
   /// \b Determine if the bounds are valid (i.e. has at least one valid point).
   ///
diff --git a/vtkm/Range.h b/vtkm/Range.h
index c2d1f60a1..07c22dd0a 100644
--- a/vtkm/Range.h
+++ b/vtkm/Range.h
@@ -49,6 +49,9 @@ struct Range
   {
   }
 
+  VTKM_EXEC_CONT
+  Range(const Range&) = default;
+
   template <typename T1, typename T2>
   VTKM_EXEC_CONT Range(const T1& min, const T2& max)
     : Min(static_cast<vtkm::Float64>(min))
@@ -57,12 +60,7 @@ struct Range
   }
 
   VTKM_EXEC_CONT
-  const vtkm::Range& operator=(const vtkm::Range& src)
-  {
-    this->Min = src.Min;
-    this->Max = src.Max;
-    return *this;
-  }
+  vtkm::Range& operator=(const vtkm::Range& src) = default;
 
   /// \b Determine if the range is valid (i.e. has at least one valid point).
   ///

From cac71555e24592b6a556ac1bb50add57f4628ebd Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 20 Dec 2017 20:18:22 -0500
Subject: [PATCH 09/24] Use `diy::reduce` in MultiBlock reductions.

MultiBlock now uses `diy::reduce` for reductions rather than using proxy
collectives. To support using `diy::reduce` operations on a
vtkm::cont::MultiBlock, added AssignerMultiBlock and
DecomposerMultiBlock classes. These are helper classes that provide DIY
concepts on top of an existing MultiBlock.
---
 vtkm/cont/AssignerMultiBlock.cxx |  78 +++++++++
 vtkm/cont/AssignerMultiBlock.h   |  70 ++++++++
 vtkm/cont/CMakeLists.txt         |   3 +
 vtkm/cont/DecomposerMultiBlock.h |  57 +++++++
 vtkm/cont/MultiBlock.cxx         | 263 ++++++++++++-------------------
 5 files changed, 311 insertions(+), 160 deletions(-)
 create mode 100644 vtkm/cont/AssignerMultiBlock.cxx
 create mode 100644 vtkm/cont/AssignerMultiBlock.h
 create mode 100644 vtkm/cont/DecomposerMultiBlock.h

diff --git a/vtkm/cont/AssignerMultiBlock.cxx b/vtkm/cont/AssignerMultiBlock.cxx
new file mode 100644
index 000000000..629fcabf0
--- /dev/null
+++ b/vtkm/cont/AssignerMultiBlock.cxx
@@ -0,0 +1,78 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2015 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2015 UT-Battelle, LLC.
+//  Copyright 2015 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#include <vtkm/cont/AssignerMultiBlock.h>
+
+#if defined(VTKM_ENABLE_MPI)
+
+#include <diy/mpi.hpp>
+#include <vtkm/cont/EnvironmentTracker.h>
+
+#include <algorithm> // std::lower_bound
+#include <numeric>   // std::iota
+
+namespace vtkm
+{
+namespace cont
+{
+
+VTKM_CONT
+AssignerMultiBlock::AssignerMultiBlock(const vtkm::cont::MultiBlock& mb)
+  : diy::Assigner(vtkm::cont::EnvironmentTracker::GetCommunicator().size(), 1)
+  , IScanBlockCounts()
+{
+  auto comm = vtkm::cont::EnvironmentTracker::GetCommunicator();
+  const auto nblocks = mb.GetNumberOfBlocks();
+
+  vtkm::Id iscan;
+  diy::mpi::scan(comm, nblocks, iscan, std::plus<vtkm::Id>());
+  diy::mpi::all_gather(comm, iscan, this->IScanBlockCounts);
+
+  this->set_nblocks(static_cast<int>(this->IScanBlockCounts.back()));
+}
+
+VTKM_CONT
+void AssignerMultiBlock::local_gids(int rank, std::vector<int>& gids) const
+{
+  if (rank == 0)
+  {
+    assert(this->IScanBlockCounts.size() > 0);
+    gids.resize(this->IScanBlockCounts[rank]);
+    std::iota(gids.begin(), gids.end(), 0);
+  }
+  else if (rank > 0 && rank < static_cast<int>(this->IScanBlockCounts.size()))
+  {
+    gids.resize(this->IScanBlockCounts[rank] - this->IScanBlockCounts[rank - 1]);
+    std::iota(gids.begin(), gids.end(), this->IScanBlockCounts[rank - 1]);
+  }
+}
+
+VTKM_CONT
+int AssignerMultiBlock::rank(int gid) const
+{
+  return static_cast<int>(
+    std::lower_bound(this->IScanBlockCounts.begin(), this->IScanBlockCounts.end(), gid + 1) -
+    this->IScanBlockCounts.begin());
+}
+
+
+} // vtkm::cont
+} // vtkm
+
+#endif // defined(VTKM_ENABLE_MPI)
diff --git a/vtkm/cont/AssignerMultiBlock.h b/vtkm/cont/AssignerMultiBlock.h
new file mode 100644
index 000000000..fb7440609
--- /dev/null
+++ b/vtkm/cont/AssignerMultiBlock.h
@@ -0,0 +1,70 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2015 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2015 UT-Battelle, LLC.
+//  Copyright 2015 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_cont_AssignerMultiBlock_h
+#define vtk_m_cont_AssignerMultiBlock_h
+
+#include <vtkm/internal/Configure.h>
+#if defined(VTKM_ENABLE_MPI)
+
+#include <diy/assigner.hpp>
+#include <vtkm/cont/MultiBlock.h>
+
+namespace vtkm
+{
+namespace cont
+{
+
+/// \brief Assigner for `MultiBlock` blocks.
+///
+/// `AssignerMultiBlock` is a `diy::Assigner` implementation that uses
+/// `MultiBlock`'s block distribution to build global-id/rank associations
+/// needed for several `diy` operations.
+/// It uses a contiguous assignment strategy to map blocks to global ids i.e.
+/// blocks on rank 0 come first, then rank 1, etc. Any rank may have 0 blocks.
+///
+/// AssignerMultiBlock uses collectives in the constructor hence it is
+/// essential it gets created on all ranks irrespective of whether the rank has
+/// any blocks.
+///
+class VTKM_CONT_EXPORT AssignerMultiBlock : public diy::Assigner
+{
+public:
+  /// Initialize the assigner using a multiblock dataset.
+  /// This may initialize collective operations to populate the assigner with
+  /// information about blocks on all ranks.
+  VTKM_CONT
+  AssignerMultiBlock(const vtkm::cont::MultiBlock& mb);
+
+  ///@{
+  /// diy::Assigner API implementation.
+  VTKM_CONT
+  void local_gids(int rank, std::vector<int>& gids) const override;
+
+  VTKM_CONT
+  int rank(int gid) const override;
+  ///@}
+private:
+  std::vector<vtkm::Id> IScanBlockCounts;
+};
+}
+}
+
+#endif // defined(VTKM_ENABLE_MPI)
+#endif
diff --git a/vtkm/cont/CMakeLists.txt b/vtkm/cont/CMakeLists.txt
index f9f43103a..11384c114 100644
--- a/vtkm/cont/CMakeLists.txt
+++ b/vtkm/cont/CMakeLists.txt
@@ -45,6 +45,7 @@ set(headers
   ArrayHandleConcatenate.h
   ArrayRangeCompute.h
   ArrayRangeCompute.hxx
+  AssignerMultiBlock.h
   CellLocatorTwoLevelUniformGrid.h
   CellSet.h
   CellSetExplicit.h
@@ -58,6 +59,7 @@ set(headers
   DataSetBuilderRectilinear.h
   DataSetBuilderUniform.h
   DataSetFieldAdd.h
+  DecomposerMultiBlock.h
   DeviceAdapter.h
   DeviceAdapterAlgorithm.h
   DeviceAdapterListTag.h
@@ -94,6 +96,7 @@ set(header_impls
 
 set(sources
   ArrayHandle.cxx
+  AssignerMultiBlock.cxx
   CellSet.cxx
   CellSetExplicit.cxx
   CellSetStructured.cxx
diff --git a/vtkm/cont/DecomposerMultiBlock.h b/vtkm/cont/DecomposerMultiBlock.h
new file mode 100644
index 000000000..342968d83
--- /dev/null
+++ b/vtkm/cont/DecomposerMultiBlock.h
@@ -0,0 +1,57 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2015 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2015 UT-Battelle, LLC.
+//  Copyright 2015 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+#ifndef vtk_m_cont_DecomposerMultiBlock_h
+#define vtk_m_cont_DecomposerMultiBlock_h
+
+#include <vtkm/internal/Configure.h>
+#if defined(VTKM_ENABLE_MPI)
+#include <vtkm/cont/AssignerMultiBlock.h>
+
+namespace vtkm
+{
+namespace cont
+{
+
+/// \brief DIY Decomposer that uses a `MultiBlock`'s existing decomposition.
+///
+/// To create partners for various reduce operations, DIY requires a decomposer.
+/// This class provides an implementation that can use the multiblock's
+/// decomposition.
+///
+class VTKM_CONT_EXPORT DecomposerMultiBlock
+{
+public:
+  VTKM_CONT DecomposerMultiBlock(const diy::Assigner& assigner)
+    : divisions{ assigner.nblocks() }
+  {
+  }
+
+  using DivisionVector = std::vector<int>;
+
+  /// this public member is needed to satisfy decomposer concept for
+  /// partners in DIY.
+  DivisionVector divisions;
+};
+}
+}
+
+#endif // defined(VTKM_ENABLE_MPI)
+
+#endif
diff --git a/vtkm/cont/MultiBlock.cxx b/vtkm/cont/MultiBlock.cxx
index 9d71bfb50..f2bdd66ad 100644
--- a/vtkm/cont/MultiBlock.cxx
+++ b/vtkm/cont/MultiBlock.cxx
@@ -21,7 +21,9 @@
 #include <vtkm/StaticAssert.h>
 #include <vtkm/cont/ArrayCopy.h>
 #include <vtkm/cont/ArrayHandle.h>
+#include <vtkm/cont/AssignerMultiBlock.h>
 #include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/DecomposerMultiBlock.h>
 #include <vtkm/cont/DeviceAdapterAlgorithm.h>
 #include <vtkm/cont/DynamicArrayHandle.h>
 #include <vtkm/cont/EnvironmentTracker.h>
@@ -30,7 +32,11 @@
 #include <vtkm/cont/MultiBlock.h>
 
 #if defined(VTKM_ENABLE_MPI)
+#include <diy/decomposition.hpp>
 #include <diy/master.hpp>
+#include <diy/partners/all-reduce.hpp>
+#include <diy/partners/swap.hpp>
+#include <diy/reduce.hpp>
 
 namespace vtkm
 {
@@ -48,111 +54,21 @@ VTKM_CONT std::vector<typename PortalType::ValueType> CopyArrayPortalToVector(
   std::copy(iterators.GetBegin(), iterators.GetEnd(), result.begin());
   return result;
 }
+
+template <typename T>
+const vtkm::cont::DataSet& GetBlock(const vtkm::cont::MultiBlock& mb, const T&);
+
+template <>
+const vtkm::cont::DataSet& GetBlock(const vtkm::cont::MultiBlock& mb,
+                                    const diy::Master::ProxyWithLink& cp)
+{
+  const int lid = cp.master()->lid(cp.gid());
+  return mb.GetBlock(lid);
 }
 }
 }
-
-namespace std
-{
-
-namespace detail
-{
-
-template <typename T, size_t ElementSize = sizeof(T)>
-struct MPIPlus
-{
-  MPIPlus()
-  {
-    this->OpPtr = std::shared_ptr<MPI_Op>(new MPI_Op(MPI_NO_OP), [](MPI_Op* ptr) {
-      MPI_Op_free(ptr);
-      delete ptr;
-    });
-
-    MPI_Op_create(
-      [](void* a, void* b, int* len, MPI_Datatype*) {
-        T* ba = reinterpret_cast<T*>(a);
-        T* bb = reinterpret_cast<T*>(b);
-        for (int cc = 0; cc < (*len) / ElementSize; ++cc)
-        {
-          bb[cc] = ba[cc] + bb[cc];
-        }
-      },
-      1,
-      this->OpPtr.get());
-  }
-  ~MPIPlus() {}
-  operator MPI_Op() const { return *this->OpPtr.get(); }
-private:
-  std::shared_ptr<MPI_Op> OpPtr;
-};
-
-} // std::detail
-
-template <>
-struct plus<vtkm::Bounds>
-{
-  MPI_Op get_mpi_op() const { return this->Op; }
-  vtkm::Bounds operator()(const vtkm::Bounds& lhs, const vtkm::Bounds& rhs) const
-  {
-    return lhs + rhs;
-  }
-
-private:
-  std::detail::MPIPlus<vtkm::Bounds> Op;
-};
-
-template <>
-struct plus<vtkm::Range>
-{
-  MPI_Op get_mpi_op() const { return this->Op; }
-  vtkm::Range operator()(const vtkm::Range& lhs, const vtkm::Range& rhs) const { return lhs + rhs; }
-
-private:
-  std::detail::MPIPlus<vtkm::Range> Op;
-};
 }
 
-namespace diy
-{
-namespace mpi
-{
-namespace detail
-{
-template <>
-struct mpi_datatype<vtkm::Bounds>
-{
-  static MPI_Datatype datatype() { return get_mpi_datatype<vtkm::Float64>(); }
-  static const void* address(const vtkm::Bounds& x) { return &x; }
-  static void* address(vtkm::Bounds& x) { return &x; }
-  static int count(const vtkm::Bounds&) { return 6; }
-};
-
-template <>
-struct mpi_op<std::plus<vtkm::Bounds>>
-{
-  static MPI_Op get(const std::plus<vtkm::Bounds>& op) { return op.get_mpi_op(); }
-};
-
-template <>
-struct mpi_datatype<vtkm::Range>
-{
-  static MPI_Datatype datatype() { return get_mpi_datatype<vtkm::Float64>(); }
-  static const void* address(const vtkm::Range& x) { return &x; }
-  static void* address(vtkm::Range& x) { return &x; }
-  static int count(const vtkm::Range&) { return 2; }
-};
-
-template <>
-struct mpi_op<std::plus<vtkm::Range>>
-{
-  static MPI_Op get(const std::plus<vtkm::Range>& op) { return op.get_mpi_op(); }
-};
-
-} // diy::mpi::detail
-} // diy::mpi
-} // diy
-
-
 #endif
 
 namespace vtkm
@@ -311,26 +227,56 @@ VTKM_CONT vtkm::Bounds MultiBlock::GetBounds(vtkm::Id coordinate_system_index,
 
 #if defined(VTKM_ENABLE_MPI)
   auto world = vtkm::cont::EnvironmentTracker::GetCommunicator();
-  //const auto global_num_blocks = this->GetGlobalNumberOfBlocks();
+  diy::Master master(world,
+                     1,
+                     -1,
+                     []() -> void* { return new vtkm::Bounds(); },
+                     [](void* ptr) { delete static_cast<vtkm::Bounds*>(ptr); });
 
-  const auto num_blocks = this->GetNumberOfBlocks();
+  vtkm::cont::AssignerMultiBlock assigner(*this);
 
-  diy::Master master(world, 1, -1);
-  for (vtkm::Id cc = 0; cc < num_blocks; ++cc)
-  {
-    int gid = cc * world.size() + world.rank();
-    master.add(gid, const_cast<vtkm::cont::DataSet*>(&this->Blocks[cc]), new diy::Link());
-  }
+  // populate master with blocks from `this`.
+  diy::decompose(world.rank(), assigner, master);
 
-  master.foreach ([&](const vtkm::cont::DataSet* block, const diy::Master::ProxyWithLink& cp) {
-    auto coords = block->GetCoordinateSystem(coordinate_system_index);
-    const vtkm::Bounds bounds = coords.GetBounds(TypeList(), StorageList());
-    cp.all_reduce(bounds, std::plus<vtkm::Bounds>());
+  auto self = (*this);
+  master.foreach ([&](vtkm::Bounds* data, const diy::Master::ProxyWithLink& cp) {
+    const vtkm::cont::DataSet& block = vtkm::cont::detail::GetBlock(self, cp);
+    try
+    {
+      vtkm::cont::CoordinateSystem coords = block.GetCoordinateSystem(coordinate_system_index);
+      *data = coords.GetBounds(TypeList(), StorageList());
+    }
+    catch (const vtkm::cont::Error&)
+    { // error ignored: *data is left unassigned for this block and the reduction proceeds
+    }
   });
 
-  master.process_collectives();
-  auto bounds = master.proxy(0).get<vtkm::Bounds>();
-  return bounds;
+  vtkm::cont::DecomposerMultiBlock decomposer(assigner);
+  diy::RegularSwapPartners partners(decomposer, /*k=*/2);
+
+  auto callback =
+    [](vtkm::Bounds* data, const diy::ReduceProxy& srp, const diy::RegularSwapPartners&) {
+      // 1. dequeue.
+      std::vector<int> incoming;
+      srp.incoming(incoming);
+      vtkm::Bounds message;
+      for (const int gid : incoming)
+      {
+        srp.dequeue(gid, message);
+        data->Include(message);
+      }
+      // 2. enqueue
+      for (int cc = 0; cc < srp.out_link().size(); ++cc)
+      {
+        srp.enqueue(srp.out_link().target(cc), *data);
+      }
+    };
+  diy::reduce(master, assigner, partners, callback);
+  if (master.size())
+  {
+    return (*master.block<vtkm::Bounds>(0));
+  }
+  return vtkm::Bounds();
 
 #else
   const vtkm::Id index = coordinate_system_index;
@@ -443,65 +389,62 @@ VTKM_CONT vtkm::cont::ArrayHandle<vtkm::Range>
 MultiBlock::GetGlobalRange(const std::string& field_name, TypeList, StorageList) const
 {
 #if defined(VTKM_ENABLE_MPI)
-  auto world = vtkm::cont::EnvironmentTracker::GetCommunicator();
-  const auto num_blocks = this->GetNumberOfBlocks();
+  using BlockMetaData = std::vector<vtkm::Range>;
 
-  diy::Master master(world);
-  for (vtkm::Id cc = 0; cc < num_blocks; ++cc)
-  {
-    int gid = cc * world.size() + world.rank();
-    master.add(gid, const_cast<vtkm::cont::DataSet*>(&this->Blocks[cc]), new diy::Link());
-  }
+  auto comm = vtkm::cont::EnvironmentTracker::GetCommunicator();
+  diy::Master master(comm,
+                     1,
+                     -1,
+                     []() -> void* { return new BlockMetaData(); },
+                     [](void* ptr) { delete static_cast<BlockMetaData*>(ptr); });
 
-  // collect info about number of components in the field.
-  master.foreach ([&](const vtkm::cont::DataSet* dataset, const diy::Master::ProxyWithLink& cp) {
-    if (dataset->HasField(field_name))
+  vtkm::cont::AssignerMultiBlock assigner(*this);
+
+  diy::decompose(comm.rank(), assigner, master);
+
+  auto self = (*this);
+  master.foreach ([&](BlockMetaData* data, const diy::Master::ProxyWithLink& cp) {
+    const vtkm::cont::DataSet& block = vtkm::cont::detail::GetBlock(self, cp);
+    if (block.HasField(field_name))
     {
-      auto field = dataset->GetField(field_name);
+      auto field = block.GetField(field_name);
       const vtkm::cont::ArrayHandle<vtkm::Range> range = field.GetRange(TypeList(), StorageList());
-      vtkm::Id components = range.GetPortalConstControl().GetNumberOfValues();
-      cp.all_reduce(components, diy::mpi::maximum<vtkm::Id>());
+      *data = vtkm::cont::detail::CopyArrayPortalToVector(range.GetPortalConstControl());
     }
   });
-  master.process_collectives();
 
-  const vtkm::Id components = master.size() ? master.proxy(0).read<vtkm::Id>() : 0;
+  vtkm::cont::DecomposerMultiBlock decomposer(assigner);
+  diy::RegularSwapPartners partners(decomposer, /*k=*/2);
+  auto callback =
+    [](BlockMetaData* data, const diy::ReduceProxy& srp, const diy::RegularSwapPartners&) {
+      std::vector<int> incoming;
+      srp.incoming(incoming);
 
-  // clear all collectives.
-  master.foreach ([&](const vtkm::cont::DataSet*, const diy::Master::ProxyWithLink& cp) {
-    cp.collectives()->clear();
-  });
-
-  master.foreach ([&](const vtkm::cont::DataSet* dataset, const diy::Master::ProxyWithLink& cp) {
-    if (dataset->HasField(field_name))
-    {
-      auto field = dataset->GetField(field_name);
-      const vtkm::cont::ArrayHandle<vtkm::Range> range = field.GetRange(TypeList(), StorageList());
-      const auto v_range =
-        vtkm::cont::detail::CopyArrayPortalToVector(range.GetPortalConstControl());
-      for (const vtkm::Range& r : v_range)
+      // 1. dequeue
+      BlockMetaData message;
+      for (const int gid : incoming)
       {
-        cp.all_reduce(r, std::plus<vtkm::Range>());
+        srp.dequeue(gid, message);
+        data->resize(std::max(data->size(), message.size())); // grow to the longer of the two
+        for (size_t cc = 0; cc < message.size(); ++cc) // message may be shorter than data
+        {
+          (*data)[cc].Include(message[cc]);
+        }
       }
-      // if current block has less that the max number of components, just add invalid ranges for the rest.
-      for (vtkm::Id cc = static_cast<vtkm::Id>(v_range.size()); cc < components; ++cc)
+      // 2. enqueue
+      for (int cc = 0; cc < srp.out_link().size(); ++cc)
       {
-        cp.all_reduce(vtkm::Range(), std::plus<vtkm::Range>());
+        srp.enqueue(srp.out_link().target(cc), *data);
       }
-    }
-  });
-  master.process_collectives();
-  std::vector<vtkm::Range> ranges(components);
-  // FIXME: is master.size() == 0 i.e. there are no blocks on the current rank,
-  // this method won't return valid range.
-  if (master.size() > 0)
+    };
+
+  diy::reduce(master, assigner, partners, callback);
+
+  BlockMetaData ranges;
+  if (master.size())
   {
-    for (vtkm::Id cc = 0; cc < components; ++cc)
-    {
-      ranges[cc] = master.proxy(0).get<vtkm::Range>();
-    }
+    ranges = *(master.block<BlockMetaData>(0));
   }
-
   vtkm::cont::ArrayHandle<vtkm::Range> tmprange = vtkm::cont::make_ArrayHandle(ranges);
   vtkm::cont::ArrayHandle<vtkm::Range> range;
   vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandle(ranges), range);

From 3408e8e5e3d7a05a0ccf021f2471f073d6dbf24f Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Thu, 21 Dec 2017 15:26:52 -0500
Subject: [PATCH 10/24] Revert "diy: pass operator instance to mpi_op<>::get()"

This reverts commit c63f3635d53259894c780e131f489d10f5c2b48e.
---
 diy/include/diy/mpi/collectives.hpp | 18 +++++++++---------
 diy/include/diy/mpi/operations.hpp  | 14 +++++++-------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/diy/include/diy/mpi/collectives.hpp b/diy/include/diy/mpi/collectives.hpp
index 4324534e5..8d70bcf01 100644
--- a/diy/include/diy/mpi/collectives.hpp
+++ b/diy/include/diy/mpi/collectives.hpp
@@ -152,13 +152,13 @@ namespace mpi
       }
     }
 
-    static void reduce(const communicator& comm, const T& in, T& out, int root, const Op& op)
+    static void reduce(const communicator& comm, const T& in, T& out, int root, const Op&)
     {
       MPI_Reduce(Datatype::address(const_cast<T&>(in)),
                  Datatype::address(out),
                  Datatype::count(in),
                  Datatype::datatype(),
-                 detail::mpi_op<Op>::get(op),
+                 detail::mpi_op<Op>::get(),
                  root, comm);
     }
 
@@ -168,38 +168,38 @@ namespace mpi
                  Datatype::address(const_cast<T&>(in)),
                  Datatype::count(in),
                  Datatype::datatype(),
-                 detail::mpi_op<Op>::get(op),
+                 detail::mpi_op<Op>::get(),
                  root, comm);
     }
 
-    static void all_reduce(const communicator& comm, const T& in, T& out, const Op& op)
+    static void all_reduce(const communicator& comm, const T& in, T& out, const Op&)
     {
       MPI_Allreduce(Datatype::address(const_cast<T&>(in)),
                     Datatype::address(out),
                     Datatype::count(in),
                     Datatype::datatype(),
-                    detail::mpi_op<Op>::get(op),
+                    detail::mpi_op<Op>::get(),
                     comm);
     }
 
-    static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op& op)
+    static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op&)
     {
       out.resize(in.size());
       MPI_Allreduce(Datatype::address(const_cast<T&>(in[0])),
                     Datatype::address(out[0]),
                     in.size(),
                     Datatype::datatype(),
-                    detail::mpi_op<Op>::get(op),
+                    detail::mpi_op<Op>::get(),
                     comm);
     }
 
-    static void scan(const communicator& comm, const T& in, T& out, const Op& op)
+    static void scan(const communicator& comm, const T& in, T& out, const Op&)
     {
       MPI_Scan(Datatype::address(const_cast<T&>(in)),
                Datatype::address(out),
                Datatype::count(in),
                Datatype::datatype(),
-               detail::mpi_op<Op>::get(op),
+               detail::mpi_op<Op>::get(),
                comm);
     }
 
diff --git a/diy/include/diy/mpi/operations.hpp b/diy/include/diy/mpi/operations.hpp
index 9c38e58ae..2f95c0a72 100644
--- a/diy/include/diy/mpi/operations.hpp
+++ b/diy/include/diy/mpi/operations.hpp
@@ -14,13 +14,13 @@ namespace mpi
 
 namespace detail
 {
-  template<class T> struct mpi_op                           { static MPI_Op  get(const T&); };
-  template<class U> struct mpi_op< maximum<U> >             { static MPI_Op  get(const maximum<U>&) { return MPI_MAX; }  };
-  template<class U> struct mpi_op< minimum<U> >             { static MPI_Op  get(const minimum<U>&) { return MPI_MIN; }  };
-  template<class U> struct mpi_op< std::plus<U> >           { static MPI_Op  get(const std::plus<U>&) { return MPI_SUM; }  };
-  template<class U> struct mpi_op< std::multiplies<U> >     { static MPI_Op  get(const std::multiplies<U>&) { return MPI_PROD; }  };
-  template<class U> struct mpi_op< std::logical_and<U> >    { static MPI_Op  get(const std::logical_and<U>&) { return MPI_LAND; }  };
-  template<class U> struct mpi_op< std::logical_or<U> >     { static MPI_Op  get(const std::logical_or<U>&) { return MPI_LOR; }  };
+  template<class T> struct mpi_op                           { static MPI_Op  get(); };
+  template<class U> struct mpi_op< maximum<U> >             { static MPI_Op  get() { return MPI_MAX; }  };
+  template<class U> struct mpi_op< minimum<U> >             { static MPI_Op  get() { return MPI_MIN; }  };
+  template<class U> struct mpi_op< std::plus<U> >           { static MPI_Op  get() { return MPI_SUM; }  };
+  template<class U> struct mpi_op< std::multiplies<U> >     { static MPI_Op  get() { return MPI_PROD; }  };
+  template<class U> struct mpi_op< std::logical_and<U> >    { static MPI_Op  get() { return MPI_LAND; }  };
+  template<class U> struct mpi_op< std::logical_or<U> >     { static MPI_Op  get() { return MPI_LOR; }  };
 }
 }
 }

From ffc833fd8cb5bfb0a6a6d95b8b62f5dacef74cde Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Thu, 21 Dec 2017 15:31:26 -0500
Subject: [PATCH 11/24] Update diy to revision `ca5e7cf23`.

---
 diy/include/diy/master.hpp | 2 --
 diy/include/diy/stats.hpp  | 9 ++-------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/diy/include/diy/master.hpp b/diy/include/diy/master.hpp
index ec7319a60..97ccb8724 100644
--- a/diy/include/diy/master.hpp
+++ b/diy/include/diy/master.hpp
@@ -1062,8 +1062,6 @@ void
 diy::Master::
 flush()
 {
-
-  auto scoped = prof.scoped("comm");
 #ifdef DEBUG
   time_type start = get_time();
   unsigned wait = 1;
diff --git a/diy/include/diy/stats.hpp b/diy/include/diy/stats.hpp
index 0628146df..4866ccfb1 100644
--- a/diy/include/diy/stats.hpp
+++ b/diy/include/diy/stats.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 
 #include "log.hpp"      // need this for format
-#define DIY_PROFILE 1
+
 namespace diy
 {
 namespace stats
@@ -71,11 +71,7 @@ struct Profiler
         {
             const Event& e = events[i];
             auto time = std::chrono::duration_cast<std::chrono::microseconds>(e.stamp - start).count();
-            fmt::print(out, "{} {} {}\n",
-                            time / 1000000.,
-                            (e.begin ? '<' : '>'),
-                            e.name);
-            /*
+
             fmt::print(out, "{:02d}:{:02d}:{:02d}.{:06d} {}{}\n",
                             time/1000000/60/60,
                             time/1000000/60 % 60,
@@ -83,7 +79,6 @@ struct Profiler
                             time % 1000000,
                             (e.begin ? '<' : '>'),
                             e.name);
-                            */
         }
     }
 

From 1737bbe9ca1ea583150c95bcfd49184d071728ef Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Sat, 23 Dec 2017 12:47:56 -0500
Subject: [PATCH 12/24] exclude shadow warnings from DIY.

DIY code is rife with shadow warnings. Ignore those.
---
 CTestCustom.cmake.in | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CTestCustom.cmake.in b/CTestCustom.cmake.in
index db108069e..4f226f402 100644
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@@ -20,4 +20,6 @@
 
 list(APPEND CTEST_CUSTOM_WARNING_EXCEPTION
   ".*warning: ignoring loop annotation.*"
-) 
+  ".*diy.include.diy.*WShadow.*" # exclude `diy` shadow warnings.
+  ".*diy.include.diy.*note: shadowed.*" # exclude `diy` shadow warnings.
+)

From 954111f60e723ab278dcdad85f854a0b51cad638 Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Sat, 23 Dec 2017 13:06:02 -0500
Subject: [PATCH 13/24] exclude -Wunused-result from diy/storage.

These parts are marked as TODO in DIY and hence we'll ignore these
warnings till DIY fixes those.
---
 CTestCustom.cmake.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CTestCustom.cmake.in b/CTestCustom.cmake.in
index 4f226f402..23286b660 100644
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@@ -22,4 +22,5 @@ list(APPEND CTEST_CUSTOM_WARNING_EXCEPTION
   ".*warning: ignoring loop annotation.*"
   ".*diy.include.diy.*WShadow.*" # exclude `diy` shadow warnings.
   ".*diy.include.diy.*note: shadowed.*" # exclude `diy` shadow warnings.
+  ".*diy.include.diy.storage.hpp.*Wunused-result.*" # this is a TODO in DIY.
 )

From fc2f9f33915255f463844c972dc18e83c4455afc Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 10:44:07 -0500
Subject: [PATCH 14/24] Remove DIY.

We will add DIY back as a 3rd-party import.
---
 diy/CMakeLists.txt                            |   65 -
 diy/LEGAL.txt                                 |   19 -
 diy/LICENSE.txt                               |   41 -
 diy/include/diy/algorithms.hpp                |  191 -
 diy/include/diy/assigner.hpp                  |  126 -
 diy/include/diy/collection.hpp                |  121 -
 diy/include/diy/communicator.hpp              |   13 -
 diy/include/diy/constants.h                   |   22 -
 diy/include/diy/critical-resource.hpp         |   53 -
 diy/include/diy/decomposition.hpp             |  716 ---
 .../diy/detail/algorithms/kdtree-sampling.hpp |  450 --
 diy/include/diy/detail/algorithms/kdtree.hpp  |  569 ---
 diy/include/diy/detail/algorithms/sort.hpp    |  162 -
 diy/include/diy/detail/block_traits.hpp       |   31 -
 diy/include/diy/detail/collectives.hpp        |   54 -
 diy/include/diy/detail/reduce/all-to-all.hpp  |  169 -
 diy/include/diy/detail/traits.hpp             |  318 --
 diy/include/diy/fmt/format.cc                 |  935 ----
 diy/include/diy/fmt/format.h                  | 3834 -----------------
 diy/include/diy/fmt/ostream.cc                |   61 -
 diy/include/diy/fmt/ostream.h                 |  133 -
 diy/include/diy/grid.hpp                      |  153 -
 diy/include/diy/io/block.hpp                  |  396 --
 diy/include/diy/io/bov.hpp                    |  171 -
 diy/include/diy/io/numpy.hpp                  |  213 -
 diy/include/diy/link.hpp                      |  219 -
 diy/include/diy/log.hpp                       |  103 -
 diy/include/diy/master.hpp                    | 1203 ------
 diy/include/diy/mpi.hpp                       |   32 -
 diy/include/diy/mpi/collectives.hpp           |  328 --
 diy/include/diy/mpi/communicator.hpp          |   72 -
 diy/include/diy/mpi/constants.hpp             |   13 -
 diy/include/diy/mpi/datatypes.hpp             |   63 -
 diy/include/diy/mpi/io.hpp                    |  137 -
 diy/include/diy/mpi/operations.hpp            |   26 -
 diy/include/diy/mpi/optional.hpp              |   55 -
 diy/include/diy/mpi/point-to-point.hpp        |   98 -
 diy/include/diy/mpi/request.hpp               |   26 -
 diy/include/diy/mpi/status.hpp                |   30 -
 diy/include/diy/no-thread.hpp                 |   38 -
 diy/include/diy/partners/all-reduce.hpp       |   72 -
 diy/include/diy/partners/broadcast.hpp        |   62 -
 diy/include/diy/partners/common.hpp           |  204 -
 diy/include/diy/partners/merge.hpp            |   60 -
 diy/include/diy/partners/swap.hpp             |   43 -
 diy/include/diy/pick.hpp                      |  137 -
 diy/include/diy/point.hpp                     |  120 -
 diy/include/diy/proxy.hpp                     |  228 -
 diy/include/diy/reduce-operations.hpp         |   32 -
 diy/include/diy/reduce.hpp                    |  216 -
 diy/include/diy/serialization.hpp             |  456 --
 diy/include/diy/stats.hpp                     |  115 -
 diy/include/diy/storage.hpp                   |  228 -
 diy/include/diy/thread.hpp                    |   31 -
 diy/include/diy/thread/fast_mutex.h           |  248 --
 diy/include/diy/time.hpp                      |   33 -
 diy/include/diy/types.hpp                     |   85 -
 diy/include/diy/vertices.hpp                  |   54 -
 58 files changed, 13883 deletions(-)
 delete mode 100644 diy/CMakeLists.txt
 delete mode 100644 diy/LEGAL.txt
 delete mode 100644 diy/LICENSE.txt
 delete mode 100644 diy/include/diy/algorithms.hpp
 delete mode 100644 diy/include/diy/assigner.hpp
 delete mode 100644 diy/include/diy/collection.hpp
 delete mode 100644 diy/include/diy/communicator.hpp
 delete mode 100644 diy/include/diy/constants.h
 delete mode 100644 diy/include/diy/critical-resource.hpp
 delete mode 100644 diy/include/diy/decomposition.hpp
 delete mode 100644 diy/include/diy/detail/algorithms/kdtree-sampling.hpp
 delete mode 100644 diy/include/diy/detail/algorithms/kdtree.hpp
 delete mode 100644 diy/include/diy/detail/algorithms/sort.hpp
 delete mode 100644 diy/include/diy/detail/block_traits.hpp
 delete mode 100644 diy/include/diy/detail/collectives.hpp
 delete mode 100644 diy/include/diy/detail/reduce/all-to-all.hpp
 delete mode 100644 diy/include/diy/detail/traits.hpp
 delete mode 100644 diy/include/diy/fmt/format.cc
 delete mode 100644 diy/include/diy/fmt/format.h
 delete mode 100644 diy/include/diy/fmt/ostream.cc
 delete mode 100644 diy/include/diy/fmt/ostream.h
 delete mode 100644 diy/include/diy/grid.hpp
 delete mode 100644 diy/include/diy/io/block.hpp
 delete mode 100644 diy/include/diy/io/bov.hpp
 delete mode 100644 diy/include/diy/io/numpy.hpp
 delete mode 100644 diy/include/diy/link.hpp
 delete mode 100644 diy/include/diy/log.hpp
 delete mode 100644 diy/include/diy/master.hpp
 delete mode 100644 diy/include/diy/mpi.hpp
 delete mode 100644 diy/include/diy/mpi/collectives.hpp
 delete mode 100644 diy/include/diy/mpi/communicator.hpp
 delete mode 100644 diy/include/diy/mpi/constants.hpp
 delete mode 100644 diy/include/diy/mpi/datatypes.hpp
 delete mode 100644 diy/include/diy/mpi/io.hpp
 delete mode 100644 diy/include/diy/mpi/operations.hpp
 delete mode 100644 diy/include/diy/mpi/optional.hpp
 delete mode 100644 diy/include/diy/mpi/point-to-point.hpp
 delete mode 100644 diy/include/diy/mpi/request.hpp
 delete mode 100644 diy/include/diy/mpi/status.hpp
 delete mode 100644 diy/include/diy/no-thread.hpp
 delete mode 100644 diy/include/diy/partners/all-reduce.hpp
 delete mode 100644 diy/include/diy/partners/broadcast.hpp
 delete mode 100644 diy/include/diy/partners/common.hpp
 delete mode 100644 diy/include/diy/partners/merge.hpp
 delete mode 100644 diy/include/diy/partners/swap.hpp
 delete mode 100644 diy/include/diy/pick.hpp
 delete mode 100644 diy/include/diy/point.hpp
 delete mode 100644 diy/include/diy/proxy.hpp
 delete mode 100644 diy/include/diy/reduce-operations.hpp
 delete mode 100644 diy/include/diy/reduce.hpp
 delete mode 100644 diy/include/diy/serialization.hpp
 delete mode 100644 diy/include/diy/stats.hpp
 delete mode 100644 diy/include/diy/storage.hpp
 delete mode 100644 diy/include/diy/thread.hpp
 delete mode 100644 diy/include/diy/thread/fast_mutex.h
 delete mode 100644 diy/include/diy/time.hpp
 delete mode 100644 diy/include/diy/types.hpp
 delete mode 100644 diy/include/diy/vertices.hpp

diff --git a/diy/CMakeLists.txt b/diy/CMakeLists.txt
deleted file mode 100644
index 0b3fb112f..000000000
--- a/diy/CMakeLists.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-##=============================================================================
-##
-##  Copyright (c) Kitware, Inc.
-##  All rights reserved.
-##  See LICENSE.txt for details.
-##
-##  This software is distributed WITHOUT ANY WARRANTY; without even
-##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
-##  PURPOSE.  See the above copyright notice for more information.
-##
-##  Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
-##  Copyright 2017 UT-Battelle, LLC.
-##  Copyright 2017 Los Alamos National Security.
-##
-##  Under the terms of Contract DE-NA0003525 with NTESS,
-##  the U.S. Government retains certain rights in this software.
-##  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
-##  Laboratory (LANL), the U.S. Government retains certain rights in
-##  this software.
-##
-##=============================================================================
-
-#==============================================================================
-# See License.txt
-#==============================================================================
-add_library(diy INTERFACE)
-
-# diy needs C++11
-target_compile_features(diy INTERFACE cxx_auto_type)
-
-target_include_directories(diy INTERFACE
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  $<INSTALL_INTERFACE:${VTKm_INSTALL_INCLUDE_DIR}>)
-
-# presently, this dependency is required. Make it optional in the future.
-set(arg)
-foreach(apath IN LISTS MPI_C_INCLUDE_PATH MPI_CXX_INCLUDE_PATH)
-  list(APPEND arg $<BUILD_INTERFACE:${apath}>)
-endforeach()
-target_include_directories(diy INTERFACE ${arg})
-
-target_link_libraries(diy INTERFACE
-  $<BUILD_INTERFACE:${MPI_C_LIBRARIES}>
-  $<BUILD_INTERFACE:${MPI_CXX_LIBRARIES}>)
-
-if(MPI_C_COMPILE_DEFINITIONS)
-  target_compile_definitions(diy INTERFACE
-    $<$<COMPILE_LANGUAGE:C>:${MPI_C_COMPILE_DEFINITIONS}>)
-endif()
-if(MPI_CXX_COMPILE_DEFNITIONS)
-  target_compile_definitions(diy INTERFACE
-    $<$<COMPILE_LANGUAGE:CXX>:${MPI_CXX_COMPILE_DEFNITIONS>)
-endif()
-
-install(TARGETS diy
-  EXPORT ${VTKm_EXPORT_NAME})
-
-# Install headers
-install(DIRECTORY include/diy
-  DESTINATION ${VTKm_INSTALL_INCLUDE_DIR})
-
-# Install other files.
-install(FILES LEGAL.txt LICENSE.txt
-  DESTINATION ${VTKm_INSTALL_INCLUDE_DIR}/diy
-  )
diff --git a/diy/LEGAL.txt b/diy/LEGAL.txt
deleted file mode 100644
index 66955ef03..000000000
--- a/diy/LEGAL.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright Notice
-
-DIY2, Copyright (c) 2015, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required
-approvals from the U.S. Dept. of Energy).  All rights reserved.
-
-If you have questions about your rights to use or distribute this software,
-please contact Berkeley Lab's Technology Transfer Department at  TTD@lbl.gov.
-
-NOTICE.  This software is owned by the U.S. Department of Energy.  As such, the
-U.S. Government has been granted for itself and others acting on its behalf a
-paid-up, nonexclusive, irrevocable, worldwide license in the Software to
-reproduce, prepare derivative works, and perform publicly and display publicly.
-Beginning five (5) years after the date permission to assert copyright is
-obtained from the U.S. Department of Energy, and subject to any subsequent five
-(5) year renewals, the U.S. Government is granted for itself and others acting
-on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
-Software to reproduce, prepare derivative works, distribute copies to the
-public, perform publicly and display publicly, and to permit others to do so.
diff --git a/diy/LICENSE.txt b/diy/LICENSE.txt
deleted file mode 100644
index 7607d2ca1..000000000
--- a/diy/LICENSE.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-License Agreement
-
-"DIY2, Copyright (c) 2015, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required
-approvals from the U.S. Dept. of Energy).  All rights reserved."
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-(1) Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-(2) Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation and/or
-other materials provided with the distribution.
-
-(3) Neither the name of the University of California, Lawrence Berkeley National
-Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used
-to endorse or promote products derived from this software without specific prior
-written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-You are under no obligation whatsoever to provide any bug fixes, patches, or
-upgrades to the features, functionality or performance of the source code
-("Enhancements") to anyone; however, if you choose to make your Enhancements
-available either publicly, or directly to Lawrence Berkeley National Laboratory,
-without imposing a separate written license agreement for such Enhancements,
-then you hereby grant the following license: a  non-exclusive, royalty-free
-perpetual license to install, use, modify, prepare derivative works, incorporate
-into other computer software, distribute, and sublicense such enhancements or
-derivative works thereof, in binary and source code form.
diff --git a/diy/include/diy/algorithms.hpp b/diy/include/diy/algorithms.hpp
deleted file mode 100644
index 23215a2c3..000000000
--- a/diy/include/diy/algorithms.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifndef DIY_ALGORITHMS_HPP
-#define DIY_ALGORITHMS_HPP
-
-#include <vector>
-
-#include "master.hpp"
-#include "assigner.hpp"
-#include "reduce.hpp"
-#include "reduce-operations.hpp"
-#include "partners/swap.hpp"
-
-#include "detail/algorithms/sort.hpp"
-#include "detail/algorithms/kdtree.hpp"
-#include "detail/algorithms/kdtree-sampling.hpp"
-
-#include "log.hpp"
-
-namespace diy
-{
-    /**
-     * \ingroup Algorithms
-     * \brief sample sort `values` of each block, store the boundaries between blocks in `samples`
-     */
-    template<class Block, class T, class Cmp>
-    void sort(Master&                   master,               //!< master object
-              const Assigner&           assigner,             //!< assigner object
-              std::vector<T> Block::*   values,               //!< all values to sort
-              std::vector<T> Block::*   samples,              //!< (output) boundaries of blocks
-              size_t                    num_samples,          //!< desired number of samples
-              const Cmp&                cmp,                  //!< comparison function
-              int                       k   = 2,              //!< k-ary reduction will be used
-              bool                      samples_only = false) //!< false: results will be all_to_all exchanged; true: only sort but don't exchange results
-    {
-        bool immediate = master.immediate();
-        master.set_immediate(false);
-
-        // NB: although sorter will go out of scope, its member functions sample()
-        //     and exchange() will return functors whose copies get saved inside reduce
-        detail::SampleSort<Block,T,Cmp> sorter(values, samples, cmp, num_samples);
-
-        // swap-reduce to all-gather samples
-        RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()), assigner.nblocks());
-        RegularSwapPartners   partners(decomposer, k);
-        reduce(master, assigner, partners, sorter.sample(), detail::SkipIntermediate(partners.rounds()));
-
-        // all_to_all to exchange the values
-        if (!samples_only)
-            all_to_all(master, assigner, sorter.exchange(), k);
-
-        master.set_immediate(immediate);
-    }
-
-
-    /**
-     * \ingroup Algorithms
-     * \brief sample sort `values` of each block, store the boundaries between blocks in `samples`
-     * shorter version of above sort algorithm with the default less-than comparator used for T
-     * and all_to_all exchange included
-     */
-    template<class Block, class T>
-    void sort(Master&                   master,      //!< master object
-              const Assigner&           assigner,    //!< assigner object
-              std::vector<T> Block::*   values,      //!< all values to sort
-              std::vector<T> Block::*   samples,     //!< (output) boundaries of blocks
-              size_t                    num_samples, //!< desired number of samples
-              int                       k   = 2)     //!< k-ary reduction will be used
-    {
-        sort(master, assigner, values, samples, num_samples, std::less<T>(), k);
-    }
-
-    /**
-     * \ingroup Algorithms
-     * \brief build a kd-tree and sort a set of points into it (use histograms to determine split values)
-     */
-    template<class Block, class Point>
-    void kdtree(Master&                         master,      //!< master object
-                const Assigner&                 assigner,    //!< assigner object
-                int                             dim,         //!< dimensionality
-                const ContinuousBounds&         domain,      //!< global data extents
-                std::vector<Point>  Block::*    points,      //!< input points to sort into kd-tree
-                size_t                          bins,        //!< number of histogram bins for splitting a dimension
-                bool                            wrap = false)//!< periodic boundaries in all dimensions
-    {
-        if (assigner.nblocks() & (assigner.nblocks() - 1))
-            throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
-
-        typedef     diy::RegularContinuousLink      RCLink;
-
-        for (size_t i = 0; i < master.size(); ++i)
-        {
-            RCLink* link   = static_cast<RCLink*>(master.link(i));
-            *link = RCLink(dim, domain, domain);
-
-            if (wrap)       // set up the links to self
-            {
-                diy::BlockID self = { master.gid(i), master.communicator().rank() };
-                for (int j = 0; j < dim; ++j)
-                {
-                    diy::Direction dir, wrap_dir;
-
-                    // left
-                    dir[j] = -1; wrap_dir[j] = -1;
-                    link->add_neighbor(self);
-                    link->add_bounds(domain);
-                    link->add_direction(dir);
-                    link->add_wrap(wrap_dir);
-
-                    // right
-                    dir[j] = 1; wrap_dir[j] = 1;
-                    link->add_neighbor(self);
-                    link->add_bounds(domain);
-                    link->add_direction(dir);
-                    link->add_wrap(wrap_dir);
-                }
-            }
-        }
-
-        detail::KDTreePartition<Block,Point>    kdtree_partition(dim, points, bins);
-
-        detail::KDTreePartners                  partners(dim, assigner.nblocks(), wrap, domain);
-        reduce(master, assigner, partners, kdtree_partition);
-
-        // update master.expected to match the links
-        int expected = 0;
-        for (size_t i = 0; i < master.size(); ++i)
-            expected += master.link(i)->size_unique();
-        master.set_expected(expected);
-    }
-
-    /**
-     * \ingroup Algorithms
-     * \brief build a kd-tree and sort a set of points into it (use sampling to determine split values)
-     */
-    template<class Block, class Point>
-    void kdtree_sampling
-               (Master&                         master,      //!< master object
-                const Assigner&                 assigner,    //!< assigner object
-                int                             dim,         //!< dimensionality
-                const ContinuousBounds&         domain,      //!< global data extents
-                std::vector<Point>  Block::*    points,      //!< input points to sort into kd-tree
-                size_t                          samples,     //!< number of samples to take in each block
-                bool                            wrap = false)//!< periodic boundaries in all dimensions
-    {
-        if (assigner.nblocks() & (assigner.nblocks() - 1))
-            throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
-
-        typedef     diy::RegularContinuousLink      RCLink;
-
-        for (size_t i = 0; i < master.size(); ++i)
-        {
-            RCLink* link   = static_cast<RCLink*>(master.link(i));
-            *link = RCLink(dim, domain, domain);
-
-            if (wrap)       // set up the links to self
-            {
-                diy::BlockID self = { master.gid(i), master.communicator().rank() };
-                for (int j = 0; j < dim; ++j)
-                {
-                    diy::Direction dir, wrap_dir;
-
-                    // left
-                    dir[j] = -1; wrap_dir[j] = -1;
-                    link->add_neighbor(self);
-                    link->add_bounds(domain);
-                    link->add_direction(dir);
-                    link->add_wrap(wrap_dir);
-
-                    // right
-                    dir[j] = 1; wrap_dir[j] = 1;
-                    link->add_neighbor(self);
-                    link->add_bounds(domain);
-                    link->add_direction(dir);
-                    link->add_wrap(wrap_dir);
-                }
-            }
-        }
-
-        detail::KDTreeSamplingPartition<Block,Point>    kdtree_partition(dim, points, samples);
-
-        detail::KDTreePartners                          partners(dim, assigner.nblocks(), wrap, domain);
-        reduce(master, assigner, partners, kdtree_partition);
-
-        // update master.expected to match the links
-        int expected = 0;
-        for (size_t i = 0; i < master.size(); ++i)
-            expected += master.link(i)->size_unique();
-        master.set_expected(expected);
-    }
-}
-
-#endif
diff --git a/diy/include/diy/assigner.hpp b/diy/include/diy/assigner.hpp
deleted file mode 100644
index 957596ddc..000000000
--- a/diy/include/diy/assigner.hpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef DIY_ASSIGNER_HPP
-#define DIY_ASSIGNER_HPP
-
-#include <vector>
-
-namespace diy
-{
-  // Derived types should define
-  //   int rank(int gid) const
-  // that converts a global block id to a rank that it's assigned to.
-  class Assigner
-  {
-    public:
-     /**
-      * \ingroup Assignment
-      * \brief Manages how blocks are assigned to processes
-      */
-                    Assigner(int size,     //!< total number of processes
-                             int nblocks   //!< total (global) number of blocks
-                             ):
-                      size_(size), nblocks_(nblocks)    {}
-
-      //! returns the total number of process ranks
-      int           size() const                        { return size_; }
-      //! returns the total number of global blocks
-      int           nblocks() const                     { return nblocks_; }
-      //! sets the total number of global blocks
-      void          set_nblocks(int nblocks)            { nblocks_ = nblocks; }
-      //! gets the local gids for a given process rank
-      virtual void  local_gids(int rank, std::vector<int>& gids) const   =0;
-      //! returns the process rank of the block with global id gid (need not be local)
-      virtual int   rank(int gid) const     =0;
-
-    private:
-      int           size_;      // total number of ranks
-      int           nblocks_;   // total number of blocks
-  };
-
-  class ContiguousAssigner: public Assigner
-  {
-    public:
-     /**
-      * \ingroup Assignment
-      * \brief Assigns blocks to processes in contiguous gid (block global id) order
-      */
-            ContiguousAssigner(int size,     //!< total number of processes
-                               int nblocks   //!< total (global) number of blocks
-                               ):
-              Assigner(size, nblocks)           {}
-
-      using Assigner::size;
-      using Assigner::nblocks;
-
-      int   rank(int gid) const override
-      {
-          int div = nblocks() / size();
-          int mod = nblocks() % size();
-          int r = gid / (div + 1);
-          if (r < mod)
-          {
-              return r;
-          } else
-          {
-              return mod + (gid - (div + 1)*mod)/div;
-          }
-      }
-      inline
-      void  local_gids(int rank, std::vector<int>& gids) const override;
-  };
-
-  class RoundRobinAssigner: public Assigner
-  {
-    public:
-     /**
-      * \ingroup Assignment
-      * \brief Assigns blocks to processes in cyclic or round-robin gid (block global id) order
-      */
-            RoundRobinAssigner(int size,     //!< total number of processes
-                               int nblocks   //!< total (global) number of blocks
-                               ):
-              Assigner(size, nblocks)           {}
-
-      using Assigner::size;
-      using Assigner::nblocks;
-
-      int   rank(int gid) const override        { return gid % size(); }
-      inline
-      void  local_gids(int rank, std::vector<int>& gids) const override;
-  };
-}
-
-void
-diy::ContiguousAssigner::
-local_gids(int rank, std::vector<int>& gids) const
-{
-  int div = nblocks() / size();
-  int mod = nblocks() % size();
-
-  int from, to;
-  if (rank < mod)
-      from = rank * (div + 1);
-  else
-      from = mod * (div + 1) + (rank - mod) * div;
-
-  if (rank + 1 < mod)
-      to = (rank + 1) * (div + 1);
-  else
-      to = mod * (div + 1) + (rank + 1 - mod) * div;
-
-  for (int gid = from; gid < to; ++gid)
-    gids.push_back(gid);
-}
-
-void
-diy::RoundRobinAssigner::
-local_gids(int rank, std::vector<int>& gids) const
-{
-  int cur = rank;
-  while (cur < nblocks())
-  {
-    gids.push_back(cur);
-    cur += size();
-  }
-}
-
-#endif
diff --git a/diy/include/diy/collection.hpp b/diy/include/diy/collection.hpp
deleted file mode 100644
index c24af95f5..000000000
--- a/diy/include/diy/collection.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef DIY_COLLECTION_HPP
-#define DIY_COLLECTION_HPP
-
-#include <vector>
-
-#include "serialization.hpp"
-#include "storage.hpp"
-#include "thread.hpp"
-
-
-namespace diy
-{
-  class Collection
-  {
-    public:
-      typedef       void*                                       Element;
-      typedef       std::vector<Element>                        Elements;
-      typedef       critical_resource<int, recursive_mutex>     CInt;
-
-      typedef       void* (*Create)();
-      typedef       void  (*Destroy)(void*);
-      typedef       detail::Save                                Save;
-      typedef       detail::Load                                Load;
-
-    public:
-                    Collection(Create               create,
-                               Destroy              destroy,
-                               ExternalStorage*     storage,
-                               Save                 save,
-                               Load                 load):
-                        create_(create),
-                        destroy_(destroy),
-                        storage_(storage),
-                        save_(save),
-                        load_(load),
-                        in_memory_(0)               {}
-
-      size_t        size() const                    { return elements_.size(); }
-      const CInt&   in_memory() const               { return in_memory_; }
-      inline void   clear();
-
-      int           add(Element e)                  { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return elements_.size() - 1; }
-      void*         release(int i)                  { void* e = get(i); elements_[i] = 0; return e; }
-
-      void*         find(int i) const               { return elements_[i]; }                        // possibly returns 0, if the element is unloaded
-      void*         get(int i)                      { if (!find(i)) load(i); return find(i); }      // loads the element first, and then returns its address
-
-      int           available() const               { int i = 0; for (; i < (int)size(); ++i) if (find(i) != 0) break; return i; }
-
-      inline void   load(int i);
-      inline void   unload(int i);
-
-      Create        creator() const                 { return create_; }
-      Destroy       destroyer() const               { return destroy_; }
-      Load          loader() const                  { return load_; }
-      Save          saver() const                   { return save_; }
-
-      void*         create() const                  { return create_(); }
-      void          destroy(int i)                  { if (find(i)) { destroy_(find(i)); elements_[i] = 0; } else if (external_[i] != -1) storage_->destroy(external_[i]); }
-
-      bool          own() const                     { return destroy_ != 0; }
-
-      ExternalStorage*      storage() const         { return storage_; }
-
-    private:
-      Create                create_;
-      Destroy               destroy_;
-      ExternalStorage*      storage_;
-      Save                  save_;
-      Load                  load_;
-
-      Elements              elements_;
-      std::vector<int>      external_;
-      CInt                  in_memory_;
-  };
-}
-
-void
-diy::Collection::
-clear()
-{
-  if (own())
-    for (size_t i = 0; i < size(); ++i)
-      destroy(i);
-  elements_.clear();
-  external_.clear();
-  *in_memory_.access() = 0;
-}
-
-void
-diy::Collection::
-unload(int i)
-{
-  //BinaryBuffer bb;
-  void* e = find(i);
-  //save_(e, bb);
-  //external_[i] = storage_->put(bb);
-  external_[i] = storage_->put(e, save_);
-
-  destroy_(e);
-  elements_[i] = 0;
-
-  --(*in_memory_.access());
-}
-
-void
-diy::Collection::
-load(int i)
-{
-  //BinaryBuffer bb;
-  //storage_->get(external_[i], bb);
-  void* e = create_();
-  //load_(e, bb);
-  storage_->get(external_[i], e, load_);
-  elements_[i] = e;
-  external_[i] = -1;
-
-  ++(*in_memory_.access());
-}
-
-#endif
diff --git a/diy/include/diy/communicator.hpp b/diy/include/diy/communicator.hpp
deleted file mode 100644
index b95708298..000000000
--- a/diy/include/diy/communicator.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef DIY_COMMUNICATOR_HPP
-#define DIY_COMMUNICATOR_HPP
-
-#warning "diy::Communicator (in diy/communicator.hpp) is deprecated, use diy::mpi::communicator directly"
-
-#include "mpi.hpp"
-
-namespace diy
-{
-  typedef mpi::communicator         Communicator;
-}
-
-#endif
diff --git a/diy/include/diy/constants.h b/diy/include/diy/constants.h
deleted file mode 100644
index e3c9cc563..000000000
--- a/diy/include/diy/constants.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef DIY_CONSTANTS_H
-#define DIY_CONSTANTS_H
-
-// Default DIY_MAX_DIM to 4, unless provided by the user
-// (used for static min/max size in various Bounds)
-#ifndef DIY_MAX_DIM
-#define DIY_MAX_DIM 4
-#endif
-
-enum
-{
-  DIY_X0 = 0x01, /* minimum-side x (left) neighbor */
-  DIY_X1 = 0x02, /* maximum-side x (right) neighbor */
-  DIY_Y0 = 0x04, /* minimum-side y (bottom) neighbor */
-  DIY_Y1 = 0x08, /* maximum-side y (top) neighbor */
-  DIY_Z0 = 0x10, /* minimum-side z (back) neighbor */
-  DIY_Z1 = 0x20, /* maximum-side z (front)neighbor */
-  DIY_T0 = 0x40, /* minimum-side t (earlier) neighbor */
-  DIY_T1 = 0x80  /* maximum-side t (later) neighbor */
-};
-
-#endif
diff --git a/diy/include/diy/critical-resource.hpp b/diy/include/diy/critical-resource.hpp
deleted file mode 100644
index 61a5a4b8a..000000000
--- a/diy/include/diy/critical-resource.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef DIY_CRITICAL_RESOURCE_HPP
-#define DIY_CRITICAL_RESOURCE_HPP
-
-namespace diy
-{
-  // TODO: when not running under C++11, i.e., when lock_guard is TinyThread's
-  //       lock_guard, and not C++11's unique_lock, this implementation might
-  //       be buggy since the copy constructor is invoked when
-  //       critical_resource::access() returns an instance of this class. Once
-  //       the temporary is destroyed the mutex is unlocked. I'm not 100%
-  //       certain of this because I'd expect a deadlock on copy constructor,
-  //       but it's clearly not happening -- so I may be missing something.
-  //       (This issue will take care of itself in DIY3 once we switch to C++11 completely.)
-  template<class T, class Mutex>
-  class resource_accessor
-  {
-    public:
-                resource_accessor(T& x, Mutex& m):
-                    x_(x), lock_(m)                         {}
-
-      T&        operator*()                                 { return x_; }
-      T*        operator->()                                { return &x_; }
-      const T&  operator*() const                           { return x_; }
-      const T*  operator->() const                          { return &x_; }
-
-    private:
-      T&                        x_;
-      lock_guard<Mutex>         lock_;
-  };
-
-  template<class T, class Mutex = fast_mutex>
-  class critical_resource
-  {
-    public:
-      typedef           resource_accessor<T, Mutex>         accessor;
-      typedef           resource_accessor<const T, Mutex>   const_accessor;     // eventually, try shared locking
-
-    public:
-                        critical_resource()                 {}
-                        critical_resource(const T& x):
-                            x_(x)                           {}
-
-      accessor          access()                            { return accessor(x_, m_); }
-      const_accessor    const_access() const                { return const_accessor(x_, m_); }
-
-    private:
-      T                 x_;
-      mutable Mutex     m_;
-  };
-}
-
-
-#endif
diff --git a/diy/include/diy/decomposition.hpp b/diy/include/diy/decomposition.hpp
deleted file mode 100644
index 51dfc5af2..000000000
--- a/diy/include/diy/decomposition.hpp
+++ /dev/null
@@ -1,716 +0,0 @@
-#ifndef DIY_DECOMPOSITION_HPP
-#define DIY_DECOMPOSITION_HPP
-
-#include <vector>
-#include <algorithm>
-#include <iostream>
-#include <cmath>
-#include <sstream>
-#include <stdexcept>
-
-#include "link.hpp"
-#include "assigner.hpp"
-#include "master.hpp"
-
-namespace diy
-{
-namespace detail
-{
-  template<class Bounds_, class Enable = void>
-  struct BoundsHelper;
-
-  // discrete bounds
-  template<class Bounds>
-  struct BoundsHelper<Bounds, typename std::enable_if<std::is_integral<typename Bounds::Coordinate>::value>::type>
-  {
-    using Coordinate = typename Bounds::Coordinate;
-
-    static Coordinate   from(int i, int n, Coordinate min, Coordinate max, bool)          { return min + (max - min + 1)/n * i; }
-    static Coordinate   to  (int i, int n, Coordinate min, Coordinate max, bool shared_face)
-    {
-      if (i == n - 1)
-        return max;
-      else
-        return from(i+1, n, min, max, shared_face) - (shared_face ? 0 : 1);
-    }
-
-    static int          lower(Coordinate x, int n, Coordinate min, Coordinate max, bool shared)
-    {
-        Coordinate width = (max - min + 1)/n;
-        Coordinate res = (x - min)/width;
-        if (res >= n) res = n - 1;
-
-        if (shared && x == from(res, n, min, max, shared))
-            --res;
-        return res;
-    }
-    static int          upper(Coordinate x, int n, Coordinate min, Coordinate max, bool shared)
-    {
-        Coordinate width = (max - min + 1)/n;
-        Coordinate res = (x - min)/width + 1;
-        if (shared && x == from(res, n, min, max, shared))
-            ++res;
-        return res;
-    }
-  };
-
-  // continuous bounds
-  template<class Bounds>
-  struct BoundsHelper<Bounds, typename std::enable_if<std::is_floating_point<typename Bounds::Coordinate>::value>::type>
-  {
-    using Coordinate = typename Bounds::Coordinate;
-
-    static Coordinate   from(int i, int n, Coordinate min, Coordinate max, bool)      { return min + (max - min)/n * i; }
-    static Coordinate   to  (int i, int n, Coordinate min, Coordinate max, bool)      { return min + (max - min)/n * (i+1); }
-
-    static int          lower(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; Coordinate res = std::floor((x - min)/width); if (min + res*width == x) return (res - 1); else return res; }
-    static int          upper(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; Coordinate res = std::ceil ((x - min)/width); if (min + res*width == x) return (res + 1); else return res; }
-  };
-}
-
-  //! \ingroup Decomposition
-  //! Decomposes a regular (discrete or continuous) domain into even blocks;
-  //! creates Links with Bounds along the way.
-  template<class Bounds_>
-  struct RegularDecomposer
-  {
-    typedef         Bounds_                                         Bounds;
-    typedef         typename BoundsValue<Bounds>::type              Coordinate;
-    typedef         typename RegularLinkSelector<Bounds>::type      Link;
-
-    using Creator = std::function<void(int,      Bounds, Bounds, Bounds, Link)>;
-    using Updater = std::function<void(int, int, Bounds, Bounds, Bounds, Link)>;
-
-    typedef         std::vector<bool>                               BoolVector;
-    typedef         std::vector<Coordinate>                         CoordinateVector;
-    typedef         std::vector<int>                                DivisionsVector;
-
-    /// @param dim:        dimensionality of the decomposition
-    /// @param domain:     bounds of global domain
-    /// @param nblocks:    total number of global blocks
-    /// @param share_face: indicates dimensions on which to share block faces
-    /// @param wrap:       indicates dimensions on which to wrap the boundary
-    /// @param ghosts:     indicates how many ghosts to use in each dimension
-    /// @param divisions:  indicates how many cuts to make along each dimension
-    ///                   (0 means "no constraint," i.e., leave it up to the algorithm)
-                    RegularDecomposer(int               dim_,
-                                      const Bounds&     domain_,
-                                      int               nblocks_,
-                                      BoolVector        share_face_ = BoolVector(),
-                                      BoolVector        wrap_       = BoolVector(),
-                                      CoordinateVector  ghosts_     = CoordinateVector(),
-                                      DivisionsVector   divisions_  = DivisionsVector()):
-                      dim(dim_), domain(domain_), nblocks(nblocks_),
-                      share_face(share_face_),
-                      wrap(wrap_), ghosts(ghosts_), divisions(divisions_)
-    {
-      if ((int) share_face.size() < dim)  share_face.resize(dim);
-      if ((int) wrap.size() < dim)        wrap.resize(dim);
-      if ((int) ghosts.size() < dim)      ghosts.resize(dim);
-      if ((int) divisions.size() < dim)   divisions.resize(dim);
-
-      fill_divisions(divisions);
-    }
-
-    // Calls create(int gid, const Bounds& bounds, const Link& link)
-    void            decompose(int rank, const Assigner& assigner, const Creator& create);
-
-    void            decompose(int rank, const Assigner& assigner, Master& master, const Updater& update);
-
-    void            decompose(int rank, const Assigner& assigner, Master& master);
-
-    // find lowest gid that owns a particular point
-    template<class Point>
-    int             lowest_gid(const Point& p) const;
-
-    void            gid_to_coords(int gid, DivisionsVector& coords) const       { gid_to_coords(gid, coords, divisions); }
-    int             coords_to_gid(const DivisionsVector& coords) const          { return coords_to_gid(coords, divisions); }
-    void            fill_divisions(std::vector<int>& divisions) const;
-
-    void            fill_bounds(Bounds& bounds, const DivisionsVector& coords, bool add_ghosts = false) const;
-    void            fill_bounds(Bounds& bounds, int gid, bool add_ghosts = false) const;
-
-    static bool     all(const std::vector<int>& v, int x);
-    static void     gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions);
-    static int      coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions);
-
-    static void     factor(std::vector<unsigned>& factors, int n);
-
-    // Point to GIDs functions
-    template<class Point>
-    void            point_to_gids(std::vector<int>& gids, const Point& p) const;
-
-    //! returns gid of a block that contains the point; ignores ghosts
-    template<class Point>
-    int             point_to_gid(const Point& p) const;
-
-    template<class Point>
-    int             num_gids(const Point& p) const;
-
-    template<class Point>
-    void            top_bottom(int& top, int& bottom, const Point& p, int axis) const;
-
-
-    int               dim;
-    Bounds            domain;
-    int               nblocks;
-    BoolVector        share_face;
-    BoolVector        wrap;
-    CoordinateVector  ghosts;
-    DivisionsVector   divisions;
-
-  };
-
-  /**
-   * \ingroup Decomposition
-   * \brief Decomposes the domain into a prescribed pattern of blocks.
-   *
-   * @param dim        dimension of the domain
-   * @param rank       local rank
-   * @param assigner   decides how processors are assigned to blocks (maps a gid to a rank)
-   *                   also communicates the total number of blocks
-   * @param create     the callback functor
-   * @param wrap       indicates dimensions on which to wrap the boundary
-   * @param ghosts     indicates how many ghosts to use in each dimension
-   * @param divs       indicates how many cuts to make along each dimension
-   *                   (0 means "no constraint," i.e., leave it up to the algorithm)
-   *
-   * `create(...)` is called with each block assigned to the local domain. See [decomposition example](#decomposition-example).
-   */
-  template<class Bounds>
-  void decompose(int                dim,
-                 int                rank,
-                 const Bounds&      domain,
-                 const Assigner&    assigner,
-                 const typename RegularDecomposer<Bounds>::Creator&   create,
-                 typename RegularDecomposer<Bounds>::BoolVector       share_face = typename RegularDecomposer<Bounds>::BoolVector(),
-                 typename RegularDecomposer<Bounds>::BoolVector       wrap       = typename RegularDecomposer<Bounds>::BoolVector(),
-                 typename RegularDecomposer<Bounds>::CoordinateVector ghosts     = typename RegularDecomposer<Bounds>::CoordinateVector(),
-                 typename RegularDecomposer<Bounds>::DivisionsVector  divs       = typename RegularDecomposer<Bounds>::DivisionsVector())
-  {
-    RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, create);
-  }
-
-  /**
-   * \ingroup Decomposition
-   * \brief Decomposes the domain into a prescribed pattern of blocks.
-   *
-   * @param dim        dimension of the domain
-   * @param rank       local rank
-   * @param assigner   decides how processors are assigned to blocks (maps a gid to a rank)
-   *                   also communicates the total number of blocks
-   * @param master     gets the blocks once this function returns
-   * @param wrap       indicates dimensions on which to wrap the boundary
-   * @param ghosts     indicates how many ghosts to use in each dimension
-   * @param divs       indicates how many cuts to make along each dimension
-   *                   (0 means "no constraint," i.e., leave it up to the algorithm)
-   *
-   * `master` must have been supplied a create function in order for this function to work.
-   */
-  template<class Bounds>
-  void decompose(int                dim,
-                 int                rank,
-                 const Bounds&      domain,
-                 const Assigner&    assigner,
-                 Master&            master,
-                 typename RegularDecomposer<Bounds>::BoolVector       share_face = typename RegularDecomposer<Bounds>::BoolVector(),
-                 typename RegularDecomposer<Bounds>::BoolVector       wrap       = typename RegularDecomposer<Bounds>::BoolVector(),
-                 typename RegularDecomposer<Bounds>::CoordinateVector ghosts     = typename RegularDecomposer<Bounds>::CoordinateVector(),
-                 typename RegularDecomposer<Bounds>::DivisionsVector  divs       = typename RegularDecomposer<Bounds>::DivisionsVector())
-  {
-    RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, master);
-  }
-
-  /**
-   * \ingroup Decomposition
-   * \brief A "null" decompositon that simply creates the blocks and adds them to the master
-   *
-   * @param rank       local rank
-   * @param assigner   decides how processors are assigned to blocks (maps a gid to a rank)
-   *                   also communicates the total number of blocks
-   * @param master     gets the blocks once this function returns
-   */
-  inline
-  void decompose(int                rank,
-                 const Assigner&    assigner,
-                 Master&            master)
-  {
-    std::vector<int>  local_gids;
-    assigner.local_gids(rank, local_gids);
-
-    for (size_t i = 0; i < local_gids.size(); ++i)
-      master.add(local_gids[i], master.create(), new diy::Link);
-  }
-
-    /**
-     * \ingroup Decomposition
-     * \brief Add a decomposition (modify links) of an existing set of blocks that were
-     * added to the master previously
-     *
-     * @param rank       local rank
-     * @param assigner   decides how processors are assigned to blocks (maps a gid to a rank)
-     *                   also communicates the total number of blocks
-     */
-  template<class Bounds>
-  void decompose(int                dim,
-                 int                rank,
-                 const Bounds&      domain,
-                 const Assigner&    assigner,
-                 Master&            master,
-                 const typename RegularDecomposer<Bounds>::Updater&   update,
-                 typename RegularDecomposer<Bounds>::BoolVector       share_face =
-                 typename RegularDecomposer<Bounds>::BoolVector(),
-                 typename RegularDecomposer<Bounds>::BoolVector       wrap       =
-                 typename RegularDecomposer<Bounds>::BoolVector(),
-                 typename RegularDecomposer<Bounds>::CoordinateVector ghosts     =
-                 typename RegularDecomposer<Bounds>::CoordinateVector(),
-                 typename RegularDecomposer<Bounds>::DivisionsVector  divs       =
-                 typename RegularDecomposer<Bounds>::DivisionsVector())
-  {
-      RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).
-          decompose(rank, assigner, master, update);
-  }
-
-  //! Decomposition example: \example decomposition/test-decomposition.cpp
-  //! Direct master insertion example: \example decomposition/test-direct-master.cpp
-}
-
-// decomposes domain and adds blocks to the master
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-decompose(int rank, const Assigner& assigner, Master& master)
-{
-  decompose(rank, assigner, [&master](int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link)
-  {
-    void*     b = master.create();
-    Link*     l = new Link(link);
-    master.add(gid, b, l);
-  });
-}
-
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-decompose(int rank, const Assigner& assigner, const Creator& create)
-{
-  std::vector<int> gids;
-  assigner.local_gids(rank, gids);
-  for (int i = 0; i < (int)gids.size(); ++i)
-  {
-    int gid = gids[i];
-
-    DivisionsVector coords;
-    gid_to_coords(gid, coords);
-
-    Bounds core, bounds;
-    fill_bounds(core,   coords);
-    fill_bounds(bounds, coords, true);
-
-    // Fill link with all the neighbors
-    Link link(dim, core, bounds);
-    std::vector<int>  offsets(dim, -1);
-    offsets[0] = -2;
-    while (!all(offsets, 1))
-    {
-      // next offset
-      int i;
-      for (i = 0; i < dim; ++i)
-        if (offsets[i] == 1)
-          offsets[i] = -1;
-        else
-          break;
-      ++offsets[i];
-
-      if (all(offsets, 0)) continue;      // skip ourselves
-
-      DivisionsVector     nhbr_coords(dim);
-      Direction           dir, wrap_dir;
-      bool                inbounds = true;
-      for (int i = 0; i < dim; ++i)
-      {
-        nhbr_coords[i] = coords[i] + offsets[i];
-
-        // wrap
-        if (nhbr_coords[i] < 0)
-        {
-          if (wrap[i])
-          {
-            nhbr_coords[i] = divisions[i] - 1;
-            wrap_dir[i] = -1;
-          }
-          else
-            inbounds = false;
-        }
-
-        if (nhbr_coords[i] >= divisions[i])
-        {
-          if (wrap[i])
-          {
-            nhbr_coords[i] = 0;
-            wrap_dir[i] = 1;
-          }
-          else
-            inbounds = false;
-        }
-
-        // NB: this needs to match the addressing scheme in dir_t (in constants.h)
-        if (offsets[i] == -1 || offsets[i] == 1)
-          dir[i] = offsets[i];
-      }
-      if (!inbounds) continue;
-
-      int nhbr_gid = coords_to_gid(nhbr_coords);
-      BlockID bid; bid.gid = nhbr_gid; bid.proc = assigner.rank(nhbr_gid);
-      link.add_neighbor(bid);
-
-      Bounds nhbr_bounds;
-      fill_bounds(nhbr_bounds, nhbr_coords);
-      link.add_bounds(nhbr_bounds);
-
-      link.add_direction(dir);
-      link.add_wrap(wrap_dir);
-    }
-
-    create(gid, core, bounds, domain, link);
-  }
-}
-
-// decomposes domain but does not add blocks to master, assumes they were added already
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-decompose(int rank, const Assigner& assigner, Master& master, const Updater& update)
-{
-    decompose(rank, assigner, [&master,&update](int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link)
-    {
-        int lid = master.lid(gid);
-        Link* l = new Link(link);
-        master.replace_link(lid, l);
-        update(gid, lid, core, bounds, domain, *l);
-    });
-}
-
-template<class Bounds>
-bool
-diy::RegularDecomposer<Bounds>::
-all(const std::vector<int>& v, int x)
-{
-  for (unsigned i = 0; i < v.size(); ++i)
-    if (v[i] != x)
-      return false;
-  return true;
-}
-
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions)
-{
-  int dim = divisions.size();
-  for (int i = 0; i < dim; ++i)
-  {
-    coords.push_back(gid % divisions[i]);
-    gid /= divisions[i];
-  }
-}
-
-template<class Bounds>
-int
-diy::RegularDecomposer<Bounds>::
-coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions)
-{
-  int gid = 0;
-  for (int i = coords.size() - 1; i >= 0; --i)
-  {
-    gid *= divisions[i];
-    gid += coords[i];
-  }
-  return gid;
-}
-
-//! \ingroup Decomposition
-//! Gets the bounds, with or without ghosts, for a block specified by its block coordinates
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-fill_bounds(Bounds& bounds,                  //!< (output) bounds
-            const DivisionsVector& coords,   //!< coordinates of the block in the decomposition
-            bool add_ghosts)                 //!< whether to include ghosts in the output bounds
-    const
-{
-  for (int i = 0; i < dim; ++i)
-  {
-    bounds.min[i] = detail::BoundsHelper<Bounds>::from(coords[i], divisions[i], domain.min[i], domain.max[i], share_face[i]);
-    bounds.max[i] = detail::BoundsHelper<Bounds>::to  (coords[i], divisions[i], domain.min[i], domain.max[i], share_face[i]);
-  }
-
-  for (int i = dim; i < DIY_MAX_DIM; ++i)   // set the unused dimension to 0
-  {
-    bounds.min[i] = 0;
-    bounds.max[i] = 0;
-  }
-
-  if (!add_ghosts)
-    return;
-
-  for (int i = 0; i < dim; ++i)
-  {
-    if (wrap[i])
-    {
-      bounds.min[i] -= ghosts[i];
-      bounds.max[i] += ghosts[i];
-    } else
-    {
-      bounds.min[i] = std::max(domain.min[i], bounds.min[i] - ghosts[i]);
-      bounds.max[i] = std::min(domain.max[i], bounds.max[i] + ghosts[i]);
-    }
-  }
-}
-
-//! \ingroup Decomposition
-//! Gets the bounds, with or without ghosts, for a block specified by its gid
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-fill_bounds(Bounds& bounds,                  //!< (output) bounds
-            int gid,                         //!< global id of the block
-            bool add_ghosts)                 //!< whether to include ghosts in the output bounds
-    const
-{
-    DivisionsVector coords;
-    gid_to_coords(gid, coords);
-    if (add_ghosts)
-        fill_bounds(bounds, coords, true);
-    else
-        fill_bounds(bounds, coords);
-}
-
-namespace diy { namespace detail {
-// current state of division in one dimension used in fill_divisions below
-template<class Coordinate>
-struct Div
-{
-    int dim;                                 // 0, 1, 2, etc. e.g. for x, y, z etc.
-    int nb;                                  // number of blocks so far in this dimension
-    Coordinate b_size;                       // block size so far in this dimension
-
-    // sort on descending block size unless tied, in which case
-    // sort on ascending num blocks in current dim unless tied, in which case
-    // sort on ascending dimension
-    bool operator<(Div rhs) const
-    {
-        // sort on second value of the pair unless tied, in which case sort on first
-        if (b_size == rhs.b_size)
-        {
-            if (nb == rhs.nb)
-                return(dim < rhs.dim);
-            return(nb < rhs.nb);
-        }
-        return(b_size > rhs.b_size);
-    }
-};
-} }
-
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-fill_divisions(std::vector<int>& divisions) const
-{
-    // prod = number of blocks unconstrained by user; c = number of unconstrained dimensions
-    int prod = 1; int c = 0;
-    for (int i = 0; i < dim; ++i)
-        if (divisions[i] != 0)
-        {
-            prod *= divisions[i];
-            ++c;
-        }
-
-    if (nblocks % prod != 0)
-        throw std::runtime_error("Total number of blocks cannot be factored into provided divs");
-
-    if (c == (int) divisions.size())               // nothing to do; user provided all divs
-        return;
-
-    // factor number of blocks left in unconstrained dimensions
-    // factorization is sorted from smallest to largest factors
-    std::vector<unsigned> factors;
-    factor(factors, nblocks/prod);
-
-    using detail::Div;
-    std::vector< Div<Coordinate> > missing_divs;              // pairs consisting of (dim, #divs)
-
-    // init missing_divs
-    for (int i = 0; i < dim; i++)
-    {
-        if (divisions[i] == 0)
-        {
-            Div<Coordinate> div;
-            div.dim = i;
-            div.nb = 1;
-            div.b_size = domain.max[i] - domain.min[i];
-            missing_divs.push_back(div);
-        }
-    }
-
-    // iterate over factorization of number of blocks (factors are sorted smallest to largest)
-    // NB: using int instead of size_t because must be negative in order to break out of loop
-    for (int i = factors.size() - 1; i >= 0; --i)
-    {
-        // fill in missing divs by dividing dimension w/ largest block size
-        // except when this would be illegal (resulting in bounds.max < bounds.min;
-        // only a problem for discrete bounds
-
-        // sort on decreasing block size
-        std::sort(missing_divs.begin(), missing_divs.end());
-
-        // split the dimension with the largest block size (first element in vector)
-        Coordinate min =
-            detail::BoundsHelper<Bounds>::from(0,
-                                               missing_divs[0].nb * factors[i],
-                                               domain.min[missing_divs[0].dim],
-                                               domain.max[missing_divs[0].dim],
-                                               share_face[missing_divs[0].dim]);
-        Coordinate max =
-            detail::BoundsHelper<Bounds>::to(0,
-                                             missing_divs[0].nb * factors[i],
-                                             domain.min[missing_divs[0].dim],
-                                             domain.max[missing_divs[0].dim],
-                                             share_face[missing_divs[0].dim]);
-        if (max >= min)
-        {
-            missing_divs[0].nb    *= factors[i];
-            missing_divs[0].b_size = max - min;
-        }
-        else
-        {
-            std::ostringstream oss;
-            oss << "Unable to decompose domain into " << nblocks << " blocks: " << min << " " << max;
-            throw std::runtime_error(oss.str());
-        }
-    }
-
-    // assign the divisions
-    for (size_t i = 0; i < missing_divs.size(); i++)
-        divisions[missing_divs[i].dim] = missing_divs[i].nb;
-}
-
-template<class Bounds>
-void
-diy::RegularDecomposer<Bounds>::
-factor(std::vector<unsigned>& factors, int n)
-{
-  while (n != 1)
-    for (int i = 2; i <= n; ++i)
-    {
-      if (n % i == 0)
-      {
-        factors.push_back(i);
-        n /= i;
-        break;
-      }
-    }
-}
-
-// Point to GIDs
-// TODO: deal with wrap correctly
-// TODO: add an optional ghosts argument to ignore ghosts (if we want to find the true owners, or something like that)
-template<class Bounds>
-template<class Point>
-void
-diy::RegularDecomposer<Bounds>::
-point_to_gids(std::vector<int>& gids, const Point& p) const
-{
-    std::vector< std::pair<int, int> > ranges(dim);
-    for (int i = 0; i < dim; ++i)
-        top_bottom(ranges[i].second, ranges[i].first, p, i);
-
-    // look up gids for all combinations
-    DivisionsVector coords(dim), location(dim);
-    while(location.back() < ranges.back().second - ranges.back().first)
-    {
-        for (int i = 0; i < dim; ++i)
-            coords[i] = ranges[i].first + location[i];
-        gids.push_back(coords_to_gid(coords, divisions));
-
-        location[0]++;
-        unsigned i = 0;
-        while (i < dim-1 && location[i] == ranges[i].second - ranges[i].first)
-        {
-            location[i] = 0;
-            ++i;
-            location[i]++;
-        }
-    }
-}
-
-template<class Bounds>
-template<class Point>
-int
-diy::RegularDecomposer<Bounds>::
-point_to_gid(const Point& p) const
-{
-    int gid = 0;
-    for (int axis = dim - 1; axis >= 0; --axis)
-    {
-      int bottom  = detail::BoundsHelper<Bounds>::lower(p[axis], divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
-          bottom  = std::max(0, bottom);
-
-      // coupled with coords_to_gid
-      gid *= divisions[axis];
-      gid += bottom;
-    }
-
-    return gid;
-}
-
-template<class Bounds>
-template<class Point>
-int
-diy::RegularDecomposer<Bounds>::
-num_gids(const Point& p) const
-{
-    int res = 1;
-    for (int i = 0; i < dim; ++i)
-    {
-        int top, bottom;
-        top_bottom(top, bottom, p, i);
-        res *= top - bottom;
-    }
-    return res;
-}
-
-template<class Bounds>
-template<class Point>
-void
-diy::RegularDecomposer<Bounds>::
-top_bottom(int& top, int& bottom, const Point& p, int axis) const
-{
-    Coordinate l = p[axis] - ghosts[axis];
-    Coordinate r = p[axis] + ghosts[axis];
-
-    top     = detail::BoundsHelper<Bounds>::upper(r, divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
-    bottom  = detail::BoundsHelper<Bounds>::lower(l, divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
-
-    if (!wrap[axis])
-    {
-        bottom  = std::max(0, bottom);
-        top     = std::min(divisions[axis], top);
-    }
-}
-
-// find lowest gid that owns a particular point
-template<class Bounds>
-template<class Point>
-int
-diy::RegularDecomposer<Bounds>::
-lowest_gid(const Point& p) const
-{
-    // TODO: optimize - no need to compute all gids
-    std::vector<int> gids;
-    point_to_gids(gids, p);
-    std::sort(gids.begin(), gids.end());
-    return gids[0];
-}
-
-#endif
diff --git a/diy/include/diy/detail/algorithms/kdtree-sampling.hpp b/diy/include/diy/detail/algorithms/kdtree-sampling.hpp
deleted file mode 100644
index 7cf2ee1e5..000000000
--- a/diy/include/diy/detail/algorithms/kdtree-sampling.hpp
+++ /dev/null
@@ -1,450 +0,0 @@
-#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
-#define DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
-
-#include <vector>
-#include <cassert>
-#include "../../partners/all-reduce.hpp"
-#include "../../log.hpp"
-
-// TODO: technically, what's done now is not a perfect subsample:
-//       we take the same number of samples from every block, in reality this number should be selected at random,
-//       so that the total number of samples adds up to samples*nblocks
-//
-// NB: random samples are chosen using rand(), which is assumed to be seeded
-//     externally. Once we switch to C++11, we should use its more advanced
-//     random number generators (and take a generator as an external parameter)
-//     (TODO)
-
-namespace diy
-{
-namespace detail
-{
-
-template<class Block, class Point>
-struct KDTreeSamplingPartition
-{
-    typedef     diy::RegularContinuousLink      RCLink;
-    typedef     diy::ContinuousBounds           Bounds;
-
-    typedef     std::vector<float>              Samples;
-
-                KDTreeSamplingPartition(int                             dim,
-                                        std::vector<Point>  Block::*    points,
-                                        size_t                          samples):
-                    dim_(dim), points_(points), samples_(samples)           {}
-
-    void        operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
-
-    int         divide_gid(int gid, bool lower, int round, int rounds) const;
-    void        update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
-    void        split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
-    diy::Direction
-                find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
-
-    void        compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const;
-    void        add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const;
-    void        receive_samples(Block* b, const diy::ReduceProxy& srp,       Samples& samples) const;
-    void        forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const;
-
-    void        enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const;
-    void        dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const;
-
-    void        update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const;
-    bool        intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const;
-    float       find_split(const Bounds& changed, const Bounds& original) const;
-
-    int                             dim_;
-    std::vector<Point>  Block::*    points_;
-    size_t                          samples_;
-};
-
-}
-}
-
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
-{
-    int dim;
-    if (srp.round() < partners.rounds())
-        dim = partners.dim(srp.round());
-    else
-        dim = partners.dim(srp.round() - 1);
-
-    if (srp.round() == partners.rounds())
-        update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
-    else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0)       // link round
-    {
-        dequeue_exchange(b, srp, dim);         // from the swap round
-        split_to_neighbors(b, srp, dim);
-    }
-    else if (partners.swap_round(srp.round()))
-    {
-        Samples samples;
-        receive_samples(b, srp, samples);
-        enqueue_exchange(b, srp, dim, samples);
-    } else if (partners.sub_round(srp.round()) == 0)
-    {
-        if (srp.round() > 0)
-        {
-            int prev_dim = dim - 1;
-            if (prev_dim < 0)
-                prev_dim += dim_;
-            update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
-        }
-
-        compute_local_samples(b, srp, dim);
-    } else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2)     // we are reusing partners class, so really we are talking about the samples rounds here
-    {
-        Samples samples;
-        add_samples(b, srp, samples);
-        srp.enqueue(srp.out_link().target(0), samples);
-    } else
-    {
-        Samples samples;
-        add_samples(b, srp, samples);
-        if (samples.size() != 1)
-        {
-            // pick the median
-            std::nth_element(samples.begin(), samples.begin() + samples.size()/2, samples.end());
-            std::swap(samples[0], samples[samples.size()/2]);
-            //std::sort(samples.begin(), samples.end());
-            //samples[0] = (samples[samples.size()/2] + samples[samples.size()/2 + 1])/2;
-            samples.resize(1);
-        }
-        forward_samples(b, srp, samples);
-    }
-}
-
-template<class Block, class Point>
-int
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-divide_gid(int gid, bool lower, int round, int rounds) const
-{
-    if (lower)
-        gid &= ~(1 << (rounds - 1 - round));
-    else
-        gid |=  (1 << (rounds - 1 - round));
-    return gid;
-}
-
-// round here is the outer iteration of the algorithm
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
-{
-    auto        log  = get_logger();
-    int         gid  = srp.gid();
-    int         lid  = srp.master()->lid(gid);
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    // (gid, dir) -> i
-    std::map<std::pair<int,diy::Direction>, int> link_map;
-    for (int i = 0; i < link->size(); ++i)
-        link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i;
-
-    // NB: srp.enqueue(..., ...) should match the link
-    std::vector<float>  splits(link->size());
-    for (int i = 0; i < link->size(); ++i)
-    {
-        float split; diy::Direction dir;
-
-        int in_gid = link->target(i).gid;
-        while(srp.incoming(in_gid))
-        {
-            srp.dequeue(in_gid, split);
-            srp.dequeue(in_gid, dir);
-
-            // reverse dir
-            for (int j = 0; j < dim_; ++j)
-                dir[j] = -dir[j];
-
-            int k = link_map[std::make_pair(in_gid, dir)];
-            log->trace("{} {} {} -> {}", in_gid, dir, split, k);
-            splits[k] = split;
-        }
-    }
-
-    RCLink      new_link(dim_, link->core(), link->core());
-
-    bool lower = !(gid & (1 << (rounds - 1 - round)));
-
-    // fill out the new link
-    for (int i = 0; i < link->size(); ++i)
-    {
-        diy::Direction  dir = link->direction(i);
-        //diy::Direction  wrap_dir = link->wrap(i);     // we don't use existing wrap, but restore it from scratch
-        if (dir[dim] != 0)
-        {
-            if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))
-            {
-                int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);
-                diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
-                new_link.add_neighbor(nbr);
-
-                new_link.add_direction(dir);
-
-                Bounds bounds = link->bounds(i);
-                update_neighbor_bounds(bounds, splits[i], dim, !lower);
-                new_link.add_bounds(bounds);
-
-                if (wrap)
-                    new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
-                else
-                    new_link.add_wrap(diy::Direction());
-            }
-        } else // non-aligned side
-        {
-            for (int j = 0; j < 2; ++j)
-            {
-                int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds);
-
-                Bounds  bounds  = link->bounds(i);
-                update_neighbor_bounds(bounds, splits[i], dim, j == 0);
-
-                if (intersects(bounds, new_link.bounds(), dim, wrap, domain))
-                {
-                    diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
-                    new_link.add_neighbor(nbr);
-                    new_link.add_direction(dir);
-                    new_link.add_bounds(bounds);
-
-                    if (wrap)
-                        new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
-                    else
-                        new_link.add_wrap(diy::Direction());
-                }
-            }
-        }
-    }
-
-    // add link to the dual block
-    int dual_gid = divide_gid(gid, !lower, round, rounds);
-    diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) };
-    new_link.add_neighbor(dual);
-
-    Bounds nbr_bounds = link->bounds();     // old block bounds
-    update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
-    new_link.add_bounds(nbr_bounds);
-
-    new_link.add_wrap(diy::Direction());    // dual block cannot be wrapped
-
-    if (lower)
-    {
-        diy::Direction right;
-        right[dim] = 1;
-        new_link.add_direction(right);
-    } else
-    {
-        diy::Direction left;
-        left[dim] = -1;
-        new_link.add_direction(left);
-    }
-
-    // update the link; notice that this won't conflict with anything since
-    // reduce is using its own notion of the link constructed through the
-    // partners
-    link->swap(new_link);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
-{
-    int         lid  = srp.master()->lid(srp.gid());
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    // determine split
-    float split = find_split(link->core(), link->bounds());
-
-    for (int i = 0; i < link->size(); ++i)
-    {
-        srp.enqueue(link->target(i), split);
-        srp.enqueue(link->target(i), link->direction(i));
-    }
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const
-{
-    // compute and enqueue local samples
-    Samples samples;
-    size_t points_size = (b->*points_).size();
-    size_t n = std::min(points_size, samples_);
-    samples.reserve(n);
-    for (size_t i = 0; i < n; ++i)
-    {
-        float x = (b->*points_)[rand() % points_size][dim];
-        samples.push_back(x);
-    }
-
-    srp.enqueue(srp.out_link().target(0), samples);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
-{
-    // dequeue and combine the samples
-    for (int i = 0; i < srp.in_link().size(); ++i)
-    {
-        int nbr_gid = srp.in_link().target(i).gid;
-
-        Samples smpls;
-        srp.dequeue(nbr_gid, smpls);
-        for (size_t i = 0; i < smpls.size(); ++i)
-            samples.push_back(smpls[i]);
-    }
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
-{
-    srp.dequeue(srp.in_link().target(0).gid, samples);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const
-{
-    for (int i = 0; i < srp.out_link().size(); ++i)
-        srp.enqueue(srp.out_link().target(i), samples);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const
-{
-    int         lid  = srp.master()->lid(srp.gid());
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    int k = srp.out_link().size();
-
-    if (k == 0)        // final round; nothing needs to be sent; this is actually redundant
-        return;
-
-    // pick split points
-    float split = samples[0];
-
-    // subset and enqueue
-    std::vector< std::vector<Point> > out_points(srp.out_link().size());
-    for (size_t i = 0; i < (b->*points_).size(); ++i)
-    {
-      float x = (b->*points_)[i][dim];
-      int loc = x < split ? 0 : 1;
-      out_points[loc].push_back((b->*points_)[i]);
-    }
-    int pos = -1;
-    for (int i = 0; i < k; ++i)
-    {
-      if (srp.out_link().target(i).gid == srp.gid())
-      {
-        (b->*points_).swap(out_points[i]);
-        pos = i;
-      }
-      else
-        srp.enqueue(srp.out_link().target(i), out_points[i]);
-    }
-    if (pos == 0)
-        link->core().max[dim] = split;
-    else
-        link->core().min[dim] = split;
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
-{
-    int         lid  = srp.master()->lid(srp.gid());
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    for (int i = 0; i < srp.in_link().size(); ++i)
-    {
-      int nbr_gid = srp.in_link().target(i).gid;
-      if (nbr_gid == srp.gid())
-          continue;
-
-      std::vector<Point>   in_points;
-      srp.dequeue(nbr_gid, in_points);
-      for (size_t j = 0; j < in_points.size(); ++j)
-      {
-        if (in_points[j][dim] < link->core().min[dim] || in_points[j][dim] > link->core().max[dim])
-            throw std::runtime_error(fmt::format("Dequeued {} outside [{},{}] ({})",
-                                                 in_points[j][dim], link->core().min[dim], link->core().max[dim], dim));
-        (b->*points_).push_back(in_points[j]);
-      }
-    }
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const
-{
-    if (lower)
-        bounds.max[dim] = split;
-    else
-        bounds.min[dim] = split;
-}
-
-template<class Block, class Point>
-bool
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const
-{
-    if (wrap)
-    {
-        if (x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim])
-            return true;
-        if (y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim])
-            return true;
-    }
-    return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim];
-}
-
-template<class Block, class Point>
-float
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-find_split(const Bounds& changed, const Bounds& original) const
-{
-    for (int i = 0; i < dim_; ++i)
-    {
-        if (changed.min[i] != original.min[i])
-            return changed.min[i];
-        if (changed.max[i] != original.max[i])
-            return changed.max[i];
-    }
-    assert(0);
-    return -1;
-}
-
-template<class Block, class Point>
-diy::Direction
-diy::detail::KDTreeSamplingPartition<Block,Point>::
-find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
-{
-    diy::Direction wrap;
-    for (int i = 0; i < dim_; ++i)
-    {
-        if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i])
-            wrap[i] = -1;
-        if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i])
-            wrap[i] =  1;
-    }
-    return wrap;
-}
-
-
-#endif
diff --git a/diy/include/diy/detail/algorithms/kdtree.hpp b/diy/include/diy/detail/algorithms/kdtree.hpp
deleted file mode 100644
index 286929dc9..000000000
--- a/diy/include/diy/detail/algorithms/kdtree.hpp
+++ /dev/null
@@ -1,569 +0,0 @@
-#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_HPP
-#define DIY_DETAIL_ALGORITHMS_KDTREE_HPP
-
-#include <vector>
-#include <cassert>
-#include "../../partners/all-reduce.hpp"
-#include "../../log.hpp"
-
-namespace diy
-{
-namespace detail
-{
-
-struct KDTreePartners;
-
-template<class Block, class Point>
-struct KDTreePartition
-{
-    typedef     diy::RegularContinuousLink      RCLink;
-    typedef     diy::ContinuousBounds           Bounds;
-
-    typedef     std::vector<size_t>             Histogram;
-
-                KDTreePartition(int                             dim,
-                                std::vector<Point>  Block::*    points,
-                                size_t                          bins):
-                    dim_(dim), points_(points), bins_(bins)            {}
-
-    void        operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
-
-    int         divide_gid(int gid, bool lower, int round, int rounds) const;
-    void        update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
-    void        split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
-    diy::Direction
-                find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
-
-    void        compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const;
-    void        add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const;
-    void        receive_histogram(Block* b, const diy::ReduceProxy& srp,       Histogram& histogram) const;
-    void        forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const;
-
-    void        enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram& histogram) const;
-    void        dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const;
-
-    void        update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const;
-    bool        intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const;
-    float       find_split(const Bounds& changed, const Bounds& original) const;
-
-    int                             dim_;
-    std::vector<Point>  Block::*    points_;
-    size_t                          bins_;
-};
-
-}
-}
-
-struct diy::detail::KDTreePartners
-{
-  // bool = are we in a swap (vs histogram) round
-  // int  = round within that partner
-  typedef           std::pair<bool, int>                    RoundType;
-  typedef           diy::ContinuousBounds                   Bounds;
-
-                    KDTreePartners(int dim, int nblocks, bool wrap_, const Bounds& domain_):
-                        decomposer(1, interval(0,nblocks-1), nblocks),
-                        histogram(decomposer, 2),
-                        swap(decomposer, 2, false),
-                        wrap(wrap_),
-                        domain(domain_)
-  {
-    for (unsigned i = 0; i < swap.rounds(); ++i)
-    {
-      // fill histogram rounds
-      for (unsigned j = 0; j < histogram.rounds(); ++j)
-      {
-        rounds_.push_back(std::make_pair(false, j));
-        dim_.push_back(i % dim);
-        if (j == histogram.rounds() / 2 - 1 - i)
-            j += 2*i;
-      }
-
-      // fill swap round
-      rounds_.push_back(std::make_pair(true, i));
-      dim_.push_back(i % dim);
-
-      // fill link round
-      rounds_.push_back(std::make_pair(true, -1));          // (true, -1) signals link round
-      dim_.push_back(i % dim);
-    }
-  }
-
-  size_t        rounds() const                              { return rounds_.size(); }
-  size_t        swap_rounds() const                         { return swap.rounds(); }
-
-  int           dim(int round) const                        { return dim_[round]; }
-  bool          swap_round(int round) const                 { return rounds_[round].first; }
-  int           sub_round(int round) const                  { return rounds_[round].second; }
-
-  inline bool   active(int round, int gid, const diy::Master& m) const
-  {
-    if (round == (int) rounds())
-        return true;
-    else if (swap_round(round) && sub_round(round) < 0)     // link round
-        return true;
-    else if (swap_round(round))
-        return swap.active(sub_round(round), gid, m);
-    else
-        return histogram.active(sub_round(round), gid, m);
-  }
-
-  inline void   incoming(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
-  {
-    if (round == (int) rounds())
-        link_neighbors(-1, gid, partners, m);
-    else if (swap_round(round) && sub_round(round) < 0)       // link round
-        swap.incoming(sub_round(round - 1) + 1, gid, partners, m);
-    else if (swap_round(round))
-        histogram.incoming(histogram.rounds(), gid, partners, m);
-    else
-    {
-        if (round > 0 && sub_round(round) == 0)
-            link_neighbors(-1, gid, partners, m);
-        else if (round > 0 && sub_round(round - 1) != sub_round(round) - 1)        // jump through the histogram rounds
-            histogram.incoming(sub_round(round - 1) + 1, gid, partners, m);
-        else
-            histogram.incoming(sub_round(round), gid, partners, m);
-    }
-  }
-
-  inline void   outgoing(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
-  {
-    if (round == (int) rounds())
-        swap.outgoing(sub_round(round-1) + 1, gid, partners, m);
-    else if (swap_round(round) && sub_round(round) < 0)       // link round
-        link_neighbors(-1, gid, partners, m);
-    else if (swap_round(round))
-        swap.outgoing(sub_round(round), gid, partners, m);
-    else
-        histogram.outgoing(sub_round(round), gid, partners, m);
-  }
-
-  inline void   link_neighbors(int, int gid, std::vector<int>& partners, const diy::Master& m) const
-  {
-    int         lid  = m.lid(gid);
-    diy::Link*  link = m.link(lid);
-
-    std::set<int> result;       // partners must be unique
-    for (int i = 0; i < link->size(); ++i)
-        result.insert(link->target(i).gid);
-
-    for (std::set<int>::const_iterator it = result.begin(); it != result.end(); ++it)
-        partners.push_back(*it);
-  }
-
-  // 1-D domain to feed into histogram and swap
-  diy::RegularDecomposer<diy::DiscreteBounds>   decomposer;
-
-  diy::RegularAllReducePartners     histogram;
-  diy::RegularSwapPartners          swap;
-
-  std::vector<RoundType>            rounds_;
-  std::vector<int>                  dim_;
-
-  bool                              wrap;
-  Bounds                            domain;
-};
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
-{
-    int dim;
-    if (srp.round() < partners.rounds())
-        dim = partners.dim(srp.round());
-    else
-        dim = partners.dim(srp.round() - 1);
-
-    if (srp.round() == partners.rounds())
-        update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
-    else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0)       // link round
-    {
-        dequeue_exchange(b, srp, dim);         // from the swap round
-        split_to_neighbors(b, srp, dim);
-    }
-    else if (partners.swap_round(srp.round()))
-    {
-        Histogram   histogram;
-        receive_histogram(b, srp, histogram);
-        enqueue_exchange(b, srp, dim, histogram);
-    } else if (partners.sub_round(srp.round()) == 0)
-    {
-        if (srp.round() > 0)
-        {
-            int prev_dim = dim - 1;
-            if (prev_dim < 0)
-                prev_dim += dim_;
-            update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
-        }
-
-        compute_local_histogram(b, srp, dim);
-    } else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2)
-    {
-        Histogram   histogram(bins_);
-        add_histogram(b, srp, histogram);
-        srp.enqueue(srp.out_link().target(0), histogram);
-    }
-    else
-    {
-        Histogram   histogram(bins_);
-        add_histogram(b, srp, histogram);
-        forward_histogram(b, srp, histogram);
-    }
-}
-
-template<class Block, class Point>
-int
-diy::detail::KDTreePartition<Block,Point>::
-divide_gid(int gid, bool lower, int round, int rounds) const
-{
-    if (lower)
-        gid &= ~(1 << (rounds - 1 - round));
-    else
-        gid |=  (1 << (rounds - 1 - round));
-    return gid;
-}
-
-// round here is the outer iteration of the algorithm
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
-{
-    int         gid  = srp.gid();
-    int         lid  = srp.master()->lid(gid);
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    // (gid, dir) -> i
-    std::map<std::pair<int,diy::Direction>, int> link_map;
-    for (int i = 0; i < link->size(); ++i)
-        link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i;
-
-    // NB: srp.enqueue(..., ...) should match the link
-    std::vector<float>  splits(link->size());
-    for (int i = 0; i < link->size(); ++i)
-    {
-        float split; diy::Direction dir;
-
-        int in_gid = link->target(i).gid;
-        while(srp.incoming(in_gid))
-        {
-            srp.dequeue(in_gid, split);
-            srp.dequeue(in_gid, dir);
-
-            // reverse dir
-            for (int j = 0; j < dim_; ++j)
-                dir[j] = -dir[j];
-
-            int k = link_map[std::make_pair(in_gid, dir)];
-            splits[k] = split;
-        }
-    }
-
-    RCLink      new_link(dim_, link->core(), link->core());
-
-    bool lower = !(gid & (1 << (rounds - 1 - round)));
-
-    // fill out the new link
-    for (int i = 0; i < link->size(); ++i)
-    {
-        diy::Direction  dir      = link->direction(i);
-        //diy::Direction  wrap_dir = link->wrap(i);     // we don't use existing wrap, but restore it from scratch
-        if (dir[dim] != 0)
-        {
-            if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))
-            {
-                int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);
-                diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
-                new_link.add_neighbor(nbr);
-
-                new_link.add_direction(dir);
-
-                Bounds bounds = link->bounds(i);
-                update_neighbor_bounds(bounds, splits[i], dim, !lower);
-                new_link.add_bounds(bounds);
-
-                if (wrap)
-                    new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
-                else
-                    new_link.add_wrap(diy::Direction());
-            }
-        } else // non-aligned side
-        {
-            for (int j = 0; j < 2; ++j)
-            {
-                int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds);
-
-                Bounds  bounds  = link->bounds(i);
-                update_neighbor_bounds(bounds, splits[i], dim, j == 0);
-
-                if (intersects(bounds, new_link.bounds(), dim, wrap, domain))
-                {
-                    diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
-                    new_link.add_neighbor(nbr);
-                    new_link.add_direction(dir);
-                    new_link.add_bounds(bounds);
-
-                    if (wrap)
-                        new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
-                    else
-                        new_link.add_wrap(diy::Direction());
-                }
-            }
-        }
-    }
-
-    // add link to the dual block
-    int dual_gid = divide_gid(gid, !lower, round, rounds);
-    diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) };
-    new_link.add_neighbor(dual);
-
-    Bounds nbr_bounds = link->bounds();     // old block bounds
-    update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
-    new_link.add_bounds(nbr_bounds);
-
-    new_link.add_wrap(diy::Direction());    // dual block cannot be wrapped
-
-    if (lower)
-    {
-        diy::Direction right;
-        right[dim] = 1;
-        new_link.add_direction(right);
-    } else
-    {
-        diy::Direction left;
-        left[dim] = -1;
-        new_link.add_direction(left);
-    }
-
-    // update the link; notice that this won't conflict with anything since
-    // reduce is using its own notion of the link constructed through the
-    // partners
-    link->swap(new_link);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
-{
-    int         lid  = srp.master()->lid(srp.gid());
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    // determine split
-    float split = find_split(link->core(), link->bounds());
-
-    for (int i = 0; i < link->size(); ++i)
-    {
-        srp.enqueue(link->target(i), split);
-        srp.enqueue(link->target(i), link->direction(i));
-    }
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const
-{
-    int         lid  = srp.master()->lid(srp.gid());
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    // compute and enqueue local histogram
-    Histogram histogram(bins_);
-
-    float   width = (link->core().max[dim] - link->core().min[dim])/bins_;
-    for (size_t i = 0; i < (b->*points_).size(); ++i)
-    {
-        float x = (b->*points_)[i][dim];
-        int loc = (x - link->core().min[dim]) / width;
-        if (loc < 0)
-            throw std::runtime_error(fmt::format("{} {} {}", loc, x, link->core().min[dim]));
-        if (loc >= (int) bins_)
-            loc = bins_ - 1;
-        ++(histogram[loc]);
-    }
-
-    srp.enqueue(srp.out_link().target(0), histogram);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
-{
-    // dequeue and add up the histograms
-    for (int i = 0; i < srp.in_link().size(); ++i)
-    {
-        int nbr_gid = srp.in_link().target(i).gid;
-
-        Histogram hist;
-        srp.dequeue(nbr_gid, hist);
-        for (size_t i = 0; i < hist.size(); ++i)
-            histogram[i] += hist[i];
-    }
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-receive_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
-{
-    srp.dequeue(srp.in_link().target(0).gid, histogram);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const
-{
-    for (int i = 0; i < srp.out_link().size(); ++i)
-        srp.enqueue(srp.out_link().target(i), histogram);
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram& histogram) const
-{
-    auto        log = get_logger();
-
-    int         lid  = srp.master()->lid(srp.gid());
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    int k = srp.out_link().size();
-
-    if (k == 0)        // final round; nothing needs to be sent; this is actually redundant
-        return;
-
-    // pick split points
-    size_t total = 0;
-    for (size_t i = 0; i < histogram.size(); ++i)
-        total += histogram[i];
-    log->trace("Histogram total: {}", total);
-
-    size_t cur   = 0;
-    float  width = (link->core().max[dim] - link->core().min[dim])/bins_;
-    float  split = 0;
-    for (size_t i = 0; i < histogram.size(); ++i)
-    {
-        if (cur + histogram[i] > total/2)
-        {
-            split = link->core().min[dim] + width*i;
-            break;
-        }
-        cur += histogram[i];
-    }
-    log->trace("Found split: {} (dim={}) in {} - {}", split, dim, link->core().min[dim], link->core().max[dim]);
-
-    // subset and enqueue
-    std::vector< std::vector<Point> > out_points(srp.out_link().size());
-    for (size_t i = 0; i < (b->*points_).size(); ++i)
-    {
-      float x = (b->*points_)[i][dim];
-      int loc = x < split ? 0 : 1;
-      out_points[loc].push_back((b->*points_)[i]);
-    }
-    int pos = -1;
-    for (int i = 0; i < k; ++i)
-    {
-      if (srp.out_link().target(i).gid == srp.gid())
-      {
-        (b->*points_).swap(out_points[i]);
-        pos = i;
-      }
-      else
-        srp.enqueue(srp.out_link().target(i), out_points[i]);
-    }
-    if (pos == 0)
-        link->core().max[dim] = split;
-    else
-        link->core().min[dim] = split;
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
-{
-    int         lid  = srp.master()->lid(srp.gid());
-    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
-
-    for (int i = 0; i < srp.in_link().size(); ++i)
-    {
-      int nbr_gid = srp.in_link().target(i).gid;
-      if (nbr_gid == srp.gid())
-          continue;
-
-      std::vector<Point>   in_points;
-      srp.dequeue(nbr_gid, in_points);
-      for (size_t j = 0; j < in_points.size(); ++j)
-      {
-        if (in_points[j][dim] < link->core().min[dim] || in_points[j][dim] > link->core().max[dim])
-            throw std::runtime_error(fmt::format("Dequeued {} outside [{},{}] ({})",
-                                     in_points[j][dim], link->core().min[dim], link->core().max[dim], dim));
-        (b->*points_).push_back(in_points[j]);
-      }
-    }
-}
-
-template<class Block, class Point>
-void
-diy::detail::KDTreePartition<Block,Point>::
-update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const
-{
-    if (lower)
-        bounds.max[dim] = split;
-    else
-        bounds.min[dim] = split;
-}
-
-template<class Block, class Point>
-bool
-diy::detail::KDTreePartition<Block,Point>::
-intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const
-{
-    if (wrap)
-    {
-        if (x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim])
-            return true;
-        if (y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim])
-            return true;
-    }
-    return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim];
-}
-
-template<class Block, class Point>
-float
-diy::detail::KDTreePartition<Block,Point>::
-find_split(const Bounds& changed, const Bounds& original) const
-{
-    for (int i = 0; i < dim_; ++i)
-    {
-        if (changed.min[i] != original.min[i])
-            return changed.min[i];
-        if (changed.max[i] != original.max[i])
-            return changed.max[i];
-    }
-    assert(0);
-    return -1;
-}
-
-template<class Block, class Point>
-diy::Direction
-diy::detail::KDTreePartition<Block,Point>::
-find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
-{
-    diy::Direction wrap;
-    for (int i = 0; i < dim_; ++i)
-    {
-        if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i])
-            wrap[i] = -1;
-        if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i])
-            wrap[i] =  1;
-    }
-    return wrap;
-}
-
-
-#endif
diff --git a/diy/include/diy/detail/algorithms/sort.hpp b/diy/include/diy/detail/algorithms/sort.hpp
deleted file mode 100644
index 5cc3f8807..000000000
--- a/diy/include/diy/detail/algorithms/sort.hpp
+++ /dev/null
@@ -1,162 +0,0 @@
-#ifndef DIY_DETAIL_ALGORITHMS_SORT_HPP
-#define DIY_DETAIL_ALGORITHMS_SORT_HPP
-
-#include <functional>
-#include <algorithm>
-
-namespace diy
-{
-
-namespace detail
-{
-
-template<class Block, class T, class Cmp>
-struct SampleSort
-{
-    typedef         std::vector<T>      Block::*ValuesVector;
-    struct Sampler;
-    struct Exchanger;
-
-                    SampleSort(ValuesVector values_, ValuesVector samples_, const Cmp& cmp_, size_t num_samples_):
-                        values(values_), samples(samples_),
-                        cmp(cmp_), num_samples(num_samples_)                    {}
-
-    Sampler         sample() const                                              { return Sampler(values, samples, cmp, num_samples); }
-    Exchanger       exchange() const                                            { return Exchanger(values, samples, cmp); }
-
-    static void     dequeue_values(std::vector<T>& v, const ReduceProxy& rp, bool skip_self = true)
-    {
-        auto log = get_logger();
-
-        int k_in  = rp.in_link().size();
-
-        log->trace("dequeue_values(): gid={}, round={}; v.size()={}", rp.gid(), rp.round(), v.size());
-
-        if (detail::is_default< Serialization<T> >::value)
-        {
-            // add up sizes
-            size_t sz = 0;
-            size_t end = v.size();
-            for (int i = 0; i < k_in; ++i)
-            {
-                log->trace("    incoming size from {}: {}", rp.in_link().target(i).gid, sz);
-                if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
-                MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
-                sz += in.size() / sizeof(T);
-            }
-            log->trace("    incoming size: {}", sz);
-            v.resize(end + sz);
-
-            for (int i = 0; i < k_in; ++i)
-            {
-                if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
-                MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
-                size_t sz = in.size() / sizeof(T);
-                T* bg = (T*) &in.buffer[0];
-                std::copy(bg, bg + sz, &v[end]);
-                end += sz;
-            }
-        } else
-        {
-            for (int i = 0; i < k_in; ++i)
-            {
-                if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
-                MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
-                while(in)
-                {
-                    T x;
-                    diy::load(in, x);
-                    v.emplace_back(std::move(x));
-                }
-            }
-        }
-        log->trace("    v.size()={}", v.size());
-    }
-
-    ValuesVector    values;
-    ValuesVector    samples;
-    Cmp             cmp;
-    size_t          num_samples;
-};
-
-template<class Block, class T, class Cmp>
-struct SampleSort<Block,T,Cmp>::Sampler
-{
-                    Sampler(ValuesVector values_, ValuesVector dividers_, const Cmp& cmp_, size_t num_samples_):
-                        values(values_), dividers(dividers_), cmp(cmp_), num_samples(num_samples_)    {}
-
-    void            operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
-    {
-        int k_in  = srp.in_link().size();
-        int k_out = srp.out_link().size();
-
-        std::vector<T> samples;
-
-        if (k_in == 0)
-        {
-            // draw random samples
-            for (size_t i = 0; i < num_samples; ++i)
-                samples.push_back((b->*values)[std::rand() % (b->*values).size()]);
-        } else
-            dequeue_values(samples, srp, false);
-
-        if (k_out == 0)
-        {
-            // pick subsamples that separate quantiles
-            std::sort(samples.begin(), samples.end(), cmp);
-            std::vector<T>  subsamples(srp.nblocks() - 1);
-            int step = samples.size() / srp.nblocks();       // NB: subsamples.size() + 1
-            for (size_t i = 0; i < subsamples.size(); ++i)
-                subsamples[i] = samples[(i+1)*step];
-            (b->*dividers).swap(subsamples);
-        }
-        else
-        {
-            for (int i = 0; i < k_out; ++i)
-            {
-                MemoryBuffer& out = srp.outgoing(srp.out_link().target(i));
-                save(out, &samples[0], samples.size());
-            }
-        }
-    }
-
-    ValuesVector    values;
-    ValuesVector    dividers;
-    Cmp             cmp;
-    size_t          num_samples;
-};
-
-template<class Block, class T, class Cmp>
-struct SampleSort<Block,T,Cmp>::Exchanger
-{
-                    Exchanger(ValuesVector values_, ValuesVector samples_, const Cmp& cmp_):
-                        values(values_), samples(samples_), cmp(cmp_)       {}
-
-    void            operator()(Block* b, const ReduceProxy& rp) const
-    {
-        if (rp.round() == 0)
-        {
-            // enqueue values to the correct locations
-            for (size_t i = 0; i < (b->*values).size(); ++i)
-            {
-                int to = std::lower_bound((b->*samples).begin(), (b->*samples).end(), (b->*values)[i], cmp) - (b->*samples).begin();
-                rp.enqueue(rp.out_link().target(to), (b->*values)[i]);
-            }
-            (b->*values).clear();
-        } else
-        {
-            dequeue_values((b->*values), rp, false);
-            std::sort((b->*values).begin(), (b->*values).end(), cmp);
-        }
-    }
-
-    ValuesVector    values;
-    ValuesVector    samples;
-    Cmp             cmp;
-};
-
-}
-
-}
-
-#endif
diff --git a/diy/include/diy/detail/block_traits.hpp b/diy/include/diy/detail/block_traits.hpp
deleted file mode 100644
index eb4b7c547..000000000
--- a/diy/include/diy/detail/block_traits.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef DIY_BLOCK_TRAITS_HPP
-#define DIY_BLOCK_TRAITS_HPP
-
-#include "traits.hpp"
-
-namespace diy
-{
-namespace detail
-{
-    template<class F>
-    struct block_traits
-    {
-        typedef typename std::remove_pointer<typename function_traits<F>::template arg<0>::type>::type type;
-    };
-
-    // matches block member functions
-    template<class Block, class R, class... Args>
-    struct block_traits<R(Block::*)(Args...)>
-    {
-        typedef Block type;
-    };
-
-    template<class Block, class R, class... Args>
-    struct block_traits<R(Block::*)(Args...) const>
-    {
-        typedef Block type;
-    };
-}
-}
-
-#endif
diff --git a/diy/include/diy/detail/collectives.hpp b/diy/include/diy/detail/collectives.hpp
deleted file mode 100644
index a85a0f3e4..000000000
--- a/diy/include/diy/detail/collectives.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef DIY_COLLECTIVES_HPP
-#define DIY_COLLECTIVES_HPP
-
-namespace diy
-{
-namespace detail
-{
-  struct CollectiveOp
-  {
-    virtual void    init()                                  =0;
-    virtual void    update(const CollectiveOp& other)       =0;
-    virtual void    global(const mpi::communicator& comm)   =0;
-    virtual void    copy_from(const CollectiveOp& other)    =0;
-    virtual void    result_out(void* dest) const            =0;
-    virtual         ~CollectiveOp()                         {}
-  };
-
-  template<class T, class Op>
-  struct AllReduceOp: public CollectiveOp
-  {
-          AllReduceOp(const T& x, Op op):
-            in_(x), op_(op)         {}
-
-    void  init()                                    { out_ = in_; }
-    void  update(const CollectiveOp& other)         { out_ = op_(out_, static_cast<const AllReduceOp&>(other).in_); }
-    void  global(const mpi::communicator& comm)     { T res; mpi::all_reduce(comm, out_, res, op_); out_ = res; }
-    void  copy_from(const CollectiveOp& other)      { out_ = static_cast<const AllReduceOp&>(other).out_; }
-    void  result_out(void* dest) const              { *reinterpret_cast<T*>(dest) = out_; }
-
-    private:
-      T     in_, out_;
-      Op    op_;
-  };
-
-  template<class T>
-  struct Scratch: public CollectiveOp
-  {
-          Scratch(const T& x):
-            x_(x)                                   {}
-
-    void  init()                                    {}
-    void  update(const CollectiveOp& other)         {}
-    void  global(const mpi::communicator& comm)     {}
-    void  copy_from(const CollectiveOp& other)      {}
-    void  result_out(void* dest) const              { *reinterpret_cast<T*>(dest) = x_; }
-
-    private:
-      T     x_;
-  };
-
-}
-}
-
-#endif
diff --git a/diy/include/diy/detail/reduce/all-to-all.hpp b/diy/include/diy/detail/reduce/all-to-all.hpp
deleted file mode 100644
index 1e555db82..000000000
--- a/diy/include/diy/detail/reduce/all-to-all.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-#ifndef DIY_DETAIL_ALL_TO_ALL_HPP
-#define DIY_DETAIL_ALL_TO_ALL_HPP
-
-#include "../block_traits.hpp"
-
-namespace diy
-{
-
-namespace detail
-{
-  template<class Op>
-  struct AllToAllReduce
-  {
-    using Block = typename block_traits<Op>::type;
-
-         AllToAllReduce(const Op& op_, const Assigner& assigner):
-             op(op_)
-    {
-      for (int gid = 0; gid < assigner.nblocks(); ++gid)
-      {
-        BlockID nbr = { gid, assigner.rank(gid) };
-        all_neighbors_link.add_neighbor(nbr);
-      }
-    }
-
-    void operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
-    {
-      int k_in  = srp.in_link().size();
-      int k_out = srp.out_link().size();
-
-      if (k_in == 0 && k_out == 0)  // special case of a single block
-      {
-          ReduceProxy all_srp_out(srp, srp.block(), 0, srp.assigner(), empty_link,         all_neighbors_link);
-          ReduceProxy all_srp_in (srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
-
-          op(b, all_srp_out);
-          MemoryBuffer& in_queue = all_srp_in.incoming(all_srp_in.in_link().target(0).gid);
-          in_queue.swap(all_srp_out.outgoing(all_srp_out.out_link().target(0)));
-          in_queue.reset();
-
-          op(b, all_srp_in);
-          return;
-      }
-
-      if (k_in == 0)                // initial round
-      {
-        ReduceProxy all_srp(srp, srp.block(), 0, srp.assigner(), empty_link, all_neighbors_link);
-        op(b, all_srp);
-
-        Master::OutgoingQueues all_queues;
-        all_queues.swap(*all_srp.outgoing());       // clears out the queues and stores them locally
-
-        // enqueue outgoing
-        int group = all_srp.out_link().size() / k_out;
-        for (int i = 0; i < k_out; ++i)
-        {
-          std::pair<int,int> range(i*group, (i+1)*group);
-          srp.enqueue(srp.out_link().target(i), range);
-          for (int j = i*group; j < (i+1)*group; ++j)
-          {
-            int from = srp.gid();
-            int to   = all_srp.out_link().target(j).gid;
-            srp.enqueue(srp.out_link().target(i), std::make_pair(from, to));
-            srp.enqueue(srp.out_link().target(i), all_queues[all_srp.out_link().target(j)]);
-          }
-        }
-      } else if (k_out == 0)        // final round
-      {
-        // dequeue incoming + reorder into the correct order
-        ReduceProxy all_srp(srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
-
-        Master::IncomingQueues all_incoming;
-        all_incoming.swap(*srp.incoming());
-
-        std::pair<int, int> range;      // all the ranges should be the same
-        for (int i = 0; i < k_in; ++i)
-        {
-          int gid_in = srp.in_link().target(i).gid;
-          MemoryBuffer& in = all_incoming[gid_in];
-          load(in, range);
-          while(in)
-          {
-            std::pair<int, int> from_to;
-            load(in, from_to);
-            load(in, all_srp.incoming(from_to.first));
-            all_srp.incoming(from_to.first).reset();
-          }
-        }
-
-        op(b, all_srp);
-      } else                                        // intermediate round: reshuffle queues
-      {
-        // add up buffer sizes
-        std::vector<size_t> sizes_out(k_out, sizeof(std::pair<int,int>));
-        std::pair<int, int> range;      // all the ranges should be the same
-        for (int i = 0; i < k_in; ++i)
-        {
-          MemoryBuffer& in = srp.incoming(srp.in_link().target(i).gid);
-
-          load(in, range);
-          int group = (range.second - range.first)/k_out;
-
-          std::pair<int, int> from_to;
-          size_t s;
-          while(in)
-          {
-            diy::load(in, from_to);
-            diy::load(in, s);
-
-            int j = (from_to.second - range.first) / group;
-            sizes_out[j] += s + sizeof(size_t) + sizeof(std::pair<int,int>);
-            in.skip(s);
-          }
-          in.reset();
-        }
-
-        // reserve outgoing buffers of correct size
-        int group = (range.second - range.first)/k_out;
-        for (int i = 0; i < k_out; ++i)
-        {
-          MemoryBuffer& out = srp.outgoing(srp.out_link().target(i));
-          out.reserve(sizes_out[i]);
-
-          std::pair<int, int> out_range;
-          out_range.first  = range.first + group*i;
-          out_range.second = range.first + group*(i+1);
-          save(out, out_range);
-        }
-
-        // re-direct the queues
-        for (int i = 0; i < k_in; ++i)
-        {
-          MemoryBuffer& in = srp.incoming(srp.in_link().target(i).gid);
-
-          std::pair<int, int> range;
-          load(in, range);
-
-          std::pair<int, int> from_to;
-          while(in)
-          {
-            load(in, from_to);
-            int j = (from_to.second - range.first) / group;
-
-            MemoryBuffer& out = srp.outgoing(srp.out_link().target(j));
-            save(out, from_to);
-            MemoryBuffer::copy(in, out);
-          }
-        }
-      }
-    }
-
-    const Op&           op;
-    Link                all_neighbors_link, empty_link;
-  };
-
-  struct SkipIntermediate
-  {
-         SkipIntermediate(size_t rounds_):
-            rounds(rounds_)                                     {}
-
-    bool operator()(int round, int, const Master&) const        { if (round == 0 || round == (int) rounds) return false; return true; }
-
-    size_t  rounds;
-  };
-}
-
-}
-
-#endif
diff --git a/diy/include/diy/detail/traits.hpp b/diy/include/diy/detail/traits.hpp
deleted file mode 100644
index f47b733c8..000000000
--- a/diy/include/diy/detail/traits.hpp
+++ /dev/null
@@ -1,318 +0,0 @@
-//--------------------------------------
-// utils/traits: Additional type traits
-//--------------------------------------
-//
-//          Copyright kennytm (auraHT Ltd.) 2011.
-// Distributed under the Boost Software License, Version 1.0.
-//    (See accompanying file doc/LICENSE_1_0.txt or copy at
-//          http://www.boost.org/LICENSE_1_0.txt)
-
-/**
-
-``<utils/traits.hpp>`` --- Additional type traits
-=================================================
-
-This module provides additional type traits and related functions, missing from
-the standard library.
-
-*/
-
-#ifndef DIY_UTILS_TRAITS_HPP
-#define DIY_UTILS_TRAITS_HPP
-
-#include <cstdlib>
-#include <tuple>
-#include <functional>
-#include <type_traits>
-
-namespace diy
-{
-namespace detail {
-
-/**
-.. macro:: DECLARE_HAS_TYPE_MEMBER(member_name)
-
-    This macro declares a template ``has_member_name`` which will check whether
-    a type member ``member_name`` exists in a particular type.
-
-    Example::
-
-        DECLARE_HAS_TYPE_MEMBER(result_type)
-
-        ...
-
-        printf("%d\n", has_result_type< std::plus<int> >::value);
-        // ^ prints '1' (true)
-        printf("%d\n", has_result_type< double(*)() >::value);
-        // ^ prints '0' (false)
-*/
-#define DECLARE_HAS_TYPE_MEMBER(member_name) \
-    template <typename, typename = void> \
-    struct has_##member_name \
-    { enum { value = false }; }; \
-    template <typename T> \
-    struct has_##member_name<T, typename std::enable_if<sizeof(typename T::member_name)||true>::type> \
-    { enum { value = true }; };
-
-/**
-.. type:: struct utils::function_traits<F>
-
-    Obtain compile-time information about a function object *F*.
-
-    This template currently supports the following types:
-
-    * Normal function types (``R(T...)``), function pointers (``R(*)(T...)``)
-      and function references (``R(&)(T...)`` and ``R(&&)(T...)``).
-    * Member functions (``R(C::*)(T...)``)
-    * ``std::function<F>``
-    * Type of lambda functions, and any other types that has a unique
-      ``operator()``.
-    * Type of ``std::mem_fn`` (only for GCC's libstdc++ and LLVM's libc++).
-      Following the C++ spec, the first argument will be a raw pointer.
-*/
-template <typename T>
-struct function_traits
-    : public function_traits<decltype(&T::operator())>
-{};
-
-namespace xx_impl
-{
-    template <typename C, typename R, typename... A>
-    struct memfn_type
-    {
-        typedef typename std::conditional<
-            std::is_const<C>::value,
-            typename std::conditional<
-                std::is_volatile<C>::value,
-                R (C::*)(A...) const volatile,
-                R (C::*)(A...) const
-            >::type,
-            typename std::conditional<
-                std::is_volatile<C>::value,
-                R (C::*)(A...) volatile,
-                R (C::*)(A...)
-            >::type
-        >::type type;
-    };
-}
-
-template <typename ReturnType, typename... Args>
-struct function_traits<ReturnType(Args...)>
-{
-    /**
-    .. type:: type result_type
-
-        The type returned by calling an instance of the function object type *F*.
-    */
-    typedef ReturnType result_type;
-
-    /**
-    .. type:: type function_type
-
-        The function type (``R(T...)``).
-    */
-    typedef ReturnType function_type(Args...);
-
-    /**
-    .. type:: type member_function_type<OwnerType>
-
-        The member function type for an *OwnerType* (``R(OwnerType::*)(T...)``).
-    */
-    template <typename OwnerType>
-    using member_function_type = typename xx_impl::memfn_type<
-        typename std::remove_pointer<typename std::remove_reference<OwnerType>::type>::type,
-        ReturnType, Args...
-    >::type;
-
-    /**
-    .. data:: static const size_t arity
-
-        Number of arguments the function object will take.
-    */
-    enum { arity = sizeof...(Args) };
-
-    /**
-    .. type:: type arg<n>::type
-
-        The type of the *n*-th argument.
-    */
-    template <size_t i>
-    struct arg
-    {
-        typedef typename std::tuple_element<i, std::tuple<Args...>>::type type;
-    };
-};
-
-template <typename ReturnType, typename... Args>
-struct function_traits<ReturnType(*)(Args...)>
-    : public function_traits<ReturnType(Args...)>
-{};
-
-template <typename ClassType, typename ReturnType, typename... Args>
-struct function_traits<ReturnType(ClassType::*)(Args...)>
-    : public function_traits<ReturnType(Args...)>
-{
-    typedef ClassType& owner_type;
-};
-
-template <typename ClassType, typename ReturnType, typename... Args>
-struct function_traits<ReturnType(ClassType::*)(Args...) const>
-    : public function_traits<ReturnType(Args...)>
-{
-    typedef const ClassType& owner_type;
-};
-
-template <typename ClassType, typename ReturnType, typename... Args>
-struct function_traits<ReturnType(ClassType::*)(Args...) volatile>
-    : public function_traits<ReturnType(Args...)>
-{
-    typedef volatile ClassType& owner_type;
-};
-
-template <typename ClassType, typename ReturnType, typename... Args>
-struct function_traits<ReturnType(ClassType::*)(Args...) const volatile>
-    : public function_traits<ReturnType(Args...)>
-{
-    typedef const volatile ClassType& owner_type;
-};
-
-template <typename FunctionType>
-struct function_traits<std::function<FunctionType>>
-    : public function_traits<FunctionType>
-{};
-
-#if defined(_GLIBCXX_FUNCTIONAL)
-#define MEM_FN_SYMBOL_XX0SL7G4Z0J std::_Mem_fn
-#elif defined(_LIBCPP_FUNCTIONAL)
-#define MEM_FN_SYMBOL_XX0SL7G4Z0J std::__mem_fn
-#endif
-
-#ifdef MEM_FN_SYMBOL_XX0SL7G4Z0J
-
-template <typename R, typename C>
-struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R C::*>>
-    : public function_traits<R(C*)>
-{};
-template <typename R, typename C, typename... A>
-struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...)>>
-    : public function_traits<R(C*, A...)>
-{};
-template <typename R, typename C, typename... A>
-struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) const>>
-    : public function_traits<R(const C*, A...)>
-{};
-template <typename R, typename C, typename... A>
-struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) volatile>>
-    : public function_traits<R(volatile C*, A...)>
-{};
-template <typename R, typename C, typename... A>
-struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) const volatile>>
-    : public function_traits<R(const volatile C*, A...)>
-{};
-
-#undef MEM_FN_SYMBOL_XX0SL7G4Z0J
-#endif
-
-template <typename T>
-struct function_traits<T&> : public function_traits<T> {};
-template <typename T>
-struct function_traits<const T&> : public function_traits<T> {};
-template <typename T>
-struct function_traits<volatile T&> : public function_traits<T> {};
-template <typename T>
-struct function_traits<const volatile T&> : public function_traits<T> {};
-template <typename T>
-struct function_traits<T&&> : public function_traits<T> {};
-template <typename T>
-struct function_traits<const T&&> : public function_traits<T> {};
-template <typename T>
-struct function_traits<volatile T&&> : public function_traits<T> {};
-template <typename T>
-struct function_traits<const volatile T&&> : public function_traits<T> {};
-
-
-#define FORWARD_RES_8QR485JMSBT \
-    typename std::conditional< \
-        std::is_lvalue_reference<R>::value, \
-        T&, \
-        typename std::remove_reference<T>::type&& \
-    >::type
-
-/**
-.. function:: auto utils::forward_like<Like, T>(T&& t) noexcept
-
-    Forward the reference *t* like the type of *Like*. That means, if *Like* is
-    an lvalue (reference), this function will return an lvalue reference of *t*.
-    Otherwise, if *Like* is an rvalue, this function will return an rvalue
-    reference of *t*.
-
-    This is mainly used to propagate the expression category (lvalue/rvalue) of
-    a member of *Like*, generalizing ``std::forward``.
-*/
-template <typename R, typename T>
-FORWARD_RES_8QR485JMSBT forward_like(T&& input) noexcept
-{
-    return static_cast<FORWARD_RES_8QR485JMSBT>(input);
-}
-
-#undef FORWARD_RES_8QR485JMSBT
-
-/**
-.. type:: struct utils::copy_cv<From, To>
-
-    Copy the CV qualifier between the two types. For example,
-    ``utils::copy_cv<const int, double>::type`` will become ``const double``.
-*/
-template <typename From, typename To>
-struct copy_cv
-{
-private:
-    typedef typename std::remove_cv<To>::type raw_To;
-    typedef typename std::conditional<std::is_const<From>::value,
-                                      const raw_To, raw_To>::type const_raw_To;
-public:
-    /**
-    .. type:: type type
-
-        Result of cv-copying.
-    */
-    typedef typename std::conditional<std::is_volatile<From>::value,
-                                      volatile const_raw_To, const_raw_To>::type type;
-};
-
-/**
-.. type:: struct utils::pointee<T>
-
-    Returns the type by derefering an instance of *T*. This is a generalization
-    of ``std::remove_pointer``, that it also works with iterators.
-*/
-template <typename T>
-struct pointee
-{
-    /**
-    .. type:: type type
-
-        Result of dereferencing.
-    */
-    typedef typename std::remove_reference<decltype(*std::declval<T>())>::type type;
-};
-
-/**
-.. function:: std::add_rvalue_reference<T>::type utils::rt_val<T>() noexcept
-
-    Returns a value of type *T*. It is guaranteed to do nothing and will not
-    throw a compile-time error, but using the returned result will cause
-    undefined behavior.
-*/
-template <typename T>
-typename std::add_rvalue_reference<T>::type rt_val() noexcept
-{
-    return std::move(*static_cast<T*>(nullptr));
-}
-
-}
-
-}
-
-#endif
-
diff --git a/diy/include/diy/fmt/format.cc b/diy/include/diy/fmt/format.cc
deleted file mode 100644
index ae5d11034..000000000
--- a/diy/include/diy/fmt/format.cc
+++ /dev/null
@@ -1,935 +0,0 @@
-/*
- Formatting library for C++
-
- Copyright (c) 2012 - 2016, Victor Zverovich
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice, this
-    list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "format.h"
-
-#include <string.h>
-
-#include <cctype>
-#include <cerrno>
-#include <climits>
-#include <cmath>
-#include <cstdarg>
-#include <cstddef>  // for std::ptrdiff_t
-
-#if defined(_WIN32) && defined(__MINGW32__)
-# include <cstring>
-#endif
-
-#if FMT_USE_WINDOWS_H
-# if defined(NOMINMAX) || defined(FMT_WIN_MINMAX)
-#  include <windows.h>
-# else
-#  define NOMINMAX
-#  include <windows.h>
-#  undef NOMINMAX
-# endif
-#endif
-
-using fmt::internal::Arg;
-
-#if FMT_EXCEPTIONS
-# define FMT_TRY try
-# define FMT_CATCH(x) catch (x)
-#else
-# define FMT_TRY if (true)
-# define FMT_CATCH(x) if (false)
-#endif
-
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4127)  // conditional expression is constant
-# pragma warning(disable: 4702)  // unreachable code
-// Disable deprecation warning for strerror. The latter is not called but
-// MSVC fails to detect it.
-# pragma warning(disable: 4996)
-#endif
-
-// Dummy implementations of strerror_r and strerror_s called if corresponding
-// system functions are not available.
-static inline fmt::internal::Null<> strerror_r(int, char *, ...) {
-  return fmt::internal::Null<>();
-}
-static inline fmt::internal::Null<> strerror_s(char *, std::size_t, ...) {
-  return fmt::internal::Null<>();
-}
-
-namespace fmt {
-namespace {
-
-#ifndef _MSC_VER
-# define FMT_SNPRINTF snprintf
-#else  // _MSC_VER
-inline int fmt_snprintf(char *buffer, size_t size, const char *format, ...) {
-  va_list args;
-  va_start(args, format);
-  int result = vsnprintf_s(buffer, size, _TRUNCATE, format, args);
-  va_end(args);
-  return result;
-}
-# define FMT_SNPRINTF fmt_snprintf
-#endif  // _MSC_VER
-
-#if defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
-# define FMT_SWPRINTF snwprintf
-#else
-# define FMT_SWPRINTF swprintf
-#endif // defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
-
-// Checks if a value fits in int - used to avoid warnings about comparing
-// signed and unsigned integers.
-template <bool IsSigned>
-struct IntChecker {
-  template <typename T>
-  static bool fits_in_int(T value) {
-    unsigned max = INT_MAX;
-    return value <= max;
-  }
-  static bool fits_in_int(bool) { return true; }
-};
-
-template <>
-struct IntChecker<true> {
-  template <typename T>
-  static bool fits_in_int(T value) {
-    return value >= INT_MIN && value <= INT_MAX;
-  }
-  static bool fits_in_int(int) { return true; }
-};
-
-const char RESET_COLOR[] = "\x1b[0m";
-
-typedef void (*FormatFunc)(Writer &, int, StringRef);
-
-// Portable thread-safe version of strerror.
-// Sets buffer to point to a string describing the error code.
-// This can be either a pointer to a string stored in buffer,
-// or a pointer to some static immutable string.
-// Returns one of the following values:
-//   0      - success
-//   ERANGE - buffer is not large enough to store the error message
-//   other  - failure
-// Buffer should be at least of size 1.
-int safe_strerror(
-    int error_code, char *&buffer, std::size_t buffer_size) FMT_NOEXCEPT {
-  FMT_ASSERT(buffer != 0 && buffer_size != 0, "invalid buffer");
-
-  class StrError {
-   private:
-    int error_code_;
-    char *&buffer_;
-    std::size_t buffer_size_;
-
-    // A noop assignment operator to avoid bogus warnings.
-    void operator=(const StrError &) {}
-
-    // Handle the result of XSI-compliant version of strerror_r.
-    int handle(int result) {
-      // glibc versions before 2.13 return result in errno.
-      return result == -1 ? errno : result;
-    }
-
-    // Handle the result of GNU-specific version of strerror_r.
-    int handle(char *message) {
-      // If the buffer is full then the message is probably truncated.
-      if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
-        return ERANGE;
-      buffer_ = message;
-      return 0;
-    }
-
-    // Handle the case when strerror_r is not available.
-    int handle(internal::Null<>) {
-      return fallback(strerror_s(buffer_, buffer_size_, error_code_));
-    }
-
-    // Fallback to strerror_s when strerror_r is not available.
-    int fallback(int result) {
-      // If the buffer is full then the message is probably truncated.
-      return result == 0 && strlen(buffer_) == buffer_size_ - 1 ?
-            ERANGE : result;
-    }
-
-    // Fallback to strerror if strerror_r and strerror_s are not available.
-    int fallback(internal::Null<>) {
-      errno = 0;
-      buffer_ = strerror(error_code_);
-      return errno;
-    }
-
-   public:
-    StrError(int err_code, char *&buf, std::size_t buf_size)
-      : error_code_(err_code), buffer_(buf), buffer_size_(buf_size) {}
-
-    int run() {
-      strerror_r(0, 0, "");  // Suppress a warning about unused strerror_r.
-      return handle(strerror_r(error_code_, buffer_, buffer_size_));
-    }
-  };
-  return StrError(error_code, buffer, buffer_size).run();
-}
-
-void format_error_code(Writer &out, int error_code,
-                       StringRef message) FMT_NOEXCEPT {
-  // Report error code making sure that the output fits into
-  // INLINE_BUFFER_SIZE to avoid dynamic memory allocation and potential
-  // bad_alloc.
-  out.clear();
-  static const char SEP[] = ": ";
-  static const char ERROR_STR[] = "error ";
-  // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
-  std::size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
-  typedef internal::IntTraits<int>::MainType MainType;
-  MainType abs_value = static_cast<MainType>(error_code);
-  if (internal::is_negative(error_code)) {
-    abs_value = 0 - abs_value;
-    ++error_code_size;
-  }
-  error_code_size += internal::count_digits(abs_value);
-  if (message.size() <= internal::INLINE_BUFFER_SIZE - error_code_size)
-    out << message << SEP;
-  out << ERROR_STR << error_code;
-  assert(out.size() <= internal::INLINE_BUFFER_SIZE);
-}
-
-void report_error(FormatFunc func, int error_code,
-                  StringRef message) FMT_NOEXCEPT {
-  MemoryWriter full_message;
-  func(full_message, error_code, message);
-  // Use Writer::data instead of Writer::c_str to avoid potential memory
-  // allocation.
-  std::fwrite(full_message.data(), full_message.size(), 1, stderr);
-  std::fputc('\n', stderr);
-}
-
-// IsZeroInt::visit(arg) returns true iff arg is a zero integer.
-class IsZeroInt : public ArgVisitor<IsZeroInt, bool> {
- public:
-  template <typename T>
-  bool visit_any_int(T value) { return value == 0; }
-};
-
-// Checks if an argument is a valid printf width specifier and sets
-// left alignment if it is negative.
-class WidthHandler : public ArgVisitor<WidthHandler, unsigned> {
- private:
-  FormatSpec &spec_;
-
-  FMT_DISALLOW_COPY_AND_ASSIGN(WidthHandler);
-
- public:
-  explicit WidthHandler(FormatSpec &spec) : spec_(spec) {}
-
-  void report_unhandled_arg() {
-    FMT_THROW(FormatError("width is not integer"));
-  }
-
-  template <typename T>
-  unsigned visit_any_int(T value) {
-    typedef typename internal::IntTraits<T>::MainType UnsignedType;
-    UnsignedType width = static_cast<UnsignedType>(value);
-    if (internal::is_negative(value)) {
-      spec_.align_ = ALIGN_LEFT;
-      width = 0 - width;
-    }
-    if (width > INT_MAX)
-      FMT_THROW(FormatError("number is too big"));
-    return static_cast<unsigned>(width);
-  }
-};
-
-class PrecisionHandler : public ArgVisitor<PrecisionHandler, int> {
- public:
-  void report_unhandled_arg() {
-    FMT_THROW(FormatError("precision is not integer"));
-  }
-
-  template <typename T>
-  int visit_any_int(T value) {
-    if (!IntChecker<std::numeric_limits<T>::is_signed>::fits_in_int(value))
-      FMT_THROW(FormatError("number is too big"));
-    return static_cast<int>(value);
-  }
-};
-
-template <typename T, typename U>
-struct is_same {
-  enum { value = 0 };
-};
-
-template <typename T>
-struct is_same<T, T> {
-  enum { value = 1 };
-};
-
-// An argument visitor that converts an integer argument to T for printf,
-// if T is an integral type. If T is void, the argument is converted to
-// corresponding signed or unsigned type depending on the type specifier:
-// 'd' and 'i' - signed, other - unsigned)
-template <typename T = void>
-class ArgConverter : public ArgVisitor<ArgConverter<T>, void> {
- private:
-  internal::Arg &arg_;
-  wchar_t type_;
-
-  FMT_DISALLOW_COPY_AND_ASSIGN(ArgConverter);
-
- public:
-  ArgConverter(internal::Arg &arg, wchar_t type)
-    : arg_(arg), type_(type) {}
-
-  void visit_bool(bool value) {
-    if (type_ != 's')
-      visit_any_int(value);
-  }
-
-  template <typename U>
-  void visit_any_int(U value) {
-    bool is_signed = type_ == 'd' || type_ == 'i';
-    using internal::Arg;
-    typedef typename internal::Conditional<
-        is_same<T, void>::value, U, T>::type TargetType;
-    if (sizeof(TargetType) <= sizeof(int)) {
-      // Extra casts are used to silence warnings.
-      if (is_signed) {
-        arg_.type = Arg::INT;
-        arg_.int_value = static_cast<int>(static_cast<TargetType>(value));
-      } else {
-        arg_.type = Arg::UINT;
-        typedef typename internal::MakeUnsigned<TargetType>::Type Unsigned;
-        arg_.uint_value = static_cast<unsigned>(static_cast<Unsigned>(value));
-      }
-    } else {
-      if (is_signed) {
-        arg_.type = Arg::LONG_LONG;
-        // glibc's printf doesn't sign extend arguments of smaller types:
-        //   std::printf("%lld", -42);  // prints "4294967254"
-        // but we don't have to do the same because it's a UB.
-        arg_.long_long_value = static_cast<LongLong>(value);
-      } else {
-        arg_.type = Arg::ULONG_LONG;
-        arg_.ulong_long_value =
-            static_cast<typename internal::MakeUnsigned<U>::Type>(value);
-      }
-    }
-  }
-};
-
-// Converts an integer argument to char for printf.
-class CharConverter : public ArgVisitor<CharConverter, void> {
- private:
-  internal::Arg &arg_;
-
-  FMT_DISALLOW_COPY_AND_ASSIGN(CharConverter);
-
- public:
-  explicit CharConverter(internal::Arg &arg) : arg_(arg) {}
-
-  template <typename T>
-  void visit_any_int(T value) {
-    arg_.type = internal::Arg::CHAR;
-    arg_.int_value = static_cast<char>(value);
-  }
-};
-}  // namespace
-
-namespace internal {
-
-template <typename Char>
-class PrintfArgFormatter :
-    public ArgFormatterBase<PrintfArgFormatter<Char>, Char> {
-
-  void write_null_pointer() {
-    this->spec().type_ = 0;
-    this->write("(nil)");
-  }
-
-  typedef ArgFormatterBase<PrintfArgFormatter<Char>, Char> Base;
-
- public:
-  PrintfArgFormatter(BasicWriter<Char> &w, FormatSpec &s)
-  : ArgFormatterBase<PrintfArgFormatter<Char>, Char>(w, s) {}
-
-  void visit_bool(bool value) {
-    FormatSpec &fmt_spec = this->spec();
-    if (fmt_spec.type_ != 's')
-      return this->visit_any_int(value);
-    fmt_spec.type_ = 0;
-    this->write(value);
-  }
-
-  void visit_char(int value) {
-    const FormatSpec &fmt_spec = this->spec();
-    BasicWriter<Char> &w = this->writer();
-    if (fmt_spec.type_ && fmt_spec.type_ != 'c')
-      w.write_int(value, fmt_spec);
-    typedef typename BasicWriter<Char>::CharPtr CharPtr;
-    CharPtr out = CharPtr();
-    if (fmt_spec.width_ > 1) {
-      Char fill = ' ';
-      out = w.grow_buffer(fmt_spec.width_);
-      if (fmt_spec.align_ != ALIGN_LEFT) {
-        std::fill_n(out, fmt_spec.width_ - 1, fill);
-        out += fmt_spec.width_ - 1;
-      } else {
-        std::fill_n(out + 1, fmt_spec.width_ - 1, fill);
-      }
-    } else {
-      out = w.grow_buffer(1);
-    }
-    *out = static_cast<Char>(value);
-  }
-
-  void visit_cstring(const char *value) {
-    if (value)
-      Base::visit_cstring(value);
-    else if (this->spec().type_ == 'p')
-      write_null_pointer();
-    else
-      this->write("(null)");
-  }
-
-  void visit_pointer(const void *value) {
-    if (value)
-      return Base::visit_pointer(value);
-    this->spec().type_ = 0;
-    write_null_pointer();
-  }
-
-  void visit_custom(Arg::CustomValue c) {
-    BasicFormatter<Char> formatter(ArgList(), this->writer());
-    const Char format_str[] = {'}', 0};
-    const Char *format = format_str;
-    c.format(&formatter, c.value, &format);
-  }
-};
-}  // namespace internal
-}  // namespace fmt
-
-FMT_FUNC void fmt::SystemError::init(
-    int err_code, CStringRef format_str, ArgList args) {
-  error_code_ = err_code;
-  MemoryWriter w;
-  internal::format_system_error(w, err_code, format(format_str, args));
-  std::runtime_error &base = *this;
-  base = std::runtime_error(w.str());
-}
-
-template <typename T>
-int fmt::internal::CharTraits<char>::format_float(
-    char *buffer, std::size_t size, const char *format,
-    unsigned width, int precision, T value) {
-  if (width == 0) {
-    return precision < 0 ?
-        FMT_SNPRINTF(buffer, size, format, value) :
-        FMT_SNPRINTF(buffer, size, format, precision, value);
-  }
-  return precision < 0 ?
-      FMT_SNPRINTF(buffer, size, format, width, value) :
-      FMT_SNPRINTF(buffer, size, format, width, precision, value);
-}
-
-template <typename T>
-int fmt::internal::CharTraits<wchar_t>::format_float(
-    wchar_t *buffer, std::size_t size, const wchar_t *format,
-    unsigned width, int precision, T value) {
-  if (width == 0) {
-    return precision < 0 ?
-        FMT_SWPRINTF(buffer, size, format, value) :
-        FMT_SWPRINTF(buffer, size, format, precision, value);
-  }
-  return precision < 0 ?
-      FMT_SWPRINTF(buffer, size, format, width, value) :
-      FMT_SWPRINTF(buffer, size, format, width, precision, value);
-}
-
-template <typename T>
-const char fmt::internal::BasicData<T>::DIGITS[] =
-    "0001020304050607080910111213141516171819"
-    "2021222324252627282930313233343536373839"
-    "4041424344454647484950515253545556575859"
-    "6061626364656667686970717273747576777879"
-    "8081828384858687888990919293949596979899";
-
-#define FMT_POWERS_OF_10(factor) \
-  factor * 10, \
-  factor * 100, \
-  factor * 1000, \
-  factor * 10000, \
-  factor * 100000, \
-  factor * 1000000, \
-  factor * 10000000, \
-  factor * 100000000, \
-  factor * 1000000000
-
-template <typename T>
-const uint32_t fmt::internal::BasicData<T>::POWERS_OF_10_32[] = {
-  0, FMT_POWERS_OF_10(1)
-};
-
-template <typename T>
-const uint64_t fmt::internal::BasicData<T>::POWERS_OF_10_64[] = {
-  0,
-  FMT_POWERS_OF_10(1),
-  FMT_POWERS_OF_10(fmt::ULongLong(1000000000)),
-  // Multiply several constants instead of using a single long long constant
-  // to avoid warnings about C++98 not supporting long long.
-  fmt::ULongLong(1000000000) * fmt::ULongLong(1000000000) * 10
-};
-
-FMT_FUNC void fmt::internal::report_unknown_type(char code, const char *type) {
-  (void)type;
-  if (std::isprint(static_cast<unsigned char>(code))) {
-    FMT_THROW(fmt::FormatError(
-        fmt::format("unknown format code '{}' for {}", code, type)));
-  }
-  FMT_THROW(fmt::FormatError(
-      fmt::format("unknown format code '\\x{:02x}' for {}",
-        static_cast<unsigned>(code), type)));
-}
-
-#if FMT_USE_WINDOWS_H
-
-FMT_FUNC fmt::internal::UTF8ToUTF16::UTF8ToUTF16(fmt::StringRef s) {
-  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
-  if (s.size() > INT_MAX)
-    FMT_THROW(WindowsError(ERROR_INVALID_PARAMETER, ERROR_MSG));
-  int s_size = static_cast<int>(s.size());
-  int length = MultiByteToWideChar(
-      CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, 0, 0);
-  if (length == 0)
-    FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
-  buffer_.resize(length + 1);
-  length = MultiByteToWideChar(
-    CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, &buffer_[0], length);
-  if (length == 0)
-    FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
-  buffer_[length] = 0;
-}
-
-FMT_FUNC fmt::internal::UTF16ToUTF8::UTF16ToUTF8(fmt::WStringRef s) {
-  if (int error_code = convert(s)) {
-    FMT_THROW(WindowsError(error_code,
-        "cannot convert string from UTF-16 to UTF-8"));
-  }
-}
-
-FMT_FUNC int fmt::internal::UTF16ToUTF8::convert(fmt::WStringRef s) {
-  if (s.size() > INT_MAX)
-    return ERROR_INVALID_PARAMETER;
-  int s_size = static_cast<int>(s.size());
-  int length = WideCharToMultiByte(CP_UTF8, 0, s.data(), s_size, 0, 0, 0, 0);
-  if (length == 0)
-    return GetLastError();
-  buffer_.resize(length + 1);
-  length = WideCharToMultiByte(
-    CP_UTF8, 0, s.data(), s_size, &buffer_[0], length, 0, 0);
-  if (length == 0)
-    return GetLastError();
-  buffer_[length] = 0;
-  return 0;
-}
-
-FMT_FUNC void fmt::WindowsError::init(
-    int err_code, CStringRef format_str, ArgList args) {
-  error_code_ = err_code;
-  MemoryWriter w;
-  internal::format_windows_error(w, err_code, format(format_str, args));
-  std::runtime_error &base = *this;
-  base = std::runtime_error(w.str());
-}
-
-FMT_FUNC void fmt::internal::format_windows_error(
-    fmt::Writer &out, int error_code,
-    fmt::StringRef message) FMT_NOEXCEPT {
-  FMT_TRY {
-    MemoryBuffer<wchar_t, INLINE_BUFFER_SIZE> buffer;
-    buffer.resize(INLINE_BUFFER_SIZE);
-    for (;;) {
-      wchar_t *system_message = &buffer[0];
-      int result = FormatMessageW(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                  0, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
-                                  system_message, static_cast<uint32_t>(buffer.size()), 0);
-      if (result != 0) {
-        UTF16ToUTF8 utf8_message;
-        if (utf8_message.convert(system_message) == ERROR_SUCCESS) {
-          out << message << ": " << utf8_message;
-          return;
-        }
-        break;
-      }
-      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
-        break;  // Can't get error message, report error code instead.
-      buffer.resize(buffer.size() * 2);
-    }
-  } FMT_CATCH(...) {}
-  fmt::format_error_code(out, error_code, message);  // 'fmt::' is for bcc32.
-}
-
-#endif  // FMT_USE_WINDOWS_H
-
-FMT_FUNC void fmt::internal::format_system_error(
-    fmt::Writer &out, int error_code,
-    fmt::StringRef message) FMT_NOEXCEPT {
-  FMT_TRY {
-    MemoryBuffer<char, INLINE_BUFFER_SIZE> buffer;
-    buffer.resize(INLINE_BUFFER_SIZE);
-    for (;;) {
-      char *system_message = &buffer[0];
-      int result = safe_strerror(error_code, system_message, buffer.size());
-      if (result == 0) {
-        out << message << ": " << system_message;
-        return;
-      }
-      if (result != ERANGE)
-        break;  // Can't get error message, report error code instead.
-      buffer.resize(buffer.size() * 2);
-    }
-  } FMT_CATCH(...) {}
-  fmt::format_error_code(out, error_code, message);  // 'fmt::' is for bcc32.
-}
-
-template <typename Char>
-void fmt::internal::ArgMap<Char>::init(const ArgList &args) {
-  if (!map_.empty())
-    return;
-  typedef internal::NamedArg<Char> NamedArg;
-  const NamedArg *named_arg = 0;
-  bool use_values =
-      args.type(ArgList::MAX_PACKED_ARGS - 1) == internal::Arg::NONE;
-  if (use_values) {
-    for (unsigned i = 0;/*nothing*/; ++i) {
-      internal::Arg::Type arg_type = args.type(i);
-      switch (arg_type) {
-      case internal::Arg::NONE:
-        return;
-      case internal::Arg::NAMED_ARG:
-        named_arg = static_cast<const NamedArg*>(args.values_[i].pointer);
-        map_.push_back(Pair(named_arg->name, *named_arg));
-        break;
-      default:
-        /*nothing*/;
-      }
-    }
-    return;
-  }
-  for (unsigned i = 0; i != ArgList::MAX_PACKED_ARGS; ++i) {
-    internal::Arg::Type arg_type = args.type(i);
-    if (arg_type == internal::Arg::NAMED_ARG) {
-      named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
-      map_.push_back(Pair(named_arg->name, *named_arg));
-    }
-  }
-  for (unsigned i = ArgList::MAX_PACKED_ARGS;/*nothing*/; ++i) {
-    switch (args.args_[i].type) {
-    case internal::Arg::NONE:
-      return;
-    case internal::Arg::NAMED_ARG:
-      named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
-      map_.push_back(Pair(named_arg->name, *named_arg));
-      break;
-    default:
-      /*nothing*/;
-    }
-  }
-}
-
-template <typename Char>
-void fmt::internal::FixedBuffer<Char>::grow(std::size_t) {
-  FMT_THROW(std::runtime_error("buffer overflow"));
-}
-
-FMT_FUNC Arg fmt::internal::FormatterBase::do_get_arg(
-    unsigned arg_index, const char *&error) {
-  Arg arg = args_[arg_index];
-  switch (arg.type) {
-  case Arg::NONE:
-    error = "argument index out of range";
-    break;
-  case Arg::NAMED_ARG:
-    arg = *static_cast<const internal::Arg*>(arg.pointer);
-    break;
-  default:
-    /*nothing*/;
-  }
-  return arg;
-}
-
-template <typename Char>
-void fmt::internal::PrintfFormatter<Char>::parse_flags(
-    FormatSpec &spec, const Char *&s) {
-  for (;;) {
-    switch (*s++) {
-      case '-':
-        spec.align_ = ALIGN_LEFT;
-        break;
-      case '+':
-        spec.flags_ |= SIGN_FLAG | PLUS_FLAG;
-        break;
-      case '0':
-        spec.fill_ = '0';
-        break;
-      case ' ':
-        spec.flags_ |= SIGN_FLAG;
-        break;
-      case '#':
-        spec.flags_ |= HASH_FLAG;
-        break;
-      default:
-        --s;
-        return;
-    }
-  }
-}
-
-template <typename Char>
-Arg fmt::internal::PrintfFormatter<Char>::get_arg(
-    const Char *s, unsigned arg_index) {
-  (void)s;
-  const char *error = 0;
-  Arg arg = arg_index == UINT_MAX ?
-    next_arg(error) : FormatterBase::get_arg(arg_index - 1, error);
-  if (error)
-    FMT_THROW(FormatError(!*s ? "invalid format string" : error));
-  return arg;
-}
-
-template <typename Char>
-unsigned fmt::internal::PrintfFormatter<Char>::parse_header(
-  const Char *&s, FormatSpec &spec) {
-  unsigned arg_index = UINT_MAX;
-  Char c = *s;
-  if (c >= '0' && c <= '9') {
-    // Parse an argument index (if followed by '$') or a width possibly
-    // preceded with '0' flag(s).
-    unsigned value = parse_nonnegative_int(s);
-    if (*s == '$') {  // value is an argument index
-      ++s;
-      arg_index = value;
-    } else {
-      if (c == '0')
-        spec.fill_ = '0';
-      if (value != 0) {
-        // Nonzero value means that we parsed width and don't need to
-        // parse it or flags again, so return now.
-        spec.width_ = value;
-        return arg_index;
-      }
-    }
-  }
-  parse_flags(spec, s);
-  // Parse width.
-  if (*s >= '0' && *s <= '9') {
-    spec.width_ = parse_nonnegative_int(s);
-  } else if (*s == '*') {
-    ++s;
-    spec.width_ = WidthHandler(spec).visit(get_arg(s));
-  }
-  return arg_index;
-}
-
-template <typename Char>
-void fmt::internal::PrintfFormatter<Char>::format(
-    BasicWriter<Char> &writer, BasicCStringRef<Char> format_str) {
-  const Char *start = format_str.c_str();
-  const Char *s = start;
-  while (*s) {
-    Char c = *s++;
-    if (c != '%') continue;
-    if (*s == c) {
-      write(writer, start, s);
-      start = ++s;
-      continue;
-    }
-    write(writer, start, s - 1);
-
-    FormatSpec spec;
-    spec.align_ = ALIGN_RIGHT;
-
-    // Parse argument index, flags and width.
-    unsigned arg_index = parse_header(s, spec);
-
-    // Parse precision.
-    if (*s == '.') {
-      ++s;
-      if ('0' <= *s && *s <= '9') {
-        spec.precision_ = static_cast<int>(parse_nonnegative_int(s));
-      } else if (*s == '*') {
-        ++s;
-        spec.precision_ = PrecisionHandler().visit(get_arg(s));
-      }
-    }
-
-    Arg arg = get_arg(s, arg_index);
-    if (spec.flag(HASH_FLAG) && IsZeroInt().visit(arg))
-      spec.flags_ &= ~to_unsigned<int>(HASH_FLAG);
-    if (spec.fill_ == '0') {
-      if (arg.type <= Arg::LAST_NUMERIC_TYPE)
-        spec.align_ = ALIGN_NUMERIC;
-      else
-        spec.fill_ = ' ';  // Ignore '0' flag for non-numeric types.
-    }
-
-    // Parse length and convert the argument to the required type.
-    switch (*s++) {
-    case 'h':
-      if (*s == 'h')
-        ArgConverter<signed char>(arg, *++s).visit(arg);
-      else
-        ArgConverter<short>(arg, *s).visit(arg);
-      break;
-    case 'l':
-      if (*s == 'l')
-        ArgConverter<fmt::LongLong>(arg, *++s).visit(arg);
-      else
-        ArgConverter<long>(arg, *s).visit(arg);
-      break;
-    case 'j':
-      ArgConverter<intmax_t>(arg, *s).visit(arg);
-      break;
-    case 'z':
-      ArgConverter<std::size_t>(arg, *s).visit(arg);
-      break;
-    case 't':
-      ArgConverter<std::ptrdiff_t>(arg, *s).visit(arg);
-      break;
-    case 'L':
-      // printf produces garbage when 'L' is omitted for long double, no
-      // need to do the same.
-      break;
-    default:
-      --s;
-      ArgConverter<void>(arg, *s).visit(arg);
-    }
-
-    // Parse type.
-    if (!*s)
-      FMT_THROW(FormatError("invalid format string"));
-    spec.type_ = static_cast<char>(*s++);
-    if (arg.type <= Arg::LAST_INTEGER_TYPE) {
-      // Normalize type.
-      switch (spec.type_) {
-      case 'i': case 'u':
-        spec.type_ = 'd';
-        break;
-      case 'c':
-        // TODO: handle wchar_t
-        CharConverter(arg).visit(arg);
-        break;
-      }
-    }
-
-    start = s;
-
-    // Format argument.
-    internal::PrintfArgFormatter<Char>(writer, spec).visit(arg);
-  }
-  write(writer, start, s);
-}
-
-FMT_FUNC void fmt::report_system_error(
-    int error_code, fmt::StringRef message) FMT_NOEXCEPT {
-  // 'fmt::' is for bcc32.
-  fmt::report_error(internal::format_system_error, error_code, message);
-}
-
-#if FMT_USE_WINDOWS_H
-FMT_FUNC void fmt::report_windows_error(
-    int error_code, fmt::StringRef message) FMT_NOEXCEPT {
-  // 'fmt::' is for bcc32.
-  fmt::report_error(internal::format_windows_error, error_code, message);
-}
-#endif
-
-FMT_FUNC void fmt::print(std::FILE *f, CStringRef format_str, ArgList args) {
-  MemoryWriter w;
-  w.write(format_str, args);
-  std::fwrite(w.data(), 1, w.size(), f);
-}
-
-FMT_FUNC void fmt::print(CStringRef format_str, ArgList args) {
-  print(stdout, format_str, args);
-}
-
-FMT_FUNC void fmt::print_colored(Color c, CStringRef format, ArgList args) {
-  char escape[] = "\x1b[30m";
-  escape[3] = static_cast<char>('0' + c);
-  std::fputs(escape, stdout);
-  print(format, args);
-  std::fputs(RESET_COLOR, stdout);
-}
-
-FMT_FUNC int fmt::fprintf(std::FILE *f, CStringRef format, ArgList args) {
-  MemoryWriter w;
-  printf(w, format, args);
-  std::size_t size = w.size();
-  return std::fwrite(w.data(), 1, size, f) < size ? -1 : static_cast<int>(size);
-}
-
-#ifndef FMT_HEADER_ONLY
-
-template struct fmt::internal::BasicData<void>;
-
-// Explicit instantiations for char.
-
-template void fmt::internal::FixedBuffer<char>::grow(std::size_t);
-
-template void fmt::internal::ArgMap<char>::init(const fmt::ArgList &args);
-
-template void fmt::internal::PrintfFormatter<char>::format(
-  BasicWriter<char> &writer, CStringRef format);
-
-template int fmt::internal::CharTraits<char>::format_float(
-    char *buffer, std::size_t size, const char *format,
-    unsigned width, int precision, double value);
-
-template int fmt::internal::CharTraits<char>::format_float(
-    char *buffer, std::size_t size, const char *format,
-    unsigned width, int precision, long double value);
-
-// Explicit instantiations for wchar_t.
-
-template void fmt::internal::FixedBuffer<wchar_t>::grow(std::size_t);
-
-template void fmt::internal::ArgMap<wchar_t>::init(const fmt::ArgList &args);
-
-template void fmt::internal::PrintfFormatter<wchar_t>::format(
-    BasicWriter<wchar_t> &writer, WCStringRef format);
-
-template int fmt::internal::CharTraits<wchar_t>::format_float(
-    wchar_t *buffer, std::size_t size, const wchar_t *format,
-    unsigned width, int precision, double value);
-
-template int fmt::internal::CharTraits<wchar_t>::format_float(
-    wchar_t *buffer, std::size_t size, const wchar_t *format,
-    unsigned width, int precision, long double value);
-
-#endif  // FMT_HEADER_ONLY
-
-#ifdef _MSC_VER
-# pragma warning(pop)
-#endif
diff --git a/diy/include/diy/fmt/format.h b/diy/include/diy/fmt/format.h
deleted file mode 100644
index 0ca1576b8..000000000
--- a/diy/include/diy/fmt/format.h
+++ /dev/null
@@ -1,3834 +0,0 @@
-/*
- Formatting library for C++
-
- Copyright (c) 2012 - 2016, Victor Zverovich
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice, this
-    list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef FMT_FORMAT_H_
-#define FMT_FORMAT_H_
-
-#define FMT_HEADER_ONLY     // Added by diy for header-only usage
-
-#include <cassert>
-#include <clocale>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <limits>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <vector>
-#include <utility>
-
-#ifdef _SECURE_SCL
-# define FMT_SECURE_SCL _SECURE_SCL
-#else
-# define FMT_SECURE_SCL 0
-#endif
-
-#if FMT_SECURE_SCL
-# include <iterator>
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-typedef unsigned __int32 uint32_t;
-typedef unsigned __int64 uint64_t;
-typedef __int64          intmax_t;
-#else
-#include <stdint.h>
-#endif
-
-#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
-# ifdef FMT_EXPORT
-#  define FMT_API __declspec(dllexport)
-# elif defined(FMT_SHARED)
-#  define FMT_API __declspec(dllimport)
-# endif
-#endif
-#ifndef FMT_API
-# define FMT_API
-#endif
-
-#ifdef __GNUC__
-# define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-# define FMT_GCC_EXTENSION __extension__
-# if FMT_GCC_VERSION >= 406
-#  pragma GCC diagnostic push
-// Disable the warning about "long long" which is sometimes reported even
-// when using __extension__.
-#  pragma GCC diagnostic ignored "-Wlong-long"
-// Disable the warning about declaration shadowing because it affects too
-// many valid cases.
-#  pragma GCC diagnostic ignored "-Wshadow"
-// Disable the warning about implicit conversions that may change the sign of
-// an integer; silencing it otherwise would require many explicit casts.
-#  pragma GCC diagnostic ignored "-Wsign-conversion"
-# endif
-# if __cplusplus >= 201103L || defined __GXX_EXPERIMENTAL_CXX0X__
-#  define FMT_HAS_GXX_CXX11 1
-# endif
-#else
-# define FMT_GCC_EXTENSION
-#endif
-
-#if defined(__INTEL_COMPILER)
-# define FMT_ICC_VERSION __INTEL_COMPILER
-#elif defined(__ICL)
-# define FMT_ICC_VERSION __ICL
-#endif
-
-#if defined(__clang__) && !defined(FMT_ICC_VERSION)
-# pragma clang diagnostic push
-# pragma clang diagnostic ignored "-Wdocumentation"
-#endif
-
-#ifdef __GNUC_LIBSTD__
-# define FMT_GNUC_LIBSTD_VERSION (__GNUC_LIBSTD__ * 100 + __GNUC_LIBSTD_MINOR__)
-#endif
-
-#ifdef __has_feature
-# define FMT_HAS_FEATURE(x) __has_feature(x)
-#else
-# define FMT_HAS_FEATURE(x) 0
-#endif
-
-#ifdef __has_builtin
-# define FMT_HAS_BUILTIN(x) __has_builtin(x)
-#else
-# define FMT_HAS_BUILTIN(x) 0
-#endif
-
-#ifdef __has_cpp_attribute
-# define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#else
-# define FMT_HAS_CPP_ATTRIBUTE(x) 0
-#endif
-
-#ifndef FMT_USE_VARIADIC_TEMPLATES
-// Variadic templates are available in GCC since version 4.4
-// (http://gcc.gnu.org/projects/cxx0x.html) and in Visual C++
-// since version 2013.
-# define FMT_USE_VARIADIC_TEMPLATES \
-   (FMT_HAS_FEATURE(cxx_variadic_templates) || \
-       (FMT_GCC_VERSION >= 404 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1800)
-#endif
-
-#ifndef FMT_USE_RVALUE_REFERENCES
-// Don't use rvalue references when compiling with clang and an old libstdc++
-// as the latter doesn't provide std::move.
-# if defined(FMT_GNUC_LIBSTD_VERSION) && FMT_GNUC_LIBSTD_VERSION <= 402
-#  define FMT_USE_RVALUE_REFERENCES 0
-# else
-#  define FMT_USE_RVALUE_REFERENCES \
-    (FMT_HAS_FEATURE(cxx_rvalue_references) || \
-        (FMT_GCC_VERSION >= 403 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1600)
-# endif
-#endif
-
-#if FMT_USE_RVALUE_REFERENCES
-# include <utility>  // for std::move
-#endif
-
-// Check if exceptions are disabled.
-#if defined(__GNUC__) && !defined(__EXCEPTIONS)
-# define FMT_EXCEPTIONS 0
-#endif
-#if defined(_MSC_VER) && !_HAS_EXCEPTIONS
-# define FMT_EXCEPTIONS 0
-#endif
-#ifndef FMT_EXCEPTIONS
-# define FMT_EXCEPTIONS 1
-#endif
-
-#ifndef FMT_THROW
-# if FMT_EXCEPTIONS
-#  define FMT_THROW(x) throw x
-# else
-#  define FMT_THROW(x) assert(false)
-# endif
-#endif
-
-// Define FMT_USE_NOEXCEPT to make fmt use noexcept (C++11 feature).
-#ifndef FMT_USE_NOEXCEPT
-# define FMT_USE_NOEXCEPT 0
-#endif
-
-#ifndef FMT_NOEXCEPT
-# if FMT_EXCEPTIONS
-#  if FMT_USE_NOEXCEPT || FMT_HAS_FEATURE(cxx_noexcept) || \
-    (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || \
-    _MSC_VER >= 1900
-#   define FMT_NOEXCEPT noexcept
-#  else
-#   define FMT_NOEXCEPT throw()
-#  endif
-# else
-#  define FMT_NOEXCEPT
-# endif
-#endif
-
-// A macro to disallow the copy constructor and operator= functions
-// This should be used in the private: declarations for a class
-#ifndef FMT_USE_DELETED_FUNCTIONS
-# define FMT_USE_DELETED_FUNCTIONS 0
-#endif
-
-#if FMT_USE_DELETED_FUNCTIONS || FMT_HAS_FEATURE(cxx_deleted_functions) || \
-  (FMT_GCC_VERSION >= 404 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1800
-# define FMT_DELETED_OR_UNDEFINED  = delete
-# define FMT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-    TypeName(const TypeName&) = delete; \
-    TypeName& operator=(const TypeName&) = delete
-#else
-# define FMT_DELETED_OR_UNDEFINED
-# define FMT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-    TypeName(const TypeName&); \
-    TypeName& operator=(const TypeName&)
-#endif
-
-#ifndef FMT_USE_USER_DEFINED_LITERALS
-// All compilers which support UDLs also support variadic templates. This
-// makes the fmt::literals implementation easier. However, an explicit check
-// for variadic templates is added here just in case.
-// For Intel's compiler both it and the system gcc/msc must support UDLs.
-# define FMT_USE_USER_DEFINED_LITERALS \
-   FMT_USE_VARIADIC_TEMPLATES && FMT_USE_RVALUE_REFERENCES && \
-   (FMT_HAS_FEATURE(cxx_user_literals) || \
-       (FMT_GCC_VERSION >= 407 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1900) && \
-   (!defined(FMT_ICC_VERSION) || FMT_ICC_VERSION >= 1500)
-#endif
-
-#ifndef FMT_ASSERT
-# define FMT_ASSERT(condition, message) assert((condition) && message)
-#endif
-
-
-#if FMT_GCC_VERSION >= 400 || FMT_HAS_BUILTIN(__builtin_clz)
-# define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
-#endif
-
-#if FMT_GCC_VERSION >= 400 || FMT_HAS_BUILTIN(__builtin_clzll)
-# define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
-#endif
-
-// Some compilers masquerade as both MSVC and GCC-likes or 
-// otherwise support __builtin_clz and __builtin_clzll, so
-// only define FMT_BUILTIN_CLZ using the MSVC intrinsics
-// if the clz and clzll builtins are not available.
-#if defined(_MSC_VER) && !defined(FMT_BUILTIN_CLZLL)
-# include <intrin.h>  // _BitScanReverse, _BitScanReverse64
-
-namespace fmt {
-namespace internal {
-# pragma intrinsic(_BitScanReverse)
-inline uint32_t clz(uint32_t x) {
-  unsigned long r = 0;
-  _BitScanReverse(&r, x);
-
-  assert(x != 0);
-  // Static analysis complains about using uninitialized data
-  // "r", but the only way that can happen is if "x" is 0, 
-  // which the callers guarantee to not happen.
-# pragma warning(suppress: 6102)
-  return 31 - r;
-}
-# define FMT_BUILTIN_CLZ(n) fmt::internal::clz(n)
-
-# ifdef _WIN64
-#  pragma intrinsic(_BitScanReverse64)
-# endif
-
-inline uint32_t clzll(uint64_t x) {
-  unsigned long r = 0;
-# ifdef _WIN64
-  _BitScanReverse64(&r, x);
-# else
-  // Scan the high 32 bits.
-  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32)))
-    return 63 - (r + 32);
-
-  // Scan the low 32 bits.
-  _BitScanReverse(&r, static_cast<uint32_t>(x));
-# endif
-
-  assert(x != 0);
-  // Static analysis complains about using uninitialized data
-  // "r", but the only way that can happen is if "x" is 0, 
-  // which the callers guarantee to not happen.
-# pragma warning(suppress: 6102)
-  return 63 - r;
-}
-# define FMT_BUILTIN_CLZLL(n) fmt::internal::clzll(n)
-}
-}
-#endif
-
-namespace fmt {
-namespace internal {
-struct DummyInt {
-  int data[2];
-  operator int() const { return 0; }
-};
-typedef std::numeric_limits<fmt::internal::DummyInt> FPUtil;
-
-// Dummy implementations of system functions such as signbit and ecvt called
-// if the latter are not available.
-inline DummyInt signbit(...) { return DummyInt(); }
-inline DummyInt _ecvt_s(...) { return DummyInt(); }
-inline DummyInt isinf(...) { return DummyInt(); }
-inline DummyInt _finite(...) { return DummyInt(); }
-inline DummyInt isnan(...) { return DummyInt(); }
-inline DummyInt _isnan(...) { return DummyInt(); }
-
-// A helper function to suppress bogus "conditional expression is constant"
-// warnings.
-template <typename T>
-inline T check(T value) { return value; }
-}
-}  // namespace fmt
-
-namespace std {
-// Standard permits specialization of std::numeric_limits. This specialization
-// is used to resolve ambiguity between isinf and std::isinf in glibc:
-// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48891
-// and the same for isnan and signbit.
-template <>
-class numeric_limits<fmt::internal::DummyInt> :
-    public std::numeric_limits<int> {
- public:
-  // Portable version of isinf.
-  template <typename T>
-  static bool isinfinity(T x) {
-    using namespace fmt::internal;
-    // The resolution "priority" is:
-    // isinf macro > std::isinf > ::isinf > fmt::internal::isinf
-    if (check(sizeof(isinf(x)) == sizeof(bool) ||
-              sizeof(isinf(x)) == sizeof(int))) {
-      return isinf(x) != 0;
-    }
-    return !_finite(static_cast<double>(x));
-  }
-
-  // Portable version of isnan.
-  template <typename T>
-  static bool isnotanumber(T x) {
-    using namespace fmt::internal;
-    if (check(sizeof(isnan(x)) == sizeof(bool) ||
-              sizeof(isnan(x)) == sizeof(int))) {
-      return isnan(x) != 0;
-    }
-    return _isnan(static_cast<double>(x)) != 0;
-  }
-
-  // Portable version of signbit.
-  static bool isnegative(double x) {
-    using namespace fmt::internal;
-    if (check(sizeof(signbit(x)) == sizeof(int)))
-      return signbit(x) != 0;
-    if (x < 0) return true;
-    if (!isnotanumber(x)) return false;
-    int dec = 0, sign = 0;
-    char buffer[2];  // The buffer size must be >= 2 or _ecvt_s will fail.
-    _ecvt_s(buffer, sizeof(buffer), x, 0, &dec, &sign);
-    return sign != 0;
-  }
-};
-}  // namespace std
-
-namespace fmt {
-
-// Fix the warning about long long on older versions of GCC
-// that don't support the diagnostic pragma.
-FMT_GCC_EXTENSION typedef long long LongLong;
-FMT_GCC_EXTENSION typedef unsigned long long ULongLong;
-
-#if FMT_USE_RVALUE_REFERENCES
-using std::move;
-#endif
-
-template <typename Char>
-class BasicWriter;
-
-typedef BasicWriter<char> Writer;
-typedef BasicWriter<wchar_t> WWriter;
-
-template <typename Char>
-class ArgFormatter;
-
-template <typename CharType,
-          typename ArgFormatter = fmt::ArgFormatter<CharType> >
-class BasicFormatter;
-
-/**
-  \rst
-  A string reference. It can be constructed from a C string or ``std::string``.
-
-  You can use one of the following typedefs for common character types:
-
-  +------------+-------------------------+
-  | Type       | Definition              |
-  +============+=========================+
-  | StringRef  | BasicStringRef<char>    |
-  +------------+-------------------------+
-  | WStringRef | BasicStringRef<wchar_t> |
-  +------------+-------------------------+
-
-  This class is most useful as a parameter type to allow passing
-  different types of strings to a function, for example::
-
-    template <typename... Args>
-    std::string format(StringRef format_str, const Args & ... args);
-
-    format("{}", 42);
-    format(std::string("{}"), 42);
-  \endrst
- */
-template <typename Char>
-class BasicStringRef {
- private:
-  const Char *data_;
-  std::size_t size_;
-
- public:
-  /** Constructs a string reference object from a C string and a size. */
-  BasicStringRef(const Char *s, std::size_t size) : data_(s), size_(size) {}
-
-  /**
-    \rst
-    Constructs a string reference object from a C string computing
-    the size with ``std::char_traits<Char>::length``.
-    \endrst
-   */
-  BasicStringRef(const Char *s)
-    : data_(s), size_(std::char_traits<Char>::length(s)) {}
-
-  /**
-    \rst
-    Constructs a string reference from an ``std::string`` object.
-    \endrst
-   */
-  BasicStringRef(const std::basic_string<Char> &s)
-  : data_(s.c_str()), size_(s.size()) {}
-
-  /**
-    \rst
-    Converts a string reference to an ``std::string`` object.
-    \endrst
-   */
-  std::basic_string<Char> to_string() const {
-    return std::basic_string<Char>(data_, size_);
-  }
-
-  /** Returns a pointer to the string data. */
-  const Char *data() const { return data_; }
-
-  /** Returns the string size. */
-  std::size_t size() const { return size_; }
-
-  // Lexicographically compare this string reference to other.
-  int compare(BasicStringRef other) const {
-    std::size_t size = size_ < other.size_ ? size_ : other.size_;
-    int result = std::char_traits<Char>::compare(data_, other.data_, size);
-    if (result == 0)
-      result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
-    return result;
-  }
-
-  friend bool operator==(BasicStringRef lhs, BasicStringRef rhs) {
-    return lhs.compare(rhs) == 0;
-  }
-  friend bool operator!=(BasicStringRef lhs, BasicStringRef rhs) {
-    return lhs.compare(rhs) != 0;
-  }
-  friend bool operator<(BasicStringRef lhs, BasicStringRef rhs) {
-    return lhs.compare(rhs) < 0;
-  }
-  friend bool operator<=(BasicStringRef lhs, BasicStringRef rhs) {
-    return lhs.compare(rhs) <= 0;
-  }
-  friend bool operator>(BasicStringRef lhs, BasicStringRef rhs) {
-    return lhs.compare(rhs) > 0;
-  }
-  friend bool operator>=(BasicStringRef lhs, BasicStringRef rhs) {
-    return lhs.compare(rhs) >= 0;
-  }
-};
-
-typedef BasicStringRef<char> StringRef;
-typedef BasicStringRef<wchar_t> WStringRef;
-
-/**
-  \rst
-  A reference to a null terminated string. It can be constructed from a C
-  string or ``std::string``.
-
-  You can use one of the following typedefs for common character types:
-
-  +-------------+--------------------------+
-  | Type        | Definition               |
-  +=============+==========================+
-  | CStringRef  | BasicCStringRef<char>    |
-  +-------------+--------------------------+
-  | WCStringRef | BasicCStringRef<wchar_t> |
-  +-------------+--------------------------+
-
-  This class is most useful as a parameter type to allow passing
-  different types of strings to a function, for example::
-
-    template <typename... Args>
-    std::string format(CStringRef format_str, const Args & ... args);
-
-    format("{}", 42);
-    format(std::string("{}"), 42);
-  \endrst
- */
-template <typename Char>
-class BasicCStringRef {
- private:
-  const Char *data_;
-
- public:
-  /** Constructs a string reference object from a C string. */
-  BasicCStringRef(const Char *s) : data_(s) {}
-
-  /**
-    \rst
-    Constructs a string reference from an ``std::string`` object.
-    \endrst
-   */
-  BasicCStringRef(const std::basic_string<Char> &s) : data_(s.c_str()) {}
-
-  /** Returns the pointer to a C string. */
-  const Char *c_str() const { return data_; }
-};
-
-typedef BasicCStringRef<char> CStringRef;
-typedef BasicCStringRef<wchar_t> WCStringRef;
-
-/**
-  A formatting error such as invalid format string.
-*/
-class FormatError : public std::runtime_error {
- public:
-  explicit FormatError(CStringRef message)
-  : std::runtime_error(message.c_str()) {}
-};
-
-namespace internal {
-
-// MakeUnsigned<T>::Type gives an unsigned type corresponding to integer type T.
-template <typename T>
-struct MakeUnsigned { typedef T Type; };
-
-#define FMT_SPECIALIZE_MAKE_UNSIGNED(T, U) \
-  template <> \
-  struct MakeUnsigned<T> { typedef U Type; }
-
-FMT_SPECIALIZE_MAKE_UNSIGNED(char, unsigned char);
-FMT_SPECIALIZE_MAKE_UNSIGNED(signed char, unsigned char);
-FMT_SPECIALIZE_MAKE_UNSIGNED(short, unsigned short);
-FMT_SPECIALIZE_MAKE_UNSIGNED(int, unsigned);
-FMT_SPECIALIZE_MAKE_UNSIGNED(long, unsigned long);
-FMT_SPECIALIZE_MAKE_UNSIGNED(LongLong, ULongLong);
-
-// Casts nonnegative integer to unsigned.
-template <typename Int>
-inline typename MakeUnsigned<Int>::Type to_unsigned(Int value) {
-  FMT_ASSERT(value >= 0, "negative value");
-  return static_cast<typename MakeUnsigned<Int>::Type>(value);
-}
-
-// The number of characters to store in the MemoryBuffer object itself
-// to avoid dynamic memory allocation.
-enum { INLINE_BUFFER_SIZE = 500 };
-
-#if FMT_SECURE_SCL
-// Use checked iterator to avoid warnings on MSVC.
-template <typename T>
-inline stdext::checked_array_iterator<T*> make_ptr(T *ptr, std::size_t size) {
-  return stdext::checked_array_iterator<T*>(ptr, size);
-}
-#else
-template <typename T>
-inline T *make_ptr(T *ptr, std::size_t) { return ptr; }
-#endif
-}  // namespace internal
-
-/**
-  \rst
-  A buffer supporting a subset of ``std::vector``'s operations.
-  \endrst
- */
-template <typename T>
-class Buffer {
- private:
-  FMT_DISALLOW_COPY_AND_ASSIGN(Buffer);
-
- protected:
-  T *ptr_;
-  std::size_t size_;
-  std::size_t capacity_;
-
-  Buffer(T *ptr = 0, std::size_t capacity = 0)
-    : ptr_(ptr), size_(0), capacity_(capacity) {}
-
-  /**
-    \rst
-    Increases the buffer capacity to hold at least *size* elements updating
-    ``ptr_`` and ``capacity_``.
-    \endrst
-   */
-  virtual void grow(std::size_t size) = 0;
-
- public:
-  virtual ~Buffer() {}
-
-  /** Returns the size of this buffer. */
-  std::size_t size() const { return size_; }
-
-  /** Returns the capacity of this buffer. */
-  std::size_t capacity() const { return capacity_; }
-
-  /**
-    Resizes the buffer. If T is a POD type new elements may not be initialized.
-   */
-  void resize(std::size_t new_size) {
-    if (new_size > capacity_)
-      grow(new_size);
-    size_ = new_size;
-  }
-
-  /**
-    \rst
-    Reserves space to store at least *capacity* elements.
-    \endrst
-   */
-  void reserve(std::size_t capacity) {
-    if (capacity > capacity_)
-      grow(capacity);
-  }
-
-  void clear() FMT_NOEXCEPT { size_ = 0; }
-
-  void push_back(const T &value) {
-    if (size_ == capacity_)
-      grow(size_ + 1);
-    ptr_[size_++] = value;
-  }
-
-  /** Appends data to the end of the buffer. */
-  template <typename U>
-  void append(const U *begin, const U *end);
-
-  T &operator[](std::size_t index) { return ptr_[index]; }
-  const T &operator[](std::size_t index) const { return ptr_[index]; }
-};
-
-template <typename T>
-template <typename U>
-void Buffer<T>::append(const U *begin, const U *end) {
-  std::size_t new_size = size_ + internal::to_unsigned(end - begin);
-  if (new_size > capacity_)
-    grow(new_size);
-  std::uninitialized_copy(begin, end,
-                          internal::make_ptr(ptr_, capacity_) + size_);
-  size_ = new_size;
-}
-
-namespace internal {
-
-// A memory buffer for trivially copyable/constructible types with the first SIZE
-// elements stored in the object itself.
-template <typename T, std::size_t SIZE, typename Allocator = std::allocator<T> >
-class MemoryBuffer : private Allocator, public Buffer<T> {
- private:
-  T data_[SIZE];
-
-  // Deallocate memory allocated by the buffer.
-  void deallocate() {
-    if (this->ptr_ != data_) Allocator::deallocate(this->ptr_, this->capacity_);
-  }
-
- protected:
-  void grow(std::size_t size);
-
- public:
-  explicit MemoryBuffer(const Allocator &alloc = Allocator())
-      : Allocator(alloc), Buffer<T>(data_, SIZE) {}
-  ~MemoryBuffer() { deallocate(); }
-
-#if FMT_USE_RVALUE_REFERENCES
- private:
-  // Move data from other to this buffer.
-  void move(MemoryBuffer &other) {
-    Allocator &this_alloc = *this, &other_alloc = other;
-    this_alloc = std::move(other_alloc);
-    this->size_ = other.size_;
-    this->capacity_ = other.capacity_;
-    if (other.ptr_ == other.data_) {
-      this->ptr_ = data_;
-      std::uninitialized_copy(other.data_, other.data_ + this->size_,
-                              make_ptr(data_, this->capacity_));
-    } else {
-      this->ptr_ = other.ptr_;
-      // Set pointer to the inline array so that delete is not called
-      // when deallocating.
-      other.ptr_ = other.data_;
-    }
-  }
-
- public:
-  MemoryBuffer(MemoryBuffer &&other) {
-    move(other);
-  }
-
-  MemoryBuffer &operator=(MemoryBuffer &&other) {
-    assert(this != &other);
-    deallocate();
-    move(other);
-    return *this;
-  }
-#endif
-
-  // Returns a copy of the allocator associated with this buffer.
-  Allocator get_allocator() const { return *this; }
-};
-
-template <typename T, std::size_t SIZE, typename Allocator>
-void MemoryBuffer<T, SIZE, Allocator>::grow(std::size_t size) {
-  std::size_t new_capacity = this->capacity_ + this->capacity_ / 2;
-  if (size > new_capacity)
-      new_capacity = size;
-  T *new_ptr = this->allocate(new_capacity);
-  // The following code doesn't throw, so the raw pointer above doesn't leak.
-  std::uninitialized_copy(this->ptr_, this->ptr_ + this->size_,
-                          make_ptr(new_ptr, new_capacity));
-  std::size_t old_capacity = this->capacity_;
-  T *old_ptr = this->ptr_;
-  this->capacity_ = new_capacity;
-  this->ptr_ = new_ptr;
-  // deallocate may throw (at least in principle), but it doesn't matter since
-  // the buffer already uses the new storage and will deallocate it in case
-  // of exception.
-  if (old_ptr != data_)
-    Allocator::deallocate(old_ptr, old_capacity);
-}
-
-// A fixed-size buffer.
-template <typename Char>
-class FixedBuffer : public fmt::Buffer<Char> {
- public:
-  FixedBuffer(Char *array, std::size_t size) : fmt::Buffer<Char>(array, size) {}
-
- protected:
-  FMT_API void grow(std::size_t size);
-};
-
-template <typename Char>
-class BasicCharTraits {
- public:
-#if FMT_SECURE_SCL
-  typedef stdext::checked_array_iterator<Char*> CharPtr;
-#else
-  typedef Char *CharPtr;
-#endif
-  static Char cast(int value) { return static_cast<Char>(value); }
-};
-
-template <typename Char>
-class CharTraits;
-
-template <>
-class CharTraits<char> : public BasicCharTraits<char> {
- private:
-  // Conversion from wchar_t to char is not allowed.
-  static char convert(wchar_t);
-
- public:
-  static char convert(char value) { return value; }
-
-  // Formats a floating-point number.
-  template <typename T>
-  FMT_API static int format_float(char *buffer, std::size_t size,
-      const char *format, unsigned width, int precision, T value);
-};
-
-template <>
-class CharTraits<wchar_t> : public BasicCharTraits<wchar_t> {
- public:
-  static wchar_t convert(char value) { return value; }
-  static wchar_t convert(wchar_t value) { return value; }
-
-  template <typename T>
-  FMT_API static int format_float(wchar_t *buffer, std::size_t size,
-      const wchar_t *format, unsigned width, int precision, T value);
-};
-
-// Checks if a number is negative - used to avoid warnings.
-template <bool IsSigned>
-struct SignChecker {
-  template <typename T>
-  static bool is_negative(T value) { return value < 0; }
-};
-
-template <>
-struct SignChecker<false> {
-  template <typename T>
-  static bool is_negative(T) { return false; }
-};
-
-// Returns true if value is negative, false otherwise.
-// Same as (value < 0) but doesn't produce warnings if T is an unsigned type.
-template <typename T>
-inline bool is_negative(T value) {
-  return SignChecker<std::numeric_limits<T>::is_signed>::is_negative(value);
-}
-
-// Selects uint32_t if FitsIn32Bits is true, uint64_t otherwise.
-template <bool FitsIn32Bits>
-struct TypeSelector { typedef uint32_t Type; };
-
-template <>
-struct TypeSelector<false> { typedef uint64_t Type; };
-
-template <typename T>
-struct IntTraits {
-  // Smallest of uint32_t and uint64_t that is large enough to represent
-  // all values of T.
-  typedef typename
-    TypeSelector<std::numeric_limits<T>::digits <= 32>::Type MainType;
-};
-
-FMT_API void report_unknown_type(char code, const char *type);
-
-// Static data is placed in this class template to allow header-only
-// configuration.
-template <typename T = void>
-struct FMT_API BasicData {
-  static const uint32_t POWERS_OF_10_32[];
-  static const uint64_t POWERS_OF_10_64[];
-  static const char DIGITS[];
-};
-
-typedef BasicData<> Data;
-
-#ifdef FMT_BUILTIN_CLZLL
-// Returns the number of decimal digits in n. Leading zeros are not counted
-// except for n == 0 in which case count_digits returns 1.
-inline unsigned count_digits(uint64_t n) {
-  // Based on http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
-  // and the benchmark https://github.com/localvoid/cxx-benchmark-count-digits.
-  int t = (64 - FMT_BUILTIN_CLZLL(n | 1)) * 1233 >> 12;
-  return to_unsigned(t) - (n < Data::POWERS_OF_10_64[t]) + 1;
-}
-#else
-// Fallback version of count_digits used when __builtin_clz is not available.
-inline unsigned count_digits(uint64_t n) {
-  unsigned count = 1;
-  for (;;) {
-    // Integer division is slow so do it for a group of four digits instead
-    // of for every digit. The idea comes from the talk by Alexandrescu
-    // "Three Optimization Tips for C++". See speed-test for a comparison.
-    if (n < 10) return count;
-    if (n < 100) return count + 1;
-    if (n < 1000) return count + 2;
-    if (n < 10000) return count + 3;
-    n /= 10000u;
-    count += 4;
-  }
-}
-#endif
-
-#ifdef FMT_BUILTIN_CLZ
-// Optional version of count_digits for better performance on 32-bit platforms.
-inline unsigned count_digits(uint32_t n) {
-  int t = (32 - FMT_BUILTIN_CLZ(n | 1)) * 1233 >> 12;
-  return to_unsigned(t) - (n < Data::POWERS_OF_10_32[t]) + 1;
-}
-#endif
-
-// A functor that doesn't add a thousands separator.
-struct NoThousandsSep {
-  template <typename Char>
-  void operator()(Char *) {}
-};
-
-// A functor that adds a thousands separator.
-class ThousandsSep {
- private:
-  fmt::StringRef sep_;
-
-  // Index of a decimal digit with the least significant digit having index 0.
-  unsigned digit_index_;
-
- public:
-  explicit ThousandsSep(fmt::StringRef sep) : sep_(sep), digit_index_(0) {}
-
-  template <typename Char>
-  void operator()(Char *&buffer) {
-    if (++digit_index_ % 3 != 0)
-      return;
-    buffer -= sep_.size();
-    std::uninitialized_copy(sep_.data(), sep_.data() + sep_.size(),
-                            internal::make_ptr(buffer, sep_.size()));
-  }
-};
-
-// Formats a decimal unsigned integer value writing into buffer.
-// thousands_sep is a functor that is called after writing each char to
-// add a thousands separator if necessary.
-template <typename UInt, typename Char, typename ThousandsSep>
-inline void format_decimal(Char *buffer, UInt value, unsigned num_digits,
-                           ThousandsSep thousands_sep) {
-  buffer += num_digits;
-  while (value >= 100) {
-    // Integer division is slow so do it for a group of two digits instead
-    // of for every digit. The idea comes from the talk by Alexandrescu
-    // "Three Optimization Tips for C++". See speed-test for a comparison.
-    unsigned index = static_cast<unsigned>((value % 100) * 2);
-    value /= 100;
-    *--buffer = Data::DIGITS[index + 1];
-    thousands_sep(buffer);
-    *--buffer = Data::DIGITS[index];
-    thousands_sep(buffer);
-  }
-  if (value < 10) {
-    *--buffer = static_cast<char>('0' + value);
-    return;
-  }
-  unsigned index = static_cast<unsigned>(value * 2);
-  *--buffer = Data::DIGITS[index + 1];
-  *--buffer = Data::DIGITS[index];
-}
-
-template <typename UInt, typename Char>
-inline void format_decimal(Char *buffer, UInt value, unsigned num_digits) {
-  return format_decimal(buffer, value, num_digits, NoThousandsSep());
-}
-
-#ifndef _WIN32
-# define FMT_USE_WINDOWS_H 0
-#elif !defined(FMT_USE_WINDOWS_H)
-# define FMT_USE_WINDOWS_H 1
-#endif
-
-// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
-// All the functionality that relies on it will be disabled too.
-#if FMT_USE_WINDOWS_H
-// A converter from UTF-8 to UTF-16.
-// It is only provided for Windows since other systems support UTF-8 natively.
-class UTF8ToUTF16 {
- private:
-  MemoryBuffer<wchar_t, INLINE_BUFFER_SIZE> buffer_;
-
- public:
-  FMT_API explicit UTF8ToUTF16(StringRef s);
-  operator WStringRef() const { return WStringRef(&buffer_[0], size()); }
-  size_t size() const { return buffer_.size() - 1; }
-  const wchar_t *c_str() const { return &buffer_[0]; }
-  std::wstring str() const { return std::wstring(&buffer_[0], size()); }
-};
-
-// A converter from UTF-16 to UTF-8.
-// It is only provided for Windows since other systems support UTF-8 natively.
-class UTF16ToUTF8 {
- private:
-  MemoryBuffer<char, INLINE_BUFFER_SIZE> buffer_;
-
- public:
-  UTF16ToUTF8() {}
-  FMT_API explicit UTF16ToUTF8(WStringRef s);
-  operator StringRef() const { return StringRef(&buffer_[0], size()); }
-  size_t size() const { return buffer_.size() - 1; }
-  const char *c_str() const { return &buffer_[0]; }
-  std::string str() const { return std::string(&buffer_[0], size()); }
-
-  // Performs conversion returning a system error code instead of
-  // throwing exception on conversion error. This method may still throw
-  // in case of memory allocation error.
-  FMT_API int convert(WStringRef s);
-};
-
-FMT_API void format_windows_error(fmt::Writer &out, int error_code,
-                                  fmt::StringRef message) FMT_NOEXCEPT;
-#endif
-
-FMT_API void format_system_error(fmt::Writer &out, int error_code,
-                                 fmt::StringRef message) FMT_NOEXCEPT;
-
-// A formatting argument value.
-struct Value {
-  template <typename Char>
-  struct StringValue {
-    const Char *value;
-    std::size_t size;
-  };
-
-  typedef void (*FormatFunc)(
-      void *formatter, const void *arg, void *format_str_ptr);
-
-  struct CustomValue {
-    const void *value;
-    FormatFunc format;
-  };
-
-  union {
-    int int_value;
-    unsigned uint_value;
-    LongLong long_long_value;
-    ULongLong ulong_long_value;
-    double double_value;
-    long double long_double_value;
-    const void *pointer;
-    StringValue<char> string;
-    StringValue<signed char> sstring;
-    StringValue<unsigned char> ustring;
-    StringValue<wchar_t> wstring;
-    CustomValue custom;
-  };
-
-  enum Type {
-    NONE, NAMED_ARG,
-    // Integer types should go first,
-    INT, UINT, LONG_LONG, ULONG_LONG, BOOL, CHAR, LAST_INTEGER_TYPE = CHAR,
-    // followed by floating-point types.
-    DOUBLE, LONG_DOUBLE, LAST_NUMERIC_TYPE = LONG_DOUBLE,
-    CSTRING, STRING, WSTRING, POINTER, CUSTOM
-  };
-};
-
-// A formatting argument. It is a trivially copyable/constructible type to
-// allow storage in internal::MemoryBuffer.
-struct Arg : Value {
-  Type type;
-};
-
-template <typename Char>
-struct NamedArg;
-
-template <typename T = void>
-struct Null {};
-
-// A helper class template to enable or disable overloads taking wide
-// characters and strings in MakeValue.
-template <typename T, typename Char>
-struct WCharHelper {
-  typedef Null<T> Supported;
-  typedef T Unsupported;
-};
-
-template <typename T>
-struct WCharHelper<T, wchar_t> {
-  typedef T Supported;
-  typedef Null<T> Unsupported;
-};
-
-typedef char Yes[1];
-typedef char No[2];
-
-template <typename T>
-T &get();
-
-// These are non-members to workaround an overload resolution bug in bcc32.
-Yes &convert(fmt::ULongLong);
-No &convert(...);
-
-template<typename T, bool ENABLE_CONVERSION>
-struct ConvertToIntImpl {
-  enum { value = ENABLE_CONVERSION };
-};
-
-template<typename T, bool ENABLE_CONVERSION>
-struct ConvertToIntImpl2 {
-  enum { value = false };
-};
-
-template<typename T>
-struct ConvertToIntImpl2<T, true> {
-  enum {
-    // Don't convert numeric types.
-    value = ConvertToIntImpl<T, !std::numeric_limits<T>::is_specialized>::value
-  };
-};
-
-template<typename T>
-struct ConvertToInt {
-  enum { enable_conversion = sizeof(convert(get<T>())) == sizeof(Yes) };
-  enum { value = ConvertToIntImpl2<T, enable_conversion>::value };
-};
-
-#define FMT_DISABLE_CONVERSION_TO_INT(Type) \
-  template <> \
-  struct ConvertToInt<Type> {  enum { value = 0 }; }
-
-// Silence warnings about convering float to int.
-FMT_DISABLE_CONVERSION_TO_INT(float);
-FMT_DISABLE_CONVERSION_TO_INT(double);
-FMT_DISABLE_CONVERSION_TO_INT(long double);
-
-template<bool B, class T = void>
-struct EnableIf {};
-
-template<class T>
-struct EnableIf<true, T> { typedef T type; };
-
-template<bool B, class T, class F>
-struct Conditional { typedef T type; };
-
-template<class T, class F>
-struct Conditional<false, T, F> { typedef F type; };
-
-// For bcc32 which doesn't understand ! in template arguments.
-template<bool>
-struct Not { enum { value = 0 }; };
-
-template<>
-struct Not<false> { enum { value = 1 }; };
-
-// Makes an Arg object from any type.
-template <typename Formatter>
-class MakeValue : public Arg {
- public:
-  typedef typename Formatter::Char Char;
-
- private:
-  // The following two methods are private to disallow formatting of
-  // arbitrary pointers. If you want to output a pointer cast it to
-  // "void *" or "const void *". In particular, this forbids formatting
-  // of "[const] volatile char *" which is printed as bool by iostreams.
-  // Do not implement!
-  template <typename T>
-  MakeValue(const T *value);
-  template <typename T>
-  MakeValue(T *value);
-
-  // The following methods are private to disallow formatting of wide
-  // characters and strings into narrow strings as in
-  //   fmt::format("{}", L"test");
-  // To fix this, use a wide format string: fmt::format(L"{}", L"test").
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
-  MakeValue(typename WCharHelper<wchar_t, Char>::Unsupported);
-#endif
-  MakeValue(typename WCharHelper<wchar_t *, Char>::Unsupported);
-  MakeValue(typename WCharHelper<const wchar_t *, Char>::Unsupported);
-  MakeValue(typename WCharHelper<const std::wstring &, Char>::Unsupported);
-  MakeValue(typename WCharHelper<WStringRef, Char>::Unsupported);
-
-  void set_string(StringRef str) {
-    string.value = str.data();
-    string.size = str.size();
-  }
-
-  void set_string(WStringRef str) {
-    wstring.value = str.data();
-    wstring.size = str.size();
-  }
-
-  // Formats an argument of a custom type, such as a user-defined class.
-  template <typename T>
-  static void format_custom_arg(
-      void *formatter, const void *arg, void *format_str_ptr) {
-    format(*static_cast<Formatter*>(formatter),
-           *static_cast<const Char**>(format_str_ptr),
-           *static_cast<const T*>(arg));
-  }
-
- public:
-  MakeValue() {}
-
-#define FMT_MAKE_VALUE_(Type, field, TYPE, rhs) \
-  MakeValue(Type value) { field = rhs; } \
-  static uint64_t type(Type) { return Arg::TYPE; }
-
-#define FMT_MAKE_VALUE(Type, field, TYPE) \
-  FMT_MAKE_VALUE_(Type, field, TYPE, value)
-
-  FMT_MAKE_VALUE(bool, int_value, BOOL)
-  FMT_MAKE_VALUE(short, int_value, INT)
-  FMT_MAKE_VALUE(unsigned short, uint_value, UINT)
-  FMT_MAKE_VALUE(int, int_value, INT)
-  FMT_MAKE_VALUE(unsigned, uint_value, UINT)
-
-  MakeValue(long value) {
-    // To minimize the number of types we need to deal with, long is
-    // translated either to int or to long long depending on its size.
-    if (check(sizeof(long) == sizeof(int)))
-      int_value = static_cast<int>(value);
-    else
-      long_long_value = value;
-  }
-  static uint64_t type(long) {
-    return sizeof(long) == sizeof(int) ? Arg::INT : Arg::LONG_LONG;
-  }
-
-  MakeValue(unsigned long value) {
-    if (check(sizeof(unsigned long) == sizeof(unsigned)))
-      uint_value = static_cast<unsigned>(value);
-    else
-      ulong_long_value = value;
-  }
-  static uint64_t type(unsigned long) {
-    return sizeof(unsigned long) == sizeof(unsigned) ?
-          Arg::UINT : Arg::ULONG_LONG;
-  }
-
-  FMT_MAKE_VALUE(LongLong, long_long_value, LONG_LONG)
-  FMT_MAKE_VALUE(ULongLong, ulong_long_value, ULONG_LONG)
-  FMT_MAKE_VALUE(float, double_value, DOUBLE)
-  FMT_MAKE_VALUE(double, double_value, DOUBLE)
-  FMT_MAKE_VALUE(long double, long_double_value, LONG_DOUBLE)
-  FMT_MAKE_VALUE(signed char, int_value, INT)
-  FMT_MAKE_VALUE(unsigned char, uint_value, UINT)
-  FMT_MAKE_VALUE(char, int_value, CHAR)
-
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
-  MakeValue(typename WCharHelper<wchar_t, Char>::Supported value) {
-    int_value = value;
-  }
-  static uint64_t type(wchar_t) { return Arg::CHAR; }
-#endif
-
-#define FMT_MAKE_STR_VALUE(Type, TYPE) \
-  MakeValue(Type value) { set_string(value); } \
-  static uint64_t type(Type) { return Arg::TYPE; }
-
-  FMT_MAKE_VALUE(char *, string.value, CSTRING)
-  FMT_MAKE_VALUE(const char *, string.value, CSTRING)
-  FMT_MAKE_VALUE(const signed char *, sstring.value, CSTRING)
-  FMT_MAKE_VALUE(const unsigned char *, ustring.value, CSTRING)
-  FMT_MAKE_STR_VALUE(const std::string &, STRING)
-  FMT_MAKE_STR_VALUE(StringRef, STRING)
-  FMT_MAKE_VALUE_(CStringRef, string.value, CSTRING, value.c_str())
-
-#define FMT_MAKE_WSTR_VALUE(Type, TYPE) \
-  MakeValue(typename WCharHelper<Type, Char>::Supported value) { \
-    set_string(value); \
-  } \
-  static uint64_t type(Type) { return Arg::TYPE; }
-
-  FMT_MAKE_WSTR_VALUE(wchar_t *, WSTRING)
-  FMT_MAKE_WSTR_VALUE(const wchar_t *, WSTRING)
-  FMT_MAKE_WSTR_VALUE(const std::wstring &, WSTRING)
-  FMT_MAKE_WSTR_VALUE(WStringRef, WSTRING)
-
-  FMT_MAKE_VALUE(void *, pointer, POINTER)
-  FMT_MAKE_VALUE(const void *, pointer, POINTER)
-
-  template <typename T>
-  MakeValue(const T &value,
-            typename EnableIf<Not<
-              ConvertToInt<T>::value>::value, int>::type = 0) {
-    custom.value = &value;
-    custom.format = &format_custom_arg<T>;
-  }
-
-  template <typename T>
-  MakeValue(const T &value,
-            typename EnableIf<ConvertToInt<T>::value, int>::type = 0) {
-    int_value = value;
-  }
-
-  template <typename T>
-  static uint64_t type(const T &) {
-    return ConvertToInt<T>::value ? Arg::INT : Arg::CUSTOM;
-  }
-
-  // Additional template param `Char_` is needed here because make_type always
-  // uses char.
-  template <typename Char_>
-  MakeValue(const NamedArg<Char_> &value) { pointer = &value; }
-
-  template <typename Char_>
-  static uint64_t type(const NamedArg<Char_> &) { return Arg::NAMED_ARG; }
-};
-
-template <typename Formatter>
-class MakeArg : public Arg {
-public:
-  MakeArg() {
-    type = Arg::NONE;
-  }
-  
-  template <typename T>
-  MakeArg(const T &value)
-  : Arg(MakeValue<Formatter>(value)) {
-    type = static_cast<Arg::Type>(MakeValue<Formatter>::type(value));
-  }
-};
-
-template <typename Char>
-struct NamedArg : Arg {
-  BasicStringRef<Char> name;
-
-  template <typename T>
-  NamedArg(BasicStringRef<Char> argname, const T &value)
-  : Arg(MakeArg< BasicFormatter<Char> >(value)), name(argname) {}
-};
-
-class RuntimeError : public std::runtime_error {
- protected:
-  RuntimeError() : std::runtime_error("") {}
-};
-
-template <typename Char>
-class PrintfArgFormatter;
-
-template <typename Char>
-class ArgMap;
-}  // namespace internal
-
-/** An argument list. */
-class ArgList {
- private:
-  // To reduce compiled code size per formatting function call, types of first
-  // MAX_PACKED_ARGS arguments are passed in the types_ field.
-  uint64_t types_;
-  union {
-    // If the number of arguments is less than MAX_PACKED_ARGS, the argument
-    // values are stored in values_, otherwise they are stored in args_.
-    // This is done to reduce compiled code size as storing larger objects
-    // may require more code (at least on x86-64) even if the same amount of
-    // data is actually copied to stack. It saves ~10% on the bloat test.
-    const internal::Value *values_;
-    const internal::Arg *args_;
-  };
-
-  internal::Arg::Type type(unsigned index) const {
-    unsigned shift = index * 4;
-    uint64_t mask = 0xf;
-    return static_cast<internal::Arg::Type>(
-          (types_ & (mask << shift)) >> shift);
-  }
-
-  template <typename Char>
-  friend class internal::ArgMap;
-
- public:
-  // Maximum number of arguments with packed types.
-  enum { MAX_PACKED_ARGS = 16 };
-
-  ArgList() : types_(0) {}
-
-  ArgList(ULongLong types, const internal::Value *values)
-  : types_(types), values_(values) {}
-  ArgList(ULongLong types, const internal::Arg *args)
-  : types_(types), args_(args) {}
-
-  /** Returns the argument at specified index. */
-  internal::Arg operator[](unsigned index) const {
-    using internal::Arg;
-    Arg arg;
-    bool use_values = type(MAX_PACKED_ARGS - 1) == Arg::NONE;
-    if (index < MAX_PACKED_ARGS) {
-      Arg::Type arg_type = type(index);
-      internal::Value &val = arg;
-      if (arg_type != Arg::NONE)
-        val = use_values ? values_[index] : args_[index];
-      arg.type = arg_type;
-      return arg;
-    }
-    if (use_values) {
-      // The index is greater than the number of arguments that can be stored
-      // in values, so return a "none" argument.
-      arg.type = Arg::NONE;
-      return arg;
-    }
-    for (unsigned i = MAX_PACKED_ARGS; i <= index; ++i) {
-      if (args_[i].type == Arg::NONE)
-        return args_[i];
-    }
-    return args_[index];
-  }
-};
-
-#define FMT_DISPATCH(call) static_cast<Impl*>(this)->call
-
-/**
-  \rst
-  An argument visitor based on the `curiously recurring template pattern
-  <http://en.wikipedia.org/wiki/Curiously_recurring_template_pattern>`_.
-
-  To use `~fmt::ArgVisitor` define a subclass that implements some or all of the
-  visit methods with the same signatures as the methods in `~fmt::ArgVisitor`,
-  for example, `~fmt::ArgVisitor::visit_int()`.
-  Pass the subclass as the *Impl* template parameter. Then calling
-  `~fmt::ArgVisitor::visit` for some argument will dispatch to a visit method
-  specific to the argument type. For example, if the argument type is
-  ``double`` then the `~fmt::ArgVisitor::visit_double()` method of a subclass
-  will be called. If the subclass doesn't contain a method with this signature,
-  then a corresponding method of `~fmt::ArgVisitor` will be called.
-
-  **Example**::
-
-    class MyArgVisitor : public fmt::ArgVisitor<MyArgVisitor, void> {
-     public:
-      void visit_int(int value) { fmt::print("{}", value); }
-      void visit_double(double value) { fmt::print("{}", value ); }
-    };
-  \endrst
- */
-template <typename Impl, typename Result>
-class ArgVisitor {
- private:
-  typedef internal::Arg Arg;
-
- public:
-  void report_unhandled_arg() {}
-
-  Result visit_unhandled_arg() {
-    FMT_DISPATCH(report_unhandled_arg());
-    return Result();
-  }
-
-  /** Visits an ``int`` argument. **/
-  Result visit_int(int value) {
-    return FMT_DISPATCH(visit_any_int(value));
-  }
-
-  /** Visits a ``long long`` argument. **/
-  Result visit_long_long(LongLong value) {
-    return FMT_DISPATCH(visit_any_int(value));
-  }
-
-  /** Visits an ``unsigned`` argument. **/
-  Result visit_uint(unsigned value) {
-    return FMT_DISPATCH(visit_any_int(value));
-  }
-
-  /** Visits an ``unsigned long long`` argument. **/
-  Result visit_ulong_long(ULongLong value) {
-    return FMT_DISPATCH(visit_any_int(value));
-  }
-
-  /** Visits a ``bool`` argument. **/
-  Result visit_bool(bool value) {
-    return FMT_DISPATCH(visit_any_int(value));
-  }
-
-  /** Visits a ``char`` or ``wchar_t`` argument. **/
-  Result visit_char(int value) {
-    return FMT_DISPATCH(visit_any_int(value));
-  }
-
-  /** Visits an argument of any integral type. **/
-  template <typename T>
-  Result visit_any_int(T) {
-    return FMT_DISPATCH(visit_unhandled_arg());
-  }
-
-  /** Visits a ``double`` argument. **/
-  Result visit_double(double value) {
-    return FMT_DISPATCH(visit_any_double(value));
-  }
-
-  /** Visits a ``long double`` argument. **/
-  Result visit_long_double(long double value) {
-    return FMT_DISPATCH(visit_any_double(value));
-  }
-
-  /** Visits a ``double`` or ``long double`` argument. **/
-  template <typename T>
-  Result visit_any_double(T) {
-    return FMT_DISPATCH(visit_unhandled_arg());
-  }
-
-  /** Visits a null-terminated C string (``const char *``) argument. **/
-  Result visit_cstring(const char *) {
-    return FMT_DISPATCH(visit_unhandled_arg());
-  }
-
-  /** Visits a string argument. **/
-  Result visit_string(Arg::StringValue<char>) {
-    return FMT_DISPATCH(visit_unhandled_arg());
-  }
-
-  /** Visits a wide string argument. **/
-  Result visit_wstring(Arg::StringValue<wchar_t>) {
-    return FMT_DISPATCH(visit_unhandled_arg());
-  }
-
-  /** Visits a pointer argument. **/
-  Result visit_pointer(const void *) {
-    return FMT_DISPATCH(visit_unhandled_arg());
-  }
-
-  /** Visits an argument of a custom (user-defined) type. **/
-  Result visit_custom(Arg::CustomValue) {
-    return FMT_DISPATCH(visit_unhandled_arg());
-  }
-
-  /**
-    \rst
-    Visits an argument dispatching to the appropriate visit method based on
-    the argument type. For example, if the argument type is ``double`` then
-    the `~fmt::ArgVisitor::visit_double()` method of the *Impl* class will be
-    called.
-    \endrst
-   */
-  Result visit(const Arg &arg) {
-    switch (arg.type) {
-    default:
-      FMT_ASSERT(false, "invalid argument type");
-      return Result();
-    case Arg::INT:
-      return FMT_DISPATCH(visit_int(arg.int_value));
-    case Arg::UINT:
-      return FMT_DISPATCH(visit_uint(arg.uint_value));
-    case Arg::LONG_LONG:
-      return FMT_DISPATCH(visit_long_long(arg.long_long_value));
-    case Arg::ULONG_LONG:
-      return FMT_DISPATCH(visit_ulong_long(arg.ulong_long_value));
-    case Arg::BOOL:
-      return FMT_DISPATCH(visit_bool(arg.int_value != 0));
-    case Arg::CHAR:
-      return FMT_DISPATCH(visit_char(arg.int_value));
-    case Arg::DOUBLE:
-      return FMT_DISPATCH(visit_double(arg.double_value));
-    case Arg::LONG_DOUBLE:
-      return FMT_DISPATCH(visit_long_double(arg.long_double_value));
-    case Arg::CSTRING:
-      return FMT_DISPATCH(visit_cstring(arg.string.value));
-    case Arg::STRING:
-      return FMT_DISPATCH(visit_string(arg.string));
-    case Arg::WSTRING:
-      return FMT_DISPATCH(visit_wstring(arg.wstring));
-    case Arg::POINTER:
-      return FMT_DISPATCH(visit_pointer(arg.pointer));
-    case Arg::CUSTOM:
-      return FMT_DISPATCH(visit_custom(arg.custom));
-    }
-  }
-};
-
-enum Alignment {
-  ALIGN_DEFAULT, ALIGN_LEFT, ALIGN_RIGHT, ALIGN_CENTER, ALIGN_NUMERIC
-};
-
-// Flags.
-enum {
-  SIGN_FLAG = 1, PLUS_FLAG = 2, MINUS_FLAG = 4, HASH_FLAG = 8,
-  CHAR_FLAG = 0x10  // Argument has char type - used in error reporting.
-};
-
-// An empty format specifier.
-struct EmptySpec {};
-
-// A type specifier.
-template <char TYPE>
-struct TypeSpec : EmptySpec {
-  Alignment align() const { return ALIGN_DEFAULT; }
-  unsigned width() const { return 0; }
-  int precision() const { return -1; }
-  bool flag(unsigned) const { return false; }
-  char type() const { return TYPE; }
-  char fill() const { return ' '; }
-};
-
-// A width specifier.
-struct WidthSpec {
-  unsigned width_;
-  // Fill is always wchar_t and cast to char if necessary to avoid having
-  // two specialization of WidthSpec and its subclasses.
-  wchar_t fill_;
-
-  WidthSpec(unsigned width, wchar_t fill) : width_(width), fill_(fill) {}
-
-  unsigned width() const { return width_; }
-  wchar_t fill() const { return fill_; }
-};
-
-// An alignment specifier.
-struct AlignSpec : WidthSpec {
-  Alignment align_;
-
-  AlignSpec(unsigned width, wchar_t fill, Alignment align = ALIGN_DEFAULT)
-  : WidthSpec(width, fill), align_(align) {}
-
-  Alignment align() const { return align_; }
-
-  int precision() const { return -1; }
-};
-
-// An alignment and type specifier.
-template <char TYPE>
-struct AlignTypeSpec : AlignSpec {
-  AlignTypeSpec(unsigned width, wchar_t fill) : AlignSpec(width, fill) {}
-
-  bool flag(unsigned) const { return false; }
-  char type() const { return TYPE; }
-};
-
-// A full format specifier.
-struct FormatSpec : AlignSpec {
-  unsigned flags_;
-  int precision_;
-  char type_;
-
-  FormatSpec(
-    unsigned width = 0, char type = 0, wchar_t fill = ' ')
-  : AlignSpec(width, fill), flags_(0), precision_(-1), type_(type) {}
-
-  bool flag(unsigned f) const { return (flags_ & f) != 0; }
-  int precision() const { return precision_; }
-  char type() const { return type_; }
-};
-
-// An integer format specifier.
-template <typename T, typename SpecT = TypeSpec<0>, typename Char = char>
-class IntFormatSpec : public SpecT {
- private:
-  T value_;
-
- public:
-  IntFormatSpec(T val, const SpecT &spec = SpecT())
-  : SpecT(spec), value_(val) {}
-
-  T value() const { return value_; }
-};
-
-// A string format specifier.
-template <typename Char>
-class StrFormatSpec : public AlignSpec {
- private:
-  const Char *str_;
-
- public:
-  template <typename FillChar>
-  StrFormatSpec(const Char *str, unsigned width, FillChar fill)
-  : AlignSpec(width, fill), str_(str) {
-    internal::CharTraits<Char>::convert(FillChar());
-  }
-
-  const Char *str() const { return str_; }
-};
-
-/**
-  Returns an integer format specifier to format the value in base 2.
- */
-IntFormatSpec<int, TypeSpec<'b'> > bin(int value);
-
-/**
-  Returns an integer format specifier to format the value in base 8.
- */
-IntFormatSpec<int, TypeSpec<'o'> > oct(int value);
-
-/**
-  Returns an integer format specifier to format the value in base 16 using
-  lower-case letters for the digits above 9.
- */
-IntFormatSpec<int, TypeSpec<'x'> > hex(int value);
-
-/**
-  Returns an integer formatter format specifier to format in base 16 using
-  upper-case letters for the digits above 9.
- */
-IntFormatSpec<int, TypeSpec<'X'> > hexu(int value);
-
-/**
-  \rst
-  Returns an integer format specifier to pad the formatted argument with the
-  fill character to the specified width using the default (right) numeric
-  alignment.
-
-  **Example**::
-
-    MemoryWriter out;
-    out << pad(hex(0xcafe), 8, '0');
-    // out.str() == "0000cafe"
-
-  \endrst
- */
-template <char TYPE_CODE, typename Char>
-IntFormatSpec<int, AlignTypeSpec<TYPE_CODE>, Char> pad(
-    int value, unsigned width, Char fill = ' ');
-
-#define FMT_DEFINE_INT_FORMATTERS(TYPE) \
-inline IntFormatSpec<TYPE, TypeSpec<'b'> > bin(TYPE value) { \
-  return IntFormatSpec<TYPE, TypeSpec<'b'> >(value, TypeSpec<'b'>()); \
-} \
- \
-inline IntFormatSpec<TYPE, TypeSpec<'o'> > oct(TYPE value) { \
-  return IntFormatSpec<TYPE, TypeSpec<'o'> >(value, TypeSpec<'o'>()); \
-} \
- \
-inline IntFormatSpec<TYPE, TypeSpec<'x'> > hex(TYPE value) { \
-  return IntFormatSpec<TYPE, TypeSpec<'x'> >(value, TypeSpec<'x'>()); \
-} \
- \
-inline IntFormatSpec<TYPE, TypeSpec<'X'> > hexu(TYPE value) { \
-  return IntFormatSpec<TYPE, TypeSpec<'X'> >(value, TypeSpec<'X'>()); \
-} \
- \
-template <char TYPE_CODE> \
-inline IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE> > pad( \
-    IntFormatSpec<TYPE, TypeSpec<TYPE_CODE> > f, unsigned width) { \
-  return IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE> >( \
-      f.value(), AlignTypeSpec<TYPE_CODE>(width, ' ')); \
-} \
- \
-/* For compatibility with older compilers we provide two overloads for pad, */ \
-/* one that takes a fill character and one that doesn't. In the future this */ \
-/* can be replaced with one overload making the template argument Char      */ \
-/* default to char (C++11). */ \
-template <char TYPE_CODE, typename Char> \
-inline IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE>, Char> pad( \
-    IntFormatSpec<TYPE, TypeSpec<TYPE_CODE>, Char> f, \
-    unsigned width, Char fill) { \
-  return IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE>, Char>( \
-      f.value(), AlignTypeSpec<TYPE_CODE>(width, fill)); \
-} \
- \
-inline IntFormatSpec<TYPE, AlignTypeSpec<0> > pad( \
-    TYPE value, unsigned width) { \
-  return IntFormatSpec<TYPE, AlignTypeSpec<0> >( \
-      value, AlignTypeSpec<0>(width, ' ')); \
-} \
- \
-template <typename Char> \
-inline IntFormatSpec<TYPE, AlignTypeSpec<0>, Char> pad( \
-   TYPE value, unsigned width, Char fill) { \
- return IntFormatSpec<TYPE, AlignTypeSpec<0>, Char>( \
-     value, AlignTypeSpec<0>(width, fill)); \
-}
-
-FMT_DEFINE_INT_FORMATTERS(int)
-FMT_DEFINE_INT_FORMATTERS(long)
-FMT_DEFINE_INT_FORMATTERS(unsigned)
-FMT_DEFINE_INT_FORMATTERS(unsigned long)
-FMT_DEFINE_INT_FORMATTERS(LongLong)
-FMT_DEFINE_INT_FORMATTERS(ULongLong)
-
-/**
-  \rst
-  Returns a string formatter that pads the formatted argument with the fill
-  character to the specified width using the default (left) string alignment.
-
-  **Example**::
-
-    std::string s = str(MemoryWriter() << pad("abc", 8));
-    // s == "abc     "
-
-  \endrst
- */
-template <typename Char>
-inline StrFormatSpec<Char> pad(
-    const Char *str, unsigned width, Char fill = ' ') {
-  return StrFormatSpec<Char>(str, width, fill);
-}
-
-inline StrFormatSpec<wchar_t> pad(
-    const wchar_t *str, unsigned width, char fill = ' ') {
-  return StrFormatSpec<wchar_t>(str, width, fill);
-}
-
-namespace internal {
-
-template <typename Char>
-class ArgMap {
- private:
-  typedef std::vector<
-    std::pair<fmt::BasicStringRef<Char>, internal::Arg> > MapType;
-  typedef typename MapType::value_type Pair;
-
-  MapType map_;
-
- public:
-  FMT_API void init(const ArgList &args);
-
-  const internal::Arg* find(const fmt::BasicStringRef<Char> &name) const {
-    // The list is unsorted, so just return the first matching name.
-    for (typename MapType::const_iterator it = map_.begin(), end = map_.end();
-         it != end; ++it) {
-      if (it->first == name)
-        return &it->second;
-    }
-    return 0;
-  }
-};
-
-template <typename Impl, typename Char>
-class ArgFormatterBase : public ArgVisitor<Impl, void> {
- private:
-  BasicWriter<Char> &writer_;
-  FormatSpec &spec_;
-
-  FMT_DISALLOW_COPY_AND_ASSIGN(ArgFormatterBase);
-
-  void write_pointer(const void *p) {
-    spec_.flags_ = HASH_FLAG;
-    spec_.type_ = 'x';
-    writer_.write_int(reinterpret_cast<uintptr_t>(p), spec_);
-  }
-
- protected:
-  BasicWriter<Char> &writer() { return writer_; }
-  FormatSpec &spec() { return spec_; }
-
-  void write(bool value) {
-    const char *str_value = value ? "true" : "false";
-    Arg::StringValue<char> str = { str_value, std::strlen(str_value) };
-    writer_.write_str(str, spec_);
-  }
-
-  void write(const char *value) {
-    Arg::StringValue<char> str = {value, value != 0 ? std::strlen(value) : 0};
-    writer_.write_str(str, spec_);
-  }
-
- public:
-  ArgFormatterBase(BasicWriter<Char> &w, FormatSpec &s)
-  : writer_(w), spec_(s) {}
-
-  template <typename T>
-  void visit_any_int(T value) { writer_.write_int(value, spec_); }
-
-  template <typename T>
-  void visit_any_double(T value) { writer_.write_double(value, spec_); }
-
-  void visit_bool(bool value) {
-    if (spec_.type_)
-      return visit_any_int(value);
-    write(value);
-  }
-
-  void visit_char(int value) {
-    if (spec_.type_ && spec_.type_ != 'c') {
-      spec_.flags_ |= CHAR_FLAG;
-      writer_.write_int(value, spec_);
-      return;
-    }
-    if (spec_.align_ == ALIGN_NUMERIC || spec_.flags_ != 0)
-      FMT_THROW(FormatError("invalid format specifier for char"));
-    typedef typename BasicWriter<Char>::CharPtr CharPtr;
-    Char fill = internal::CharTraits<Char>::cast(spec_.fill());
-    CharPtr out = CharPtr();
-    const unsigned CHAR_WIDTH = 1;
-    if (spec_.width_ > CHAR_WIDTH) {
-      out = writer_.grow_buffer(spec_.width_);
-      if (spec_.align_ == ALIGN_RIGHT) {
-        std::uninitialized_fill_n(out, spec_.width_ - CHAR_WIDTH, fill);
-        out += spec_.width_ - CHAR_WIDTH;
-      } else if (spec_.align_ == ALIGN_CENTER) {
-        out = writer_.fill_padding(out, spec_.width_,
-                                   internal::check(CHAR_WIDTH), fill);
-      } else {
-        std::uninitialized_fill_n(out + CHAR_WIDTH,
-                                  spec_.width_ - CHAR_WIDTH, fill);
-      }
-    } else {
-      out = writer_.grow_buffer(CHAR_WIDTH);
-    }
-    *out = internal::CharTraits<Char>::cast(value);
-  }
-
-  void visit_cstring(const char *value) {
-    if (spec_.type_ == 'p')
-      return write_pointer(value);
-    write(value);
-  }
-
-  void visit_string(Arg::StringValue<char> value) {
-    writer_.write_str(value, spec_);
-  }
-
-  using ArgVisitor<Impl, void>::visit_wstring;
-
-  void visit_wstring(Arg::StringValue<Char> value) {
-    writer_.write_str(value, spec_);
-  }
-
-  void visit_pointer(const void *value) {
-    if (spec_.type_ && spec_.type_ != 'p')
-      report_unknown_type(spec_.type_, "pointer");
-    write_pointer(value);
-  }
-};
-
-class FormatterBase {
- private:
-  ArgList args_;
-  int next_arg_index_;
-
-  // Returns the argument with specified index.
-  FMT_API Arg do_get_arg(unsigned arg_index, const char *&error);
-
- protected:
-  const ArgList &args() const { return args_; }
-
-  explicit FormatterBase(const ArgList &args) {
-    args_ = args;
-    next_arg_index_ = 0;
-  }
-
-  // Returns the next argument.
-  Arg next_arg(const char *&error) {
-    if (next_arg_index_ >= 0)
-      return do_get_arg(internal::to_unsigned(next_arg_index_++), error);
-    error = "cannot switch from manual to automatic argument indexing";
-    return Arg();
-  }
-
-  // Checks if manual indexing is used and returns the argument with
-  // specified index.
-  Arg get_arg(unsigned arg_index, const char *&error) {
-    return check_no_auto_index(error) ? do_get_arg(arg_index, error) : Arg();
-  }
-
-  bool check_no_auto_index(const char *&error) {
-    if (next_arg_index_ > 0) {
-      error = "cannot switch from automatic to manual argument indexing";
-      return false;
-    }
-    next_arg_index_ = -1;
-    return true;
-  }
-
-  template <typename Char>
-  void write(BasicWriter<Char> &w, const Char *start, const Char *end) {
-    if (start != end)
-      w << BasicStringRef<Char>(start, internal::to_unsigned(end - start));
-  }
-};
-
-// A printf formatter.
-template <typename Char>
-class PrintfFormatter : private FormatterBase {
- private:
-  void parse_flags(FormatSpec &spec, const Char *&s);
-
-  // Returns the argument with specified index or, if arg_index is equal
-  // to the maximum unsigned value, the next argument.
-  Arg get_arg(const Char *s,
-      unsigned arg_index = (std::numeric_limits<unsigned>::max)());
-
-  // Parses argument index, flags and width and returns the argument index.
-  unsigned parse_header(const Char *&s, FormatSpec &spec);
-
- public:
-  explicit PrintfFormatter(const ArgList &args) : FormatterBase(args) {}
-  FMT_API void format(BasicWriter<Char> &writer,
-                      BasicCStringRef<Char> format_str);
-};
-}  // namespace internal
-
-/**
-  \rst
-  An argument formatter based on the `curiously recurring template pattern
-  <http://en.wikipedia.org/wiki/Curiously_recurring_template_pattern>`_.
-
-  To use `~fmt::BasicArgFormatter` define a subclass that implements some or
-  all of the visit methods with the same signatures as the methods in
-  `~fmt::ArgVisitor`, for example, `~fmt::ArgVisitor::visit_int()`.
-  Pass the subclass as the *Impl* template parameter. When a formatting
-  function processes an argument, it will dispatch to a visit method
-  specific to the argument type. For example, if the argument type is
-  ``double`` then the `~fmt::ArgVisitor::visit_double()` method of a subclass
-  will be called. If the subclass doesn't contain a method with this signature,
-  then a corresponding method of `~fmt::BasicArgFormatter` or its superclass
-  will be called.
-  \endrst
- */
-template <typename Impl, typename Char>
-class BasicArgFormatter : public internal::ArgFormatterBase<Impl, Char> {
- private:
-  BasicFormatter<Char, Impl> &formatter_;
-  const Char *format_;
-
- public:
-  /**
-    \rst
-    Constructs an argument formatter object.
-    *formatter* is a reference to the main formatter object, *spec* contains
-    format specifier information for standard argument types, and *fmt* points
-    to the part of the format string being parsed for custom argument types.
-    \endrst
-   */
-  BasicArgFormatter(BasicFormatter<Char, Impl> &formatter,
-                    FormatSpec &spec, const Char *fmt)
-  : internal::ArgFormatterBase<Impl, Char>(formatter.writer(), spec),
-    formatter_(formatter), format_(fmt) {}
-
-  /** Formats argument of a custom (user-defined) type. */
-  void visit_custom(internal::Arg::CustomValue c) {
-    c.format(&formatter_, c.value, &format_);
-  }
-};
-
-/** The default argument formatter. */
-template <typename Char>
-class ArgFormatter : public BasicArgFormatter<ArgFormatter<Char>, Char> {
- public:
-  /** Constructs an argument formatter object. */
-  ArgFormatter(BasicFormatter<Char> &formatter,
-               FormatSpec &spec, const Char *fmt)
-  : BasicArgFormatter<ArgFormatter<Char>, Char>(formatter, spec, fmt) {}
-};
-
-/** This template formats data and writes the output to a writer. */
-template <typename CharType, typename ArgFormatter>
-class BasicFormatter : private internal::FormatterBase {
- public:
-  /** The character type for the output. */
-  typedef CharType Char;
-
- private:
-  BasicWriter<Char> &writer_;
-  internal::ArgMap<Char> map_;
-
-  FMT_DISALLOW_COPY_AND_ASSIGN(BasicFormatter);
-
-  using internal::FormatterBase::get_arg;
-
-  // Checks if manual indexing is used and returns the argument with
-  // specified name.
-  internal::Arg get_arg(BasicStringRef<Char> arg_name, const char *&error);
-
-  // Parses argument index and returns corresponding argument.
-  internal::Arg parse_arg_index(const Char *&s);
-
-  // Parses argument name and returns corresponding argument.
-  internal::Arg parse_arg_name(const Char *&s);
-
- public:
-  /**
-   \rst
-   Constructs a ``BasicFormatter`` object. References to the arguments and
-   the writer are stored in the formatter object so make sure they have
-   appropriate lifetimes.
-   \endrst
-   */
-  BasicFormatter(const ArgList &args, BasicWriter<Char> &w)
-    : internal::FormatterBase(args), writer_(w) {}
-
-  /** Returns a reference to the writer associated with this formatter. */
-  BasicWriter<Char> &writer() { return writer_; }
-
-  /** Formats stored arguments and writes the output to the writer. */
-  void format(BasicCStringRef<Char> format_str);
-
-  // Formats a single argument and advances format_str, a format string pointer.
-  const Char *format(const Char *&format_str, const internal::Arg &arg);
-};
-
-// Generates a comma-separated list with results of applying f to
-// numbers 0..n-1.
-# define FMT_GEN(n, f) FMT_GEN##n(f)
-# define FMT_GEN1(f)  f(0)
-# define FMT_GEN2(f)  FMT_GEN1(f),  f(1)
-# define FMT_GEN3(f)  FMT_GEN2(f),  f(2)
-# define FMT_GEN4(f)  FMT_GEN3(f),  f(3)
-# define FMT_GEN5(f)  FMT_GEN4(f),  f(4)
-# define FMT_GEN6(f)  FMT_GEN5(f),  f(5)
-# define FMT_GEN7(f)  FMT_GEN6(f),  f(6)
-# define FMT_GEN8(f)  FMT_GEN7(f),  f(7)
-# define FMT_GEN9(f)  FMT_GEN8(f),  f(8)
-# define FMT_GEN10(f) FMT_GEN9(f),  f(9)
-# define FMT_GEN11(f) FMT_GEN10(f), f(10)
-# define FMT_GEN12(f) FMT_GEN11(f), f(11)
-# define FMT_GEN13(f) FMT_GEN12(f), f(12)
-# define FMT_GEN14(f) FMT_GEN13(f), f(13)
-# define FMT_GEN15(f) FMT_GEN14(f), f(14)
-
-namespace internal {
-inline uint64_t make_type() { return 0; }
-
-template <typename T>
-inline uint64_t make_type(const T &arg) {
-  return MakeValue< BasicFormatter<char> >::type(arg);
-}
-
-template <unsigned N, bool/*IsPacked*/= (N < ArgList::MAX_PACKED_ARGS)>
-struct ArgArray;
-
-template <unsigned N>
-struct ArgArray<N, true/*IsPacked*/> {
-  typedef Value Type[N > 0 ? N : 1];
-  
-  template <typename Formatter, typename T>
-  static Value make(const T &value) {
-#ifdef __clang__
-    Value result = MakeValue<Formatter>(value);
-    // Workaround a bug in Apple LLVM version 4.2 (clang-425.0.28) of clang:
-    // https://github.com/fmtlib/fmt/issues/276
-    (void)result.custom.format;
-    return result;
-#else
-    return MakeValue<Formatter>(value);
-#endif
-  }
-};
-
-template <unsigned N>
-struct ArgArray<N, false/*IsPacked*/> {
-  typedef Arg Type[N + 1]; // +1 for the list end Arg::NONE
-
-  template <typename Formatter, typename T>
-  static Arg make(const T &value) { return MakeArg<Formatter>(value); }
-};
-
-#if FMT_USE_VARIADIC_TEMPLATES
-template <typename Arg, typename... Args>
-inline uint64_t make_type(const Arg &first, const Args & ... tail) {
-  return make_type(first) | (make_type(tail...) << 4);
-}
-
-#else
-
-struct ArgType {
-  uint64_t type;
-
-  ArgType() : type(0) {}
-
-  template <typename T>
-  ArgType(const T &arg) : type(make_type(arg)) {}
-};
-
-# define FMT_ARG_TYPE_DEFAULT(n) ArgType t##n = ArgType()
-
-inline uint64_t make_type(FMT_GEN15(FMT_ARG_TYPE_DEFAULT)) {
-  return t0.type | (t1.type << 4) | (t2.type << 8) | (t3.type << 12) |
-      (t4.type << 16) | (t5.type << 20) | (t6.type << 24) | (t7.type << 28) |
-      (t8.type << 32) | (t9.type << 36) | (t10.type << 40) | (t11.type << 44) |
-      (t12.type << 48) | (t13.type << 52) | (t14.type << 56);
-}
-#endif
-}  // namespace internal
-
-# define FMT_MAKE_TEMPLATE_ARG(n) typename T##n
-# define FMT_MAKE_ARG_TYPE(n) T##n
-# define FMT_MAKE_ARG(n) const T##n &v##n
-# define FMT_ASSIGN_char(n) \
-  arr[n] = fmt::internal::MakeValue< fmt::BasicFormatter<char> >(v##n)
-# define FMT_ASSIGN_wchar_t(n) \
-  arr[n] = fmt::internal::MakeValue< fmt::BasicFormatter<wchar_t> >(v##n)
-
-#if FMT_USE_VARIADIC_TEMPLATES
-// Defines a variadic function returning void.
-# define FMT_VARIADIC_VOID(func, arg_type) \
-  template <typename... Args> \
-  void func(arg_type arg0, const Args & ... args) { \
-    typedef fmt::internal::ArgArray<sizeof...(Args)> ArgArray; \
-    typename ArgArray::Type array{ \
-      ArgArray::template make<fmt::BasicFormatter<Char> >(args)...}; \
-    func(arg0, fmt::ArgList(fmt::internal::make_type(args...), array)); \
-  }
-
-// Defines a variadic constructor.
-# define FMT_VARIADIC_CTOR(ctor, func, arg0_type, arg1_type) \
-  template <typename... Args> \
-  ctor(arg0_type arg0, arg1_type arg1, const Args & ... args) { \
-    typedef fmt::internal::ArgArray<sizeof...(Args)> ArgArray; \
-    typename ArgArray::Type array{ \
-      ArgArray::template make<fmt::BasicFormatter<Char> >(args)...}; \
-    func(arg0, arg1, fmt::ArgList(fmt::internal::make_type(args...), array)); \
-  }
-
-#else
-
-# define FMT_MAKE_REF(n) \
-  fmt::internal::MakeValue< fmt::BasicFormatter<Char> >(v##n)
-# define FMT_MAKE_REF2(n) v##n
-
-// Defines a wrapper for a function taking one argument of type arg_type
-// and n additional arguments of arbitrary types.
-# define FMT_WRAP1(func, arg_type, n) \
-  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
-  inline void func(arg_type arg1, FMT_GEN(n, FMT_MAKE_ARG)) { \
-    const fmt::internal::ArgArray<n>::Type array = {FMT_GEN(n, FMT_MAKE_REF)}; \
-    func(arg1, fmt::ArgList( \
-      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), array)); \
-  }
-
-// Emulates a variadic function returning void on a pre-C++11 compiler.
-# define FMT_VARIADIC_VOID(func, arg_type) \
-  inline void func(arg_type arg) { func(arg, fmt::ArgList()); } \
-  FMT_WRAP1(func, arg_type, 1) FMT_WRAP1(func, arg_type, 2) \
-  FMT_WRAP1(func, arg_type, 3) FMT_WRAP1(func, arg_type, 4) \
-  FMT_WRAP1(func, arg_type, 5) FMT_WRAP1(func, arg_type, 6) \
-  FMT_WRAP1(func, arg_type, 7) FMT_WRAP1(func, arg_type, 8) \
-  FMT_WRAP1(func, arg_type, 9) FMT_WRAP1(func, arg_type, 10)
-
-# define FMT_CTOR(ctor, func, arg0_type, arg1_type, n) \
-  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
-  ctor(arg0_type arg0, arg1_type arg1, FMT_GEN(n, FMT_MAKE_ARG)) { \
-    const fmt::internal::ArgArray<n>::Type array = {FMT_GEN(n, FMT_MAKE_REF)}; \
-    func(arg0, arg1, fmt::ArgList( \
-      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), array)); \
-  }
-
-// Emulates a variadic constructor on a pre-C++11 compiler.
-# define FMT_VARIADIC_CTOR(ctor, func, arg0_type, arg1_type) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 1) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 2) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 3) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 4) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 5) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 6) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 7) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 8) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 9) \
-  FMT_CTOR(ctor, func, arg0_type, arg1_type, 10)
-#endif
-
-// Generates a comma-separated list with results of applying f to pairs
-// (argument, index).
-#define FMT_FOR_EACH1(f, x0) f(x0, 0)
-#define FMT_FOR_EACH2(f, x0, x1) \
-  FMT_FOR_EACH1(f, x0), f(x1, 1)
-#define FMT_FOR_EACH3(f, x0, x1, x2) \
-  FMT_FOR_EACH2(f, x0 ,x1), f(x2, 2)
-#define FMT_FOR_EACH4(f, x0, x1, x2, x3) \
-  FMT_FOR_EACH3(f, x0, x1, x2), f(x3, 3)
-#define FMT_FOR_EACH5(f, x0, x1, x2, x3, x4) \
-  FMT_FOR_EACH4(f, x0, x1, x2, x3), f(x4, 4)
-#define FMT_FOR_EACH6(f, x0, x1, x2, x3, x4, x5) \
-  FMT_FOR_EACH5(f, x0, x1, x2, x3, x4), f(x5, 5)
-#define FMT_FOR_EACH7(f, x0, x1, x2, x3, x4, x5, x6) \
-  FMT_FOR_EACH6(f, x0, x1, x2, x3, x4, x5), f(x6, 6)
-#define FMT_FOR_EACH8(f, x0, x1, x2, x3, x4, x5, x6, x7) \
-  FMT_FOR_EACH7(f, x0, x1, x2, x3, x4, x5, x6), f(x7, 7)
-#define FMT_FOR_EACH9(f, x0, x1, x2, x3, x4, x5, x6, x7, x8) \
-  FMT_FOR_EACH8(f, x0, x1, x2, x3, x4, x5, x6, x7), f(x8, 8)
-#define FMT_FOR_EACH10(f, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9) \
-  FMT_FOR_EACH9(f, x0, x1, x2, x3, x4, x5, x6, x7, x8), f(x9, 9)
-
-/**
- An error returned by an operating system or a language runtime,
- for example a file opening error.
-*/
-class SystemError : public internal::RuntimeError {
- private:
-  void init(int err_code, CStringRef format_str, ArgList args);
-
- protected:
-  int error_code_;
-
-  typedef char Char;  // For FMT_VARIADIC_CTOR.
-
-  SystemError() {}
-
- public:
-  /**
-   \rst
-   Constructs a :class:`fmt::SystemError` object with the description
-   of the form
-
-   .. parsed-literal::
-     *<message>*: *<system-message>*
-
-   where *<message>* is the formatted message and *<system-message>* is
-   the system message corresponding to the error code.
-   *error_code* is a system error code as given by ``errno``.
-   If *error_code* is not a valid error code such as -1, the system message
-   may look like "Unknown error -1" and is platform-dependent.
-
-   **Example**::
-
-     // This throws a SystemError with the description
-     //   cannot open file 'madeup': No such file or directory
-     // or similar (system message may vary).
-     const char *filename = "madeup";
-     std::FILE *file = std::fopen(filename, "r");
-     if (!file)
-       throw fmt::SystemError(errno, "cannot open file '{}'", filename);
-   \endrst
-  */
-  SystemError(int error_code, CStringRef message) {
-    init(error_code, message, ArgList());
-  }
-  FMT_VARIADIC_CTOR(SystemError, init, int, CStringRef)
-
-  int error_code() const { return error_code_; }
-};
-
-/**
-  \rst
-  This template provides operations for formatting and writing data into
-  a character stream. The output is stored in a buffer provided by a subclass
-  such as :class:`fmt::BasicMemoryWriter`.
-
-  You can use one of the following typedefs for common character types:
-
-  +---------+----------------------+
-  | Type    | Definition           |
-  +=========+======================+
-  | Writer  | BasicWriter<char>    |
-  +---------+----------------------+
-  | WWriter | BasicWriter<wchar_t> |
-  +---------+----------------------+
-
-  \endrst
- */
-template <typename Char>
-class BasicWriter {
- private:
-  // Output buffer.
-  Buffer<Char> &buffer_;
-
-  FMT_DISALLOW_COPY_AND_ASSIGN(BasicWriter);
-
-  typedef typename internal::CharTraits<Char>::CharPtr CharPtr;
-
-#if FMT_SECURE_SCL
-  // Returns pointer value.
-  static Char *get(CharPtr p) { return p.base(); }
-#else
-  static Char *get(Char *p) { return p; }
-#endif
-
-  // Fills the padding around the content and returns the pointer to the
-  // content area.
-  static CharPtr fill_padding(CharPtr buffer,
-      unsigned total_size, std::size_t content_size, wchar_t fill);
-
-  // Grows the buffer by n characters and returns a pointer to the newly
-  // allocated area.
-  CharPtr grow_buffer(std::size_t n) {
-    std::size_t size = buffer_.size();
-    buffer_.resize(size + n);
-    return internal::make_ptr(&buffer_[size], n);
-  }
-
-  // Writes an unsigned decimal integer.
-  template <typename UInt>
-  Char *write_unsigned_decimal(UInt value, unsigned prefix_size = 0) {
-    unsigned num_digits = internal::count_digits(value);
-    Char *ptr = get(grow_buffer(prefix_size + num_digits));
-    internal::format_decimal(ptr + prefix_size, value, num_digits);
-    return ptr;
-  }
-
-  // Writes a decimal integer.
-  template <typename Int>
-  void write_decimal(Int value) {
-    typedef typename internal::IntTraits<Int>::MainType MainType;
-    MainType abs_value = static_cast<MainType>(value);
-    if (internal::is_negative(value)) {
-      abs_value = 0 - abs_value;
-      *write_unsigned_decimal(abs_value, 1) = '-';
-    } else {
-      write_unsigned_decimal(abs_value, 0);
-    }
-  }
-
-  // Prepare a buffer for integer formatting.
-  CharPtr prepare_int_buffer(unsigned num_digits,
-      const EmptySpec &, const char *prefix, unsigned prefix_size) {
-    unsigned size = prefix_size + num_digits;
-    CharPtr p = grow_buffer(size);
-    std::uninitialized_copy(prefix, prefix + prefix_size, p);
-    return p + size - 1;
-  }
-
-  template <typename Spec>
-  CharPtr prepare_int_buffer(unsigned num_digits,
-    const Spec &spec, const char *prefix, unsigned prefix_size);
-
-  // Formats an integer.
-  template <typename T, typename Spec>
-  void write_int(T value, Spec spec);
-
-  // Formats a floating-point number (double or long double).
-  template <typename T>
-  void write_double(T value, const FormatSpec &spec);
-
-  // Writes a formatted string.
-  template <typename StrChar>
-  CharPtr write_str(const StrChar *s, std::size_t size, const AlignSpec &spec);
-
-  template <typename StrChar>
-  void write_str(const internal::Arg::StringValue<StrChar> &str,
-                 const FormatSpec &spec);
-
-  // This following methods are private to disallow writing wide characters
-  // and strings to a char stream. If you want to print a wide string as a
-  // pointer as std::ostream does, cast it to const void*.
-  // Do not implement!
-  void operator<<(typename internal::WCharHelper<wchar_t, Char>::Unsupported);
-  void operator<<(
-      typename internal::WCharHelper<const wchar_t *, Char>::Unsupported);
-
-  // Appends floating-point length specifier to the format string.
-  // The second argument is only used for overload resolution.
-  void append_float_length(Char *&format_ptr, long double) {
-    *format_ptr++ = 'L';
-  }
-
-  template<typename T>
-  void append_float_length(Char *&, T) {}
-
-  template <typename Impl, typename Char_>
-  friend class internal::ArgFormatterBase;
-
-  friend class internal::PrintfArgFormatter<Char>;
-
- protected:
-  /**
-    Constructs a ``BasicWriter`` object.
-   */
-  explicit BasicWriter(Buffer<Char> &b) : buffer_(b) {}
-
- public:
-  /**
-    \rst
-    Destroys a ``BasicWriter`` object.
-    \endrst
-   */
-  virtual ~BasicWriter() {}
-
-  /**
-    Returns the total number of characters written.
-   */
-  std::size_t size() const { return buffer_.size(); }
-
-  /**
-    Returns a pointer to the output buffer content. No terminating null
-    character is appended.
-   */
-  const Char *data() const FMT_NOEXCEPT { return &buffer_[0]; }
-
-  /**
-    Returns a pointer to the output buffer content with terminating null
-    character appended.
-   */
-  const Char *c_str() const {
-    std::size_t size = buffer_.size();
-    buffer_.reserve(size + 1);
-    buffer_[size] = '\0';
-    return &buffer_[0];
-  }
-
-  /**
-    \rst
-    Returns the content of the output buffer as an `std::string`.
-    \endrst
-   */
-  std::basic_string<Char> str() const {
-    return std::basic_string<Char>(&buffer_[0], buffer_.size());
-  }
-
-  /**
-    \rst
-    Writes formatted data.
-
-    *args* is an argument list representing arbitrary arguments.
-
-    **Example**::
-
-       MemoryWriter out;
-       out.write("Current point:\n");
-       out.write("({:+f}, {:+f})", -3.14, 3.14);
-
-    This will write the following output to the ``out`` object:
-
-    .. code-block:: none
-
-       Current point:
-       (-3.140000, +3.140000)
-
-    The output can be accessed using :func:`data()`, :func:`c_str` or
-    :func:`str` methods.
-
-    See also :ref:`syntax`.
-    \endrst
-   */
-  void write(BasicCStringRef<Char> format, ArgList args) {
-    BasicFormatter<Char>(args, *this).format(format);
-  }
-  FMT_VARIADIC_VOID(write, BasicCStringRef<Char>)
-
-  BasicWriter &operator<<(int value) {
-    write_decimal(value);
-    return *this;
-  }
-  BasicWriter &operator<<(unsigned value) {
-    return *this << IntFormatSpec<unsigned>(value);
-  }
-  BasicWriter &operator<<(long value) {
-    write_decimal(value);
-    return *this;
-  }
-  BasicWriter &operator<<(unsigned long value) {
-    return *this << IntFormatSpec<unsigned long>(value);
-  }
-  BasicWriter &operator<<(LongLong value) {
-    write_decimal(value);
-    return *this;
-  }
-
-  /**
-    \rst
-    Formats *value* and writes it to the stream.
-    \endrst
-   */
-  BasicWriter &operator<<(ULongLong value) {
-    return *this << IntFormatSpec<ULongLong>(value);
-  }
-
-  BasicWriter &operator<<(double value) {
-    write_double(value, FormatSpec());
-    return *this;
-  }
-
-  /**
-    \rst
-    Formats *value* using the general format for floating-point numbers
-    (``'g'``) and writes it to the stream.
-    \endrst
-   */
-  BasicWriter &operator<<(long double value) {
-    write_double(value, FormatSpec());
-    return *this;
-  }
-
-  /**
-    Writes a character to the stream.
-   */
-  BasicWriter &operator<<(char value) {
-    buffer_.push_back(value);
-    return *this;
-  }
-
-  BasicWriter &operator<<(
-      typename internal::WCharHelper<wchar_t, Char>::Supported value) {
-    buffer_.push_back(value);
-    return *this;
-  }
-
-  /**
-    \rst
-    Writes *value* to the stream.
-    \endrst
-   */
-  BasicWriter &operator<<(fmt::BasicStringRef<Char> value) {
-    const Char *str = value.data();
-    buffer_.append(str, str + value.size());
-    return *this;
-  }
-
-  BasicWriter &operator<<(
-      typename internal::WCharHelper<StringRef, Char>::Supported value) {
-    const char *str = value.data();
-    buffer_.append(str, str + value.size());
-    return *this;
-  }
-
-  template <typename T, typename Spec, typename FillChar>
-  BasicWriter &operator<<(IntFormatSpec<T, Spec, FillChar> spec) {
-    internal::CharTraits<Char>::convert(FillChar());
-    write_int(spec.value(), spec);
-    return *this;
-  }
-
-  template <typename StrChar>
-  BasicWriter &operator<<(const StrFormatSpec<StrChar> &spec) {
-    const StrChar *s = spec.str();
-    write_str(s, std::char_traits<Char>::length(s), spec);
-    return *this;
-  }
-
-  void clear() FMT_NOEXCEPT { buffer_.clear(); }
-
-  Buffer<Char> &buffer() FMT_NOEXCEPT { return buffer_; }
-};
-
-template <typename Char>
-template <typename StrChar>
-typename BasicWriter<Char>::CharPtr BasicWriter<Char>::write_str(
-      const StrChar *s, std::size_t size, const AlignSpec &spec) {
-  CharPtr out = CharPtr();
-  if (spec.width() > size) {
-    out = grow_buffer(spec.width());
-    Char fill = internal::CharTraits<Char>::cast(spec.fill());
-    if (spec.align() == ALIGN_RIGHT) {
-      std::uninitialized_fill_n(out, spec.width() - size, fill);
-      out += spec.width() - size;
-    } else if (spec.align() == ALIGN_CENTER) {
-      out = fill_padding(out, spec.width(), size, fill);
-    } else {
-      std::uninitialized_fill_n(out + size, spec.width() - size, fill);
-    }
-  } else {
-    out = grow_buffer(size);
-  }
-  std::uninitialized_copy(s, s + size, out);
-  return out;
-}
-
-template <typename Char>
-template <typename StrChar>
-void BasicWriter<Char>::write_str(
-    const internal::Arg::StringValue<StrChar> &s, const FormatSpec &spec) {
-  // Check if StrChar is convertible to Char.
-  internal::CharTraits<Char>::convert(StrChar());
-  if (spec.type_ && spec.type_ != 's')
-    internal::report_unknown_type(spec.type_, "string");
-  const StrChar *str_value = s.value;
-  std::size_t str_size = s.size;
-  if (str_size == 0) {
-    if (!str_value) {
-      FMT_THROW(FormatError("string pointer is null"));
-      return;
-    }
-  }
-  std::size_t precision = static_cast<std::size_t>(spec.precision_);
-  if (spec.precision_ >= 0 && precision < str_size)
-    str_size = precision;
-  write_str(str_value, str_size, spec);
-}
-
-template <typename Char>
-typename BasicWriter<Char>::CharPtr
-  BasicWriter<Char>::fill_padding(
-    CharPtr buffer, unsigned total_size,
-    std::size_t content_size, wchar_t fill) {
-  std::size_t padding = total_size - content_size;
-  std::size_t left_padding = padding / 2;
-  Char fill_char = internal::CharTraits<Char>::cast(fill);
-  std::uninitialized_fill_n(buffer, left_padding, fill_char);
-  buffer += left_padding;
-  CharPtr content = buffer;
-  std::uninitialized_fill_n(buffer + content_size,
-                            padding - left_padding, fill_char);
-  return content;
-}
-
-template <typename Char>
-template <typename Spec>
-typename BasicWriter<Char>::CharPtr
-  BasicWriter<Char>::prepare_int_buffer(
-    unsigned num_digits, const Spec &spec,
-    const char *prefix, unsigned prefix_size) {
-  unsigned width = spec.width();
-  Alignment align = spec.align();
-  Char fill = internal::CharTraits<Char>::cast(spec.fill());
-  if (spec.precision() > static_cast<int>(num_digits)) {
-    // Octal prefix '0' is counted as a digit, so ignore it if precision
-    // is specified.
-    if (prefix_size > 0 && prefix[prefix_size - 1] == '0')
-      --prefix_size;
-    unsigned number_size =
-        prefix_size + internal::to_unsigned(spec.precision());
-    AlignSpec subspec(number_size, '0', ALIGN_NUMERIC);
-    if (number_size >= width)
-      return prepare_int_buffer(num_digits, subspec, prefix, prefix_size);
-    buffer_.reserve(width);
-    unsigned fill_size = width - number_size;
-    if (align != ALIGN_LEFT) {
-      CharPtr p = grow_buffer(fill_size);
-      std::uninitialized_fill(p, p + fill_size, fill);
-    }
-    CharPtr result = prepare_int_buffer(
-        num_digits, subspec, prefix, prefix_size);
-    if (align == ALIGN_LEFT) {
-      CharPtr p = grow_buffer(fill_size);
-      std::uninitialized_fill(p, p + fill_size, fill);
-    }
-    return result;
-  }
-  unsigned size = prefix_size + num_digits;
-  if (width <= size) {
-    CharPtr p = grow_buffer(size);
-    std::uninitialized_copy(prefix, prefix + prefix_size, p);
-    return p + size - 1;
-  }
-  CharPtr p = grow_buffer(width);
-  CharPtr end = p + width;
-  if (align == ALIGN_LEFT) {
-    std::uninitialized_copy(prefix, prefix + prefix_size, p);
-    p += size;
-    std::uninitialized_fill(p, end, fill);
-  } else if (align == ALIGN_CENTER) {
-    p = fill_padding(p, width, size, fill);
-    std::uninitialized_copy(prefix, prefix + prefix_size, p);
-    p += size;
-  } else {
-    if (align == ALIGN_NUMERIC) {
-      if (prefix_size != 0) {
-        p = std::uninitialized_copy(prefix, prefix + prefix_size, p);
-        size -= prefix_size;
-      }
-    } else {
-      std::uninitialized_copy(prefix, prefix + prefix_size, end - size);
-    }
-    std::uninitialized_fill(p, end - size, fill);
-    p = end;
-  }
-  return p - 1;
-}
-
-template <typename Char>
-template <typename T, typename Spec>
-void BasicWriter<Char>::write_int(T value, Spec spec) {
-  unsigned prefix_size = 0;
-  typedef typename internal::IntTraits<T>::MainType UnsignedType;
-  UnsignedType abs_value = static_cast<UnsignedType>(value);
-  char prefix[4] = "";
-  if (internal::is_negative(value)) {
-    prefix[0] = '-';
-    ++prefix_size;
-    abs_value = 0 - abs_value;
-  } else if (spec.flag(SIGN_FLAG)) {
-    prefix[0] = spec.flag(PLUS_FLAG) ? '+' : ' ';
-    ++prefix_size;
-  }
-  switch (spec.type()) {
-  case 0: case 'd': {
-    unsigned num_digits = internal::count_digits(abs_value);
-    CharPtr p = prepare_int_buffer(num_digits, spec, prefix, prefix_size) + 1;
-    internal::format_decimal(get(p), abs_value, 0);
-    break;
-  }
-  case 'x': case 'X': {
-    UnsignedType n = abs_value;
-    if (spec.flag(HASH_FLAG)) {
-      prefix[prefix_size++] = '0';
-      prefix[prefix_size++] = spec.type();
-    }
-    unsigned num_digits = 0;
-    do {
-      ++num_digits;
-    } while ((n >>= 4) != 0);
-    Char *p = get(prepare_int_buffer(
-      num_digits, spec, prefix, prefix_size));
-    n = abs_value;
-    const char *digits = spec.type() == 'x' ?
-        "0123456789abcdef" : "0123456789ABCDEF";
-    do {
-      *p-- = digits[n & 0xf];
-    } while ((n >>= 4) != 0);
-    break;
-  }
-  case 'b': case 'B': {
-    UnsignedType n = abs_value;
-    if (spec.flag(HASH_FLAG)) {
-      prefix[prefix_size++] = '0';
-      prefix[prefix_size++] = spec.type();
-    }
-    unsigned num_digits = 0;
-    do {
-      ++num_digits;
-    } while ((n >>= 1) != 0);
-    Char *p = get(prepare_int_buffer(num_digits, spec, prefix, prefix_size));
-    n = abs_value;
-    do {
-      *p-- = static_cast<Char>('0' + (n & 1));
-    } while ((n >>= 1) != 0);
-    break;
-  }
-  case 'o': {
-    UnsignedType n = abs_value;
-    if (spec.flag(HASH_FLAG))
-      prefix[prefix_size++] = '0';
-    unsigned num_digits = 0;
-    do {
-      ++num_digits;
-    } while ((n >>= 3) != 0);
-    Char *p = get(prepare_int_buffer(num_digits, spec, prefix, prefix_size));
-    n = abs_value;
-    do {
-      *p-- = static_cast<Char>('0' + (n & 7));
-    } while ((n >>= 3) != 0);
-    break;
-  }
-  case 'n': {
-    unsigned num_digits = internal::count_digits(abs_value);
-    fmt::StringRef sep = std::localeconv()->thousands_sep;
-    unsigned size = static_cast<unsigned>(
-          num_digits + sep.size() * (num_digits - 1) / 3);
-    CharPtr p = prepare_int_buffer(size, spec, prefix, prefix_size) + 1;
-    internal::format_decimal(get(p), abs_value, 0, internal::ThousandsSep(sep));
-    break;
-  }
-  default:
-    internal::report_unknown_type(
-      spec.type(), spec.flag(CHAR_FLAG) ? "char" : "integer");
-    break;
-  }
-}
-
-template <typename Char>
-template <typename T>
-void BasicWriter<Char>::write_double(T value, const FormatSpec &spec) {
-  // Check type.
-  char type = spec.type();
-  bool upper = false;
-  switch (type) {
-  case 0:
-    type = 'g';
-    break;
-  case 'e': case 'f': case 'g': case 'a':
-    break;
-  case 'F':
-#ifdef _MSC_VER
-    // MSVC's printf doesn't support 'F'.
-    type = 'f';
-#endif
-    // Fall through.
-  case 'E': case 'G': case 'A':
-    upper = true;
-    break;
-  default:
-    internal::report_unknown_type(type, "double");
-    break;
-  }
-
-  char sign = 0;
-  // Use isnegative instead of value < 0 because the latter is always
-  // false for NaN.
-  if (internal::FPUtil::isnegative(static_cast<double>(value))) {
-    sign = '-';
-    value = -value;
-  } else if (spec.flag(SIGN_FLAG)) {
-    sign = spec.flag(PLUS_FLAG) ? '+' : ' ';
-  }
-
-  if (internal::FPUtil::isnotanumber(value)) {
-    // Format NaN ourselves because sprintf's output is not consistent
-    // across platforms.
-    std::size_t nan_size = 4;
-    const char *nan = upper ? " NAN" : " nan";
-    if (!sign) {
-      --nan_size;
-      ++nan;
-    }
-    CharPtr out = write_str(nan, nan_size, spec);
-    if (sign)
-      *out = sign;
-    return;
-  }
-
-  if (internal::FPUtil::isinfinity(value)) {
-    // Format infinity ourselves because sprintf's output is not consistent
-    // across platforms.
-    std::size_t inf_size = 4;
-    const char *inf = upper ? " INF" : " inf";
-    if (!sign) {
-      --inf_size;
-      ++inf;
-    }
-    CharPtr out = write_str(inf, inf_size, spec);
-    if (sign)
-      *out = sign;
-    return;
-  }
-
-  std::size_t offset = buffer_.size();
-  unsigned width = spec.width();
-  if (sign) {
-    buffer_.reserve(buffer_.size() + (width > 1u ? width : 1u));
-    if (width > 0)
-      --width;
-    ++offset;
-  }
-
-  // Build format string.
-  enum { MAX_FORMAT_SIZE = 10}; // longest format: %#-*.*Lg
-  Char format[MAX_FORMAT_SIZE];
-  Char *format_ptr = format;
-  *format_ptr++ = '%';
-  unsigned width_for_sprintf = width;
-  if (spec.flag(HASH_FLAG))
-    *format_ptr++ = '#';
-  if (spec.align() == ALIGN_CENTER) {
-    width_for_sprintf = 0;
-  } else {
-    if (spec.align() == ALIGN_LEFT)
-      *format_ptr++ = '-';
-    if (width != 0)
-      *format_ptr++ = '*';
-  }
-  if (spec.precision() >= 0) {
-    *format_ptr++ = '.';
-    *format_ptr++ = '*';
-  }
-
-  append_float_length(format_ptr, value);
-  *format_ptr++ = type;
-  *format_ptr = '\0';
-
-  // Format using snprintf.
-  Char fill = internal::CharTraits<Char>::cast(spec.fill());
-  unsigned n = 0;
-  Char *start = 0;
-  for (;;) {
-    std::size_t buffer_size = buffer_.capacity() - offset;
-#ifdef _MSC_VER
-    // MSVC's vsnprintf_s doesn't work with zero size, so reserve
-    // space for at least one extra character to make the size non-zero.
-    // Note that the buffer's capacity will increase by more than 1.
-    if (buffer_size == 0) {
-      buffer_.reserve(offset + 1);
-      buffer_size = buffer_.capacity() - offset;
-    }
-#endif
-    start = &buffer_[offset];
-    int result = internal::CharTraits<Char>::format_float(
-        start, buffer_size, format, width_for_sprintf, spec.precision(), value);
-    if (result >= 0) {
-      n = internal::to_unsigned(result);
-      if (offset + n < buffer_.capacity())
-        break;  // The buffer is large enough - continue with formatting.
-      buffer_.reserve(offset + n + 1);
-    } else {
-      // If result is negative we ask to increase the capacity by at least 1,
-      // but as std::vector, the buffer grows exponentially.
-      buffer_.reserve(buffer_.capacity() + 1);
-    }
-  }
-  if (sign) {
-    if ((spec.align() != ALIGN_RIGHT && spec.align() != ALIGN_DEFAULT) ||
-        *start != ' ') {
-      *(start - 1) = sign;
-      sign = 0;
-    } else {
-      *(start - 1) = fill;
-    }
-    ++n;
-  }
-  if (spec.align() == ALIGN_CENTER && spec.width() > n) {
-    width = spec.width();
-    CharPtr p = grow_buffer(width);
-    std::memmove(get(p) + (width - n) / 2, get(p), n * sizeof(Char));
-    fill_padding(p, spec.width(), n, fill);
-    return;
-  }
-  if (spec.fill() != ' ' || sign) {
-    while (*start == ' ')
-      *start++ = fill;
-    if (sign)
-      *(start - 1) = sign;
-  }
-  grow_buffer(n);
-}
-
-/**
-  \rst
-  This class template provides operations for formatting and writing data
-  into a character stream. The output is stored in a memory buffer that grows
-  dynamically.
-
-  You can use one of the following typedefs for common character types
-  and the standard allocator:
-
-  +---------------+-----------------------------------------------------+
-  | Type          | Definition                                          |
-  +===============+=====================================================+
-  | MemoryWriter  | BasicMemoryWriter<char, std::allocator<char>>       |
-  +---------------+-----------------------------------------------------+
-  | WMemoryWriter | BasicMemoryWriter<wchar_t, std::allocator<wchar_t>> |
-  +---------------+-----------------------------------------------------+
-
-  **Example**::
-
-     MemoryWriter out;
-     out << "The answer is " << 42 << "\n";
-     out.write("({:+f}, {:+f})", -3.14, 3.14);
-
-  This will write the following output to the ``out`` object:
-
-  .. code-block:: none
-
-     The answer is 42
-     (-3.140000, +3.140000)
-
-  The output can be converted to an ``std::string`` with ``out.str()`` or
-  accessed as a C string with ``out.c_str()``.
-  \endrst
- */
-template <typename Char, typename Allocator = std::allocator<Char> >
-class BasicMemoryWriter : public BasicWriter<Char> {
- private:
-  internal::MemoryBuffer<Char, internal::INLINE_BUFFER_SIZE, Allocator> buffer_;
-
- public:
-  explicit BasicMemoryWriter(const Allocator& alloc = Allocator())
-    : BasicWriter<Char>(buffer_), buffer_(alloc) {}
-
-#if FMT_USE_RVALUE_REFERENCES
-  /**
-    \rst
-    Constructs a :class:`fmt::BasicMemoryWriter` object moving the content
-    of the other object to it.
-    \endrst
-   */
-  BasicMemoryWriter(BasicMemoryWriter &&other)
-    : BasicWriter<Char>(buffer_), buffer_(std::move(other.buffer_)) {
-  }
-
-  /**
-    \rst
-    Moves the content of the other ``BasicMemoryWriter`` object to this one.
-    \endrst
-   */
-  BasicMemoryWriter &operator=(BasicMemoryWriter &&other) {
-    buffer_ = std::move(other.buffer_);
-    return *this;
-  }
-#endif
-};
-
-typedef BasicMemoryWriter<char> MemoryWriter;
-typedef BasicMemoryWriter<wchar_t> WMemoryWriter;
-
-/**
-  \rst
-  This class template provides operations for formatting and writing data
-  into a fixed-size array. For writing into a dynamically growing buffer
-  use :class:`fmt::BasicMemoryWriter`.
-
-  Any write method will throw ``std::runtime_error`` if the output doesn't fit
-  into the array.
-
-  You can use one of the following typedefs for common character types:
-
-  +--------------+---------------------------+
-  | Type         | Definition                |
-  +==============+===========================+
-  | ArrayWriter  | BasicArrayWriter<char>    |
-  +--------------+---------------------------+
-  | WArrayWriter | BasicArrayWriter<wchar_t> |
-  +--------------+---------------------------+
-  \endrst
- */
-template <typename Char>
-class BasicArrayWriter : public BasicWriter<Char> {
- private:
-  internal::FixedBuffer<Char> buffer_;
-
- public:
-  /**
-   \rst
-   Constructs a :class:`fmt::BasicArrayWriter` object for *array* of the
-   given size.
-   \endrst
-   */
-  BasicArrayWriter(Char *array, std::size_t size)
-    : BasicWriter<Char>(buffer_), buffer_(array, size) {}
-
-  /**
-   \rst
-   Constructs a :class:`fmt::BasicArrayWriter` object for *array* of the
-   size known at compile time.
-   \endrst
-   */
-  template <std::size_t SIZE>
-  explicit BasicArrayWriter(Char (&array)[SIZE])
-    : BasicWriter<Char>(buffer_), buffer_(array, SIZE) {}
-};
-
-typedef BasicArrayWriter<char> ArrayWriter;
-typedef BasicArrayWriter<wchar_t> WArrayWriter;
-
-// Reports a system error without throwing an exception.
-// Can be used to report errors from destructors.
-FMT_API void report_system_error(int error_code,
-                                 StringRef message) FMT_NOEXCEPT;
-
-#if FMT_USE_WINDOWS_H
-
-/** A Windows error. */
-class WindowsError : public SystemError {
- private:
-  FMT_API void init(int error_code, CStringRef format_str, ArgList args);
-
- public:
-  /**
-   \rst
-   Constructs a :class:`fmt::WindowsError` object with the description
-   of the form
-
-   .. parsed-literal::
-     *<message>*: *<system-message>*
-
-   where *<message>* is the formatted message and *<system-message>* is the
-   system message corresponding to the error code.
-   *error_code* is a Windows error code as given by ``GetLastError``.
-   If *error_code* is not a valid error code such as -1, the system message
-   will look like "error -1".
-
-   **Example**::
-
-     // This throws a WindowsError with the description
-     //   cannot open file 'madeup': The system cannot find the file specified.
-     // or similar (system message may vary).
-     const char *filename = "madeup";
-     LPOFSTRUCT of = LPOFSTRUCT();
-     HFILE file = OpenFile(filename, &of, OF_READ);
-     if (file == HFILE_ERROR) {
-       throw fmt::WindowsError(GetLastError(),
-                               "cannot open file '{}'", filename);
-     }
-   \endrst
-  */
-  WindowsError(int error_code, CStringRef message) {
-    init(error_code, message, ArgList());
-  }
-  FMT_VARIADIC_CTOR(WindowsError, init, int, CStringRef)
-};
-
-// Reports a Windows error without throwing an exception.
-// Can be used to report errors from destructors.
-FMT_API void report_windows_error(int error_code,
-                                  StringRef message) FMT_NOEXCEPT;
-
-#endif
-
-enum Color { BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE };
-
-/**
-  Formats a string and prints it to stdout using ANSI escape sequences
-  to specify color (experimental).
-  Example:
-    print_colored(fmt::RED, "Elapsed time: {0:.2f} seconds", 1.23);
- */
-FMT_API void print_colored(Color c, CStringRef format, ArgList args);
-
-/**
-  \rst
-  Formats arguments and returns the result as a string.
-
-  **Example**::
-
-    std::string message = format("The answer is {}", 42);
-  \endrst
-*/
-inline std::string format(CStringRef format_str, ArgList args) {
-  MemoryWriter w;
-  w.write(format_str, args);
-  return w.str();
-}
-
-inline std::wstring format(WCStringRef format_str, ArgList args) {
-  WMemoryWriter w;
-  w.write(format_str, args);
-  return w.str();
-}
-
-/**
-  \rst
-  Prints formatted data to the file *f*.
-
-  **Example**::
-
-    print(stderr, "Don't {}!", "panic");
-  \endrst
- */
-FMT_API void print(std::FILE *f, CStringRef format_str, ArgList args);
-
-/**
-  \rst
-  Prints formatted data to ``stdout``.
-
-  **Example**::
-
-    print("Elapsed time: {0:.2f} seconds", 1.23);
-  \endrst
- */
-FMT_API void print(CStringRef format_str, ArgList args);
-
-template <typename Char>
-void printf(BasicWriter<Char> &w, BasicCStringRef<Char> format, ArgList args) {
-  internal::PrintfFormatter<Char>(args).format(w, format);
-}
-
-/**
-  \rst
-  Formats arguments and returns the result as a string.
-
-  **Example**::
-
-    std::string message = fmt::sprintf("The answer is %d", 42);
-  \endrst
-*/
-inline std::string sprintf(CStringRef format, ArgList args) {
-  MemoryWriter w;
-  printf(w, format, args);
-  return w.str();
-}
-
-inline std::wstring sprintf(WCStringRef format, ArgList args) {
-  WMemoryWriter w;
-  printf(w, format, args);
-  return w.str();
-}
-
-/**
-  \rst
-  Prints formatted data to the file *f*.
-
-  **Example**::
-
-    fmt::fprintf(stderr, "Don't %s!", "panic");
-  \endrst
- */
-FMT_API int fprintf(std::FILE *f, CStringRef format, ArgList args);
-
-/**
-  \rst
-  Prints formatted data to ``stdout``.
-
-  **Example**::
-
-    fmt::printf("Elapsed time: %.2f seconds", 1.23);
-  \endrst
- */
-inline int printf(CStringRef format, ArgList args) {
-  return fprintf(stdout, format, args);
-}
-
-/**
-  Fast integer formatter.
- */
-class FormatInt {
- private:
-  // Buffer should be large enough to hold all digits (digits10 + 1),
-  // a sign and a null character.
-  enum {BUFFER_SIZE = std::numeric_limits<ULongLong>::digits10 + 3};
-  mutable char buffer_[BUFFER_SIZE];
-  char *str_;
-
-  // Formats value in reverse and returns the number of digits.
-  char *format_decimal(ULongLong value) {
-    char *buffer_end = buffer_ + BUFFER_SIZE - 1;
-    while (value >= 100) {
-      // Integer division is slow so do it for a group of two digits instead
-      // of for every digit. The idea comes from the talk by Alexandrescu
-      // "Three Optimization Tips for C++". See speed-test for a comparison.
-      unsigned index = static_cast<unsigned>((value % 100) * 2);
-      value /= 100;
-      *--buffer_end = internal::Data::DIGITS[index + 1];
-      *--buffer_end = internal::Data::DIGITS[index];
-    }
-    if (value < 10) {
-      *--buffer_end = static_cast<char>('0' + value);
-      return buffer_end;
-    }
-    unsigned index = static_cast<unsigned>(value * 2);
-    *--buffer_end = internal::Data::DIGITS[index + 1];
-    *--buffer_end = internal::Data::DIGITS[index];
-    return buffer_end;
-  }
-
-  void FormatSigned(LongLong value) {
-    ULongLong abs_value = static_cast<ULongLong>(value);
-    bool negative = value < 0;
-    if (negative)
-      abs_value = 0 - abs_value;
-    str_ = format_decimal(abs_value);
-    if (negative)
-      *--str_ = '-';
-  }
-
- public:
-  explicit FormatInt(int value) { FormatSigned(value); }
-  explicit FormatInt(long value) { FormatSigned(value); }
-  explicit FormatInt(LongLong value) { FormatSigned(value); }
-  explicit FormatInt(unsigned value) : str_(format_decimal(value)) {}
-  explicit FormatInt(unsigned long value) : str_(format_decimal(value)) {}
-  explicit FormatInt(ULongLong value) : str_(format_decimal(value)) {}
-
-  /** Returns the number of characters written to the output buffer. */
-  std::size_t size() const {
-    return internal::to_unsigned(buffer_ - str_ + BUFFER_SIZE - 1);
-  }
-
-  /**
-    Returns a pointer to the output buffer content. No terminating null
-    character is appended.
-   */
-  const char *data() const { return str_; }
-
-  /**
-    Returns a pointer to the output buffer content with terminating null
-    character appended.
-   */
-  const char *c_str() const {
-    buffer_[BUFFER_SIZE - 1] = '\0';
-    return str_;
-  }
-
-  /**
-    \rst
-    Returns the content of the output buffer as an ``std::string``.
-    \endrst
-   */
-  std::string str() const { return std::string(str_, size()); }
-};
-
-// Formats a decimal integer value writing into buffer and returns
-// a pointer to the end of the formatted string. This function doesn't
-// write a terminating null character.
-template <typename T>
-inline void format_decimal(char *&buffer, T value) {
-  typedef typename internal::IntTraits<T>::MainType MainType;
-  MainType abs_value = static_cast<MainType>(value);
-  if (internal::is_negative(value)) {
-    *buffer++ = '-';
-    abs_value = 0 - abs_value;
-  }
-  if (abs_value < 100) {
-    if (abs_value < 10) {
-      *buffer++ = static_cast<char>('0' + abs_value);
-      return;
-    }
-    unsigned index = static_cast<unsigned>(abs_value * 2);
-    *buffer++ = internal::Data::DIGITS[index];
-    *buffer++ = internal::Data::DIGITS[index + 1];
-    return;
-  }
-  unsigned num_digits = internal::count_digits(abs_value);
-  internal::format_decimal(buffer, abs_value, num_digits);
-  buffer += num_digits;
-}
-
-/**
-  \rst
-  Returns a named argument for formatting functions.
-
-  **Example**::
-
-    print("Elapsed time: {s:.2f} seconds", arg("s", 1.23));
-
-  \endrst
- */
-template <typename T>
-inline internal::NamedArg<char> arg(StringRef name, const T &arg) {
-  return internal::NamedArg<char>(name, arg);
-}
-
-template <typename T>
-inline internal::NamedArg<wchar_t> arg(WStringRef name, const T &arg) {
-  return internal::NamedArg<wchar_t>(name, arg);
-}
-
-// The following two functions are deleted intentionally to disable
-// nested named arguments as in ``format("{}", arg("a", arg("b", 42)))``.
-template <typename Char>
-void arg(StringRef, const internal::NamedArg<Char>&) FMT_DELETED_OR_UNDEFINED;
-template <typename Char>
-void arg(WStringRef, const internal::NamedArg<Char>&) FMT_DELETED_OR_UNDEFINED;
-}
-
-#if FMT_GCC_VERSION
-// Use the system_header pragma to suppress warnings about variadic macros
-// because suppressing -Wvariadic-macros with the diagnostic pragma doesn't
-// work. It is used at the end because we want to suppress as little warnings
-// as possible.
-# pragma GCC system_header
-#endif
-
-// This is used to work around VC++ bugs in handling variadic macros.
-#define FMT_EXPAND(args) args
-
-// Returns the number of arguments.
-// Based on https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s.
-#define FMT_NARG(...) FMT_NARG_(__VA_ARGS__, FMT_RSEQ_N())
-#define FMT_NARG_(...) FMT_EXPAND(FMT_ARG_N(__VA_ARGS__))
-#define FMT_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
-#define FMT_RSEQ_N() 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-#define FMT_CONCAT(a, b) a##b
-#define FMT_FOR_EACH_(N, f, ...) \
-  FMT_EXPAND(FMT_CONCAT(FMT_FOR_EACH, N)(f, __VA_ARGS__))
-#define FMT_FOR_EACH(f, ...) \
-  FMT_EXPAND(FMT_FOR_EACH_(FMT_NARG(__VA_ARGS__), f, __VA_ARGS__))
-
-#define FMT_ADD_ARG_NAME(type, index) type arg##index
-#define FMT_GET_ARG_NAME(type, index) arg##index
-
-#if FMT_USE_VARIADIC_TEMPLATES
-# define FMT_VARIADIC_(Char, ReturnType, func, call, ...) \
-  template <typename... Args> \
-  ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__), \
-      const Args & ... args) { \
-    typedef fmt::internal::ArgArray<sizeof...(Args)> ArgArray; \
-    typename ArgArray::Type array{ \
-      ArgArray::template make<fmt::BasicFormatter<Char> >(args)...}; \
-    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), \
-      fmt::ArgList(fmt::internal::make_type(args...), array)); \
-  }
-#else
-// Defines a wrapper for a function taking __VA_ARGS__ arguments
-// and n additional arguments of arbitrary types.
-# define FMT_WRAP(Char, ReturnType, func, call, n, ...) \
-  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
-  inline ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__), \
-      FMT_GEN(n, FMT_MAKE_ARG)) { \
-    fmt::internal::ArgArray<n>::Type arr; \
-    FMT_GEN(n, FMT_ASSIGN_##Char); \
-    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), fmt::ArgList( \
-      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), arr)); \
-  }
-
-# define FMT_VARIADIC_(Char, ReturnType, func, call, ...) \
-  inline ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__)) { \
-    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), fmt::ArgList()); \
-  } \
-  FMT_WRAP(Char, ReturnType, func, call, 1, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 2, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 3, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 4, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 5, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 6, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 7, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 8, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 9, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 10, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 11, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 12, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 13, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 14, __VA_ARGS__) \
-  FMT_WRAP(Char, ReturnType, func, call, 15, __VA_ARGS__)
-#endif  // FMT_USE_VARIADIC_TEMPLATES
-
-/**
-  \rst
-  Defines a variadic function with the specified return type, function name
-  and argument types passed as variable arguments to this macro.
-
-  **Example**::
-
-    void print_error(const char *file, int line, const char *format,
-                     fmt::ArgList args) {
-      fmt::print("{}: {}: ", file, line);
-      fmt::print(format, args);
-    }
-    FMT_VARIADIC(void, print_error, const char *, int, const char *)
-
-  ``FMT_VARIADIC`` is used for compatibility with legacy C++ compilers that
-  don't implement variadic templates. You don't have to use this macro if
-  you don't need legacy compiler support and can use variadic templates
-  directly::
-
-    template <typename... Args>
-    void print_error(const char *file, int line, const char *format,
-                     const Args & ... args) {
-      fmt::print("{}: {}: ", file, line);
-      fmt::print(format, args...);
-    }
-  \endrst
- */
-#define FMT_VARIADIC(ReturnType, func, ...) \
-  FMT_VARIADIC_(char, ReturnType, func, return func, __VA_ARGS__)
-
-#define FMT_VARIADIC_W(ReturnType, func, ...) \
-  FMT_VARIADIC_(wchar_t, ReturnType, func, return func, __VA_ARGS__)
-
-#define FMT_CAPTURE_ARG_(id, index) ::fmt::arg(#id, id)
-
-#define FMT_CAPTURE_ARG_W_(id, index) ::fmt::arg(L###id, id)
-
-/**
-  \rst
-  Convenient macro to capture the arguments' names and values into several
-  ``fmt::arg(name, value)``.
-
-  **Example**::
-
-    int x = 1, y = 2;
-    print("point: ({x}, {y})", FMT_CAPTURE(x, y));
-    // same as:
-    // print("point: ({x}, {y})", arg("x", x), arg("y", y));
-
-  \endrst
- */
-#define FMT_CAPTURE(...) FMT_FOR_EACH(FMT_CAPTURE_ARG_, __VA_ARGS__)
-
-#define FMT_CAPTURE_W(...) FMT_FOR_EACH(FMT_CAPTURE_ARG_W_, __VA_ARGS__)
-
-namespace fmt {
-FMT_VARIADIC(std::string, format, CStringRef)
-FMT_VARIADIC_W(std::wstring, format, WCStringRef)
-FMT_VARIADIC(void, print, CStringRef)
-FMT_VARIADIC(void, print, std::FILE *, CStringRef)
-
-FMT_VARIADIC(void, print_colored, Color, CStringRef)
-FMT_VARIADIC(std::string, sprintf, CStringRef)
-FMT_VARIADIC_W(std::wstring, sprintf, WCStringRef)
-FMT_VARIADIC(int, printf, CStringRef)
-FMT_VARIADIC(int, fprintf, std::FILE *, CStringRef)
-
-namespace internal {
-template <typename Char>
-inline bool is_name_start(Char c) {
-  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
-}
-
-// Parses an unsigned integer advancing s to the end of the parsed input.
-// This function assumes that the first character of s is a digit.
-template <typename Char>
-unsigned parse_nonnegative_int(const Char *&s) {
-  assert('0' <= *s && *s <= '9');
-  unsigned value = 0;
-  do {
-    unsigned new_value = value * 10 + (*s++ - '0');
-    // Check if value wrapped around.
-    if (new_value < value) {
-      value = (std::numeric_limits<unsigned>::max)();
-      break;
-    }
-    value = new_value;
-  } while ('0' <= *s && *s <= '9');
-  // Convert to unsigned to prevent a warning.
-  unsigned max_int = (std::numeric_limits<int>::max)();
-  if (value > max_int)
-    FMT_THROW(FormatError("number is too big"));
-  return value;
-}
-
-inline void require_numeric_argument(const Arg &arg, char spec) {
-  if (arg.type > Arg::LAST_NUMERIC_TYPE) {
-    std::string message =
-        fmt::format("format specifier '{}' requires numeric argument", spec);
-    FMT_THROW(fmt::FormatError(message));
-  }
-}
-
-template <typename Char>
-void check_sign(const Char *&s, const Arg &arg) {
-  char sign = static_cast<char>(*s);
-  require_numeric_argument(arg, sign);
-  if (arg.type == Arg::UINT || arg.type == Arg::ULONG_LONG) {
-    FMT_THROW(FormatError(fmt::format(
-      "format specifier '{}' requires signed argument", sign)));
-  }
-  ++s;
-}
-}  // namespace internal
-
-template <typename Char, typename AF>
-inline internal::Arg BasicFormatter<Char, AF>::get_arg(
-    BasicStringRef<Char> arg_name, const char *&error) {
-  if (check_no_auto_index(error)) {
-    map_.init(args());
-    const internal::Arg *arg = map_.find(arg_name);
-    if (arg)
-      return *arg;
-    error = "argument not found";
-  }
-  return internal::Arg();
-}
-
-template <typename Char, typename AF>
-inline internal::Arg BasicFormatter<Char, AF>::parse_arg_index(const Char *&s) {
-  const char *error = 0;
-  internal::Arg arg = *s < '0' || *s > '9' ?
-        next_arg(error) : get_arg(internal::parse_nonnegative_int(s), error);
-  if (error) {
-    FMT_THROW(FormatError(
-                *s != '}' && *s != ':' ? "invalid format string" : error));
-  }
-  return arg;
-}
-
-template <typename Char, typename AF>
-inline internal::Arg BasicFormatter<Char, AF>::parse_arg_name(const Char *&s) {
-  assert(internal::is_name_start(*s));
-  const Char *start = s;
-  Char c;
-  do {
-    c = *++s;
-  } while (internal::is_name_start(c) || ('0' <= c && c <= '9'));
-  const char *error = 0;
-  internal::Arg arg = get_arg(BasicStringRef<Char>(start, s - start), error);
-  if (error)
-    FMT_THROW(FormatError(error));
-  return arg;
-}
-
-template <typename Char, typename ArgFormatter>
-const Char *BasicFormatter<Char, ArgFormatter>::format(
-    const Char *&format_str, const internal::Arg &arg) {
-  using internal::Arg;
-  const Char *s = format_str;
-  FormatSpec spec;
-  if (*s == ':') {
-    if (arg.type == Arg::CUSTOM) {
-      arg.custom.format(this, arg.custom.value, &s);
-      return s;
-    }
-    ++s;
-    // Parse fill and alignment.
-    if (Char c = *s) {
-      const Char *p = s + 1;
-      spec.align_ = ALIGN_DEFAULT;
-      do {
-        switch (*p) {
-          case '<':
-            spec.align_ = ALIGN_LEFT;
-            break;
-          case '>':
-            spec.align_ = ALIGN_RIGHT;
-            break;
-          case '=':
-            spec.align_ = ALIGN_NUMERIC;
-            break;
-          case '^':
-            spec.align_ = ALIGN_CENTER;
-            break;
-        }
-        if (spec.align_ != ALIGN_DEFAULT) {
-          if (p != s) {
-            if (c == '}') break;
-            if (c == '{')
-              FMT_THROW(FormatError("invalid fill character '{'"));
-            s += 2;
-            spec.fill_ = c;
-          } else ++s;
-          if (spec.align_ == ALIGN_NUMERIC)
-            require_numeric_argument(arg, '=');
-          break;
-        }
-      } while (--p >= s);
-    }
-
-    // Parse sign.
-    switch (*s) {
-      case '+':
-        check_sign(s, arg);
-        spec.flags_ |= SIGN_FLAG | PLUS_FLAG;
-        break;
-      case '-':
-        check_sign(s, arg);
-        spec.flags_ |= MINUS_FLAG;
-        break;
-      case ' ':
-        check_sign(s, arg);
-        spec.flags_ |= SIGN_FLAG;
-        break;
-    }
-
-    if (*s == '#') {
-      require_numeric_argument(arg, '#');
-      spec.flags_ |= HASH_FLAG;
-      ++s;
-    }
-
-    // Parse zero flag.
-    if (*s == '0') {
-      require_numeric_argument(arg, '0');
-      spec.align_ = ALIGN_NUMERIC;
-      spec.fill_ = '0';
-      ++s;
-    }
-
-    // Parse width.
-    if ('0' <= *s && *s <= '9') {
-      spec.width_ = internal::parse_nonnegative_int(s);
-    } else if (*s == '{') {
-      ++s;
-      Arg width_arg = internal::is_name_start(*s) ?
-            parse_arg_name(s) : parse_arg_index(s);
-      if (*s++ != '}')
-        FMT_THROW(FormatError("invalid format string"));
-      ULongLong value = 0;
-      switch (width_arg.type) {
-      case Arg::INT:
-        if (width_arg.int_value < 0)
-          FMT_THROW(FormatError("negative width"));
-        value = width_arg.int_value;
-        break;
-      case Arg::UINT:
-        value = width_arg.uint_value;
-        break;
-      case Arg::LONG_LONG:
-        if (width_arg.long_long_value < 0)
-          FMT_THROW(FormatError("negative width"));
-        value = width_arg.long_long_value;
-        break;
-      case Arg::ULONG_LONG:
-        value = width_arg.ulong_long_value;
-        break;
-      default:
-        FMT_THROW(FormatError("width is not integer"));
-      }
-      if (value > (std::numeric_limits<int>::max)())
-        FMT_THROW(FormatError("number is too big"));
-      spec.width_ = static_cast<int>(value);
-    }
-
-    // Parse precision.
-    if (*s == '.') {
-      ++s;
-      spec.precision_ = 0;
-      if ('0' <= *s && *s <= '9') {
-        spec.precision_ = internal::parse_nonnegative_int(s);
-      } else if (*s == '{') {
-        ++s;
-        Arg precision_arg = internal::is_name_start(*s) ?
-              parse_arg_name(s) : parse_arg_index(s);
-        if (*s++ != '}')
-          FMT_THROW(FormatError("invalid format string"));
-        ULongLong value = 0;
-        switch (precision_arg.type) {
-          case Arg::INT:
-            if (precision_arg.int_value < 0)
-              FMT_THROW(FormatError("negative precision"));
-            value = precision_arg.int_value;
-            break;
-          case Arg::UINT:
-            value = precision_arg.uint_value;
-            break;
-          case Arg::LONG_LONG:
-            if (precision_arg.long_long_value < 0)
-              FMT_THROW(FormatError("negative precision"));
-            value = precision_arg.long_long_value;
-            break;
-          case Arg::ULONG_LONG:
-            value = precision_arg.ulong_long_value;
-            break;
-          default:
-            FMT_THROW(FormatError("precision is not integer"));
-        }
-        if (value > (std::numeric_limits<int>::max)())
-          FMT_THROW(FormatError("number is too big"));
-        spec.precision_ = static_cast<int>(value);
-      } else {
-        FMT_THROW(FormatError("missing precision specifier"));
-      }
-      if (arg.type <= Arg::LAST_INTEGER_TYPE || arg.type == Arg::POINTER) {
-        FMT_THROW(FormatError(
-            fmt::format("precision not allowed in {} format specifier",
-            arg.type == Arg::POINTER ? "pointer" : "integer")));
-      }
-    }
-
-    // Parse type.
-    if (*s != '}' && *s)
-      spec.type_ = static_cast<char>(*s++);
-  }
-
-  if (*s++ != '}')
-    FMT_THROW(FormatError("missing '}' in format string"));
-
-  // Format argument.
-  ArgFormatter(*this, spec, s - 1).visit(arg);
-  return s;
-}
-
-template <typename Char, typename AF>
-void BasicFormatter<Char, AF>::format(BasicCStringRef<Char> format_str) {
-  const Char *s = format_str.c_str();
-  const Char *start = s;
-  while (*s) {
-    Char c = *s++;
-    if (c != '{' && c != '}') continue;
-    if (*s == c) {
-      write(writer_, start, s);
-      start = ++s;
-      continue;
-    }
-    if (c == '}')
-      FMT_THROW(FormatError("unmatched '}' in format string"));
-    write(writer_, start, s - 1);
-    internal::Arg arg = internal::is_name_start(*s) ?
-          parse_arg_name(s) : parse_arg_index(s);
-    start = s = format(s, arg);
-  }
-  write(writer_, start, s);
-}
-}  // namespace fmt
-
-#if FMT_USE_USER_DEFINED_LITERALS
-namespace fmt {
-namespace internal {
-
-template <typename Char>
-struct UdlFormat {
-  const Char *str;
-
-  template <typename... Args>
-  auto operator()(Args && ... args) const
-                  -> decltype(format(str, std::forward<Args>(args)...)) {
-    return format(str, std::forward<Args>(args)...);
-  }
-};
-
-template <typename Char>
-struct UdlArg {
-  const Char *str;
-
-  template <typename T>
-  NamedArg<Char> operator=(T &&value) const {
-    return {str, std::forward<T>(value)};
-  }
-};
-
-} // namespace internal
-
-inline namespace literals {
-
-/**
-  \rst
-  C++11 literal equivalent of :func:`fmt::format`.
-
-  **Example**::
-
-    using namespace fmt::literals;
-    std::string message = "The answer is {}"_format(42);
-  \endrst
- */
-inline internal::UdlFormat<char>
-operator"" _format(const char *s, std::size_t) { return {s}; }
-inline internal::UdlFormat<wchar_t>
-operator"" _format(const wchar_t *s, std::size_t) { return {s}; }
-
-/**
-  \rst
-  C++11 literal equivalent of :func:`fmt::arg`.
-
-  **Example**::
-
-    using namespace fmt::literals;
-    print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
-  \endrst
- */
-inline internal::UdlArg<char>
-operator"" _a(const char *s, std::size_t) { return {s}; }
-inline internal::UdlArg<wchar_t>
-operator"" _a(const wchar_t *s, std::size_t) { return {s}; }
-
-} // inline namespace literals
-} // namespace fmt
-#endif // FMT_USE_USER_DEFINED_LITERALS
-
-// Restore warnings.
-#if FMT_GCC_VERSION >= 406
-# pragma GCC diagnostic pop
-#endif
-
-#if defined(__clang__) && !defined(FMT_ICC_VERSION)
-# pragma clang diagnostic pop
-#endif
-
-#ifdef FMT_HEADER_ONLY
-# define FMT_FUNC inline
-# include "format.cc"
-#else
-# define FMT_FUNC
-#endif
-
-#endif  // FMT_FORMAT_H_
diff --git a/diy/include/diy/fmt/ostream.cc b/diy/include/diy/fmt/ostream.cc
deleted file mode 100644
index 0ba303478..000000000
--- a/diy/include/diy/fmt/ostream.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- Formatting library for C++ - std::ostream support
-
- Copyright (c) 2012 - 2016, Victor Zverovich
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice, this
-    list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "ostream.h"
-
-namespace fmt {
-
-namespace {
-// Write the content of w to os.
-void write(std::ostream &os, Writer &w) {
-  const char *data = w.data();
-  typedef internal::MakeUnsigned<std::streamsize>::Type UnsignedStreamSize;
-  UnsignedStreamSize size = w.size();
-  UnsignedStreamSize max_size =
-      internal::to_unsigned((std::numeric_limits<std::streamsize>::max)());
-  do {
-    UnsignedStreamSize n = size <= max_size ? size : max_size;
-    os.write(data, static_cast<std::streamsize>(n));
-    data += n;
-    size -= n;
-  } while (size != 0);
-}
-}
-
-FMT_FUNC void print(std::ostream &os, CStringRef format_str, ArgList args) {
-  MemoryWriter w;
-  w.write(format_str, args);
-  write(os, w);
-}
-
-FMT_FUNC int fprintf(std::ostream &os, CStringRef format, ArgList args) {
-  MemoryWriter w;
-  printf(w, format, args);
-  write(os, w);
-  return static_cast<int>(w.size());
-}
-}  // namespace fmt
diff --git a/diy/include/diy/fmt/ostream.h b/diy/include/diy/fmt/ostream.h
deleted file mode 100644
index 812278dd3..000000000
--- a/diy/include/diy/fmt/ostream.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- Formatting library for C++ - std::ostream support
-
- Copyright (c) 2012 - 2016, Victor Zverovich
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice, this
-    list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef FMT_OSTREAM_H_
-#define FMT_OSTREAM_H_
-
-#include "format.h"
-#include <ostream>
-
-namespace fmt {
-
-namespace internal {
-
-template <class Char>
-class FormatBuf : public std::basic_streambuf<Char> {
- private:
-  typedef typename std::basic_streambuf<Char>::int_type int_type;
-  typedef typename std::basic_streambuf<Char>::traits_type traits_type;
-
-  Buffer<Char> &buffer_;
-  Char *start_;
-
- public:
-  FormatBuf(Buffer<Char> &buffer) : buffer_(buffer), start_(&buffer[0]) {
-    this->setp(start_, start_ + buffer_.capacity());
-  }
-
-  int_type overflow(int_type ch = traits_type::eof()) {
-    if (!traits_type::eq_int_type(ch, traits_type::eof())) {
-      size_t buf_size = size();
-      buffer_.resize(buf_size);
-      buffer_.reserve(buf_size * 2);
-
-      start_ = &buffer_[0];
-      start_[buf_size] = traits_type::to_char_type(ch);
-      this->setp(start_+ buf_size + 1, start_ + buf_size * 2);
-    }
-    return ch;
-  }
-
-  size_t size() const {
-    return to_unsigned(this->pptr() - start_);
-  }
-};
-
-Yes &convert(std::ostream &);
-
-struct DummyStream : std::ostream {
-  DummyStream();  // Suppress a bogus warning in MSVC.
-  // Hide all operator<< overloads from std::ostream.
-  void operator<<(Null<>);
-};
-
-No &operator<<(std::ostream &, int);
-
-template<typename T>
-struct ConvertToIntImpl<T, true> {
-  // Convert to int only if T doesn't have an overloaded operator<<.
-  enum {
-    value = sizeof(convert(get<DummyStream>() << get<T>())) == sizeof(No)
-  };
-};
-}  // namespace internal
-
-// Formats a value.
-template <typename Char, typename ArgFormatter_, typename T>
-void format(BasicFormatter<Char, ArgFormatter_> &f,
-            const Char *&format_str, const T &value) {
-  internal::MemoryBuffer<Char, internal::INLINE_BUFFER_SIZE> buffer;
-
-  internal::FormatBuf<Char> format_buf(buffer);
-  std::basic_ostream<Char> output(&format_buf);
-  output << value;
-
-  BasicStringRef<Char> str(&buffer[0], format_buf.size());
-  typedef internal::MakeArg< BasicFormatter<Char> > MakeArg;
-  format_str = f.format(format_str, MakeArg(str));
-}
-
-/**
-  \rst
-  Prints formatted data to the stream *os*.
-
-  **Example**::
-
-    print(cerr, "Don't {}!", "panic");
-  \endrst
- */
-FMT_API void print(std::ostream &os, CStringRef format_str, ArgList args);
-FMT_VARIADIC(void, print, std::ostream &, CStringRef)
-
-/**
-  \rst
-  Prints formatted data to the stream *os*.
-
-  **Example**::
-
-    fprintf(cerr, "Don't %s!", "panic");
-  \endrst
- */
-FMT_API int fprintf(std::ostream &os, CStringRef format_str, ArgList args);
-FMT_VARIADIC(int, fprintf, std::ostream &, CStringRef)
-}  // namespace fmt
-
-#ifdef FMT_HEADER_ONLY
-# include "ostream.cc"
-#endif
-
-#endif  // FMT_OSTREAM_H_
diff --git a/diy/include/diy/grid.hpp b/diy/include/diy/grid.hpp
deleted file mode 100644
index cfdb72a65..000000000
--- a/diy/include/diy/grid.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-#ifndef DIY_GRID_HPP
-#define DIY_GRID_HPP
-
-#include "point.hpp"
-
-namespace diy
-{
-
-template<class C, unsigned D>
-struct Grid;
-
-template<class C, unsigned D>
-struct GridRef
-{
-    public:
-        typedef     C                                           Value;
-
-        typedef     Point<int, D>                               Vertex;
-        typedef     size_t                                      Index;
-
-    public:
-        template<class Int>
-                GridRef(C* data, const Point<Int,D>& shape, bool c_order = true):
-                    data_(data), shape_(shape), c_order_(c_order)   { set_stride(); }
-
-                GridRef(Grid<C,D>& g):
-                    data_(g.data()), shape_(g.shape()),
-                    c_order_(g.c_order())                       { set_stride(); }
-
-        template<class Int>
-        C       operator()(const Point<Int, D>& v) const        { return (*this)(index(v)); }
-
-        template<class Int>
-        C&      operator()(const Point<Int, D>& v)              { return (*this)(index(v)); }
-
-        C       operator()(Index i) const                       { return data_[i]; }
-        C&      operator()(Index i)                             { return data_[i]; }
-
-        const Vertex&
-                shape() const                                   { return shape_; }
-
-        const C*
-                data() const                                    { return data_; }
-        C*      data()                                          { return data_; }
-
-        // Set every element to the given value
-        GridRef&    operator=(C value)                          { Index s = size(); for (Index i = 0; i < s; ++i) data_[i] = value; return *this; }
-        GridRef&    operator/=(C value)                         { Index s = size(); for (Index i = 0; i < s; ++i) data_[i] /= value; return *this; }
-
-        Vertex      vertex(Index idx) const                     { Vertex v; for (unsigned i = 0; i < D; ++i) { v[i] = idx / stride_[i]; idx %= stride_[i]; } return v; }
-        Index       index(const Vertex& v) const                { Index idx = 0; for (unsigned i = 0; i < D; ++i) { idx += ((Index) v[i]) * ((Index) stride_[i]); } return idx; }
-
-        Index       size() const                                { return size(shape()); }
-        void        swap(GridRef& other)                        { std::swap(data_, other.data_); std::swap(shape_, other.shape_); std::swap(stride_, other.stride_); std::swap(c_order_, other.c_order_); }
-
-        bool        c_order() const                             { return c_order_; }
-
-        static constexpr
-        unsigned    dimension()                                 { return D; }
-
-    protected:
-        static Index
-                size(const Vertex& v)                           { Index res = 1; for (unsigned i = 0; i < D; ++i) res *= v[i]; return res; }
-
-        void    set_stride()
-        {
-            Index cur = 1;
-            if (c_order_)
-                for (unsigned i = D; i > 0; --i) { stride_[i-1] = cur; cur *= shape_[i-1]; }
-            else
-                for (unsigned i = 0; i < D; ++i) { stride_[i] = cur; cur *= shape_[i]; }
-
-        }
-        void    set_shape(const Vertex& v)                      { shape_ = v; set_stride(); }
-        void    set_data(C* data)                               { data_ = data; }
-        void    set_c_order(bool order)                         { c_order_ = order; }
-
-    private:
-        C*      data_;
-        Vertex  shape_;
-        Vertex  stride_;
-        bool    c_order_;
-};
-
-
-template<class C, unsigned D>
-struct Grid: public GridRef<C,D>
-{
-    public:
-        typedef     GridRef<C,D>                                Parent;
-        typedef     typename Parent::Value                      Value;
-        typedef     typename Parent::Index                      Index;
-        typedef     typename Parent::Vertex                     Vertex;
-        typedef     Parent                                      Reference;
-
-        template<class U>
-        struct rebind { typedef Grid<U,D>                       type; };
-
-    public:
-                Grid():
-                    Parent(new C[0], Vertex::zero())            {}
-        template<class Int>
-                Grid(const Point<Int, D>& shape, bool c_order = true):
-                    Parent(new C[size(shape)], shape, c_order)
-                {}
-
-                Grid(Grid&& g): Grid()                          { Parent::swap(g); }
-
-                Grid(const Parent& g):
-                    Parent(new C[size(g.shape())], g.shape(),
-                           g.c_order())                         { copy_data(g.data()); }
-
-        template<class OtherGrid>
-                Grid(const OtherGrid& g):
-                    Parent(new C[size(g.shape())],
-                           g.shape(),
-                           g.c_order())                         { copy_data(g.data()); }
-
-                ~Grid()                                         { delete[] Parent::data(); }
-
-        template<class OC>
-        Grid&   operator=(const GridRef<OC, D>& other)
-        {
-            delete[] Parent::data();
-            Parent::set_c_order(other.c_order());       // NB: order needs to be set before the shape, to set the stride correctly
-            Parent::set_shape(other.shape());
-            Index s = size(shape());
-            Parent::set_data(new C[s]);
-            copy_data(other.data());
-            return *this;
-        }
-
-        Grid&   operator=(Grid&& g)                             { Parent::swap(g); return *this; }
-
-        using Parent::data;
-        using Parent::shape;
-        using Parent::operator();
-        using Parent::operator=;
-        using Parent::size;
-
-    private:
-        template<class OC>
-        void    copy_data(const OC* data)
-        {
-            Index s = size(shape());
-            for (Index i = 0; i < s; ++i)
-                Parent::data()[i] = data[i];
-        }
-};
-
-}
-
-#endif
diff --git a/diy/include/diy/io/block.hpp b/diy/include/diy/io/block.hpp
deleted file mode 100644
index 05e45a800..000000000
--- a/diy/include/diy/io/block.hpp
+++ /dev/null
@@ -1,396 +0,0 @@
-#ifndef DIY_IO_BLOCK_HPP
-#define DIY_IO_BLOCK_HPP
-
-#include <string>
-#include <algorithm>
-#include <stdexcept>
-
-#include <unistd.h>
-#include <sys/stat.h>
-#include <dirent.h>
-
-#include "../mpi.hpp"
-#include "../assigner.hpp"
-#include "../master.hpp"
-#include "../storage.hpp"
-#include "../log.hpp"
-
-// Read and write collections of blocks using MPI-IO
-namespace diy
-{
-namespace io
-{
-  namespace detail
-  {
-    typedef mpi::io::offset                 offset_t;
-
-    struct GidOffsetCount
-    {
-                    GidOffsetCount():                                   // need to initialize a vector of given size
-                        gid(-1), offset(0), count(0)                    {}
-
-                    GidOffsetCount(int gid_, offset_t offset_, offset_t count_):
-                        gid(gid_), offset(offset_), count(count_)       {}
-
-        bool        operator<(const GidOffsetCount& other) const        { return gid < other.gid; }
-
-        int         gid;
-        offset_t    offset;
-        offset_t    count;
-    };
-  }
-}
-
-// Serialize GidOffsetCount explicitly, to avoid alignment and unitialized data issues
-// (to get identical output files given the same block input)
-template<>
-struct Serialization<io::detail::GidOffsetCount>
-{
-    typedef             io::detail::GidOffsetCount                  GidOffsetCount;
-
-    static void         save(BinaryBuffer& bb, const GidOffsetCount& x)
-    {
-      diy::save(bb, x.gid);
-      diy::save(bb, x.offset);
-      diy::save(bb, x.count);
-    }
-
-    static void         load(BinaryBuffer& bb, GidOffsetCount& x)
-    {
-      diy::load(bb, x.gid);
-      diy::load(bb, x.offset);
-      diy::load(bb, x.count);
-    }
-};
-
-namespace io
-{
-/**
- * \ingroup IO
- * \brief Write blocks to storage collectively in one shared file
- */
-  inline
-  void
-  write_blocks(const std::string&           outfilename,           //!< output file name
-               const mpi::communicator&     comm,                  //!< communicator
-               Master&                      master,                //!< master object
-               const MemoryBuffer&          extra = MemoryBuffer(),//!< user-defined metadata for file header; meaningful only on rank == 0
-               Master::SaveBlock            save = 0)              //!< block save function in case different than or undefined in the master
-  {
-    if (!save) save = master.saver();       // save is likely to be different from master.save()
-
-    typedef detail::offset_t                offset_t;
-    typedef detail::GidOffsetCount          GidOffsetCount;
-
-    unsigned size = master.size(),
-             max_size, min_size;
-    mpi::all_reduce(comm, size, max_size, mpi::maximum<unsigned>());
-    mpi::all_reduce(comm, size, min_size, mpi::minimum<unsigned>());
-
-    // truncate the file
-    if (comm.rank() == 0)
-        truncate(outfilename.c_str(), 0);
-
-    mpi::io::file f(comm, outfilename, mpi::io::file::wronly | mpi::io::file::create);
-
-    offset_t  start = 0, shift;
-    std::vector<GidOffsetCount>     offset_counts;
-    unsigned i;
-    for (i = 0; i < max_size; ++i)
-    {
-      offset_t count = 0,
-               offset;
-      if (i < size)
-      {
-        // get the block from master and serialize it
-        const void* block = master.get(i);
-        MemoryBuffer bb;
-        LinkFactory::save(bb, master.link(i));
-        save(block, bb);
-        count = bb.buffer.size();
-        mpi::scan(comm, count, offset, std::plus<offset_t>());
-        offset += start - count;
-        mpi::all_reduce(comm, count, shift, std::plus<offset_t>());
-        start += shift;
-
-        if (i < min_size)       // up to min_size, we can do collective IO
-          f.write_at_all(offset, bb.buffer);
-        else
-          f.write_at(offset, bb.buffer);
-
-        offset_counts.push_back(GidOffsetCount(master.gid(i), offset, count));
-      } else
-      {
-        // matching global operations
-        mpi::scan(comm, count, offset, std::plus<offset_t>());
-        mpi::all_reduce(comm, count, shift, std::plus<offset_t>());
-
-        // -1 indicates that there is no block written here from this rank
-        offset_counts.push_back(GidOffsetCount(-1, offset, count));
-      }
-    }
-
-    if (comm.rank() == 0)
-    {
-      // round-about way of gather vector of vectors of GidOffsetCount to avoid registering a new mpi datatype
-      std::vector< std::vector<char> > gathered_offset_count_buffers;
-      MemoryBuffer oc_buffer; diy::save(oc_buffer, offset_counts);
-      mpi::gather(comm, oc_buffer.buffer, gathered_offset_count_buffers, 0);
-
-      std::vector<GidOffsetCount>  all_offset_counts;
-      for (unsigned i = 0; i < gathered_offset_count_buffers.size(); ++i)
-      {
-        MemoryBuffer oc_buffer; oc_buffer.buffer.swap(gathered_offset_count_buffers[i]);
-        std::vector<GidOffsetCount> offset_counts;
-        diy::load(oc_buffer, offset_counts);
-        for (unsigned j = 0; j < offset_counts.size(); ++j)
-          if (offset_counts[j].gid != -1)
-            all_offset_counts.push_back(offset_counts[j]);
-      }
-      std::sort(all_offset_counts.begin(), all_offset_counts.end());        // sorts by gid
-
-      MemoryBuffer bb;
-      diy::save(bb, all_offset_counts);
-      diy::save(bb, extra);
-      size_t footer_size = bb.size();
-      diy::save(bb, footer_size);
-
-      // find footer_offset as the max of (offset + count)
-      offset_t footer_offset = 0;
-      for (unsigned i = 0; i < all_offset_counts.size(); ++i)
-      {
-        offset_t end = all_offset_counts[i].offset + all_offset_counts[i].count;
-        if (end > footer_offset)
-            footer_offset = end;
-      }
-      f.write_at(footer_offset, bb.buffer);
-    } else
-    {
-      MemoryBuffer oc_buffer; diy::save(oc_buffer, offset_counts);
-      mpi::gather(comm, oc_buffer.buffer, 0);
-    }
-  }
-
-/**
- * \ingroup IO
- * \brief Read blocks from storage collectively from one shared file
- */
-    inline
-    void
-    read_blocks(const std::string&           infilename,     //!< input file name
-                const mpi::communicator&     comm,           //!< communicator
-                Assigner&                    assigner,       //!< assigner object
-                Master&                      master,         //!< master object
-                MemoryBuffer&                extra,          //!< user-defined metadata in file header
-                Master::LoadBlock            load = 0)       //!< load block function in case different than or unefined in the master
-    {
-        if (!load) load = master.loader();      // load is likely to be different from master.load()
-
-        typedef detail::offset_t                offset_t;
-        typedef detail::GidOffsetCount          GidOffsetCount;
-
-        mpi::io::file f(comm, infilename, mpi::io::file::rdonly);
-
-        offset_t    footer_offset = f.size() - sizeof(size_t);
-        size_t footer_size;
-
-        // Read the size
-        f.read_at_all(footer_offset, (char*) &footer_size, sizeof(footer_size));
-
-        // Read all_offset_counts
-        footer_offset -= footer_size;
-        MemoryBuffer footer;
-        footer.buffer.resize(footer_size);
-        f.read_at_all(footer_offset, footer.buffer);
-
-        std::vector<GidOffsetCount>  all_offset_counts;
-        diy::load(footer, all_offset_counts);
-        diy::load(footer, extra);
-        extra.reset();
-
-        // Get local gids from assigner
-        size_t size = all_offset_counts.size();
-        assigner.set_nblocks(size);
-        std::vector<int> gids;
-        assigner.local_gids(comm.rank(), gids);
-
-        for (unsigned i = 0; i < gids.size(); ++i)
-        {
-            if (gids[i] != all_offset_counts[gids[i]].gid)
-                get_logger()->warn("gids don't match in diy::io::read_blocks(), {} vs {}",
-                                   gids[i], all_offset_counts[gids[i]].gid);
-
-            offset_t offset = all_offset_counts[gids[i]].offset,
-                     count  = all_offset_counts[gids[i]].count;
-            MemoryBuffer bb;
-            bb.buffer.resize(count);
-            f.read_at(offset, bb.buffer);
-            Link* l = LinkFactory::load(bb);
-            l->fix(assigner);
-            void* b = master.create();
-            load(b, bb);
-            master.add(gids[i], b, l);
-        }
-    }
-
-
-  // Functions without the extra buffer, for compatibility with the old code
-  inline
-  void
-  write_blocks(const std::string&           outfilename,
-               const mpi::communicator&     comm,
-               Master&                      master,
-               Master::SaveBlock            save)
-  {
-    MemoryBuffer extra;
-    write_blocks(outfilename, comm, master, extra, save);
-  }
-
-  inline
-  void
-  read_blocks(const std::string&           infilename,
-              const mpi::communicator&     comm,
-              Assigner&                    assigner,
-              Master&                      master,
-              Master::LoadBlock            load = 0)
-  {
-    MemoryBuffer extra;     // dummy
-    read_blocks(infilename, comm, assigner, master, extra, load);
-  }
-
-namespace split
-{
-/**
- * \ingroup IO
- * \brief Write blocks to storage independently in one file per process
- */
-  inline
-  void
-  write_blocks(const std::string&           outfilename,           //!< output file name
-               const mpi::communicator&     comm,                  //!< communicator
-               Master&                      master,                //!< master object
-               const MemoryBuffer&          extra = MemoryBuffer(),//!< user-defined metadata for file header; meaningful only on rank == 0
-               Master::SaveBlock            save = 0)              //!< block save function in case different than or undefined in master
-  {
-    if (!save) save = master.saver();       // save is likely to be different from master.save()
-
-    bool proceed = false;
-    size_t size = 0;
-    if (comm.rank() == 0)
-    {
-        struct stat s;
-        if (stat(outfilename.c_str(), &s) == 0)
-        {
-            if (S_ISDIR(s.st_mode))
-                proceed = true;
-        } else if (mkdir(outfilename.c_str(), 0755) == 0)
-            proceed = true;
-        mpi::broadcast(comm, proceed, 0);
-        mpi::reduce(comm, (size_t) master.size(), size, 0, std::plus<size_t>());
-    } else
-    {
-        mpi::broadcast(comm, proceed, 0);
-        mpi::reduce(comm, (size_t) master.size(), 0, std::plus<size_t>());
-    }
-
-    if (!proceed)
-        throw std::runtime_error("Cannot access or create directory: " + outfilename);
-
-    for (int i = 0; i < (int)master.size(); ++i)
-    {
-        const void* block = master.get(i);
-
-        std::string filename = fmt::format("{}/{}", outfilename, master.gid(i));
-
-        ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "w"));
-
-        LinkFactory::save(bb, master.link(i));
-        save(block, bb);
-
-        fclose(bb.file);
-    }
-
-    if (comm.rank() == 0)
-    {
-        // save the extra buffer
-        std::string filename = outfilename + "/extra";
-        ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "w"));
-        ::diy::save(bb, size);
-        ::diy::save(bb, extra);
-        fclose(bb.file);
-    }
-  }
-
-/**
- * \ingroup IO
- * \brief Read blocks from storage independently from one file per process
- */
-  inline
-  void
-  read_blocks(const std::string&           infilename,  //!< input file name
-              const mpi::communicator&     comm,        //!< communicator
-              Assigner&                    assigner,    //!< assigner object
-              Master&                      master,      //!< master object
-              MemoryBuffer&                extra,       //!< user-defined metadata in file header
-              Master::LoadBlock            load = 0)    //!< block load function in case different than or undefined in master
-  {
-    if (!load) load = master.loader();      // load is likely to be different from master.load()
-
-    // load the extra buffer and size
-    size_t          size;
-    std::string filename = infilename + "/extra";
-    ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "r"));
-    ::diy::load(bb, size);
-    ::diy::load(bb, extra);
-    extra.reset();
-    fclose(bb.file);
-
-    // Get local gids from assigner
-    assigner.set_nblocks(size);
-    std::vector<int> gids;
-    assigner.local_gids(comm.rank(), gids);
-
-    // Read our blocks;
-    for (unsigned i = 0; i < gids.size(); ++i)
-    {
-        std::string filename = fmt::format("{}/{}", infilename, gids[i]);
-
-        ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "r"));
-        Link* l = LinkFactory::load(bb);
-        l->fix(assigner);
-        void* b = master.create();
-        load(b, bb);
-        master.add(gids[i], b, l);
-
-        fclose(bb.file);
-    }
-  }
-
-  // Functions without the extra buffer, for compatibility with the old code
-  inline
-  void
-  write_blocks(const std::string&           outfilename,
-               const mpi::communicator&     comm,
-               Master&                      master,
-               Master::SaveBlock            save)
-  {
-    MemoryBuffer extra;
-    write_blocks(outfilename, comm, master, extra, save);
-  }
-
-  inline
-  void
-  read_blocks(const std::string&           infilename,
-              const mpi::communicator&     comm,
-              Assigner&                    assigner,
-              Master&                      master,
-              Master::LoadBlock            load = 0)
-  {
-    MemoryBuffer extra;     // dummy
-    read_blocks(infilename, comm, assigner, master, extra, load);
-  }
-} // split
-} // io
-} // diy
-
-#endif
diff --git a/diy/include/diy/io/bov.hpp b/diy/include/diy/io/bov.hpp
deleted file mode 100644
index bd8b24009..000000000
--- a/diy/include/diy/io/bov.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-#ifndef DIY_IO_BOV_HPP
-#define DIY_IO_BOV_HPP
-
-#include <vector>
-#include <algorithm>
-#include <numeric>
-
-#include "../types.hpp"
-#include "../mpi.hpp"
-
-namespace diy
-{
-namespace io
-{
-  // Reads and writes subsets of a block of values into specified block bounds
-  class BOV
-  {
-    public:
-      typedef       std::vector<int>                                    Shape;
-    public:
-                    BOV(mpi::io::file&    f):
-                      f_(f), offset_(0)                                 {}
-
-      template<class S>
-                    BOV(mpi::io::file&    f,
-                        const S&          shape  = S(),
-                        mpi::io::offset   offset = 0):
-                      f_(f), offset_(offset)                            { set_shape(shape); }
-
-      void          set_offset(mpi::io::offset offset)                  { offset_ = offset; }
-
-      template<class S>
-      void          set_shape(const S& shape)
-      {
-        shape_.clear();
-        stride_.clear();
-        for (unsigned i = 0; i < shape.size(); ++i)
-        {
-            shape_.push_back(shape[i]);
-            stride_.push_back(1);
-        }
-        for (int i = shape_.size() - 2; i >=  0; --i)
-          stride_[i] = stride_[i+1] * shape_[i+1];
-      }
-
-      const Shape&  shape() const                                       { return shape_; }
-
-      template<class T>
-      void          read(const DiscreteBounds& bounds, T* buffer, bool collective = false, int chunk = 1) const;
-
-      template<class T>
-      void          write(const DiscreteBounds& bounds, const T* buffer, bool collective = false, int chunk = 1);
-
-      template<class T>
-      void          write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, bool collective = false, int chunk = 1);
-
-    protected:
-      mpi::io::file&        file()                                        { return f_; }
-
-    private:
-      mpi::io::file&        f_;
-      Shape                 shape_;
-      std::vector<size_t>   stride_;
-      size_t                offset_;
-  };
-}
-}
-
-template<class T>
-void
-diy::io::BOV::
-read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const
-{
-  int dim   = shape_.size();
-  int total = 1;
-  std::vector<int> subsizes;
-  for (int i = 0; i < dim; ++i)
-  {
-    subsizes.push_back(bounds.max[i] - bounds.min[i] + 1);
-    total *= subsizes.back();
-  }
-
-  MPI_Datatype T_type;
-  if (chunk == 1)
-    T_type = mpi::detail::get_mpi_datatype<T>();
-  else
-  {
-    // create an MPI struct of size chunk to read the data in those chunks
-    // (this allows to work around MPI-IO weirdness where crucial quantities
-    // are ints, which are too narrow of a type)
-    int             array_of_blocklengths[]  = { chunk };
-    MPI_Aint        array_of_displacements[] = { 0 };
-    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
-    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
-    MPI_Type_commit(&T_type);
-  }
-
-  MPI_Datatype fileblk;
-  MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
-  MPI_Type_commit(&fileblk);
-
-  MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
-
-  mpi::status s;
-  if (!collective)
-      MPI_File_read(f_.handle(), buffer, total, T_type, &s.s);
-  else
-      MPI_File_read_all(f_.handle(), buffer, total, T_type, &s.s);
-
-  if (chunk != 1)
-    MPI_Type_free(&T_type);
-  MPI_Type_free(&fileblk);
-}
-
-template<class T>
-void
-diy::io::BOV::
-write(const DiscreteBounds& bounds, const T* buffer, bool collective, int chunk)
-{
-    write(bounds, buffer, bounds, collective, chunk);
-}
-
-template<class T>
-void
-diy::io::BOV::
-write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, bool collective, int chunk)
-{
-  int dim   = shape_.size();
-  std::vector<int> subsizes;
-  std::vector<int> buffer_shape, buffer_start;
-  for (int i = 0; i < dim; ++i)
-  {
-    buffer_shape.push_back(bounds.max[i] - bounds.min[i] + 1);
-    buffer_start.push_back(core.min[i] - bounds.min[i]);
-    subsizes.push_back(core.max[i] - core.min[i] + 1);
-  }
-
-  MPI_Datatype T_type;
-  if (chunk == 1)
-    T_type = mpi::detail::get_mpi_datatype<T>();
-  else
-  {
-    // assume T is a binary block and create an MPI struct of appropriate size
-    int             array_of_blocklengths[]  = { chunk };
-    MPI_Aint        array_of_displacements[] = { 0 };
-    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
-    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
-    MPI_Type_commit(&T_type);
-  }
-
-  MPI_Datatype fileblk, subbuffer;
-  MPI_Type_create_subarray(dim, (int*) &shape_[0],       &subsizes[0], (int*) &bounds.min[0],   MPI_ORDER_C, T_type, &fileblk);
-  MPI_Type_create_subarray(dim, (int*) &buffer_shape[0], &subsizes[0], (int*) &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer);
-  MPI_Type_commit(&fileblk);
-  MPI_Type_commit(&subbuffer);
-
-  MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
-
-  mpi::status s;
-  if (!collective)
-    MPI_File_write(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
-  else
-    MPI_File_write_all(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
-
-  if (chunk != 1)
-    MPI_Type_free(&T_type);
-  MPI_Type_free(&fileblk);
-  MPI_Type_free(&subbuffer);
-}
-
-#endif
diff --git a/diy/include/diy/io/numpy.hpp b/diy/include/diy/io/numpy.hpp
deleted file mode 100644
index 0199a0c38..000000000
--- a/diy/include/diy/io/numpy.hpp
+++ /dev/null
@@ -1,213 +0,0 @@
-#ifndef DIY_IO_NMPY_HPP
-#define DIY_IO_NMPY_HPP
-
-#include <sstream>
-#include <complex>
-#include <stdexcept>
-
-#include "../serialization.hpp"
-#include "bov.hpp"
-
-namespace diy
-{
-namespace io
-{
-  class NumPy: public BOV
-  {
-    public:
-                        NumPy(mpi::io::file& f):
-                          BOV(f)                                {}
-
-      unsigned          word_size() const                       { return word_size_; }
-
-      unsigned          read_header()
-      {
-        BOV::Shape  shape;
-        bool        fortran;
-        size_t      offset = parse_npy_header(shape, fortran);
-        if (fortran)
-            throw std::runtime_error("diy::io::NumPy cannot read data in fortran order");
-        BOV::set_offset(offset);
-        BOV::set_shape(shape);
-        return word_size_;
-      }
-
-      template<class T>
-      void              write_header(int dim, const DiscreteBounds& bounds);
-
-      template<class T, class S>
-      void              write_header(const S& shape);
-
-    private:
-      inline size_t     parse_npy_header(BOV::Shape& shape, bool& fortran_order);
-      void              save(diy::BinaryBuffer& bb, const std::string& s)               { bb.save_binary(s.c_str(), s.size()); }
-      template<class T>
-      inline void       convert_and_save(diy::BinaryBuffer& bb, const T& x)
-      {
-          std::ostringstream oss;
-          oss << x;
-          save(bb, oss.str());
-      }
-
-    private:
-      unsigned          word_size_;
-  };
-
-  namespace detail
-  {
-    inline char big_endian();
-    template<class T>
-    char map_numpy_type();
-  }
-}
-}
-
-// Modified from: https://github.com/rogersce/cnpy
-// Copyright (C) 2011  Carl Rogers
-// Released under MIT License
-// license available at http://www.opensource.org/licenses/mit-license.php
-size_t
-diy::io::NumPy::
-parse_npy_header(BOV::Shape& shape, bool& fortran_order)
-{
-    char buffer[256];
-    file().read_at_all(0, buffer, 256);
-    std::string header(buffer, buffer + 256);
-    size_t nl = header.find('\n');
-    if (nl == std::string::npos)
-        throw std::runtime_error("parse_npy_header: failed to read the header");
-    header = header.substr(11, nl - 11 + 1);
-    size_t header_size = nl + 1;
-
-    int loc1, loc2;
-
-    //fortran order
-    loc1 = header.find("fortran_order")+16;
-    fortran_order = (header.substr(loc1,4) == "True" ? true : false);
-
-    //shape
-    unsigned ndims;
-    loc1 = header.find("(");
-    loc2 = header.find(")");
-    std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
-    if(str_shape[str_shape.size()-1] == ',') ndims = 1;
-    else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
-    shape.resize(ndims);
-    for(unsigned int i = 0;i < ndims;i++) {
-        loc1 = str_shape.find(",");
-        shape[i] = atoi(str_shape.substr(0,loc1).c_str());
-        str_shape = str_shape.substr(loc1+1);
-    }
-
-    //endian, word size, data type
-    //byte order code | stands for not applicable.
-    //not sure when this applies except for byte array
-    loc1 = header.find("descr")+9;
-    //bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
-    //assert(littleEndian);
-
-    //char type = header[loc1+1];
-    //assert(type == map_type(T));
-
-    std::string str_ws = header.substr(loc1+2);
-    loc2 = str_ws.find("'");
-    word_size_ = atoi(str_ws.substr(0,loc2).c_str());
-
-    return header_size;
-}
-
-template<class T>
-void
-diy::io::NumPy::
-write_header(int dim, const DiscreteBounds& bounds)
-{
-    std::vector<int> shape;
-    for (int i = 0; i < dim; ++i)
-        shape.push_back(bounds.max[i] - bounds.min[i] + 1);
-
-    write_header< T, std::vector<int> >(shape);
-}
-
-
-template<class T, class S>
-void
-diy::io::NumPy::
-write_header(const S& shape)
-{
-    BOV::set_shape(shape);
-
-    diy::MemoryBuffer dict;
-    save(dict, "{'descr': '");
-    diy::save(dict, detail::big_endian());
-    diy::save(dict, detail::map_numpy_type<T>());
-    convert_and_save(dict, sizeof(T));
-    save(dict, "', 'fortran_order': False, 'shape': (");
-    convert_and_save(dict, shape[0]);
-    for (int i = 1; i < (int) shape.size(); i++)
-    {
-        save(dict, ", ");
-        convert_and_save(dict, shape[i]);
-    }
-    if(shape.size() == 1) save(dict, ",");
-    save(dict, "), }");
-    //pad with spaces so that preamble+dict is modulo 16 bytes. preamble is 10 bytes. dict needs to end with \n
-    int remainder = 16 - (10 + dict.position) % 16;
-    for (int i = 0; i < remainder - 1; ++i)
-        diy::save(dict, ' ');
-    diy::save(dict, '\n');
-
-    diy::MemoryBuffer header;
-    diy::save(header, (char) 0x93);
-    save(header, "NUMPY");
-    diy::save(header, (char) 0x01);  // major version of numpy format
-    diy::save(header, (char) 0x00);  // minor version of numpy format
-    diy::save(header, (unsigned short) dict.position);
-    header.save_binary(&dict.buffer[0], dict.buffer.size());
-
-    BOV::set_offset(header.position);
-
-    if (file().comm().rank() == 0)
-        file().write_at(0, &header.buffer[0], header.buffer.size());
-}
-
-char
-diy::io::detail::big_endian()
-{
-  unsigned char x[] = {1,0};
-  void* x_void = x;
-  short y = *static_cast<short*>(x_void);
-  return y == 1 ? '<' : '>';
-}
-
-namespace diy
-{
-namespace io
-{
-namespace detail
-{
-template<> inline char map_numpy_type<float>()                         { return 'f'; }
-template<> inline char map_numpy_type<double>()                        { return 'f'; }
-template<> inline char map_numpy_type<long double>()                   { return 'f'; }
-
-template<> inline char map_numpy_type<int>()                           { return 'i'; }
-template<> inline char map_numpy_type<char>()                          { return 'i'; }
-template<> inline char map_numpy_type<short>()                         { return 'i'; }
-template<> inline char map_numpy_type<long>()                          { return 'i'; }
-template<> inline char map_numpy_type<long long>()                     { return 'i'; }
-
-template<> inline char map_numpy_type<unsigned int>()                  { return 'u'; }
-template<> inline char map_numpy_type<unsigned char>()                 { return 'u'; }
-template<> inline char map_numpy_type<unsigned short>()                { return 'u'; }
-template<> inline char map_numpy_type<unsigned long>()                 { return 'u'; }
-template<> inline char map_numpy_type<unsigned long long>()            { return 'u'; }
-
-template<> inline char map_numpy_type<bool>()                          { return 'b'; }
-
-template<> inline char map_numpy_type< std::complex<float> >()         { return 'c'; }
-template<> inline char map_numpy_type< std::complex<double> >()        { return 'c'; }
-template<> inline char map_numpy_type< std::complex<long double> >()   { return 'c'; }
-}
-}
-}
-
-#endif
diff --git a/diy/include/diy/link.hpp b/diy/include/diy/link.hpp
deleted file mode 100644
index 3262eef61..000000000
--- a/diy/include/diy/link.hpp
+++ /dev/null
@@ -1,219 +0,0 @@
-#ifndef DIY_COVER_HPP
-#define DIY_COVER_HPP
-
-#include <vector>
-#include <map>
-#include <algorithm>
-
-#include "types.hpp"
-#include "serialization.hpp"
-#include "assigner.hpp"
-
-namespace diy
-{
-  // Local view of a distributed representation of a cover, a completely unstructured link
-  class Link
-  {
-    public:
-      virtual   ~Link()                             {}  // need to be able to delete derived classes
-
-      int       size() const                        { return neighbors_.size(); }
-      inline
-      int       size_unique() const;
-      BlockID   target(int i) const                 { return neighbors_[i]; }
-      BlockID&  target(int i)                       { return neighbors_[i]; }
-      inline
-      int       find(int gid) const;
-
-      void      add_neighbor(const BlockID& block)  { neighbors_.push_back(block); }
-
-      void      fix(const Assigner& assigner)       { for (unsigned i = 0; i < neighbors_.size(); ++i) { neighbors_[i].proc = assigner.rank(neighbors_[i].gid); } }
-
-      void      swap(Link& other)                   { neighbors_.swap(other.neighbors_); }
-
-      virtual void  save(BinaryBuffer& bb) const    { diy::save(bb, neighbors_); }
-      virtual void  load(BinaryBuffer& bb)          { diy::load(bb, neighbors_); }
-
-      virtual size_t id() const                     { return 0; }
-
-    private:
-      std::vector<BlockID>  neighbors_;
-  };
-
-  template<class Bounds_>
-  class RegularLink;
-
-  typedef       RegularLink<DiscreteBounds>         RegularGridLink;
-  typedef       RegularLink<ContinuousBounds>       RegularContinuousLink;
-
-  // Selector between regular discrete and contious links given bounds type
-  template<class Bounds_>
-  struct RegularLinkSelector;
-
-  template<>
-  struct RegularLinkSelector<DiscreteBounds>
-  {
-    typedef     RegularGridLink         type;
-    static const size_t id = 1;
-  };
-
-  template<>
-  struct RegularLinkSelector<ContinuousBounds>
-  {
-    typedef     RegularContinuousLink   type;
-    static const size_t id = 2;
-  };
-
-
-  // for a regular decomposition, it makes sense to address the neighbors by direction
-  // and store local and neighbor bounds
-  template<class Bounds_>
-  class RegularLink: public Link
-  {
-    public:
-      typedef   Bounds_                             Bounds;
-
-      typedef   std::map<Direction, int>            DirMap;
-      typedef   std::vector<Direction>              DirVec;
-
-    public:
-                RegularLink(int dim, const Bounds& core, const Bounds& bounds):
-                  dim_(dim), core_(core), bounds_(bounds)            {}
-
-      // dimension
-      int       dimension() const                       { return dim_; }
-
-      // direction
-      int       direction(Direction dir) const;         // convert direction to a neighbor (-1 if no neighbor)
-      Direction direction(int i) const                  { return dir_vec_[i]; }
-      void      add_direction(Direction dir)            { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); }
-
-      // wrap
-      void       add_wrap(Direction dir)                { wrap_.push_back(dir); }
-      Direction  wrap(int i) const                      { return wrap_[i]; }
-      Direction& wrap(int i)                            { return wrap_[i]; }
-
-      // bounds
-      const Bounds& core() const                        { return core_; }
-      Bounds&       core()                              { return core_; }
-      const Bounds& bounds() const                      { return bounds_; }
-      Bounds&       bounds()                            { return bounds_; }
-      const Bounds& bounds(int i) const                 { return nbr_bounds_[i]; }
-      void          add_bounds(const Bounds& bounds)    { nbr_bounds_.push_back(bounds); }
-
-      void      swap(RegularLink& other)                { Link::swap(other); dir_map_.swap(other.dir_map_); dir_vec_.swap(other.dir_vec_); nbr_bounds_.swap(other.nbr_bounds_); std::swap(dim_, other.dim_); wrap_.swap(other.wrap_); std::swap(core_, other.core_); std::swap(bounds_, other.bounds_); }
-
-      void      save(BinaryBuffer& bb) const
-      {
-          Link::save(bb);
-          diy::save(bb, dim_);
-          diy::save(bb, dir_map_);
-          diy::save(bb, dir_vec_);
-          diy::save(bb, core_);
-          diy::save(bb, bounds_);
-          diy::save(bb, nbr_bounds_);
-          diy::save(bb, wrap_);
-      }
-
-      void      load(BinaryBuffer& bb)
-      {
-          Link::load(bb);
-          diy::load(bb, dim_);
-          diy::load(bb, dir_map_);
-          diy::load(bb, dir_vec_);
-          diy::load(bb, core_);
-          diy::load(bb, bounds_);
-          diy::load(bb, nbr_bounds_);
-          diy::load(bb, wrap_);
-      }
-
-      virtual size_t id() const                         { return RegularLinkSelector<Bounds>::id; }
-
-    private:
-      int       dim_;
-
-      DirMap    dir_map_;
-      DirVec    dir_vec_;
-
-      Bounds                    core_;
-      Bounds                    bounds_;
-      std::vector<Bounds>       nbr_bounds_;
-      std::vector<Direction>    wrap_;
-  };
-
-  // Other cover candidates: KDTreeLink, AMRGridLink
-
-  struct LinkFactory
-  {
-    public:
-      static Link*          create(size_t id)
-      {
-          // not pretty, but will do for now
-          if (id == 0)
-            return new Link;
-          else if (id == 1)
-            return new RegularGridLink(0, DiscreteBounds(), DiscreteBounds());
-          else if (id == 2)
-            return new RegularContinuousLink(0, ContinuousBounds(), ContinuousBounds());
-          else
-            return 0;
-      }
-
-      inline static void    save(BinaryBuffer& bb, const Link* l);
-      inline static Link*   load(BinaryBuffer& bb);
-  };
-}
-
-
-void
-diy::LinkFactory::
-save(BinaryBuffer& bb, const Link* l)
-{
-    diy::save(bb, l->id());
-    l->save(bb);
-}
-
-diy::Link*
-diy::LinkFactory::
-load(BinaryBuffer& bb)
-{
-    size_t id;
-    diy::load(bb, id);
-    Link* l = create(id);
-    l->load(bb);
-    return l;
-}
-
-int
-diy::Link::
-find(int gid) const
-{
-    for (unsigned i = 0; i < (unsigned)size(); ++i)
-  {
-    if (target(i).gid == gid)
-      return i;
-  }
-  return -1;
-}
-int
-diy::Link::
-size_unique() const
-{
-    std::vector<BlockID> tmp(neighbors_.begin(), neighbors_.end());
-    std::sort(tmp.begin(), tmp.end());
-    return std::unique(tmp.begin(), tmp.end()) - tmp.begin();
-}
-
-template<class Bounds>
-int
-diy::RegularLink<Bounds>::
-direction(Direction dir) const
-{
-  DirMap::const_iterator it = dir_map_.find(dir);
-  if (it == dir_map_.end())
-    return -1;
-  else
-    return it->second;
-}
-
-#endif
diff --git a/diy/include/diy/log.hpp b/diy/include/diy/log.hpp
deleted file mode 100644
index 45f202f92..000000000
--- a/diy/include/diy/log.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef DIY_LOG_HPP
-#define DIY_LOG_HPP
-
-#ifndef DIY_USE_SPDLOG
-
-#include <memory>
-#include "fmt/format.h"
-#include "fmt/ostream.h"
-
-namespace diy
-{
-
-namespace spd
-{
-    struct logger
-    {
-        // logger.info(cppformat_string, arg1, arg2, arg3, ...) call style
-        template <typename... Args> void trace(const char* fmt, const Args&... args)    {}
-        template <typename... Args> void debug(const char* fmt, const Args&... args)    {}
-        template <typename... Args> void info(const char* fmt, const Args&... args)     {}
-        template <typename... Args> void warn(const char* fmt, const Args&... args)     {}
-        template <typename... Args> void error(const char* fmt, const Args&... args)    {}
-        template <typename... Args> void critical(const char* fmt, const Args&... args) {}
-    };
-}
-
-inline
-std::shared_ptr<spd::logger>
-get_logger()
-{
-    return std::make_shared<spd::logger>();
-}
-
-inline
-std::shared_ptr<spd::logger>
-create_logger(std::string)
-{
-    return std::make_shared<spd::logger>();
-}
-
-template<class... Args>
-std::shared_ptr<spd::logger>
-set_logger(Args... args)
-{
-    return std::make_shared<spd::logger>();
-}
-
-}   // diy
-
-#else // DIY_USE_SPDLOG
-
-#include <string>
-
-#include <spdlog/spdlog.h>
-#include <spdlog/sinks/null_sink.h>
-
-#include <spdlog/fmt/bundled/format.h>
-#include <spdlog/fmt/bundled/ostream.h>
-
-namespace diy
-{
-
-namespace spd = ::spdlog;
-
-inline
-std::shared_ptr<spd::logger>
-get_logger()
-{
-    auto log = spd::get("diy");
-    if (!log)
-    {
-        auto null_sink = std::make_shared<spd::sinks::null_sink_mt> ();
-        log = std::make_shared<spd::logger>("null_logger", null_sink);
-    }
-    return log;
-}
-
-inline
-std::shared_ptr<spd::logger>
-create_logger(std::string log_level)
-{
-    auto log = spd::stderr_logger_mt("diy");
-    int lvl;
-    for (lvl = spd::level::trace; lvl < spd::level::off; ++lvl)
-        if (spd::level::level_names[lvl] == log_level)
-            break;
-    log->set_level(static_cast<spd::level::level_enum>(lvl));
-    return log;
-}
-
-template<class... Args>
-std::shared_ptr<spd::logger>
-set_logger(Args... args)
-{
-    auto log = std::make_shared<spdlog::logger>("diy", args...);
-    return log;
-}
-
-}   // diy
-#endif
-
-
-#endif // DIY_LOG_HPP
diff --git a/diy/include/diy/master.hpp b/diy/include/diy/master.hpp
deleted file mode 100644
index 97ccb8724..000000000
--- a/diy/include/diy/master.hpp
+++ /dev/null
@@ -1,1203 +0,0 @@
-#ifndef DIY_MASTER_HPP
-#define DIY_MASTER_HPP
-
-#include <vector>
-#include <map>
-#include <list>
-#include <deque>
-#include <algorithm>
-#include <functional>
-
-#include "link.hpp"
-#include "collection.hpp"
-
-// Communicator functionality
-#include "mpi.hpp"
-#include "serialization.hpp"
-#include "detail/collectives.hpp"
-#include "time.hpp"
-
-#include "thread.hpp"
-
-#include "detail/block_traits.hpp"
-
-#include "log.hpp"
-#include "stats.hpp"
-
-namespace diy
-{
-  // Stores and manages blocks; initiates serialization and communication when necessary.
-  //
-  // Provides a foreach function, which is meant as the main entry point.
-  //
-  // Provides a conversion between global and local block ids,
-  // which is hidden from blocks via a communicator proxy.
-  class Master
-  {
-    public:
-      struct ProcessBlock;
-
-      template<class Block>
-      struct Binder;
-
-      // Commands
-      struct BaseCommand;
-
-      template<class Block>
-      struct Command;
-
-      typedef std::vector<BaseCommand*>     Commands;
-
-      // Skip
-      using Skip = std::function<bool(int, const Master&)>;
-
-      struct SkipNoIncoming;
-      struct NeverSkip { bool    operator()(int i, const Master& master) const   { return false; } };
-
-      // Collection
-      typedef Collection::Create            CreateBlock;
-      typedef Collection::Destroy           DestroyBlock;
-      typedef Collection::Save              SaveBlock;
-      typedef Collection::Load              LoadBlock;
-
-    public:
-      // Communicator types
-      struct Proxy;
-      struct ProxyWithLink;
-
-      // foreach callback
-      template<class Block>
-      using Callback = std::function<void(Block*, const ProxyWithLink&)>;
-
-      struct QueuePolicy
-      {
-        virtual bool    unload_incoming(const Master& master, int from, int to, size_t size) const  =0;
-        virtual bool    unload_outgoing(const Master& master, int from, size_t size) const          =0;
-        virtual         ~QueuePolicy() {}
-      };
-
-      //! Move queues out of core if their size exceeds a parameter given in the constructor
-      struct QueueSizePolicy: public QueuePolicy
-      {
-                QueueSizePolicy(size_t sz): size(sz)          {}
-        bool    unload_incoming(const Master& master, int from, int to, size_t sz) const    { return sz > size; }
-        bool    unload_outgoing(const Master& master, int from, size_t sz) const            { return sz > size*master.outgoing_count(from); }
-
-        size_t  size;
-      };
-
-      struct MessageInfo
-      {
-        int from, to;
-        int round;
-      };
-
-      struct InFlightSend
-      {
-        std::shared_ptr<MemoryBuffer> message;
-        mpi::request                  request;
-
-        // for debug purposes:
-        MessageInfo info;
-      };
-
-      struct InFlightRecv
-      {
-        MemoryBuffer message;
-        MessageInfo info{ -1, -1, -1 };
-      };
-
-      struct Collective;
-      struct tags       { enum { queue, piece }; };
-
-      typedef           std::list<InFlightSend>             InFlightSendsList;
-      typedef           std::map<int, InFlightRecv>         InFlightRecvsMap;
-      typedef           std::list<int>                      ToSendList;         // [gid]
-      typedef           std::list<Collective>               CollectivesList;
-      typedef           std::map<int, CollectivesList>      CollectivesMap;     // gid          -> [collectives]
-
-
-      struct QueueRecord
-      {
-                        QueueRecord(size_t s = 0, int e = -1): size(s), external(e)     {}
-        size_t          size;
-        int             external;
-      };
-
-      typedef           std::map<int,     QueueRecord>      InQueueRecords;     //  gid         -> (size, external)
-      typedef           std::map<int,     MemoryBuffer>     IncomingQueues;     //  gid         -> queue
-      typedef           std::map<BlockID, MemoryBuffer>     OutgoingQueues;     // (gid, proc)  -> queue
-      typedef           std::map<BlockID, QueueRecord>      OutQueueRecords;    // (gid, proc)  -> (size, external)
-      struct IncomingQueuesRecords
-      {
-        InQueueRecords  records;
-        IncomingQueues  queues;
-      };
-      struct OutgoingQueuesRecord
-      {
-                        OutgoingQueuesRecord(int e = -1): external(e)       {}
-        int             external;
-        OutQueueRecords external_local;
-        OutgoingQueues  queues;
-      };
-      typedef           std::map<int,     IncomingQueuesRecords>    IncomingQueuesMap;  //  gid         -> {  gid       -> queue }
-      typedef           std::map<int,     OutgoingQueuesRecord>     OutgoingQueuesMap;  //  gid         -> { (gid,proc) -> queue }
-
-      struct IncomingRound
-      {
-        IncomingQueuesMap map;
-        int received{0};
-      };
-      typedef std::map<int, IncomingRound> IncomingRoundMap;
-
-
-    public:
-     /**
-      * \ingroup Initialization
-      * \brief The main DIY object
-      *
-      * Helper functions specify how to:
-           * create an empty block,
-           * destroy a block (a function that's expected to upcast and delete),
-           * serialize a block
-      */
-                    Master(mpi::communicator    comm,          //!< communicator
-                           int                  threads  = 1,  //!< number of threads DIY can use
-                           int                  limit    = -1, //!< number of blocks to store in memory
-                           CreateBlock          create   = 0,  //!< block create function; master manages creation if create != 0
-                           DestroyBlock         destroy  = 0,  //!< block destroy function; master manages destruction if destroy != 0
-                           ExternalStorage*     storage  = 0,  //!< storage object (path, method, etc.) for storing temporary blocks being shuffled in/out of core
-                           SaveBlock            save     = 0,  //!< block save function; master manages saving if save != 0
-                           LoadBlock            load     = 0,  //!< block load function; master manages loading if load != 0
-                           QueuePolicy*         q_policy = new QueueSizePolicy(4096)): //!< policy for managing message queues specifies maximum size of message queues to keep in memory
-                      blocks_(create, destroy, storage, save, load),
-                      queue_policy_(q_policy),
-                      limit_(limit),
-                      threads_(threads == -1 ? thread::hardware_concurrency() : threads),
-                      storage_(storage),
-                      // Communicator functionality
-                      comm_(comm),
-                      expected_(0),
-                      exchange_round_(-1),
-                      immediate_(true)
-                                                        {}
-                    ~Master()                           { set_immediate(true); clear(); delete queue_policy_; }
-      inline void   clear();
-      inline void   destroy(int i)                      { if (blocks_.own()) blocks_.destroy(i); }
-
-      inline int    add(int gid, void* b, Link* l);     //!< add a block
-      inline void*  release(int i);                     //!< release ownership of the block
-
-      //!< return the `i`-th block
-      inline void*  block(int i) const                  { return blocks_.find(i); }
-      template<class Block>
-      Block*        block(int i) const                  { return static_cast<Block*>(block(i)); }
-      inline Link*  link(int i) const                   { return links_[i]; }
-      inline int    loaded_block() const                { return blocks_.available(); }
-
-      inline void   unload(int i);
-      inline void   load(int i);
-      void          unload(std::vector<int>& loaded)    { for(unsigned i = 0; i < loaded.size(); ++i) unload(loaded[i]); loaded.clear(); }
-      void          unload_all()                        { for(unsigned i = 0; i < size(); ++i) if (block(i) != 0) unload(i); }
-      inline bool   has_incoming(int i) const;
-
-      inline void   unload_queues(int i);
-      inline void   unload_incoming(int gid);
-      inline void   unload_outgoing(int gid);
-      inline void   load_queues(int i);
-      inline void   load_incoming(int gid);
-      inline void   load_outgoing(int gid);
-
-      //! return the MPI communicator
-      const mpi::communicator&  communicator() const    { return comm_; }
-      //! return the MPI communicator
-      mpi::communicator&        communicator()          { return comm_; }
-
-      //! return the `i`-th block, loading it if necessary
-      void*         get(int i)                          { return blocks_.get(i); }
-      //! return gid of the `i`-th block
-      int           gid(int i) const                    { return gids_[i]; }
-      //! return the local id of the local block with global id gid, or -1 if not local
-      int           lid(int gid) const                  { return local(gid) ?  lids_.find(gid)->second : -1; }
-      //! whether the block with global id gid is local
-      bool          local(int gid) const                { return lids_.find(gid) != lids_.end(); }
-
-      //! exchange the queues between all the blocks (collective operation)
-      inline void   exchange();
-      inline void   process_collectives();
-
-      inline
-      ProxyWithLink proxy(int i) const;
-
-      //! return the number of local blocks
-      unsigned      size() const                        { return blocks_.size(); }
-      void*         create() const                      { return blocks_.create(); }
-
-      // accessors
-      int           limit() const                       { return limit_; }
-      int           threads() const                     { return threads_; }
-      int           in_memory() const                   { return *blocks_.in_memory().const_access(); }
-
-      void          set_threads(int threads)            { threads_ = threads; }
-
-      CreateBlock   creator() const                     { return blocks_.creator(); }
-      DestroyBlock  destroyer() const                   { return blocks_.destroyer(); }
-      LoadBlock     loader() const                      { return blocks_.loader(); }
-      SaveBlock     saver() const                       { return blocks_.saver(); }
-
-      //! call `f` with every block
-      template<class Block>
-      void          foreach_(const Callback<Block>& f, const Skip& s = NeverSkip());
-
-      template<class F>
-      void          foreach(const F& f, const Skip& s = NeverSkip())
-      {
-          using Block = typename detail::block_traits<F>::type;
-          foreach_<Block>(f, s);
-      }
-
-      inline void   execute();
-
-      bool          immediate() const                   { return immediate_; }
-      void          set_immediate(bool i)               { if (i && !immediate_) execute(); immediate_ = i; }
-
-    public:
-      // Communicator functionality
-      IncomingQueues&   incoming(int gid)               { return incoming_[exchange_round_].map[gid].queues; }
-      OutgoingQueues&   outgoing(int gid)               { return outgoing_[gid].queues; }
-      CollectivesList&  collectives(int gid)            { return collectives_[gid]; }
-      size_t            incoming_count(int gid) const
-      {
-        IncomingRoundMap::const_iterator round_it = incoming_.find(exchange_round_);
-        if (round_it == incoming_.end())
-          return 0;
-        IncomingQueuesMap::const_iterator queue_it = round_it->second.map.find(gid);
-        if (queue_it == round_it->second.map.end())
-          return 0;
-        return queue_it->second.queues.size();
-      }
-      size_t            outgoing_count(int gid) const   { OutgoingQueuesMap::const_iterator it = outgoing_.find(gid); if (it == outgoing_.end()) return 0; return it->second.queues.size(); }
-
-      void              set_expected(int expected)      { expected_ = expected; }
-      void              add_expected(int i)             { expected_ += i; }
-      int               expected() const                { return expected_; }
-      void              replace_link(int i, Link* link) { expected_ -= links_[i]->size_unique(); delete links_[i]; links_[i] = link; expected_ += links_[i]->size_unique(); }
-
-    public:
-      // Communicator functionality
-      inline void       flush();            // makes sure all the serialized queues migrate to their target processors
-
-    private:
-      // Communicator functionality
-      inline void       comm_exchange(ToSendList& to_send, int out_queues_limit);     // possibly called in between block computations
-      inline bool       nudge();
-
-      void              cancel_requests();              // TODO
-
-      // debug
-      inline void       show_incoming_records() const;
-
-    private:
-      std::vector<Link*>    links_;
-      Collection            blocks_;
-      std::vector<int>      gids_;
-      std::map<int, int>    lids_;
-
-      QueuePolicy*          queue_policy_;
-
-      int                   limit_;
-      int                   threads_;
-      ExternalStorage*      storage_;
-
-    private:
-      // Communicator
-      mpi::communicator     comm_;
-      IncomingRoundMap      incoming_;
-      OutgoingQueuesMap     outgoing_;
-      InFlightSendsList     inflight_sends_;
-      InFlightRecvsMap      inflight_recvs_;
-      CollectivesMap        collectives_;
-      int                   expected_;
-      int                   exchange_round_;
-      bool                  immediate_;
-      Commands              commands_;
-
-    private:
-      fast_mutex            add_mutex_;
-
-    public:
-      std::shared_ptr<spd::logger>  log = get_logger();
-      stats::Profiler               prof;
-  };
-
-  struct Master::BaseCommand
-  {
-      virtual       ~BaseCommand()                                                  {}      // to delete derived classes
-      virtual void  execute(void* b, const ProxyWithLink& cp) const                 =0;
-      virtual bool  skip(int i, const Master& master) const                         =0;
-  };
-
-  template<class Block>
-  struct Master::Command: public BaseCommand
-  {
-            Command(Callback<Block> f_, const Skip& s_):
-                f(f_), s(s_)                                                        {}
-
-      void  execute(void* b, const ProxyWithLink& cp) const override                { f(static_cast<Block*>(b), cp); }
-      bool  skip(int i, const Master& m) const override                             { return s(i,m); }
-
-      Callback<Block>   f;
-      Skip              s;
-  };
-
-  struct Master::SkipNoIncoming
-  { bool operator()(int i, const Master& master) const   { return !master.has_incoming(i); } };
-
-  struct Master::Collective
-  {
-            Collective():
-              cop_(0)                           {}
-            Collective(detail::CollectiveOp* cop):
-              cop_(cop)                         {}
-            // this copy constructor is very ugly, but need it to insert Collectives into a list
-            Collective(const Collective& other):
-              cop_(0)                           { swap(const_cast<Collective&>(other)); }
-            ~Collective()                       { delete cop_; }
-
-    void    init()                              { cop_->init(); }
-    void    swap(Collective& other)             { std::swap(cop_, other.cop_); }
-    void    update(const Collective& other)     { cop_->update(*other.cop_); }
-    void    global(const mpi::communicator& c)  { cop_->global(c); }
-    void    copy_from(Collective& other) const  { cop_->copy_from(*other.cop_); }
-    void    result_out(void* x) const           { cop_->result_out(x); }
-
-    detail::CollectiveOp*                       cop_;
-
-    private:
-    Collective& operator=(const Collective& other);
-  };
-}
-
-#include "proxy.hpp"
-
-// --- ProcessBlock ---
-struct diy::Master::ProcessBlock
-{
-          ProcessBlock(Master&                    master_,
-                       const std::deque<int>&     blocks_,
-                       int                        local_limit_,
-                       critical_resource<int>&    idx_):
-              master(master_),
-              blocks(blocks_),
-              local_limit(local_limit_),
-              idx(idx_)
-          {}
-
-  void    process()
-  {
-    master.log->debug("Processing with thread: {}",  this_thread::get_id());
-
-    std::vector<int>      local;
-    do
-    {
-      int cur = (*idx.access())++;
-
-      if ((size_t)cur >= blocks.size())
-          return;
-
-      int i = blocks[cur];
-      if (master.block(i))
-      {
-          if (local.size() == (size_t)local_limit)
-              master.unload(local);
-          local.push_back(i);
-      }
-
-      master.log->debug("Processing block: {}", master.gid(i));
-
-      bool skip_block = true;
-      for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
-      {
-          if (!master.commands_[cmd]->skip(i, master))
-          {
-              skip_block = false;
-              break;
-          }
-      }
-
-      IncomingQueuesMap &current_incoming = master.incoming_[master.exchange_round_].map;
-      if (skip_block)
-      {
-          if (master.block(i) == 0)
-              master.load_queues(i);      // even though we are skipping the block, the queues might be necessary
-
-          for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
-          {
-              master.commands_[cmd]->execute(0, master.proxy(i));  // 0 signals that we are skipping the block (even if it's loaded)
-
-              // no longer need them, so get rid of them, rather than risk reloading
-              current_incoming[master.gid(i)].queues.clear();
-              current_incoming[master.gid(i)].records.clear();
-          }
-
-          if (master.block(i) == 0)
-              master.unload_queues(i);    // even though we are skipping the block, the queues might be necessary
-      }
-      else
-      {
-          if (master.block(i) == 0)                             // block unloaded
-          {
-              if (local.size() == (size_t)local_limit)                    // reached the local limit
-                  master.unload(local);
-
-              master.load(i);
-              local.push_back(i);
-          }
-
-          for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
-          {
-              master.commands_[cmd]->execute(master.block(i), master.proxy(i));
-
-              // no longer need them, so get rid of them
-              current_incoming[master.gid(i)].queues.clear();
-              current_incoming[master.gid(i)].records.clear();
-          }
-      }
-    } while(true);
-
-    // TODO: invoke opportunistic communication
-    //       don't forget to adjust Master::exchange()
-  }
-
-  static void run(void* bf)                   { static_cast<ProcessBlock*>(bf)->process(); }
-
-  Master&                 master;
-  const std::deque<int>&  blocks;
-  int                     local_limit;
-  critical_resource<int>& idx;
-};
-// --------------------
-
-void
-diy::Master::
-clear()
-{
-  for (unsigned i = 0; i < size(); ++i)
-    delete links_[i];
-  blocks_.clear();
-  links_.clear();
-  gids_.clear();
-  lids_.clear();
-  expected_ = 0;
-}
-
-void
-diy::Master::
-unload(int i)
-{
-  log->debug("Unloading block: {}", gid(i));
-
-  blocks_.unload(i);
-  unload_queues(i);
-}
-
-void
-diy::Master::
-unload_queues(int i)
-{
-  unload_incoming(gid(i));
-  unload_outgoing(gid(i));
-}
-
-void
-diy::Master::
-unload_incoming(int gid)
-{
-  for (IncomingRoundMap::iterator round_itr = incoming_.begin(); round_itr != incoming_.end(); ++round_itr)
-  {
-    IncomingQueuesMap::iterator qmap_itr = round_itr->second.map.find(gid);
-    if (qmap_itr == round_itr->second.map.end())
-    {
-      continue;
-    }
-    IncomingQueuesRecords& in_qrs = qmap_itr->second;
-    for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
-    {
-      QueueRecord& qr = it->second;
-      if (queue_policy_->unload_incoming(*this, it->first, gid, qr.size))
-      {
-        log->debug("Unloading queue: {} <- {}", gid, it->first);
-        qr.external = storage_->put(in_qrs.queues[it->first]);
-      }
-    }
-  }
-}
-
-void
-diy::Master::
-unload_outgoing(int gid)
-{
-  OutgoingQueuesRecord& out_qr = outgoing_[gid];
-
-  size_t out_queues_size = sizeof(size_t);   // map size
-  size_t count = 0;
-  for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end(); ++it)
-  {
-    if (it->first.proc == comm_.rank()) continue;
-
-    out_queues_size += sizeof(BlockID);     // target
-    out_queues_size += sizeof(size_t);      // buffer.position
-    out_queues_size += sizeof(size_t);      // buffer.size
-    out_queues_size += it->second.size();   // buffer contents
-    ++count;
-  }
-  if (queue_policy_->unload_outgoing(*this, gid, out_queues_size - sizeof(size_t)))
-  {
-      log->debug("Unloading outgoing queues: {} -> ...; size = {}\n", gid, out_queues_size);
-      MemoryBuffer  bb;     bb.reserve(out_queues_size);
-      diy::save(bb, count);
-
-      for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end();)
-      {
-        if (it->first.proc == comm_.rank())
-        {
-          // treat as incoming
-          if (queue_policy_->unload_incoming(*this, gid, it->first.gid, it->second.size()))
-          {
-            QueueRecord& qr = out_qr.external_local[it->first];
-            qr.size = it->second.size();
-            qr.external = storage_->put(it->second);
-
-            out_qr.queues.erase(it++);
-            continue;
-          } // else keep in memory
-        } else
-        {
-          diy::save(bb, it->first);
-          diy::save(bb, it->second);
-
-          out_qr.queues.erase(it++);
-          continue;
-        }
-        ++it;
-      }
-
-      // TODO: this mechanism could be adjusted for direct saving to disk
-      //       (without intermediate binary buffer serialization)
-      out_qr.external = storage_->put(bb);
-  }
-}
-
-void
-diy::Master::
-load(int i)
-{
- log->debug("Loading block: {}", gid(i));
-
-  blocks_.load(i);
-  load_queues(i);
-}
-
-void
-diy::Master::
-load_queues(int i)
-{
-  load_incoming(gid(i));
-  load_outgoing(gid(i));
-}
-
-void
-diy::Master::
-load_incoming(int gid)
-{
-  IncomingQueuesRecords& in_qrs = incoming_[exchange_round_].map[gid];
-  for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
-  {
-    QueueRecord& qr = it->second;
-    if (qr.external != -1)
-    {
-        log->debug("Loading queue: {} <- {}", gid, it->first);
-        storage_->get(qr.external, in_qrs.queues[it->first]);
-        qr.external = -1;
-    }
-  }
-}
-
-void
-diy::Master::
-load_outgoing(int gid)
-{
-  // TODO: we could adjust this mechanism to read directly from storage,
-  //       bypassing an intermediate MemoryBuffer
-  OutgoingQueuesRecord& out_qr = outgoing_[gid];
-  if (out_qr.external != -1)
-  {
-    MemoryBuffer bb;
-    storage_->get(out_qr.external, bb);
-    out_qr.external = -1;
-
-    size_t count;
-    diy::load(bb, count);
-    for (size_t i = 0; i < count; ++i)
-    {
-      BlockID to;
-      diy::load(bb, to);
-      diy::load(bb, out_qr.queues[to]);
-    }
-  }
-}
-
-diy::Master::ProxyWithLink
-diy::Master::
-proxy(int i) const
-{ return ProxyWithLink(Proxy(const_cast<Master*>(this), gid(i)), block(i), link(i)); }
-
-
-int
-diy::Master::
-add(int gid, void* b, Link* l)
-{
-  if (*blocks_.in_memory().const_access() == limit_)
-    unload_all();
-
-  lock_guard<fast_mutex>    lock(add_mutex_);       // allow to add blocks from multiple threads
-
-  blocks_.add(b);
-  links_.push_back(l);
-  gids_.push_back(gid);
-
-  int lid = gids_.size() - 1;
-  lids_[gid] = lid;
-  add_expected(l->size_unique()); // NB: at every iteration we expect a message from each unique neighbor
-
-  return lid;
-}
-
-void*
-diy::Master::
-release(int i)
-{
-  void* b = blocks_.release(i);
-  delete link(i);   links_[i] = 0;
-  lids_.erase(gid(i));
-  return b;
-}
-
-bool
-diy::Master::
-has_incoming(int i) const
-{
-  const IncomingQueuesRecords& in_qrs = const_cast<Master&>(*this).incoming_[exchange_round_].map[gid(i)];
-  for (InQueueRecords::const_iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
-  {
-    const QueueRecord& qr = it->second;
-    if (qr.size != 0)
-        return true;
-  }
-  return false;
-}
-
-template<class Block>
-void
-diy::Master::
-foreach_(const Callback<Block>& f, const Skip& skip)
-{
-    auto scoped = prof.scoped("foreach");
-    commands_.push_back(new Command<Block>(f, skip));
-
-    if (immediate())
-        execute();
-}
-
-void
-diy::Master::
-execute()
-{
-  log->debug("Entered execute()");
-  auto scoped = prof.scoped("execute");
-  //show_incoming_records();
-
-  // touch the outgoing and incoming queues as well as collectives to make sure they exist
-  for (unsigned i = 0; i < size(); ++i)
-  {
-    outgoing(gid(i));
-    incoming(gid(i));           // implicitly touches queue records
-    collectives(gid(i));
-  }
-
-  if (commands_.empty())
-      return;
-
-  // Order the blocks, so the loaded ones come first
-  std::deque<int>   blocks;
-  for (unsigned i = 0; i < size(); ++i)
-    if (block(i) == 0)
-        blocks.push_back(i);
-    else
-        blocks.push_front(i);
-
-  // don't use more threads than we can have blocks in memory
-  int num_threads;
-  int blocks_per_thread;
-  if (limit_ == -1)
-  {
-    num_threads = threads_;
-    blocks_per_thread = size();
-  }
-  else
-  {
-    num_threads = std::min(threads_, limit_);
-    blocks_per_thread = limit_/num_threads;
-  }
-
-  // idx is shared
-  critical_resource<int> idx(0);
-
-  typedef                 ProcessBlock                                   BlockFunctor;
-  if (num_threads > 1)
-  {
-    // launch the threads
-    typedef               std::pair<thread*, BlockFunctor*>               ThreadFunctorPair;
-    typedef               std::list<ThreadFunctorPair>                    ThreadFunctorList;
-    ThreadFunctorList     threads;
-    for (unsigned i = 0; i < (unsigned)num_threads; ++i)
-    {
-        BlockFunctor* bf = new BlockFunctor(*this, blocks, blocks_per_thread, idx);
-        threads.push_back(ThreadFunctorPair(new thread(&BlockFunctor::run, bf), bf));
-    }
-
-    // join the threads
-    for(ThreadFunctorList::iterator it = threads.begin(); it != threads.end(); ++it)
-    {
-        thread*           t  = it->first;
-        BlockFunctor*     bf = it->second;
-        t->join();
-        delete t;
-        delete bf;
-    }
-  } else
-  {
-      BlockFunctor bf(*this, blocks, blocks_per_thread, idx);
-      BlockFunctor::run(&bf);
-  }
-
-  // clear incoming queues
-  incoming_[exchange_round_].map.clear();
-
-  if (limit() != -1 && in_memory() > limit())
-      throw std::runtime_error(fmt::format("Fatal: {} blocks in memory, with limit {}", in_memory(), limit()));
-
-  // clear commands
-  for (size_t i = 0; i < commands_.size(); ++i)
-      delete commands_[i];
-  commands_.clear();
-}
-
-void
-diy::Master::
-exchange()
-{
-  auto scoped = prof.scoped("exchange");
-  execute();
-
-  log->debug("Starting exchange");
-
-  // make sure there is a queue for each neighbor
-  for (int i = 0; i < (int)size(); ++i)
-  {
-    OutgoingQueues&  outgoing_queues  = outgoing_[gid(i)].queues;
-    OutQueueRecords& external_local   = outgoing_[gid(i)].external_local;
-    if (outgoing_queues.size() < (size_t)link(i)->size())
-      for (unsigned j = 0; j < (unsigned)link(i)->size(); ++j)
-      {
-        if (external_local.find(link(i)->target(j)) == external_local.end())
-          outgoing_queues[link(i)->target(j)];        // touch the outgoing queue, creating it if necessary
-      }
-  }
-
-  flush();
-  log->debug("Finished exchange");
-}
-
-namespace diy
-{
-namespace detail
-{
-  template <typename T>
-  struct VectorWindow
-  {
-    T *begin;
-    size_t count;
-  };
-} // namespace detail
-
-namespace mpi
-{
-namespace detail
-{
-  template<typename T>  struct is_mpi_datatype< diy::detail::VectorWindow<T> > { typedef true_type type; };
-
-  template <typename T>
-  struct mpi_datatype< diy::detail::VectorWindow<T> >
-  {
-    typedef diy::detail::VectorWindow<T> VecWin;
-    static MPI_Datatype         datatype()                { return get_mpi_datatype<T>(); }
-    static const void*          address(const VecWin& x)  { return x.begin; }
-    static void*                address(VecWin& x)        { return x.begin; }
-    static int                  count(const VecWin& x)    { return static_cast<int>(x.count); }
-  };
-}
-} // namespace mpi::detail
-
-} // namespace diy
-
-/* Communicator */
-void
-diy::Master::
-comm_exchange(ToSendList& to_send, int out_queues_limit)
-{
-  static const size_t MAX_MPI_MESSAGE_COUNT = INT_MAX;
-
-  IncomingRound &current_incoming = incoming_[exchange_round_];
-  // isend outgoing queues, up to the out_queues_limit
-  while(inflight_sends_.size() < (size_t)out_queues_limit && !to_send.empty())
-  {
-    int from = to_send.front();
-
-    // deal with external_local queues
-    for (OutQueueRecords::iterator it = outgoing_[from].external_local.begin(); it != outgoing_[from].external_local.end(); ++it)
-    {
-      int to = it->first.gid;
-
-      log->debug("Processing local queue: {} <- {} of size {}", to, from, it->second.size);
-
-      QueueRecord& in_qr  = current_incoming.map[to].records[from];
-      bool in_external  = block(lid(to)) == 0;
-
-      if (in_external)
-          in_qr = it->second;
-      else
-      {
-          // load the queue
-          in_qr.size     = it->second.size;
-          in_qr.external = -1;
-
-          MemoryBuffer bb;
-          storage_->get(it->second.external, bb);
-
-          current_incoming.map[to].queues[from].swap(bb);
-      }
-      ++current_incoming.received;
-    }
-    outgoing_[from].external_local.clear();
-
-    if (outgoing_[from].external != -1)
-      load_outgoing(from);
-    to_send.pop_front();
-
-    OutgoingQueues& outgoing = outgoing_[from].queues;
-    for (OutgoingQueues::iterator it = outgoing.begin(); it != outgoing.end(); ++it)
-    {
-      BlockID to_proc = it->first;
-      int     to      = to_proc.gid;
-      int     proc    = to_proc.proc;
-
-      log->debug("Processing queue:      {} <- {} of size {}", to, from, outgoing_[from].queues[to_proc].size());
-
-      // There may be local outgoing queues that remained in memory
-      if (proc == comm_.rank())     // sending to ourselves: simply swap buffers
-      {
-        log->debug("Moving queue in-place: {} <- {}", to, from);
-
-        QueueRecord& in_qr  = current_incoming.map[to].records[from];
-        bool in_external  = block(lid(to)) == 0;
-        if (in_external)
-        {
-          log->debug("Unloading outgoing directly as incoming: {} <- {}", to, from);
-          MemoryBuffer& bb = it->second;
-          in_qr.size = bb.size();
-          if (queue_policy_->unload_incoming(*this, from, to, in_qr.size))
-            in_qr.external = storage_->put(bb);
-          else
-          {
-            MemoryBuffer& in_bb = current_incoming.map[to].queues[from];
-            in_bb.swap(bb);
-            in_bb.reset();
-            in_qr.external = -1;
-          }
-        } else        // !in_external
-        {
-          log->debug("Swapping in memory:    {} <- {}", to, from);
-          MemoryBuffer& bb = current_incoming.map[to].queues[from];
-          bb.swap(it->second);
-          bb.reset();
-          in_qr.size = bb.size();
-          in_qr.external = -1;
-        }
-
-        ++current_incoming.received;
-        continue;
-      }
-
-      std::shared_ptr<MemoryBuffer> buffer = std::make_shared<MemoryBuffer>();
-      buffer->swap(it->second);
-
-      MessageInfo info{from, to, exchange_round_};
-      if (buffer->size() <= (MAX_MPI_MESSAGE_COUNT - sizeof(info)))
-      {
-        diy::save(*buffer, info);
-
-        inflight_sends_.emplace_back();
-        inflight_sends_.back().info = info;
-        inflight_sends_.back().request = comm_.isend(proc, tags::queue, buffer->buffer);
-        inflight_sends_.back().message = buffer;
-      }
-      else
-      {
-        int npieces = static_cast<int>((buffer->size() + MAX_MPI_MESSAGE_COUNT - 1)/MAX_MPI_MESSAGE_COUNT);
-
-        // first send the head
-        std::shared_ptr<MemoryBuffer> hb = std::make_shared<MemoryBuffer>();
-        diy::save(*hb, buffer->size());
-        diy::save(*hb, info);
-
-        inflight_sends_.emplace_back();
-        inflight_sends_.back().info = info;
-        inflight_sends_.back().request = comm_.isend(proc, tags::piece, hb->buffer);
-        inflight_sends_.back().message = hb;
-
-        // send the message pieces
-        size_t msg_buff_idx = 0;
-        for (int i = 0; i < npieces; ++i, msg_buff_idx += MAX_MPI_MESSAGE_COUNT)
-        {
-          int tag = (i == (npieces - 1)) ? tags::queue : tags::piece;
-
-          detail::VectorWindow<char> window;
-          window.begin = &buffer->buffer[msg_buff_idx];
-          window.count = std::min(MAX_MPI_MESSAGE_COUNT, buffer->size() - msg_buff_idx);
-
-          inflight_sends_.emplace_back();
-          inflight_sends_.back().info = info;
-          inflight_sends_.back().request = comm_.isend(proc, tag, window);
-          inflight_sends_.back().message = buffer;
-        }
-      }
-    }
-  }
-
-  // kick requests
-  while(nudge());
-
-  // check incoming queues
-  mpi::optional<mpi::status> ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
-  while(ostatus)
-  {
-    InFlightRecv &ir = inflight_recvs_[ostatus->source()];
-
-    if (ir.info.from == -1) // uninitialized
-    {
-      MemoryBuffer bb;
-      comm_.recv(ostatus->source(), ostatus->tag(), bb.buffer);
-
-      if (ostatus->tag() == tags::piece)
-      {
-        size_t msg_size;
-        diy::load(bb, msg_size);
-        diy::load(bb, ir.info);
-
-        ir.message.buffer.reserve(msg_size);
-      }
-      else // tags::queue
-      {
-        diy::load_back(bb, ir.info);
-        ir.message.swap(bb);
-      }
-    }
-    else
-    {
-      size_t start_idx = ir.message.buffer.size();
-      size_t count = ostatus->count<char>();
-      ir.message.buffer.resize(start_idx + count);
-
-      detail::VectorWindow<char> window;
-      window.begin = &ir.message.buffer[start_idx];
-      window.count = count;
-
-      comm_.recv(ostatus->source(), ostatus->tag(), window);
-    }
-
-    if (ostatus->tag() == tags::queue)
-    {
-      size_t size  = ir.message.size();
-      int from = ir.info.from;
-      int to = ir.info.to;
-      int external = -1;
-
-      assert(ir.info.round >= exchange_round_);
-      IncomingRound *in = &incoming_[ir.info.round];
-
-      bool unload_queue = ((ir.info.round == exchange_round_) ? (block(lid(to)) == 0) : (limit_ != -1)) &&
-                          queue_policy_->unload_incoming(*this, from, to, size);
-      if (unload_queue)
-      {
-        log->debug("Directly unloading queue {} <- {}", to, from);
-        external = storage_->put(ir.message); // unload directly
-      }
-      else
-      {
-        in->map[to].queues[from].swap(ir.message);
-        in->map[to].queues[from].reset();     // buffer position = 0
-      }
-      in->map[to].records[from] = QueueRecord(size, external);
-
-      ++(in->received);
-      ir = InFlightRecv(); // reset
-    }
-
-    ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
-  }
-}
-
-void
-diy::Master::
-flush()
-{
-#ifdef DEBUG
-  time_type start = get_time();
-  unsigned wait = 1;
-#endif
-
-  // prepare for next round
-  incoming_.erase(exchange_round_);
-  ++exchange_round_;
-
-  // make a list of outgoing queues to send (the ones in memory come first)
-  ToSendList    to_send;
-  for (OutgoingQueuesMap::iterator it = outgoing_.begin(); it != outgoing_.end(); ++it)
-  {
-    OutgoingQueuesRecord& out = it->second;
-    if (out.external == -1)
-        to_send.push_front(it->first);
-    else
-        to_send.push_back(it->first);
-  }
-  log->debug("to_send.size(): {}", to_send.size());
-
-  // XXX: we probably want a cleverer limit than block limit times average number of queues per block
-  // XXX: with queues we could easily maintain a specific space limit
-  int out_queues_limit;
-  if (limit_ == -1 || size() == 0)
-    out_queues_limit = to_send.size();
-  else
-    out_queues_limit = std::max((size_t) 1, to_send.size()/size()*limit_);      // average number of queues per block * in-memory block limit
-
-  do
-  {
-    comm_exchange(to_send, out_queues_limit);
-
-#ifdef DEBUG
-    time_type cur = get_time();
-    if (cur - start > wait*1000)
-    {
-        log->warn("Waiting in flush [{}]: {} - {} out of {}",
-                  comm_.rank(), inflight_sends_.size(), incoming_[exchange_round_].received, expected_);
-        wait *= 2;
-    }
-#endif
-  } while (!inflight_sends_.empty() || incoming_[exchange_round_].received < expected_ || !to_send.empty());
-
-  outgoing_.clear();
-
-  log->debug("Done in flush");
-  //show_incoming_records();
-
-  process_collectives();
-}
-
-void
-diy::Master::
-process_collectives()
-{
-  auto scoped = prof.scoped("collectives");
-
-  if (collectives_.empty())
-      return;
-
-  typedef       CollectivesList::iterator       CollectivesIterator;
-  std::vector<CollectivesIterator>  iters;
-  std::vector<int>                  gids;
-  for (CollectivesMap::iterator cur = collectives_.begin(); cur != collectives_.end(); ++cur)
-  {
-    gids.push_back(cur->first);
-    iters.push_back(cur->second.begin());
-  }
-
-  while (iters[0] != collectives_.begin()->second.end())
-  {
-    iters[0]->init();
-    for (unsigned j = 1; j < iters.size(); ++j)
-    {
-      // NB: this assumes that the operations are commutative
-      iters[0]->update(*iters[j]);
-    }
-    iters[0]->global(comm_);        // do the mpi collective
-
-    for (unsigned j = 1; j < iters.size(); ++j)
-    {
-      iters[j]->copy_from(*iters[0]);
-      ++iters[j];
-    }
-
-    ++iters[0];
-  }
-}
-
-bool
-diy::Master::
-nudge()
-{
-  bool success = false;
-  for (InFlightSendsList::iterator it = inflight_sends_.begin(); it != inflight_sends_.end(); ++it)
-  {
-    mpi::optional<mpi::status> ostatus = it->request.test();
-    if (ostatus)
-    {
-      success = true;
-      InFlightSendsList::iterator rm = it;
-      --it;
-      inflight_sends_.erase(rm);
-    }
-  }
-  return success;
-}
-
-void
-diy::Master::
-show_incoming_records() const
-{
-  for (IncomingRoundMap::const_iterator rounds_itr = incoming_.begin(); rounds_itr != incoming_.end(); ++rounds_itr)
-  {
-    for (IncomingQueuesMap::const_iterator it = rounds_itr->second.map.begin(); it != rounds_itr->second.map.end(); ++it)
-    {
-      const IncomingQueuesRecords& in_qrs = it->second;
-      for (InQueueRecords::const_iterator cur = in_qrs.records.begin(); cur != in_qrs.records.end(); ++cur)
-      {
-        const QueueRecord& qr = cur->second;
-        log->info("round: {}, {} <- {}: (size,external) = ({},{})",
-                  rounds_itr->first,
-                  it->first, cur->first,
-                  qr.size,
-                  qr.external);
-      }
-      for (IncomingQueues::const_iterator cur = in_qrs.queues.begin(); cur != in_qrs.queues.end(); ++cur)
-      {
-        log->info("round: {}, {} <- {}: queue.size() = {}",
-                  rounds_itr->first,
-                  it->first, cur->first,
-                  const_cast<IncomingQueuesRecords&>(in_qrs).queues[cur->first].size());
-      }
-    }
-  }
-}
-
-#endif
diff --git a/diy/include/diy/mpi.hpp b/diy/include/diy/mpi.hpp
deleted file mode 100644
index 28502002f..000000000
--- a/diy/include/diy/mpi.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef DIY_MPI_HPP
-#define DIY_MPI_HPP
-
-#include <mpi.h>
-
-#include "mpi/constants.hpp"
-#include "mpi/datatypes.hpp"
-#include "mpi/optional.hpp"
-#include "mpi/status.hpp"
-#include "mpi/request.hpp"
-#include "mpi/point-to-point.hpp"
-#include "mpi/communicator.hpp"
-#include "mpi/collectives.hpp"
-#include "mpi/io.hpp"
-
-namespace diy
-{
-namespace mpi
-{
-
-//! \ingroup MPI
-struct environment
-{
-  environment()                           { int argc = 0; char** argv; MPI_Init(&argc, &argv); }
-  environment(int argc, char* argv[])     { MPI_Init(&argc, &argv); }
-  ~environment()                          { MPI_Finalize(); }
-};
-
-}
-}
-
-#endif
diff --git a/diy/include/diy/mpi/collectives.hpp b/diy/include/diy/mpi/collectives.hpp
deleted file mode 100644
index 8d70bcf01..000000000
--- a/diy/include/diy/mpi/collectives.hpp
+++ /dev/null
@@ -1,328 +0,0 @@
-#include <vector>
-
-#include "operations.hpp"
-
-namespace diy
-{
-namespace mpi
-{
-  //!\addtogroup MPI
-  //!@{
-
-  template<class T, class Op>
-  struct Collectives
-  {
-    typedef   detail::mpi_datatype<T>     Datatype;
-
-    static void broadcast(const communicator& comm, T& x, int root)
-    {
-      MPI_Bcast(Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(), root, comm);
-    }
-
-    static void broadcast(const communicator& comm, std::vector<T>& x, int root)
-    {
-      size_t sz = x.size();
-      Collectives<size_t, void*>::broadcast(comm, sz, root);
-
-      if (comm.rank() != root)
-          x.resize(sz);
-
-      MPI_Bcast(Datatype::address(x[0]),
-                x.size(),
-                Datatype::datatype(), root, comm);
-    }
-
-    static request ibroadcast(const communicator& comm, T& x, int root)
-    {
-      request r;
-      MPI_Ibcast(Datatype::address(x),
-                 Datatype::count(x),
-                 Datatype::datatype(), root, comm, &r.r);
-      return r;
-    }
-
-    static void gather(const communicator& comm, const T& in, std::vector<T>& out, int root)
-    {
-      size_t s  = comm.size();
-             s *= Datatype::count(in);
-      out.resize(s);
-      MPI_Gather(Datatype::address(const_cast<T&>(in)),
-                 Datatype::count(in),
-                 Datatype::datatype(),
-                 Datatype::address(out[0]),
-                 Datatype::count(in),
-                 Datatype::datatype(),
-                 root, comm);
-    }
-
-    static void gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out, int root)
-    {
-      std::vector<int>  counts(comm.size());
-      Collectives<int,void*>::gather(comm, (int) in.size(), counts, root);
-
-      std::vector<int>  offsets(comm.size(), 0);
-      for (unsigned i = 1; i < offsets.size(); ++i)
-        offsets[i] = offsets[i-1] + counts[i-1];
-
-      std::vector<T> buffer(offsets.back() + counts.back());
-      MPI_Gatherv(Datatype::address(const_cast<T&>(in[0])),
-                  in.size(),
-                  Datatype::datatype(),
-                  Datatype::address(buffer[0]),
-                  &counts[0],
-                  &offsets[0],
-                  Datatype::datatype(),
-                  root, comm);
-
-      out.resize(comm.size());
-      size_t cur = 0;
-      for (unsigned i = 0; i < (unsigned)comm.size(); ++i)
-      {
-          out[i].reserve(counts[i]);
-          for (unsigned j = 0; j < (unsigned)counts[i]; ++j)
-              out[i].push_back(buffer[cur++]);
-      }
-    }
-
-    static void gather(const communicator& comm, const T& in, int root)
-    {
-      MPI_Gather(Datatype::address(const_cast<T&>(in)),
-                 Datatype::count(in),
-                 Datatype::datatype(),
-                 Datatype::address(const_cast<T&>(in)),
-                 Datatype::count(in),
-                 Datatype::datatype(),
-                 root, comm);
-    }
-
-    static void gather(const communicator& comm, const std::vector<T>& in, int root)
-    {
-      Collectives<int,void*>::gather(comm, (int) in.size(), root);
-
-      MPI_Gatherv(Datatype::address(const_cast<T&>(in[0])),
-                  in.size(),
-                  Datatype::datatype(),
-                  0, 0, 0,
-                  Datatype::datatype(),
-                  root, comm);
-    }
-
-    static void all_gather(const communicator& comm, const T& in, std::vector<T>& out)
-    {
-      size_t s  = comm.size();
-             s *= Datatype::count(in);
-      out.resize(s);
-      MPI_Allgather(Datatype::address(const_cast<T&>(in)),
-                    Datatype::count(in),
-                    Datatype::datatype(),
-                    Datatype::address(out[0]),
-                    Datatype::count(in),
-                    Datatype::datatype(),
-                    comm);
-    }
-
-    static void all_gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out)
-    {
-      std::vector<int>  counts(comm.size());
-      Collectives<int,void*>::all_gather(comm, (int) in.size(), counts);
-
-      std::vector<int>  offsets(comm.size(), 0);
-      for (unsigned i = 1; i < offsets.size(); ++i)
-        offsets[i] = offsets[i-1] + counts[i-1];
-
-      std::vector<T> buffer(offsets.back() + counts.back());
-      MPI_Allgatherv(Datatype::address(const_cast<T&>(in[0])),
-                     in.size(),
-                     Datatype::datatype(),
-                     Datatype::address(buffer[0]),
-                     &counts[0],
-                     &offsets[0],
-                     Datatype::datatype(),
-                     comm);
-
-      out.resize(comm.size());
-      size_t cur = 0;
-      for (int i = 0; i < comm.size(); ++i)
-      {
-          out[i].reserve(counts[i]);
-          for (int j = 0; j < counts[i]; ++j)
-              out[i].push_back(buffer[cur++]);
-      }
-    }
-
-    static void reduce(const communicator& comm, const T& in, T& out, int root, const Op&)
-    {
-      MPI_Reduce(Datatype::address(const_cast<T&>(in)),
-                 Datatype::address(out),
-                 Datatype::count(in),
-                 Datatype::datatype(),
-                 detail::mpi_op<Op>::get(),
-                 root, comm);
-    }
-
-    static void reduce(const communicator& comm, const T& in, int root, const Op& op)
-    {
-      MPI_Reduce(Datatype::address(const_cast<T&>(in)),
-                 Datatype::address(const_cast<T&>(in)),
-                 Datatype::count(in),
-                 Datatype::datatype(),
-                 detail::mpi_op<Op>::get(),
-                 root, comm);
-    }
-
-    static void all_reduce(const communicator& comm, const T& in, T& out, const Op&)
-    {
-      MPI_Allreduce(Datatype::address(const_cast<T&>(in)),
-                    Datatype::address(out),
-                    Datatype::count(in),
-                    Datatype::datatype(),
-                    detail::mpi_op<Op>::get(),
-                    comm);
-    }
-
-    static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op&)
-    {
-      out.resize(in.size());
-      MPI_Allreduce(Datatype::address(const_cast<T&>(in[0])),
-                    Datatype::address(out[0]),
-                    in.size(),
-                    Datatype::datatype(),
-                    detail::mpi_op<Op>::get(),
-                    comm);
-    }
-
-    static void scan(const communicator& comm, const T& in, T& out, const Op&)
-    {
-      MPI_Scan(Datatype::address(const_cast<T&>(in)),
-               Datatype::address(out),
-               Datatype::count(in),
-               Datatype::datatype(),
-               detail::mpi_op<Op>::get(),
-               comm);
-    }
-
-    static void all_to_all(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, int n = 1)
-    {
-      // NB: this will fail if T is a vector
-      MPI_Alltoall(Datatype::address(const_cast<T&>(in[0])), n,
-                   Datatype::datatype(),
-                   Datatype::address(out[0]), n,
-                   Datatype::datatype(),
-                   comm);
-    }
-  };
-
-  //! Broadcast to all processes in `comm`.
-  template<class T>
-  void      broadcast(const communicator& comm, T& x, int root)
-  {
-    Collectives<T,void*>::broadcast(comm, x, root);
-  }
-
-  //! Broadcast for vectors
-  template<class T>
-  void      broadcast(const communicator& comm, std::vector<T>& x, int root)
-  {
-    Collectives<T,void*>::broadcast(comm, x, root);
-  }
-
-  //! iBroadcast to all processes in `comm`.
-  template<class T>
-  request   ibroadcast(const communicator& comm, T& x, int root)
-  {
-    return Collectives<T,void*>::ibroadcast(comm, x, root);
-  }
-
-  //! Gather from all processes in `comm`.
-  //!  On `root` process, `out` is resized to `comm.size()` and filled with
-  //! elements from the respective ranks.
-  template<class T>
-  void      gather(const communicator& comm, const T& in, std::vector<T>& out, int root)
-  {
-    Collectives<T,void*>::gather(comm, in, out, root);
-  }
-
-  //! Same as above, but for vectors.
-  template<class T>
-  void      gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out, int root)
-  {
-    Collectives<T,void*>::gather(comm, in, out, root);
-  }
-
-  //! Simplified version (without `out`) for use on non-root processes.
-  template<class T>
-  void      gather(const communicator& comm, const T& in, int root)
-  {
-    Collectives<T,void*>::gather(comm, in, root);
-  }
-
-  //! Simplified version (without `out`) for use on non-root processes.
-  template<class T>
-  void      gather(const communicator& comm, const std::vector<T>& in, int root)
-  {
-    Collectives<T,void*>::gather(comm, in, root);
-  }
-
-  //! all_gather from all processes in `comm`.
-  //! `out` is resized to `comm.size()` and filled with
-  //! elements from the respective ranks.
-  template<class T>
-  void      all_gather(const communicator& comm, const T& in, std::vector<T>& out)
-  {
-    Collectives<T,void*>::all_gather(comm, in, out);
-  }
-
-  //! Same as above, but for vectors.
-  template<class T>
-  void      all_gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out)
-  {
-    Collectives<T,void*>::all_gather(comm, in, out);
-  }
-
-  //! reduce
-  template<class T, class Op>
-  void      reduce(const communicator& comm, const T& in, T& out, int root, const Op& op)
-  {
-    Collectives<T, Op>::reduce(comm, in, out, root, op);
-  }
-
-  //! Simplified version (without `out`) for use on non-root processes.
-  template<class T, class Op>
-  void      reduce(const communicator& comm, const T& in, int root, const Op& op)
-  {
-    Collectives<T, Op>::reduce(comm, in, root, op);
-  }
-
-  //! all_reduce
-  template<class T, class Op>
-  void      all_reduce(const communicator& comm, const T& in, T& out, const Op& op)
-  {
-    Collectives<T, Op>::all_reduce(comm, in, out, op);
-  }
-
-  //! Same as above, but for vectors.
-  template<class T, class Op>
-  void      all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op& op)
-  {
-    Collectives<T, Op>::all_reduce(comm, in, out, op);
-  }
-
-  //! scan
-  template<class T, class Op>
-  void      scan(const communicator& comm, const T& in, T& out, const Op& op)
-  {
-    Collectives<T, Op>::scan(comm, in, out, op);
-  }
-
-  //! all_to_all
-  template<class T>
-  void      all_to_all(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, int n = 1)
-  {
-    Collectives<T, void*>::all_to_all(comm, in, out, n);
-  }
-
-  //!@}
-}
-}
diff --git a/diy/include/diy/mpi/communicator.hpp b/diy/include/diy/mpi/communicator.hpp
deleted file mode 100644
index d1bdf33f7..000000000
--- a/diy/include/diy/mpi/communicator.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-namespace diy
-{
-namespace mpi
-{
-
-  //! \ingroup MPI
-  //! Simple wrapper around `MPI_Comm`.
-  class communicator
-  {
-    public:
-                communicator(MPI_Comm comm = MPI_COMM_WORLD):
-                  comm_(comm), rank_(0), size_(1) { if (comm != MPI_COMM_NULL) { MPI_Comm_rank(comm_, &rank_); MPI_Comm_size(comm_, &size_); } }
-
-      int       rank() const                        { return rank_; }
-      int       size() const                        { return size_; }
-
-      //void      send(int dest,
-      //               int tag,
-      //               const void* buf,
-      //               MPI_Datatype datatype) const   { }
-
-      //! Send `x` to processor `dest` using `tag` (blocking).
-      template<class T>
-      void      send(int dest, int tag, const T& x) const   { detail::send<T>()(comm_, dest, tag, x); }
-
-      //! Receive `x` from `dest` using `tag` (blocking).
-      //! If `T` is an `std::vector<...>`, `recv` will resize it to fit exactly the sent number of values.
-      template<class T>
-      status    recv(int source, int tag, T& x) const       { return detail::recv<T>()(comm_, source, tag, x); }
-
-      //! Non-blocking version of `send()`.
-      template<class T>
-      request   isend(int dest, int tag, const T& x) const  { return detail::isend<T>()(comm_, dest, tag, x); }
-
-      //! Non-blocking version of `recv()`.
-      //! If `T` is an `std::vector<...>`, its size must be big enough to accomodate the sent values.
-      template<class T>
-      request   irecv(int source, int tag, T& x) const      { return detail::irecv<T>()(comm_, source, tag, x); }
-
-      //! probe
-      status    probe(int source, int tag) const            { status s; MPI_Probe(source, tag, comm_, &s.s); return s; }
-
-      //! iprobe
-      inline
-      optional<status>
-                iprobe(int source, int tag) const;
-
-      //! barrier
-      void      barrier() const                             { MPI_Barrier(comm_); }
-
-                operator MPI_Comm() const                   { return comm_; }
-
-    private:
-      MPI_Comm  comm_;
-      int       rank_;
-      int       size_;
-  };
-}
-}
-
-diy::mpi::optional<diy::mpi::status>
-diy::mpi::communicator::
-iprobe(int source, int tag) const
-{
-  status s;
-  int flag;
-  MPI_Iprobe(source, tag, comm_, &flag, &s.s);
-  if (flag)
-    return s;
-  return optional<status>();
-}
-
diff --git a/diy/include/diy/mpi/constants.hpp b/diy/include/diy/mpi/constants.hpp
deleted file mode 100644
index 7668e418f..000000000
--- a/diy/include/diy/mpi/constants.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef DIY_MPI_CONSTANTS_HPP
-#define DIY_MPI_CONSTANTS_HPP
-
-namespace diy
-{
-namespace mpi
-{
-  const int any_source  = MPI_ANY_SOURCE;
-  const int any_tag     = MPI_ANY_TAG;
-}
-}
-
-#endif
diff --git a/diy/include/diy/mpi/datatypes.hpp b/diy/include/diy/mpi/datatypes.hpp
deleted file mode 100644
index 7d8e3a448..000000000
--- a/diy/include/diy/mpi/datatypes.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef DIY_MPI_DATATYPES_HPP
-#define DIY_MPI_DATATYPES_HPP
-
-#include <vector>
-
-namespace diy
-{
-namespace mpi
-{
-namespace detail
-{
-  template<class T> MPI_Datatype  get_mpi_datatype();
-
-  struct true_type  {};
-  struct false_type {};
-
-  /* is_mpi_datatype */
-  template<class T>
-  struct is_mpi_datatype        { typedef false_type    type; };
-
-#define DIY_MPI_DATATYPE_MAP(cpp_type, mpi_type) \
-  template<>  inline MPI_Datatype  get_mpi_datatype<cpp_type>() { return mpi_type; }  \
-  template<>  struct is_mpi_datatype<cpp_type>                  { typedef true_type type; };    \
-  template<>  struct is_mpi_datatype< std::vector<cpp_type> >   { typedef true_type type; };
-
-  DIY_MPI_DATATYPE_MAP(char,                  MPI_BYTE);
-  DIY_MPI_DATATYPE_MAP(unsigned char,         MPI_BYTE);
-  DIY_MPI_DATATYPE_MAP(bool,                  MPI_BYTE);
-  DIY_MPI_DATATYPE_MAP(int,                   MPI_INT);
-  DIY_MPI_DATATYPE_MAP(unsigned,              MPI_UNSIGNED);
-  DIY_MPI_DATATYPE_MAP(long,                  MPI_LONG);
-  DIY_MPI_DATATYPE_MAP(unsigned long,         MPI_UNSIGNED_LONG);
-  DIY_MPI_DATATYPE_MAP(long long,             MPI_LONG_LONG_INT);
-  DIY_MPI_DATATYPE_MAP(unsigned long long,    MPI_UNSIGNED_LONG_LONG);
-  DIY_MPI_DATATYPE_MAP(float,                 MPI_FLOAT);
-  DIY_MPI_DATATYPE_MAP(double,                MPI_DOUBLE);
-
-  /* mpi_datatype: helper routines, specialized for std::vector<...> */
-  template<class T>
-  struct mpi_datatype
-  {
-    static MPI_Datatype         datatype()              { return get_mpi_datatype<T>(); }
-    static const void*          address(const T& x)     { return &x; }
-    static void*                address(T& x)           { return &x; }
-    static int                  count(const T& x)       { return 1; }
-  };
-
-  template<class U>
-  struct mpi_datatype< std::vector<U> >
-  {
-    typedef     std::vector<U>      VecU;
-
-    static MPI_Datatype         datatype()              { return get_mpi_datatype<U>(); }
-    static const void*          address(const VecU& x)  { return &x[0]; }
-    static void*                address(VecU& x)        { return &x[0]; }
-    static int                  count(const VecU& x)    { return x.size(); }
-  };
-
-}
-}
-}
-
-#endif
diff --git a/diy/include/diy/mpi/io.hpp b/diy/include/diy/mpi/io.hpp
deleted file mode 100644
index ebe6a2e17..000000000
--- a/diy/include/diy/mpi/io.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef DIY_MPI_IO_HPP
-#define DIY_MPI_IO_HPP
-
-#include <vector>
-#include <string>
-
-namespace diy
-{
-namespace mpi
-{
-namespace io
-{
-  typedef               MPI_Offset              offset;
-
-  //! Wraps MPI file IO. \ingroup MPI
-  class file
-  {
-    public:
-      enum
-      {
-        rdonly          = MPI_MODE_RDONLY,
-        rdwr            = MPI_MODE_RDWR,
-        wronly          = MPI_MODE_WRONLY,
-        create          = MPI_MODE_CREATE,
-        exclusive       = MPI_MODE_EXCL,
-        delete_on_close = MPI_MODE_DELETE_ON_CLOSE,
-        unique_open     = MPI_MODE_UNIQUE_OPEN,
-        sequential      = MPI_MODE_SEQUENTIAL,
-        append          = MPI_MODE_APPEND
-      };
-
-    public:
-                    file(const communicator&    comm,
-                         const std::string&     filename,
-                         int                    mode):
-                        comm_(comm)                         { MPI_File_open(comm, const_cast<char*>(filename.c_str()), mode, MPI_INFO_NULL, &fh); }
-                    ~file()                                 { close(); }
-      void          close()                                 { if (fh != MPI_FILE_NULL) MPI_File_close(&fh); }
-
-      offset        size() const                            { offset sz; MPI_File_get_size(fh, &sz); return sz; }
-      void          resize(offset size)                     { MPI_File_set_size(fh, size); }
-
-      inline void   read_at(offset o, char* buffer, size_t size);
-      inline void   read_at_all(offset o, char* buffer, size_t size);
-      inline void   write_at(offset o, const char* buffer, size_t size);
-      inline void   write_at_all(offset o, const char* buffer, size_t size);
-
-      template<class T>
-      inline void   read_at(offset o, std::vector<T>& data);
-
-      template<class T>
-      inline void   read_at_all(offset o, std::vector<T>& data);
-
-      template<class T>
-      inline void   write_at(offset o, const std::vector<T>& data);
-
-      template<class T>
-      inline void   write_at_all(offset o, const std::vector<T>& data);
-
-      const communicator&
-                    comm() const                            { return comm_; }
-
-      MPI_File&     handle()                                { return fh; }
-
-    private:
-      const communicator&   comm_;
-      MPI_File              fh;
-  };
-}
-}
-}
-
-void
-diy::mpi::io::file::
-read_at(offset o, char* buffer, size_t size)
-{
-  status s;
-  MPI_File_read_at(fh, o, buffer, size, detail::get_mpi_datatype<char>(), &s.s);
-}
-
-template<class T>
-void
-diy::mpi::io::file::
-read_at(offset o, std::vector<T>& data)
-{
-  read_at(o, &data[0], data.size()*sizeof(T));
-}
-
-void
-diy::mpi::io::file::
-read_at_all(offset o, char* buffer, size_t size)
-{
-  status s;
-  MPI_File_read_at_all(fh, o, buffer, size, detail::get_mpi_datatype<char>(), &s.s);
-}
-
-template<class T>
-void
-diy::mpi::io::file::
-read_at_all(offset o, std::vector<T>& data)
-{
-  read_at_all(o, (char*) &data[0], data.size()*sizeof(T));
-}
-
-void
-diy::mpi::io::file::
-write_at(offset o, const char* buffer, size_t size)
-{
-  status s;
-  MPI_File_write_at(fh, o, (void *)buffer, size, detail::get_mpi_datatype<char>(), &s.s);
-}
-
-template<class T>
-void
-diy::mpi::io::file::
-write_at(offset o, const std::vector<T>& data)
-{
-  write_at(o, (const char*) &data[0], data.size()*sizeof(T));
-}
-
-void
-diy::mpi::io::file::
-write_at_all(offset o, const char* buffer, size_t size)
-{
-  status s;
-  MPI_File_write_at_all(fh, o, (void *)buffer, size, detail::get_mpi_datatype<char>(), &s.s);
-}
-
-template<class T>
-void
-diy::mpi::io::file::
-write_at_all(offset o, const std::vector<T>& data)
-{
-  write_at_all(o, &data[0], data.size()*sizeof(T));
-}
-
-#endif
diff --git a/diy/include/diy/mpi/operations.hpp b/diy/include/diy/mpi/operations.hpp
deleted file mode 100644
index 2f95c0a72..000000000
--- a/diy/include/diy/mpi/operations.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <functional>
-
-namespace diy
-{
-namespace mpi
-{
-  //! \addtogroup MPI
-  //!@{
-  template<class U>
-  struct maximum { const U& operator()(const U& x, const U& y) const { return std::max(x,y); } };
-  template<class U>
-  struct minimum { const U& operator()(const U& x, const U& y) const { return std::min(x,y); } };
-  //!@}
-
-namespace detail
-{
-  template<class T> struct mpi_op                           { static MPI_Op  get(); };
-  template<class U> struct mpi_op< maximum<U> >             { static MPI_Op  get() { return MPI_MAX; }  };
-  template<class U> struct mpi_op< minimum<U> >             { static MPI_Op  get() { return MPI_MIN; }  };
-  template<class U> struct mpi_op< std::plus<U> >           { static MPI_Op  get() { return MPI_SUM; }  };
-  template<class U> struct mpi_op< std::multiplies<U> >     { static MPI_Op  get() { return MPI_PROD; }  };
-  template<class U> struct mpi_op< std::logical_and<U> >    { static MPI_Op  get() { return MPI_LAND; }  };
-  template<class U> struct mpi_op< std::logical_or<U> >     { static MPI_Op  get() { return MPI_LOR; }  };
-}
-}
-}
diff --git a/diy/include/diy/mpi/optional.hpp b/diy/include/diy/mpi/optional.hpp
deleted file mode 100644
index ab58aaf81..000000000
--- a/diy/include/diy/mpi/optional.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-namespace diy
-{
-namespace mpi
-{
-  template<class T>
-  struct optional
-  {
-                optional():
-                  init_(false)                  {}
-
-                optional(const T& v):
-                  init_(true)                   { new(buf_) T(v); }
-
-                optional(const optional& o):
-                  init_(o.init_)                { if (init_) new(buf_) T(*o);  }
-
-                ~optional()                     { if (init_) clear(); }
-
-    inline
-    optional&   operator=(const optional& o);
-
-                operator bool() const           { return init_; }
-
-    T&          operator*()                     { return *static_cast<T*>(address()); }
-    const T&    operator*() const               { return *static_cast<const T*>(address()); }
-
-    T*          operator->()                    { return &(operator*()); }
-    const T*    operator->() const              { return &(operator*()); }
-
-    private:
-      void      clear()                         { static_cast<T*>(address())->~T(); }
-
-      void*         address()                   { return buf_; }
-      const void*   address() const             { return buf_; }
-
-    private:
-      bool init_;
-      char buf_[sizeof(T)];
-  };
-}
-}
-
-template<class T>
-diy::mpi::optional<T>&
-diy::mpi::optional<T>::
-operator=(const optional& o)
-{
-  if (init_)
-    clear();
-  init_ = o.init_;
-  if (init_)
-    new (buf_) T(*o);
-
-  return *this;
-}
diff --git a/diy/include/diy/mpi/point-to-point.hpp b/diy/include/diy/mpi/point-to-point.hpp
deleted file mode 100644
index dc8a341dc..000000000
--- a/diy/include/diy/mpi/point-to-point.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-#include <vector>
-
-namespace diy
-{
-namespace mpi
-{
-namespace detail
-{
-  // send
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct send;
-
-  template<class T>
-  struct send<T, true_type>
-  {
-    void operator()(MPI_Comm comm, int dest, int tag, const T& x) const
-    {
-      typedef       mpi_datatype<T>     Datatype;
-      MPI_Send((void*) Datatype::address(x),
-               Datatype::count(x),
-               Datatype::datatype(),
-               dest, tag, comm);
-    }
-  };
-
-  // recv
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct recv;
-
-  template<class T>
-  struct recv<T, true_type>
-  {
-    status operator()(MPI_Comm comm, int source, int tag, T& x) const
-    {
-      typedef       mpi_datatype<T>     Datatype;
-      status s;
-      MPI_Recv((void*) Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(),
-                source, tag, comm, &s.s);
-      return s;
-    }
-  };
-
-  template<class U>
-  struct recv<std::vector<U>, true_type>
-  {
-    status operator()(MPI_Comm comm, int source, int tag, std::vector<U>& x) const
-    {
-      status s;
-
-      MPI_Probe(source, tag, comm, &s.s);
-      x.resize(s.count<U>());
-      MPI_Recv(&x[0], x.size(), get_mpi_datatype<U>(), source, tag, comm, &s.s);
-      return s;
-    }
-  };
-
-  // isend
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct isend;
-
-  template<class T>
-  struct isend<T, true_type>
-  {
-    request operator()(MPI_Comm comm, int dest, int tag, const T& x) const
-    {
-      request r;
-      typedef       mpi_datatype<T>     Datatype;
-      MPI_Isend((void*) Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(),
-                dest, tag, comm, &r.r);
-      return r;
-    }
-  };
-
-  // irecv
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct irecv;
-
-  template<class T>
-  struct irecv<T, true_type>
-  {
-    request operator()(MPI_Comm comm, int source, int tag, T& x) const
-    {
-      request r;
-      typedef       mpi_datatype<T>     Datatype;
-      MPI_Irecv(Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(),
-                source, tag, comm, &r.r);
-      return r;
-    }
-  };
-}
-}
-}
diff --git a/diy/include/diy/mpi/request.hpp b/diy/include/diy/mpi/request.hpp
deleted file mode 100644
index 23b11816e..000000000
--- a/diy/include/diy/mpi/request.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-namespace diy
-{
-namespace mpi
-{
-  struct request
-  {
-    status              wait()              { status s; MPI_Wait(&r, &s.s); return s; }
-    inline
-    optional<status>    test();
-    void                cancel()            { MPI_Cancel(&r); }
-
-    MPI_Request         r;
-  };
-}
-}
-
-diy::mpi::optional<diy::mpi::status>
-diy::mpi::request::test()
-{
-  status s;
-  int flag;
-  MPI_Test(&r, &flag, &s.s);
-  if (flag)
-    return s;
-  return optional<status>();
-}
diff --git a/diy/include/diy/mpi/status.hpp b/diy/include/diy/mpi/status.hpp
deleted file mode 100644
index aab500c31..000000000
--- a/diy/include/diy/mpi/status.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-namespace diy
-{
-namespace mpi
-{
-  struct status
-  {
-    int             source() const          { return s.MPI_SOURCE; }
-    int             tag() const             { return s.MPI_TAG; }
-    int             error() const           { return s.MPI_ERROR; }
-    bool            cancelled() const       { int flag; MPI_Test_cancelled(const_cast<MPI_Status*>(&s), &flag); return flag; }
-
-    template<class T>
-    int             count() const;
-
-                    operator MPI_Status&()              { return s; }
-                    operator const MPI_Status&() const  { return s; }
-
-    MPI_Status      s;
-  };
-}
-}
-
-template<class T>
-int
-diy::mpi::status::count() const
-{
-  int c;
-  MPI_Get_count(const_cast<MPI_Status*>(&s), detail::get_mpi_datatype<T>(), &c);
-  return c;
-}
diff --git a/diy/include/diy/no-thread.hpp b/diy/include/diy/no-thread.hpp
deleted file mode 100644
index fd7af88ae..000000000
--- a/diy/include/diy/no-thread.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef DIY_NO_THREAD_HPP
-#define DIY_NO_THREAD_HPP
-
-// replicates only the parts of the threading interface that we use
-// executes everything in a single thread
-
-namespace diy
-{
-  struct thread
-  {
-                        thread(void (*f)(void *), void* args):
-                            f_(f), args_(args)                    {}
-
-    void                join()                                    { f_(args_); }
-
-    static unsigned     hardware_concurrency()                    { return 1; }
-
-    void (*f_)(void*);
-    void*   args_;
-  };
-
-  struct mutex {};
-  struct fast_mutex {};
-  struct recursive_mutex {};
-
-  template<class T>
-  struct lock_guard
-  {
-      lock_guard(T&)        {}
-  };
-
-  namespace this_thread
-  {
-      inline unsigned long int  get_id()    { return 0; }
-  }
-}
-
-#endif
diff --git a/diy/include/diy/partners/all-reduce.hpp b/diy/include/diy/partners/all-reduce.hpp
deleted file mode 100644
index e34066595..000000000
--- a/diy/include/diy/partners/all-reduce.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef DIY_PARTNERS_ALL_REDUCE_HPP
-#define DIY_PARTNERS_ALL_REDUCE_HPP
-
-#include "merge.hpp"
-
-namespace diy
-{
-
-class Master;
-
-//! Allreduce (reduction with results broadcasted to all blocks) is
-//! implemented as two merge reductions, with incoming and outgoing items swapped in second one.
-//! Ie, follows merge reduction up and down the merge tree
-
-/**
- * \ingroup Communication
- * \brief Partners for all-reduce
- *
- */
-struct RegularAllReducePartners: public RegularMergePartners
-{
-  typedef       RegularMergePartners                            Parent; //!< base class merge reduction
-
-                //! contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
-                //! contiguous is useful when data needs to be united;
-                //! round-robin is useful for vector-"halving"
-  template<class Decomposer>
-                RegularAllReducePartners(const Decomposer& decomposer,  //!< domain decomposition
-                                         int k,                         //!< target k value
-                                         bool contiguous = true         //!< distance doubling (true) or halving (false)
-                    ):
-                  Parent(decomposer, k, contiguous)         {}
-                RegularAllReducePartners(const DivisionVector&   divs,//!< explicit division vector
-                                         const KVSVector&        kvs, //!< explicit k vector
-                                         bool  contiguous = true      //!< distance doubling (true) or halving (false)
-                    ):
-                  Parent(divs, kvs, contiguous)               {}
-
-  //! returns total number of rounds
-  size_t        rounds() const                                  { return 2*Parent::rounds(); }
-  //! returns size of a group of partners in a given round
-  int           size(int round) const                           { return Parent::size(parent_round(round)); }
-  //! returns dimension (direction of partners in a regular grid) in a given round
-  int           dim(int round) const                            { return Parent::dim(parent_round(round)); }
-  //! returns whether a given block in a given round has dropped out of the merge yet or not
-  inline bool   active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); }
-  //! returns what the current round would be in the first or second parent merge reduction
-  int           parent_round(int round) const                   { return round < (int) Parent::rounds() ? round : rounds() - round; }
-
-  // incoming is only valid for an active gid; it will only be called with an active gid
-  inline void   incoming(int round, int gid, std::vector<int>& partners, const Master& m) const
-  {
-      if (round <= (int) Parent::rounds())
-          Parent::incoming(round, gid, partners, m);
-      else
-          Parent::outgoing(parent_round(round), gid, partners, m);
-  }
-
-  inline void   outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const
-  {
-      if (round < (int) Parent::rounds())
-          Parent::outgoing(round, gid, partners, m);
-      else
-          Parent::incoming(parent_round(round), gid, partners, m);
-  }
-};
-
-} // diy
-
-#endif
-
-
diff --git a/diy/include/diy/partners/broadcast.hpp b/diy/include/diy/partners/broadcast.hpp
deleted file mode 100644
index d3f565f82..000000000
--- a/diy/include/diy/partners/broadcast.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef DIY_PARTNERS_BROADCAST_HPP
-#define DIY_PARTNERS_BROADCAST_HPP
-
-#include "merge.hpp"
-
-namespace diy
-{
-
-class Master;
-
-/**
- * \ingroup Communication
- * \brief Partners for broadcast
- *
- */
-struct RegularBroadcastPartners: public RegularMergePartners
-{
-  typedef       RegularMergePartners                            Parent; //!< base class merge reduction
-
-                //! contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
-                //! contiguous is useful when data needs to be united;
-                //! round-robin is useful for vector-"halving"
-  template<class Decomposer>
-                RegularBroadcastPartners(const Decomposer& decomposer,  //!< domain decomposition
-                                         int k,                         //!< target k value
-                                         bool contiguous = true         //!< distance doubling (true) or halving (false)
-                    ):
-                  Parent(decomposer, k, contiguous)         {}
-                RegularBroadcastPartners(const DivisionVector&   divs,//!< explicit division vector
-                                         const KVSVector&        kvs, //!< explicit k vector
-                                         bool  contiguous = true      //!< distance doubling (true) or halving (false)
-                    ):
-                  Parent(divs, kvs, contiguous)               {}
-
-  //! returns total number of rounds
-  size_t        rounds() const                                  { return Parent::rounds(); }
-  //! returns size of a group of partners in a given round
-  int           size(int round) const                           { return Parent::size(parent_round(round)); }
-  //! returns dimension (direction of partners in a regular grid) in a given round
-  int           dim(int round) const                            { return Parent::dim(parent_round(round)); }
-  //! returns whether a given block in a given round has dropped out of the merge yet or not
-  inline bool   active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); }
-  //! returns what the current round would be in the first or second parent merge reduction
-  int           parent_round(int round) const                   { return rounds() - round; }
-
-  // incoming is only valid for an active gid; it will only be called with an active gid
-  inline void   incoming(int round, int gid, std::vector<int>& partners, const Master& m) const
-  {
-      Parent::outgoing(parent_round(round), gid, partners, m);
-  }
-
-  inline void   outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const
-  {
-      Parent::incoming(parent_round(round), gid, partners, m);
-  }
-};
-
-} // diy
-
-#endif
-
-
diff --git a/diy/include/diy/partners/common.hpp b/diy/include/diy/partners/common.hpp
deleted file mode 100644
index 43f8297a0..000000000
--- a/diy/include/diy/partners/common.hpp
+++ /dev/null
@@ -1,204 +0,0 @@
-#ifndef DIY_PARTNERS_COMMON_HPP
-#define DIY_PARTNERS_COMMON_HPP
-
-#include "../decomposition.hpp"
-#include "../types.hpp"
-
-namespace diy
-{
-
-struct RegularPartners
-{
-  // The record of group size per round in a dimension
-  struct DimK
-  {
-            DimK(int dim_, int k_):
-                dim(dim_), size(k_)               {}
-
-    int dim;
-    int size;           // group size
-  };
-
-  typedef       std::vector<int>                    CoordVector;
-  typedef       std::vector<int>                    DivisionVector;
-  typedef       std::vector<DimK>                   KVSVector;
-
-  // The part of RegularDecomposer that we need works the same with either Bounds (so we fix them arbitrarily)
-  typedef       DiscreteBounds                      Bounds;
-  typedef       RegularDecomposer<Bounds>           Decomposer;
-
-  template<class Decomposer_>
-                RegularPartners(const Decomposer_& decomposer, int k, bool contiguous = true):
-                  divisions_(decomposer.divisions),
-                  contiguous_(contiguous)                       { factor(k, divisions_, kvs_); fill_steps(); }
-                RegularPartners(const DivisionVector&   divs,
-                                const KVSVector&        kvs,
-                                bool  contiguous = true):
-                  divisions_(divs), kvs_(kvs),
-                  contiguous_(contiguous)                       { fill_steps(); }
-
-  size_t        rounds() const                                  { return kvs_.size(); }
-  int           size(int round) const                           { return kvs_[round].size; }
-  int           dim(int round) const                            { return kvs_[round].dim; }
-
-  int           step(int round) const                           { return steps_[round]; }
-
-  const DivisionVector&     divisions() const                   { return divisions_; }
-  const KVSVector&          kvs() const                         { return kvs_; }
-  bool                      contiguous() const                  { return contiguous_; }
-
-  static
-  inline void   factor(int k, const DivisionVector& divisions, KVSVector& kvs);
-
-  inline void   fill(int round, int gid, std::vector<int>& partners) const;
-  inline int    group_position(int round, int c, int step) const;
-
-  private:
-    inline void fill_steps();
-    static
-    inline void factor(int k, int tot_b, std::vector<int>& kvs);
-
-    DivisionVector      divisions_;
-    KVSVector           kvs_;
-    bool                contiguous_;
-    std::vector<int>    steps_;
-};
-
-}
-
-void
-diy::RegularPartners::
-fill_steps()
-{
-  if (contiguous_)
-  {
-    std::vector<int>    cur_steps(divisions().size(), 1);
-
-    for (size_t r = 0; r < rounds(); ++r)
-    {
-      steps_.push_back(cur_steps[kvs_[r].dim]);
-      cur_steps[kvs_[r].dim] *= kvs_[r].size;
-    }
-  } else
-  {
-    std::vector<int>    cur_steps(divisions().begin(), divisions().end());
-    for (size_t r = 0; r < rounds(); ++r)
-    {
-      cur_steps[kvs_[r].dim] /= kvs_[r].size;
-      steps_.push_back(cur_steps[kvs_[r].dim]);
-    }
-  }
-}
-
-void
-diy::RegularPartners::
-fill(int round, int gid, std::vector<int>& partners) const
-{
-  const DimK&   kv  = kvs_[round];
-  partners.reserve(kv.size);
-
-  int step = this->step(round);       // gids jump by this much in the current round
-
-  CoordVector   coords;
-  Decomposer::gid_to_coords(gid, coords, divisions_);
-  int c   = coords[kv.dim];
-  int pos = group_position(round, c, step);
-
-  int partner = c - pos * step;
-  coords[kv.dim] = partner;
-  int partner_gid = Decomposer::coords_to_gid(coords, divisions_);
-  partners.push_back(partner_gid);
-
-  for (int k = 1; k < kv.size; ++k)
-  {
-    partner += step;
-    coords[kv.dim] = partner;
-    int partner_gid = Decomposer::coords_to_gid(coords, divisions_);
-    partners.push_back(partner_gid);
-  }
-}
-
-// Tom's GetGrpPos
-int
-diy::RegularPartners::
-group_position(int round, int c, int step) const
-{
-  // the second term in the following expression does not simplify to
-  // (gid - start_b) / kv[r]
-  // because the division gid / (step * kv[r]) is integer and truncates
-  // this is exactly what we want
-  int g = c % step + c / (step * kvs_[round].size) * step;
-  int p = c / step % kvs_[round].size;
-  static_cast<void>(g);        // shut up the compiler
-
-  // g: group number (output)
-  // p: position number within the group (output)
-  return p;
-}
-
-void
-diy::RegularPartners::
-factor(int k, const DivisionVector& divisions, KVSVector& kvs)
-{
-  // factor in each dimension
-  std::vector< std::vector<int> >       tmp_kvs(divisions.size());
-  for (unsigned i = 0; i < divisions.size(); ++i)
-    factor(k, divisions[i], tmp_kvs[i]);
-
-  // interleave the dimensions
-  std::vector<int>  round_per_dim(divisions.size(), 0);
-  while(true)
-  {
-    // TODO: not the most efficient way to do this
-    bool changed = false;
-    for (unsigned i = 0; i < divisions.size(); ++i)
-    {
-      if (round_per_dim[i] == (int) tmp_kvs[i].size())
-        continue;
-      kvs.push_back(DimK(i, tmp_kvs[i][round_per_dim[i]++]));
-      changed = true;
-    }
-    if (!changed)
-        break;
-  }
-}
-
-// Tom's FactorK
-void
-diy::RegularPartners::
-factor(int k, int tot_b, std::vector<int>& kv)
-{
-  int rem = tot_b; // unfactored remaining portion of tot_b
-  int j;
-
-  while (rem > 1)
-  {
-    // remainder is divisible by k
-    if (rem % k == 0)
-    {
-      kv.push_back(k);
-      rem /= k;
-    }
-    // if not, start at k and linearly look for smaller factors down to 2
-    else
-    {
-      for (j = k - 1; j > 1; j--)
-      {
-        if (rem % j == 0)
-        {
-          kv.push_back(j);
-          rem /= k;
-          break;
-        }
-      }
-      if (j == 1)
-      {
-        kv.push_back(rem);
-        rem = 1;
-      }
-    } // else
-  } // while
-}
-
-
-#endif
diff --git a/diy/include/diy/partners/merge.hpp b/diy/include/diy/partners/merge.hpp
deleted file mode 100644
index c6be42533..000000000
--- a/diy/include/diy/partners/merge.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef DIY_PARTNERS_MERGE_HPP
-#define DIY_PARTNERS_MERGE_HPP
-
-#include "common.hpp"
-
-namespace diy
-{
-
-class Master;
-
-/**
- * \ingroup Communication
- * \brief Partners for merge-reduce
- *
- */
-struct RegularMergePartners: public RegularPartners
-{
-  typedef       RegularPartners                                 Parent;
-
-                // contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
-                // contiguous is useful when data needs to be united;
-                // round-robin is useful for vector-"halving"
-  template<class Decomposer>
-                RegularMergePartners(const Decomposer& decomposer,  //!< domain decomposition
-                                     int k,                         //!< target k value
-                                     bool contiguous = true         //!< distance doubling (true) or halving (false)
-                    ):
-                    Parent(decomposer, k, contiguous)           {}
-                RegularMergePartners(const DivisionVector&   divs, //!< explicit division vector
-                                     const KVSVector&        kvs,  //!< explicit k vector
-                                     bool  contiguous = true       //!< distance doubling (true) or halving (false)
-                    ):
-                    Parent(divs, kvs, contiguous)               {}
-
-  inline bool   active(int round, int gid, const Master&) const;
-
-  // incoming is only valid for an active gid; it will only be called with an active gid
-  inline void   incoming(int round, int gid, std::vector<int>& partners, const Master&) const    { Parent::fill(round - 1, gid, partners); }
-  // this is a lazy implementation of outgoing, but it reuses the existing code
-  inline void   outgoing(int round, int gid, std::vector<int>& partners, const Master&) const    { std::vector<int> tmp; Parent::fill(round, gid, tmp); partners.push_back(tmp[0]); }
-};
-
-} // diy
-
-bool
-diy::RegularMergePartners::
-active(int round, int gid, const Master&) const
-{
-  CoordVector   coords;
-  Decomposer::gid_to_coords(gid, coords, divisions());
-
-  for (int r = 0; r < round; ++r)
-      if (Parent::group_position(r, coords[kvs()[r].dim], step(r)) != 0)
-          return false;
-
-  return true;
-}
-
-#endif
-
diff --git a/diy/include/diy/partners/swap.hpp b/diy/include/diy/partners/swap.hpp
deleted file mode 100644
index cc3b3e494..000000000
--- a/diy/include/diy/partners/swap.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef DIY_PARTNERS_SWAP_HPP
-#define DIY_PARTNERS_SWAP_HPP
-
-#include "common.hpp"
-
-namespace diy
-{
-
-class Master;
-
-/**
- * \ingroup Communication
- * \brief Partners for swap-reduce
- *
- */
-struct RegularSwapPartners: public RegularPartners
-{
-  typedef       RegularPartners                                 Parent;
-
-                // contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
-                // contiguous is useful when data needs to be united;
-                // round-robin is useful for vector-"halving"
-  template<class Decomposer>
-                RegularSwapPartners(const Decomposer& decomposer,   //!< domain decomposition
-                                    int k,                          //!< target k value
-                                    bool contiguous = true          //!< distance halving (true) or doubling (false)
-                    ):
-                    Parent(decomposer, k, contiguous)         {}
-                RegularSwapPartners(const DivisionVector&   divs, //!< explicit division vector
-                                    const KVSVector&        kvs,  //!< explicit k vector
-                                    bool  contiguous = true       //!< distance halving (true) or doubling (false)
-                    ):
-                    Parent(divs, kvs, contiguous)               {}
-
-  bool          active(int round, int gid, const Master&) const                                 { return true; }    // in swap-reduce every block is always active
-
-  void          incoming(int round, int gid, std::vector<int>& partners, const Master&) const   { Parent::fill(round - 1, gid, partners); }
-  void          outgoing(int round, int gid, std::vector<int>& partners, const Master&) const   { Parent::fill(round, gid, partners); }
-};
-
-} // diy
-
-#endif
diff --git a/diy/include/diy/pick.hpp b/diy/include/diy/pick.hpp
deleted file mode 100644
index 5f9d8d0e8..000000000
--- a/diy/include/diy/pick.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef DIY_PICK_HPP
-#define DIY_PICK_HPP
-
-#include "link.hpp"
-
-namespace diy
-{
-    template<class Bounds, class Point, class OutIter>
-    void near(const RegularLink<Bounds>& link, const Point& p, float r, OutIter out,
-              const Bounds& domain);
-
-    template<class Bounds, class Point, class OutIter>
-    void in(const RegularLink<Bounds>& link, const Point& p, OutIter out, const Bounds& domain);
-
-    template<class Point, class Bounds>
-    float distance(int dim, const Bounds& bounds, const Point& p);
-
-    template<class Bounds>
-    inline
-    float distance(int dim, const Bounds& bounds1, const Bounds& bounds2);
-
-    template<class Bounds>
-    void wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim);
-}
-
-//! Finds the neighbors within radius r of a target point.
-template<class Bounds, class Point, class OutIter>
-void
-diy::
-near(const RegularLink<Bounds>& link,  //!< neighbors
-     const Point& p,                   //!< target point (must be in current block)
-     float r,                          //!< target radius (>= 0.0)
-     OutIter out,                      //!< insert iterator for output set of neighbors
-     const Bounds& domain)             //!< global domain bounds
-{
-  Bounds neigh_bounds; // neighbor block bounds
-
-  // for all neighbors of this block
-  for (int n = 0; n < link.size(); n++)
-  {
-    // wrap neighbor bounds, if necessary, otherwise bounds will be unchanged
-    neigh_bounds = link.bounds(n);
-    wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension());
-
-    if (distance(link.dimension(), neigh_bounds, p) <= r)
-        *out++ = n;
-  } // for all neighbors
-}
-
-//! Find the distance between point `p` and box `bounds`.
-template<class Point, class Bounds>
-float
-diy::
-distance(int dim, const Bounds& bounds, const Point& p)
-{
-    float res = 0;
-    for (int i = 0; i < dim; ++i)
-    {
-        // avoids all the annoying case logic by finding
-        // diff = max(bounds.min[i] - p[i], 0, p[i] - bounds.max[i])
-        float diff = 0, d;
-
-        d = bounds.min[i] - p[i];
-        if (d > diff) diff = d;
-        d = p[i] - bounds.max[i];
-        if (d > diff) diff = d;
-
-        res += diff*diff;
-    }
-    return sqrt(res);
-}
-
-template<class Bounds>
-float
-diy::
-distance(int dim, const Bounds& bounds1, const Bounds& bounds2)
-{
-    float res = 0;
-    for (int i = 0; i < dim; ++i)
-    {
-        float diff = 0, d;
-
-        float d1 = bounds1.max[i] - bounds2.min[i];
-        float d2 = bounds2.max[i] - bounds1.min[i];
-
-        if (d1 > 0 && d2 > 0)
-            diff = 0;
-        else if (d1 <= 0)
-            diff = -d1;
-        else if (d2 <= 0)
-            diff = -d2;
-
-        res += diff*diff;
-    }
-    return sqrt(res);
-}
-
-//! Finds the neighbor(s) containing the target point.
-template<class Bounds, class Point, class OutIter>
-void
-diy::
-in(const RegularLink<Bounds>& link,  //!< neighbors
-   const Point& p,                   //!< target point
-   OutIter out,                      //!< insert iterator for output set of neighbors
-   const Bounds& domain)             //!< global domain bounds
-{
-  Bounds neigh_bounds; // neighbor block bounds
-
-  // for all neighbors of this block
-  for (int n = 0; n < link.size(); n++)
-  {
-    // wrap neighbor bounds, if necessary, otherwise bounds will be unchanged
-    neigh_bounds = link.bounds(n);
-    wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension());
-
-    if (distance(link.dimension(), neigh_bounds, p) == 0)
-        *out++ = n;
-  } // for all neighbors
-}
-
-// wraps block bounds
-// wrap dir is the wrapping direction from original block to wrapped neighbor block
-// overall domain bounds and dimensionality are also needed
-template<class Bounds>
-void
-diy::
-wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim)
-{
-  for (int i = 0; i < dim; ++i)
-  {
-    bounds.min[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]);
-    bounds.max[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]);
-  }
-}
-
-
-#endif
diff --git a/diy/include/diy/point.hpp b/diy/include/diy/point.hpp
deleted file mode 100644
index cafbe784c..000000000
--- a/diy/include/diy/point.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef DIY_POINT_HPP
-#define DIY_POINT_HPP
-
-#include <iostream>
-#include <vector>
-#include <string>
-#include <sstream>
-
-#include <array>
-
-namespace diy
-{
-
-template<class Coordinate_, unsigned D>
-class Point: public std::array<Coordinate_, D>
-{
-    public:
-        typedef             Coordinate_                             Coordinate;
-        typedef             std::array<Coordinate, D>               ArrayParent;
-
-        typedef             Point<Coordinate, D-1>                  LPoint;
-        typedef             Point<Coordinate, D+1>                  UPoint;
-
-        template<class U>
-        struct rebind       { typedef Point<U,D> type; };
-
-    public:
-                            Point()                                 { for (unsigned i = 0; i < D; ++i) (*this)[i] = 0; }
-                            Point(const ArrayParent& a):
-                                ArrayParent(a)                      {}
-        template<class T>   Point(const Point<T, D>& p)             { for (size_t i = 0; i < D; ++i) (*this)[i] = p[i]; }
-        template<class T>   Point(const T* a)                       { for (unsigned i = 0; i < D; ++i) (*this)[i] = a[i]; }
-        template<class T>   Point(const std::vector<T>& a)          { for (unsigned i = 0; i < D; ++i) (*this)[i] = a[i]; }
-                            Point(std::initializer_list<Coordinate> lst)   { unsigned i = 0; for (Coordinate x : lst) (*this)[i++] = x; }
-
-                            Point(Point&&)                          =default;
-                            Point(const Point&)                     =default;
-        Point&              operator=(const Point&)                 =default;
-
-        static constexpr
-        unsigned            dimension()                             { return D; }
-
-        static Point        zero()                                  { return Point(); }
-        static Point        one()                                   { Point p; for (unsigned i = 0; i < D; ++i) p[i] = 1; return p; }
-
-        LPoint              drop(int dim) const                     { LPoint p; unsigned c = 0; for (unsigned i = 0; i < D;   ++i) { if (i == dim) continue; p[c++] = (*this)[i]; } return p; }
-        UPoint              lift(int dim, Coordinate x) const       { UPoint p; for (unsigned i = 0; i < D+1; ++i) { if (i < dim) p[i] = (*this)[i]; else if (i == dim) p[i] = x; else if (i > dim) p[i] = (*this)[i-1]; } return p; }
-
-        using ArrayParent::operator[];
-
-        Point&              operator+=(const Point& y)              { for (unsigned i = 0; i < D; ++i) (*this)[i] += y[i];  return *this; }
-        Point&              operator-=(const Point& y)              { for (unsigned i = 0; i < D; ++i) (*this)[i] -= y[i];  return *this; }
-        Point&              operator*=(Coordinate a)                { for (unsigned i = 0; i < D; ++i) (*this)[i] *= a;     return *this; }
-        Point&              operator/=(Coordinate a)                { for (unsigned i = 0; i < D; ++i) (*this)[i] /= a;     return *this; }
-
-        Coordinate          norm() const                            { return (*this)*(*this); }
-
-        std::ostream&       operator<<(std::ostream& out) const     { out << (*this)[0]; for (unsigned i = 1; i < D; ++i) out << " " << (*this)[i]; return out; }
-        std::istream&       operator>>(std::istream& in);
-
-        friend
-        Point               operator+(Point x, const Point& y)       { x += y; return x; }
-
-        friend
-        Point               operator-(Point x, const Point& y)       { x -= y; return x; }
-
-        friend
-        Point               operator/(Point x, Coordinate y)         { x /= y; return x; }
-
-        friend
-        Point               operator*(Point x, Coordinate y)         { x *= y; return x; }
-
-        friend
-        Point               operator*(Coordinate y, Point x)         { x *= y; return x; }
-
-        friend
-        Coordinate          operator*(const Point& x, const Point& y)   { Coordinate n = 0; for (size_t i = 0; i < D; ++i) n += x[i] * y[i]; return n; }
-
-        template<class T>
-        friend
-        Coordinate          operator*(const Point<T,D>& x, const Point& y)   { Coordinate n = 0; for (size_t i = 0; i < D; ++i) n += x[i] * y[i]; return n; }
-};
-
-template<class C, unsigned D>
-std::istream&
-Point<C,D>::
-operator>>(std::istream& in)
-{
-    std::string point_str;
-    in >> point_str;        // read until ' '
-    std::stringstream ps(point_str);
-
-    char x;
-    for (unsigned i = 0; i < dimension(); ++i)
-    {
-        ps >> (*this)[i];
-        ps >> x;
-    }
-
-    return in;
-}
-
-
-template<class Coordinate, unsigned D>
-Coordinate norm2(const Point<Coordinate,D>& p)
-{ Coordinate res = 0; for (unsigned i = 0; i < D; ++i) res += p[i]*p[i]; return res; }
-
-template<class C, unsigned D>
-std::ostream&
-operator<<(std::ostream& out, const Point<C,D>& p)
-{ return p.operator<<(out); }
-
-template<class C, unsigned D>
-std::istream&
-operator>>(std::istream& in, Point<C,D>& p)
-{ return p.operator>>(in); }
-
-}
-
-#endif // DIY_POINT_HPP
diff --git a/diy/include/diy/proxy.hpp b/diy/include/diy/proxy.hpp
deleted file mode 100644
index 0160e0605..000000000
--- a/diy/include/diy/proxy.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-#ifndef DIY_PROXY_HPP
-#define DIY_PROXY_HPP
-
-
-namespace diy
-{
-  //! Communication proxy, used for enqueueing and dequeueing items for future exchange.
-  struct Master::Proxy
-  {
-    template <class T>
-    struct EnqueueIterator;
-
-                        Proxy(Master* master, int gid):
-                          gid_(gid),
-                          master_(master),
-                          incoming_(&master->incoming(gid)),
-                          outgoing_(&master->outgoing(gid)),
-                          collectives_(&master->collectives(gid))       {}
-
-    int                 gid() const                                     { return gid_; }
-
-    //! Enqueue data whose size can be determined automatically, e.g., an STL vector.
-    template<class T>
-    void                enqueue(const BlockID&  to,                                     //!< target block (gid,proc)
-                                const T&        x,                                      //!< data (eg. STL vector)
-                                void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function
-                               ) const
-    { OutgoingQueues& out = *outgoing_; save(out[to], x); }
-
-    //! Enqueue data whose size is given explicitly by the user, e.g., an array.
-    template<class T>
-    void                enqueue(const BlockID&  to,                                     //!< target block (gid,proc)
-                                const T*        x,                                      //!< pointer to the data (eg. address of start of vector)
-                                size_t          n,                                      //!< size in data elements (eg. ints)
-                                void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function
-                               ) const;
-
-    //! Dequeue data whose size can be determined automatically (e.g., STL vector) and that was
-    //! previously enqueued so that diy knows its size when it is received.
-    //! In this case, diy will allocate the receive buffer; the user does not need to do so.
-    template<class T>
-    void                dequeue(int             from,                                   //!< target block gid
-                                T&              x,                                      //!< data (eg. STL vector)
-                                void (*load)(BinaryBuffer&, T&) = &::diy::load<T>       //!< optional serialization function
-                               ) const
-    { IncomingQueues& in  = *incoming_; load(in[from], x); }
-
-    //! Dequeue an array of data whose size is given explicitly by the user.
-    //! In this case, the user needs to allocate the receive buffer prior to calling dequeue.
-    template<class T>
-    void                dequeue(int             from,                                   //!< target block gid
-                                T*              x,                                      //!< pointer to the data (eg. address of start of vector)
-                                size_t          n,                                      //!< size in data elements (eg. ints)
-                                void (*load)(BinaryBuffer&, T&) = &::diy::load<T>       //!< optional serialization function
-                               ) const;
-
-    template<class T>
-    EnqueueIterator<T>  enqueuer(const T& x,
-                                 void (*save)(BinaryBuffer&, const T&) = &::diy::save<T>) const
-    { return EnqueueIterator<T>(this, x, save); }
-
-    IncomingQueues*     incoming() const                                { return incoming_; }
-    MemoryBuffer&       incoming(int from) const                        { return (*incoming_)[from]; }
-    inline void         incoming(std::vector<int>& v) const;            // fill v with every gid from which we have a message
-
-    OutgoingQueues*     outgoing() const                                { return outgoing_; }
-    MemoryBuffer&       outgoing(const BlockID& to) const               { return (*outgoing_)[to]; }
-
-/**
- * \ingroup Communication
- * \brief Post an all-reduce collective using an existing communication proxy.
- * Available operators are:
- * maximum<T>, minimum<T>, std::plus<T>, std::multiplies<T>, std::logical_and<T>, and
- * std::logical_or<T>.
- */
-    template<class T, class Op>
-    inline void         all_reduce(const T& in,                  //!< local value being reduced
-                                   Op op                         //!< operator
-                                   ) const;
-/**
- * \ingroup Communication
- * \brief Return the result of a proxy collective without popping it off the collectives list (same result would be returned multiple times). The list can be cleared with collectives()->clear().
- */
-    template<class T>
-    inline T            read() const;
-/**
- * \ingroup Communication
- * \brief Return the result of a proxy collective; result is popped off the collectives list.
- */
-    template<class T>
-    inline T            get() const;
-
-    template<class T>
-    inline void         scratch(const T& in) const;
-
-/**
- * \ingroup Communication
- * \brief Return the list of proxy collectives (values and operations)
- */
-    CollectivesList*    collectives() const                             { return collectives_; }
-
-    Master*             master() const                                  { return master_; }
-
-    private:
-      int               gid_;
-      Master*           master_;
-      IncomingQueues*   incoming_;
-      OutgoingQueues*   outgoing_;
-      CollectivesList*  collectives_;
-  };
-
-  template<class T>
-  struct Master::Proxy::EnqueueIterator:
-    public std::iterator<std::output_iterator_tag, void, void, void, void>
-  {
-    typedef     void (*SaveT)(BinaryBuffer&, const T&);
-
-                        EnqueueIterator(const Proxy* proxy, const T& x,
-                                        SaveT save = &::diy::save<T>):
-                            proxy_(proxy), x_(x), save_(save)               {}
-
-    EnqueueIterator&    operator=(const BlockID& to)                        { proxy_->enqueue(to, x_, save_); return *this; }
-    EnqueueIterator&    operator*()                                         { return *this; }
-    EnqueueIterator&    operator++()                                        { return *this; }
-    EnqueueIterator&    operator++(int)                                     { return *this; }
-
-    private:
-      const Proxy*  proxy_;
-      const T&      x_;
-      SaveT         save_;
-
-  };
-
-  struct Master::ProxyWithLink: public Master::Proxy
-  {
-            ProxyWithLink(const Proxy&    proxy,
-                          void*           block,
-                          Link*           link):
-              Proxy(proxy),
-              block_(block),
-              link_(link)                                           {}
-
-      Link*   link() const                                          { return link_; }
-      void*   block() const                                         { return block_; }
-
-    private:
-      void*   block_;
-      Link*   link_;
-  };
-}
-
-
-void
-diy::Master::Proxy::
-incoming(std::vector<int>& v) const
-{
-  for (IncomingQueues::const_iterator it = incoming_->begin(); it != incoming_->end(); ++it)
-    v.push_back(it->first);
-}
-
-template<class T, class Op>
-void
-diy::Master::Proxy::
-all_reduce(const T& in, Op op) const
-{
-  collectives_->push_back(Collective(new detail::AllReduceOp<T,Op>(in, op)));
-}
-
-template<class T>
-T
-diy::Master::Proxy::
-read() const
-{
-  T res;
-  collectives_->front().result_out(&res);
-  return res;
-}
-
-template<class T>
-T
-diy::Master::Proxy::
-get() const
-{
-  T res = read<T>();
-  collectives_->pop_front();
-  return res;
-}
-
-template<class T>
-void
-diy::Master::Proxy::
-scratch(const T& in) const
-{
-  collectives_->push_back(Collective(new detail::Scratch<T>(in)));
-}
-
-template<class T>
-void
-diy::Master::Proxy::
-enqueue(const BlockID& to, const T* x, size_t n,
-        void (*save)(BinaryBuffer&, const T&)) const
-{
-    OutgoingQueues& out = *outgoing_;
-    BinaryBuffer&   bb  = out[to];
-    if (save == (void (*)(BinaryBuffer&, const T&)) &::diy::save<T>)
-        diy::save(bb, x, n);       // optimized for unspecialized types
-    else
-        for (size_t i = 0; i < n; ++i)
-            save(bb, x[i]);
-}
-
-template<class T>
-void
-diy::Master::Proxy::
-dequeue(int from, T* x, size_t n,
-        void (*load)(BinaryBuffer&, T&)) const
-{
-    IncomingQueues& in = *incoming_;
-    BinaryBuffer&   bb = in[from];
-    if (load == (void (*)(BinaryBuffer&, T&)) &::diy::load<T>)
-        diy::load(bb, x, n);       // optimized for unspecialized types
-    else
-        for (size_t i = 0; i < n; ++i)
-            load(bb, x[i]);
-}
-
-
-#endif
diff --git a/diy/include/diy/reduce-operations.hpp b/diy/include/diy/reduce-operations.hpp
deleted file mode 100644
index 629824da5..000000000
--- a/diy/include/diy/reduce-operations.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef DIY_REDUCE_OPERATIONS_HPP
-#define DIY_REDUCE_OPERATIONS_HPP
-
-#include "reduce.hpp"
-#include "partners/swap.hpp"
-#include "detail/reduce/all-to-all.hpp"
-
-namespace diy
-{
-
-/**
- * \ingroup Communication
- * \brief all to all reduction
- *
- */
-template<class Op>
-void
-all_to_all(Master&              master,     //!< block owner
-           const Assigner&      assigner,   //!< global block locator (maps gid to proc)
-           const Op&            op,         //!< user-defined operation called to enqueue and dequeue items
-           int                  k = 2       //!< reduction fanout
-          )
-{
-  auto scoped = master.prof.scoped("all_to_all");
-  RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()-1), assigner.nblocks());
-  RegularSwapPartners  partners(decomposer, k, false);
-  reduce(master, assigner, partners, detail::AllToAllReduce<Op>(op, assigner), detail::SkipIntermediate(partners.rounds()));
-}
-
-}
-
-#endif
diff --git a/diy/include/diy/reduce.hpp b/diy/include/diy/reduce.hpp
deleted file mode 100644
index 6d47d7930..000000000
--- a/diy/include/diy/reduce.hpp
+++ /dev/null
@@ -1,216 +0,0 @@
-#ifndef DIY_REDUCE_HPP
-#define DIY_REDUCE_HPP
-
-#include <vector>
-#include "master.hpp"
-#include "assigner.hpp"
-#include "detail/block_traits.hpp"
-#include "log.hpp"
-
-namespace diy
-{
-//! Enables communication within a group during a reduction.
-//! DIY creates the ReduceProxy for you in diy::reduce()
-//! and provides a reference to ReduceProxy each time the user's reduction function is called
-struct ReduceProxy: public Master::Proxy
-{
-    typedef     std::vector<int>                            GIDVector;
-
-    ReduceProxy(const Master::Proxy&    proxy, //!< parent proxy
-                void*                   block, //!< diy block
-                unsigned                round, //!< current round
-                const Assigner&         assigner, //!< assigner
-                const GIDVector&        incoming_gids, //!< incoming gids in this group
-                const GIDVector&        outgoing_gids): //!< outgoing gids in this group
-      Master::Proxy(proxy),
-      block_(block),
-      round_(round),
-      assigner_(assigner)
-    {
-      // setup in_link
-      for (unsigned i = 0; i < incoming_gids.size(); ++i)
-      {
-        BlockID nbr;
-        nbr.gid  = incoming_gids[i];
-        nbr.proc = assigner.rank(nbr.gid);
-        in_link_.add_neighbor(nbr);
-      }
-
-      // setup out_link
-      for (unsigned i = 0; i < outgoing_gids.size(); ++i)
-      {
-        BlockID nbr;
-        nbr.gid  = outgoing_gids[i];
-        nbr.proc = assigner.rank(nbr.gid);
-        out_link_.add_neighbor(nbr);
-      }
-    }
-
-    ReduceProxy(const Master::Proxy&    proxy, //!< parent proxy
-                void*                   block, //!< diy block
-                unsigned                round, //!< current round
-                const Assigner&         assigner,
-                const Link&             in_link,
-                const Link&             out_link):
-      Master::Proxy(proxy),
-      block_(block),
-      round_(round),
-      assigner_(assigner),
-      in_link_(in_link),
-      out_link_(out_link)
-    {}
-
-    //! returns pointer to block
-    void*         block() const                           { return block_; }
-    //! returns current round number
-    unsigned      round() const                           { return round_; }
-    //! returns incoming link
-    const Link&   in_link() const                         { return in_link_; }
-    //! returns outgoing link
-    const Link&   out_link() const                        { return out_link_; }
-    //! returns total number of blocks
-    int           nblocks() const                         { return assigner_.nblocks(); }
-    //! returns the assigner
-    const Assigner& assigner() const                      { return assigner_; }
-
-    //! advanced: change current round number
-    void          set_round(unsigned r)                   { round_ = r; }
-
-  private:
-    void*         block_;
-    unsigned      round_;
-    const Assigner& assigner_;
-
-    Link          in_link_;
-    Link          out_link_;
-};
-
-namespace detail
-{
-  template<class Block, class Partners>
-  struct ReductionFunctor;
-
-  template<class Partners, class Skip>
-  struct SkipInactiveOr;
-
-  struct ReduceNeverSkip
-  {
-    bool operator()(int round, int lid, const Master& master) const  { return false; }
-  };
-}
-
-/**
- * \ingroup Communication
- * \brief Implementation of the reduce communication pattern (includes
- *        swap-reduce, merge-reduce, and any other global communication).
- *
- */
-template<class Reduce, class Partners, class Skip>
-void reduce(Master&                    master,        //!< master object
-            const Assigner&            assigner,      //!< assigner object
-            const Partners&            partners,      //!< partners object
-            const Reduce&              reduce,        //!< reduction callback function
-            const Skip&                skip)          //!< object determining whether a block should be skipped
-{
-  auto log = get_logger();
-
-  int original_expected = master.expected();
-
-  using Block = typename detail::block_traits<Reduce>::type;
-
-  unsigned round;
-  for (round = 0; round < partners.rounds(); ++round)
-  {
-    log->debug("Round {}", round);
-    master.foreach(detail::ReductionFunctor<Block,Partners>(round, reduce, partners, assigner),
-                   detail::SkipInactiveOr<Partners,Skip>(round, partners, skip));
-    master.execute();
-
-    int expected = 0;
-    for (unsigned i = 0; i < master.size(); ++i)
-    {
-      if (partners.active(round + 1, master.gid(i), master))
-      {
-        std::vector<int> incoming_gids;
-        partners.incoming(round + 1, master.gid(i), incoming_gids, master);
-        expected += incoming_gids.size();
-        master.incoming(master.gid(i)).clear();
-      }
-    }
-    master.set_expected(expected);
-    master.flush();
-  }
-  // final round
-  log->debug("Round {}", round);
-  master.foreach(detail::ReductionFunctor<Block,Partners>(round, reduce, partners, assigner),
-                 detail::SkipInactiveOr<Partners,Skip>(round, partners, skip));
-
-  master.set_expected(original_expected);
-}
-
-/**
- * \ingroup Communication
- * \brief Implementation of the reduce communication pattern (includes
- *        swap-reduce, merge-reduce, and any other global communication).
- *
- */
-template<class Reduce, class Partners>
-void reduce(Master&                    master,        //!< master object
-            const Assigner&            assigner,      //!< assigner object
-            const Partners&            partners,      //!< partners object
-            const Reduce&              reducer)       //!< reduction callback function
-{
-  reduce(master, assigner, partners, reducer, detail::ReduceNeverSkip());
-}
-
-namespace detail
-{
-  template<class Block, class Partners>
-  struct ReductionFunctor
-  {
-    using Callback = std::function<void(Block*, const ReduceProxy&, const Partners&)>;
-
-                ReductionFunctor(unsigned round_, const Callback& reduce_, const Partners& partners_, const Assigner& assigner_):
-                    round(round_), reduce(reduce_), partners(partners_), assigner(assigner_)        {}
-
-    void        operator()(Block* b, const Master::ProxyWithLink& cp) const
-    {
-      if (!partners.active(round, cp.gid(), *cp.master())) return;
-
-      std::vector<int> incoming_gids, outgoing_gids;
-      if (round > 0)
-          partners.incoming(round, cp.gid(), incoming_gids, *cp.master());        // receive from the previous round
-      if (round < partners.rounds())
-          partners.outgoing(round, cp.gid(), outgoing_gids, *cp.master());        // send to the next round
-
-      ReduceProxy   rp(cp, b, round, assigner, incoming_gids, outgoing_gids);
-      reduce(b, rp, partners);
-
-      // touch the outgoing queues to make sure they exist
-      Master::OutgoingQueues& outgoing = *cp.outgoing();
-      if (outgoing.size() < (size_t) rp.out_link().size())
-        for (int j = 0; j < rp.out_link().size(); ++j)
-          outgoing[rp.out_link().target(j)];       // touch the outgoing queue, creating it if necessary
-    }
-
-    unsigned        round;
-    Callback        reduce;
-    Partners        partners;
-    const Assigner& assigner;
-  };
-
-  template<class Partners, class Skip>
-  struct SkipInactiveOr
-  {
-                    SkipInactiveOr(int round_, const Partners& partners_, const Skip& skip_):
-                        round(round_), partners(partners_), skip(skip_)         {}
-    bool            operator()(int i, const Master& master) const               { return !partners.active(round, master.gid(i), master) || skip(round, i, master); }
-    int             round;
-    const Partners& partners;
-    Skip            skip;
-  };
-}
-
-} // diy
-
-#endif // DIY_REDUCE_HPP
diff --git a/diy/include/diy/serialization.hpp b/diy/include/diy/serialization.hpp
deleted file mode 100644
index 25640255d..000000000
--- a/diy/include/diy/serialization.hpp
+++ /dev/null
@@ -1,456 +0,0 @@
-#ifndef DIY_SERIALIZATION_HPP
-#define DIY_SERIALIZATION_HPP
-
-#include <vector>
-#include <valarray>
-#include <map>
-#include <set>
-#include <string>
-#include <fstream>
-
-#include <tuple>
-#include <unordered_map>
-#include <unordered_set>
-#include <type_traits>              // this is used for a safety check for default serialization
-
-namespace diy
-{
-  //! A serialization buffer. \ingroup Serialization
-  struct BinaryBuffer
-  {
-    virtual void        save_binary(const char* x, size_t count)    =0;   //!< copy `count` bytes from `x` into the buffer
-    virtual void        load_binary(char* x, size_t count)          =0;   //!< copy `count` bytes into `x` from the buffer
-    virtual void        load_binary_back(char* x, size_t count)     =0;   //!< copy `count` bytes into `x` from the back of the buffer
-  };
-
-  struct MemoryBuffer: public BinaryBuffer
-  {
-                        MemoryBuffer(size_t position_ = 0):
-                          position(position_)                       {}
-
-    virtual inline void save_binary(const char* x, size_t count) override;   //!< copy `count` bytes from `x` into the buffer
-    virtual inline void load_binary(char* x, size_t count) override;         //!< copy `count` bytes into `x` from the buffer
-    virtual inline void load_binary_back(char* x, size_t count) override;    //!< copy `count` bytes into `x` from the back of the buffer
-
-    void                clear()                                     { buffer.clear(); reset(); }
-    void                wipe()                                      { std::vector<char>().swap(buffer); reset(); }
-    void                reset()                                     { position = 0; }
-    void                skip(size_t s)                              { position += s; }
-    void                swap(MemoryBuffer& o)                       { std::swap(position, o.position); buffer.swap(o.buffer); }
-    bool                empty() const                               { return buffer.empty(); }
-    size_t              size() const                                { return buffer.size(); }
-    void                reserve(size_t s)                           { buffer.reserve(s); }
-                        operator bool() const                       { return position < buffer.size(); }
-
-    //! copy a memory buffer from one buffer to another, bypassing making a temporary copy first
-    inline static void  copy(MemoryBuffer& from, MemoryBuffer& to);
-
-    //! multiplier used for the geometric growth of the container
-    static float        growth_multiplier()                         { return 1.5; }
-
-    // simple file IO
-    void                write(const std::string& fn) const          { std::ofstream out(fn.c_str()); out.write(&buffer[0], size()); }
-    void                read(const std::string& fn)
-    {
-        std::ifstream in(fn.c_str(), std::ios::binary | std::ios::ate);
-        buffer.resize(in.tellg());
-        in.seekg(0);
-        in.read(&buffer[0], size());
-        position = 0;
-    }
-
-    size_t              position;
-    std::vector<char>   buffer;
-  };
-
-  namespace detail
-  {
-    struct Default {};
-  }
-
-  //!\addtogroup Serialization
-  //!@{
-
-  /**
-   * \brief Main interface to serialization, meant to be specialized for the
-   * types that require special handling.  `diy::save()` and `diy::load()` call
-   * the static member functions of this class.
-   *
-   * The default (unspecialized) version copies
-   * `sizeof(T)` bytes from `&x` to or from `bb` via
-   * its `diy::BinaryBuffer::save_binary()` and `diy::BinaryBuffer::load_binary()`
-   * functions.  This works out perfectly for plain old data (e.g., simple structs).
-   * To save a more complicated type, one has to specialize
-   * `diy::Serialization<T>` for that type. Specializations are already provided for
-   * `std::vector<T>`, `std::map<K,V>`, and `std::pair<T,U>`.
-   * As a result one can quickly add a specialization of one's own
-   *
-   */
-  template<class T>
-  struct Serialization: public detail::Default
-  {
-#if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5)
-    static_assert(std::is_trivially_copyable<T>::value, "Default serialization works only for trivially copyable types");
-#endif
-
-    static void         save(BinaryBuffer& bb, const T& x)          { bb.save_binary((const char*)  &x, sizeof(T)); }
-    static void         load(BinaryBuffer& bb, T& x)                { bb.load_binary((char*)        &x, sizeof(T)); }
-  };
-
-  //! Saves `x` to `bb` by calling `diy::Serialization<T>::save(bb,x)`.
-  template<class T>
-  void                  save(BinaryBuffer& bb, const T& x)          { Serialization<T>::save(bb, x); }
-
-  //! Loads `x` from `bb` by calling `diy::Serialization<T>::load(bb,x)`.
-  template<class T>
-  void                  load(BinaryBuffer& bb, T& x)                { Serialization<T>::load(bb, x); }
-
-  //! Optimization for arrays. If `diy::Serialization` is not specialized for `T`,
-  //! the array will be copied all at once. Otherwise, it's copied element by element.
-  template<class T>
-  void                  save(BinaryBuffer& bb, const T* x, size_t n);
-
-  //! Optimization for arrays. If `diy::Serialization` is not specialized for `T`,
-  //! the array will be filled all at once. Otherwise, it's filled element by element.
-  template<class T>
-  void                  load(BinaryBuffer& bb, T* x, size_t n);
-
-  //! Supports only binary data copying (meant for simple footers).
-  template<class T>
-  void                  load_back(BinaryBuffer& bb, T& x)           { bb.load_binary_back((char*) &x, sizeof(T)); }
-
-  //@}
-
-
-  namespace detail
-  {
-    template<typename T>
-    struct is_default
-    {
-        typedef char    yes;
-        typedef int     no;
-
-        static yes      test(Default*);
-        static no       test(...);
-
-        enum { value = (sizeof(test((T*) 0)) == sizeof(yes)) };
-    };
-  }
-
-  template<class T>
-  void                  save(BinaryBuffer& bb, const T* x, size_t n)
-  {
-    if (!detail::is_default< Serialization<T> >::value)
-      for (size_t i = 0; i < n; ++i)
-        diy::save(bb, x[i]);
-    else        // if Serialization is not specialized for U, just save the binary data
-      bb.save_binary((const char*) &x[0], sizeof(T)*n);
-  }
-
-  template<class T>
-  void                  load(BinaryBuffer& bb, T* x, size_t n)
-  {
-    if (!detail::is_default< Serialization<T> >::value)
-      for (size_t i = 0; i < n; ++i)
-        diy::load(bb, x[i]);
-    else      // if Serialization is not specialized for U, just load the binary data
-      bb.load_binary((char*) &x[0], sizeof(T)*n);
-  }
-
-
-  // save/load for MemoryBuffer
-  template<>
-  struct Serialization< MemoryBuffer >
-  {
-    static void         save(BinaryBuffer& bb, const MemoryBuffer& x)
-    {
-      diy::save(bb, x.position);
-      diy::save(bb, &x.buffer[0], x.position);
-    }
-
-    static void         load(BinaryBuffer& bb, MemoryBuffer& x)
-    {
-      diy::load(bb, x.position);
-      x.buffer.resize(x.position);
-      diy::load(bb, &x.buffer[0], x.position);
-    }
-  };
-
-  // save/load for std::vector<U>
-  template<class U>
-  struct Serialization< std::vector<U> >
-  {
-    typedef             std::vector<U>          Vector;
-
-    static void         save(BinaryBuffer& bb, const Vector& v)
-    {
-      size_t s = v.size();
-      diy::save(bb, s);
-      diy::save(bb, &v[0], v.size());
-    }
-
-    static void         load(BinaryBuffer& bb, Vector& v)
-    {
-      size_t s;
-      diy::load(bb, s);
-      v.resize(s);
-      diy::load(bb, &v[0], s);
-    }
-  };
-
-  template<class U>
-  struct Serialization< std::valarray<U> >
-  {
-    typedef             std::valarray<U>        ValArray;
-
-    static void         save(BinaryBuffer& bb, const ValArray& v)
-    {
-      size_t s = v.size();
-      diy::save(bb, s);
-      diy::save(bb, &v[0], v.size());
-    }
-
-    static void         load(BinaryBuffer& bb, ValArray& v)
-    {
-      size_t s;
-      diy::load(bb, s);
-      v.resize(s);
-      diy::load(bb, &v[0], s);
-    }
-  };
-
-  // save/load for std::string
-  template<>
-  struct Serialization< std::string >
-  {
-    typedef             std::string             String;
-
-    static void         save(BinaryBuffer& bb, const String& s)
-    {
-      size_t sz = s.size();
-      diy::save(bb, sz);
-      diy::save(bb, s.c_str(), sz);
-    }
-
-    static void         load(BinaryBuffer& bb, String& s)
-    {
-      size_t sz;
-      diy::load(bb, sz);
-      s.resize(sz);
-      for (size_t i = 0; i < sz; ++i)
-      {
-          char c;
-          diy::load(bb, c);
-          s[i] = c;
-      }
-    }
-  };
-
-  // save/load for std::pair<X,Y>
-  template<class X, class Y>
-  struct Serialization< std::pair<X,Y> >
-  {
-    typedef             std::pair<X,Y>          Pair;
-
-    static void         save(BinaryBuffer& bb, const Pair& p)
-    {
-      diy::save(bb, p.first);
-      diy::save(bb, p.second);
-    }
-
-    static void         load(BinaryBuffer& bb, Pair& p)
-    {
-      diy::load(bb, p.first);
-      diy::load(bb, p.second);
-    }
-  };
-
-  // save/load for std::map<K,V>
-  template<class K, class V>
-  struct Serialization< std::map<K,V> >
-  {
-    typedef             std::map<K,V>           Map;
-
-    static void         save(BinaryBuffer& bb, const Map& m)
-    {
-      size_t s = m.size();
-      diy::save(bb, s);
-      for (typename std::map<K,V>::const_iterator it = m.begin(); it != m.end(); ++it)
-        diy::save(bb, *it);
-    }
-
-    static void         load(BinaryBuffer& bb, Map& m)
-    {
-      size_t s;
-      diy::load(bb, s);
-      for (size_t i = 0; i < s; ++i)
-      {
-        K k;
-        diy::load(bb, k);
-        diy::load(bb, m[k]);
-      }
-    }
-  };
-
-  // save/load for std::set<T>
-  template<class T>
-  struct Serialization< std::set<T> >
-  {
-    typedef             std::set<T>             Set;
-
-    static void         save(BinaryBuffer& bb, const Set& m)
-    {
-      size_t s = m.size();
-      diy::save(bb, s);
-      for (typename std::set<T>::const_iterator it = m.begin(); it != m.end(); ++it)
-        diy::save(bb, *it);
-    }
-
-    static void         load(BinaryBuffer& bb, Set& m)
-    {
-      size_t s;
-      diy::load(bb, s);
-      for (size_t i = 0; i < s; ++i)
-      {
-        T p;
-        diy::load(bb, p);
-        m.insert(p);
-      }
-    }
-  };
-
-  // save/load for std::unordered_map<K,V,H,E,A>
-  template<class K, class V, class H, class E, class A>
-  struct Serialization< std::unordered_map<K,V,H,E,A> >
-  {
-    typedef             std::unordered_map<K,V,H,E,A>   Map;
-
-    static void         save(BinaryBuffer& bb, const Map& m)
-    {
-      size_t s = m.size();
-      diy::save(bb, s);
-      for (auto& x : m)
-        diy::save(bb, x);
-    }
-
-    static void         load(BinaryBuffer& bb, Map& m)
-    {
-      size_t s;
-      diy::load(bb, s);
-      for (size_t i = 0; i < s; ++i)
-      {
-        std::pair<K,V> p;
-        diy::load(bb, p);
-        m.emplace(std::move(p));
-      }
-    }
-  };
-
-  // save/load for std::unordered_set<T,H,E,A>
-  template<class T, class H, class E, class A>
-  struct Serialization< std::unordered_set<T,H,E,A> >
-  {
-    typedef             std::unordered_set<T,H,E,A>     Set;
-
-    static void         save(BinaryBuffer& bb, const Set& m)
-    {
-      size_t s = m.size();
-      diy::save(bb, s);
-      for (auto& x : m)
-        diy::save(bb, x);
-    }
-
-    static void         load(BinaryBuffer& bb, Set& m)
-    {
-      size_t s;
-      diy::load(bb, s);
-      for (size_t i = 0; i < s; ++i)
-      {
-        T p;
-        diy::load(bb, p);
-        m.emplace(std::move(p));
-      }
-    }
-  };
-
-  // save/load for std::tuple<...>
-  // TODO: this ought to be default (copying) serialization
-  //       if all arguments are default
-  template<class... Args>
-  struct Serialization< std::tuple<Args...> >
-  {
-    typedef             std::tuple<Args...>     Tuple;
-
-    static void         save(BinaryBuffer& bb, const Tuple& t)          { save<0>(bb, t); }
-
-    template<std::size_t I = 0>
-    static
-    typename std::enable_if<I == sizeof...(Args), void>::type
-                        save(BinaryBuffer&, const Tuple&)               {}
-
-    template<std::size_t I = 0>
-    static
-    typename std::enable_if<I < sizeof...(Args), void>::type
-                        save(BinaryBuffer& bb, const Tuple& t)          { diy::save(bb, std::get<I>(t)); save<I+1>(bb, t); }
-
-    static void         load(BinaryBuffer& bb, Tuple& t)                { load<0>(bb, t); }
-
-    template<std::size_t I = 0>
-    static
-    typename std::enable_if<I == sizeof...(Args), void>::type
-                        load(BinaryBuffer&, Tuple&)                     {}
-
-    template<std::size_t I = 0>
-    static
-    typename std::enable_if<I < sizeof...(Args), void>::type
-                        load(BinaryBuffer& bb, Tuple& t)                { diy::load(bb, std::get<I>(t)); load<I+1>(bb, t); }
-
-  };
-}
-
-void
-diy::MemoryBuffer::
-save_binary(const char* x, size_t count)
-{
-  if (position + count > buffer.capacity())
-    buffer.reserve((position + count) * growth_multiplier());           // if we have to grow, grow geometrically
-
-  if (position + count > buffer.size())
-    buffer.resize(position + count);
-
-  std::copy(x, x + count, &buffer[position]);
-  position += count;
-}
-
-void
-diy::MemoryBuffer::
-load_binary(char* x, size_t count)
-{
-  std::copy(&buffer[position], &buffer[position + count], x);
-  position += count;
-}
-
-void
-diy::MemoryBuffer::
-load_binary_back(char* x, size_t count)
-{
-  std::copy(&buffer[buffer.size() - count], &buffer[buffer.size()], x);
-  buffer.resize(buffer.size() - count);
-}
-
-void
-diy::MemoryBuffer::
-copy(MemoryBuffer& from, MemoryBuffer& to)
-{
-  size_t sz;
-  diy::load(from, sz);
-  from.position -= sizeof(size_t);
-
-  size_t total = sizeof(size_t) + sz;
-  to.buffer.resize(to.position + total);
-  std::copy(&from.buffer[from.position], &from.buffer[from.position + total], &to.buffer[to.position]);
-  to.position += total;
-  from.position += total;
-}
-
-#endif
diff --git a/diy/include/diy/stats.hpp b/diy/include/diy/stats.hpp
deleted file mode 100644
index 4866ccfb1..000000000
--- a/diy/include/diy/stats.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef DIY_STATS_HPP
-#define DIY_STATS_HPP
-
-#include <chrono>
-#include <string>
-#include <vector>
-
-#include "log.hpp"      // need this for format
-
-namespace diy
-{
-namespace stats
-{
-
-#if defined(DIY_PROFILE)
-struct Profiler
-{
-    using   Clock = std::chrono::high_resolution_clock;
-    using   Time  = Clock::time_point;
-
-    struct Event
-    {
-            Event(const std::string& name_, bool begin_):
-                name(name_),
-                begin(begin_),
-                stamp(Clock::now())
-                                                        {}
-
-        std::string     name;
-        bool            begin;
-        Time            stamp;
-    };
-
-    using   EventsVector = std::vector<Event>;
-
-    struct  Scoped
-    {
-            Scoped(Profiler& prof_, std::string name_):
-                prof(prof_), name(name_), active(true)  { prof << name; }
-            ~Scoped()                                   { if (active) prof >> name; }
-
-            Scoped(Scoped&& other):
-                prof(other.prof),
-                name(other.name),
-                active(other.active)                    { other.active = false; }
-
-        Scoped&
-            operator=(Scoped&& other) = delete;
-            Scoped(const Scoped&) = delete;
-        Scoped&
-            operator=(const Scoped&) = delete;
-
-        Profiler&   prof;
-        std::string name;
-        bool        active;
-    };
-
-            Profiler()                                  { reset_time(); }
-
-    void    reset_time()                                { start = Clock::now(); }
-
-    void    operator<<(std::string name)                { enter(name); }
-    void    operator>>(std::string name)                { exit(name); }
-
-    void    enter(std::string name)                     { events.push_back(Event(name, true)); }
-    void    exit(std::string name)                      { events.push_back(Event(name, false)); }
-
-    void    output(std::ostream& out)
-    {
-        for (size_t i = 0; i < events.size(); ++i)
-        {
-            const Event& e = events[i];
-            auto time = std::chrono::duration_cast<std::chrono::microseconds>(e.stamp - start).count();
-
-            fmt::print(out, "{:02d}:{:02d}:{:02d}.{:06d} {}{}\n",
-                            time/1000000/60/60,
-                            time/1000000/60 % 60,
-                            time/1000000 % 60,
-                            time % 1000000,
-                            (e.begin ? '<' : '>'),
-                            e.name);
-        }
-    }
-
-    Scoped  scoped(std::string name)                    { return Scoped(*this, name); }
-
-    void    clear()                                     { events.clear(); }
-
-    private:
-        Time            start;
-        EventsVector    events;
-};
-#else
-struct Profiler
-{
-    struct Scoped {};
-
-    void    reset_time()                                {}
-
-    void    operator<<(std::string)                     {}
-    void    operator>>(std::string)                     {}
-
-    void    enter(const std::string&)                   {}
-    void    exit(const std::string&)                    {}
-
-    void    output(std::ostream&)                       {}
-    void    clear()                                     {}
-
-    Scoped  scoped(std::string)                         { return Scoped(); }
-};
-#endif
-}
-}
-
-#endif
diff --git a/diy/include/diy/storage.hpp b/diy/include/diy/storage.hpp
deleted file mode 100644
index 62213b2c5..000000000
--- a/diy/include/diy/storage.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-#ifndef DIY_STORAGE_HPP
-#define DIY_STORAGE_HPP
-
-#include <string>
-#include <map>
-#include <fstream>
-
-#include <unistd.h>     // mkstemp() on Mac
-#include <cstdlib>      // mkstemp() on Linux
-#include <cstdio>       // remove()
-#include <fcntl.h>
-
-#include "serialization.hpp"
-#include "thread.hpp"
-#include "log.hpp"
-
-namespace diy
-{
-  namespace detail
-  {
-    typedef       void  (*Save)(const void*, BinaryBuffer& buf);
-    typedef       void  (*Load)(void*,       BinaryBuffer& buf);
-
-    struct FileBuffer: public BinaryBuffer
-    {
-                          FileBuffer(FILE* file_): file(file_), head(0), tail(0)    {}
-
-      // TODO: add error checking
-      virtual inline void save_binary(const char* x, size_t count) override   { fwrite(x, 1, count, file); head += count; }
-      virtual inline void load_binary(char* x, size_t count) override         { fread(x, 1, count, file); }
-      virtual inline void load_binary_back(char* x, size_t count) override    { fseek(file, tail, SEEK_END); fread(x, 1, count, file); tail += count; fseek(file, head, SEEK_SET); }
-
-      size_t              size() const                                { return head; }
-
-      FILE*  file;
-      size_t head, tail;  // tail is used to support reading from the back;
-                          // the mechanism is a little awkward and unused, but should work if needed
-    };
-  }
-
-  class ExternalStorage
-  {
-    public:
-      virtual int   put(MemoryBuffer& bb)                               =0;
-      virtual int   put(const void* x, detail::Save save)               =0;
-      virtual void  get(int i, MemoryBuffer& bb, size_t extra = 0)      =0;
-      virtual void  get(int i, void* x, detail::Load load)              =0;
-      virtual void  destroy(int i)                                      =0;
-  };
-
-  class FileStorage: public ExternalStorage
-  {
-    private:
-      struct FileRecord
-      {
-        size_t          size;
-        std::string     name;
-      };
-
-    public:
-                    FileStorage(const std::string& filename_template = "/tmp/DIY.XXXXXX"):
-                      filename_templates_(1, filename_template),
-                      count_(0), current_size_(0), max_size_(0)         {}
-
-                    FileStorage(const std::vector<std::string>& filename_templates):
-                      filename_templates_(filename_templates),
-                      count_(0), current_size_(0), max_size_(0)         {}
-
-      virtual int   put(MemoryBuffer& bb) override
-      {
-        auto log = get_logger();
-        std::string     filename;
-        int fh = open_random(filename);
-
-        log->debug("FileStorage::put(): {}; buffer size: {}", filename, bb.size());
-
-        size_t sz = bb.buffer.size();
-        size_t written = write(fh, &bb.buffer[0], sz);
-        if (written < sz || written == (size_t)-1)
-          log->warn("Could not write the full buffer to {}: written = {}; size = {}", filename, written, sz);
-        fsync(fh);
-        close(fh);
-        bb.wipe();
-
-#if 0       // double-check the written file size: only for extreme debugging
-        FILE* fp = fopen(filename.c_str(), "r");
-        fseek(fp, 0L, SEEK_END);
-        int fsz = ftell(fp);
-        if (fsz != sz)
-            log->warn("file size doesn't match the buffer size, {} vs {}", fsz, sz);
-        fclose(fp);
-#endif
-
-        return make_file_record(filename, sz);
-      }
-
-      virtual int    put(const void* x, detail::Save save) override
-      {
-        std::string     filename;
-        int fh = open_random(filename);
-
-        detail::FileBuffer fb(fdopen(fh, "w"));
-        save(x, fb);
-        size_t sz = fb.size();
-        fclose(fb.file);
-        fsync(fh);
-
-        return make_file_record(filename, sz);
-      }
-
-      virtual void   get(int i, MemoryBuffer& bb, size_t extra) override
-      {
-        FileRecord fr = extract_file_record(i);
-
-        get_logger()->debug("FileStorage::get(): {}", fr.name);
-
-        bb.buffer.reserve(fr.size + extra);
-        bb.buffer.resize(fr.size);
-        int fh = open(fr.name.c_str(), O_RDONLY | O_SYNC, 0600);
-        read(fh, &bb.buffer[0], fr.size);
-        close(fh);
-
-        remove_file(fr);
-      }
-
-      virtual void   get(int i, void* x, detail::Load load) override
-      {
-        FileRecord fr = extract_file_record(i);
-
-        //int fh = open(fr.name.c_str(), O_RDONLY | O_SYNC, 0600);
-        int fh = open(fr.name.c_str(), O_RDONLY, 0600);
-        detail::FileBuffer fb(fdopen(fh, "r"));
-        load(x, fb);
-        fclose(fb.file);
-
-        remove_file(fr);
-      }
-
-      virtual void  destroy(int i) override
-      {
-        FileRecord      fr;
-        {
-          CriticalMapAccessor accessor = filenames_.access();
-          fr = (*accessor)[i];
-          accessor->erase(i);
-        }
-        remove(fr.name.c_str());
-        (*current_size_.access()) -= fr.size;
-      }
-
-      int           count() const               { return (*count_.const_access()); }
-      size_t        current_size() const        { return (*current_size_.const_access()); }
-      size_t        max_size() const            { return (*max_size_.const_access()); }
-
-                    ~FileStorage()
-      {
-        for (FileRecordMap::const_iterator it =  filenames_.const_access()->begin();
-                                           it != filenames_.const_access()->end();
-                                         ++it)
-        {
-          remove(it->second.name.c_str());
-        }
-      }
-
-    private:
-      int           open_random(std::string& filename) const
-      {
-        if (filename_templates_.size() == 1)
-            filename = filename_templates_[0].c_str();
-        else
-        {
-            // pick a template at random (very basic load balancing mechanism)
-            filename  = filename_templates_[std::rand() % filename_templates_.size()].c_str();
-        }
-#ifdef __MACH__
-        // TODO: figure out how to open with O_SYNC
-        int fh = mkstemp(const_cast<char*>(filename.c_str()));
-#else
-        int fh = mkostemp(const_cast<char*>(filename.c_str()), O_WRONLY | O_SYNC);
-#endif
-
-        return fh;
-      }
-
-      int           make_file_record(const std::string& filename, size_t sz)
-      {
-        int res = (*count_.access())++;
-        FileRecord  fr = { sz, filename };
-        (*filenames_.access())[res] = fr;
-
-        // keep track of sizes
-        critical_resource<size_t>::accessor     cur = current_size_.access();
-        *cur += sz;
-        critical_resource<size_t>::accessor     max = max_size_.access();
-        if (*cur > *max)
-            *max = *cur;
-
-        return res;
-      }
-
-      FileRecord    extract_file_record(int i)
-      {
-        CriticalMapAccessor accessor = filenames_.access();
-        FileRecord fr = (*accessor)[i];
-        accessor->erase(i);
-        return fr;
-      }
-
-      void          remove_file(const FileRecord& fr)
-      {
-        remove(fr.name.c_str());
-        (*current_size_.access()) -= fr.size;
-      }
-
-    private:
-      typedef           std::map<int, FileRecord>                   FileRecordMap;
-      typedef           critical_resource<FileRecordMap>            CriticalMap;
-      typedef           CriticalMap::accessor                       CriticalMapAccessor;
-
-    private:
-      std::vector<std::string>      filename_templates_;
-      CriticalMap                   filenames_;
-      critical_resource<int>        count_;
-      critical_resource<size_t>     current_size_, max_size_;
-  };
-}
-
-#endif
diff --git a/diy/include/diy/thread.hpp b/diy/include/diy/thread.hpp
deleted file mode 100644
index 1c9149a42..000000000
--- a/diy/include/diy/thread.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef DIY_THREAD_H
-#define DIY_THREAD_H
-
-#ifdef DIY_NO_THREADS
-#include "no-thread.hpp"
-#else
-
-#include "thread/fast_mutex.h"
-
-#include <thread>
-#include <mutex>
-
-namespace diy
-{
-    using std::thread;
-    using std::mutex;
-    using std::recursive_mutex;
-    namespace this_thread = std::this_thread;
-
-    // TODO: replace with our own implementation using std::atomic_flag
-    using fast_mutex = tthread::fast_mutex;
-
-    template<class Mutex>
-    using lock_guard = std::unique_lock<Mutex>;
-}
-
-#endif
-
-#include "critical-resource.hpp"
-
-#endif
diff --git a/diy/include/diy/thread/fast_mutex.h b/diy/include/diy/thread/fast_mutex.h
deleted file mode 100644
index 4d4b7cc43..000000000
--- a/diy/include/diy/thread/fast_mutex.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
-Copyright (c) 2010-2012 Marcus Geelnard
-
-This software is provided 'as-is', without any express or implied
-warranty. In no event will the authors be held liable for any damages
-arising from the use of this software.
-
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it
-freely, subject to the following restrictions:
-
-    1. The origin of this software must not be misrepresented; you must not
-    claim that you wrote the original software. If you use this software
-    in a product, an acknowledgment in the product documentation would be
-    appreciated but is not required.
-
-    2. Altered source versions must be plainly marked as such, and must not be
-    misrepresented as being the original software.
-
-    3. This notice may not be removed or altered from any source
-    distribution.
-*/
-
-#ifndef _FAST_MUTEX_H_
-#define _FAST_MUTEX_H_
-
-/// @file
-
-// Which platform are we on?
-#if !defined(_TTHREAD_PLATFORM_DEFINED_)
-  #if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
-    #define _TTHREAD_WIN32_
-  #else
-    #define _TTHREAD_POSIX_
-  #endif
-  #define _TTHREAD_PLATFORM_DEFINED_
-#endif
-
-// Check if we can support the assembly language level implementation (otherwise
-// revert to the system API)
-#if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
-    (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || \
-    (defined(__GNUC__) && (defined(__ppc__)))
-  #define _FAST_MUTEX_ASM_
-#else
-  #define _FAST_MUTEX_SYS_
-#endif
-
-#if defined(_TTHREAD_WIN32_)
-  #ifndef WIN32_LEAN_AND_MEAN
-    #define WIN32_LEAN_AND_MEAN
-    #define __UNDEF_LEAN_AND_MEAN
-  #endif
-  #include <windows.h>
-  #ifdef __UNDEF_LEAN_AND_MEAN
-    #undef WIN32_LEAN_AND_MEAN
-    #undef __UNDEF_LEAN_AND_MEAN
-  #endif
-#else
-  #ifdef _FAST_MUTEX_ASM_
-    #include <sched.h>
-  #else
-    #include <pthread.h>
-  #endif
-#endif
-
-namespace tthread {
-
-/// Fast mutex class.
-/// This is a mutual exclusion object for synchronizing access to shared
-/// memory areas for several threads. It is similar to the tthread::mutex class,
-/// but instead of using system level functions, it is implemented as an atomic
-/// spin lock with very low CPU overhead.
-///
-/// The \c fast_mutex class is NOT compatible with the \c condition_variable
-/// class (however, it IS compatible with the \c lock_guard class). It should
-/// also be noted that the \c fast_mutex class typically does not provide
-/// as accurate thread scheduling as a the standard \c mutex class does.
-///
-/// Because of the limitations of the class, it should only be used in
-/// situations where the mutex needs to be locked/unlocked very frequently.
-///
-/// @note The "fast" version of this class relies on inline assembler language,
-/// which is currently only supported for 32/64-bit Intel x86/AMD64 and
-/// PowerPC architectures on a limited number of compilers (GNU g++ and MS
-/// Visual C++).
-/// For other architectures/compilers, system functions are used instead.
-class fast_mutex {
-  public:
-    /// Constructor.
-#if defined(_FAST_MUTEX_ASM_)
-    fast_mutex() : mLock(0) {}
-#else
-    fast_mutex()
-    {
-  #if defined(_TTHREAD_WIN32_)
-      InitializeCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_init(&mHandle, NULL);
-  #endif
-    }
-#endif
-
-#if !defined(_FAST_MUTEX_ASM_)
-    /// Destructor.
-    ~fast_mutex()
-    {
-  #if defined(_TTHREAD_WIN32_)
-      DeleteCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_destroy(&mHandle);
-  #endif
-    }
-#endif
-
-    /// Lock the mutex.
-    /// The method will block the calling thread until a lock on the mutex can
-    /// be obtained. The mutex remains locked until \c unlock() is called.
-    /// @see lock_guard
-    inline void lock()
-    {
-#if defined(_FAST_MUTEX_ASM_)
-      bool gotLock;
-      do {
-        gotLock = try_lock();
-        if(!gotLock)
-        {
-  #if defined(_TTHREAD_WIN32_)
-          Sleep(0);
-  #elif defined(_TTHREAD_POSIX_)
-          sched_yield();
-  #endif
-        }
-      } while(!gotLock);
-#else
-  #if defined(_TTHREAD_WIN32_)
-      EnterCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_lock(&mHandle);
-  #endif
-#endif
-    }
-
-    /// Try to lock the mutex.
-    /// The method will try to lock the mutex. If it fails, the function will
-    /// return immediately (non-blocking).
-    /// @return \c true if the lock was acquired, or \c false if the lock could
-    /// not be acquired.
-    inline bool try_lock()
-    {
-#if defined(_FAST_MUTEX_ASM_)
-      int oldLock;
-  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-      asm volatile (
-        "movl $1,%%eax\n\t"
-        "xchg %%eax,%0\n\t"
-        "movl %%eax,%1\n\t"
-        : "=m" (mLock), "=m" (oldLock)
-        :
-        : "%eax", "memory"
-      );
-  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-      int *ptrLock = &mLock;
-      __asm {
-        mov eax,1
-        mov ecx,ptrLock
-        xchg eax,[ecx]
-        mov oldLock,eax
-      }
-  #elif defined(__GNUC__) && (defined(__ppc__))
-      int newLock = 1;
-      asm volatile (
-        "\n1:\n\t"
-        "lwarx  %0,0,%1\n\t"
-        "cmpwi  0,%0,0\n\t"
-        "bne-   2f\n\t"
-        "stwcx. %2,0,%1\n\t"
-        "bne-   1b\n\t"
-        "isync\n"
-        "2:\n\t"
-        : "=&r" (oldLock)
-        : "r" (&mLock), "r" (newLock)
-        : "cr0", "memory"
-      );
-  #endif
-      return (oldLock == 0);
-#else
-  #if defined(_TTHREAD_WIN32_)
-      return TryEnterCriticalSection(&mHandle) ? true : false;
-  #elif defined(_TTHREAD_POSIX_)
-      return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
-  #endif
-#endif
-    }
-
-    /// Unlock the mutex.
-    /// If any threads are waiting for the lock on this mutex, one of them will
-    /// be unblocked.
-    inline void unlock()
-    {
-#if defined(_FAST_MUTEX_ASM_)
-  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-      asm volatile (
-        "movl $0,%%eax\n\t"
-        "xchg %%eax,%0\n\t"
-        : "=m" (mLock)
-        :
-        : "%eax", "memory"
-      );
-  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-      int *ptrLock = &mLock;
-      __asm {
-        mov eax,0
-        mov ecx,ptrLock
-        xchg eax,[ecx]
-      }
-  #elif defined(__GNUC__) && (defined(__ppc__))
-      asm volatile (
-        "sync\n\t"  // Replace with lwsync where possible?
-        : : : "memory"
-      );
-      mLock = 0;
-  #endif
-#else
-  #if defined(_TTHREAD_WIN32_)
-      LeaveCriticalSection(&mHandle);
-  #elif defined(_TTHREAD_POSIX_)
-      pthread_mutex_unlock(&mHandle);
-  #endif
-#endif
-    }
-
-  private:
-#if defined(_FAST_MUTEX_ASM_)
-    int mLock;
-#else
-  #if defined(_TTHREAD_WIN32_)
-    CRITICAL_SECTION mHandle;
-  #elif defined(_TTHREAD_POSIX_)
-    pthread_mutex_t mHandle;
-  #endif
-#endif
-};
-
-}
-
-#endif // _FAST_MUTEX_H_
-
diff --git a/diy/include/diy/time.hpp b/diy/include/diy/time.hpp
deleted file mode 100644
index d6b44c2e1..000000000
--- a/diy/include/diy/time.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef DIY_TIME_HPP
-#define DIY_TIME_HPP
-
-#include <sys/time.h>
-
-#ifdef __MACH__
-#include <mach/clock.h>
-#include <mach/mach.h>
-#endif
-
-namespace diy
-{
-
-typedef     unsigned long       time_type;
-
-inline time_type get_time()
-{
-#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
-    clock_serv_t cclock;
-    mach_timespec_t ts;
-    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
-    clock_get_time(cclock, &ts);
-    mach_port_deallocate(mach_task_self(), cclock);
-#else
-    timespec ts;
-    clock_gettime(CLOCK_REALTIME, &ts);
-#endif
-    return ts.tv_sec*1000 + ts.tv_nsec/1000000;
-}
-
-}
-
-#endif
diff --git a/diy/include/diy/types.hpp b/diy/include/diy/types.hpp
deleted file mode 100644
index d52e75030..000000000
--- a/diy/include/diy/types.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef DIY_TYPES_HPP
-#define DIY_TYPES_HPP
-
-#include <iostream>
-#include "constants.h"
-#include "point.hpp"
-
-namespace diy
-{
-    struct BlockID
-    {
-        int gid, proc;
-    };
-
-    template<class Coordinate_>
-    struct Bounds
-    {
-        using Coordinate = Coordinate_;
-
-        Point<Coordinate, DIY_MAX_DIM>    min, max;
-    };
-    using DiscreteBounds   = Bounds<int>;
-    using ContinuousBounds = Bounds<float>;
-
-    //! Helper to create a 1-dimensional discrete domain with the specified extents
-    inline
-    diy::DiscreteBounds
-    interval(int from, int to)            { DiscreteBounds domain; domain.min[0] = from; domain.max[0] = to; return domain; }
-
-    struct Direction: public Point<int,DIY_MAX_DIM>
-    {
-              Direction()                 { for (int i = 0; i < DIY_MAX_DIM; ++i) (*this)[i] = 0; }
-              Direction(int dir)
-      {
-          for (int i = 0; i < DIY_MAX_DIM; ++i) (*this)[i] = 0;
-          if (dir & DIY_X0) (*this)[0] -= 1;
-          if (dir & DIY_X1) (*this)[0] += 1;
-          if (dir & DIY_Y0) (*this)[1] -= 1;
-          if (dir & DIY_Y1) (*this)[1] += 1;
-          if (dir & DIY_Z0) (*this)[2] -= 1;
-          if (dir & DIY_Z1) (*this)[2] += 1;
-          if (dir & DIY_T0) (*this)[3] -= 1;
-          if (dir & DIY_T1) (*this)[3] += 1;
-      }
-
-      bool
-      operator==(const diy::Direction& y) const
-      {
-        for (int i = 0; i < DIY_MAX_DIM; ++i)
-            if ((*this)[i] != y[i]) return false;
-        return true;
-      }
-
-      // lexicographic comparison
-      bool
-      operator<(const diy::Direction& y) const
-      {
-        for (int i = 0; i < DIY_MAX_DIM; ++i)
-        {
-            if ((*this)[i] < y[i]) return true;
-            if ((*this)[i] > y[i]) return false;
-        }
-        return false;
-      }
-    };
-
-    // Selector of bounds value type
-    template<class Bounds_>
-    struct BoundsValue
-    {
-        using type = typename Bounds_::Coordinate;
-    };
-
-    inline
-    bool
-    operator<(const diy::BlockID& x, const diy::BlockID& y)
-    { return x.gid < y.gid; }
-
-    inline
-    bool
-    operator==(const diy::BlockID& x, const diy::BlockID& y)
-    { return x.gid == y.gid; }
-}
-
-#endif
diff --git a/diy/include/diy/vertices.hpp b/diy/include/diy/vertices.hpp
deleted file mode 100644
index 423209fd6..000000000
--- a/diy/include/diy/vertices.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef DIY_VERTICES_HPP
-#define DIY_VERTICES_HPP
-
-#include <iterator>
-
-namespace diy
-{
-
-namespace detail
-{
-    template<class Vertex, size_t I>
-    struct IsLast
-    {
-        static constexpr bool value = (Vertex::dimension() - 1 == I);
-    };
-
-    template<class Vertex, class Callback, size_t I, bool P>
-    struct ForEach
-    {
-        void operator()(Vertex& pos, const Vertex& from, const Vertex& to, const Callback& callback) const
-        {
-            for (pos[I] = from[I]; pos[I] <= to[I]; ++pos[I])
-                ForEach<Vertex, Callback, I+1, IsLast<Vertex,I+1>::value>()(pos, from, to, callback);
-        }
-    };
-
-    template<class Vertex, class Callback, size_t I>
-    struct ForEach<Vertex,Callback,I,true>
-    {
-        void operator()(Vertex& pos, const Vertex& from, const Vertex& to, const Callback& callback) const
-        {
-            for (pos[I] = from[I]; pos[I] <= to[I]; ++pos[I])
-                callback(pos);
-        }
-    };
-}
-
-template<class Vertex, class Callback>
-void for_each(const Vertex& from, const Vertex& to, const Callback& callback)
-{
-    Vertex pos;
-    grid::detail::ForEach<Vertex, Callback, 0, detail::IsLast<Vertex,0>::value>()(pos, from, to, callback);
-}
-
-template<class Vertex, class Callback>
-void for_each(const Vertex& shape, const Callback& callback)
-{
-    // specify grid namespace to disambiguate with std::for_each(...)
-    grid::for_each(Vertex::zero(), shape - Vertex::one(), callback);
-}
-
-}
-
-#endif

From 2aab6ba47e55ccd288579db2ae07ef893b338ca4 Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 13:53:56 -0500
Subject: [PATCH 15/24] Add 3rd-party harness.

Add docs and scripts for 3rd party modules in VTK-m.

This is an import of scripts from VTK.
---
 vtkm/thirdparty/UPDATING.md      | 105 ++++++++++++++++++
 vtkm/thirdparty/update-common.sh | 181 +++++++++++++++++++++++++++++++
 2 files changed, 286 insertions(+)
 create mode 100644 vtkm/thirdparty/UPDATING.md
 create mode 100644 vtkm/thirdparty/update-common.sh

diff --git a/vtkm/thirdparty/UPDATING.md b/vtkm/thirdparty/UPDATING.md
new file mode 100644
index 000000000..d1c7fa8d7
--- /dev/null
+++ b/vtkm/thirdparty/UPDATING.md
@@ -0,0 +1,105 @@
+# Updating Third Party Projects
+
+When updating a third party project, any changes to the imported project
+itself (e.g., the `diy/vtkmdiy` directory for diy) should go through the
+`update.sh` framework. This framework ensures that all patches to the third
+party projects are tracked externally and available for (preferably) upstream
+or other projects also embedding the library.
+
+# Updating a Project
+
+Once converted, a project should be updated by applying patches to the
+repository specified in its `update.sh` script. Once the changes are merged,
+pulling the changes involves running the `update.sh` script. This will update
+the local copy of the project to the version specified in `update.sh` (usually
+a `for/foo` branch, like `for/vtk-m` for example, but may be `master` or any
+other Git reference) and merge it into the main tree.
+
+This requires Git 2.5 or higher due to the `worktree` tool being used to
+simplify the availability of the commits to the main checkout.
+
+Here's an example of updating the `diy` project from tag v2.0 to v2.1,
+starting with updating the third-party repo:
+
+```sh
+$ cd diy
+$ git checkout for/vtk-m
+$ git fetch origin
+$ git rebase --onto v2.1 v2.0
+$ git push
+```
+
+Now import into VTK-m:
+
+```sh
+$ cd vtkm/thirdparty/diy
+$ git checkout -b update_diy
+$ ./update.sh
+```
+
+Now you can review the change and make a merge request from the branch as normal.
+
+# Porting a Project
+
+When converting a project, if there are any local patches, a project should be
+created [on GitLab](https://gitlab.kitware.com/third-party) to track it. If
+the upstream project does not use Git, it should be imported into Git (there
+may be existing conversions available on GitHub already). The project's
+description should indicate where the source repository lives.
+
+Once a mirror of the project is created, a branch named `for/foo` should be
+created where patches for the `foo` project will be applied (i.e., `for/vtk-m`
+for VTK-m's patches to the project). Usually, changes to the build system, the
+source code for mangling, the addition of `.gitattributes` files, and other
+changes belong here. Functional changes should be submitted upstream (but may
+still be tracked so that they may be used).
+
+The basic steps to import a project `diy` based on the tag
+`v2.0` look like this:
+
+```sh
+$ git clone https://github.com/diatomic/diy.git
+$ cd diy/
+$ git remote add kitware git@gitlab.kitware.com:third-party/diy.git
+$ git push -u kitware
+$ git push -u kitware --tags
+$ git checkout v2.0
+$ git checkout -b for/vtk-m
+$ git push --set-upstream kitware for/vtk-m
+```
+
+Making the initial import involves filling out the project's `update.sh`
+script in its directory. The [update-common.sh](update-common.sh) script
+describes what is necessary, but in a nutshell, it is basically metadata such
+as the name of the project and where it goes in the importing project.
+
+The most important bit is the `extract_source` function which should subset
+the repository. If all that needs to be done is to extract the files given in
+the `paths` variable (described in the `update-common.sh` script), the
+`git_archive` function may be used if the `git archive` tool generates a
+suitable subset.
+
+Make sure `update.sh` is executable before commit. On Unix, run:
+
+```sh
+  $ chmod u+x update.sh && git add -u update.sh
+```
+
+On Windows, run:
+
+```sh
+  $ git update-index --chmod=+x update.sh
+```
+
+# Process
+
+The basic process involves a second branch where the third party project's
+changes are tracked. This branch has a commit for each time it has been
+updated and is stripped to only contain the relevant parts (no unit tests,
+documentation, etc.). This branch is then merged into the main branch as a
+subdirectory using the `subtree` merge strategy.
+
+Initial conversions will require a manual push by the maintainers since the
+conversion involves a root commit which is not allowed under normal
+circumstances. Please send an email to the mailing list asking for assistance
+if necessary.
diff --git a/vtkm/thirdparty/update-common.sh b/vtkm/thirdparty/update-common.sh
new file mode 100644
index 000000000..9550b56ad
--- /dev/null
+++ b/vtkm/thirdparty/update-common.sh
@@ -0,0 +1,181 @@
+#=============================================================================
+# Copyright 2015-2016 Kitware, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+########################################################################
+# Script for updating third party packages.
+#
+# This script should be sourced in a project-specific script which sets
+# the following variables:
+#
+#   name
+#       The name of the project.
+#   ownership
+#       A git author name/email for the commits.
+#   subtree
+#       The location of the thirdparty package within the main source
+#       tree.
+#   repo
+#       The git repository to use as upstream.
+#   tag
+#       The tag, branch or commit hash to use for upstream.
+#   shortlog
+#       Optional.  Set to 'true' to get a shortlog in the commit message.
+#
+# Additionally, an "extract_source" function must be defined. It will be
+# run within the checkout of the project on the requested tag. It should
+# place the desired tree into $extractdir/$name-reduced. This
+# directory will be used as the newest commit for the project.
+#
+# For convenience, the function may use the "git_archive" function which
+# does a standard "git archive" extraction using the (optional) "paths"
+# variable to only extract a subset of the source tree.
+########################################################################
+
+########################################################################
+# Utility functions
+########################################################################
+git_archive () {
+    git archive --worktree-attributes --prefix="$name-reduced/" HEAD -- $paths | \
+        tar -C "$extractdir" -x
+}
+
+disable_custom_gitattributes() {
+    pushd "${extractdir}/${name}-reduced"
+    # Git does not allow custom attributes in a subdirectory where we
+    # are about to merge the `.gitattributes` file, so disable them.
+    sed -i '/^\[attr\]/ {s/^/#/}' .gitattributes
+    popd
+}
+
+die () {
+    echo >&2 "$@"
+    exit 1
+}
+
+warn () {
+    echo >&2 "warning: $@"
+}
+
+readonly regex_date='20[0-9][0-9]-[0-9][0-9]-[0-9][0-9]'
+readonly basehash_regex="$name $regex_date ([0-9a-f]*)"
+readonly basehash="$( git rev-list --author="$ownership" --grep="$basehash_regex" -n 1 HEAD )"
+readonly upstream_old_short="$( git cat-file commit "$basehash" | sed -n '/'"$basehash_regex"'/ {s/.*(//;s/)//;p}' | egrep '^[0-9a-f]+$' )"
+
+########################################################################
+# Sanity checking
+########################################################################
+[ -n "$name" ] || \
+    die "'name' is empty"
+[ -n "$ownership" ] || \
+    die "'ownership' is empty"
+[ -n "$subtree" ] || \
+    die "'subtree' is empty"
+[ -n "$repo" ] || \
+    die "'repo' is empty"
+[ -n "$tag" ] || \
+    die "'tag' is empty"
+[ -n "$basehash" ] || \
+    warn "'basehash' is empty; performing initial import"
+readonly do_shortlog="${shortlog-false}"
+
+readonly workdir="$PWD/work"
+readonly upstreamdir="$workdir/upstream"
+readonly extractdir="$workdir/extract"
+
+[ -d "$workdir" ] && \
+    die "error: workdir '$workdir' already exists"
+
+trap "rm -rf '$workdir'" EXIT
+
+# Get upstream
+git clone "$repo" "$upstreamdir"
+
+if [ -n "$basehash" ]; then
+    # Use the existing package's history
+    git worktree add "$extractdir" "$basehash"
+    # Clear out the working tree
+    pushd "$extractdir"
+    git ls-files | xargs rm -v
+    find . -type d -empty -delete
+    popd
+else
+    # Create a repo to hold this package's history
+    mkdir -p "$extractdir"
+    git -C "$extractdir" init
+fi
+
+# Extract the subset of upstream we care about
+pushd "$upstreamdir"
+git checkout "$tag"
+readonly upstream_hash="$( git rev-parse HEAD )"
+readonly upstream_hash_short="$( git rev-parse --short=8 "$upstream_hash" )"
+readonly upstream_datetime="$( git rev-list "$upstream_hash" --format='%ci' -n 1 | grep -e "^$regex_date" )"
+readonly upstream_date="$( echo "$upstream_datetime" | grep -o -e "$regex_date" )"
+if $do_shortlog && [ -n "$basehash" ]; then
+    readonly commit_shortlog="
+
+Upstream Shortlog
+-----------------
+
+$( git shortlog --no-merges --abbrev=8 --format='%h %s' "$upstream_old_short".."$upstream_hash" )"
+else
+    readonly commit_shortlog=""
+fi
+extract_source || \
+    die "failed to extract source"
+popd
+
+[ -d "$extractdir/$name-reduced" ] || \
+    die "expected directory to extract does not exist"
+readonly commit_summary="$name $upstream_date ($upstream_hash_short)"
+
+# Commit the subset
+pushd "$extractdir"
+mv -v "$name-reduced/"* .
+rmdir "$name-reduced/"
+git add -A .
+git commit -n --author="$ownership" --date="$upstream_datetime" -F - <<-EOF
+$commit_summary
+
+Code extracted from:
+
+    $repo
+
+at commit $upstream_hash ($tag).$commit_shortlog
+EOF
+git branch -f "upstream-$name"
+popd
+
+# Merge the subset into this repository
+if [ -n "$basehash" ]; then
+    git merge --log -s recursive "-Xsubtree=$subtree/" --no-commit "upstream-$name"
+else
+    # Note: on Windows 'git merge --help' opens a browser, so the check below
+    # would fail; use the flag by default there.
+    unrelated_histories_flag=""
+    if git --version | grep -q windows; then
+        unrelated_histories_flag="--allow-unrelated-histories "
+    elif git merge --help | grep -q -e allow-unrelated-histories; then
+        unrelated_histories_flag="--allow-unrelated-histories "
+    fi
+    readonly unrelated_histories_flag
+
+    git fetch "$extractdir" "+upstream-$name:upstream-$name"
+    git merge --log -s ours --no-commit $unrelated_histories_flag "upstream-$name"
+    git read-tree -u --prefix="$subtree/" "upstream-$name"
+fi
+git commit --no-edit
+git branch -d "upstream-$name"

From aa936095da5e48ad901b19c0e251069eb57774c4 Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 13:57:38 -0500
Subject: [PATCH 16/24] update `diy` location in gitattributes.

---
 .gitattributes | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitattributes b/.gitattributes
index 79ac99f3f..2174096fc 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -14,4 +14,4 @@ data/* filter=lfs diff=lfs merge=lfs -text
 *.rst            whitespace=tab-in-indent conflict-marker-size=79
 *.txt            whitespace=tab-in-indent
 
-diy/**           -format.clang-format -whitespace
+vtkm/thirdparty/diy/vtkmdiy/**           -format.clang-format -whitespace

From 4339b4e2d2d1ba46dd425bb2ca609780e5131ecb Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 13:55:34 -0500
Subject: [PATCH 17/24] Add update script for diy.

---
 vtkm/thirdparty/diy/update.sh | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100755 vtkm/thirdparty/diy/update.sh

diff --git a/vtkm/thirdparty/diy/update.sh b/vtkm/thirdparty/diy/update.sh
new file mode 100755
index 000000000..3ce5ef0ec
--- /dev/null
+++ b/vtkm/thirdparty/diy/update.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+shopt -s dotglob
+
+readonly name="diy"
+readonly ownership="Diy Upstream <kwrobot@kitware.com>"
+readonly subtree="vtkm/thirdparty/$name/vtkm$name"
+readonly repo="https://gitlab.kitware.com/third-party/diy2.git"
+readonly tag="for/vtk-m"
+readonly paths="
+include
+LEGAL.txt
+LICENSE.txt
+README.md
+"
+
+extract_source () {
+    git_archive
+    pushd "$extractdir/$name-reduced"
+    mv include/diy include/vtkmdiy
+    popd
+}
+
+. "${BASH_SOURCE%/*}/../update-common.sh"

From 6fc0794c405734060a07159a0b88fe95946b56f0 Mon Sep 17 00:00:00 2001
From: Diy Upstream <kwrobot@kitware.com>
Date: Tue, 2 Jan 2018 18:30:42 -0800
Subject: [PATCH 18/24] diy 2018-01-02 (aa778e24)

Code extracted from:

    https://gitlab.kitware.com/third-party/diy2.git

at commit aa778e24a40ec6d39c4f8b43eb4bdb4f2708219a (for/vtk-m).
---
 LEGAL.txt                                     |   19 +
 LICENSE.txt                                   |   41 +
 README.md                                     |   85 +
 include/vtkmdiy/algorithms.hpp                |  191 +
 include/vtkmdiy/assigner.hpp                  |  126 +
 include/vtkmdiy/collection.hpp                |  121 +
 include/vtkmdiy/communicator.hpp              |   13 +
 include/vtkmdiy/constants.h                   |   22 +
 include/vtkmdiy/critical-resource.hpp         |   53 +
 include/vtkmdiy/decomposition.hpp             |  716 +++
 .../detail/algorithms/kdtree-sampling.hpp     |  450 ++
 include/vtkmdiy/detail/algorithms/kdtree.hpp  |  569 +++
 include/vtkmdiy/detail/algorithms/sort.hpp    |  162 +
 include/vtkmdiy/detail/block_traits.hpp       |   31 +
 include/vtkmdiy/detail/collectives.hpp        |   54 +
 include/vtkmdiy/detail/reduce/all-to-all.hpp  |  169 +
 include/vtkmdiy/detail/traits.hpp             |  318 ++
 include/vtkmdiy/fmt/format.cc                 |  535 +++
 include/vtkmdiy/fmt/format.h                  | 4014 +++++++++++++++++
 include/vtkmdiy/fmt/ostream.cc                |   35 +
 include/vtkmdiy/fmt/ostream.h                 |  105 +
 include/vtkmdiy/grid.hpp                      |  153 +
 include/vtkmdiy/io/block.hpp                  |  396 ++
 include/vtkmdiy/io/bov.hpp                    |  171 +
 include/vtkmdiy/io/numpy.hpp                  |  213 +
 include/vtkmdiy/link.hpp                      |  219 +
 include/vtkmdiy/log.hpp                       |  103 +
 include/vtkmdiy/master.hpp                    | 1203 +++++
 include/vtkmdiy/mpi.hpp                       |   32 +
 include/vtkmdiy/mpi/collectives.hpp           |  328 ++
 include/vtkmdiy/mpi/communicator.hpp          |   72 +
 include/vtkmdiy/mpi/constants.hpp             |   13 +
 include/vtkmdiy/mpi/datatypes.hpp             |   63 +
 include/vtkmdiy/mpi/io.hpp                    |  137 +
 include/vtkmdiy/mpi/operations.hpp            |   26 +
 include/vtkmdiy/mpi/optional.hpp              |   55 +
 include/vtkmdiy/mpi/point-to-point.hpp        |   98 +
 include/vtkmdiy/mpi/request.hpp               |   26 +
 include/vtkmdiy/mpi/status.hpp                |   30 +
 include/vtkmdiy/no-thread.hpp                 |   38 +
 include/vtkmdiy/partners/all-reduce.hpp       |   72 +
 include/vtkmdiy/partners/broadcast.hpp        |   62 +
 include/vtkmdiy/partners/common.hpp           |  204 +
 include/vtkmdiy/partners/merge.hpp            |   60 +
 include/vtkmdiy/partners/swap.hpp             |   43 +
 include/vtkmdiy/pick.hpp                      |  137 +
 include/vtkmdiy/point.hpp                     |  120 +
 include/vtkmdiy/proxy.hpp                     |  228 +
 include/vtkmdiy/reduce-operations.hpp         |   32 +
 include/vtkmdiy/reduce.hpp                    |  216 +
 include/vtkmdiy/serialization.hpp             |  456 ++
 include/vtkmdiy/stats.hpp                     |  115 +
 include/vtkmdiy/storage.hpp                   |  228 +
 include/vtkmdiy/thread.hpp                    |   31 +
 include/vtkmdiy/thread/fast_mutex.h           |  248 +
 include/vtkmdiy/time.hpp                      |   33 +
 include/vtkmdiy/types.hpp                     |   85 +
 include/vtkmdiy/vertices.hpp                  |   54 +
 58 files changed, 13629 insertions(+)
 create mode 100644 LEGAL.txt
 create mode 100644 LICENSE.txt
 create mode 100644 README.md
 create mode 100644 include/vtkmdiy/algorithms.hpp
 create mode 100644 include/vtkmdiy/assigner.hpp
 create mode 100644 include/vtkmdiy/collection.hpp
 create mode 100644 include/vtkmdiy/communicator.hpp
 create mode 100644 include/vtkmdiy/constants.h
 create mode 100644 include/vtkmdiy/critical-resource.hpp
 create mode 100644 include/vtkmdiy/decomposition.hpp
 create mode 100644 include/vtkmdiy/detail/algorithms/kdtree-sampling.hpp
 create mode 100644 include/vtkmdiy/detail/algorithms/kdtree.hpp
 create mode 100644 include/vtkmdiy/detail/algorithms/sort.hpp
 create mode 100644 include/vtkmdiy/detail/block_traits.hpp
 create mode 100644 include/vtkmdiy/detail/collectives.hpp
 create mode 100644 include/vtkmdiy/detail/reduce/all-to-all.hpp
 create mode 100644 include/vtkmdiy/detail/traits.hpp
 create mode 100644 include/vtkmdiy/fmt/format.cc
 create mode 100644 include/vtkmdiy/fmt/format.h
 create mode 100644 include/vtkmdiy/fmt/ostream.cc
 create mode 100644 include/vtkmdiy/fmt/ostream.h
 create mode 100644 include/vtkmdiy/grid.hpp
 create mode 100644 include/vtkmdiy/io/block.hpp
 create mode 100644 include/vtkmdiy/io/bov.hpp
 create mode 100644 include/vtkmdiy/io/numpy.hpp
 create mode 100644 include/vtkmdiy/link.hpp
 create mode 100644 include/vtkmdiy/log.hpp
 create mode 100644 include/vtkmdiy/master.hpp
 create mode 100644 include/vtkmdiy/mpi.hpp
 create mode 100644 include/vtkmdiy/mpi/collectives.hpp
 create mode 100644 include/vtkmdiy/mpi/communicator.hpp
 create mode 100644 include/vtkmdiy/mpi/constants.hpp
 create mode 100644 include/vtkmdiy/mpi/datatypes.hpp
 create mode 100644 include/vtkmdiy/mpi/io.hpp
 create mode 100644 include/vtkmdiy/mpi/operations.hpp
 create mode 100644 include/vtkmdiy/mpi/optional.hpp
 create mode 100644 include/vtkmdiy/mpi/point-to-point.hpp
 create mode 100644 include/vtkmdiy/mpi/request.hpp
 create mode 100644 include/vtkmdiy/mpi/status.hpp
 create mode 100644 include/vtkmdiy/no-thread.hpp
 create mode 100644 include/vtkmdiy/partners/all-reduce.hpp
 create mode 100644 include/vtkmdiy/partners/broadcast.hpp
 create mode 100644 include/vtkmdiy/partners/common.hpp
 create mode 100644 include/vtkmdiy/partners/merge.hpp
 create mode 100644 include/vtkmdiy/partners/swap.hpp
 create mode 100644 include/vtkmdiy/pick.hpp
 create mode 100644 include/vtkmdiy/point.hpp
 create mode 100644 include/vtkmdiy/proxy.hpp
 create mode 100644 include/vtkmdiy/reduce-operations.hpp
 create mode 100644 include/vtkmdiy/reduce.hpp
 create mode 100644 include/vtkmdiy/serialization.hpp
 create mode 100644 include/vtkmdiy/stats.hpp
 create mode 100644 include/vtkmdiy/storage.hpp
 create mode 100644 include/vtkmdiy/thread.hpp
 create mode 100644 include/vtkmdiy/thread/fast_mutex.h
 create mode 100644 include/vtkmdiy/time.hpp
 create mode 100644 include/vtkmdiy/types.hpp
 create mode 100644 include/vtkmdiy/vertices.hpp

diff --git a/LEGAL.txt b/LEGAL.txt
new file mode 100644
index 000000000..66955ef03
--- /dev/null
+++ b/LEGAL.txt
@@ -0,0 +1,19 @@
+Copyright Notice
+
+DIY2, Copyright (c) 2015, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from the U.S. Dept. of Energy).  All rights reserved.
+
+If you have questions about your rights to use or distribute this software,
+please contact Berkeley Lab's Technology Transfer Department at  TTD@lbl.gov.
+
+NOTICE.  This software is owned by the U.S. Department of Energy.  As such, the
+U.S. Government has been granted for itself and others acting on its behalf a
+paid-up, nonexclusive, irrevocable, worldwide license in the Software to
+reproduce, prepare derivative works, and perform publicly and display publicly.
+Beginning five (5) years after the date permission to assert copyright is
+obtained from the U.S. Department of Energy, and subject to any subsequent five
+(5) year renewals, the U.S. Government is granted for itself and others acting
+on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
+Software to reproduce, prepare derivative works, distribute copies to the
+public, perform publicly and display publicly, and to permit others to do so.
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 000000000..7607d2ca1
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,41 @@
+License Agreement
+
+"DIY2, Copyright (c) 2015, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from the U.S. Dept. of Energy).  All rights reserved."
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+(1) Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+(2) Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+(3) Neither the name of the University of California, Lawrence Berkeley National
+Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used
+to endorse or promote products derived from this software without specific prior
+written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You are under no obligation whatsoever to provide any bug fixes, patches, or
+upgrades to the features, functionality or performance of the source code
+("Enhancements") to anyone; however, if you choose to make your Enhancements
+available either publicly, or directly to Lawrence Berkeley National Laboratory,
+without imposing a separate written license agreement for such Enhancements,
+then you hereby grant the following license: a  non-exclusive, royalty-free
+perpetual license to install, use, modify, prepare derivative works, incorporate
+into other computer software, distribute, and sublicense such enhancements or
+derivative works thereof, in binary and source code form.
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..a2ab00c94
--- /dev/null
+++ b/README.md
@@ -0,0 +1,85 @@
+## DIY is a block-parallel library
+
+DIY is a block-parallel library for implementing scalable algorithms that can execute both
+in-core and out-of-core. The same program can be executed with one or more threads per MPI
+process, seamlessly combining distributed-memory message passing with shared-memory thread
+parallelism.  The abstraction enabling these capabilities is block parallelism; blocks
+and their message queues are mapped onto processing elements (MPI processes or threads) and are
+migrated between memory and storage by the DIY runtime. Complex communication patterns,
+including neighbor exchange, merge reduction, swap reduction, and all-to-all exchange, are
+possible in- and out-of-core in DIY.
+
+## Licensing
+
+DIY is released as open source software under a BSD-style [license](./LICENSE.txt).
+
+## Dependencies
+
+DIY requires an MPI installation. We recommend [MPICH](http://www.mpich.org/).
+
+## Download, build, install
+
+- You can clone this repository, or
+
+- You can download the [latest tarball](https://github.com/diatomic/diy2/archive/master.tar.gz).
+
+
+DIY is a header-only library. It does not need to be built; you can simply
+include it in your project. The examples can be built using `cmake` from the
+top-level directory.
+
+## Documentation
+
+[Doxygen pages](https://diatomic.github.io/diy)
+
+## Example
+
+A simple DIY program, shown below, consists of the following components:
+
+- `struct`s called blocks,
+- a diy object called the `master`,
+- a set of callback functions performed on each block by `master.foreach()`,
+- optionally, one or more message exchanges between the blocks by `master.exchange()`, and
+- possibly other collectives and global reductions, not shown below.
+
+The callback functions (`enqueue_local()` and `average()` in the example below) receive the block
+pointer and a communication proxy for the message exchange between blocks. It is usual for the
+callback functions to enqueue or dequeue messages from the proxy, so that information can be
+received and sent during rounds of message exchange.
+
+```cpp
+    // --- main program --- //
+
+    struct Block { float local, average; };             // define your block structure
+
+    Master master(world);                               // world = MPI_Comm
+    ...                                                 // populate master with blocks
+    master.foreach(&enqueue_local);                     // call enqueue_local() for each block
+    master.exchange();                                  // exchange enqueued data between blocks
+    master.foreach(&average);                           // call average() for each block
+
+    // --- callback functions --- //
+
+    // enqueue block data prior to exchanging it
+    void enqueue_local(Block* b,                        // current block
+                       const Proxy& cp)                 // communication proxy provides access to the neighbor blocks
+    {
+        for (size_t i = 0; i < cp.link()->size(); i++)  // for all neighbor blocks
+            cp.enqueue(cp.link()->target(i), b->local); // enqueue the data to be sent to this neighbor
+                                                        // block in the next exchange
+    }
+
+    // use the received data after exchanging it, in this case compute its average
+    void average(Block* b,                              // current block
+                 const Proxy& cp)                       // communication proxy provides access to the neighbor blocks
+    {
+        float x, average = 0;
+        for (size_t i = 0; i < cp.link()->size(); i++)  // for all neighbor blocks
+        {
+            cp.dequeue(cp.link()->target(i).gid, x);    // dequeue the data received from this
+                                                        // neighbor block in the last exchange
+            average += x;
+        }
+        b->average = average / cp.link()->size();
+    }
+```
diff --git a/include/vtkmdiy/algorithms.hpp b/include/vtkmdiy/algorithms.hpp
new file mode 100644
index 000000000..23215a2c3
--- /dev/null
+++ b/include/vtkmdiy/algorithms.hpp
@@ -0,0 +1,191 @@
+#ifndef DIY_ALGORITHMS_HPP
+#define DIY_ALGORITHMS_HPP
+
+#include <vector>
+
+#include "master.hpp"
+#include "assigner.hpp"
+#include "reduce.hpp"
+#include "reduce-operations.hpp"
+#include "partners/swap.hpp"
+
+#include "detail/algorithms/sort.hpp"
+#include "detail/algorithms/kdtree.hpp"
+#include "detail/algorithms/kdtree-sampling.hpp"
+
+#include "log.hpp"
+
+namespace diy
+{
+    /**
+     * \ingroup Algorithms
+     * \brief sample sort `values` of each block, store the boundaries between blocks in `samples`
+     */
+    template<class Block, class T, class Cmp>
+    void sort(Master&                   master,               //!< master object
+              const Assigner&           assigner,             //!< assigner object
+              std::vector<T> Block::*   values,               //!< all values to sort (pointer to the Block member holding them)
+              std::vector<T> Block::*   samples,              //!< (output) boundaries of blocks (pointer to the Block member receiving them)
+              size_t                    num_samples,          //!< desired number of samples
+              const Cmp&                cmp,                  //!< comparison function
+              int                       k   = 2,              //!< k-ary reduction will be used
+              bool                      samples_only = false) //!< false: results will be all_to_all exchanged; true: only sort but don't exchange results
+    {
+        bool immediate = master.immediate();    // remember the caller's setting; it is put back on the last line
+        master.set_immediate(false);
+
+        // NB: although sorter will go out of scope, its member functions sample()
+        //     and exchange() will return functors whose copies get saved inside reduce
+        detail::SampleSort<Block,T,Cmp> sorter(values, samples, cmp, num_samples);
+
+        // swap-reduce to all-gather samples
+        RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()), assigner.nblocks());
+        RegularSwapPartners   partners(decomposer, k);
+        reduce(master, assigner, partners, sorter.sample(), detail::SkipIntermediate(partners.rounds()));
+
+        // all_to_all to exchange the values
+        if (!samples_only)
+            all_to_all(master, assigner, sorter.exchange(), k);
+
+        master.set_immediate(immediate);        // NOTE(review): not reached if reduce()/all_to_all() throws
+    }
+
+
+    /**
+     * \ingroup Algorithms
+     * \brief sample sort `values` of each block, store the boundaries between blocks in `samples`
+     * Convenience overload: forwards to the sort() above with `std::less<T>()` as the comparator;
+     * `samples_only` keeps its default (false), so the all_to_all exchange is always performed.
+     */
+    template<class Block, class T>
+    void sort(Master&                   master,      //!< master object
+              const Assigner&           assigner,    //!< assigner object
+              std::vector<T> Block::*   values,      //!< all values to sort
+              std::vector<T> Block::*   samples,     //!< (output) boundaries of blocks
+              size_t                    num_samples, //!< desired number of samples
+              int                       k   = 2)     //!< k-ary reduction will be used
+    {
+        sort(master, assigner, values, samples, num_samples, std::less<T>(), k);
+    }
+
+    /**
+     * \ingroup Algorithms
+     * \brief build a kd-tree and sort a set of points into it (use histograms to determine split values)
+     */
+    template<class Block, class Point>
+    void kdtree(Master&                         master,      //!< master object
+                const Assigner&                 assigner,    //!< assigner object
+                int                             dim,         //!< dimensionality
+                const ContinuousBounds&         domain,      //!< global data extents
+                std::vector<Point>  Block::*    points,      //!< input points to sort into kd-tree
+                size_t                          bins,        //!< number of histogram bins for splitting a dimension
+                bool                            wrap = false)//!< periodic boundaries in all dimensions
+    {
+        if (assigner.nblocks() & (assigner.nblocks() - 1))   // non-zero when nblocks has more than one bit set (0 also passes)
+            throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
+
+        typedef     diy::RegularContinuousLink      RCLink;
+
+        for (size_t i = 0; i < master.size(); ++i)           // overwrite the link of every local block
+        {
+            RCLink* link   = static_cast<RCLink*>(master.link(i));   // unchecked cast: assumes the stored link is an RCLink
+            *link = RCLink(dim, domain, domain);
+
+            if (wrap)       // set up the links to self
+            {
+                diy::BlockID self = { master.gid(i), master.communicator().rank() };
+                for (int j = 0; j < dim; ++j)
+                {
+                    diy::Direction dir, wrap_dir;             // presumably start out all-zero -- TODO confirm
+
+                    // left: the block is its own neighbor along axis j, direction and wrap both -1
+                    dir[j] = -1; wrap_dir[j] = -1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+
+                    // right: same along axis j, direction and wrap both +1
+                    dir[j] = 1; wrap_dir[j] = 1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+                }
+            }
+        }
+
+        detail::KDTreePartition<Block,Point>    kdtree_partition(dim, points, bins);
+
+        detail::KDTreePartners                  partners(dim, assigner.nblocks(), wrap, domain);
+        reduce(master, assigner, partners, kdtree_partition);
+
+        // update master.expected to match the links
+        int expected = 0;
+        for (size_t i = 0; i < master.size(); ++i)
+            expected += master.link(i)->size_unique();       // summed over all local blocks
+        master.set_expected(expected);
+    }
+
+    /**
+     * \ingroup Algorithms
+     * \brief build a kd-tree and sort a set of points into it (use sampling to determine split values)
+     */
+    template<class Block, class Point>
+    void kdtree_sampling
+               (Master&                         master,      //!< master object
+                const Assigner&                 assigner,    //!< assigner object
+                int                             dim,         //!< dimensionality
+                const ContinuousBounds&         domain,      //!< global data extents
+                std::vector<Point>  Block::*    points,      //!< input points to sort into kd-tree
+                size_t                          samples,     //!< number of samples to take in each block
+                bool                            wrap = false)//!< periodic boundaries in all dimensions
+    {
+        if (assigner.nblocks() & (assigner.nblocks() - 1))   // non-zero when nblocks has more than one bit set (0 also passes)
+            throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
+
+        typedef     diy::RegularContinuousLink      RCLink;
+
+        for (size_t i = 0; i < master.size(); ++i)           // overwrite the link of every local block
+        {
+            RCLink* link   = static_cast<RCLink*>(master.link(i));   // unchecked cast: assumes the stored link is an RCLink
+            *link = RCLink(dim, domain, domain);
+
+            if (wrap)       // set up the links to self
+            {
+                diy::BlockID self = { master.gid(i), master.communicator().rank() };
+                for (int j = 0; j < dim; ++j)
+                {
+                    diy::Direction dir, wrap_dir;             // presumably start out all-zero -- TODO confirm
+
+                    // left: the block is its own neighbor along axis j, direction and wrap both -1
+                    dir[j] = -1; wrap_dir[j] = -1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+
+                    // right: same along axis j, direction and wrap both +1
+                    dir[j] = 1; wrap_dir[j] = 1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+                }
+            }
+        }
+
+        detail::KDTreeSamplingPartition<Block,Point>    kdtree_partition(dim, points, samples);
+
+        detail::KDTreePartners                          partners(dim, assigner.nblocks(), wrap, domain);
+        reduce(master, assigner, partners, kdtree_partition);
+
+        // update master.expected to match the links
+        int expected = 0;
+        for (size_t i = 0; i < master.size(); ++i)
+            expected += master.link(i)->size_unique();       // summed over all local blocks
+        master.set_expected(expected);
+    }
+}
+
+#endif
diff --git a/include/vtkmdiy/assigner.hpp b/include/vtkmdiy/assigner.hpp
new file mode 100644
index 000000000..957596ddc
--- /dev/null
+++ b/include/vtkmdiy/assigner.hpp
@@ -0,0 +1,126 @@
+#ifndef DIY_ASSIGNER_HPP
+#define DIY_ASSIGNER_HPP
+
+#include <vector>
+
+namespace diy
+{
+  // Abstract base for block-to-rank assignment. Derived types must define
+  //   int  rank(int gid) const                                   -- rank that global block id `gid` is assigned to
+  //   void local_gids(int rank, std::vector<int>& gids) const    -- gids assigned to `rank`
+  class Assigner
+  {
+    public:
+     /**
+      * \ingroup Assignment
+      * \brief Manages how blocks are assigned to processes
+      */
+                    Assigner(int size,     //!< total number of processes
+                             int nblocks   //!< total (global) number of blocks
+                             ): size_(size), nblocks_(nblocks)    {}
+      virtual       ~Assigner()                         = default;  // polymorphic base: allow deletion through Assigner*
+
+      //! returns the total number of process ranks
+      int           size() const                        { return size_; }
+      //! returns the total number of global blocks
+      int           nblocks() const                     { return nblocks_; }
+      //! sets the total number of global blocks
+      void          set_nblocks(int nblocks)            { nblocks_ = nblocks; }
+      //! gets the local gids for a given process rank
+      virtual void  local_gids(int rank, std::vector<int>& gids) const   =0;
+      //! returns the process rank of the block with global id gid (need not be local)
+      virtual int   rank(int gid) const     =0;
+
+    private:
+      int           size_;      // total number of ranks
+      int           nblocks_;   // total number of blocks
+  };
+
+  class ContiguousAssigner: public Assigner
+  {
+    public:
+     /**
+      * \ingroup Assignment
+      * \brief Hands out gids (block global ids) to processes in consecutive runs
+      */
+            ContiguousAssigner(int size,     //!< total number of processes
+                               int nblocks   //!< total (global) number of blocks
+                               ): Assigner(size, nblocks)   {}
+
+      // re-export the base-class accessors used by rank() and local_gids()
+      using Assigner::nblocks;
+      using Assigner::size;
+
+      int   rank(int gid) const override
+      {
+          // the first `extra` ranks hold `base + 1` blocks each, the rest hold `base`
+          const int base  = nblocks() / size();
+          const int extra = nblocks() % size();
+
+          const int candidate = gid / (base + 1);
+          if (candidate < extra)
+              return candidate;             // gid lies within the larger leading ranks
+
+          // past the leading ranks: count the remainder in strides of `base`
+          return extra + (gid - (base + 1) * extra) / base;
+      }
+      inline
+      void  local_gids(int rank, std::vector<int>& gids) const override;
+  };
+
+  class RoundRobinAssigner: public Assigner
+  {
+    public:
+     /**
+      * \ingroup Assignment
+      * \brief Assigns blocks to processes in cyclic or round-robin gid (block global id) order
+      */
+            RoundRobinAssigner(int size,     //!< total number of processes
+                               int nblocks   //!< total (global) number of blocks
+                               ):
+              Assigner(size, nblocks)           {}
+
+      using Assigner::size;
+      using Assigner::nblocks;
+
+      int   rank(int gid) const override        { return gid % size(); }    // gids cycle through the ranks
+      inline
+      void  local_gids(int rank, std::vector<int>& gids) const override;    // defined after the namespace, at the end of this header
+  };
+}
+
+void
+diy::ContiguousAssigner::
+local_gids(int rank, std::vector<int>& gids) const
+{
+  // The first `extra` ranks own `base + 1` consecutive gids, the others own `base`.
+  const int base  = nblocks() / size();
+  const int extra = nblocks() % size();
+
+  // First gid owned by rank r.
+  auto first_gid = [base, extra](int r)
+  {
+    return (r < extra) ? r * (base + 1)
+                       : extra * (base + 1) + (r - extra) * base;
+  };
+
+  // `rank` owns the half-open range [first_gid(rank), first_gid(rank + 1)); append it to gids
+  const int begin = first_gid(rank);
+  const int end   = first_gid(rank + 1);
+  for (int gid = begin; gid < end; ++gid)
+    gids.push_back(gid);
+}
+
+void
+diy::RoundRobinAssigner::
+local_gids(int rank, std::vector<int>& gids) const
+{
+  // Append the gids owned by `rank`: rank, rank + size(), rank + 2*size(), ...
+  const int stride = size();
+  const int total  = nblocks();
+
+  for (int gid = rank; gid < total; gid += stride)
+    gids.push_back(gid);
+}
+
+#endif
diff --git a/include/vtkmdiy/collection.hpp b/include/vtkmdiy/collection.hpp
new file mode 100644
index 000000000..c24af95f5
--- /dev/null
+++ b/include/vtkmdiy/collection.hpp
@@ -0,0 +1,121 @@
+#ifndef DIY_COLLECTION_HPP
+#define DIY_COLLECTION_HPP
+
+#include <vector>
+
+#include "serialization.hpp"
+#include "storage.hpp"
+#include "thread.hpp"
+
+
+namespace diy
+{
+  class Collection    // container of untyped elements, each either held in memory or moved to external storage
+  {
+    public:
+      typedef       void*                                       Element;        // elements are handled as untyped pointers
+      typedef       std::vector<Element>                        Elements;
+      typedef       critical_resource<int, recursive_mutex>     CInt;           // int guarded by a recursive mutex
+
+      typedef       void* (*Create)();                                          // allocates a new, empty element
+      typedef       void  (*Destroy)(void*);                                    // frees an element
+      typedef       detail::Save                                Save;
+      typedef       detail::Load                                Load;
+
+    public:
+                    Collection(Create               create,
+                               Destroy              destroy,
+                               ExternalStorage*     storage,
+                               Save                 save,
+                               Load                 load):
+                        create_(create),
+                        destroy_(destroy),
+                        storage_(storage),
+                        save_(save),
+                        load_(load),
+                        in_memory_(0)               {}
+
+      size_t        size() const                    { return elements_.size(); }
+      const CInt&   in_memory() const               { return in_memory_; }
+      inline void   clear();
+
+      // add(): appends e (marked as not in external storage), bumps the in-memory counter and returns the new index
+      int           add(Element e)                  { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return elements_.size() - 1; }
+      void*         release(int i)                  { void* e = get(i); elements_[i] = 0; return e; }   // NOTE(review): in_memory_ is not decremented here -- confirm intended
+
+      void*         find(int i) const               { return elements_[i]; }                        // possibly returns 0, if the element is unloaded
+      void*         get(int i)                      { if (!find(i)) load(i); return find(i); }      // loads the element first, and then returns its address
+
+      int           available() const               { int i = 0; for (; i < (int)size(); ++i) if (find(i) != 0) break; return i; }  // index of the first in-memory element, size() if none
+
+      inline void   load(int i);
+      inline void   unload(int i);
+
+      Create        creator() const                 { return create_; }
+      Destroy       destroyer() const               { return destroy_; }
+      Load          loader() const                  { return load_; }
+      Save          saver() const                   { return save_; }
+
+      void*         create() const                  { return create_(); }
+      void          destroy(int i)                  { if (find(i)) { destroy_(find(i)); elements_[i] = 0; } else if (external_[i] != -1) storage_->destroy(external_[i]); }  // frees the in-memory element, else its externally stored copy (if any)
+
+      bool          own() const                     { return destroy_ != 0; }   // true when a Destroy function was supplied
+
+      ExternalStorage*      storage() const         { return storage_; }
+
+    private:
+      Create                create_;
+      Destroy               destroy_;
+      ExternalStorage*      storage_;
+      Save                  save_;
+      Load                  load_;
+
+      Elements              elements_;      // element pointers; 0 for elements that are not in memory
+      std::vector<int>      external_;      // per-element id returned by storage_->put(); -1 when not stored externally
+      CInt                  in_memory_;     // counter incremented by add()/load(), decremented by unload()
+  };
+}
+
+void
+diy::Collection::
+clear()
+{
+  const size_t count = own() ? size() : 0;     // elements are only destroyed when the collection owns them
+  for (size_t idx = 0; idx < count; ++idx)
+    destroy(idx);
+  elements_.clear();
+  external_.clear();
+  *in_memory_.access() = 0;
+}
+
+void
+diy::Collection::
+unload(int i)
+{
+  // Move element i out of memory: hand it to external storage, then free the in-memory copy.
+  void* e = find(i);        // NOTE(review): not checked for 0 -- assumes element i is currently in memory
+  // storage_->put() is given save_ to write e out; the id it returns is what load() later
+  // passes to storage_->get()   (previously: save_(e, bb); external_[i] = storage_->put(bb);)
+  external_[i] = storage_->put(e, save_);
+
+  destroy_(e);
+  elements_[i] = 0;         // mark the slot as unloaded
+
+  --(*in_memory_.access());
+}
+
+void
+diy::Collection::
+load(int i)
+{
+  // Bring element i back into memory: create an empty element and let storage_->get() fill it
+  // via load_   (previously: storage_->get(external_[i], bb); load_(e, bb);)
+  void* e = create_();
+  // NOTE(review): external_[i] is not checked against -1 -- assumes the element was unloaded before
+  storage_->get(external_[i], e, load_);
+  elements_[i] = e;
+  external_[i] = -1;        // no longer held in external storage
+
+  ++(*in_memory_.access());
+}
+
+#endif
diff --git a/include/vtkmdiy/communicator.hpp b/include/vtkmdiy/communicator.hpp
new file mode 100644
index 000000000..b95708298
--- /dev/null
+++ b/include/vtkmdiy/communicator.hpp
@@ -0,0 +1,13 @@
+#ifndef DIY_COMMUNICATOR_HPP
+#define DIY_COMMUNICATOR_HPP
+
+#warning "diy::Communicator (in diy/communicator.hpp) is deprecated, use diy::mpi::communicator directly"
+
+#include "mpi.hpp"
+
+namespace diy
+{
+  using Communicator = mpi::communicator;   // deprecated alias; use diy::mpi::communicator directly
+}
+
+#endif
diff --git a/include/vtkmdiy/constants.h b/include/vtkmdiy/constants.h
new file mode 100644
index 000000000..e3c9cc563
--- /dev/null
+++ b/include/vtkmdiy/constants.h
@@ -0,0 +1,22 @@
+#ifndef DIY_CONSTANTS_H
+#define DIY_CONSTANTS_H
+
+// Default DIY_MAX_DIM to 4, unless provided by the user
+// (used for static min/max size in various Bounds)
+#ifndef DIY_MAX_DIM
+#define DIY_MAX_DIM 4
+#endif
+
+enum    // neighbor flags: one distinct bit per side of each of the x, y, z and t axes
+{
+  DIY_X0 = 0x01, /* minimum-side x (left) neighbor */
+  DIY_X1 = 0x02, /* maximum-side x (right) neighbor */
+  DIY_Y0 = 0x04, /* minimum-side y (bottom) neighbor */
+  DIY_Y1 = 0x08, /* maximum-side y (top) neighbor */
+  DIY_Z0 = 0x10, /* minimum-side z (back) neighbor */
+  DIY_Z1 = 0x20, /* maximum-side z (front) neighbor */
+  DIY_T0 = 0x40, /* minimum-side t (earlier) neighbor */
+  DIY_T1 = 0x80  /* maximum-side t (later) neighbor */
+};
+
+#endif
diff --git a/include/vtkmdiy/critical-resource.hpp b/include/vtkmdiy/critical-resource.hpp
new file mode 100644
index 000000000..61a5a4b8a
--- /dev/null
+++ b/include/vtkmdiy/critical-resource.hpp
@@ -0,0 +1,53 @@
+#ifndef DIY_CRITICAL_RESOURCE_HPP
+#define DIY_CRITICAL_RESOURCE_HPP
+
+namespace diy
+{
+  // TODO: when not running under C++11, i.e., when lock_guard is TinyThread's
+  //       lock_guard, and not C++11's unique_lock, this implementation might
+  //       be buggy since the copy constructor is invoked when
+  //       critical_resource::access() returns an instance of this class. Once
+  //       the temporary is destroyed the mutex is unlocked. I'm not 100%
+  //       certain of this because I'd expect a deadlock on copy constructor,
+  //       but it's clearly not happening -- so I may be missing something.
+  //       (This issue will take care of itself in DIY3 once we switch to C++11 completely.)
+  template<class T, class Mutex>
+  class resource_accessor      // pointer-like handle to a T that carries a lock_guard built from the resource's mutex
+  {
+    public:
+                resource_accessor(T& x, Mutex& m):
+                    x_(x), lock_(m)                         {}      // lock_ is constructed from m (presumably acquiring it -- see TODO above)
+
+      T&        operator*()                                 { return x_; }
+      T*        operator->()                                { return &x_; }
+      const T&  operator*() const                           { return x_; }
+      const T*  operator->() const                          { return &x_; }
+
+    private:
+      T&                        x_;         // the guarded object, held by reference
+      lock_guard<Mutex>         lock_;      // lives exactly as long as this accessor
+  };
+
+  template<class T, class Mutex = fast_mutex>
+  class critical_resource      // a T stored together with a mutex; the T is only reachable through accessor objects
+  {
+    public:
+      typedef           resource_accessor<T, Mutex>         accessor;
+      typedef           resource_accessor<const T, Mutex>   const_accessor;     // eventually, try shared locking
+
+    public:
+                        critical_resource()                 {}                  // x_ is default-initialized
+                        critical_resource(const T& x):
+                            x_(x)                           {}                  // x_ starts as a copy of x
+
+      accessor          access()                            { return accessor(x_, m_); }         // mutable access, paired with m_
+      const_accessor    const_access() const                { return const_accessor(x_, m_); }   // read-only access, paired with the same m_
+
+    private:
+      T                 x_;
+      mutable Mutex     m_;     // mutable so that const_access() can hand it to the accessor
+  };
+}
+
+
+#endif
diff --git a/include/vtkmdiy/decomposition.hpp b/include/vtkmdiy/decomposition.hpp
new file mode 100644
index 000000000..51dfc5af2
--- /dev/null
+++ b/include/vtkmdiy/decomposition.hpp
@@ -0,0 +1,716 @@
+#ifndef DIY_DECOMPOSITION_HPP
+#define DIY_DECOMPOSITION_HPP
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <cmath>
+#include <sstream>
+#include <stdexcept>
+
+#include "link.hpp"
+#include "assigner.hpp"
+#include "master.hpp"
+
+namespace diy
+{
+namespace detail
+{
+  template<class Bounds_, class Enable = void>
+  struct BoundsHelper;
+
+  // discrete bounds: integral coordinates; a block covers the inclusive range [from, to]
+  template<class Bounds>
+  struct BoundsHelper<Bounds, typename std::enable_if<std::is_integral<typename Bounds::Coordinate>::value>::type>
+  {
+    using Coordinate = typename Bounds::Coordinate;
+    // from/to: first and last coordinate of block i out of n; the integer block width is (max - min + 1)/n
+    static Coordinate   from(int i, int n, Coordinate min, Coordinate max, bool)          { return min + (max - min + 1)/n * i; }
+    static Coordinate   to  (int i, int n, Coordinate min, Coordinate max, bool shared_face)
+    {
+      if (i == n - 1)
+        return max;                                         // the last block always ends at the domain maximum
+      else
+        return from(i+1, n, min, max, shared_face) - (shared_face ? 0 : 1);   // a shared face ends on the next block's start
+    }
+
+    static int          lower(Coordinate x, int n, Coordinate min, Coordinate max, bool shared)
+    {
+        Coordinate width = (max - min + 1)/n;               // NOTE(review): 0 if n > max - min + 1; the division below is then undefined
+        Coordinate res = (x - min)/width;
+        if (res >= n) res = n - 1;                          // clamp from above only
+
+        if (shared && x == from(res, n, min, max, shared))  // x lies on the shared start face of block res:
+            --res;                                          // the range of candidate blocks starts one lower
+        return res;
+    }
+    static int          upper(Coordinate x, int n, Coordinate min, Coordinate max, bool shared)
+    {
+        Coordinate width = (max - min + 1)/n;
+        Coordinate res = (x - min)/width + 1;               // one past the index computed from the block width
+        if (shared && x == from(res, n, min, max, shared))  // x lies on the start face of block res:
+            ++res;                                          // include that block as well
+        return res;
+    }
+  };
+
+  // continuous bounds: floating-point coordinates; block i spans [from, to] with width (max - min)/n
+  template<class Bounds>
+  struct BoundsHelper<Bounds, typename std::enable_if<std::is_floating_point<typename Bounds::Coordinate>::value>::type>
+  {
+    using Coordinate = typename Bounds::Coordinate;
+
+    static Coordinate   from(int i, int n, Coordinate min, Coordinate max, bool)      { return min + (max - min)/n * i; }
+    static Coordinate   to  (int i, int n, Coordinate min, Coordinate max, bool)      { return min + (max - min)/n * (i+1); }
+    // lower/upper: floor/ceil of (x - min)/width; an x exactly on a block boundary is pushed one block outwards
+    static int          lower(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; Coordinate res = std::floor((x - min)/width); if (min + res*width == x) return (res - 1); else return res; }
+    static int          upper(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; Coordinate res = std::ceil ((x - min)/width); if (min + res*width == x) return (res + 1); else return res; }
+  };
+}
+
+  //! \ingroup Decomposition
+  //! Decomposes a regular (discrete or continuous) domain into even blocks;
+  //! creates Links with Bounds along the way.
+  template<class Bounds_>
+  struct RegularDecomposer
+  {
+    typedef         Bounds_                                         Bounds;
+    typedef         typename BoundsValue<Bounds>::type              Coordinate;
+    typedef         typename RegularLinkSelector<Bounds>::type      Link;
+    // Callback argument order: Creator(gid, core, bounds, domain, link); Updater(gid, lid, core, bounds, domain, link)
+    using Creator = std::function<void(int,      Bounds, Bounds, Bounds, Link)>;
+    using Updater = std::function<void(int, int, Bounds, Bounds, Bounds, Link)>;
+
+    typedef         std::vector<bool>                               BoolVector;
+    typedef         std::vector<Coordinate>                         CoordinateVector;
+    typedef         std::vector<int>                                DivisionsVector;
+
+    /// @param dim_        dimensionality of the decomposition
+    /// @param domain_     bounds of global domain
+    /// @param nblocks_    total number of global blocks
+    /// @param share_face_ indicates dimensions on which to share block faces
+    /// @param wrap_       indicates dimensions on which to wrap the boundary
+    /// @param ghosts_     indicates how many ghosts to use in each dimension
+    /// @param divisions_  indicates how many cuts to make along each dimension
+    ///                   (0 means "no constraint," i.e., leave it up to the algorithm)
+                    RegularDecomposer(int               dim_,
+                                      const Bounds&     domain_,
+                                      int               nblocks_,
+                                      BoolVector        share_face_ = BoolVector(),
+                                      BoolVector        wrap_       = BoolVector(),
+                                      CoordinateVector  ghosts_     = CoordinateVector(),
+                                      DivisionsVector   divisions_  = DivisionsVector()):
+                      dim(dim_), domain(domain_), nblocks(nblocks_),
+                      share_face(share_face_),
+                      wrap(wrap_), ghosts(ghosts_), divisions(divisions_)
+    {
+      if ((int) share_face.size() < dim)  share_face.resize(dim);     // vectors shorter than dim are padded with value-initialized entries
+      if ((int) wrap.size() < dim)        wrap.resize(dim);
+      if ((int) ghosts.size() < dim)      ghosts.resize(dim);
+      if ((int) divisions.size() < dim)   divisions.resize(dim);
+      // zero entries of divisions are filled in here; fill_divisions throws std::runtime_error when that is impossible
+      fill_divisions(divisions);
+    }
+
+    // Calls create(gid, core, bounds, domain, link) for every block that assigner places on rank
+    void            decompose(int rank, const Assigner& assigner, const Creator& create);
+    // Replaces the links of blocks already in master, then calls update(gid, lid, core, bounds, domain, link)
+    void            decompose(int rank, const Assigner& assigner, Master& master, const Updater& update);
+    // Creates every local block with master.create() and adds it to master together with its link
+    void            decompose(int rank, const Assigner& assigner, Master& master);
+
+    // find lowest gid that owns a particular point
+    template<class Point>
+    int             lowest_gid(const Point& p) const;
+    // gid <-> block coordinates; dimension 0 varies fastest
+    void            gid_to_coords(int gid, DivisionsVector& coords) const       { gid_to_coords(gid, coords, divisions); }
+    int             coords_to_gid(const DivisionsVector& coords) const          { return coords_to_gid(coords, divisions); }
+    void            fill_divisions(std::vector<int>& divisions) const;
+
+    void            fill_bounds(Bounds& bounds, const DivisionsVector& coords, bool add_ghosts = false) const;
+    void            fill_bounds(Bounds& bounds, int gid, bool add_ghosts = false) const;
+
+    static bool     all(const std::vector<int>& v, int x);             // true iff every element of v equals x
+    static void     gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions);
+    static int      coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions);
+
+    static void     factor(std::vector<unsigned>& factors, int n);     // appends the prime factors of n, smallest first
+
+    // Point to GIDs functions
+    template<class Point>
+    void            point_to_gids(std::vector<int>& gids, const Point& p) const;
+
+    //! returns gid of a block that contains the point; ignores ghosts
+    template<class Point>
+    int             point_to_gid(const Point& p) const;
+    //! product over all axes of the block index range sizes reported by top_bottom
+    template<class Point>
+    int             num_gids(const Point& p) const;
+    //! block index range [bottom, top) along axis for p widened by the ghost width
+    template<class Point>
+    void            top_bottom(int& top, int& bottom, const Point& p, int axis) const;
+
+
+    int               dim;            // dimensionality of the decomposition
+    Bounds            domain;         // bounds of the global domain
+    int               nblocks;        // total number of global blocks
+    BoolVector        share_face;     // per dimension: share block faces?
+    BoolVector        wrap;           // per dimension: wrap the boundary?
+    CoordinateVector  ghosts;         // per dimension: number of ghosts
+    DivisionsVector   divisions;      // per dimension: number of blocks along it
+
+  };
+
+  /**
+   * \ingroup Decomposition
+   * \brief Decomposes the domain into a prescribed pattern of blocks.
+   *
+   * @param dim        dimension of the domain
+   * @param rank       local rank
+   * @param domain     bounds of the global domain
+   * @param assigner   maps a gid to a rank; also communicates the total number of blocks
+   * @param create     the callback functor, called as create(gid, core, bounds, domain, link)
+   * @param share_face indicates dimensions on which to share block faces
+   * @param wrap       indicates dimensions on which to wrap the boundary
+   * @param ghosts     indicates how many ghosts to use in each dimension
+   * @param divs       how many cuts to make along each dimension (0 means "no constraint," i.e., leave it up to the algorithm)
+   *
+   * `create(...)` is called with each block assigned to the local domain. See [decomposition example](#decomposition-example).
+   */
+  template<class Bounds>
+  void decompose(int                dim,
+                 int                rank,
+                 const Bounds&      domain,
+                 const Assigner&    assigner,
+                 const typename RegularDecomposer<Bounds>::Creator&   create,
+                 typename RegularDecomposer<Bounds>::BoolVector       share_face = typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::BoolVector       wrap       = typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::CoordinateVector ghosts     = typename RegularDecomposer<Bounds>::CoordinateVector(),
+                 typename RegularDecomposer<Bounds>::DivisionsVector  divs       = typename RegularDecomposer<Bounds>::DivisionsVector())
+  {
+    RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, create);
+  }
+
+  /**
+   * \ingroup Decomposition
+   * \brief Decomposes the domain into a prescribed pattern of blocks.
+   *
+   * @param dim        dimension of the domain
+   * @param rank       local rank
+   * @param domain     bounds of the global domain
+   * @param assigner   maps a gid to a rank; also communicates the total number of blocks
+   * @param master     gets the blocks once this function returns
+   * @param share_face indicates dimensions on which to share block faces
+   * @param wrap       indicates dimensions on which to wrap the boundary
+   * @param ghosts     indicates how many ghosts to use in each dimension
+   * @param divs       how many cuts to make along each dimension (0 means "no constraint," i.e., leave it up to the algorithm)
+   *
+   * `master` must have been supplied a create function in order for this function to work.
+   */
+  template<class Bounds>
+  void decompose(int                dim,
+                 int                rank,
+                 const Bounds&      domain,
+                 const Assigner&    assigner,
+                 Master&            master,
+                 typename RegularDecomposer<Bounds>::BoolVector       share_face = typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::BoolVector       wrap       = typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::CoordinateVector ghosts     = typename RegularDecomposer<Bounds>::CoordinateVector(),
+                 typename RegularDecomposer<Bounds>::DivisionsVector  divs       = typename RegularDecomposer<Bounds>::DivisionsVector())
+  {
+    RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, master);
+  }
+
+  /**
+   * \ingroup Decomposition
+   * \brief A "null" decomposition that simply creates the blocks and adds them to the master
+   *
+   * @param rank       local rank
+   * @param assigner   maps a gid to a rank; supplies the gids that are local to `rank`
+   * @param master     gets the blocks once this function returns; each block comes from
+   *                   master.create() and is paired with a default-constructed diy::Link
+   */
+  inline
+  void decompose(int                rank,
+                 const Assigner&    assigner,
+                 Master&            master)
+  {
+    std::vector<int>  gids;
+    assigner.local_gids(rank, gids);
+
+    for (int gid : gids)
+      master.add(gid, master.create(), new diy::Link);
+  }
+
+    /**
+     * \ingroup Decomposition
+     * \brief Add a decomposition (modify links) of an existing set of blocks that were added to the master previously
+     * (dim, domain, share_face, wrap, ghosts and divs are forwarded to the RegularDecomposer constructor)
+     * @param rank       local rank
+     * @param assigner   maps a gid to a rank; also communicates the total number of blocks
+     * @param master     holds the existing blocks; the link of every local block is replaced
+     * @param update     callback run after each replacement, called as update(gid, lid, core, bounds, domain, link)
+     */
+  template<class Bounds>
+  void decompose(int                dim,
+                 int                rank,
+                 const Bounds&      domain,
+                 const Assigner&    assigner,
+                 Master&            master,
+                 const typename RegularDecomposer<Bounds>::Updater&   update,
+                 typename RegularDecomposer<Bounds>::BoolVector       share_face =
+                 typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::BoolVector       wrap       =
+                 typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::CoordinateVector ghosts     =
+                 typename RegularDecomposer<Bounds>::CoordinateVector(),
+                 typename RegularDecomposer<Bounds>::DivisionsVector  divs       =
+                 typename RegularDecomposer<Bounds>::DivisionsVector())
+  {
+      RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).
+          decompose(rank, assigner, master, update);
+  }
+
+  //! Decomposition example: \example decomposition/test-decomposition.cpp
+  //! Direct master insertion example: \example decomposition/test-direct-master.cpp
+}
+
+// Decomposes the domain; each local block comes from master.create() and is added to master with a copy of its link.
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+decompose(int rank, const Assigner& assigner, Master& master)
+{
+  // core, bounds and domain are not needed to register a block, so those callback parameters stay unnamed
+  decompose(rank, assigner, [&master](int gid, const Bounds&, const Bounds&, const Bounds&, const Link& link)
+  {
+    void* block = master.create();
+    master.add(gid, block, new Link(link));
+  });
+}
+// For every gid local to rank: computes core and ghosted bounds, links all in-range neighbors, then calls create.
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+decompose(int rank, const Assigner& assigner, const Creator& create)
+{
+  std::vector<int> gids;
+  assigner.local_gids(rank, gids);
+  for (int i = 0; i < (int)gids.size(); ++i)
+  {
+    int gid = gids[i];
+
+    DivisionsVector coords;
+    gid_to_coords(gid, coords);
+
+    Bounds core, bounds;
+    fill_bounds(core,   coords);            // block extent without ghosts
+    fill_bounds(bounds, coords, true);      // block extent with ghosts
+
+    // Fill link with all the neighbors
+    Link link(dim, core, bounds);
+    std::vector<int>  offsets(dim, -1);
+    offsets[0] = -2;                        // one step before (-1, ..., -1), so the first increment lands on it
+    while (!all(offsets, 1))
+    {
+      // next offset: odometer-style increment over {-1, 0, 1} in every dimension
+      int i;                                // NB: shadows the index of the outer loop over gids
+      for (i = 0; i < dim; ++i)
+        if (offsets[i] == 1)
+          offsets[i] = -1;
+        else
+          break;
+      ++offsets[i];
+
+      if (all(offsets, 0)) continue;      // skip ourselves
+
+      DivisionsVector     nhbr_coords(dim);
+      Direction           dir, wrap_dir;
+      bool                inbounds = true;
+      for (int i = 0; i < dim; ++i)
+      {
+        nhbr_coords[i] = coords[i] + offsets[i];
+
+        // wrap around the lower end of dimension i, or mark the neighbor as lying outside the domain
+        if (nhbr_coords[i] < 0)
+        {
+          if (wrap[i])
+          {
+            nhbr_coords[i] = divisions[i] - 1;
+            wrap_dir[i] = -1;
+          }
+          else
+            inbounds = false;
+        }
+        // same treatment at the upper end of dimension i
+        if (nhbr_coords[i] >= divisions[i])
+        {
+          if (wrap[i])
+          {
+            nhbr_coords[i] = 0;
+            wrap_dir[i] = 1;
+          }
+          else
+            inbounds = false;
+        }
+
+        // NB: this needs to match the addressing scheme in dir_t (in constants.h)
+        if (offsets[i] == -1 || offsets[i] == 1)
+          dir[i] = offsets[i];
+      }
+      if (!inbounds) continue;
+
+      int nhbr_gid = coords_to_gid(nhbr_coords);
+      BlockID bid; bid.gid = nhbr_gid; bid.proc = assigner.rank(nhbr_gid);
+      link.add_neighbor(bid);
+
+      Bounds nhbr_bounds;
+      fill_bounds(nhbr_bounds, nhbr_coords);      // neighbor extent without ghosts
+      link.add_bounds(nhbr_bounds);
+
+      link.add_direction(dir);
+      link.add_wrap(wrap_dir);
+    }
+
+    create(gid, core, bounds, domain, link);
+  }
+}
+
+// Decomposes the domain without adding blocks to master (assumes they were added already); replaces each local block's link, then calls update.
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+decompose(int rank, const Assigner& assigner, Master& master, const Updater& update)
+{
+    decompose(rank, assigner, [&master,&update](int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link)
+    {
+        int lid = master.lid(gid);              // local id under which master knows this gid
+        Link* l = new Link(link);               // heap copy handed to master (presumably master takes ownership -- confirm)
+        master.replace_link(lid, l);
+        update(gid, lid, core, bounds, domain, *l);
+    });
+}
+
+template<class Bounds>
+bool
+diy::RegularDecomposer<Bounds>::
+all(const std::vector<int>& v, int x)
+{
+  // True when no element of v differs from x;
+  // an empty vector therefore yields true.
+  auto differs = [x](int value) { return value != x; };
+  return std::find_if(v.begin(), v.end(), differs) == v.end();
+}
+// Writes the block coordinates of gid into coords (one per entry of divisions, dimension 0 varying fastest).
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions)
+{
+  coords.clear();                               // coords is an output: drop stale entries instead of appending to them
+  for (size_t i = 0; i < divisions.size(); ++i)
+  {
+    coords.push_back(gid % divisions[i]);       // position along dimension i
+    gid /= divisions[i];                        // remaining higher-dimension part
+  }
+}
+// Inverse of gid_to_coords: folds block coordinates into a gid, dimension 0 varying fastest.
+template<class Bounds>
+int
+diy::RegularDecomposer<Bounds>::
+coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions)
+{
+  // accumulate from the highest dimension down to dimension 0
+  int result = 0;
+  for (size_t axis = coords.size(); axis-- > 0; )
+  {
+    result = result * divisions[axis] + coords[axis];
+  }
+  return result;
+}
+
+//! \ingroup Decomposition
+//! Gets the bounds, with or without ghosts, for a block specified by its block coordinates
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+fill_bounds(Bounds& bounds,                  //!< (output) bounds
+            const DivisionsVector& coords,   //!< coordinates of the block in the decomposition
+            bool add_ghosts)                 //!< whether to include ghosts in the output bounds
+    const
+{
+  for (int i = 0; i < dim; ++i)             // extent of the block (no ghosts) along each used dimension
+  {
+    bounds.min[i] = detail::BoundsHelper<Bounds>::from(coords[i], divisions[i], domain.min[i], domain.max[i], share_face[i]);
+    bounds.max[i] = detail::BoundsHelper<Bounds>::to  (coords[i], divisions[i], domain.min[i], domain.max[i], share_face[i]);
+  }
+
+  for (int i = dim; i < DIY_MAX_DIM; ++i)   // set the unused dimension to 0
+  {
+    bounds.min[i] = 0;
+    bounds.max[i] = 0;
+  }
+
+  if (!add_ghosts)
+    return;
+  // grow the block by the ghost width in every used dimension
+  for (int i = 0; i < dim; ++i)
+  {
+    if (wrap[i])
+    {
+      bounds.min[i] -= ghosts[i];           // wrapped dimensions may extend past the domain
+      bounds.max[i] += ghosts[i];
+    } else
+    {
+      bounds.min[i] = std::max(domain.min[i], bounds.min[i] - ghosts[i]);   // otherwise clamp to the domain
+      bounds.max[i] = std::min(domain.max[i], bounds.max[i] + ghosts[i]);
+    }
+  }
+}
+
+//! \ingroup Decomposition
+//! Gets the bounds, with or without ghosts, for a block specified by its gid
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+fill_bounds(Bounds& bounds,                  //!< (output) bounds
+            int gid,                         //!< global id of the block
+            bool add_ghosts)                 //!< whether to include ghosts in the output bounds
+    const
+{
+    // translate the gid into block coordinates,
+    // then defer to the coordinate-based overload
+    DivisionsVector block_coords;
+    gid_to_coords(gid, block_coords);
+
+    fill_bounds(bounds, block_coords, add_ghosts);
+}
+
+namespace diy { namespace detail {
+// Per-dimension bookkeeping for fill_divisions: how far one axis has been split so far.
+template<class Coordinate>
+struct Div
+{
+    int dim;                                 // axis index (0 for x, 1 for y, 2 for z, ...)
+    int nb;                                  // blocks assigned to this axis so far
+    Coordinate b_size;                       // current block size along this axis
+
+    // Ordering used when sorting in fill_divisions: larger block size first,
+    // then fewer blocks in the dimension first,
+    // then lower axis index first.
+    bool operator<(Div rhs) const
+    {
+        // primary key: block size, descending
+        if (b_size != rhs.b_size)
+            return b_size > rhs.b_size;
+        // secondary key: block count, ascending
+        if (nb != rhs.nb)
+            return nb < rhs.nb;
+        // final tie-break: axis index, ascending
+        return dim < rhs.dim;
+    }
+};
+} }
+// Fills the zero entries among the first dim values of divisions from the prime factors of nblocks/prod; throws std::runtime_error on failure.
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+fill_divisions(std::vector<int>& divisions) const
+{
+    // prod = product of the divisions fixed by the user; c = number of dimensions the user fixed
+    int prod = 1; int c = 0;
+    for (int i = 0; i < dim; ++i)
+        if (divisions[i] != 0)
+        {
+            prod *= divisions[i];
+            ++c;
+        }
+
+    if (nblocks % prod != 0)
+        throw std::runtime_error("Total number of blocks cannot be factored into provided divs");
+
+    if (c == (int) divisions.size())               // nothing to do; user provided all divs (NOTE(review): compares with size(), not dim)
+        return;
+
+    // factor number of blocks left in unconstrained dimensions
+    // factorization is sorted from smallest to largest factors
+    std::vector<unsigned> factors;
+    factor(factors, nblocks/prod);
+
+    using detail::Div;
+    std::vector< Div<Coordinate> > missing_divs;              // one Div (dim, #blocks, block size) per dimension left open
+
+    // init missing_divs
+    for (int i = 0; i < dim; i++)
+    {
+        if (divisions[i] == 0)
+        {
+            Div<Coordinate> div;
+            div.dim = i;
+            div.nb = 1;
+            div.b_size = domain.max[i] - domain.min[i];       // a single block spans the whole dimension
+            missing_divs.push_back(div);
+        }
+    }
+
+    // iterate over factorization of number of blocks (factors are sorted smallest to largest)
+    // NB: using int instead of size_t because must be negative in order to break out of loop
+    for (int i = factors.size() - 1; i >= 0; --i)
+    {
+        // fill in missing divs by dividing dimension w/ largest block size
+        // except when this would be illegal (resulting in bounds.max < bounds.min;
+        // only a problem for discrete bounds
+
+        // sort on decreasing block size
+        std::sort(missing_divs.begin(), missing_divs.end());
+
+        // split the dimension with the largest block size (first element in vector)
+        Coordinate min =                    // NOTE(review): missing_divs[0] assumes at least one open dimension -- confirm
+            detail::BoundsHelper<Bounds>::from(0,
+                                               missing_divs[0].nb * factors[i],
+                                               domain.min[missing_divs[0].dim],
+                                               domain.max[missing_divs[0].dim],
+                                               share_face[missing_divs[0].dim]);
+        Coordinate max =
+            detail::BoundsHelper<Bounds>::to(0,
+                                             missing_divs[0].nb * factors[i],
+                                             domain.min[missing_divs[0].dim],
+                                             domain.max[missing_divs[0].dim],
+                                             share_face[missing_divs[0].dim]);
+        if (max >= min)                     // block 0 of the finer split is still non-empty: accept the split
+        {
+            missing_divs[0].nb    *= factors[i];
+            missing_divs[0].b_size = max - min;
+        }
+        else
+        {
+            std::ostringstream oss;
+            oss << "Unable to decompose domain into " << nblocks << " blocks: " << min << " " << max;
+            throw std::runtime_error(oss.str());
+        }
+    }
+
+    // assign the divisions
+    for (size_t i = 0; i < missing_divs.size(); i++)
+        divisions[missing_divs[i].dim] = missing_divs[i].nb;
+}
+// Appends the prime factors of n to factors, smallest first; n <= 1 appends nothing.
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+factor(std::vector<unsigned>& factors, int n)
+{
+  while (n > 1)     // was `n != 1`, which never terminated for n <= 0 (e.g. nblocks == 0)
+    for (int i = 2; i <= n; ++i)
+    {
+      if (n % i == 0)           // smallest divisor >= 2 of the remaining n, hence prime
+      {
+        factors.push_back(i);
+        n /= i;
+        break;                  // restart the search from 2 on the reduced n
+      }
+    }
+}
+// Point to GIDs: appends to gids one gid per combination of the per-axis block index ranges from top_bottom;
+// appends nothing when any axis has an empty range.
+// TODO: deal with wrap correctly
+// TODO: add an optional ghosts argument to ignore ghosts (if we want to find the true owners, or something like that)
+template<class Bounds>
+template<class Point>
+void
+diy::RegularDecomposer<Bounds>::
+point_to_gids(std::vector<int>& gids, const Point& p) const
+{
+    std::vector< std::pair<int, int> > ranges(dim);     // per axis: [first, second) = [bottom, top)
+    for (int i = 0; i < dim; ++i)
+    {
+        top_bottom(ranges[i].second, ranges[i].first, p, i);
+        if (ranges[i].second <= ranges[i].first) return;    // empty range: the carry logic below would never terminate
+    }
+
+    // look up gids for all combinations
+    DivisionsVector coords(dim), location(dim);
+    while(location.back() < ranges.back().second - ranges.back().first)
+    {
+        for (int i = 0; i < dim; ++i)
+            coords[i] = ranges[i].first + location[i];
+        gids.push_back(coords_to_gid(coords, divisions));
+        // advance location like an odometer, carrying into the next axis when one is exhausted
+        location[0]++;
+        for (int i = 0; i < dim-1 && location[i] == ranges[i].second - ranges[i].first; ++i)
+        {
+            location[i] = 0;
+            location[i+1]++;
+        }
+    }
+}
+//! Returns the gid of a block that contains p, ignoring ghosts; negative per-axis block indices are clamped to 0.
+template<class Bounds>
+template<class Point>
+int
+diy::RegularDecomposer<Bounds>::
+point_to_gid(const Point& p) const
+{
+    typedef detail::BoundsHelper<Bounds> Helper;
+
+    int result = 0;
+    for (int axis = dim - 1; axis >= 0; --axis)
+    {
+      const int block = Helper::lower(p[axis], divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
+
+      // same accumulation as coords_to_gid (the two must stay coupled)
+      result = result * divisions[axis] + std::max(0, block);
+    }
+
+    return result;
+}
+//! Product over all axes of (top - bottom) as reported by top_bottom for p.
+template<class Bounds>
+template<class Point>
+int
+diy::RegularDecomposer<Bounds>::
+num_gids(const Point& p) const
+{
+    int count = 1;
+    for (int axis = 0; axis < dim; ++axis)
+    {
+        int hi, lo;
+        top_bottom(hi, lo, p, axis);
+        count *= (hi - lo);         // size of the block index range [lo, hi) on this axis
+    }
+    return count;
+}
+//! Computes the block index range [bottom, top) along axis for p widened by the ghost width on both sides.
+template<class Bounds>
+template<class Point>
+void
+diy::RegularDecomposer<Bounds>::
+top_bottom(int& top, int& bottom, const Point& p, int axis) const
+{
+    typedef detail::BoundsHelper<Bounds> Helper;
+    const Coordinate lo = p[axis] - ghosts[axis];
+    const Coordinate hi = p[axis] + ghosts[axis];
+
+    top     = Helper::upper(hi, divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
+    bottom  = Helper::lower(lo, divisions[axis], domain.min[axis], domain.max[axis], share_face[axis]);
+
+    // a wrapped axis keeps the raw indices; otherwise clamp them to [0, divisions[axis]]
+    if (wrap[axis]) return;
+    bottom  = std::max(0, bottom);
+    top     = std::min(divisions[axis], top);
+}
+
+// find lowest gid that owns a particular point; returns -1 when point_to_gids finds no block for it
+template<class Bounds>
+template<class Point>
+int
+diy::RegularDecomposer<Bounds>::
+lowest_gid(const Point& p) const
+{
+    // TODO: optimize - no need to compute all gids
+    std::vector<int> gids;
+    point_to_gids(gids, p);
+    if (gids.empty()) return -1;                            // gids[0] on an empty vector was undefined behaviour
+    return *std::min_element(gids.begin(), gids.end());     // the minimum is enough; no need to sort everything
+}
+
+#endif
diff --git a/include/vtkmdiy/detail/algorithms/kdtree-sampling.hpp b/include/vtkmdiy/detail/algorithms/kdtree-sampling.hpp
new file mode 100644
index 000000000..7cf2ee1e5
--- /dev/null
+++ b/include/vtkmdiy/detail/algorithms/kdtree-sampling.hpp
@@ -0,0 +1,450 @@
+#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
+#define DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
+
+#include <vector>
+#include <cassert>
+#include "../../partners/all-reduce.hpp"
+#include "../../log.hpp"
+
+// TODO: technically, what's done now is not a perfect subsample:
+//       we take the same number of samples from every block, in reality this number should be selected at random,
+//       so that the total number of samples adds up to samples*nblocks
+//
+// NB: random samples are chosen using rand(), which is assumed to be seeded
+//     externally. Once we switch to C++11, we should use its more advanced
+//     random number generators (and take a generator as an external parameter)
+//     (TODO)
+
+namespace diy
+{
+namespace detail
+{
+
+// Per-round reduction functor that partitions blocks k-d-tree style, splitting at the median of sampled coordinates.
+template<class Block, class Point>
+struct KDTreeSamplingPartition
+{
+    using RCLink  = diy::RegularContinuousLink;
+    using Bounds  = diy::ContinuousBounds;
+    using Samples = std::vector<float>;
+    // dim: number of dimensions; points: Block member holding its points; samples: cap on samples drawn per block
+    KDTreeSamplingPartition(int dim, std::vector<Point> Block::* points, size_t samples):
+        dim_(dim), points_(points), samples_(samples) {}
+
+    void            operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
+    // link maintenance
+    int             divide_gid(int gid, bool lower, int round, int rounds) const;
+    void            update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
+    void            split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
+    diy::Direction  find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
+
+    // sample gathering and distribution
+    void            compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const;
+    void            add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const;
+    void            receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const;
+    void            forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const;
+
+    // point exchange across the split
+    void            enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const;
+    void            dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const;
+
+    // bounds helpers
+    void            update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const;
+    bool            intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const;
+    float           find_split(const Bounds& changed, const Bounds& original) const;
+
+    int                             dim_;
+    std::vector<Point>  Block::*    points_;
+    size_t                          samples_;
+};
+
+}
+}
+
+
+// Executes one reduction round for block b; `partners` tells whether it is the final link
+// update, a link round, a swap round, the first sampling sub-round, a gather or the median step.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::
+operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
+{
+    int dim;
+    if (srp.round() < partners.rounds())
+        dim = partners.dim(srp.round());
+    else
+        dim = partners.dim(srp.round() - 1);
+
+    if (srp.round() == partners.rounds())
+        update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
+    else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0)       // link round
+    {
+        dequeue_exchange(b, srp, dim);         // from the swap round
+        split_to_neighbors(b, srp, dim);
+    }
+    else if (partners.swap_round(srp.round()))
+    {
+        Samples samples;
+        receive_samples(b, srp, samples);
+        enqueue_exchange(b, srp, dim, samples);
+    } else if (partners.sub_round(srp.round()) == 0)
+    {
+        if (srp.round() > 0)
+        {
+            int prev_dim = dim - 1;
+            if (prev_dim < 0)
+                prev_dim += dim_;
+            update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
+        }
+        compute_local_samples(b, srp, dim);
+    } else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2)     // we are reusing partners class, so really we are talking about the samples rounds here
+    {
+        Samples samples;
+        add_samples(b, srp, samples);
+        srp.enqueue(srp.out_link().target(0), samples);
+    } else
+    {
+        Samples samples;
+        add_samples(b, srp, samples);
+        if (samples.size() > 1)
+        {
+            // pick the median
+            std::nth_element(samples.begin(), samples.begin() + samples.size()/2, samples.end());
+            std::swap(samples[0], samples[samples.size()/2]);
+        }
+        // Always forward exactly one value (enqueue_exchange reads samples[0]). With no samples
+        // at all the old code indexed an empty vector (undefined behavior); resize() now yields 0.
+        samples.resize(1);
+        forward_samples(b, srp, samples);
+    }
+}
+
+// Returns gid with bit (rounds - 1 - round) cleared when `lower` is true and
+// set otherwise; every other bit is left as is.
+template<class Block, class Point>
+int diy::detail::KDTreeSamplingPartition<Block,Point>::
+divide_gid(int gid, bool lower, int round, int rounds) const
+{
+    const int bit = 1 << (rounds - 1 - round);
+    if (lower)
+        return gid & ~bit;
+    return gid | bit;
+}
+
+// Rebuilds this block's link after a split along `dim`; round here is the outer iteration of the algorithm (out of `rounds`)
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
+{
+    auto        log  = get_logger();
+    int         gid  = srp.gid();
+    int         lid  = srp.master()->lid(gid);
+    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
+
+    // (gid, dir) -> i: lets an incoming (split, direction) message be matched to its link entry
+    std::map<std::pair<int,diy::Direction>, int> link_map;
+    for (int i = 0; i < link->size(); ++i)
+        link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i;
+
+    // NB: srp.enqueue(..., ...) should match the link
+    std::vector<float>  splits(link->size());      // splits[i]: split value reported for link entry i
+    for (int i = 0; i < link->size(); ++i)
+    {
+        float split; diy::Direction dir;
+
+        int in_gid = link->target(i).gid;
+        while(srp.incoming(in_gid))
+        {
+            srp.dequeue(in_gid, split);
+            srp.dequeue(in_gid, dir);
+
+            // reverse dir: the sender enqueued its own link direction towards us (see split_to_neighbors)
+            for (int j = 0; j < dim_; ++j)
+                dir[j] = -dir[j];
+
+            int k = link_map[std::make_pair(in_gid, dir)];     // NOTE(review): operator[] yields 0 for an unknown key -- confirm a match always exists
+            log->trace("{} {} {} -> {}", in_gid, dir, split, k);
+            splits[k] = split;
+        }
+    }
+
+    RCLink      new_link(dim_, link->core(), link->core());     // the current core is passed for both bounds arguments
+
+    bool lower = !(gid & (1 << (rounds - 1 - round)));           // true when bit (rounds - 1 - round) of gid is clear
+
+    // fill out the new link
+    for (int i = 0; i < link->size(); ++i)
+    {
+        diy::Direction  dir = link->direction(i);
+        //diy::Direction  wrap_dir = link->wrap(i);     // we don't use existing wrap, but restore it from scratch
+        if (dir[dim] != 0)                                  // old neighbor's direction has a component along the split dimension
+        {
+            if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))   // keep only the side facing away from the dual block
+            {
+                int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);      // the neighbor's half opposite to ours
+                diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
+                new_link.add_neighbor(nbr);
+
+                new_link.add_direction(dir);
+
+                Bounds bounds = link->bounds(i);
+                update_neighbor_bounds(bounds, splits[i], dim, !lower);
+                new_link.add_bounds(bounds);
+
+                if (wrap)
+                    new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                else
+                    new_link.add_wrap(diy::Direction());
+            }
+        } else // non-aligned side
+        {
+            for (int j = 0; j < 2; ++j)                     // j == 0: the neighbor's lower half, j == 1: its upper half
+            {
+                int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds);
+
+                Bounds  bounds  = link->bounds(i);
+                update_neighbor_bounds(bounds, splits[i], dim, j == 0);
+
+                if (intersects(bounds, new_link.bounds(), dim, wrap, domain))     // keep a half only if it still overlaps our bounds along dim
+                {
+                    diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
+                    new_link.add_neighbor(nbr);
+                    new_link.add_direction(dir);
+                    new_link.add_bounds(bounds);
+
+                    if (wrap)
+                        new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                    else
+                        new_link.add_wrap(diy::Direction());
+                }
+            }
+        }
+    }
+
+    // add link to the dual block (our gid with this round's bit flipped)
+    int dual_gid = divide_gid(gid, !lower, round, rounds);
+    diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) };
+    new_link.add_neighbor(dual);
+
+    Bounds nbr_bounds = link->bounds();     // old block bounds
+    update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
+    new_link.add_bounds(nbr_bounds);
+
+    new_link.add_wrap(diy::Direction());    // dual block cannot be wrapped
+
+    if (lower)                              // the dual of a lower block lies in the +dim direction, and vice versa
+    {
+        diy::Direction right;
+        right[dim] = 1;
+        new_link.add_direction(right);
+    } else
+    {
+        diy::Direction left;
+        left[dim] = -1;
+        new_link.add_direction(left);
+    }
+
+    // update the link; notice that this won't conflict with anything since
+    // reduce is using its own notion of the link constructed through the
+    // partners
+    link->swap(new_link);
+}
+
+// Sends every link neighbor this block's split value (the first coordinate at
+// which the link's core and bounds differ, per find_split) followed by the
+// link direction of that neighbor.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::
+split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    RCLink* nbrs = static_cast<RCLink*>(srp.master()->link(srp.master()->lid(srp.gid())));
+    float   cut  = find_split(nbrs->core(), nbrs->bounds());
+
+    // order matters: update_links dequeues the split first, then the direction
+    for (int n = 0; n < nbrs->size(); ++n)
+    {
+        srp.enqueue(nbrs->target(n), cut);
+        srp.enqueue(nbrs->target(n), nbrs->direction(n));
+    }
+}
+
+// Draws min(#points, samples_) coordinates along `dim` from local points
+// chosen with rand() (repeats possible) and enqueues them to the first
+// outgoing partner.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::
+compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    std::vector<Point>& pts   = b->*points_;
+    const size_t        total = pts.size();
+    const size_t        count = std::min(total, samples_);
+
+    Samples picked;
+    picked.reserve(count);
+    for (size_t drawn = 0; drawn < count; ++drawn)
+        picked.push_back(pts[rand() % total][dim]);     // stored as float, as before
+
+    srp.enqueue(srp.out_link().target(0), picked);
+}
+
+// Dequeues one sample vector from each incoming partner, in in_link order,
+// and appends its contents to `samples`.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::
+add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
+{
+    for (int src = 0; src < srp.in_link().size(); ++src)
+    {
+        const int sender = srp.in_link().target(src).gid;
+
+        Samples received;
+        srp.dequeue(sender, received);
+        // append everything that partner sent, preserving its order
+        samples.insert(samples.end(), received.begin(), received.end());
+    }
+}
+
+// Reads the sample vector sent by the first incoming partner into `samples`.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
+{
+    const int sender = srp.in_link().target(0).gid;
+    srp.dequeue(sender, samples);
+}
+
+// Enqueues `samples` unchanged to every outgoing partner, in out_link order.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const
+{
+    int dest = 0;
+    while (dest < srp.out_link().size())
+        srp.enqueue(srp.out_link().target(dest++), samples);
+}
+
+// Splits this block's points at samples[0] along `dim` (below the split goes
+// to outgoing partner 0, the rest to partner 1), keeps the share addressed to
+// this block, enqueues the other, and cuts the link's core at the split.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::
+enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const
+{
+    RCLink* link = static_cast<RCLink*>(srp.master()->link(srp.master()->lid(srp.gid())));
+    const int k = srp.out_link().size();
+    if (k == 0)        // final round; nothing needs to be sent; this is actually redundant
+        return;
+    const float split = samples[0];
+
+    // bucket the points by their side of the split
+    std::vector<Point>&               points = b->*points_;
+    std::vector< std::vector<Point> > out_points(srp.out_link().size());
+    for (size_t i = 0; i < points.size(); ++i)
+    {
+        float x = points[i][dim];
+        out_points[x < split ? 0 : 1].push_back(points[i]);
+    }
+
+    // keep the bucket addressed to ourselves, ship the rest
+    int self = -1;
+    for (int i = 0; i < k; ++i)
+    {
+        if (srp.out_link().target(i).gid != srp.gid())
+        {
+            srp.enqueue(srp.out_link().target(i), out_points[i]);
+            continue;
+        }
+        points.swap(out_points[i]);
+        self = i;
+    }
+
+    if (self == 0)      // we hold the lower share, so the split becomes our upper bound
+        link->core().max[dim] = split;
+    else
+        link->core().min[dim] = split;
+}
+
+// Appends to this block's points those received from every incoming partner
+// other than the block itself. Throws std::runtime_error as soon as a received
+// point lies outside the link's core along `dim`.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::
+dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    RCLink* link = static_cast<RCLink*>(srp.master()->link(srp.master()->lid(srp.gid())));
+    for (int src = 0; src < srp.in_link().size(); ++src)
+    {
+        const int sender = srp.in_link().target(src).gid;
+        if (sender == srp.gid())
+            continue;
+
+        std::vector<Point> received;
+        srp.dequeue(sender, received);
+        for (size_t p = 0; p < received.size(); ++p)
+        {
+            if (received[p][dim] < link->core().min[dim] || received[p][dim] > link->core().max[dim])
+                throw std::runtime_error(fmt::format("Dequeued {} outside [{},{}] ({})",
+                                                     received[p][dim], link->core().min[dim], link->core().max[dim], dim));
+            (b->*points_).push_back(received[p]);
+        }
+    }
+}
+
+// Cuts `bounds` at `split` along `dim`: with `lower` set, `split` becomes the
+// maximum; otherwise it becomes the minimum.
+template<class Block, class Point>
+void diy::detail::KDTreeSamplingPartition<Block,Point>::update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const
+{
+    if (!lower)
+        bounds.min[dim] = split;
+    else
+        bounds.max[dim] = split;
+}
+
+// Reports whether x and y overlap along `dim` (closed intervals). With `wrap`
+// set, two intervals touching opposite faces of `domain` also count.
+template<class Block, class Point>
+bool diy::detail::KDTreeSamplingPartition<Block,Point>::intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const
+{
+    if (wrap)
+    {
+        const bool x_low_y_high = x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim];
+        const bool y_low_x_high = y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim];
+        if (x_low_y_high || y_low_x_high)
+            return true;
+    }
+    return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim];
+}
+
+// Returns the first coordinate (scanning dimensions in order, min before max)
+// at which `changed` differs from `original`. If they agree in all dim_
+// dimensions it asserts; with assertions compiled out it returns -1.
+template<class Block, class Point>
+float diy::detail::KDTreeSamplingPartition<Block,Point>::
+find_split(const Bounds& changed, const Bounds& original) const
+{
+    for (int axis = 0; axis < dim_; ++axis)
+    {
+        if (changed.min[axis] != original.min[axis]) return changed.min[axis];
+        if (changed.max[axis] != original.max[axis]) return changed.max[axis];
+    }
+    assert(0);      // reached only when no coordinate differs
+    return -1;
+}
+
+// Builds the wrap direction towards a neighbor: -1 on an axis where `bounds` touches the domain's low face and
+// the neighbor its high face, +1 for the opposite case (which wins if both hold); other axes keep their default.
+template<class Block, class Point>
+diy::Direction diy::detail::KDTreeSamplingPartition<Block,Point>::find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
+{
+    diy::Direction result;
+    for (int axis = 0; axis < dim_; ++axis)
+    {
+        const bool low_to_high = bounds.min[axis] == domain.min[axis] && nbr_bounds.max[axis] == domain.max[axis];
+        const bool high_to_low = bounds.max[axis] == domain.max[axis] && nbr_bounds.min[axis] == domain.min[axis];
+        if (low_to_high) result[axis] = -1;
+        if (high_to_low) result[axis] =  1;
+    }
+    return result;
+}
+
+
+#endif
diff --git a/include/vtkmdiy/detail/algorithms/kdtree.hpp b/include/vtkmdiy/detail/algorithms/kdtree.hpp
new file mode 100644
index 000000000..286929dc9
--- /dev/null
+++ b/include/vtkmdiy/detail/algorithms/kdtree.hpp
@@ -0,0 +1,569 @@
+#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_HPP
+#define DIY_DETAIL_ALGORITHMS_KDTREE_HPP
+
+#include <vector>
+#include <cassert>
+#include "../../partners/all-reduce.hpp"
+#include "../../log.hpp"
+
+namespace diy
+{
+namespace detail
+{
+
+struct KDTreePartners;
+
+// Per-round reduction functor that partitions blocks k-d-tree style, choosing each split from a histogram of point coordinates.
+template<class Block, class Point>
+struct KDTreePartition
+{
+    using RCLink    = diy::RegularContinuousLink;
+    using Bounds    = diy::ContinuousBounds;
+    using Histogram = std::vector<size_t>;
+    // dim: number of dimensions; points: Block member holding its points; bins: number of histogram bins
+    KDTreePartition(int dim, std::vector<Point> Block::* points, size_t bins):
+        dim_(dim), points_(points), bins_(bins) {}
+
+    void            operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
+    // link maintenance
+    int             divide_gid(int gid, bool lower, int round, int rounds) const;
+    void            update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
+    void            split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
+    diy::Direction  find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
+
+    // histogram computation, accumulation and distribution
+    void            compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const;
+    void            add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const;
+    void            receive_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const;
+    void            forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const;
+
+    // point exchange across the split
+    void            enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram& histogram) const;
+    void            dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const;
+
+    // bounds helpers
+    void            update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const;
+    bool            intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const;
+    float           find_split(const Bounds& changed, const Bounds& original) const;
+
+    int                             dim_;
+    std::vector<Point>  Block::*    points_;
+    size_t                          bins_;
+};
+
+}
+}
+
+struct diy::detail::KDTreePartners      // schedule of histogram, swap and link rounds used by the k-d tree partition functors
+{
+  // bool = are we in a swap (vs histogram) round
+  // int  = round within that partner; (true, -1) marks a link round
+  typedef           std::pair<bool, int>                    RoundType;
+  typedef           diy::ContinuousBounds                   Bounds;
+
+                    KDTreePartners(int dim, int nblocks, bool wrap_, const Bounds& domain_):
+                        decomposer(1, interval(0,nblocks-1), nblocks),
+                        histogram(decomposer, 2),
+                        swap(decomposer, 2, false),
+                        wrap(wrap_),
+                        domain(domain_)
+  {
+    for (unsigned i = 0; i < swap.rounds(); ++i)          // one outer iteration per swap round
+    {
+      // fill histogram rounds
+      for (unsigned j = 0; j < histogram.rounds(); ++j)
+      {
+        rounds_.push_back(std::make_pair(false, j));
+        dim_.push_back(i % dim);                            // the split dimension cycles through 0 .. dim-1
+        if (j == histogram.rounds() / 2 - 1 - i)            // NOTE(review): presumably skips sub-rounds reaching beyond the current group of blocks -- confirm
+            j += 2*i;
+      }
+
+      // fill swap round
+      rounds_.push_back(std::make_pair(true, i));
+      dim_.push_back(i % dim);
+
+      // fill link round
+      rounds_.push_back(std::make_pair(true, -1));          // (true, -1) signals link round
+      dim_.push_back(i % dim);
+    }
+  }
+
+  size_t        rounds() const                              { return rounds_.size(); }        // total number of scheduled rounds
+  size_t        swap_rounds() const                         { return swap.rounds(); }
+
+  int           dim(int round) const                        { return dim_[round]; }           // split dimension recorded for `round`
+  bool          swap_round(int round) const                 { return rounds_[round].first; }
+  int           sub_round(int round) const                  { return rounds_[round].second; }
+  // every block is active in the round past the schedule and in link rounds; otherwise the swap/histogram partners decide
+  inline bool   active(int round, int gid, const diy::Master& m) const
+  {
+    if (round == (int) rounds())
+        return true;
+    else if (swap_round(round) && sub_round(round) < 0)     // link round
+        return true;
+    else if (swap_round(round))
+        return swap.active(sub_round(round), gid, m);
+    else
+        return histogram.active(sub_round(round), gid, m);
+  }
+  // appends to `partners` the gids block `gid` receives from in `round`
+  inline void   incoming(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
+  {
+    if (round == (int) rounds())
+        link_neighbors(-1, gid, partners, m);
+    else if (swap_round(round) && sub_round(round) < 0)       // link round
+        swap.incoming(sub_round(round - 1) + 1, gid, partners, m);
+    else if (swap_round(round))
+        histogram.incoming(histogram.rounds(), gid, partners, m);
+    else
+    {
+        if (round > 0 && sub_round(round) == 0)
+            link_neighbors(-1, gid, partners, m);
+        else if (round > 0 && sub_round(round - 1) != sub_round(round) - 1)        // jump through the histogram rounds
+            histogram.incoming(sub_round(round - 1) + 1, gid, partners, m);
+        else
+            histogram.incoming(sub_round(round), gid, partners, m);
+    }
+  }
+  // appends to `partners` the gids block `gid` sends to in `round`
+  inline void   outgoing(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
+  {
+    if (round == (int) rounds())
+        swap.outgoing(sub_round(round-1) + 1, gid, partners, m);
+    else if (swap_round(round) && sub_round(round) < 0)       // link round
+        link_neighbors(-1, gid, partners, m);
+    else if (swap_round(round))
+        swap.outgoing(sub_round(round), gid, partners, m);
+    else
+        histogram.outgoing(sub_round(round), gid, partners, m);
+  }
+  // appends the distinct gids found in block `gid`'s current link, in ascending order (first argument is unused)
+  inline void   link_neighbors(int, int gid, std::vector<int>& partners, const diy::Master& m) const
+  {
+    int         lid  = m.lid(gid);
+    diy::Link*  link = m.link(lid);
+
+    std::set<int> result;       // partners must be unique
+    for (int i = 0; i < link->size(); ++i)
+        result.insert(link->target(i).gid);
+
+    for (std::set<int>::const_iterator it = result.begin(); it != result.end(); ++it)
+        partners.push_back(*it);
+  }
+
+  // 1-D domain to feed into histogram and swap
+  diy::RegularDecomposer<diy::DiscreteBounds>   decomposer;
+
+  diy::RegularAllReducePartners     histogram;
+  diy::RegularSwapPartners          swap;
+
+  std::vector<RoundType>            rounds_;      // one (is-swap, sub-round) entry per scheduled round
+  std::vector<int>                  dim_;         // split dimension per scheduled round (parallel to rounds_)
+
+  bool                              wrap;
+  Bounds                            domain;
+};
+
+// Executes one reduction round for block b; `partners` tells whether it is the
+// final link update, a link round, a swap round or a histogram sub-round.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::
+operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
+{
+    const int dim = (srp.round() < partners.rounds()) ? partners.dim(srp.round())
+                                                      : partners.dim(srp.round() - 1);
+    if (srp.round() == partners.rounds())
+    {
+        update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
+        return;
+    }
+
+    const bool swapping = partners.swap_round(srp.round());
+    const int  sub      = partners.sub_round(srp.round());
+    if (swapping && sub < 0)                    // link round
+    {
+        dequeue_exchange(b, srp, dim);          // from the swap round
+        split_to_neighbors(b, srp, dim);
+    }
+    else if (swapping)
+    {
+        Histogram histogram;
+        receive_histogram(b, srp, histogram);
+        enqueue_exchange(b, srp, dim, histogram);
+    }
+    else if (sub == 0)
+    {
+        if (srp.round() > 0)
+        {
+            const int prev_dim = (dim - 1 < 0) ? dim - 1 + dim_ : dim - 1;
+            update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
+        }
+        compute_local_histogram(b, srp, dim);
+    }
+    else
+    {
+        // sum the incoming histograms, then send the total to one partner or to all of them
+        Histogram histogram(bins_);
+        add_histogram(b, srp, histogram);
+        if (sub < (int) partners.histogram.rounds()/2)
+            srp.enqueue(srp.out_link().target(0), histogram);
+        else
+            forward_histogram(b, srp, histogram);
+    }
+}
+
+// Returns gid with bit (rounds - 1 - round) cleared if `lower` is true and
+// set if it is false; the remaining bits are unchanged.
+template<class Block, class Point>
+int diy::detail::KDTreePartition<Block,Point>::
+divide_gid(int gid, bool lower, int round, int rounds) const
+{
+    const int mask = 1 << (rounds - 1 - round);
+    if (lower)
+        return gid & ~mask;
+    return gid | mask;
+}
+
+// Rebuilds this block's link after a split along `dim`; round here is the outer iteration of the algorithm (out of `rounds`)
+template<class Block, class Point>
+void
+diy::detail::KDTreePartition<Block,Point>::
+update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
+{
+    int         gid  = srp.gid();
+    int         lid  = srp.master()->lid(gid);
+    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
+
+    // (gid, dir) -> i: lets an incoming (split, direction) message be matched to its link entry
+    std::map<std::pair<int,diy::Direction>, int> link_map;
+    for (int i = 0; i < link->size(); ++i)
+        link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i;
+
+    // NB: srp.enqueue(..., ...) should match the link
+    std::vector<float>  splits(link->size());      // splits[i]: split value reported for link entry i
+    for (int i = 0; i < link->size(); ++i)
+    {
+        float split; diy::Direction dir;
+
+        int in_gid = link->target(i).gid;
+        while(srp.incoming(in_gid))
+        {
+            srp.dequeue(in_gid, split);
+            srp.dequeue(in_gid, dir);
+
+            // reverse dir: the sender enqueued its own link direction towards us (see split_to_neighbors)
+            for (int j = 0; j < dim_; ++j)
+                dir[j] = -dir[j];
+
+            int k = link_map[std::make_pair(in_gid, dir)];     // NOTE(review): operator[] yields 0 for an unknown key -- confirm a match always exists
+            splits[k] = split;
+        }
+    }
+
+    RCLink      new_link(dim_, link->core(), link->core());     // the current core is passed for both bounds arguments
+
+    bool lower = !(gid & (1 << (rounds - 1 - round)));           // true when bit (rounds - 1 - round) of gid is clear
+
+    // fill out the new link
+    for (int i = 0; i < link->size(); ++i)
+    {
+        diy::Direction  dir      = link->direction(i);
+        //diy::Direction  wrap_dir = link->wrap(i);     // we don't use existing wrap, but restore it from scratch
+        if (dir[dim] != 0)                                  // old neighbor's direction has a component along the split dimension
+        {
+            if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))   // keep only the side facing away from the dual block
+            {
+                int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);      // the neighbor's half opposite to ours
+                diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
+                new_link.add_neighbor(nbr);
+
+                new_link.add_direction(dir);
+
+                Bounds bounds = link->bounds(i);
+                update_neighbor_bounds(bounds, splits[i], dim, !lower);
+                new_link.add_bounds(bounds);
+
+                if (wrap)
+                    new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                else
+                    new_link.add_wrap(diy::Direction());
+            }
+        } else // non-aligned side
+        {
+            for (int j = 0; j < 2; ++j)                     // j == 0: the neighbor's lower half, j == 1: its upper half
+            {
+                int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds);
+
+                Bounds  bounds  = link->bounds(i);
+                update_neighbor_bounds(bounds, splits[i], dim, j == 0);
+
+                if (intersects(bounds, new_link.bounds(), dim, wrap, domain))     // keep a half only if it still overlaps our bounds along dim
+                {
+                    diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
+                    new_link.add_neighbor(nbr);
+                    new_link.add_direction(dir);
+                    new_link.add_bounds(bounds);
+
+                    if (wrap)
+                        new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                    else
+                        new_link.add_wrap(diy::Direction());
+                }
+            }
+        }
+    }
+
+    // add link to the dual block (our gid with this round's bit flipped)
+    int dual_gid = divide_gid(gid, !lower, round, rounds);
+    diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) };
+    new_link.add_neighbor(dual);
+
+    Bounds nbr_bounds = link->bounds();     // old block bounds
+    update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
+    new_link.add_bounds(nbr_bounds);
+
+    new_link.add_wrap(diy::Direction());    // dual block cannot be wrapped
+
+    if (lower)                              // the dual of a lower block lies in the +dim direction, and vice versa
+    {
+        diy::Direction right;
+        right[dim] = 1;
+        new_link.add_direction(right);
+    } else
+    {
+        diy::Direction left;
+        left[dim] = -1;
+        new_link.add_direction(left);
+    }
+
+    // update the link; notice that this won't conflict with anything since
+    // reduce is using its own notion of the link constructed through the
+    // partners
+    link->swap(new_link);
+}
+
+// Sends every link neighbor this block's split value (the first coordinate at
+// which the link's core and bounds differ, per find_split) followed by the
+// link direction of that neighbor.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::
+split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    RCLink* nbrs = static_cast<RCLink*>(srp.master()->link(srp.master()->lid(srp.gid())));
+    float   cut  = find_split(nbrs->core(), nbrs->bounds());
+
+    // order matters: update_links dequeues the split first, then the direction
+    for (int n = 0; n < nbrs->size(); ++n)
+    {
+        srp.enqueue(nbrs->target(n), cut);
+        srp.enqueue(nbrs->target(n), nbrs->direction(n));
+    }
+}
+
+// Counts this block's points along `dim` in bins_ equal-width bins spanning
+// the link's core and enqueues the counts to the first outgoing partner.
+// Throws std::runtime_error when a point's computed bin index is negative.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::
+compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    RCLink* link = static_cast<RCLink*>(srp.master()->link(srp.master()->lid(srp.gid())));
+
+    Histogram counts(bins_);
+    float     width = (link->core().max[dim] - link->core().min[dim])/bins_;
+    std::vector<Point>& pts = b->*points_;
+    for (size_t p = 0; p < pts.size(); ++p)
+    {
+        float x   = pts[p][dim];
+        int   bin = (x - link->core().min[dim]) / width;
+        if (bin < 0)
+            throw std::runtime_error(fmt::format("{} {} {}", bin, x, link->core().min[dim]));
+        if (bin >= (int) bins_)       // indices at or past bins_ are folded into the last bin
+            bin = bins_ - 1;
+        ++counts[bin];
+    }
+
+    srp.enqueue(srp.out_link().target(0), counts);
+}
+
+// Dequeues one histogram from each incoming partner and adds it, bin by bin,
+// into `histogram`, which must already hold at least as many bins.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::
+add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
+{
+    for (int src = 0; src < srp.in_link().size(); ++src)
+    {
+        const int sender = srp.in_link().target(src).gid;
+
+        Histogram received;
+        srp.dequeue(sender, received);
+        for (size_t bin = 0; bin < received.size(); ++bin)
+            histogram[bin] += received[bin];
+    }
+}
+
+// Reads the histogram sent by the first incoming partner into `histogram`.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::receive_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
+{
+    const int sender = srp.in_link().target(0).gid;
+    srp.dequeue(sender, histogram);
+}
+
+// Enqueues `histogram` unchanged to every outgoing partner, in out_link order.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const
+{
+    int dest = 0;
+    while (dest < srp.out_link().size())
+        srp.enqueue(srp.out_link().target(dest++), histogram);
+}
+
+// Picks the split as the lower edge of the first bin at which the running count exceeds half the histogram
+// total, then exchanges points across it: below the split goes to outgoing partner 0, the rest to partner 1.
+// The share addressed to this block stays here, and the link's core is cut at the split.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::
+enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram& histogram) const
+{
+    auto    log  = get_logger();
+    RCLink* link = static_cast<RCLink*>(srp.master()->link(srp.master()->lid(srp.gid())));
+    const int k = srp.out_link().size();
+    if (k == 0)        // final round; nothing needs to be sent; this is actually redundant
+        return;
+
+    // total count over all bins
+    size_t total = 0;
+    for (size_t bin = 0; bin < histogram.size(); ++bin)
+        total += histogram[bin];
+    log->trace("Histogram total: {}", total);
+
+    // walk the bins until the running count passes the halfway mark
+    float  width = (link->core().max[dim] - link->core().min[dim])/bins_;
+    float  split = 0;      // stays 0 if no bin passes the mark (e.g. an all-zero histogram)
+    size_t below = 0;
+    for (size_t bin = 0; bin < histogram.size(); ++bin)
+    {
+        if (below + histogram[bin] > total/2)
+        {
+            split = link->core().min[dim] + width*bin;
+            break;
+        }
+        below += histogram[bin];
+    }
+    log->trace("Found split: {} (dim={}) in {} - {}", split, dim, link->core().min[dim], link->core().max[dim]);
+
+    // bucket the points by their side of the split
+    std::vector<Point>&               points = b->*points_;
+    std::vector< std::vector<Point> > out_points(srp.out_link().size());
+    for (size_t i = 0; i < points.size(); ++i)
+    {
+        float x = points[i][dim];
+        out_points[x < split ? 0 : 1].push_back(points[i]);
+    }
+    // keep the bucket addressed to ourselves, ship the rest
+    int self = -1;
+    for (int i = 0; i < k; ++i)
+    {
+        if (srp.out_link().target(i).gid != srp.gid())
+        {
+            srp.enqueue(srp.out_link().target(i), out_points[i]);
+            continue;
+        }
+        points.swap(out_points[i]);
+        self = i;
+    }
+    if (self == 0)      // we hold the lower share, so the split becomes our upper bound
+        link->core().max[dim] = split;
+    else
+        link->core().min[dim] = split;
+}
+
+// Collect the points sent by the other members of the group, checking each against our core bounds along `dim`.
+template<class Block, class Point>
+void diy::detail::KDTreePartition<Block,Point>::
+dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    RCLink* link = static_cast<RCLink*>(srp.master()->link(srp.master()->lid(srp.gid())));
+    std::vector<Point>& local = b->*points_;
+
+    for (int nbr = 0; nbr < srp.in_link().size(); ++nbr)
+    {
+      const int sender = srp.in_link().target(nbr).gid;
+      if (sender != srp.gid())            // our own queue is not read
+      {
+        std::vector<Point>   received;
+        srp.dequeue(sender, received);
+        for (Point& p : received)
+        {
+          if (p[dim] < link->core().min[dim] || p[dim] > link->core().max[dim])
+              throw std::runtime_error(fmt::format("Dequeued {} outside [{},{}] ({})",
+                                       p[dim], link->core().min[dim], link->core().max[dim], dim));
+          local.push_back(p);
+        }
+      }
+    }
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreePartition<Block,Point>::
+update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const
+{
+    // a lower neighbor is capped at the split (its max moves);
+    // an upper neighbor starts at the split (its min moves)
+    auto& edge = lower ? bounds.max : bounds.min;
+    edge[dim] = split;
+}
+
+template<class Block, class Point>
+bool
+diy::detail::KDTreePartition<Block,Point>::
+intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const
+{
+    // with wrapping, one interval at the domain's low end and the other at its high end count as touching
+    const bool x_low_y_high = x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim];
+    const bool y_low_x_high = y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim];
+    if (wrap && (x_low_y_high || y_low_x_high))
+        return true;
+
+    // otherwise the closed intervals must overlap; sharing an endpoint is enough
+    return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim];
+}
+
+template<class Block, class Point>
+float
+diy::detail::KDTreePartition<Block,Point>::
+find_split(const Bounds& changed, const Bounds& original) const
+{
+    // report the first coordinate that moved, scanning dimensions in order; a moved min wins over a moved max
+    for (int d = 0; d < dim_; ++d)
+    {
+        const bool min_moved = changed.min[d] != original.min[d];
+        if (min_moved || changed.max[d] != original.max[d])
+            return min_moved ? changed.min[d] : changed.max[d];
+    }
+    assert(0);
+    return -1;
+}
+
+template<class Block, class Point>
+diy::Direction
+diy::detail::KDTreePartition<Block,Point>::
+find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
+{
+    diy::Direction dir;
+    for (int d = 0; d < dim_; ++d)
+    {
+        const bool at_low  = bounds.min[d] == domain.min[d] && nbr_bounds.max[d] == domain.max[d];
+        const bool at_high = bounds.max[d] == domain.max[d] && nbr_bounds.min[d] == domain.min[d];
+        if (at_high)      dir[d] =  1;     // tested first: when both conditions hold the result is 1
+        else if (at_low)  dir[d] = -1;
+    }
+    return dir;
+}
+
+
+#endif
diff --git a/include/vtkmdiy/detail/algorithms/sort.hpp b/include/vtkmdiy/detail/algorithms/sort.hpp
new file mode 100644
index 000000000..5cc3f8807
--- /dev/null
+++ b/include/vtkmdiy/detail/algorithms/sort.hpp
@@ -0,0 +1,162 @@
+#ifndef DIY_DETAIL_ALGORITHMS_SORT_HPP
+#define DIY_DETAIL_ALGORITHMS_SORT_HPP
+
+#include <functional>
+#include <algorithm>
+
+namespace diy
+{
+
+namespace detail
+{
+
+// Pieces of a distributed sample sort over the std::vector<T> members of Block selected by `values_` and `samples_`.
+template<class Block, class T, class Cmp>
+struct SampleSort
+{
+    typedef         std::vector<T>      Block::*ValuesVector;
+    struct Sampler;
+    struct Exchanger;
+                    SampleSort(ValuesVector values_, ValuesVector samples_, const Cmp& cmp_, size_t num_samples_):
+                        values(values_), samples(samples_),
+                        cmp(cmp_), num_samples(num_samples_)                    {}
+
+    Sampler         sample() const                                              { return Sampler(values, samples, cmp, num_samples); }
+    Exchanger       exchange() const                                            { return Exchanger(values, samples, cmp); }
+    // Appends to `v` everything queued for this block in `rp`; with skip_self, the queue from the block's own gid is ignored.
+    static void     dequeue_values(std::vector<T>& v, const ReduceProxy& rp, bool skip_self = true)
+    {
+        auto log = get_logger();
+
+        int k_in  = rp.in_link().size();
+
+        log->trace("dequeue_values(): gid={}, round={}; v.size()={}", rp.gid(), rp.round(), v.size());
+
+        if (detail::is_default< Serialization<T> >::value)
+        {
+            // first pass: count the incoming elements (each buffer is treated as a packed array of T)
+            size_t sz = 0;
+            size_t end = v.size();
+            for (int i = 0; i < k_in; ++i)
+            {
+                if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
+                MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
+                log->trace("    incoming size from {}: {}", rp.in_link().target(i).gid, in.size() / sizeof(T));
+                sz += in.size() / sizeof(T);
+            }
+            log->trace("    incoming size: {}", sz);
+            v.resize(end + sz);
+            // second pass: copy each buffer's elements in behind what `v` already held
+            for (int i = 0; i < k_in; ++i)
+            {
+                if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
+                MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
+                size_t n = in.size() / sizeof(T);
+                T* bg = n ? (T*) &in.buffer[0] : nullptr;      // never index into an empty buffer
+                std::copy(bg, bg + n, v.data() + end);          // data() + end stays valid even when end == v.size()
+                end += n;
+            }
+        } else      // Serialization<T> is not the default one: load the elements one at a time
+        {
+            for (int i = 0; i < k_in; ++i)
+            {
+                if (skip_self && rp.in_link().target(i).gid == rp.gid()) continue;
+                MemoryBuffer& in = rp.incoming(rp.in_link().target(i).gid);
+                while(in)
+                {
+                    T x;
+                    diy::load(in, x);
+                    v.emplace_back(std::move(x));
+                }
+            }
+        }
+        log->trace("    v.size()={}", v.size());
+    }
+
+    ValuesVector    values;
+    ValuesVector    samples;
+    Cmp             cmp;
+    size_t          num_samples;
+};
+
+// Reduction functor that gathers samples of the values and, in the last round, turns them into nblocks-1 dividers.
+template<class Block, class T, class Cmp>
+struct SampleSort<Block,T,Cmp>::Sampler
+{
+                    Sampler(ValuesVector values_, ValuesVector dividers_, const Cmp& cmp_, size_t num_samples_):
+                        values(values_), dividers(dividers_), cmp(cmp_), num_samples(num_samples_)    {}
+    // no incoming link: draw fresh samples; otherwise merge the received ones. No outgoing link: pick the dividers.
+    void            operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
+    {
+        int k_in  = srp.in_link().size();
+        int k_out = srp.out_link().size();
+
+        std::vector<T> samples;
+        if (k_in == 0)
+        {
+            // draw random samples (with replacement); an empty block contributes none instead of computing `% 0`
+            for (size_t i = 0; i < num_samples && !(b->*values).empty(); ++i)
+                samples.push_back((b->*values)[std::rand() % (b->*values).size()]);
+        } else
+            dequeue_values(samples, srp, false);
+
+        if (k_out == 0)
+        {
+            // pick subsamples that separate quantiles
+            std::sort(samples.begin(), samples.end(), cmp);
+            std::vector<T>  subsamples(srp.nblocks() - 1);
+            int step = samples.size() / srp.nblocks();       // NB: subsamples.size() + 1
+            for (size_t i = 0; i < subsamples.size() && !samples.empty(); ++i)   // no samples anywhere: keep default-constructed dividers
+                subsamples[i] = samples[(i+1)*step];
+            (b->*dividers).swap(subsamples);
+        }
+        else
+        {
+            for (int i = 0; i < k_out; ++i)
+            {
+                MemoryBuffer& out = srp.outgoing(srp.out_link().target(i));
+                save(out, samples.data(), samples.size());      // data() is valid even when there are no samples
+            }
+        }
+    }
+
+    ValuesVector    values;
+    ValuesVector    dividers;
+    Cmp             cmp;
+    size_t          num_samples;
+};
+
+template<class Block, class T, class Cmp>
+struct SampleSort<Block,T,Cmp>::Exchanger
+{
+                    Exchanger(ValuesVector values_, ValuesVector samples_, const Cmp& cmp_):
+                        values(values_), samples(samples_), cmp(cmp_)       {}
+    // round 0 scatters each value to the target chosen by the dividers; any other round gathers and sorts
+    void            operator()(Block* b, const ReduceProxy& rp) const
+    {
+        std::vector<T>& vals = b->*values;
+        if (rp.round() != 0)
+        {
+            dequeue_values(vals, rp, false);
+            std::sort(vals.begin(), vals.end(), cmp);
+            return;
+        }
+        std::vector<T>& cuts = b->*samples;     // enqueue values to the correct locations
+        for (size_t i = 0; i < vals.size(); ++i)
+        {
+            int to = std::lower_bound(cuts.begin(), cuts.end(), vals[i], cmp) - cuts.begin();
+            rp.enqueue(rp.out_link().target(to), vals[i]);
+        }
+        vals.clear();
+    }
+
+    ValuesVector    values;
+    ValuesVector    samples;
+    Cmp             cmp;
+};
+
+}
+
+}
+
+#endif
diff --git a/include/vtkmdiy/detail/block_traits.hpp b/include/vtkmdiy/detail/block_traits.hpp
new file mode 100644
index 000000000..eb4b7c547
--- /dev/null
+++ b/include/vtkmdiy/detail/block_traits.hpp
@@ -0,0 +1,31 @@
+#ifndef DIY_BLOCK_TRAITS_HPP
+#define DIY_BLOCK_TRAITS_HPP
+
+#include "traits.hpp"
+
+namespace diy
+{
+namespace detail
+{
+    // Deduces the block type from a callback type F: by default, the pointee of F's first argument.
+    template<class F>
+    struct block_traits
+    {
+        using type = typename std::remove_pointer<typename function_traits<F>::template arg<0>::type>::type;
+    };
+    // matches block member functions (non-const, then const): the block is the owning class
+    template<class Block, class R, class... Args>
+    struct block_traits<R(Block::*)(Args...)>
+    {
+        using type = Block;
+    };
+
+    template<class Block, class R, class... Args>
+    struct block_traits<R(Block::*)(Args...) const>
+    {
+        using type = Block;
+    };
+}
+}
+
+#endif
diff --git a/include/vtkmdiy/detail/collectives.hpp b/include/vtkmdiy/detail/collectives.hpp
new file mode 100644
index 000000000..a85a0f3e4
--- /dev/null
+++ b/include/vtkmdiy/detail/collectives.hpp
@@ -0,0 +1,54 @@
+#ifndef DIY_COLLECTIVES_HPP
+#define DIY_COLLECTIVES_HPP
+
+namespace diy
+{
+namespace detail
+{
+  struct CollectiveOp
+  {
+    virtual void    init()                                  =0;   // start a result (AllReduceOp: result = own input)
+    virtual void    update(const CollectiveOp& other)       =0;   // fold in `other` (AllReduceOp: combines other's input)
+    virtual void    global(const mpi::communicator& comm)   =0;   // combine across `comm` (AllReduceOp: mpi::all_reduce)
+    virtual void    copy_from(const CollectiveOp& other)    =0;   // adopt `other`'s result (AllReduceOp: copies it)
+    virtual void    result_out(void* dest) const            =0;   // write the stored value to *dest (cast to the value type)
+    virtual         ~CollectiveOp()                         {}
+  };
+
+  // All-reduce of a T under the binary operation Op; hooks are marked `override` so signature drift fails to compile.
+  template<class T, class Op>
+  struct AllReduceOp: public CollectiveOp
+  {
+          AllReduceOp(const T& x, Op op): in_(x), op_(op)   {}
+    // `other` is downcast without a check: it must be an AllReduceOp with the same T and Op
+    void  init() override                                   { out_ = in_; }
+    void  update(const CollectiveOp& other) override        { out_ = op_(out_, static_cast<const AllReduceOp&>(other).in_); }
+    void  global(const mpi::communicator& comm) override    { T res; mpi::all_reduce(comm, out_, res, op_); out_ = res; }
+    void  copy_from(const CollectiveOp& other) override     { out_ = static_cast<const AllReduceOp&>(other).out_; }
+    void  result_out(void* dest) const override             { *static_cast<T*>(dest) = out_; }   // dest must point to a T
+
+    private:
+      T     in_, out_;      // this op's own contribution; the running (then final) result
+      Op    op_;
+  };
+
+  // Stores a value and hands it back from result_out(); every reduction hook is deliberately a no-op.
+  template<class T>
+  struct Scratch: public CollectiveOp
+  {
+          Scratch(const T& x): x_(x)                        {}
+    // parameters are left unnamed because they are intentionally unused
+    void  init() override                                   {}
+    void  update(const CollectiveOp&) override              {}
+    void  global(const mpi::communicator&) override         {}
+    void  copy_from(const CollectiveOp&) override           {}
+    void  result_out(void* dest) const override             { *static_cast<T*>(dest) = x_; }   // dest must point to a T
+
+    private:
+      T     x_;
+  };
+
+}
+}
+
+#endif
diff --git a/include/vtkmdiy/detail/reduce/all-to-all.hpp b/include/vtkmdiy/detail/reduce/all-to-all.hpp
new file mode 100644
index 000000000..1e555db82
--- /dev/null
+++ b/include/vtkmdiy/detail/reduce/all-to-all.hpp
@@ -0,0 +1,169 @@
+#ifndef DIY_DETAIL_ALL_TO_ALL_HPP
+#define DIY_DETAIL_ALL_TO_ALL_HPP
+
+#include "../block_traits.hpp"
+
+namespace diy
+{
+
+namespace detail
+{
+  template<class Op>
+  struct AllToAllReduce
+  {
+    using Block = typename block_traits<Op>::type;
+    // the constructor fills all_neighbors_link with every block of the assigner (gid plus owning rank)
+         AllToAllReduce(const Op& op_, const Assigner& assigner):
+             op(op_)
+    {
+      for (int gid = 0; gid < assigner.nblocks(); ++gid)
+      {
+        BlockID nbr = { gid, assigner.rank(gid) };
+        all_neighbors_link.add_neighbor(nbr);
+      }
+    }
+    // `op` runs against a link to all blocks in the first and last rounds; rounds in between only regroup the queues
+    void operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
+    {
+      int k_in  = srp.in_link().size();
+      int k_out = srp.out_link().size();
+
+      if (k_in == 0 && k_out == 0)  // special case of a single block
+      {
+          ReduceProxy all_srp_out(srp, srp.block(), 0, srp.assigner(), empty_link,         all_neighbors_link);
+          ReduceProxy all_srp_in (srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
+
+          op(b, all_srp_out);
+          MemoryBuffer& in_queue = all_srp_in.incoming(all_srp_in.in_link().target(0).gid);
+          in_queue.swap(all_srp_out.outgoing(all_srp_out.out_link().target(0)));   // hand the outgoing queue straight back as input
+          in_queue.reset();
+
+          op(b, all_srp_in);
+          return;
+      }
+
+      if (k_in == 0)                // initial round
+      {
+        ReduceProxy all_srp(srp, srp.block(), 0, srp.assigner(), empty_link, all_neighbors_link);
+        op(b, all_srp);
+
+        Master::OutgoingQueues all_queues;
+        all_queues.swap(*all_srp.outgoing());       // clears out the queues and stores them locally
+
+        // enqueue outgoing
+        int group = all_srp.out_link().size() / k_out;   // destinations handled by each outgoing partner (integer division)
+        for (int i = 0; i < k_out; ++i)
+        {
+          std::pair<int,int> range(i*group, (i+1)*group);
+          srp.enqueue(srp.out_link().target(i), range);
+          for (int j = i*group; j < (i+1)*group; ++j)
+          {
+            int from = srp.gid();
+            int to   = all_srp.out_link().target(j).gid;
+            srp.enqueue(srp.out_link().target(i), std::make_pair(from, to));
+            srp.enqueue(srp.out_link().target(i), all_queues[all_srp.out_link().target(j)]);
+          }
+        }
+      } else if (k_out == 0)        // final round
+      {
+        // dequeue incoming + reorder into the correct order
+        ReduceProxy all_srp(srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
+
+        Master::IncomingQueues all_incoming;
+        all_incoming.swap(*srp.incoming());
+
+        std::pair<int, int> range;      // all the ranges should be the same
+        for (int i = 0; i < k_in; ++i)
+        {
+          int gid_in = srp.in_link().target(i).gid;
+          MemoryBuffer& in = all_incoming[gid_in];
+          load(in, range);
+          while(in)
+          {
+            std::pair<int, int> from_to;
+            load(in, from_to);
+            load(in, all_srp.incoming(from_to.first));   // the payload becomes the queue from its original sender
+            all_srp.incoming(from_to.first).reset();
+          }
+        }
+
+        op(b, all_srp);
+      } else                                        // intermediate round: reshuffle queues
+      {
+        // add up buffer sizes
+        std::vector<size_t> sizes_out(k_out, sizeof(std::pair<int,int>));
+        std::pair<int, int> range;      // all the ranges should be the same
+        for (int i = 0; i < k_in; ++i)
+        {
+          MemoryBuffer& in = srp.incoming(srp.in_link().target(i).gid);
+
+          load(in, range);
+          int group = (range.second - range.first)/k_out;   // NOTE(review): 0 if the range is narrower than k_out; it is a divisor below — confirm
+
+          std::pair<int, int> from_to;
+          size_t s;
+          while(in)
+          {
+            diy::load(in, from_to);
+            diy::load(in, s);
+
+            int j = (from_to.second - range.first) / group;
+            sizes_out[j] += s + sizeof(size_t) + sizeof(std::pair<int,int>);
+            in.skip(s);
+          }
+          in.reset();
+        }
+
+        // reserve outgoing buffers of correct size
+        int group = (range.second - range.first)/k_out;
+        for (int i = 0; i < k_out; ++i)
+        {
+          MemoryBuffer& out = srp.outgoing(srp.out_link().target(i));
+          out.reserve(sizes_out[i]);
+
+          std::pair<int, int> out_range;
+          out_range.first  = range.first + group*i;
+          out_range.second = range.first + group*(i+1);
+          save(out, out_range);
+        }
+
+        // re-direct the queues
+        for (int i = 0; i < k_in; ++i)
+        {
+          MemoryBuffer& in = srp.incoming(srp.in_link().target(i).gid);
+
+          std::pair<int, int> range;
+          load(in, range);
+
+          std::pair<int, int> from_to;
+          while(in)
+          {
+            load(in, from_to);
+            int j = (from_to.second - range.first) / group;
+
+            MemoryBuffer& out = srp.outgoing(srp.out_link().target(j));
+            save(out, from_to);
+            MemoryBuffer::copy(in, out);
+          }
+        }
+      }
+    }
+
+    const Op&           op;   // NOTE(review): held by reference — the functor given to the constructor must outlive this object
+    Link                all_neighbors_link, empty_link;
+  };
+
+  struct SkipIntermediate
+  {
+         SkipIntermediate(size_t rounds_):
+            rounds(rounds_)                                     {}
+    // true (skip) for every round other than round 0 and round number `rounds`
+    bool operator()(int round, int, const Master&) const        { return round != 0 && round != (int) rounds; }
+
+    size_t  rounds;
+  };
+}
+
+}
+
+#endif
diff --git a/include/vtkmdiy/detail/traits.hpp b/include/vtkmdiy/detail/traits.hpp
new file mode 100644
index 000000000..f47b733c8
--- /dev/null
+++ b/include/vtkmdiy/detail/traits.hpp
@@ -0,0 +1,318 @@
+//--------------------------------------
+// utils/traits: Additional type traits
+//--------------------------------------
+//
+//          Copyright kennytm (auraHT Ltd.) 2011.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file doc/LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+/**
+
+``<utils/traits.hpp>`` --- Additional type traits
+=================================================
+
+This module provides additional type traits and related functions, missing from
+the standard library.
+
+*/
+
+#ifndef DIY_UTILS_TRAITS_HPP
+#define DIY_UTILS_TRAITS_HPP
+
+#include <cstdlib>
+#include <tuple>
+#include <functional>
+#include <type_traits>
+
+namespace diy
+{
+namespace detail {
+
+/**
+.. macro:: DECLARE_HAS_TYPE_MEMBER(member_name)
+
+    This macro declares a template ``has_member_name`` which will check whether
+    a type member ``member_name`` exists in a particular type.
+
+    Example::
+
+        DECLARE_HAS_TYPE_MEMBER(result_type)
+
+        ...
+
+        printf("%d\n", has_result_type< std::plus<int> >::value);
+        // ^ prints '1' (true)
+        printf("%d\n", has_result_type< double(*)() >::value);
+        // ^ prints '0' (false)
+*/
+#define DECLARE_HAS_TYPE_MEMBER(member_name) \
+    template <typename, typename = void> /* primary template: used when no specialization matches */ \
+    struct has_##member_name \
+    { enum { value = false }; }; \
+    template <typename T> /* viable only when `typename T::member_name` is well-formed */ \
+    struct has_##member_name<T, typename std::enable_if<sizeof(typename T::member_name)||true>::type> \
+    { enum { value = true }; };
+
+/**
+.. type:: struct utils::function_traits<F>
+
+    Obtain compile-time information about a function object *F*.
+
+    This template currently supports the following types:
+
+    * Normal function types (``R(T...)``), function pointers (``R(*)(T...)``)
+      and function references (``R(&)(T...)`` and ``R(&&)(T...)``).
+    * Member functions (``R(C::*)(T...)``)
+    * ``std::function<F>``
+    * Type of lambda functions, and any other type that has a unique
+      ``operator()``.
+    * Type of ``std::mem_fn`` (only for GCC's libstdc++ and LLVM's libc++).
+      Following the C++ spec, the first argument will be a raw pointer.
+*/
+template <typename T>
+struct function_traits
+    : public function_traits<decltype(&T::operator())>      // fallback for class types: inspect their operator()
+{};
+
+namespace xx_impl
+{
+    // Selects the member-function-pointer type R (C::*)(A...) whose cv-qualifiers match those of C.
+    template <typename C, typename R, typename... A>
+    struct memfn_type
+    {
+    private:
+        static constexpr bool owner_is_const    = std::is_const<C>::value;
+        static constexpr bool owner_is_volatile = std::is_volatile<C>::value;
+
+        // the candidate for a const owner and the one for a non-const owner, volatility already resolved
+        typedef typename std::conditional<owner_is_volatile,
+            R (C::*)(A...) const volatile, R (C::*)(A...) const>::type const_owner_type;
+        typedef typename std::conditional<owner_is_volatile,
+            R (C::*)(A...) volatile,       R (C::*)(A...)>::type       mutable_owner_type;
+
+    public:
+        typedef typename std::conditional<owner_is_const, const_owner_type, mutable_owner_type>::type type;
+    };
+}
+
+template <typename ReturnType, typename... Args>
+struct function_traits<ReturnType(Args...)>
+{
+    /**
+    .. type:: type result_type
+
+        The type returned by calling an instance of the function object type *F*.
+    */
+    typedef ReturnType result_type;
+
+    /**
+    .. type:: type function_type
+
+        The function type (``R(T...)``).
+    */
+    typedef ReturnType function_type(Args...);
+
+    /**
+    .. type:: type member_function_type<OwnerType>
+
+        The member function type for an *OwnerType* (``R(OwnerType::*)(T...)``).
+    */
+    template <typename OwnerType>
+    using member_function_type = typename xx_impl::memfn_type<
+        typename std::remove_pointer<typename std::remove_reference<OwnerType>::type>::type,
+        ReturnType, Args...
+    >::type;
+
+    /**
+    .. data:: static const size_t arity
+
+        Number of arguments the function object will take.
+    */
+    enum { arity = sizeof...(Args) };   // declared as an enumerator rather than a size_t data member
+
+    /**
+    .. type:: type arg<n>::type
+
+        The type of the *n*-th argument.
+    */
+    template <size_t i>
+    struct arg
+    {
+        typedef typename std::tuple_element<i, std::tuple<Args...>>::type type;   // zero-based index into Args
+    };
+};
+
+template <typename ReturnType, typename... Args>
+struct function_traits<ReturnType(*)(Args...)>              // function pointers reuse the plain function-type traits
+    : public function_traits<ReturnType(Args...)>
+{};
+
+template <typename ClassType, typename ReturnType, typename... Args>
+struct function_traits<ReturnType(ClassType::*)(Args...)>
+    : public function_traits<ReturnType(Args...)>
+{
+    typedef ClassType& owner_type;                          // owner reference carries the member function's cv-qualifiers
+};
+
+template <typename ClassType, typename ReturnType, typename... Args>
+struct function_traits<ReturnType(ClassType::*)(Args...) const>
+    : public function_traits<ReturnType(Args...)>
+{
+    typedef const ClassType& owner_type;
+};
+
+template <typename ClassType, typename ReturnType, typename... Args>
+struct function_traits<ReturnType(ClassType::*)(Args...) volatile>
+    : public function_traits<ReturnType(Args...)>
+{
+    typedef volatile ClassType& owner_type;
+};
+
+template <typename ClassType, typename ReturnType, typename... Args>
+struct function_traits<ReturnType(ClassType::*)(Args...) const volatile>
+    : public function_traits<ReturnType(Args...)>
+{
+    typedef const volatile ClassType& owner_type;
+};
+
+template <typename FunctionType>
+struct function_traits<std::function<FunctionType>>         // std::function<F> has the traits of F
+    : public function_traits<FunctionType>
+{};
+
+#if defined(_GLIBCXX_FUNCTIONAL)    // presumably libstdc++'s <functional> include guard — TODO confirm
+#define MEM_FN_SYMBOL_XX0SL7G4Z0J std::_Mem_fn
+#elif defined(_LIBCPP_FUNCTIONAL)   // presumably libc++'s <functional> include guard — TODO confirm
+#define MEM_FN_SYMBOL_XX0SL7G4Z0J std::__mem_fn
+#endif
+
+#ifdef MEM_FN_SYMBOL_XX0SL7G4Z0J
+// each specialization maps the wrapper to a function type whose first argument is a pointer to the owner
+template <typename R, typename C>
+struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R C::*>>
+    : public function_traits<R(C*)>
+{};
+template <typename R, typename C, typename... A>
+struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...)>>
+    : public function_traits<R(C*, A...)>
+{};
+template <typename R, typename C, typename... A>
+struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) const>>
+    : public function_traits<R(const C*, A...)>
+{};
+template <typename R, typename C, typename... A>
+struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) volatile>>
+    : public function_traits<R(volatile C*, A...)>
+{};
+template <typename R, typename C, typename... A>
+struct function_traits<MEM_FN_SYMBOL_XX0SL7G4Z0J<R(C::*)(A...) const volatile>>
+    : public function_traits<R(const volatile C*, A...)>
+{};
+
+#undef MEM_FN_SYMBOL_XX0SL7G4Z0J
+#endif
+
+template <typename T>
+struct function_traits<T&> : public function_traits<T> {};              // references, with any cv, defer to T
+template <typename T>
+struct function_traits<const T&> : public function_traits<T> {};
+template <typename T>
+struct function_traits<volatile T&> : public function_traits<T> {};
+template <typename T>
+struct function_traits<const volatile T&> : public function_traits<T> {};
+template <typename T>
+struct function_traits<T&&> : public function_traits<T> {};             // likewise for rvalue references
+template <typename T>
+struct function_traits<const T&&> : public function_traits<T> {};
+template <typename T>
+struct function_traits<volatile T&&> : public function_traits<T> {};
+template <typename T>
+struct function_traits<const volatile T&&> : public function_traits<T> {};
+
+
+#define FORWARD_RES_8QR485JMSBT \
+    typename std::conditional< \
+        std::is_lvalue_reference<R>::value, \
+        T&, \
+        typename std::remove_reference<T>::type&& \
+    >::type
+
+/**
+.. function:: auto utils::forward_like<Like, T>(T&& t) noexcept
+
+    Forward the reference *t* like the type of *Like*. That means, if *Like* is
+    an lvalue (reference), this function will return an lvalue reference of *t*.
+    Otherwise, if *Like* is an rvalue, this function will return an rvalue
+    reference of *t*.
+
+    This is mainly used to propagate the expression category (lvalue/rvalue) of
+    a member of *Like*, generalizing ``std::forward``.
+*/
+template <typename R, typename T>
+FORWARD_RES_8QR485JMSBT forward_like(T&& input) noexcept
+{
+    return static_cast<FORWARD_RES_8QR485JMSBT>(input);   // T& if R is an lvalue reference, else remove_reference<T>::type&&
+}
+
+#undef FORWARD_RES_8QR485JMSBT
+
+/**
+.. type:: struct utils::copy_cv<From, To>
+
+    Copy the CV qualifier between the two types. For example,
+    ``utils::copy_cv<const int, double>::type`` will become ``const double``.
+*/
+template <typename From, typename To>
+struct copy_cv
+{
+private:
+    using bare   = typename std::remove_cv<To>::type;
+    using with_c = typename std::conditional<std::is_const<From>::value,
+                                             typename std::add_const<bare>::type, bare>::type;
+public:
+    /**
+    .. type:: type type
+
+        *To*, stripped of its own qualifiers and given exactly the const/volatile qualifiers of *From*.
+    */
+    using type = typename std::conditional<std::is_volatile<From>::value,
+                                           typename std::add_volatile<with_c>::type, with_c>::type;
+};
+
+/**
+.. type:: struct utils::pointee<T>
+
+    Returns the type obtained by dereferencing an instance of *T*. This is a
+    generalization of ``std::remove_pointer`` that also works with iterators.
+*/
+template <typename T>
+struct pointee
+{
+    /**
+    .. type:: type type
+
+        Result of dereferencing.
+    */
+    typedef typename std::remove_reference<decltype(*std::declval<T>())>::type type;   // type of `*t` with the reference stripped
+};
+
+/**
+.. function:: std::add_rvalue_reference<T>::type utils::rt_val<T>() noexcept
+
+    Returns a value of type *T*. It is guaranteed to do nothing and will not
+    throw a compile-time error, but using the returned result will cause
+    undefined behavior.
+*/
+template <typename T>
+typename std::add_rvalue_reference<T>::type rt_val() noexcept
+{
+    return std::move(*static_cast<T*>(nullptr));   // NOTE(review): dereferences a null pointer; presumably meant for unevaluated contexts only — confirm
+}
+
+}
+
+}
+
+#endif
+
diff --git a/include/vtkmdiy/fmt/format.cc b/include/vtkmdiy/fmt/format.cc
new file mode 100644
index 000000000..09d2ea9fd
--- /dev/null
+++ b/include/vtkmdiy/fmt/format.cc
@@ -0,0 +1,535 @@
+/*
+ Formatting library for C++
+
+ Copyright (c) 2012 - 2016, Victor Zverovich
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "format.h"
+
+#include <string.h>
+
+#include <cctype>
+#include <cerrno>
+#include <climits>
+#include <cmath>
+#include <cstdarg>
+#include <cstddef>  // for std::ptrdiff_t
+
+#if defined(_WIN32) && defined(__MINGW32__)
+# include <cstring>
+#endif
+
+#if FMT_USE_WINDOWS_H
+# if !defined(FMT_HEADER_ONLY) && !defined(WIN32_LEAN_AND_MEAN)
+#  define WIN32_LEAN_AND_MEAN
+# endif
+# if defined(NOMINMAX) || defined(FMT_WIN_MINMAX)
+#  include <windows.h>
+# else
+#  define NOMINMAX
+#  include <windows.h>
+#  undef NOMINMAX
+# endif
+#endif
+
+#if FMT_EXCEPTIONS
+# define FMT_TRY try
+# define FMT_CATCH(x) catch (x)
+#else
+# define FMT_TRY if (true)
+# define FMT_CATCH(x) if (false)
+#endif
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable: 4127)  // conditional expression is constant
+# pragma warning(disable: 4702)  // unreachable code
+// Disable deprecation warning for strerror. The latter is not called but
+// MSVC fails to detect it.
+# pragma warning(disable: 4996)
+#endif
+
+// Variadic stand-ins for strerror_r and strerror_s that return Null<>. They
+// are meant to be called only if corresponding system functions are not available.
+static inline fmt::internal::Null<> strerror_r(int, char *, ...) {
+  return fmt::internal::Null<>();
+}
+static inline fmt::internal::Null<> strerror_s(char *, std::size_t, ...) {
+  return fmt::internal::Null<>();
+}
+
+namespace fmt {
+
+FMT_FUNC internal::RuntimeError::~RuntimeError() FMT_DTOR_NOEXCEPT {}
+FMT_FUNC FormatError::~FormatError() FMT_DTOR_NOEXCEPT {}
+FMT_FUNC SystemError::~SystemError() FMT_DTOR_NOEXCEPT {}
+
+namespace {
+
+#ifndef _MSC_VER
+# define FMT_SNPRINTF snprintf
+#else  // _MSC_VER
+inline int fmt_snprintf(char *buffer, size_t size, const char *format, ...) {  // used as FMT_SNPRINTF when _MSC_VER is defined
+  va_list args;
+  va_start(args, format);
+  int result = vsnprintf_s(buffer, size, _TRUNCATE, format, args);  // _TRUNCATE: cut output to fit the buffer
+  va_end(args);
+  return result;
+}
+# define FMT_SNPRINTF fmt_snprintf
+#endif  // _MSC_VER
+
+#if defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
+# define FMT_SWPRINTF snwprintf
+#else
+# define FMT_SWPRINTF swprintf
+#endif // defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
+
+const char RESET_COLOR[] = "\x1b[0m";
+
+typedef void (*FormatFunc)(Writer &, int, StringRef);
+
+// Portable thread-safe version of strerror.
+// Sets buffer to point to a string describing the error code.
+// This can be either a pointer to a string stored in buffer,
+// or a pointer to some static immutable string.
+// Returns one of the following values:
+//   0      - success
+//   ERANGE - buffer is not large enough to store the error message
+//   other  - failure
+// Buffer should be at least of size 1.
+int safe_strerror(
+    int error_code, char *&buffer, std::size_t buffer_size) FMT_NOEXCEPT {
+  FMT_ASSERT(buffer != 0 && buffer_size != 0, "invalid buffer");
+
+  class StrError {
+   private:
+    int error_code_;
+    char *&buffer_;  // reference to the caller's pointer, so it can be redirected
+    std::size_t buffer_size_;
+
+    // A noop assignment operator to avoid bogus warnings.
+    void operator=(const StrError &) {}
+
+    // Handle the result of XSI-compliant version of strerror_r.
+    int handle(int result) {
+      // glibc versions before 2.13 return result in errno.
+      return result == -1 ? errno : result;
+    }
+
+    // Handle the result of GNU-specific version of strerror_r.
+    int handle(char *message) {
+      // If the buffer is full then the message is probably truncated.
+      if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
+        return ERANGE;
+      buffer_ = message;  // the message may live outside the supplied buffer
+      return 0;
+    }
+
+    // Handle the case when strerror_r is not available.
+    int handle(internal::Null<>) {
+      return fallback(strerror_s(buffer_, buffer_size_, error_code_));  // overload picked by strerror_s's return type
+    }
+
+    // Fallback to strerror_s when strerror_r is not available.
+    int fallback(int result) {
+      // If the buffer is full then the message is probably truncated.
+      return result == 0 && strlen(buffer_) == buffer_size_ - 1 ?
+            ERANGE : result;
+    }
+
+    // Fallback to strerror if strerror_r and strerror_s are not available.
+    int fallback(internal::Null<>) {
+      errno = 0;
+      buffer_ = strerror(error_code_);  // NOTE(review): plain strerror is not required to be thread-safe
+      return errno;
+    }
+
+   public:
+    StrError(int err_code, char *&buf, std::size_t buf_size)
+      : error_code_(err_code), buffer_(buf), buffer_size_(buf_size) {}
+
+    int run() {
+      // Suppress a warning about unused strerror_r.
+      strerror_r(0, FMT_NULL, "");
+      return handle(strerror_r(error_code_, buffer_, buffer_size_));  // overload picked by strerror_r's return type
+    }
+  };
+  return StrError(error_code, buffer, buffer_size).run();
+}
+
+void format_error_code(Writer &out, int error_code,
+                       StringRef message) FMT_NOEXCEPT {
+  // Report error code making sure that the output fits into
+  // INLINE_BUFFER_SIZE to avoid dynamic memory allocation and potential
+  // bad_alloc.
+  out.clear();  // discard anything written to out so far
+  static const char SEP[] = ": ";
+  static const char ERROR_STR[] = "error ";
+  // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
+  std::size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
+  typedef internal::IntTraits<int>::MainType MainType;
+  MainType abs_value = static_cast<MainType>(error_code);
+  if (internal::is_negative(error_code)) {
+    abs_value = 0 - abs_value;
+    ++error_code_size;  // one more character for the '-' sign
+  }
+  error_code_size += internal::count_digits(abs_value);
+  if (message.size() <= internal::INLINE_BUFFER_SIZE - error_code_size)  // omit the message if it would not fit
+    out << message << SEP;
+  out << ERROR_STR << error_code;
+  assert(out.size() <= internal::INLINE_BUFFER_SIZE);
+}
+
+void report_error(FormatFunc func, int error_code,
+                  StringRef message) FMT_NOEXCEPT {
+  MemoryWriter full_message;
+  func(full_message, error_code, message);  // func builds the complete text
+  // Use Writer::data instead of Writer::c_str to avoid potential memory
+  // allocation.
+  std::fwrite(full_message.data(), full_message.size(), 1, stderr);  // written as one item of size() bytes
+  std::fputc('\n', stderr);
+}
+}  // namespace
+
+FMT_FUNC void SystemError::init(
+    int err_code, CStringRef format_str, ArgList args) {
+  error_code_ = err_code;
+  MemoryWriter w;
+  format_system_error(w, err_code, format(format_str, args));  // user message plus system error text
+  std::runtime_error &base = *this;  // refer to this object as a std::runtime_error
+  base = std::runtime_error(w.str());  // assign the formatted message to it
+}
+
+template <typename T>
+int internal::CharTraits<char>::format_float(
+    char *buffer, std::size_t size, const char *format,
+    unsigned width, int precision, T value) {
+  if (width == 0) {  // zero width: no width argument is passed
+    return precision < 0 ?  // negative precision: no precision argument is passed
+        FMT_SNPRINTF(buffer, size, format, value) :
+        FMT_SNPRINTF(buffer, size, format, precision, value);
+  }
+  return precision < 0 ?  // presumably format has a '*' for each extra argument - confirm at call sites
+      FMT_SNPRINTF(buffer, size, format, width, value) :
+      FMT_SNPRINTF(buffer, size, format, width, precision, value);
+}
+
+template <typename T>
+int internal::CharTraits<wchar_t>::format_float(
+    wchar_t *buffer, std::size_t size, const wchar_t *format,
+    unsigned width, int precision, T value) {
+  if (width == 0) {  // zero width: no width argument is passed
+    return precision < 0 ?  // negative precision: no precision argument is passed
+        FMT_SWPRINTF(buffer, size, format, value) :
+        FMT_SWPRINTF(buffer, size, format, precision, value);
+  }
+  return precision < 0 ?  // presumably format has a '*' for each extra argument - confirm at call sites
+      FMT_SWPRINTF(buffer, size, format, width, value) :
+      FMT_SWPRINTF(buffer, size, format, width, precision, value);
+}
+
+template <typename T>
+const char internal::BasicData<T>::DIGITS[] =  // "00" through "99" as consecutive character pairs
+    "0001020304050607080910111213141516171819"
+    "2021222324252627282930313233343536373839"
+    "4041424344454647484950515253545556575859"
+    "6061626364656667686970717273747576777879"
+    "8081828384858687888990919293949596979899";
+
+#define FMT_POWERS_OF_10(factor) \
+  factor * 10, \
+  factor * 100, \
+  factor * 1000, \
+  factor * 10000, \
+  factor * 100000, \
+  factor * 1000000, \
+  factor * 10000000, \
+  factor * 100000000, \
+  factor * 1000000000
+
+template <typename T>
+const uint32_t internal::BasicData<T>::POWERS_OF_10_32[] = {
+  0, FMT_POWERS_OF_10(1)  // 0, then 10, 100, ..., 10^9
+};
+
+template <typename T>
+const uint64_t internal::BasicData<T>::POWERS_OF_10_64[] = {
+  0,
+  FMT_POWERS_OF_10(1),  // 10 ... 10^9
+  FMT_POWERS_OF_10(ULongLong(1000000000)),  // 10^10 ... 10^18
+  // Multiply several constants instead of using a single long long constant
+  // to avoid warnings about C++98 not supporting long long.
+  ULongLong(1000000000) * ULongLong(1000000000) * 10
+};
+
+FMT_FUNC void internal::report_unknown_type(char code, const char *type) {
+  (void)type;  // unused when FMT_THROW expands to assert(false)
+  if (std::isprint(static_cast<unsigned char>(code))) {
+    FMT_THROW(FormatError(
+        format("unknown format code '{}' for {}", code, type)));
+  }  // non-printable codes are reported as a hex escape; cast via unsigned char so a negative char is not sign-extended
+  FMT_THROW(FormatError(
+      format("unknown format code '\\x{:02x}' for {}",
+        static_cast<unsigned>(static_cast<unsigned char>(code)), type)));
+}
+
+#if FMT_USE_WINDOWS_H
+
+FMT_FUNC internal::UTF8ToUTF16::UTF8ToUTF16(StringRef s) {
+  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
+  if (s.size() > INT_MAX)
+    FMT_THROW(WindowsError(ERROR_INVALID_PARAMETER, ERROR_MSG));
+  int s_size = static_cast<int>(s.size());  // MultiByteToWideChar fails on a zero length, so empty input skips it
+  int length = s_size == 0 ? 0 : MultiByteToWideChar(
+      CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, FMT_NULL, 0);
+  if (s_size != 0 && length == 0)
+    FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
+  buffer_.resize(length + 1);  // converted text plus the terminating null
+  if (s_size != 0) length = MultiByteToWideChar(
+    CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, &buffer_[0], length);
+  if (s_size != 0 && length == 0)
+    FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
+  buffer_[length] = 0;
+}
+
+FMT_FUNC internal::UTF16ToUTF8::UTF16ToUTF8(WStringRef s) {
+  if (int error_code = convert(s)) {  // a non-zero result is reported as a WindowsError
+    FMT_THROW(WindowsError(error_code,
+        "cannot convert string from UTF-16 to UTF-8"));
+  }
+}
+
+FMT_FUNC int internal::UTF16ToUTF8::convert(WStringRef s) {
+  if (s.size() > INT_MAX)
+    return ERROR_INVALID_PARAMETER;
+  int s_size = static_cast<int>(s.size());  // WideCharToMultiByte fails on a zero length, so empty input skips it
+  int length = s_size == 0 ? 0 : WideCharToMultiByte(
+    CP_UTF8, 0, s.data(), s_size, FMT_NULL, 0, FMT_NULL, FMT_NULL);
+  if (s_size != 0 && length == 0)
+    return GetLastError();
+  buffer_.resize(length + 1);  // converted text plus the terminating null
+  if (s_size != 0) length = WideCharToMultiByte(
+    CP_UTF8, 0, s.data(), s_size, &buffer_[0], length, FMT_NULL, FMT_NULL);
+  if (s_size != 0 && length == 0)
+    return GetLastError();
+  buffer_[length] = 0;
+  return 0;  // 0 signals success to callers
+}
+
+FMT_FUNC void WindowsError::init(
+    int err_code, CStringRef format_str, ArgList args) {
+  error_code_ = err_code;
+  MemoryWriter w;
+  internal::format_windows_error(w, err_code, format(format_str, args));  // user message plus system error text
+  std::runtime_error &base = *this;  // refer to this object as a std::runtime_error
+  base = std::runtime_error(w.str());  // assign the formatted message to it
+}
+
+FMT_FUNC void internal::format_windows_error(
+    Writer &out, int error_code, StringRef message) FMT_NOEXCEPT {
+  FMT_TRY {
+    MemoryBuffer<wchar_t, INLINE_BUFFER_SIZE> buffer;
+    buffer.resize(INLINE_BUFFER_SIZE);
+    for (;;) {
+      wchar_t *system_message = &buffer[0];
+      int result = FormatMessageW(
+        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+        FMT_NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+        system_message, static_cast<uint32_t>(buffer.size()), FMT_NULL);
+      if (result != 0) {
+        UTF16ToUTF8 utf8_message;
+        if (utf8_message.convert(system_message) == ERROR_SUCCESS) {
+          out << message << ": " << utf8_message;
+          return;
+        }
+        break;  // UTF-8 conversion failed, report error code instead
+      }
+      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+        break;  // Can't get error message, report error code instead.
+      buffer.resize(buffer.size() * 2);  // retry with a buffer twice as large
+    }
+  } FMT_CATCH(...) {}  // an exception also falls through to the error-code form
+  fmt::format_error_code(out, error_code, message);  // 'fmt::' is for bcc32.
+}
+
+#endif  // FMT_USE_WINDOWS_H
+
+FMT_FUNC void format_system_error(
+    Writer &out, int error_code, StringRef message) FMT_NOEXCEPT {
+  FMT_TRY {
+    internal::MemoryBuffer<char, internal::INLINE_BUFFER_SIZE> buffer;
+    buffer.resize(internal::INLINE_BUFFER_SIZE);
+    for (;;) {
+      char *system_message = &buffer[0];  // safe_strerror may redirect this pointer
+      int result = safe_strerror(error_code, system_message, buffer.size());
+      if (result == 0) {
+        out << message << ": " << system_message;
+        return;
+      }
+      if (result != ERANGE)
+        break;  // Can't get error message, report error code instead.
+      buffer.resize(buffer.size() * 2);  // ERANGE: retry with a buffer twice as large
+    }
+  } FMT_CATCH(...) {}  // an exception also falls through to the error-code form
+  fmt::format_error_code(out, error_code, message);  // 'fmt::' is for bcc32.
+}
+
+template <typename Char>
+void internal::ArgMap<Char>::init(const ArgList &args) {
+  if (!map_.empty())  // nothing to do if the map already has entries
+    return;
+  typedef internal::NamedArg<Char> NamedArg;
+  const NamedArg *named_arg = FMT_NULL;
+  bool use_values =  // last packed slot empty: arguments are read from values_ below
+      args.type(ArgList::MAX_PACKED_ARGS - 1) == internal::Arg::NONE;
+  if (use_values) {
+    for (unsigned i = 0;/*nothing*/; ++i) {  // runs until the first NONE entry
+      internal::Arg::Type arg_type = args.type(i);
+      switch (arg_type) {
+      case internal::Arg::NONE:
+        return;
+      case internal::Arg::NAMED_ARG:
+        named_arg = static_cast<const NamedArg*>(args.values_[i].pointer);
+        map_.push_back(Pair(named_arg->name, *named_arg));
+        break;
+      default:
+        /*nothing*/;
+      }
+    }
+    return;
+  }
+  for (unsigned i = 0; i != ArgList::MAX_PACKED_ARGS; ++i) {  // first MAX_PACKED_ARGS entries: type comes from args.type(i)
+    internal::Arg::Type arg_type = args.type(i);
+    if (arg_type == internal::Arg::NAMED_ARG) {
+      named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
+      map_.push_back(Pair(named_arg->name, *named_arg));
+    }
+  }
+  for (unsigned i = ArgList::MAX_PACKED_ARGS;/*nothing*/; ++i) {  // remaining entries: type is stored in args_[i] itself
+    switch (args.args_[i].type) {
+    case internal::Arg::NONE:
+      return;
+    case internal::Arg::NAMED_ARG:
+      named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
+      map_.push_back(Pair(named_arg->name, *named_arg));
+      break;
+    default:
+      /*nothing*/;
+    }
+  }
+}
+
+template <typename Char>
+void internal::FixedBuffer<Char>::grow(std::size_t) {  // the requested size is ignored
+  FMT_THROW(std::runtime_error("buffer overflow"));  // every grow request is reported as an overflow
+}
+
+FMT_FUNC internal::Arg internal::FormatterBase::do_get_arg(
+    unsigned arg_index, const char *&error) {
+  internal::Arg arg = args_[arg_index];
+  switch (arg.type) {
+  case internal::Arg::NONE:
+    error = "argument index out of range";  // reported via the out-parameter; arg is still returned
+    break;
+  case internal::Arg::NAMED_ARG:
+    arg = *static_cast<const internal::Arg*>(arg.pointer);  // replace with the Arg that pointer refers to
+    break;
+  default:
+    /*nothing*/;
+  }
+  return arg;
+}
+
+FMT_FUNC void report_system_error(
+    int error_code, fmt::StringRef message) FMT_NOEXCEPT {
+  // 'fmt::' is for bcc32.
+  report_error(format_system_error, error_code, message);  // format_system_error produces the text to report
+}
+
+#if FMT_USE_WINDOWS_H
+FMT_FUNC void report_windows_error(
+    int error_code, fmt::StringRef message) FMT_NOEXCEPT {
+  // 'fmt::' is for bcc32.
+  report_error(internal::format_windows_error, error_code, message);  // format_windows_error produces the text to report
+}
+#endif
+
+FMT_FUNC void print(std::FILE *f, CStringRef format_str, ArgList args) {
+  MemoryWriter w;
+  w.write(format_str, args);  // format into memory first
+  std::fwrite(w.data(), 1, w.size(), f);  // then emit with a single fwrite; its result is not checked
+}
+
+FMT_FUNC void print(CStringRef format_str, ArgList args) {
+  print(stdout, format_str, args);  // same as the FILE* overload, targeting stdout
+}
+
+FMT_FUNC void print_colored(Color c, CStringRef format, ArgList args) {
+  char escape[] = "\x1b[30m";
+  escape[3] = static_cast<char>('0' + c);  // overwrite the '0' of "30" with the digit for c
+  std::fputs(escape, stdout);
+  print(format, args);
+  std::fputs(RESET_COLOR, stdout);  // emit the reset sequence after the text
+}
+
+#ifndef FMT_HEADER_ONLY
+
+template struct internal::BasicData<void>;
+
+// Explicit instantiations for char.
+
+template void internal::FixedBuffer<char>::grow(std::size_t);
+
+template void internal::ArgMap<char>::init(const ArgList &args);
+
+template FMT_API int internal::CharTraits<char>::format_float(
+    char *buffer, std::size_t size, const char *format,
+    unsigned width, int precision, double value);
+
+template FMT_API int internal::CharTraits<char>::format_float(
+    char *buffer, std::size_t size, const char *format,
+    unsigned width, int precision, long double value);
+
+// Explicit instantiations for wchar_t.
+
+template void internal::FixedBuffer<wchar_t>::grow(std::size_t);
+
+template void internal::ArgMap<wchar_t>::init(const ArgList &args);
+
+template FMT_API int internal::CharTraits<wchar_t>::format_float(
+    wchar_t *buffer, std::size_t size, const wchar_t *format,
+    unsigned width, int precision, double value);
+
+template FMT_API int internal::CharTraits<wchar_t>::format_float(
+    wchar_t *buffer, std::size_t size, const wchar_t *format,
+    unsigned width, int precision, long double value);
+
+#endif  // FMT_HEADER_ONLY
+
+}  // namespace fmt
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
diff --git a/include/vtkmdiy/fmt/format.h b/include/vtkmdiy/fmt/format.h
new file mode 100644
index 000000000..02452c397
--- /dev/null
+++ b/include/vtkmdiy/fmt/format.h
@@ -0,0 +1,4014 @@
+/*
+ Formatting library for C++
+
+ Copyright (c) 2012 - 2016, Victor Zverovich
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FMT_FORMAT_H_
+#define FMT_FORMAT_H_
+
+#define FMT_HEADER_ONLY     // Added by diy for header-only usage
+
+#include <cassert>
+#include <clocale>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <utility>  // for std::pair
+
+// The fmt library version in the form major * 10000 + minor * 100 + patch.
+#define FMT_VERSION 40000
+
+#ifdef _SECURE_SCL
+# define FMT_SECURE_SCL _SECURE_SCL
+#else
+# define FMT_SECURE_SCL 0
+#endif
+
+#if FMT_SECURE_SCL
+# include <iterator>
+#endif
+
+#ifdef _MSC_VER
+# define FMT_MSC_VER _MSC_VER
+#else
+# define FMT_MSC_VER 0
+#endif
+
+#if FMT_MSC_VER && FMT_MSC_VER <= 1500
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+typedef __int64          intmax_t;
+#else
+#include <stdint.h>
+#endif
+
+#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
+# ifdef FMT_EXPORT
+#  define FMT_API __declspec(dllexport)
+# elif defined(FMT_SHARED)
+#  define FMT_API __declspec(dllimport)
+# endif
+#endif
+#ifndef FMT_API
+# define FMT_API
+#endif
+
+#ifdef __GNUC__
+# define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+# define FMT_GCC_EXTENSION __extension__
+# if FMT_GCC_VERSION >= 406
+#  pragma GCC diagnostic push
+// Disable the warning about "long long" which is sometimes reported even
+// when using __extension__.
+#  pragma GCC diagnostic ignored "-Wlong-long"
+// Disable the warning about declaration shadowing because it affects too
+// many valid cases.
+#  pragma GCC diagnostic ignored "-Wshadow"
+// Disable the warning about implicit conversions that may change the sign of
+// an integer; silencing it otherwise would require many explicit casts.
+#  pragma GCC diagnostic ignored "-Wsign-conversion"
+# endif
+# if __cplusplus >= 201103L || defined __GXX_EXPERIMENTAL_CXX0X__
+#  define FMT_HAS_GXX_CXX11 1
+# endif
+#else
+# define FMT_GCC_EXTENSION
+#endif
+
+#if defined(__INTEL_COMPILER)
+# define FMT_ICC_VERSION __INTEL_COMPILER
+#elif defined(__ICL)
+# define FMT_ICC_VERSION __ICL
+#endif
+
+#if defined(__clang__) && !defined(FMT_ICC_VERSION)
+# define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wdocumentation-unknown-command"
+# pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+#ifdef __GNUC_LIBSTD__
+# define FMT_GNUC_LIBSTD_VERSION (__GNUC_LIBSTD__ * 100 + __GNUC_LIBSTD_MINOR__)
+#endif
+
+#ifdef __has_feature
+# define FMT_HAS_FEATURE(x) __has_feature(x)
+#else
+# define FMT_HAS_FEATURE(x) 0
+#endif
+
+#ifdef __has_builtin
+# define FMT_HAS_BUILTIN(x) __has_builtin(x)
+#else
+# define FMT_HAS_BUILTIN(x) 0
+#endif
+
+#ifdef __has_cpp_attribute
+# define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define FMT_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+#ifndef FMT_USE_VARIADIC_TEMPLATES
+// Variadic templates are available in GCC since version 4.4
+// (http://gcc.gnu.org/projects/cxx0x.html) and in Visual C++
+// since version 2013.
+# define FMT_USE_VARIADIC_TEMPLATES \
+   (FMT_HAS_FEATURE(cxx_variadic_templates) || \
+       (FMT_GCC_VERSION >= 404 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1800)
+#endif
+
+#ifndef FMT_USE_RVALUE_REFERENCES
+// Don't use rvalue references when compiling with clang and an old libstdc++
+// as the latter doesn't provide std::move.
+# if defined(FMT_GNUC_LIBSTD_VERSION) && FMT_GNUC_LIBSTD_VERSION <= 402
+#  define FMT_USE_RVALUE_REFERENCES 0
+# else
+#  define FMT_USE_RVALUE_REFERENCES \
+    (FMT_HAS_FEATURE(cxx_rvalue_references) || \
+        (FMT_GCC_VERSION >= 403 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1600)
+# endif
+#endif
+
+// Check if exceptions are disabled.
+#if defined(__GNUC__) && !defined(__EXCEPTIONS)
+# define FMT_EXCEPTIONS 0
+#endif
+#if FMT_MSC_VER && !_HAS_EXCEPTIONS
+# define FMT_EXCEPTIONS 0
+#endif
+#ifndef FMT_EXCEPTIONS
+# define FMT_EXCEPTIONS 1
+#endif
+
+#ifndef FMT_THROW
+# if FMT_EXCEPTIONS
+#  define FMT_THROW(x) throw x
+# else
+#  define FMT_THROW(x) assert(false)
+# endif
+#endif
+
+// Define FMT_USE_NOEXCEPT to make fmt use noexcept (C++11 feature).
+#ifndef FMT_USE_NOEXCEPT
+# define FMT_USE_NOEXCEPT 0
+#endif
+
+#if FMT_USE_NOEXCEPT || FMT_HAS_FEATURE(cxx_noexcept) || \
+    (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || \
+    FMT_MSC_VER >= 1900
+# define FMT_DETECTED_NOEXCEPT noexcept
+#else
+# define FMT_DETECTED_NOEXCEPT throw()
+#endif
+
+#ifndef FMT_NOEXCEPT
+# if FMT_EXCEPTIONS
+#  define FMT_NOEXCEPT FMT_DETECTED_NOEXCEPT
+# else
+#  define FMT_NOEXCEPT
+# endif
+#endif
+
+// This is needed because GCC still uses throw() in its headers when exceptions
+// are disabled.
+#if FMT_GCC_VERSION
+# define FMT_DTOR_NOEXCEPT FMT_DETECTED_NOEXCEPT
+#else
+# define FMT_DTOR_NOEXCEPT FMT_NOEXCEPT
+#endif
+
+#ifndef FMT_OVERRIDE
+# if (defined(FMT_USE_OVERRIDE) && FMT_USE_OVERRIDE) || FMT_HAS_FEATURE(cxx_override) || \
+   (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || \
+   FMT_MSC_VER >= 1900
+#  define FMT_OVERRIDE override
+# else
+#  define FMT_OVERRIDE
+# endif
+#endif
+
+#ifndef FMT_NULL
+# if FMT_HAS_FEATURE(cxx_nullptr) || \
+   (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || \
+   FMT_MSC_VER >= 1600
+#  define FMT_NULL nullptr
+# else
+#  define FMT_NULL NULL
+# endif
+#endif
+
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#ifndef FMT_USE_DELETED_FUNCTIONS
+# define FMT_USE_DELETED_FUNCTIONS 0
+#endif
+
+#if FMT_USE_DELETED_FUNCTIONS || FMT_HAS_FEATURE(cxx_deleted_functions) || \
+  (FMT_GCC_VERSION >= 404 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1800
+# define FMT_DELETED_OR_UNDEFINED  = delete
+# define FMT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+    TypeName(const TypeName&) = delete; \
+    TypeName& operator=(const TypeName&) = delete
+#else
+# define FMT_DELETED_OR_UNDEFINED
+# define FMT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+    TypeName(const TypeName&); \
+    TypeName& operator=(const TypeName&)
+#endif
+
+#ifndef FMT_USE_DEFAULTED_FUNCTIONS
+# define FMT_USE_DEFAULTED_FUNCTIONS 0
+#endif
+
+#ifndef FMT_DEFAULTED_COPY_CTOR
+# if FMT_USE_DEFAULTED_FUNCTIONS || FMT_HAS_FEATURE(cxx_defaulted_functions) || \
+   (FMT_GCC_VERSION >= 404 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1800
+#  define FMT_DEFAULTED_COPY_CTOR(TypeName) \
+    TypeName(const TypeName&) = default;
+# else
+#  define FMT_DEFAULTED_COPY_CTOR(TypeName)
+# endif
+#endif
+
+#ifndef FMT_USE_USER_DEFINED_LITERALS
+// All compilers which support UDLs also support variadic templates. This
+// makes the fmt::literals implementation easier. However, an explicit check
+// for variadic templates is added here just in case.
+// For Intel's compiler both it and the system gcc/msc must support UDLs.
+# define FMT_USE_USER_DEFINED_LITERALS \
+   FMT_USE_VARIADIC_TEMPLATES && FMT_USE_RVALUE_REFERENCES && \
+   (FMT_HAS_FEATURE(cxx_user_literals) || \
+     (FMT_GCC_VERSION >= 407 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900) && \
+   (!defined(FMT_ICC_VERSION) || FMT_ICC_VERSION >= 1500)
+#endif
+
+#ifndef FMT_USE_EXTERN_TEMPLATES
+# define FMT_USE_EXTERN_TEMPLATES \
+    (FMT_CLANG_VERSION >= 209 || (FMT_GCC_VERSION >= 303 && FMT_HAS_GXX_CXX11))
+#endif
+
+#ifdef FMT_HEADER_ONLY
+// If header only do not use extern templates.
+# undef FMT_USE_EXTERN_TEMPLATES
+# define FMT_USE_EXTERN_TEMPLATES 0
+#endif
+
+#ifndef FMT_ASSERT
+# define FMT_ASSERT(condition, message) assert((condition) && message)
+#endif
+
+// __builtin_clz is broken in clang with Microsoft CodeGen:
+// https://github.com/fmtlib/fmt/issues/519
+#ifndef _MSC_VER
+# if FMT_GCC_VERSION >= 400 || FMT_HAS_BUILTIN(__builtin_clz)
+#  define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
+# endif
+
+# if FMT_GCC_VERSION >= 400 || FMT_HAS_BUILTIN(__builtin_clzll)
+#  define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
+# endif
+#endif
+
+// Some compilers masquerade as both MSVC and GCC-likes or
+// otherwise support __builtin_clz and __builtin_clzll, so
+// only define FMT_BUILTIN_CLZ using the MSVC intrinsics
+// if the clz and clzll builtins are not available.
+#if FMT_MSC_VER && !defined(FMT_BUILTIN_CLZLL) && !defined(_MANAGED)
+# include <intrin.h>  // _BitScanReverse, _BitScanReverse64
+
+namespace fmt {
+namespace internal {
+# pragma intrinsic(_BitScanReverse)
+inline uint32_t clz(uint32_t x) {
+  unsigned long r = 0;
+  _BitScanReverse(&r, x);  // r receives the index of the highest set bit
+
+  assert(x != 0);
+  // Static analysis complains about using uninitialized data
+  // "r", but the only way that can happen is if "x" is 0,
+  // which the callers guarantee to not happen.
+# pragma warning(suppress: 6102)
+  return 31 - r;  // convert the bit index into a leading-zero count
+}
+# define FMT_BUILTIN_CLZ(n) fmt::internal::clz(n)
+
+# ifdef _WIN64
+#  pragma intrinsic(_BitScanReverse64)
+# endif
+
+inline uint32_t clzll(uint64_t x) {
+  unsigned long r = 0;
+# ifdef _WIN64
+  _BitScanReverse64(&r, x);  // r receives the index of the highest set bit
+# else
+  // Scan the high 32 bits.
+  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32)))
+    return 63 - (r + 32);  // r is relative to the high half, so add 32
+
+  // Scan the low 32 bits.
+  _BitScanReverse(&r, static_cast<uint32_t>(x));
+# endif
+
+  assert(x != 0);
+  // Static analysis complains about using uninitialized data
+  // "r", but the only way that can happen is if "x" is 0,
+  // which the callers guarantee to not happen.
+# pragma warning(suppress: 6102)
+  return 63 - r;  // convert the bit index into a leading-zero count
+}
+# define FMT_BUILTIN_CLZLL(n) fmt::internal::clzll(n)
+}
+}
+#endif
+
+namespace fmt {
+namespace internal {
+struct DummyInt {
+  int data[2];  // two ints, so sizeof(DummyInt) differs from sizeof(int) and sizeof(bool)
+  operator int() const { return 0; }
+};
+typedef std::numeric_limits<fmt::internal::DummyInt> FPUtil;
+
+// Stand-ins for system functions such as signbit and ecvt. Being variadic,
+// they are meant to be selected only if the latter are not available.
+inline DummyInt signbit(...) { return DummyInt(); }
+inline DummyInt _ecvt_s(...) { return DummyInt(); }
+inline DummyInt isinf(...) { return DummyInt(); }
+inline DummyInt _finite(...) { return DummyInt(); }
+inline DummyInt isnan(...) { return DummyInt(); }
+inline DummyInt _isnan(...) { return DummyInt(); }
+
+// A helper function to suppress bogus "conditional expression is constant"
+// warnings.
+template <typename T>
+inline T const_check(T value) { return value; }  // returns its argument unchanged
+}
+}  // namespace fmt
+
+namespace std {
+// Standard permits specialization of std::numeric_limits. This specialization
+// is used to resolve ambiguity between isinf and std::isinf in glibc:
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48891
+// and the same for isnan and signbit.
+template <>
+class numeric_limits<fmt::internal::DummyInt> :
+    public std::numeric_limits<int> {
+ public:
+  // Portable version of isinf.
+  template <typename T>
+  static bool isinfinity(T x) {
+    using namespace fmt::internal;
+    // The resolution "priority" is:
+    // isinf macro > std::isinf > ::isinf > fmt::internal::isinf
+    if (const_check(sizeof(isinf(x)) == sizeof(bool) ||  // the DummyInt stand-in has a different size
+                    sizeof(isinf(x)) == sizeof(int))) {
+      return isinf(x) != 0;
+    }
+    return !_finite(static_cast<double>(x));
+  }
+
+  // Portable version of isnan.
+  template <typename T>
+  static bool isnotanumber(T x) {
+    using namespace fmt::internal;
+    if (const_check(sizeof(isnan(x)) == sizeof(bool) ||  // the DummyInt stand-in has a different size
+                    sizeof(isnan(x)) == sizeof(int))) {
+      return isnan(x) != 0;
+    }
+    return _isnan(static_cast<double>(x)) != 0;
+  }
+
+  // Portable version of signbit.
+  static bool isnegative(double x) {
+    using namespace fmt::internal;
+    if (const_check(sizeof(signbit(x)) == sizeof(bool) ||  // the DummyInt stand-in has a different size
+                    sizeof(signbit(x)) == sizeof(int))) {
+      return signbit(x) != 0;
+    }
+    if (x < 0) return true;
+    if (!isnotanumber(x)) return false;  // past this point x is a NaN
+    int dec = 0, sign = 0;
+    char buffer[2];  // The buffer size must be >= 2 or _ecvt_s will fail.
+    _ecvt_s(buffer, sizeof(buffer), x, 0, &dec, &sign);  // only the sign output is used
+    return sign != 0;
+  }
+};
+}  // namespace std
+
+namespace fmt {
+
+// Fix the warning about long long on older versions of GCC
+// that don't support the diagnostic pragma.
+FMT_GCC_EXTENSION typedef long long LongLong;
+FMT_GCC_EXTENSION typedef unsigned long long ULongLong;
+
+#if FMT_USE_RVALUE_REFERENCES
+using std::move;
+#endif
+
+template <typename Char>
+class BasicWriter;
+
+typedef BasicWriter<char> Writer;
+typedef BasicWriter<wchar_t> WWriter;
+
+template <typename Char>
+class ArgFormatter;
+
+struct FormatSpec;
+
+template <typename Impl, typename Char, typename Spec = fmt::FormatSpec>
+class BasicPrintfArgFormatter;
+
+template <typename CharType,
+          typename ArgFormatter = fmt::ArgFormatter<CharType> >
+class BasicFormatter;
+
+/**
+  \rst
+  A string reference. It can be constructed from a C string or
+  ``std::basic_string``.
+
+  You can use one of the following typedefs for common character types:
+
+  +------------+-------------------------+
+  | Type       | Definition              |
+  +============+=========================+
+  | StringRef  | BasicStringRef<char>    |
+  +------------+-------------------------+
+  | WStringRef | BasicStringRef<wchar_t> |
+  +------------+-------------------------+
+
+  This class is most useful as a parameter type to allow passing
+  different types of strings to a function, for example::
+
+    template <typename... Args>
+    std::string format(StringRef format_str, const Args & ... args);
+
+    format("{}", 42);
+    format(std::string("{}"), 42);
+  \endrst
+ */
+template <typename Char>
+class BasicStringRef {
+ private:
+  const Char *data_;  // first character of the viewed range (not owned)
+  std::size_t size_;  // number of characters in the viewed range
+
+ public:
+  /** Views the first *size* characters starting at *s*; nothing is copied. */
+  BasicStringRef(const Char *s, std::size_t size) : data_(s), size_(size) {}
+
+  /**
+    \rst
+    Views a null-terminated C string. The length is measured with
+    ``std::char_traits<Char>::length``.
+    \endrst
+   */
+  BasicStringRef(const Char *s)
+    : data_(s), size_(std::char_traits<Char>::length(s)) {}
+
+  /**
+    \rst
+    Views the contents of a ``std::basic_string`` object.
+    \endrst
+   */
+  template <typename Allocator>
+  BasicStringRef(
+      const std::basic_string<Char, std::char_traits<Char>, Allocator> &s)
+  : data_(s.c_str()), size_(s.size()) {}
+
+  /**
+    \rst
+    Copies the viewed characters into a new ``std::basic_string``.
+    \endrst
+   */
+  std::basic_string<Char> to_string() const {
+    return std::basic_string<Char>(data_, size_);
+  }
+
+  /** Pointer to the first viewed character. */
+  const Char *data() const { return data_; }
+
+  /** Number of viewed characters. */
+  std::size_t size() const { return size_; }
+
+  // Three-way lexicographic comparison; a shorter equal prefix sorts first.
+  int compare(BasicStringRef other) const {
+    const std::size_t common = other.size_ < size_ ? other.size_ : size_;
+    const int order = std::char_traits<Char>::compare(data_, other.data_, common);
+    if (order != 0) return order;  // the shared prefix already decides
+    if (size_ == other.size_) return 0;
+    return size_ < other.size_ ? -1 : 1;
+  }
+
+  friend bool operator==(BasicStringRef lhs, BasicStringRef rhs) {
+    return 0 == lhs.compare(rhs);
+  }
+  friend bool operator!=(BasicStringRef lhs, BasicStringRef rhs) {
+    return !(lhs == rhs);
+  }
+  friend bool operator<(BasicStringRef lhs, BasicStringRef rhs) {
+    return 0 > lhs.compare(rhs);
+  }
+  friend bool operator<=(BasicStringRef lhs, BasicStringRef rhs) {
+    return !(rhs < lhs);
+  }
+  friend bool operator>(BasicStringRef lhs, BasicStringRef rhs) {
+    return rhs < lhs;
+  }
+  friend bool operator>=(BasicStringRef lhs, BasicStringRef rhs) {
+    return !(lhs < rhs);
+  }
+};
+
+typedef BasicStringRef<char> StringRef;
+typedef BasicStringRef<wchar_t> WStringRef;
+
+/**
+  \rst
+  A reference to a null terminated string. It can be constructed from a C
+  string or ``std::basic_string``.
+
+  You can use one of the following typedefs for common character types:
+
+  +-------------+--------------------------+
+  | Type        | Definition               |
+  +=============+==========================+
+  | CStringRef  | BasicCStringRef<char>    |
+  +-------------+--------------------------+
+  | WCStringRef | BasicCStringRef<wchar_t> |
+  +-------------+--------------------------+
+
+  This class is most useful as a parameter type to allow passing
+  different types of strings to a function, for example::
+
+    template <typename... Args>
+    std::string format(CStringRef format_str, const Args & ... args);
+
+    format("{}", 42);
+    format(std::string("{}"), 42);
+  \endrst
+ */
+template <typename Char>
+class BasicCStringRef {
+ private:
+  const Char *data_;  // the wrapped pointer; this class only stores it
+
+ public:
+  /** Wraps the C string pointer *s* as given; no length is computed. */
+  BasicCStringRef(const Char *s) : data_(s) {}
+
+  /**
+    \rst
+    Wraps the ``c_str()`` pointer of a ``std::basic_string`` object.
+    \endrst
+   */
+  template <typename Allocator>
+  BasicCStringRef(
+      const std::basic_string<Char, std::char_traits<Char>, Allocator> &s)
+  : data_(s.c_str()) {}
+
+  /** Returns the wrapped pointer unchanged. */
+  const Char *c_str() const { return data_; }
+};
+
+typedef BasicCStringRef<char> CStringRef;
+typedef BasicCStringRef<wchar_t> WCStringRef;
+
+/** A formatting error such as an invalid format string; a std::runtime_error. */
+class FormatError : public std::runtime_error {
+ public:
+  explicit FormatError(CStringRef message)  // message becomes the what() text
+  : std::runtime_error(message.c_str()) {}
+  FormatError(const FormatError &ferr) : std::runtime_error(ferr) {}
+  FMT_API ~FormatError() FMT_DTOR_NOEXCEPT;  // declared only; defined out of line
+};
+
+namespace internal {
+
+// MakeUnsigned<T>::Type is the unsigned counterpart of integer type T.
+template <typename T>
+struct MakeUnsigned { typedef T Type; };  // primary template: T maps to itself
+
+#define FMT_SPECIALIZE_MAKE_UNSIGNED(T, U) \
+  template <> \
+  struct MakeUnsigned<T> { typedef U Type; }
+
+FMT_SPECIALIZE_MAKE_UNSIGNED(char, unsigned char);  // each line maps T to U
+FMT_SPECIALIZE_MAKE_UNSIGNED(signed char, unsigned char);
+FMT_SPECIALIZE_MAKE_UNSIGNED(short, unsigned short);
+FMT_SPECIALIZE_MAKE_UNSIGNED(int, unsigned);
+FMT_SPECIALIZE_MAKE_UNSIGNED(long, unsigned long);
+FMT_SPECIALIZE_MAKE_UNSIGNED(LongLong, ULongLong);
+
+// Casts a nonnegative integer to MakeUnsigned<Int>::Type.
+template <typename Int>
+inline typename MakeUnsigned<Int>::Type to_unsigned(Int value) {
+  FMT_ASSERT(value >= 0, "negative value");  // precondition: value is not negative
+  return static_cast<typename MakeUnsigned<Int>::Type>(value);
+}
+
+// The number of characters to store in the MemoryBuffer object itself
+// to avoid dynamic memory allocation.
+enum { INLINE_BUFFER_SIZE = 500 };
+
+#if FMT_SECURE_SCL
+// Wraps ptr in an MSVC checked iterator over size elements to avoid warnings.
+template <typename T>
+inline stdext::checked_array_iterator<T*> make_ptr(T *ptr, std::size_t size) {
+  return stdext::checked_array_iterator<T*>(ptr, size);
+}
+#else
+template <typename T>
+inline T *make_ptr(T *ptr, std::size_t) { return ptr; }  // size ignored; ptr as is
+#endif
+}  // namespace internal
+
+/**
+  \rst
+  A buffer supporting a subset of ``std::vector``'s operations.
+  \endrst
+ */
+template <typename T>
+class Buffer {
+ private:
+  FMT_DISALLOW_COPY_AND_ASSIGN(Buffer);
+
+ protected:
+  T *ptr_;                // start of the element storage
+  std::size_t size_;      // number of elements currently in use
+  std::size_t capacity_;  // number of elements the storage can hold
+
+  Buffer(T *ptr = FMT_NULL, std::size_t capacity = 0)
+    : ptr_(ptr), size_(0), capacity_(capacity) {}
+
+  /**
+    \rst
+    Increases the buffer capacity to hold at least *size* elements updating
+    ``ptr_`` and ``capacity_``. Implemented by each derived class.
+    \endrst
+   */
+  virtual void grow(std::size_t size) = 0;
+
+ public:
+  virtual ~Buffer() {}
+
+  /** Returns the size of this buffer. */
+  std::size_t size() const { return size_; }
+
+  /** Returns the capacity of this buffer. */
+  std::size_t capacity() const { return capacity_; }
+
+  /**
+    Resizes the buffer. If T is a POD type new elements may not be initialized.
+   */
+  void resize(std::size_t new_size) {
+    if (new_size > capacity_)
+      grow(new_size);
+    size_ = new_size;  // no elements are constructed or destroyed here
+  }
+
+  /**
+    \rst
+    Reserves space to store at least *capacity* elements; size is unchanged.
+    \endrst
+   */
+  void reserve(std::size_t capacity) {
+    if (capacity > capacity_)
+      grow(capacity);
+  }
+
+  void clear() FMT_NOEXCEPT { size_ = 0; }  // resets the size; capacity_ is kept
+
+  void push_back(const T &value) {  // appends one element, growing when full
+    if (size_ == capacity_)
+      grow(size_ + 1);
+    ptr_[size_++] = value;
+  }
+
+  /** Appends the elements in [begin, end) to the end of the buffer. */
+  template <typename U>
+  void append(const U *begin, const U *end);
+
+  T &operator[](std::size_t index) { return ptr_[index]; }  // no bounds check
+  const T &operator[](std::size_t index) const { return ptr_[index]; }
+};
+
+template <typename T>  // Appends the elements in [begin, end) to this buffer.
+template <typename U>  // U is the source element type copied into T storage.
+void Buffer<T>::append(const U *begin, const U *end) {
+  FMT_ASSERT(end >= begin, "negative value");
+  std::size_t new_size = size_ + internal::to_unsigned(end - begin);
+  if (new_size > capacity_)
+    grow(new_size);  // grow() must leave room for at least new_size elements
+  std::uninitialized_copy(begin, end,
+                          internal::make_ptr(ptr_, capacity_) + size_);
+  size_ = new_size;
+}
+
+namespace internal {
+
+// A memory buffer for trivially copyable/constructible types with the first
+// SIZE elements stored in the object itself (in data_).
+template <typename T, std::size_t SIZE, typename Allocator = std::allocator<T> >
+class MemoryBuffer : private Allocator, public Buffer<T> {
+ private:
+  T data_[SIZE];  // inline storage; ptr_ points here until grow() allocates
+
+  // Frees the current storage unless it is the inline array.
+  void deallocate() {
+    if (this->ptr_ != data_) Allocator::deallocate(this->ptr_, this->capacity_);
+  }
+
+ protected:
+  void grow(std::size_t size) FMT_OVERRIDE;
+
+ public:
+  explicit MemoryBuffer(const Allocator &alloc = Allocator())
+      : Allocator(alloc), Buffer<T>(data_, SIZE) {}
+  ~MemoryBuffer() { deallocate(); }
+
+#if FMT_USE_RVALUE_REFERENCES
+ private:
+  // Takes over other's allocator, size, capacity and contents.
+  void move(MemoryBuffer &other) {
+    Allocator &this_alloc = *this, &other_alloc = other;
+    this_alloc = std::move(other_alloc);
+    this->size_ = other.size_;
+    this->capacity_ = other.capacity_;
+    if (other.ptr_ == other.data_) {  // inline contents have to be copied
+      this->ptr_ = data_;
+      std::uninitialized_copy(other.data_, other.data_ + this->size_,
+                              make_ptr(data_, this->capacity_));
+    } else {  // allocated storage is handed over by pointer
+      this->ptr_ = other.ptr_;
+      // Set pointer to the inline array so that delete is not called
+      // when deallocating.
+      other.ptr_ = other.data_;
+    }
+  }
+
+ public:
+  MemoryBuffer(MemoryBuffer &&other) {
+    move(other);
+  }
+
+  MemoryBuffer &operator=(MemoryBuffer &&other) {
+    assert(this != &other);  // self-move is not supported
+    deallocate();  // release this buffer's storage before taking other's
+    move(other);
+    return *this;
+  }
+#endif
+
+  // Returns a copy of the allocator associated with this buffer.
+  Allocator get_allocator() const { return *this; }
+};
+
+template <typename T, std::size_t SIZE, typename Allocator>
+void MemoryBuffer<T, SIZE, Allocator>::grow(std::size_t size) {
+  std::size_t new_capacity = this->capacity_ + this->capacity_ / 2;  // grow by half
+  if (size > new_capacity)
+      new_capacity = size;  // the request exceeds the default growth step
+  T *new_ptr = this->allocate(new_capacity, FMT_NULL);
+  // The following code doesn't throw, so the raw pointer above doesn't leak.
+  std::uninitialized_copy(this->ptr_, this->ptr_ + this->size_,
+                          make_ptr(new_ptr, new_capacity));
+  std::size_t old_capacity = this->capacity_;
+  T *old_ptr = this->ptr_;
+  this->capacity_ = new_capacity;
+  this->ptr_ = new_ptr;
+  // deallocate may throw (at least in principle), but it doesn't matter since
+  // the buffer already uses the new storage and will deallocate it in case
+  // of exception.
+  if (old_ptr != data_)  // the inline array is never passed to the allocator
+    Allocator::deallocate(old_ptr, old_capacity);
+}
+
+// A fixed-size buffer over a caller-supplied array of size elements.
+template <typename Char>
+class FixedBuffer : public fmt::Buffer<Char> {
+ public:
+  FixedBuffer(Char *array, std::size_t size) : fmt::Buffer<Char>(array, size) {}
+
+ protected:
+  FMT_API void grow(std::size_t size) FMT_OVERRIDE;  // declared only; defined out of line
+};
+
+template <typename Char>
+class BasicCharTraits {
+ public:
+#if FMT_SECURE_SCL
+  typedef stdext::checked_array_iterator<Char*> CharPtr;  // MSVC checked iterator
+#else
+  typedef Char *CharPtr;  // plain pointer otherwise
+#endif
+  static Char cast(int value) { return static_cast<Char>(value); }  // int to Char
+};
+
+template <typename Char>
+class CharTraits;
+
+template <>
+class CharTraits<char> : public BasicCharTraits<char> {
+ private:
+  // Conversion from wchar_t to char is not allowed (private, declaration only).
+  static char convert(wchar_t);
+
+ public:
+  static char convert(char value) { return value; }  // identity for narrow chars
+
+  // Formats a floating-point number (declaration only; defined out of line).
+  template <typename T>
+  FMT_API static int format_float(char *buffer, std::size_t size,
+      const char *format, unsigned width, int precision, T value);
+};
+
+#if FMT_USE_EXTERN_TEMPLATES
+extern template int CharTraits<char>::format_float<double>
+        (char *buffer, std::size_t size,
+         const char* format, unsigned width, int precision, double value);
+extern template int CharTraits<char>::format_float<long double>
+        (char *buffer, std::size_t size,
+         const char* format, unsigned width, int precision, long double value);
+#endif
+
+template <>
+class CharTraits<wchar_t> : public BasicCharTraits<wchar_t> {
+ public:
+  static wchar_t convert(char value) { return value; }  // char to wchar_t is allowed
+  static wchar_t convert(wchar_t value) { return value; }  // identity
+
+  template <typename T>  // wide counterpart of CharTraits<char>::format_float
+  FMT_API static int format_float(wchar_t *buffer, std::size_t size,
+      const wchar_t *format, unsigned width, int precision, T value);
+};
+
+#if FMT_USE_EXTERN_TEMPLATES
+extern template int CharTraits<wchar_t>::format_float<double>
+        (wchar_t *buffer, std::size_t size,
+         const wchar_t* format, unsigned width, int precision, double value);
+extern template int CharTraits<wchar_t>::format_float<long double>
+        (wchar_t *buffer, std::size_t size,
+         const wchar_t* format, unsigned width, int precision, long double value);
+#endif
+
+// Checks if a number is negative - used to avoid warnings.
+template <bool IsSigned>
+struct SignChecker {  // primary template: used whenever IsSigned is true
+  template <typename T>
+  static bool is_negative(T value) { return value < 0; }
+};
+
+template <>
+struct SignChecker<false> {  // unsigned case: no comparison is performed
+  template <typename T>
+  static bool is_negative(T) { return false; }
+};
+
+// Returns true if value is negative, false otherwise.
+// Same as (value < 0) but doesn't produce warnings if T is an unsigned type.
+template <typename T>
+inline bool is_negative(T value) {  // dispatches on numeric_limits<T>::is_signed
+  return SignChecker<std::numeric_limits<T>::is_signed>::is_negative(value);
+}
+
+// Selects uint32_t if FitsIn32Bits is true, uint64_t otherwise.
+template <bool FitsIn32Bits>
+struct TypeSelector { typedef uint32_t Type; };  // primary template: the true case
+
+template <>
+struct TypeSelector<false> { typedef uint64_t Type; };
+
+template <typename T>
+struct IntTraits {
+  // Smallest of uint32_t and uint64_t that is large enough to represent
+  // all values of T, chosen by whether numeric_limits<T>::digits <= 32.
+  typedef typename
+    TypeSelector<std::numeric_limits<T>::digits <= 32>::Type MainType;
+};
+
+FMT_API void report_unknown_type(char code, const char *type);
+
+// Static data is placed in this class template to allow header-only
+// configuration. The arrays are declared here and defined elsewhere.
+template <typename T = void>
+struct FMT_API BasicData {
+  static const uint32_t POWERS_OF_10_32[];  // indexed by count_digits(uint32_t)
+  static const uint64_t POWERS_OF_10_64[];  // indexed by count_digits(uint64_t)
+  static const char DIGITS[];  // format_decimal reads two chars at index and index + 1
+};
+
+#if FMT_USE_EXTERN_TEMPLATES
+extern template struct BasicData<void>;
+#endif
+
+typedef BasicData<> Data;
+
+#ifdef FMT_BUILTIN_CLZLL
+// Returns the number of decimal digits in n. Leading zeros are not counted
+// except for n == 0 in which case count_digits returns 1.
+inline unsigned count_digits(uint64_t n) {
+  // Based on http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
+  // and the benchmark https://github.com/localvoid/cxx-benchmark-count-digits.
+  int t = (64 - FMT_BUILTIN_CLZLL(n | 1)) * 1233 >> 12;  // n | 1 keeps the CLZ argument nonzero
+  return to_unsigned(t) - (n < Data::POWERS_OF_10_64[t]) + 1;  // table lookup corrects t
+}
+#else
+// Portable count_digits used when FMT_BUILTIN_CLZLL is not defined.
+inline unsigned count_digits(uint64_t n) {
+  unsigned digits = 1;
+  while (n >= 10000) {
+    // Division is the slow step, so strip four digits per division instead of
+    // one (idea from Alexandrescu's "Three Optimization Tips for C++").
+    n /= 10000u;
+    digits += 4;
+  }
+  // At most four digits remain; classify them without dividing again.
+  if (n >= 1000) return digits + 3;
+  if (n >= 100) return digits + 2;
+  if (n >= 10) return digits + 1;
+  return digits;
+}
+#endif
+
+#ifdef FMT_BUILTIN_CLZ
+// Optional version of count_digits for better performance on 32-bit platforms.
+inline unsigned count_digits(uint32_t n) {
+  int t = (32 - FMT_BUILTIN_CLZ(n | 1)) * 1233 >> 12;  // same scheme as the 64-bit overload
+  return to_unsigned(t) - (n < Data::POWERS_OF_10_32[t]) + 1;
+}
+#endif
+
+// A functor that doesn't add a thousands separator (the call does nothing).
+struct NoThousandsSep {
+  template <typename Char>
+  void operator()(Char *) {}  // takes the pointer by value and leaves it alone
+};
+
+// A functor that adds a thousands separator after every third call.
+class ThousandsSep {
+ private:
+  fmt::StringRef sep_;  // separator text copied in front of the output position
+
+  // Index of a decimal digit with the least significant digit having index 0.
+  unsigned digit_index_;
+
+ public:
+  explicit ThousandsSep(fmt::StringRef sep) : sep_(sep), digit_index_(0) {}
+
+  template <typename Char>
+  void operator()(Char *&buffer) {  // buffer is taken by reference and moved back
+    if (++digit_index_ % 3 != 0)
+      return;  // a separator is written only on every third call
+    buffer -= sep_.size();  // step back to make room for the separator
+    std::uninitialized_copy(sep_.data(), sep_.data() + sep_.size(),
+                            internal::make_ptr(buffer, sep_.size()));
+  }
+};
+
+// Formats a decimal unsigned integer value writing into buffer.
+// thousands_sep is a functor that is called after writing each char to
+// add a thousands separator if necessary. Digits are written back to front.
+template <typename UInt, typename Char, typename ThousandsSep>
+inline void format_decimal(Char *buffer, UInt value, unsigned num_digits,
+                           ThousandsSep thousands_sep) {
+  buffer += num_digits;  // start one past the last digit and fill backwards
+  while (value >= 100) {
+    // Integer division is slow so do it for a group of two digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    unsigned index = static_cast<unsigned>((value % 100) * 2);
+    value /= 100;
+    *--buffer = Data::DIGITS[index + 1];  // low digit of the pair
+    thousands_sep(buffer);
+    *--buffer = Data::DIGITS[index];  // high digit of the pair
+    thousands_sep(buffer);
+  }
+  if (value < 10) {  // a single leading digit remains
+    *--buffer = static_cast<char>('0' + value);
+    return;
+  }
+  unsigned index = static_cast<unsigned>(value * 2);  // two leading digits remain
+  *--buffer = Data::DIGITS[index + 1];
+  thousands_sep(buffer);
+  *--buffer = Data::DIGITS[index];
+}
+
+template <typename UInt, typename Char>  // overload without a thousands separator
+inline void format_decimal(Char *buffer, UInt value, unsigned num_digits) {
+  format_decimal(buffer, value, num_digits, NoThousandsSep());  // no-op functor
+  return;
+}
+
+#ifndef _WIN32
+# define FMT_USE_WINDOWS_H 0
+#elif !defined(FMT_USE_WINDOWS_H)
+# define FMT_USE_WINDOWS_H 1
+#endif
+
+// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
+// All the functionality that relies on it will be disabled too.
+#if FMT_USE_WINDOWS_H
+// A converter from UTF-8 to UTF-16.
+// It is only provided for Windows since other systems support UTF-8 natively.
+class UTF8ToUTF16 {
+ private:
+  MemoryBuffer<wchar_t, INLINE_BUFFER_SIZE> buffer_;  // holds the converted text
+
+ public:
+  FMT_API explicit UTF8ToUTF16(StringRef s);  // declared only; defined out of line
+  operator WStringRef() const { return WStringRef(&buffer_[0], size()); }
+  size_t size() const { return buffer_.size() - 1; }  // presumably drops a terminator -- TODO confirm
+  const wchar_t *c_str() const { return &buffer_[0]; }
+  std::wstring str() const { return std::wstring(&buffer_[0], size()); }
+};
+
+// A converter from UTF-16 to UTF-8.
+// It is only provided for Windows since other systems support UTF-8 natively.
+class UTF16ToUTF8 {
+ private:
+  MemoryBuffer<char, INLINE_BUFFER_SIZE> buffer_;  // holds the converted text
+
+ public:
+  UTF16ToUTF8() {}  // NOTE(review): buffer_ is empty here, so size() wraps until filled
+  FMT_API explicit UTF16ToUTF8(WStringRef s);
+  operator StringRef() const { return StringRef(&buffer_[0], size()); }
+  size_t size() const { return buffer_.size() - 1; }  // presumably drops a terminator -- TODO confirm
+  const char *c_str() const { return &buffer_[0]; }
+  std::string str() const { return std::string(&buffer_[0], size()); }
+
+  // Performs conversion returning a system error code instead of
+  // throwing exception on conversion error. This method may still throw
+  // in case of memory allocation error.
+  FMT_API int convert(WStringRef s);
+};
+
+FMT_API void format_windows_error(fmt::Writer &out, int error_code,
+                                  fmt::StringRef message) FMT_NOEXCEPT;
+#endif
+
+// A formatting argument value: an untagged union plus the Type tag values.
+struct Value {
+  template <typename Char>
+  struct StringValue {  // pointer-and-length pair for string arguments
+    const Char *value;
+    std::size_t size;
+  };
+
+  typedef void (*FormatFunc)(  // callback stored in CustomValue::format
+      void *formatter, const void *arg, void *format_str_ptr);
+
+  struct CustomValue {
+    const void *value;  // type-erased pointer to the argument
+    FormatFunc format;  // function stored alongside value
+  };
+
+  union {  // which member is live is recorded outside this struct (see Arg::type)
+    int int_value;
+    unsigned uint_value;
+    LongLong long_long_value;
+    ULongLong ulong_long_value;
+    double double_value;
+    long double long_double_value;
+    const void *pointer;
+    StringValue<char> string;
+    StringValue<signed char> sstring;
+    StringValue<unsigned char> ustring;
+    StringValue<wchar_t> wstring;
+    CustomValue custom;
+  };
+
+  enum Type {
+    NONE, NAMED_ARG,
+    // Integer types should go first,
+    INT, UINT, LONG_LONG, ULONG_LONG, BOOL, CHAR, LAST_INTEGER_TYPE = CHAR,
+    // followed by floating-point types.
+    DOUBLE, LONG_DOUBLE, LAST_NUMERIC_TYPE = LONG_DOUBLE,
+    CSTRING, STRING, WSTRING, POINTER, CUSTOM  // 16 distinct values in total
+  };
+};
+
+// A formatting argument. It is a trivially copyable/constructible type to
+// allow storage in internal::MemoryBuffer.
+struct Arg : Value {
+  Type type;  // tag recording which Value union member is in use
+};
+
+template <typename Char>
+struct NamedArg;
+template <typename Char, typename T>
+struct NamedArgWithType;
+
+template <typename T = void>
+struct Null {};
+
+// A helper class template to enable or disable overloads taking wide
+// characters and strings in MakeValue.
+template <typename T, typename Char>
+struct WCharHelper {  // primary template: used when Char is not wchar_t
+  typedef Null<T> Supported;  // placeholder type, so the overload never matches T
+  typedef T Unsupported;
+};
+
+template <typename T>
+struct WCharHelper<T, wchar_t> {  // wide formatting: the two roles are swapped
+  typedef T Supported;
+  typedef Null<T> Unsupported;
+};
+
+typedef char Yes[1];
+typedef char No[2];
+
+template <typename T>
+T &get();
+
+// These are non-members to work around an overload resolution bug in bcc32.
+Yes &convert(fmt::ULongLong);
+No &convert(...);
+
+template<typename T, bool ENABLE_CONVERSION>
+struct ConvertToIntImpl {
+  enum { value = ENABLE_CONVERSION };  // passes the flag through unchanged
+};
+
+template<typename T, bool ENABLE_CONVERSION>
+struct ConvertToIntImpl2 {
+  enum { value = false };  // primary template: conversion disabled
+};
+
+template<typename T>
+struct ConvertToIntImpl2<T, true> {
+  enum {
+    // Don't convert numeric types.
+    value = ConvertToIntImpl<T, !std::numeric_limits<T>::is_specialized>::value
+  };
+};
+
+template<typename T>
+struct ConvertToInt {
+  enum {  // true when overload resolution picks convert(fmt::ULongLong) for a T
+    enable_conversion = sizeof(fmt::internal::convert(get<T>())) == sizeof(Yes)
+  };
+  enum { value = ConvertToIntImpl2<T, enable_conversion>::value };
+};
+
+#define FMT_DISABLE_CONVERSION_TO_INT(Type) \
+  template <> \
+  struct ConvertToInt<Type> {  enum { value = 0 }; }
+
+// Silence warnings about converting float to int.
+FMT_DISABLE_CONVERSION_TO_INT(float);
+FMT_DISABLE_CONVERSION_TO_INT(double);
+FMT_DISABLE_CONVERSION_TO_INT(long double);
+
+template<bool B, class T = void>
+struct EnableIf {};  // primary template: no nested type
+
+template<class T>
+struct EnableIf<true, T> { typedef T type; };  // type exists only when B is true
+
+template<bool B, class T, class F>
+struct Conditional { typedef T type; };  // primary template selects T
+
+template<class T, class F>
+struct Conditional<false, T, F> { typedef F type; };  // false selects F
+
+// For bcc32 which doesn't understand ! in template arguments.
+template <bool>
+struct Not { enum { value = 0 }; };  // primary template covers true
+
+template <>
+struct Not<false> { enum { value = 1 }; };
+
+template <typename T>
+struct FalseType { enum { value = 0 }; };  // always 0, but dependent on T
+
+template <typename T, T> struct LConvCheck {  // second parameter is a value of type T
+  LConvCheck(int) {}  // allows the "= 0" default argument used by thousands_sep
+};
+
+// Returns the thousands separator for the current locale.
+// We check if ``lconv`` contains ``thousands_sep`` because on Android
+// ``lconv`` is stubbed as an empty struct.
+template <typename LConv>
+inline StringRef thousands_sep(
+    LConv *lc, LConvCheck<char *LConv::*, &LConv::thousands_sep> = 0) {
+  return lc->thousands_sep;  // this overload needs LConv::thousands_sep to exist
+}
+
+inline fmt::StringRef thousands_sep(...) { return ""; }  // fallback: empty separator
+
+#define FMT_CONCAT(a, b) a##b
+
+#if FMT_GCC_VERSION >= 303
+# define FMT_UNUSED __attribute__((unused))
+#else
+# define FMT_UNUSED
+#endif
+
+#ifndef FMT_USE_STATIC_ASSERT
+# define FMT_USE_STATIC_ASSERT 0
+#endif
+
+#if FMT_USE_STATIC_ASSERT || FMT_HAS_FEATURE(cxx_static_assert) || \
+  (FMT_GCC_VERSION >= 403 && FMT_HAS_GXX_CXX11) || _MSC_VER >= 1600
+# define FMT_STATIC_ASSERT(cond, message) static_assert(cond, message)
+#else
+# define FMT_CONCAT_(a, b) FMT_CONCAT(a, b)
+# define FMT_STATIC_ASSERT(cond, message) \
+  typedef int FMT_CONCAT_(Assert, __LINE__)[(cond) ? 1 : -1] FMT_UNUSED
+#endif
+
+template <typename Formatter, typename Char, typename T>  // catch-all overload
+void format_arg(Formatter &, const Char *, const T &) {
+  FMT_STATIC_ASSERT(FalseType<T>::value,  // always false, but dependent on T
+                    "Cannot format argument. To enable the use of ostream "
+                    "operator<< include fmt/ostream.h. Otherwise provide "
+                    "an overload of format_arg.");
+}
+
+// Makes an Arg object from any type; type() returns the matching Arg::Type tag.
+template <typename Formatter>
+class MakeValue : public Arg {
+ public:
+  typedef typename Formatter::Char Char;
+
+ private:
+  // The following two methods are private to disallow formatting of
+  // arbitrary pointers. If you want to output a pointer cast it to
+  // "void *" or "const void *". In particular, this forbids formatting
+  // of "[const] volatile char *" which is printed as bool by iostreams.
+  // Do not implement!
+  template <typename T>
+  MakeValue(const T *value);
+  template <typename T>
+  MakeValue(T *value);
+
+  // The following methods are private to disallow formatting of wide
+  // characters and strings into narrow strings as in
+  //   fmt::format("{}", L"test");
+  // To fix this, use a wide format string: fmt::format(L"{}", L"test").
+#if !FMT_MSC_VER || defined(_NATIVE_WCHAR_T_DEFINED)
+  MakeValue(typename WCharHelper<wchar_t, Char>::Unsupported);
+#endif
+  MakeValue(typename WCharHelper<wchar_t *, Char>::Unsupported);
+  MakeValue(typename WCharHelper<const wchar_t *, Char>::Unsupported);
+  MakeValue(typename WCharHelper<const std::wstring &, Char>::Unsupported);
+  MakeValue(typename WCharHelper<WStringRef, Char>::Unsupported);
+
+  void set_string(StringRef str) {  // stores pointer and length in `string`
+    string.value = str.data();
+    string.size = str.size();
+  }
+
+  void set_string(WStringRef str) {  // stores pointer and length in `wstring`
+    wstring.value = str.data();
+    wstring.size = str.size();
+  }
+
+  // Formats an argument of a custom type, such as a user-defined class.
+  template <typename T>
+  static void format_custom_arg(
+      void *formatter, const void *arg, void *format_str_ptr) {
+    format_arg(*static_cast<Formatter*>(formatter),  // restore the erased types
+               *static_cast<const Char**>(format_str_ptr),
+               *static_cast<const T*>(arg));
+  }
+
+ public:
+  MakeValue() {}
+
+#define FMT_MAKE_VALUE_(Type, field, TYPE, rhs) \
+  MakeValue(Type value) { field = rhs; } \
+  static uint64_t type(Type) { return Arg::TYPE; }
+
+#define FMT_MAKE_VALUE(Type, field, TYPE) \
+  FMT_MAKE_VALUE_(Type, field, TYPE, value)
+
+  FMT_MAKE_VALUE(bool, int_value, BOOL)  // stored in int_value, tagged BOOL
+  FMT_MAKE_VALUE(short, int_value, INT)
+  FMT_MAKE_VALUE(unsigned short, uint_value, UINT)
+  FMT_MAKE_VALUE(int, int_value, INT)
+  FMT_MAKE_VALUE(unsigned, uint_value, UINT)
+
+  MakeValue(long value) {
+    // To minimize the number of types we need to deal with, long is
+    // translated either to int or to long long depending on its size.
+    if (const_check(sizeof(long) == sizeof(int)))
+      int_value = static_cast<int>(value);
+    else
+      long_long_value = value;
+  }
+  static uint64_t type(long) {  // tag follows the same size test as the constructor
+    return sizeof(long) == sizeof(int) ? Arg::INT : Arg::LONG_LONG;
+  }
+
+  MakeValue(unsigned long value) {  // unsigned counterpart of the long overload
+    if (const_check(sizeof(unsigned long) == sizeof(unsigned)))
+      uint_value = static_cast<unsigned>(value);
+    else
+      ulong_long_value = value;
+  }
+  static uint64_t type(unsigned long) {
+    return sizeof(unsigned long) == sizeof(unsigned) ?
+          Arg::UINT : Arg::ULONG_LONG;
+  }
+
+  FMT_MAKE_VALUE(LongLong, long_long_value, LONG_LONG)
+  FMT_MAKE_VALUE(ULongLong, ulong_long_value, ULONG_LONG)
+  FMT_MAKE_VALUE(float, double_value, DOUBLE)  // float is stored as a double
+  FMT_MAKE_VALUE(double, double_value, DOUBLE)
+  FMT_MAKE_VALUE(long double, long_double_value, LONG_DOUBLE)
+  FMT_MAKE_VALUE(signed char, int_value, INT)  // tagged INT, unlike plain char
+  FMT_MAKE_VALUE(unsigned char, uint_value, UINT)
+  FMT_MAKE_VALUE(char, int_value, CHAR)
+
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+  MakeValue(typename WCharHelper<wchar_t, Char>::Supported value) {
+    int_value = value;
+  }
+  static uint64_t type(wchar_t) { return Arg::CHAR; }
+#endif
+
+#define FMT_MAKE_STR_VALUE(Type, TYPE) \
+  MakeValue(Type value) { set_string(value); } \
+  static uint64_t type(Type) { return Arg::TYPE; }
+
+  FMT_MAKE_VALUE(char *, string.value, CSTRING)  // only the pointer is stored
+  FMT_MAKE_VALUE(const char *, string.value, CSTRING)
+  FMT_MAKE_VALUE(signed char *, sstring.value, CSTRING)
+  FMT_MAKE_VALUE(const signed char *, sstring.value, CSTRING)
+  FMT_MAKE_VALUE(unsigned char *, ustring.value, CSTRING)
+  FMT_MAKE_VALUE(const unsigned char *, ustring.value, CSTRING)
+  FMT_MAKE_STR_VALUE(const std::string &, STRING)  // pointer and size are stored
+  FMT_MAKE_STR_VALUE(StringRef, STRING)
+  FMT_MAKE_VALUE_(CStringRef, string.value, CSTRING, value.c_str())
+
+#define FMT_MAKE_WSTR_VALUE(Type, TYPE) \
+  MakeValue(typename WCharHelper<Type, Char>::Supported value) { \
+    set_string(value); \
+  } \
+  static uint64_t type(Type) { return Arg::TYPE; }
+
+  FMT_MAKE_WSTR_VALUE(wchar_t *, WSTRING)
+  FMT_MAKE_WSTR_VALUE(const wchar_t *, WSTRING)
+  FMT_MAKE_WSTR_VALUE(const std::wstring &, WSTRING)
+  FMT_MAKE_WSTR_VALUE(WStringRef, WSTRING)
+
+  FMT_MAKE_VALUE(void *, pointer, POINTER)
+  FMT_MAKE_VALUE(const void *, pointer, POINTER)
+
+  template <typename T>  // custom types: enabled when ConvertToInt<T>::value is 0
+  MakeValue(const T &value,
+            typename EnableIf<Not<
+              ConvertToInt<T>::value>::value, int>::type = 0) {
+    custom.value = &value;  // stores the address; value is not copied
+    custom.format = &format_custom_arg<T>;
+  }
+
+  template <typename T>
+  static typename EnableIf<Not<ConvertToInt<T>::value>::value, uint64_t>::type
+      type(const T &) {
+    return Arg::CUSTOM;
+  }
+
+  // Additional template param `Char_` is needed here because make_type always
+  // uses char.
+  template <typename Char_>
+  MakeValue(const NamedArg<Char_> &value) { pointer = &value; }
+  template <typename Char_, typename T>
+  MakeValue(const NamedArgWithType<Char_, T> &value) { pointer = &value; }
+
+  template <typename Char_>
+  static uint64_t type(const NamedArg<Char_> &) { return Arg::NAMED_ARG; }
+  template <typename Char_, typename T>
+  static uint64_t type(const NamedArgWithType<Char_, T> &) { return Arg::NAMED_ARG; }
+};
+
+template <typename Formatter>  // Builds a complete Arg: value plus its type tag.
+class MakeArg : public Arg {
+public:
+  MakeArg() {
+    type = Arg::NONE;  // a default-constructed MakeArg represents "no argument"
+  }
+
+  template <typename T>
+  MakeArg(const T &value)
+  : Arg(MakeValue<Formatter>(value)) {  // value part comes from MakeValue
+    type = static_cast<Arg::Type>(MakeValue<Formatter>::type(value));
+  }
+};
+
+template <typename Char>  // An Arg paired with the name it is referred to by.
+struct NamedArg : Arg {
+  BasicStringRef<Char> name;  // a reference; the name's characters are not copied
+
+  template <typename T>
+  NamedArg(BasicStringRef<Char> argname, const T &value)
+  : Arg(MakeArg< BasicFormatter<Char> >(value)), name(argname) {}
+};
+
+template <typename Char, typename T>  // NamedArg that keeps T in its own type
+struct NamedArgWithType : NamedArg<Char> {
+  NamedArgWithType(BasicStringRef<Char> argname, const T &value)
+  : NamedArg<Char>(argname, value) {}
+};
+
+class RuntimeError : public std::runtime_error {  // constructible only by subclasses
+ protected:
+  RuntimeError() : std::runtime_error("") {}  // starts with an empty message
+  RuntimeError(const RuntimeError &rerr) : std::runtime_error(rerr) {}
+  FMT_API ~RuntimeError() FMT_DTOR_NOEXCEPT;  // declared only; defined out of line
+};
+
+template <typename Char>
+class ArgMap;
+}  // namespace internal
+
+/** An argument list: packed 4-bit type tags plus a pointer to the arguments. */
+class ArgList {
+ private:
+  // To reduce compiled code size per formatting function call, types of first
+  // MAX_PACKED_ARGS arguments are passed in the types_ field.
+  uint64_t types_;
+  union {
+    // If the number of arguments is less than MAX_PACKED_ARGS, the argument
+    // values are stored in values_, otherwise they are stored in args_.
+    // This is done to reduce compiled code size as storing larger objects
+    // may require more code (at least on x86-64) even if the same amount of
+    // data is actually copied to stack. It saves ~10% on the bloat test.
+    const internal::Value *values_;
+    const internal::Arg *args_;
+  };
+
+  internal::Arg::Type type(unsigned index) const {  // tag of this list's argument
+    return type(types_, index);
+  }
+
+  template <typename Char>
+  friend class internal::ArgMap;
+
+ public:
+  // Maximum number of arguments with packed types.
+  enum { MAX_PACKED_ARGS = 16 };
+
+  ArgList() : types_(0) {}  // empty list: every packed tag reads as NONE
+
+  ArgList(ULongLong types, const internal::Value *values)
+  : types_(types), values_(values) {}
+  ArgList(ULongLong types, const internal::Arg *args)
+  : types_(types), args_(args) {}
+
+  uint64_t types() const { return types_; }
+
+  /** Returns the argument at specified index, or a NONE argument past the end. */
+  internal::Arg operator[](unsigned index) const {
+    using internal::Arg;
+    Arg arg;
+    bool use_values = type(MAX_PACKED_ARGS - 1) == Arg::NONE;  // last slot unused
+    if (index < MAX_PACKED_ARGS) {
+      Arg::Type arg_type = type(index);
+      internal::Value &val = arg;  // assign only the Value part of arg
+      if (arg_type != Arg::NONE)
+        val = use_values ? values_[index] : args_[index];
+      arg.type = arg_type;
+      return arg;
+    }
+    if (use_values) {
+      // The index is greater than the number of arguments that can be stored
+      // in values, so return a "none" argument.
+      arg.type = Arg::NONE;
+      return arg;
+    }
+    for (unsigned i = MAX_PACKED_ARGS; i <= index; ++i) {  // scan for the NONE end marker
+      if (args_[i].type == Arg::NONE)
+        return args_[i];
+    }
+    return args_[index];
+  }
+
+  static internal::Arg::Type type(uint64_t types, unsigned index) {
+    unsigned shift = index * 4;  // each tag occupies four bits of types
+    uint64_t mask = 0xf;
+    return static_cast<internal::Arg::Type>(
+          (types & (mask << shift)) >> shift);
+  }
+};
+
+#define FMT_DISPATCH(call) static_cast<Impl*>(this)->call  /* Invokes call on *this viewed as the Impl subclass. */
+
+/**
+  \rst
+  An argument visitor based on the `curiously recurring template pattern
+  <http://en.wikipedia.org/wiki/Curiously_recurring_template_pattern>`_.
+
+  To use `~fmt::ArgVisitor` define a subclass that implements some or all of the
+  visit methods with the same signatures as the methods in `~fmt::ArgVisitor`,
+  for example, `~fmt::ArgVisitor::visit_int()`.
+  Pass the subclass as the *Impl* template parameter. Then calling
+  `~fmt::ArgVisitor::visit` for some argument will dispatch to a visit method
+  specific to the argument type. For example, if the argument type is
+  ``double`` then the `~fmt::ArgVisitor::visit_double()` method of a subclass
+  will be called. If the subclass doesn't contain a method with this signature,
+  then a corresponding method of `~fmt::ArgVisitor` will be called.
+
+  **Example**::
+
+    class MyArgVisitor : public fmt::ArgVisitor<MyArgVisitor, void> {
+     public:
+      void visit_int(int value) { fmt::print("{}", value); }
+      void visit_double(double value) { fmt::print("{}", value); }
+    };
+  \endrst
+ */
+template <typename Impl, typename Result>
+class ArgVisitor {
+ private:
+  typedef internal::Arg Arg;
+
+ public:
+  void report_unhandled_arg() {}  // Hook called for unhandled arguments; does nothing by default.
+
+  Result visit_unhandled_arg() {
+    FMT_DISPATCH(report_unhandled_arg());
+    return Result();  // Unhandled arguments yield a value-initialized Result.
+  }
+
+  /** Visits an ``int`` argument. **/
+  Result visit_int(int value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+
+  /** Visits a ``long long`` argument. **/
+  Result visit_long_long(LongLong value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+
+  /** Visits an ``unsigned`` argument. **/
+  Result visit_uint(unsigned value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+
+  /** Visits an ``unsigned long long`` argument. **/
+  Result visit_ulong_long(ULongLong value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+
+  /** Visits a ``bool`` argument. **/
+  Result visit_bool(bool value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+
+  /** Visits a ``char`` or ``wchar_t`` argument (passed as ``int``). **/
+  Result visit_char(int value) {
+    return FMT_DISPATCH(visit_any_int(value));
+  }
+
+  /** Visits an argument of any integral type. **/
+  template <typename T>
+  Result visit_any_int(T) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  /** Visits a ``double`` argument. **/
+  Result visit_double(double value) {
+    return FMT_DISPATCH(visit_any_double(value));
+  }
+
+  /** Visits a ``long double`` argument. **/
+  Result visit_long_double(long double value) {
+    return FMT_DISPATCH(visit_any_double(value));
+  }
+
+  /** Visits a ``double`` or ``long double`` argument. **/
+  template <typename T>
+  Result visit_any_double(T) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  /** Visits a null-terminated C string (``const char *``) argument. **/
+  Result visit_cstring(const char *) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  /** Visits a string argument. **/
+  Result visit_string(Arg::StringValue<char>) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  /** Visits a wide string argument. **/
+  Result visit_wstring(Arg::StringValue<wchar_t>) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  /** Visits a pointer argument. **/
+  Result visit_pointer(const void *) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  /** Visits an argument of a custom (user-defined) type. **/
+  Result visit_custom(Arg::CustomValue) {
+    return FMT_DISPATCH(visit_unhandled_arg());
+  }
+
+  /**
+    \rst
+    Visits an argument dispatching to the appropriate visit method based on
+    the argument type. For example, if the argument type is ``double`` then
+    the `~fmt::ArgVisitor::visit_double()` method of the *Impl* class will be
+    called.
+    \endrst
+   */
+  Result visit(const Arg &arg) {
+    switch (arg.type) {
+    case Arg::NONE:
+    case Arg::NAMED_ARG:  // Neither tag can be visited: assert, then return the default below.
+      FMT_ASSERT(false, "invalid argument type");
+      break;
+    case Arg::INT:
+      return FMT_DISPATCH(visit_int(arg.int_value));
+    case Arg::UINT:
+      return FMT_DISPATCH(visit_uint(arg.uint_value));
+    case Arg::LONG_LONG:
+      return FMT_DISPATCH(visit_long_long(arg.long_long_value));
+    case Arg::ULONG_LONG:
+      return FMT_DISPATCH(visit_ulong_long(arg.ulong_long_value));
+    case Arg::BOOL:
+      return FMT_DISPATCH(visit_bool(arg.int_value != 0));  // Booleans are read from int_value.
+    case Arg::CHAR:
+      return FMT_DISPATCH(visit_char(arg.int_value));  // Characters are read from int_value too.
+    case Arg::DOUBLE:
+      return FMT_DISPATCH(visit_double(arg.double_value));
+    case Arg::LONG_DOUBLE:
+      return FMT_DISPATCH(visit_long_double(arg.long_double_value));
+    case Arg::CSTRING:
+      return FMT_DISPATCH(visit_cstring(arg.string.value));
+    case Arg::STRING:
+      return FMT_DISPATCH(visit_string(arg.string));
+    case Arg::WSTRING:
+      return FMT_DISPATCH(visit_wstring(arg.wstring));
+    case Arg::POINTER:
+      return FMT_DISPATCH(visit_pointer(arg.pointer));
+    case Arg::CUSTOM:
+      return FMT_DISPATCH(visit_custom(arg.custom));
+    }
+    return Result();  // Reached for the invalid tags above or a tag not listed in the switch.
+  }
+};
+
+enum Alignment {  // Values are consecutive, starting at zero.
+  ALIGN_DEFAULT = 0, ALIGN_LEFT = 1, ALIGN_RIGHT = 2, ALIGN_CENTER = 3, ALIGN_NUMERIC = 4
+};
+
+// Format flags; each one occupies its own bit so they can be OR-ed together.
+enum {
+  SIGN_FLAG = 0x01, PLUS_FLAG = 0x02, MINUS_FLAG = 0x04, HASH_FLAG = 0x08,
+  CHAR_FLAG = 0x10  // Argument has char type - used in error reporting.
+};
+
+// An empty format specifier: carries no state and is the base of TypeSpec.
+struct EmptySpec {};
+
+// A specifier that fixes only the type character, as the constant TYPE.
+template <char TYPE>
+struct TypeSpec : EmptySpec {
+  char type() const { return TYPE; }
+  char type_prefix() const { return TYPE; }
+  unsigned width() const { return 0; }
+  char fill() const { return ' '; }
+  Alignment align() const { return ALIGN_DEFAULT; }
+  int precision() const { return -1; }  // Same value FormatSpec starts with.
+  bool flag(unsigned) const { return false; }  // No flag is ever reported as set.
+};
+
+// A width specifier: a field width together with the fill character.
+struct WidthSpec {
+  unsigned width_;
+  // Fill is always wchar_t and cast to char if necessary to avoid having
+  // two specializations of WidthSpec and its subclasses.
+  wchar_t fill_;
+
+  WidthSpec(unsigned width, wchar_t fill)
+  : width_(width), fill_(fill) {}
+  wchar_t fill() const { return fill_; }
+  unsigned width() const { return width_; }
+};
+
+// Adds an alignment to the width and fill held by WidthSpec.
+struct AlignSpec : WidthSpec {
+  Alignment align_;
+
+  AlignSpec(unsigned width, wchar_t fill, Alignment align = ALIGN_DEFAULT)
+  : WidthSpec(width, fill), align_(align) {}
+
+  // Always -1 here; FormatSpec redeclares this to return a stored value.
+  int precision() const { return -1; }
+  Alignment align() const { return align_; }
+};
+
+// An alignment specifier whose type character is the constant TYPE.
+template <char TYPE>
+struct AlignTypeSpec : AlignSpec {
+  AlignTypeSpec(unsigned width, wchar_t fill)
+  : AlignSpec(width, fill) {}
+  char type() const { return TYPE; }
+  char type_prefix() const { return TYPE; }
+  bool flag(unsigned) const { return false; }  // No flag is ever reported as set.
+};
+
+// A full format specifier: width, fill, alignment, flags, precision and type.
+struct FormatSpec : AlignSpec {
+  unsigned flags_;
+  int precision_;
+  char type_;
+
+  // Starts with no flags set and a precision of -1.
+  FormatSpec(unsigned width = 0, char type = 0, wchar_t fill = ' ')
+  : AlignSpec(width, fill), flags_(0), precision_(-1), type_(type) {}
+
+  char type() const { return type_; }
+  char type_prefix() const { return type_; }
+  int precision() const { return precision_; }
+  bool flag(unsigned f) const { return 0 != (flags_ & f); }
+};
+
+// Pairs an integer value of type T with the specifier SpecT used to format it.
+template <typename T, typename SpecT = TypeSpec<0>, typename Char = char>
+class IntFormatSpec : public SpecT {
+ public:
+  IntFormatSpec(T val, const SpecT &spec = SpecT())
+  : SpecT(spec), value_(val) {}
+
+  T value() const { return value_; }
+
+ private:
+  T value_;
+};
+
+// Pairs a string with the width, fill and alignment used to format it.
+template <typename Char>
+class StrFormatSpec : public AlignSpec {
+ public:
+  template <typename FillChar>
+  StrFormatSpec(const Char *str, unsigned width, FillChar fill)
+  : AlignSpec(width, fill), str_(str) {
+    internal::CharTraits<Char>::convert(FillChar());  // NOTE(review): result unused; presumably a compile-time FillChar check.
+  }
+
+  const Char *str() const { return str_; }
+
+ private:
+  const Char *str_;
+};
+
+/**
+  Returns an integer format specifier to format the value in base 2.
+ */
+IntFormatSpec<int, TypeSpec<'b'> > bin(int value);
+
+/**
+  Returns an integer format specifier to format the value in base 8.
+ */
+IntFormatSpec<int, TypeSpec<'o'> > oct(int value);
+
+/**
+  Returns an integer format specifier to format the value in base 16 using
+  lower-case letters for the digits above 9.
+ */
+IntFormatSpec<int, TypeSpec<'x'> > hex(int value);
+
+/**
+  Returns an integer format specifier to format the value in base 16 using
+  upper-case letters for the digits above 9.
+ */
+IntFormatSpec<int, TypeSpec<'X'> > hexu(int value);
+
+/**
+  \rst
+  Returns an integer format specifier to pad the formatted argument with the
+  fill character to the specified width using the default (right) numeric
+  alignment.
+
+  **Example**::
+
+    MemoryWriter out;
+    out << pad(hex(0xcafe), 8, '0');
+    // out.str() == "0000cafe"
+
+  \endrst
+ */
+template <char TYPE_CODE, typename Char>
+IntFormatSpec<int, AlignTypeSpec<TYPE_CODE>, Char> pad(
+    int value, unsigned width, Char fill = ' ');
+
+#define FMT_DEFINE_INT_FORMATTERS(TYPE) /* bin, oct, hex, hexu and pad overloads for TYPE */ \
+inline IntFormatSpec<TYPE, TypeSpec<'b'> > bin(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'b'> >(value, TypeSpec<'b'>()); \
+} \
+ \
+inline IntFormatSpec<TYPE, TypeSpec<'o'> > oct(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'o'> >(value, TypeSpec<'o'>()); \
+} \
+ \
+inline IntFormatSpec<TYPE, TypeSpec<'x'> > hex(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'x'> >(value, TypeSpec<'x'>()); \
+} \
+ \
+inline IntFormatSpec<TYPE, TypeSpec<'X'> > hexu(TYPE value) { \
+  return IntFormatSpec<TYPE, TypeSpec<'X'> >(value, TypeSpec<'X'>()); \
+} \
+ \
+template <char TYPE_CODE> \
+inline IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE> > pad( \
+    IntFormatSpec<TYPE, TypeSpec<TYPE_CODE> > f, unsigned width) { \
+  return IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE> >( \
+      f.value(), AlignTypeSpec<TYPE_CODE>(width, ' ')); \
+} \
+ \
+/* For compatibility with older compilers we provide two overloads for pad, */ \
+/* one that takes a fill character and one that doesn't. In the future this */ \
+/* can be replaced with one overload making the template argument Char      */ \
+/* default to char (C++11). */ \
+template <char TYPE_CODE, typename Char> \
+inline IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE>, Char> pad( \
+    IntFormatSpec<TYPE, TypeSpec<TYPE_CODE>, Char> f, \
+    unsigned width, Char fill) { \
+  return IntFormatSpec<TYPE, AlignTypeSpec<TYPE_CODE>, Char>( \
+      f.value(), AlignTypeSpec<TYPE_CODE>(width, fill)); \
+} \
+ \
+inline IntFormatSpec<TYPE, AlignTypeSpec<0> > pad( \
+    TYPE value, unsigned width) { \
+  return IntFormatSpec<TYPE, AlignTypeSpec<0> >( \
+      value, AlignTypeSpec<0>(width, ' ')); \
+} \
+ \
+template <typename Char> \
+inline IntFormatSpec<TYPE, AlignTypeSpec<0>, Char> pad( \
+   TYPE value, unsigned width, Char fill) { \
+ return IntFormatSpec<TYPE, AlignTypeSpec<0>, Char>( \
+     value, AlignTypeSpec<0>(width, fill)); \
+}
+
+FMT_DEFINE_INT_FORMATTERS(int)  // One set of helpers is generated per integer type listed here.
+FMT_DEFINE_INT_FORMATTERS(long)
+FMT_DEFINE_INT_FORMATTERS(unsigned)
+FMT_DEFINE_INT_FORMATTERS(unsigned long)
+FMT_DEFINE_INT_FORMATTERS(LongLong)
+FMT_DEFINE_INT_FORMATTERS(ULongLong)
+
+/**
+  \rst
+  Returns a string formatter that pads the formatted argument with the fill
+  character to the specified width using the default (left) string alignment.
+
+  **Example**::
+
+    std::string s = str(MemoryWriter() << pad("abc", 8));
+    // s == "abc     "
+
+  \endrst
+ */
+template <typename Char>
+inline StrFormatSpec<Char> pad(const Char *str, unsigned width, Char fill = ' ') {
+  typedef StrFormatSpec<Char> Padded;
+  return Padded(str, width, fill);
+}
+
+inline StrFormatSpec<wchar_t> pad(const wchar_t *str, unsigned width, char fill = ' ') {
+  typedef StrFormatSpec<wchar_t> WidePadded;  // Wide string, narrow fill character.
+  return WidePadded(str, width, fill);
+}
+
+namespace internal {
+
+template <typename Char>
+class ArgMap {
+ private:
+  typedef std::vector<
+    std::pair<fmt::BasicStringRef<Char>, internal::Arg> > MapType;
+  typedef typename MapType::value_type Pair;
+
+  MapType map_;  // (name, argument) pairs; not sorted.
+
+ public:
+  FMT_API void init(const ArgList &args);
+
+  // Returns the first argument stored under name, or FMT_NULL if there is none.
+  const internal::Arg *find(const fmt::BasicStringRef<Char> &name) const {
+    typedef typename MapType::const_iterator Iter;
+    Iter pos = map_.begin();
+    const Iter stop = map_.end();
+    while (pos != stop && !(pos->first == name)) ++pos;
+    if (pos == stop) return FMT_NULL;
+    return &pos->second;
+  }
+};
+
+template <typename Impl, typename Char, typename Spec = fmt::FormatSpec>
+class ArgFormatterBase : public ArgVisitor<Impl, void> {
+ private:
+  BasicWriter<Char> &writer_;  // Destination of all output.
+  Spec &spec_;  // Held by reference; several visit methods modify it.
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(ArgFormatterBase);
+
+  void write_pointer(const void *p) {
+    spec_.flags_ = HASH_FLAG;  // Replaces (does not add to) any flags already set.
+    spec_.type_ = 'x';
+    writer_.write_int(reinterpret_cast<uintptr_t>(p), spec_);  // The address is written as an integer.
+  }
+
+  // workaround MSVC two-phase lookup issue
+  typedef internal::Arg Arg;
+
+ protected:
+  BasicWriter<Char> &writer() { return writer_; }
+  Spec &spec() { return spec_; }
+
+  void write(bool value) {
+    const char *str_value = value ? "true" : "false";  // Booleans are written as text.
+    Arg::StringValue<char> str = { str_value, std::strlen(str_value) };
+    writer_.write_str(str, spec_);
+  }
+
+  void write(const char *value) {
+    Arg::StringValue<char> str = {value, value ? std::strlen(value) : 0};  // A null pointer gets length 0.
+    writer_.write_str(str, spec_);
+  }
+
+ public:
+  typedef Spec SpecType;
+
+  ArgFormatterBase(BasicWriter<Char> &w, Spec &s)
+  : writer_(w), spec_(s) {}
+
+  template <typename T>
+  void visit_any_int(T value) { writer_.write_int(value, spec_); }
+
+  template <typename T>
+  void visit_any_double(T value) { writer_.write_double(value, spec_); }
+
+  void visit_bool(bool value) {
+    if (spec_.type_) {  // An explicit type character: format as an integer.
+      visit_any_int(value);
+      return;
+    }
+    write(value);  // No type character: write "true" or "false".
+  }
+
+  void visit_char(int value) {
+    if (spec_.type_ && spec_.type_ != 'c') {  // A type other than 'c': format the code as an integer.
+      spec_.flags_ |= CHAR_FLAG;
+      writer_.write_int(value, spec_);
+      return;
+    }
+    if (spec_.align_ == ALIGN_NUMERIC || spec_.flags_ != 0)  // Numeric alignment and flags are rejected for characters.
+      FMT_THROW(FormatError("invalid format specifier for char"));
+    typedef typename BasicWriter<Char>::CharPtr CharPtr;
+    Char fill = internal::CharTraits<Char>::cast(spec_.fill());
+    CharPtr out = CharPtr();
+    const unsigned CHAR_SIZE = 1;
+    if (spec_.width_ > CHAR_SIZE) {  // Field is wider than the character: padding is needed.
+      out = writer_.grow_buffer(spec_.width_);
+      if (spec_.align_ == ALIGN_RIGHT) {
+        std::uninitialized_fill_n(out, spec_.width_ - CHAR_SIZE, fill);  // Fill first, character last.
+        out += spec_.width_ - CHAR_SIZE;
+      } else if (spec_.align_ == ALIGN_CENTER) {
+        out = writer_.fill_padding(out, spec_.width_,
+                                   internal::const_check(CHAR_SIZE), fill);
+      } else {
+        std::uninitialized_fill_n(out + CHAR_SIZE,
+                                  spec_.width_ - CHAR_SIZE, fill);  // Character first, fill after it.
+      }
+    } else {
+      out = writer_.grow_buffer(CHAR_SIZE);
+    }
+    *out = internal::CharTraits<Char>::cast(value);  // out now points at the slot reserved for the character.
+  }
+
+  void visit_cstring(const char *value) {
+    if (spec_.type_ == 'p')  // Type 'p': write the pointer itself rather than the text.
+      return write_pointer(value);
+    write(value);
+  }
+
+  // Qualification with "internal" here and below is a workaround for nvcc.
+  void visit_string(internal::Arg::StringValue<char> value) {
+    writer_.write_str(value, spec_);
+  }
+
+  using ArgVisitor<Impl, void>::visit_wstring;  // Keeps the base-class overload visible next to the one below.
+
+  void visit_wstring(internal::Arg::StringValue<Char> value) {
+    writer_.write_str(value, spec_);
+  }
+
+  void visit_pointer(const void *value) {
+    if (spec_.type_ && spec_.type_ != 'p')  // Only an empty type or 'p' is accepted for pointers.
+      report_unknown_type(spec_.type_, "pointer");
+    write_pointer(value);
+  }
+};
+
+class FormatterBase {
+ private:
+  ArgList args_;
+  int next_arg_index_;  // Next automatic index; -1 once manual indexing is in use.
+
+  // Returns the argument with specified index.
+  FMT_API Arg do_get_arg(unsigned arg_index, const char *&error);
+
+ protected:
+  const ArgList &args() const { return args_; }
+
+  explicit FormatterBase(const ArgList &args)
+  : args_(args), next_arg_index_(0) {}
+
+  // Returns the next argument, or sets error if manual indexing is active.
+  Arg next_arg(const char *&error) {
+    if (next_arg_index_ < 0) {
+      error = "cannot switch from manual to automatic argument indexing";
+      return Arg();
+    }
+    const int current = next_arg_index_++;
+    return do_get_arg(internal::to_unsigned(current), error);
+  }
+
+  // Returns the argument at arg_index, provided manual indexing is allowed.
+  Arg get_arg(unsigned arg_index, const char *&error) {
+    if (!check_no_auto_index(error)) return Arg();
+    return do_get_arg(arg_index, error);
+  }
+
+  bool check_no_auto_index(const char *&error) {
+    const bool auto_indexing_used = next_arg_index_ > 0;
+    if (auto_indexing_used)
+      error = "cannot switch from automatic to manual argument indexing";
+    else
+      next_arg_index_ = -1;
+    return !auto_indexing_used;
+  }
+
+  template <typename Char>
+  void write(BasicWriter<Char> &w, const Char *start, const Char *end) {
+    if (start == end) return;
+    w << BasicStringRef<Char>(start, internal::to_unsigned(end - start));
+  }
+};
+}  // namespace internal
+
+/**
+  \rst
+  An argument formatter based on the `curiously recurring template pattern
+  <http://en.wikipedia.org/wiki/Curiously_recurring_template_pattern>`_.
+
+  To use `~fmt::BasicArgFormatter` define a subclass that implements some or
+  all of the visit methods with the same signatures as the methods in
+  `~fmt::ArgVisitor`, for example, `~fmt::ArgVisitor::visit_int()`.
+  Pass the subclass as the *Impl* template parameter. When a formatting
+  function processes an argument, it will dispatch to a visit method
+  specific to the argument type. For example, if the argument type is
+  ``double`` then the `~fmt::ArgVisitor::visit_double()` method of a subclass
+  will be called. If the subclass doesn't contain a method with this signature,
+  then a corresponding method of `~fmt::BasicArgFormatter` or its superclass
+  will be called.
+  \endrst
+ */
+template <typename Impl, typename Char, typename Spec = fmt::FormatSpec>
+class BasicArgFormatter : public internal::ArgFormatterBase<Impl, Char, Spec> {
+ private:
+  BasicFormatter<Char, Impl> &formatter_;  // Passed on to custom-type format callbacks.
+  const Char *format_;  // Position in the format string, handed to custom callbacks by address.
+
+ public:
+  /**
+    \rst
+    Constructs an argument formatter object.
+    *formatter* is a reference to the main formatter object, *spec* contains
+    format specifier information for standard argument types, and *fmt* points
+    to the part of the format string being parsed for custom argument types.
+    \endrst
+   */
+  BasicArgFormatter(BasicFormatter<Char, Impl> &formatter,
+                    Spec &spec, const Char *fmt)
+  : internal::ArgFormatterBase<Impl, Char, Spec>(formatter.writer(), spec),
+    formatter_(formatter), format_(fmt) {}
+
+  /** Formats an argument of a custom (user-defined) type. */
+  void visit_custom(internal::Arg::CustomValue c) {
+    c.format(&formatter_, c.value, &format_);  // Delegates to the callback stored in the CustomValue.
+  }
+};
+
+/** The default argument formatter: BasicArgFormatter with itself as Impl. */
+template <typename Char>
+class ArgFormatter :
+    public BasicArgFormatter<ArgFormatter<Char>, Char, FormatSpec> {
+ public:
+  /** Constructs an argument formatter object; all arguments go to the base. */
+  ArgFormatter(BasicFormatter<Char> &formatter,
+               FormatSpec &spec, const Char *fmt)
+  : BasicArgFormatter<ArgFormatter<Char>,
+                      Char, FormatSpec>(formatter, spec, fmt) {}
+};
+
+/** This template formats data and writes the output to a writer. */
+template <typename CharType, typename ArgFormatter>
+class BasicFormatter : private internal::FormatterBase {
+ public:
+  /** The character type for the output. */
+  typedef CharType Char;
+
+ private:
+  BasicWriter<Char> &writer_;
+  internal::ArgMap<Char> map_;  // Named arguments, searched by name.
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(BasicFormatter);
+
+  using internal::FormatterBase::get_arg;  // Keeps the index-based overload visible next to the one below.
+
+  // Checks if manual indexing is used and returns the argument with
+  // specified name.
+  internal::Arg get_arg(BasicStringRef<Char> arg_name, const char *&error);
+
+  // Parses argument index and returns corresponding argument.
+  internal::Arg parse_arg_index(const Char *&s);
+
+  // Parses argument name and returns corresponding argument.
+  internal::Arg parse_arg_name(const Char *&s);
+
+ public:
+  /**
+   \rst
+   Constructs a ``BasicFormatter`` object. References to the arguments and
+   the writer are stored in the formatter object so make sure they have
+   appropriate lifetimes.
+   \endrst
+   */
+  BasicFormatter(const ArgList &args, BasicWriter<Char> &w)
+    : internal::FormatterBase(args), writer_(w) {}
+
+  /** Returns a reference to the writer associated with this formatter. */
+  BasicWriter<Char> &writer() { return writer_; }
+
+  /** Formats stored arguments and writes the output to the writer. */
+  void format(BasicCStringRef<Char> format_str);
+
+  // Formats a single argument and advances format_str, a format string pointer.
+  const Char *format(const Char *&format_str, const internal::Arg &arg);
+};
+
+// FMT_GEN(n, f) expands to the comma-separated list f(0), ..., f(n-1).
+// n is token-pasted onto FMT_GEN, so it must be a literal from 1 to 15.
+# define FMT_GEN(n, f) FMT_GEN##n(f)
+# define FMT_GEN1(f)  f(0)
+# define FMT_GEN2(f)  FMT_GEN1(f),  f(1)
+# define FMT_GEN3(f)  FMT_GEN2(f),  f(2)
+# define FMT_GEN4(f)  FMT_GEN3(f),  f(3)
+# define FMT_GEN5(f)  FMT_GEN4(f),  f(4)
+# define FMT_GEN6(f)  FMT_GEN5(f),  f(5)
+# define FMT_GEN7(f)  FMT_GEN6(f),  f(6)
+# define FMT_GEN8(f)  FMT_GEN7(f),  f(7)
+# define FMT_GEN9(f)  FMT_GEN8(f),  f(8)
+# define FMT_GEN10(f) FMT_GEN9(f),  f(9)
+# define FMT_GEN11(f) FMT_GEN10(f), f(10)
+# define FMT_GEN12(f) FMT_GEN11(f), f(11)
+# define FMT_GEN13(f) FMT_GEN12(f), f(12)
+# define FMT_GEN14(f) FMT_GEN13(f), f(13)
+# define FMT_GEN15(f) FMT_GEN14(f), f(14)
+
+namespace internal {
+inline uint64_t make_type() { return 0; }  // No arguments: no type bits.
+// Type code of a single argument, as reported by MakeValue.
+template <typename T> inline uint64_t make_type(const T &arg) {
+  typedef MakeValue< BasicFormatter<char> > CharMaker;
+  return CharMaker::type(arg);
+}
+
+template <std::size_t N, bool/*IsPacked*/= (N < ArgList::MAX_PACKED_ARGS)>
+struct ArgArray;  // Primary template; the two specializations below choose the element type.
+
+template <std::size_t N>
+struct ArgArray<N, true/*IsPacked*/> {
+  typedef Value Type[N > 0 ? N : 1];  // At least one element, so the array type is valid when N == 0.
+
+  template <typename Formatter, typename T>
+  static Value make(const T &value) {
+#ifdef __clang__
+    Value result = MakeValue<Formatter>(value);
+    // Workaround a bug in Apple LLVM version 4.2 (clang-425.0.28) of clang:
+    // https://github.com/fmtlib/fmt/issues/276
+    (void)result.custom.format;
+    return result;
+#else
+    return MakeValue<Formatter>(value);
+#endif
+  }
+};
+
+template <std::size_t N>
+struct ArgArray<N, false/*IsPacked*/> {
+  typedef Arg Type[N + 1]; // +1 for the list end Arg::NONE
+
+  template <typename Formatter, typename T>
+  static Arg make(const T &value) { return MakeArg<Formatter>(value); }  // Elements carry their own type tag.
+};
+
+#if FMT_USE_VARIADIC_TEMPLATES
+template <typename Arg, typename... Args>
+inline uint64_t make_type(const Arg &first, const Args & ... tail) {
+  return make_type(first) | (make_type(tail...) << 4);  // first takes the low 4 bits; the rest shift up.
+}
+
+#else
+
+struct ArgType {  // Holds one argument's type code; 0 when default-constructed.
+  uint64_t type;
+
+  ArgType() : type(0) {}
+
+  template <typename T>
+  ArgType(const T &arg) : type(make_type(arg)) {}
+};
+
+# define FMT_ARG_TYPE_DEFAULT(n) ArgType t##n = ArgType()
+
+inline uint64_t make_type(FMT_GEN15(FMT_ARG_TYPE_DEFAULT)) {  // Up to 15 arguments, each defaulting to type 0.
+  return t0.type | (t1.type << 4) | (t2.type << 8) | (t3.type << 12) |
+      (t4.type << 16) | (t5.type << 20) | (t6.type << 24) | (t7.type << 28) |
+      (t8.type << 32) | (t9.type << 36) | (t10.type << 40) | (t11.type << 44) |
+      (t12.type << 48) | (t13.type << 52) | (t14.type << 56);
+}
+#endif
+}  // namespace internal
+
+# define FMT_MAKE_TEMPLATE_ARG(n) typename T##n  /* template parameter: typename Tn */
+# define FMT_MAKE_ARG_TYPE(n) T##n  /* type name: Tn */
+# define FMT_MAKE_ARG(n) const T##n &v##n  /* function parameter: const Tn &vn */
+# define FMT_ASSIGN_char(n) /* stores vn into arr[n] using the char formatter */ \
+  arr[n] = fmt::internal::MakeValue< fmt::BasicFormatter<char> >(v##n)
+# define FMT_ASSIGN_wchar_t(n) /* stores vn into arr[n] using the wchar_t formatter */ \
+  arr[n] = fmt::internal::MakeValue< fmt::BasicFormatter<wchar_t> >(v##n)
+
+#if FMT_USE_VARIADIC_TEMPLATES
+// Defines a variadic void function that forwards to func(arg0, ArgList).
+# define FMT_VARIADIC_VOID(func, arg_type) \
+  template <typename... Args> \
+  void func(arg_type arg0, const Args & ... args) { \
+    typedef fmt::internal::ArgArray<sizeof...(Args)> ArgArray; \
+    typename ArgArray::Type array{ \
+      ArgArray::template make<fmt::BasicFormatter<Char> >(args)...}; \
+    func(arg0, fmt::ArgList(fmt::internal::make_type(args...), array)); \
+  }
+
+// Defines a variadic constructor that forwards to func(arg0, arg1, ArgList).
+# define FMT_VARIADIC_CTOR(ctor, func, arg0_type, arg1_type) \
+  template <typename... Args> \
+  ctor(arg0_type arg0, arg1_type arg1, const Args & ... args) { \
+    typedef fmt::internal::ArgArray<sizeof...(Args)> ArgArray; \
+    typename ArgArray::Type array{ \
+      ArgArray::template make<fmt::BasicFormatter<Char> >(args)...}; \
+    func(arg0, arg1, fmt::ArgList(fmt::internal::make_type(args...), array)); \
+  }
+
+#else
+
+# define FMT_MAKE_REF(n) \
+  fmt::internal::MakeValue< fmt::BasicFormatter<Char> >(v##n)
+# define FMT_MAKE_REF2(n) v##n
+
+// Defines a wrapper for a function taking one argument of type arg_type
+// and n additional arguments of arbitrary types.
+# define FMT_WRAP1(func, arg_type, n) \
+  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
+  inline void func(arg_type arg1, FMT_GEN(n, FMT_MAKE_ARG)) { \
+    const fmt::internal::ArgArray<n>::Type array = {FMT_GEN(n, FMT_MAKE_REF)}; \
+    func(arg1, fmt::ArgList( \
+      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), array)); \
+  }
+
+// Emulates a variadic void function without variadic templates (0 to 10 extra arguments).
+# define FMT_VARIADIC_VOID(func, arg_type) \
+  inline void func(arg_type arg) { func(arg, fmt::ArgList()); } \
+  FMT_WRAP1(func, arg_type, 1) FMT_WRAP1(func, arg_type, 2) \
+  FMT_WRAP1(func, arg_type, 3) FMT_WRAP1(func, arg_type, 4) \
+  FMT_WRAP1(func, arg_type, 5) FMT_WRAP1(func, arg_type, 6) \
+  FMT_WRAP1(func, arg_type, 7) FMT_WRAP1(func, arg_type, 8) \
+  FMT_WRAP1(func, arg_type, 9) FMT_WRAP1(func, arg_type, 10)
+
+# define FMT_CTOR(ctor, func, arg0_type, arg1_type, n) \
+  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
+  ctor(arg0_type arg0, arg1_type arg1, FMT_GEN(n, FMT_MAKE_ARG)) { \
+    const fmt::internal::ArgArray<n>::Type array = {FMT_GEN(n, FMT_MAKE_REF)}; \
+    func(arg0, arg1, fmt::ArgList( \
+      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), array)); \
+  }
+
+// Emulates a variadic constructor without variadic templates (1 to 10 extra arguments).
+# define FMT_VARIADIC_CTOR(ctor, func, arg0_type, arg1_type) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 1) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 2) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 3) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 4) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 5) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 6) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 7) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 8) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 9) \
+  FMT_CTOR(ctor, func, arg0_type, arg1_type, 10)
+#endif
+
+// FMT_FOR_EACHn(f, x0, ..., x{n-1}) expands to the comma-separated list
+// f(x0, 0), ..., f(x{n-1}, n-1). Defined for n from 1 to 10.
+#define FMT_FOR_EACH1(f, x0) f(x0, 0)
+#define FMT_FOR_EACH2(f, x0, x1) \
+  FMT_FOR_EACH1(f, x0), f(x1, 1)
+#define FMT_FOR_EACH3(f, x0, x1, x2) \
+  FMT_FOR_EACH2(f, x0 ,x1), f(x2, 2)
+#define FMT_FOR_EACH4(f, x0, x1, x2, x3) \
+  FMT_FOR_EACH3(f, x0, x1, x2), f(x3, 3)
+#define FMT_FOR_EACH5(f, x0, x1, x2, x3, x4) \
+  FMT_FOR_EACH4(f, x0, x1, x2, x3), f(x4, 4)
+#define FMT_FOR_EACH6(f, x0, x1, x2, x3, x4, x5) \
+  FMT_FOR_EACH5(f, x0, x1, x2, x3, x4), f(x5, 5)
+#define FMT_FOR_EACH7(f, x0, x1, x2, x3, x4, x5, x6) \
+  FMT_FOR_EACH6(f, x0, x1, x2, x3, x4, x5), f(x6, 6)
+#define FMT_FOR_EACH8(f, x0, x1, x2, x3, x4, x5, x6, x7) \
+  FMT_FOR_EACH7(f, x0, x1, x2, x3, x4, x5, x6), f(x7, 7)
+#define FMT_FOR_EACH9(f, x0, x1, x2, x3, x4, x5, x6, x7, x8) \
+  FMT_FOR_EACH8(f, x0, x1, x2, x3, x4, x5, x6, x7), f(x8, 8)
+#define FMT_FOR_EACH10(f, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9) \
+  FMT_FOR_EACH9(f, x0, x1, x2, x3, x4, x5, x6, x7, x8), f(x9, 9)
+
+/**
+ An error returned by an operating system or a language runtime,
+ for example a file opening error.
+*/
+class SystemError : public internal::RuntimeError {
+ private:
+  FMT_API void init(int err_code, CStringRef format_str, ArgList args);
+
+ protected:
+  int error_code_;  // Value returned by error_code().
+
+  typedef char Char;  // For FMT_VARIADIC_CTOR.
+
+  SystemError() : error_code_(0) {}  // Protected: gives subclasses a defined code of 0 to start from.
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::SystemError` object with a description
+   formatted with `fmt::format_system_error`. *message* and additional
+   arguments passed into the constructor are formatted similarly to
+   `fmt::format`.
+
+   **Example**::
+
+     // This throws a SystemError with the description
+     //   cannot open file 'madeup': No such file or directory
+     // or similar (system message may vary).
+     const char *filename = "madeup";
+     std::FILE *file = std::fopen(filename, "r");
+     if (!file)
+       throw fmt::SystemError(errno, "cannot open file '{}'", filename);
+   \endrst
+  */
+  SystemError(int error_code, CStringRef message) {
+    init(error_code, message, ArgList());  // No extra arguments: pass an empty list.
+  }
+  FMT_DEFAULTED_COPY_CTOR(SystemError)
+  FMT_VARIADIC_CTOR(SystemError, init, int, CStringRef)
+
+  FMT_API ~SystemError() FMT_DTOR_NOEXCEPT;
+
+  int error_code() const { return error_code_; }
+};
+
+/**
+  \rst
+  Formats an error returned by an operating system or a language runtime,
+  for example a file opening error, and writes it to *out* in the following
+  form:
+
+  .. parsed-literal::
+     *<message>*: *<system-message>*
+
+  where *<message>* is the passed message and *<system-message>* is
+  the system message corresponding to the error code.
+  *error_code* is a system error code as given by ``errno``.
+  If *error_code* is not a valid error code (for example -1), the system
+  message may look like "Unknown error -1" and is platform-dependent.
+  \endrst
+ */
+FMT_API void format_system_error(fmt::Writer &out, int error_code,
+                                 fmt::StringRef message) FMT_NOEXCEPT;
+
+/**
+  \rst
+  This template provides operations for formatting and writing data into
+  a character stream. The output is stored in a buffer provided by a subclass
+  such as :class:`fmt::BasicMemoryWriter`.
+
+  You can use one of the following typedefs for common character types:
+
+  +---------+----------------------+
+  | Type    | Definition           |
+  +=========+======================+
+  | Writer  | BasicWriter<char>    |
+  +---------+----------------------+
+  | WWriter | BasicWriter<wchar_t> |
+  +---------+----------------------+
+
+  \endrst
+ */
+template <typename Char>
+class BasicWriter {
+ private:
+  // Output buffer, held by reference (supplied to the constructor).
+  Buffer<Char> &buffer_;
+
+  FMT_DISALLOW_COPY_AND_ASSIGN(BasicWriter);
+
+  typedef typename internal::CharTraits<Char>::CharPtr CharPtr;
+
+#if FMT_SECURE_SCL
+  // Returns the raw pointer underlying a CharPtr.
+  static Char *get(CharPtr p) { return p.base(); }
+#else
+  static Char *get(Char *p) { return p; }
+#endif
+
+  // Fills the padding around the content and returns the pointer to the
+  // content area.
+  static CharPtr fill_padding(CharPtr buffer,
+      unsigned total_size, std::size_t content_size, wchar_t fill);
+
+  // Grows the buffer by n characters and returns a pointer to the newly
+  // allocated area.
+  CharPtr grow_buffer(std::size_t n) {
+    std::size_t size = buffer_.size();
+    buffer_.resize(size + n);
+    return internal::make_ptr(&buffer_[size], n);
+  }
+
+  // Appends prefix_size unset chars + value's digits; returns the first char.
+  template <typename UInt>
+  Char *write_unsigned_decimal(UInt value, unsigned prefix_size = 0) {
+    unsigned num_digits = internal::count_digits(value);
+    Char *ptr = get(grow_buffer(prefix_size + num_digits));
+    internal::format_decimal(ptr + prefix_size, value, num_digits);
+    return ptr;
+  }
+
+  // Writes a decimal integer, preceded by '-' when the value is negative.
+  template <typename Int>
+  void write_decimal(Int value) {
+    typedef typename internal::IntTraits<Int>::MainType MainType;
+    MainType abs_value = static_cast<MainType>(value);
+    if (internal::is_negative(value)) {
+      abs_value = 0 - abs_value;
+      *write_unsigned_decimal(abs_value, 1) = '-';  // Sign fills the prefix slot.
+    } else {
+      write_unsigned_decimal(abs_value, 0);
+    }
+  }
+
+  // Prepares a buffer for integer formatting; returns the last digit slot.
+  CharPtr prepare_int_buffer(unsigned num_digits,
+      const EmptySpec &, const char *prefix, unsigned prefix_size) {
+    unsigned size = prefix_size + num_digits;
+    CharPtr p = grow_buffer(size);
+    std::uninitialized_copy(prefix, prefix + prefix_size, p);
+    return p + size - 1;
+  }
+
+  template <typename Spec>
+  CharPtr prepare_int_buffer(unsigned num_digits,
+    const Spec &spec, const char *prefix, unsigned prefix_size);
+
+  // Formats an integer.
+  template <typename T, typename Spec>
+  void write_int(T value, Spec spec);
+
+  // Formats a floating-point number (double or long double).
+  template <typename T, typename Spec>
+  void write_double(T value, const Spec &spec);
+
+  // Writes a formatted string.
+  template <typename StrChar>
+  CharPtr write_str(const StrChar *s, std::size_t size, const AlignSpec &spec);
+
+  template <typename StrChar, typename Spec>
+  void write_str(const internal::Arg::StringValue<StrChar> &str,
+                 const Spec &spec);
+
+  // The following methods are private to disallow writing wide characters
+  // and strings to a char stream. If you want to print a wide string as a
+  // pointer as std::ostream does, cast it to const void*.
+  // Do not implement!
+  void operator<<(typename internal::WCharHelper<wchar_t, Char>::Unsupported);
+  void operator<<(
+      typename internal::WCharHelper<const wchar_t *, Char>::Unsupported);
+
+  // Appends floating-point length specifier to the format string.
+  // The second argument is only used for overload resolution.
+  void append_float_length(Char *&format_ptr, long double) {
+    *format_ptr++ = 'L';
+  }
+
+  template<typename T>
+  void append_float_length(Char *&, T) {}
+
+  template <typename Impl, typename Char_, typename Spec_>
+  friend class internal::ArgFormatterBase;
+
+  template <typename Impl, typename Char_, typename Spec_>
+  friend class BasicPrintfArgFormatter;
+
+ protected:
+  /**
+    Constructs a ``BasicWriter`` object.
+   */
+  explicit BasicWriter(Buffer<Char> &b) : buffer_(b) {}
+
+ public:
+  /**
+    \rst
+    Destroys a ``BasicWriter`` object.
+    \endrst
+   */
+  virtual ~BasicWriter() {}
+
+  /**
+    Returns the total number of characters written.
+   */
+  std::size_t size() const { return buffer_.size(); }
+
+  /**
+    Returns a pointer to the output buffer content. No terminating null
+    character is appended.
+   */
+  const Char *data() const FMT_NOEXCEPT { return &buffer_[0]; }
+
+  /**
+    Returns a pointer to the output buffer content with terminating null
+    character appended.
+   */
+  const Char *c_str() const {
+    std::size_t size = buffer_.size();
+    buffer_.reserve(size + 1);  // Room for the terminator; size() is unchanged.
+    buffer_[size] = '\0';
+    return &buffer_[0];
+  }
+
+  /**
+    \rst
+    Returns the content of the output buffer as an `std::string`.
+    \endrst
+   */
+  std::basic_string<Char> str() const {
+    return std::basic_string<Char>(&buffer_[0], buffer_.size());
+  }
+
+  /**
+    \rst
+    Writes formatted data.
+
+    *args* is an argument list representing arbitrary arguments.
+
+    **Example**::
+
+       MemoryWriter out;
+       out.write("Current point:\n");
+       out.write("({:+f}, {:+f})", -3.14, 3.14);
+
+    This will write the following output to the ``out`` object:
+
+    .. code-block:: none
+
+       Current point:
+       (-3.140000, +3.140000)
+
+    The output can be accessed using :func:`data()`, :func:`c_str` or
+    :func:`str` methods.
+
+    See also :ref:`syntax`.
+    \endrst
+   */
+  void write(BasicCStringRef<Char> format, ArgList args) {
+    BasicFormatter<Char>(args, *this).format(format);
+  }
+  FMT_VARIADIC_VOID(write, BasicCStringRef<Char>)
+
+  BasicWriter &operator<<(int value) {
+    write_decimal(value);
+    return *this;
+  }
+  BasicWriter &operator<<(unsigned value) {
+    return *this << IntFormatSpec<unsigned>(value);
+  }
+  BasicWriter &operator<<(long value) {
+    write_decimal(value);
+    return *this;
+  }
+  BasicWriter &operator<<(unsigned long value) {
+    return *this << IntFormatSpec<unsigned long>(value);
+  }
+  BasicWriter &operator<<(LongLong value) {
+    write_decimal(value);
+    return *this;
+  }
+
+  /**
+    \rst
+    Formats *value* and writes it to the stream.
+    \endrst
+   */
+  BasicWriter &operator<<(ULongLong value) {
+    return *this << IntFormatSpec<ULongLong>(value);
+  }
+
+  BasicWriter &operator<<(double value) {
+    write_double(value, FormatSpec());
+    return *this;
+  }
+
+  /**
+    \rst
+    Formats *value* using the general format for floating-point numbers
+    (``'g'``) and writes it to the stream.
+    \endrst
+   */
+  BasicWriter &operator<<(long double value) {
+    write_double(value, FormatSpec());
+    return *this;
+  }
+
+  /**
+    Writes a character to the stream.
+   */
+  BasicWriter &operator<<(char value) {
+    buffer_.push_back(value);
+    return *this;
+  }
+
+  BasicWriter &operator<<(
+      typename internal::WCharHelper<wchar_t, Char>::Supported value) {
+    buffer_.push_back(value);
+    return *this;
+  }
+
+  /**
+    \rst
+    Writes *value* to the stream.
+    \endrst
+   */
+  BasicWriter &operator<<(fmt::BasicStringRef<Char> value) {
+    const Char *str = value.data();
+    buffer_.append(str, str + value.size());
+    return *this;
+  }
+
+  BasicWriter &operator<<(
+      typename internal::WCharHelper<StringRef, Char>::Supported value) {
+    const char *str = value.data();
+    buffer_.append(str, str + value.size());
+    return *this;
+  }
+
+  template <typename T, typename Spec, typename FillChar>
+  BasicWriter &operator<<(IntFormatSpec<T, Spec, FillChar> spec) {
+    internal::CharTraits<Char>::convert(FillChar());  // FillChar -> Char check.
+    write_int(spec.value(), spec);
+    return *this;
+  }
+
+  template <typename StrChar>
+  BasicWriter &operator<<(const StrFormatSpec<StrChar> &spec) {
+    internal::CharTraits<Char>::convert(StrChar());  // StrChar -> Char check.
+    write_str(spec.str(), std::char_traits<StrChar>::length(spec.str()), spec);
+    return *this;
+  }
+
+  void clear() FMT_NOEXCEPT { buffer_.clear(); }
+
+  Buffer<Char> &buffer() FMT_NOEXCEPT { return buffer_; }
+};
+
+template <typename Char>
+template <typename StrChar>
+typename BasicWriter<Char>::CharPtr BasicWriter<Char>::write_str(
+      const StrChar *s, std::size_t size, const AlignSpec &spec) {
+  // Padding is only needed when the field is wider than the content.
+  const bool padded = spec.width() > size;
+  CharPtr dest = grow_buffer(padded ? spec.width() : size);
+  if (padded) {
+    const std::size_t padding = spec.width() - size;
+    const Char fill_char = internal::CharTraits<Char>::cast(spec.fill());
+    if (spec.align() == ALIGN_CENTER) {
+      dest = fill_padding(dest, spec.width(), size, fill_char);
+    } else if (spec.align() == ALIGN_RIGHT) {
+      std::uninitialized_fill_n(dest, padding, fill_char);
+      dest += padding;
+    } else {
+      std::uninitialized_fill_n(dest + size, padding, fill_char);
+    }
+  }
+  std::uninitialized_copy(s, s + size, dest);
+  return dest;
+}
+
+template <typename Char>
+template <typename StrChar, typename Spec>
+void BasicWriter<Char>::write_str(
+    const internal::Arg::StringValue<StrChar> &s, const Spec &spec) {
+  // Checks that StrChar is convertible to Char.
+  internal::CharTraits<Char>::convert(StrChar());
+  // Only the default type (0) and 's' are accepted for strings.
+  const bool known_type = spec.type_ == 0 || spec.type_ == 's';
+  if (!known_type)
+    internal::report_unknown_type(spec.type_, "string");
+  // An empty string is fine unless its pointer is null as well.
+  if (s.size == 0 && !s.value) {
+    FMT_THROW(FormatError("string pointer is null"));
+  }
+  std::size_t length = s.size;
+  const std::size_t limit = static_cast<std::size_t>(spec.precision_);
+  if (spec.precision_ >= 0 && limit < length)
+    length = limit;  // A non-negative precision caps the characters written.
+  write_str(s.value, length, spec);
+}
+
+template <typename Char>
+typename BasicWriter<Char>::CharPtr
+  BasicWriter<Char>::fill_padding(
+    CharPtr buffer, unsigned total_size,
+    std::size_t content_size, wchar_t fill) {
+  // Split the padding around the content; any odd character goes on the right.
+  const std::size_t total_pad = total_size - content_size;
+  const std::size_t pad_before = total_pad / 2;
+  const std::size_t pad_after = total_pad - pad_before;
+  const Char pad_char = internal::CharTraits<Char>::cast(fill);
+  std::uninitialized_fill_n(buffer, pad_before, pad_char);
+  CharPtr content_begin = buffer + pad_before;
+  std::uninitialized_fill_n(content_begin + content_size, pad_after, pad_char);
+  return content_begin;
+}
+
+template <typename Char>
+template <typename Spec>
+typename BasicWriter<Char>::CharPtr
+  BasicWriter<Char>::prepare_int_buffer(
+    unsigned num_digits, const Spec &spec,
+    const char *prefix, unsigned prefix_size) {  // Returns the last digit slot.
+  unsigned width = spec.width();
+  Alignment align = spec.align();
+  Char fill = internal::CharTraits<Char>::cast(spec.fill());
+  if (spec.precision() > static_cast<int>(num_digits)) {
+    // Octal prefix '0' is counted as a digit, so ignore it if precision
+    // is specified.
+    if (prefix_size > 0 && prefix[prefix_size - 1] == '0')
+      --prefix_size;
+    unsigned number_size =
+        prefix_size + internal::to_unsigned(spec.precision());
+    AlignSpec subspec(number_size, '0', ALIGN_NUMERIC);  // Zero-pad to precision.
+    if (number_size >= width)
+      return prepare_int_buffer(num_digits, subspec, prefix, prefix_size);
+    buffer_.reserve(buffer_.size() + width);  // No realloc: keeps result valid.
+    unsigned fill_size = width - number_size;
+    if (align != ALIGN_LEFT) {
+      CharPtr p = grow_buffer(fill_size);  // Fill goes before the number.
+      std::uninitialized_fill(p, p + fill_size, fill);
+    }
+    CharPtr result = prepare_int_buffer(
+        num_digits, subspec, prefix, prefix_size);
+    if (align == ALIGN_LEFT) {
+      CharPtr p = grow_buffer(fill_size);  // Fill goes after the number.
+      std::uninitialized_fill(p, p + fill_size, fill);
+    }
+    return result;
+  }
+  unsigned size = prefix_size + num_digits;
+  if (width <= size) {  // The number fills the field: no padding.
+    CharPtr p = grow_buffer(size);
+    std::uninitialized_copy(prefix, prefix + prefix_size, p);
+    return p + size - 1;
+  }
+  CharPtr p = grow_buffer(width);
+  CharPtr end = p + width;
+  if (align == ALIGN_LEFT) {
+    std::uninitialized_copy(prefix, prefix + prefix_size, p);
+    p += size;
+    std::uninitialized_fill(p, end, fill);
+  } else if (align == ALIGN_CENTER) {
+    p = fill_padding(p, width, size, fill);
+    std::uninitialized_copy(prefix, prefix + prefix_size, p);
+    p += size;
+  } else {
+    if (align == ALIGN_NUMERIC) {
+      if (prefix_size != 0) {
+        p = std::uninitialized_copy(prefix, prefix + prefix_size, p);
+        size -= prefix_size;  // Fill now sits between prefix and digits.
+      }
+    } else {
+      std::uninitialized_copy(prefix, prefix + prefix_size, end - size);
+    }
+    std::uninitialized_fill(p, end - size, fill);
+    p = end;
+  }
+  return p - 1;
+}
+
+template <typename Char>
+template <typename T, typename Spec>
+void BasicWriter<Char>::write_int(T value, Spec spec) {
+  unsigned prefix_size = 0;
+  typedef typename internal::IntTraits<T>::MainType UnsignedType;
+  UnsignedType abs_value = static_cast<UnsignedType>(value);
+  char prefix[4] = "";  // At most a sign plus a two-character base prefix.
+  if (internal::is_negative(value)) {
+    prefix[0] = '-';
+    ++prefix_size;
+    abs_value = 0 - abs_value;
+  } else if (spec.flag(SIGN_FLAG)) {
+    prefix[0] = spec.flag(PLUS_FLAG) ? '+' : ' ';
+    ++prefix_size;
+  }
+  switch (spec.type()) {
+  case 0: case 'd': {  // Decimal; also used when no type is given.
+    unsigned num_digits = internal::count_digits(abs_value);
+    CharPtr p = prepare_int_buffer(num_digits, spec, prefix, prefix_size) + 1;
+    internal::format_decimal(get(p), abs_value, 0);
+    break;
+  }
+  case 'x': case 'X': {  // Hexadecimal.
+    UnsignedType n = abs_value;
+    if (spec.flag(HASH_FLAG)) {
+      prefix[prefix_size++] = '0';
+      prefix[prefix_size++] = spec.type_prefix();
+    }
+    unsigned num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((n >>= 4) != 0);
+    Char *p = get(prepare_int_buffer(
+      num_digits, spec, prefix, prefix_size));
+    n = abs_value;
+    const char *digits = spec.type() == 'x' ?
+        "0123456789abcdef" : "0123456789ABCDEF";
+    do {
+      *p-- = digits[n & 0xf];  // Least significant digit first, moving back.
+    } while ((n >>= 4) != 0);
+    break;
+  }
+  case 'b': case 'B': {  // Binary.
+    UnsignedType n = abs_value;
+    if (spec.flag(HASH_FLAG)) {
+      prefix[prefix_size++] = '0';
+      prefix[prefix_size++] = spec.type_prefix();
+    }
+    unsigned num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((n >>= 1) != 0);
+    Char *p = get(prepare_int_buffer(num_digits, spec, prefix, prefix_size));
+    n = abs_value;
+    do {
+      *p-- = static_cast<Char>('0' + (n & 1));
+    } while ((n >>= 1) != 0);
+    break;
+  }
+  case 'o': {  // Octal.
+    UnsignedType n = abs_value;
+    if (spec.flag(HASH_FLAG))
+      prefix[prefix_size++] = '0';
+    unsigned num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((n >>= 3) != 0);
+    Char *p = get(prepare_int_buffer(num_digits, spec, prefix, prefix_size));
+    n = abs_value;
+    do {
+      *p-- = static_cast<Char>('0' + (n & 7));
+    } while ((n >>= 3) != 0);
+    break;
+  }
+  case 'n': {  // Decimal with the locale's thousands separator.
+    unsigned num_digits = internal::count_digits(abs_value);
+    fmt::StringRef sep = "";
+#if !(defined(ANDROID) || defined(__ANDROID__))
+    sep = internal::thousands_sep(std::localeconv());
+#endif
+    unsigned size = static_cast<unsigned>(
+          num_digits + sep.size() * ((num_digits - 1) / 3));  // One sep per 3.
+    CharPtr p = prepare_int_buffer(size, spec, prefix, prefix_size) + 1;
+    internal::format_decimal(get(p), abs_value, 0, internal::ThousandsSep(sep));
+    break;
+  }
+  default:
+    internal::report_unknown_type(
+      spec.type(), spec.flag(CHAR_FLAG) ? "char" : "integer");
+    break;
+  }
+}
+
+template <typename Char>
+template <typename T, typename Spec>
+void BasicWriter<Char>::write_double(T value, const Spec &spec) {
+  // Check type.
+  char type = spec.type();
+  bool upper = false;
+  switch (type) {
+  case 0:
+    type = 'g';  // No type given: use the general format.
+    break;
+  case 'e': case 'f': case 'g': case 'a':
+    break;
+  case 'F':
+#if FMT_MSC_VER
+    // MSVC's printf doesn't support 'F'.
+    type = 'f';
+#endif
+    // Fall through.
+  case 'E': case 'G': case 'A':
+    upper = true;
+    break;
+  default:
+    internal::report_unknown_type(type, "double");
+    break;
+  }
+
+  char sign = 0;
+  // Use isnegative instead of value < 0 because the latter is always
+  // false for NaN.
+  if (internal::FPUtil::isnegative(static_cast<double>(value))) {
+    sign = '-';
+    value = -value;
+  } else if (spec.flag(SIGN_FLAG)) {
+    sign = spec.flag(PLUS_FLAG) ? '+' : ' ';
+  }
+
+  if (internal::FPUtil::isnotanumber(value)) {
+    // Format NaN ourselves because sprintf's output is not consistent
+    // across platforms.
+    std::size_t nan_size = 4;
+    const char *nan = upper ? " NAN" : " nan";  // Leading space = sign slot.
+    if (!sign) {
+      --nan_size;
+      ++nan;
+    }
+    CharPtr out = write_str(nan, nan_size, spec);
+    if (sign)
+      *out = sign;
+    return;
+  }
+
+  if (internal::FPUtil::isinfinity(value)) {
+    // Format infinity ourselves because sprintf's output is not consistent
+    // across platforms.
+    std::size_t inf_size = 4;
+    const char *inf = upper ? " INF" : " inf";  // Leading space = sign slot.
+    if (!sign) {
+      --inf_size;
+      ++inf;
+    }
+    CharPtr out = write_str(inf, inf_size, spec);
+    if (sign)
+      *out = sign;
+    return;
+  }
+
+  std::size_t offset = buffer_.size();
+  unsigned width = spec.width();
+  if (sign) {
+    buffer_.reserve(buffer_.size() + (width > 1u ? width : 1u));
+    if (width > 0)
+      --width;
+    ++offset;  // Leave one character before the number for the sign.
+  }
+
+  // Build format string.
+  enum { MAX_FORMAT_SIZE = 10}; // longest format: %#-*.*Lg
+  Char format[MAX_FORMAT_SIZE];
+  Char *format_ptr = format;
+  *format_ptr++ = '%';
+  unsigned width_for_sprintf = width;
+  if (spec.flag(HASH_FLAG))
+    *format_ptr++ = '#';
+  if (spec.align() == ALIGN_CENTER) {
+    width_for_sprintf = 0;  // Centering is applied after formatting, below.
+  } else {
+    if (spec.align() == ALIGN_LEFT)
+      *format_ptr++ = '-';
+    if (width != 0)
+      *format_ptr++ = '*';
+  }
+  if (spec.precision() >= 0) {
+    *format_ptr++ = '.';
+    *format_ptr++ = '*';
+  }
+
+  append_float_length(format_ptr, value);
+  *format_ptr++ = type;
+  *format_ptr = '\0';
+
+  // Format using snprintf.
+  Char fill = internal::CharTraits<Char>::cast(spec.fill());
+  unsigned n = 0;
+  Char *start = FMT_NULL;
+  for (;;) {
+    std::size_t buffer_size = buffer_.capacity() - offset;
+#if FMT_MSC_VER
+    // MSVC's vsnprintf_s doesn't work with zero size, so reserve
+    // space for at least one extra character to make the size non-zero.
+    // Note that the buffer's capacity will increase by more than 1.
+    if (buffer_size == 0) {
+      buffer_.reserve(offset + 1);
+      buffer_size = buffer_.capacity() - offset;
+    }
+#endif
+    start = &buffer_[offset];
+    int result = internal::CharTraits<Char>::format_float(
+        start, buffer_size, format, width_for_sprintf, spec.precision(), value);
+    if (result >= 0) {
+      n = internal::to_unsigned(result);
+      if (offset + n < buffer_.capacity())
+        break;  // The buffer is large enough - continue with formatting.
+      buffer_.reserve(offset + n + 1);  // Too small: enlarge and retry.
+    } else {
+      // If result is negative we ask to increase the capacity by at least 1,
+      // but as std::vector, the buffer grows exponentially.
+      buffer_.reserve(buffer_.capacity() + 1);
+    }
+  }
+  if (sign) {
+    if ((spec.align() != ALIGN_RIGHT && spec.align() != ALIGN_DEFAULT) ||
+        *start != ' ') {
+      *(start - 1) = sign;
+      sign = 0;  // Sign written; nothing left to place below.
+    } else {
+      *(start - 1) = fill;
+    }
+    ++n;  // Count the extra character in front of the number.
+  }
+  if (spec.align() == ALIGN_CENTER && spec.width() > n) {
+    width = spec.width();
+    CharPtr p = grow_buffer(width);  // NOTE(review): may reallocate; verify.
+    std::memmove(get(p) + (width - n) / 2, get(p), n * sizeof(Char));
+    fill_padding(p, spec.width(), n, fill);
+    return;
+  }
+  if (spec.fill() != ' ' || sign) {
+    while (*start == ' ')
+      *start++ = fill;  // Replace the leading spaces with the fill character.
+    if (sign)
+      *(start - 1) = sign;
+  }
+  grow_buffer(n);  // Extend size() over the n characters formatted above.
+}
+
+/**
+  \rst
+  This class template provides operations for formatting and writing data
+  into a character stream. The output is stored in a memory buffer that grows
+  dynamically.
+
+  You can use one of the following typedefs for common character types
+  and the standard allocator:
+
+  +---------------+-----------------------------------------------------+
+  | Type          | Definition                                          |
+  +===============+=====================================================+
+  | MemoryWriter  | BasicMemoryWriter<char, std::allocator<char>>       |
+  +---------------+-----------------------------------------------------+
+  | WMemoryWriter | BasicMemoryWriter<wchar_t, std::allocator<wchar_t>> |
+  +---------------+-----------------------------------------------------+
+
+  **Example**::
+
+     MemoryWriter out;
+     out << "The answer is " << 42 << "\n";
+     out.write("({:+f}, {:+f})", -3.14, 3.14);
+
+  This will write the following output to the ``out`` object:
+
+  .. code-block:: none
+
+     The answer is 42
+     (-3.140000, +3.140000)
+
+  The output can be converted to an ``std::string`` with ``out.str()`` or
+  accessed as a C string with ``out.c_str()``.
+  \endrst
+ */
+template <typename Char, typename Allocator = std::allocator<Char> >
+class BasicMemoryWriter : public BasicWriter<Char> {
+ private:
+  internal::MemoryBuffer<Char, internal::INLINE_BUFFER_SIZE, Allocator> buffer_;
+
+ public:
+  explicit BasicMemoryWriter(const Allocator& alloc = Allocator())
+    : BasicWriter<Char>(buffer_), buffer_(alloc) {}  // Base binds a reference.
+
+#if FMT_USE_RVALUE_REFERENCES
+  /**
+    \rst
+    Constructs a :class:`fmt::BasicMemoryWriter` object moving the content
+    of the other object to it.
+    \endrst
+   */
+  BasicMemoryWriter(BasicMemoryWriter &&other)
+    : BasicWriter<Char>(buffer_), buffer_(std::move(other.buffer_)) {
+  }
+
+  /**
+    \rst
+    Moves the content of the other ``BasicMemoryWriter`` object to this one.
+    \endrst
+   */
+  BasicMemoryWriter &operator=(BasicMemoryWriter &&other) {
+    buffer_ = std::move(other.buffer_);  // Base keeps referring to buffer_.
+    return *this;
+  }
+#endif
+};
+
+typedef BasicMemoryWriter<char> MemoryWriter;
+typedef BasicMemoryWriter<wchar_t> WMemoryWriter;
+
+/**
+  \rst
+  This class template provides operations for formatting and writing data
+  into a fixed-size array. For writing into a dynamically growing buffer
+  use :class:`fmt::BasicMemoryWriter`.
+
+  Any write method will throw ``std::runtime_error`` if the output doesn't fit
+  into the array.
+
+  You can use one of the following typedefs for common character types:
+
+  +--------------+---------------------------+
+  | Type         | Definition                |
+  +==============+===========================+
+  | ArrayWriter  | BasicArrayWriter<char>    |
+  +--------------+---------------------------+
+  | WArrayWriter | BasicArrayWriter<wchar_t> |
+  +--------------+---------------------------+
+  \endrst
+ */
+template <typename Char>
+class BasicArrayWriter : public BasicWriter<Char> {
+ private:
+  internal::FixedBuffer<Char> buffer_;  // Wraps the caller-provided array.
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::BasicArrayWriter` object for *array* of the
+   given size.
+   \endrst
+   */
+  BasicArrayWriter(Char *array, std::size_t size)
+    : BasicWriter<Char>(buffer_), buffer_(array, size) {}
+
+  /**
+   \rst
+   Constructs a :class:`fmt::BasicArrayWriter` object for *array* of the
+   size known at compile time.
+   \endrst
+   */
+  template <std::size_t SIZE>
+  explicit BasicArrayWriter(Char (&array)[SIZE])
+    : BasicWriter<Char>(buffer_), buffer_(array, SIZE) {}  // SIZE is deduced.
+};
+
+typedef BasicArrayWriter<char> ArrayWriter;
+typedef BasicArrayWriter<wchar_t> WArrayWriter;
+
+// Reports a system error without throwing an exception.
+// Can be used to report errors from destructors.
+FMT_API void report_system_error(int error_code,
+                                 StringRef message) FMT_NOEXCEPT;
+
+#if FMT_USE_WINDOWS_H
+
+/** A Windows error. */
+class WindowsError : public SystemError {
+ private:
+  FMT_API void init(int error_code, CStringRef format_str, ArgList args);
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::WindowsError` object with the description
+   of the form
+
+   .. parsed-literal::
+     *<message>*: *<system-message>*
+
+   where *<message>* is the formatted message and *<system-message>* is the
+   system message corresponding to the error code.
+   *error_code* is a Windows error code as given by ``GetLastError``.
+   If *error_code* is not a valid error code such as -1, the system message
+   will look like "error -1".
+
+   **Example**::
+
+     // This throws a WindowsError with the description
+     //   cannot open file 'madeup': The system cannot find the file specified.
+     // or similar (system message may vary).
+     const char *filename = "madeup";
+     OFSTRUCT of = OFSTRUCT();
+     HFILE file = OpenFile(filename, &of, OF_READ);
+     if (file == HFILE_ERROR) {
+       throw fmt::WindowsError(GetLastError(),
+                               "cannot open file '{}'", filename);
+     }
+   \endrst
+  */
+  WindowsError(int error_code, CStringRef message) {
+    init(error_code, message, ArgList());  // No formatting arguments.
+  }
+  FMT_VARIADIC_CTOR(WindowsError, init, int, CStringRef)
+};
+
+// Reports a Windows error without throwing an exception.
+// Can be used to report errors from destructors.
+FMT_API void report_windows_error(int error_code,
+                                  StringRef message) FMT_NOEXCEPT;
+
+#endif
+
+enum Color { BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE };
+
+/**
+  Formats a string and prints it to stdout using ANSI escape sequences
+  to specify color (experimental).
+  Example:
+    print_colored(fmt::RED, "Elapsed time: {0:.2f} seconds", 1.23);
+ */
+FMT_API void print_colored(Color c, CStringRef format, ArgList args);
+
+/**
+  \rst
+  Formats arguments and returns the result as a string.
+
+  **Example**::
+
+    std::string message = format("The answer is {}", 42);
+  \endrst
+*/
+inline std::string format(CStringRef format_str, ArgList args) {
+  MemoryWriter out;  // In-memory writer that receives the formatted text.
+  out.write(format_str, args);
+  return out.str();
+}
+
+inline std::wstring format(WCStringRef format_str, ArgList args) {
+  WMemoryWriter out;  // Wide-character counterpart of the overload above.
+  out.write(format_str, args);
+  return out.str();
+}
+
+/**
+  \rst
+  Prints formatted data to the file *f*.
+
+  **Example**::
+
+    print(stderr, "Don't {}!", "panic");
+  \endrst
+ */
+FMT_API void print(std::FILE *f, CStringRef format_str, ArgList args);
+
+/**
+  \rst
+  Prints formatted data to ``stdout``.
+
+  **Example**::
+
+    print("Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
+ */
+FMT_API void print(CStringRef format_str, ArgList args);
+
+/**
+  Fast integer formatter.
+ */
+class FormatInt {
+ private:
+  // The buffer must hold every digit of the largest value (digits10 + 1),
+  // a sign and a null character.
+  enum {BUFFER_SIZE = std::numeric_limits<ULongLong>::digits10 + 3};
+  mutable char buffer_[BUFFER_SIZE];
+  char *str_;  // First character of the formatted text; points into buffer_.
+
+  // Writes value's digits right-to-left, ending just before the last buffer
+  // slot (kept for the terminator), and returns a pointer to the first digit.
+  char *format_decimal(ULongLong value) {
+    char *out = buffer_ + BUFFER_SIZE - 1;
+    for (; value >= 100; value /= 100) {
+      // Peel off two digits per division because integer division is slow;
+      // the idea is from Alexandrescu's "Three Optimization Tips for C++".
+      const unsigned pair = static_cast<unsigned>((value % 100) * 2);
+      *--out = internal::Data::DIGITS[pair + 1];
+      *--out = internal::Data::DIGITS[pair];
+    }
+    if (value >= 10) {
+      // Exactly two digits remain.
+      const unsigned pair = static_cast<unsigned>(value * 2);
+      *--out = internal::Data::DIGITS[pair + 1];
+      *--out = internal::Data::DIGITS[pair];
+    } else {
+      *--out = static_cast<char>('0' + value);
+    }
+    return out;
+  }
+
+  void FormatSigned(LongLong value) {
+    const bool negative = value < 0;
+    // The magnitude is computed in the unsigned type.
+    ULongLong magnitude = static_cast<ULongLong>(value);
+    if (negative)
+      magnitude = 0 - magnitude;
+    str_ = format_decimal(magnitude);
+    if (negative) *--str_ = '-';
+  }
+
+ public:
+  explicit FormatInt(int value) { FormatSigned(value); }
+  explicit FormatInt(long value) { FormatSigned(value); }
+  explicit FormatInt(LongLong value) { FormatSigned(value); }
+  explicit FormatInt(unsigned value) : str_(format_decimal(value)) {}
+  explicit FormatInt(unsigned long value) : str_(format_decimal(value)) {}
+  explicit FormatInt(ULongLong value) : str_(format_decimal(value)) {}
+
+  /** Returns the number of characters written to the output buffer. */
+  std::size_t size() const {
+    const char *content_end = buffer_ + BUFFER_SIZE - 1;
+    return internal::to_unsigned(content_end - str_);
+  }
+
+  /**
+    Returns a pointer to the output; no terminating null is appended.
+   */
+  const char *data() const { return str_; }
+
+  /**
+    Returns a pointer to the output buffer content with terminating null
+    character appended.
+   */
+  const char *c_str() const {
+    buffer_[BUFFER_SIZE - 1] = '\0';  // buffer_ is mutable for this write.
+    return str_;
+  }
+
+  /**
+    \rst
+    Returns the content of the output buffer as an ``std::string``.
+    \endrst
+   */
+  std::string str() const { return std::string(str_, size()); }
+};
+
+// Formats a decimal integer value into buffer and advances buffer (passed by
+// reference) to one past the last character written. This function doesn't
+// write a terminating null character.
+template <typename T>
+inline void format_decimal(char *&buffer, T value) {
+  typedef typename internal::IntTraits<T>::MainType UnsignedType;
+  UnsignedType magnitude = static_cast<UnsignedType>(value);
+  if (internal::is_negative(value)) {
+    *buffer++ = '-';
+    magnitude = 0 - magnitude;
+  }
+  if (magnitude >= 100) {
+    // General case: count the digits first, then let the internal helper
+    // write them.
+    const unsigned digit_count = internal::count_digits(magnitude);
+    internal::format_decimal(buffer, magnitude, digit_count);
+    buffer += digit_count;
+  } else if (magnitude >= 10) {
+    const unsigned pair = static_cast<unsigned>(magnitude * 2);
+    *buffer++ = internal::Data::DIGITS[pair];
+    *buffer++ = internal::Data::DIGITS[pair + 1];
+  } else {
+    *buffer++ = static_cast<char>('0' + magnitude);
+  }
+}
+
+/**
+  \rst
+  Returns a named argument for formatting functions.
+
+  **Example**::
+
+    print("Elapsed time: {s:.2f} seconds", arg("s", 1.23));
+
+  \endrst
+ */
+template <typename T>
+inline internal::NamedArgWithType<char, T> arg(StringRef name, const T &arg) {
+  return internal::NamedArgWithType<char, T>(name, arg);  // Pairs name + value.
+}
+
+template <typename T>
+inline internal::NamedArgWithType<wchar_t, T> arg(WStringRef name, const T &arg) {
+  return internal::NamedArgWithType<wchar_t, T>(name, arg);  // Wide-name form.
+}
+
+// The following two functions are deleted intentionally to disable
+// nested named arguments as in ``format("{}", arg("a", arg("b", 42)))``.
+template <typename Char>
+void arg(StringRef, const internal::NamedArg<Char>&) FMT_DELETED_OR_UNDEFINED;
+template <typename Char>
+void arg(WStringRef, const internal::NamedArg<Char>&) FMT_DELETED_OR_UNDEFINED;
+}
+
+#if FMT_GCC_VERSION
+// Use the system_header pragma to suppress warnings about variadic macros
+// because suppressing -Wvariadic-macros with the diagnostic pragma doesn't
+// work. It is used at the end because we want to suppress as few warnings
+// as possible.
+# pragma GCC system_header
+#endif
+
+// This is used to work around VC++ bugs in handling variadic macros.
+#define FMT_EXPAND(args) args
+
+// Returns the number of arguments (works for 1 to 10 arguments).
+// Based on https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s.
+#define FMT_NARG(...) FMT_NARG_(__VA_ARGS__, FMT_RSEQ_N())
+#define FMT_NARG_(...) FMT_EXPAND(FMT_ARG_N(__VA_ARGS__))
+#define FMT_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
+#define FMT_RSEQ_N() 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+#define FMT_FOR_EACH_(N, f, ...) \
+  FMT_EXPAND(FMT_CONCAT(FMT_FOR_EACH, N)(f, __VA_ARGS__))
+#define FMT_FOR_EACH(f, ...) \
+  FMT_EXPAND(FMT_FOR_EACH_(FMT_NARG(__VA_ARGS__), f, __VA_ARGS__))
+
+#define FMT_ADD_ARG_NAME(type, index) type arg##index
+#define FMT_GET_ARG_NAME(type, index) arg##index
+
+#if FMT_USE_VARIADIC_TEMPLATES
+# define FMT_VARIADIC_(Char, ReturnType, func, call, ...) \
+  template <typename... Args> \
+  ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__), \
+      const Args & ... args) { \
+    typedef fmt::internal::ArgArray<sizeof...(Args)> ArgArray; \
+    typename ArgArray::Type array{ \
+      ArgArray::template make<fmt::BasicFormatter<Char> >(args)...}; \
+    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), \
+      fmt::ArgList(fmt::internal::make_type(args...), array)); \
+  }
+#else
+// Defines a wrapper for a function taking __VA_ARGS__ arguments
+// and n additional arguments of arbitrary types.
+# define FMT_WRAP(Char, ReturnType, func, call, n, ...) \
+  template <FMT_GEN(n, FMT_MAKE_TEMPLATE_ARG)> \
+  inline ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__), \
+      FMT_GEN(n, FMT_MAKE_ARG)) { \
+    fmt::internal::ArgArray<n>::Type arr; \
+    FMT_GEN(n, FMT_ASSIGN_##Char); \
+    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), fmt::ArgList( \
+      fmt::internal::make_type(FMT_GEN(n, FMT_MAKE_REF2)), arr)); \
+  }
+
+# define FMT_VARIADIC_(Char, ReturnType, func, call, ...) \
+  inline ReturnType func(FMT_FOR_EACH(FMT_ADD_ARG_NAME, __VA_ARGS__)) { \
+    call(FMT_FOR_EACH(FMT_GET_ARG_NAME, __VA_ARGS__), fmt::ArgList()); \
+  } \
+  FMT_WRAP(Char, ReturnType, func, call, 1, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 2, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 3, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 4, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 5, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 6, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 7, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 8, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 9, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 10, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 11, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 12, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 13, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 14, __VA_ARGS__) \
+  FMT_WRAP(Char, ReturnType, func, call, 15, __VA_ARGS__)
+#endif  // FMT_USE_VARIADIC_TEMPLATES
+
+/**
+  \rst
+  Defines a variadic function with the specified return type, function name
+  and argument types passed as variable arguments to this macro.
+
+  **Example**::
+
+    void print_error(const char *file, int line, const char *format,
+                     fmt::ArgList args) {
+      fmt::print("{}: {}: ", file, line);
+      fmt::print(format, args);
+    }
+    FMT_VARIADIC(void, print_error, const char *, int, const char *)
+
+  ``FMT_VARIADIC`` is used for compatibility with legacy C++ compilers that
+  don't implement variadic templates. You don't have to use this macro if
+  you don't need legacy compiler support and can use variadic templates
+  directly::
+
+    template <typename... Args>
+    void print_error(const char *file, int line, const char *format,
+                     const Args & ... args) {
+      fmt::print("{}: {}: ", file, line);
+      fmt::print(format, args...);
+    }
+  \endrst
+ */
+#define FMT_VARIADIC(ReturnType, func, ...) \
+  FMT_VARIADIC_(char, ReturnType, func, return func, __VA_ARGS__)
+
+#define FMT_VARIADIC_W(ReturnType, func, ...) \
+  FMT_VARIADIC_(wchar_t, ReturnType, func, return func, __VA_ARGS__)
+
+#define FMT_CAPTURE_ARG_(id, index) ::fmt::arg(#id, id)
+
+#define FMT_CAPTURE_ARG_W_(id, index) ::fmt::arg(L###id, id)
+
+/**
+  \rst
+  Convenient macro to capture the arguments' names and values into several
+  ``fmt::arg(name, value)``.
+
+  **Example**::
+
+    int x = 1, y = 2;
+    print("point: ({x}, {y})", FMT_CAPTURE(x, y));
+    // same as:
+    // print("point: ({x}, {y})", arg("x", x), arg("y", y));
+
+  \endrst
+ */
+#define FMT_CAPTURE(...) FMT_FOR_EACH(FMT_CAPTURE_ARG_, __VA_ARGS__)
+
+#define FMT_CAPTURE_W(...) FMT_FOR_EACH(FMT_CAPTURE_ARG_W_, __VA_ARGS__)
+
+namespace fmt {
+FMT_VARIADIC(std::string, format, CStringRef)
+FMT_VARIADIC_W(std::wstring, format, WCStringRef)
+FMT_VARIADIC(void, print, CStringRef)
+FMT_VARIADIC(void, print, std::FILE *, CStringRef)
+FMT_VARIADIC(void, print_colored, Color, CStringRef)
+
+namespace internal {
+template <typename Char>
+inline bool is_name_start(Char c) {  // True for 'a'-'z', 'A'-'Z' and '_'.
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
+}
+
+// Parses an unsigned integer advancing s to the end of the parsed input and
+// reports "number is too big" above INT_MAX. *s must be a digit on entry.
+template <typename Char>
+unsigned parse_nonnegative_int(const Char *&s) {
+  assert('0' <= *s && *s <= '9');
+  // Convert to unsigned to prevent a warning.
+  const unsigned max_int = (std::numeric_limits<int>::max)();
+  unsigned value = 0;
+  do {
+    const unsigned digit = static_cast<unsigned>(*s++ - '0');
+    // Check before multiplying: value * 10 can wrap to a number above value.
+    if (value > (max_int - digit) / 10) {
+      value = (std::numeric_limits<unsigned>::max)();
+      break;
+    }
+    value = value * 10 + digit;
+  } while ('0' <= *s && *s <= '9');
+  if (value > max_int)
+    FMT_THROW(FormatError("number is too big"));
+  return value;
+}
+
+inline void require_numeric_argument(const Arg &arg, char spec) {  // Reports a FormatError (naming spec) unless arg has a numeric type.
+  if (arg.type > Arg::LAST_NUMERIC_TYPE) {  // type value lies past the last numeric Arg type
+    std::string message =
+        fmt::format("format specifier '{}' requires numeric argument", spec);
+    FMT_THROW(fmt::FormatError(message));
+  }
+}
+
+template <typename Char>
+void check_sign(const Char *&s, const Arg &arg) {  // Validates the sign flag at *s against arg, then steps past it.
+  char sign = static_cast<char>(*s);
+  require_numeric_argument(arg, sign);
+  if (arg.type == Arg::UINT || arg.type == Arg::ULONG_LONG) {  // unsigned arguments may not carry a sign flag
+    FMT_THROW(FormatError(fmt::format(
+      "format specifier '{}' requires signed argument", sign)));
+  }
+  ++s;  // consume the sign character
+}
+}  // namespace internal
+
+template <typename Char, typename AF>
+inline internal::Arg BasicFormatter<Char, AF>::get_arg(
+    BasicStringRef<Char> arg_name, const char *&error) {  // Looks an argument up by name; on failure sets error.
+  if (check_no_auto_index(error)) {  // NOTE(review): presumably fails (setting error) once automatic indexing is in use - confirm
+    map_.init(args());
+    const internal::Arg *arg = map_.find(arg_name);
+    if (arg)
+      return *arg;
+    error = "argument not found";
+  }
+  return internal::Arg();  // default-constructed Arg accompanies a non-null error
+}
+
+template <typename Char, typename AF>
+inline internal::Arg BasicFormatter<Char, AF>::parse_arg_index(const Char *&s) {  // Resolves an optional numeric argument id at s.
+  const char *error = FMT_NULL;
+  internal::Arg arg = *s < '0' || *s > '9' ?  // no leading digit: take next_arg(), otherwise the argument at the parsed index
+        next_arg(error) : get_arg(internal::parse_nonnegative_int(s), error);
+  if (error) {
+    FMT_THROW(FormatError(  // if the id is not followed by '}' or ':', report a malformed format string instead of error
+                *s != '}' && *s != ':' ? "invalid format string" : error));
+  }
+  return arg;
+}
+
+template <typename Char, typename AF>
+inline internal::Arg BasicFormatter<Char, AF>::parse_arg_name(const Char *&s) {  // Resolves the argument name starting at s; throws FormatError if lookup fails.
+  assert(internal::is_name_start(*s));
+  const Char *start = s;
+  Char c;
+  do {
+    c = *++s;
+  } while (internal::is_name_start(c) || ('0' <= c && c <= '9'));  // digits are accepted after the first character
+  const char *error = FMT_NULL;
+  internal::Arg arg = get_arg(BasicStringRef<Char>(start, s - start), error);
+  if (error)
+    FMT_THROW(FormatError(error));
+  return arg;
+}
+
+template <typename Char, typename ArgFormatter>
+const Char *BasicFormatter<Char, ArgFormatter>::format(
+    const Char *&format_str, const internal::Arg &arg) {
+  using internal::Arg;
+  const Char *s = format_str;
+  typename ArgFormatter::SpecType spec;
+  if (*s == ':') {
+    if (arg.type == Arg::CUSTOM) {
+      arg.custom.format(this, arg.custom.value, &s);
+      return s;
+    }
+    ++s;
+    // Parse fill and alignment.
+    if (Char c = *s) {
+      const Char *p = s + 1;
+      spec.align_ = ALIGN_DEFAULT;
+      do {
+        switch (*p) {
+          case '<':
+            spec.align_ = ALIGN_LEFT;
+            break;
+          case '>':
+            spec.align_ = ALIGN_RIGHT;
+            break;
+          case '=':
+            spec.align_ = ALIGN_NUMERIC;
+            break;
+          case '^':
+            spec.align_ = ALIGN_CENTER;
+            break;
+        }
+        if (spec.align_ != ALIGN_DEFAULT) {
+          if (p != s) {
+            if (c == '}') break;
+            if (c == '{')
+              FMT_THROW(FormatError("invalid fill character '{'"));
+            s += 2;
+            spec.fill_ = c;
+          } else ++s;
+          if (spec.align_ == ALIGN_NUMERIC)
+            require_numeric_argument(arg, '=');
+          break;
+        }
+      } while (--p >= s);
+    }
+
+    // Parse sign.
+    switch (*s) {
+      case '+':
+        check_sign(s, arg);
+        spec.flags_ |= SIGN_FLAG | PLUS_FLAG;
+        break;
+      case '-':
+        check_sign(s, arg);
+        spec.flags_ |= MINUS_FLAG;
+        break;
+      case ' ':
+        check_sign(s, arg);
+        spec.flags_ |= SIGN_FLAG;
+        break;
+    }
+
+    if (*s == '#') {
+      require_numeric_argument(arg, '#');
+      spec.flags_ |= HASH_FLAG;
+      ++s;
+    }
+
+    // Parse zero flag.
+    if (*s == '0') {
+      require_numeric_argument(arg, '0');
+      spec.align_ = ALIGN_NUMERIC;
+      spec.fill_ = '0';
+      ++s;
+    }
+
+    // Parse width.
+    if ('0' <= *s && *s <= '9') {
+      spec.width_ = internal::parse_nonnegative_int(s);
+    } else if (*s == '{') {
+      ++s;
+      Arg width_arg = internal::is_name_start(*s) ?
+            parse_arg_name(s) : parse_arg_index(s);
+      if (*s++ != '}')
+        FMT_THROW(FormatError("invalid format string"));
+      ULongLong value = 0;
+      switch (width_arg.type) {
+      case Arg::INT:
+        if (width_arg.int_value < 0)
+          FMT_THROW(FormatError("negative width"));
+        value = width_arg.int_value;
+        break;
+      case Arg::UINT:
+        value = width_arg.uint_value;
+        break;
+      case Arg::LONG_LONG:
+        if (width_arg.long_long_value < 0)
+          FMT_THROW(FormatError("negative width"));
+        value = width_arg.long_long_value;
+        break;
+      case Arg::ULONG_LONG:
+        value = width_arg.ulong_long_value;
+        break;
+      default:
+        FMT_THROW(FormatError("width is not integer"));
+      }
+      if (value > (std::numeric_limits<int>::max)())
+        FMT_THROW(FormatError("number is too big"));
+      spec.width_ = static_cast<int>(value);
+    }
+
+    // Parse precision.
+    if (*s == '.') {
+      ++s;
+      spec.precision_ = 0;
+      if ('0' <= *s && *s <= '9') {
+        spec.precision_ = internal::parse_nonnegative_int(s);
+      } else if (*s == '{') {
+        ++s;
+        Arg precision_arg = internal::is_name_start(*s) ?
+              parse_arg_name(s) : parse_arg_index(s);
+        if (*s++ != '}')
+          FMT_THROW(FormatError("invalid format string"));
+        ULongLong value = 0;
+        switch (precision_arg.type) {
+          case Arg::INT:
+            if (precision_arg.int_value < 0)
+              FMT_THROW(FormatError("negative precision"));
+            value = precision_arg.int_value;
+            break;
+          case Arg::UINT:
+            value = precision_arg.uint_value;
+            break;
+          case Arg::LONG_LONG:
+            if (precision_arg.long_long_value < 0)
+              FMT_THROW(FormatError("negative precision"));
+            value = precision_arg.long_long_value;
+            break;
+          case Arg::ULONG_LONG:
+            value = precision_arg.ulong_long_value;
+            break;
+          default:
+            FMT_THROW(FormatError("precision is not integer"));
+        }
+        if (value > (std::numeric_limits<int>::max)())
+          FMT_THROW(FormatError("number is too big"));
+        spec.precision_ = static_cast<int>(value);
+      } else {
+        FMT_THROW(FormatError("missing precision specifier"));
+      }
+      if (arg.type <= Arg::LAST_INTEGER_TYPE || arg.type == Arg::POINTER) {
+        FMT_THROW(FormatError(
+            fmt::format("precision not allowed in {} format specifier",
+            arg.type == Arg::POINTER ? "pointer" : "integer")));
+      }
+    }
+
+    // Parse type.
+    if (*s != '}' && *s)
+      spec.type_ = static_cast<char>(*s++);
+  }
+
+  if (*s++ != '}')
+    FMT_THROW(FormatError("missing '}' in format string"));
+
+  // Format argument.
+  ArgFormatter(*this, spec, s - 1).visit(arg);
+  return s;
+}
+
+template <typename Char, typename AF>
+void BasicFormatter<Char, AF>::format(BasicCStringRef<Char> format_str) {  // Walks format_str, passing literal text to write() and formatting each "{...}" field.
+  const Char *s = format_str.c_str();
+  const Char *start = s;  // beginning of the literal text not yet passed to write()
+  while (*s) {
+    Char c = *s++;
+    if (c != '{' && c != '}') continue;
+    if (*s == c) {  // doubled brace: pass the text through the first brace to write(), skip the second
+      write(writer_, start, s);
+      start = ++s;
+      continue;
+    }
+    if (c == '}')
+      FMT_THROW(FormatError("unmatched '}' in format string"));
+    write(writer_, start, s - 1);  // literal text preceding the '{'
+    internal::Arg arg = internal::is_name_start(*s) ?
+          parse_arg_name(s) : parse_arg_index(s);
+    start = s = format(s, arg);  // resume at the position returned by the field formatter
+  }
+  write(writer_, start, s);
+}
+
+template <typename Char, typename It>
+struct ArgJoin {  // Iterator range plus separator, as returned by join().
+  It first;
+  It last;
+  BasicCStringRef<Char> sep;  // separator string kept alongside the range
+
+  ArgJoin(It first, It last, const BasicCStringRef<Char>& sep) :
+    first(first),
+    last(last),
+    sep(sep) {}
+};
+
+template <typename It>
+ArgJoin<char, It> join(It first, It last, const BasicCStringRef<char>& sep) {
+  return ArgJoin<char, It>(first, last, sep);
+}
+
+template <typename It>
+ArgJoin<wchar_t, It> join(It first, It last, const BasicCStringRef<wchar_t>& sep) {
+  return ArgJoin<wchar_t, It>(first, last, sep);
+}
+
+#if FMT_HAS_GXX_CXX11
+template <typename Range>
+auto join(const Range& range, const BasicCStringRef<char>& sep)
+    -> ArgJoin<char, decltype(std::begin(range))> {
+  return join(std::begin(range), std::end(range), sep);
+}
+
+template <typename Range>
+auto join(const Range& range, const BasicCStringRef<wchar_t>& sep)
+    -> ArgJoin<wchar_t, decltype(std::begin(range))> {
+  return join(std::begin(range), std::end(range), sep);
+}
+#endif
+
+template <typename ArgFormatter, typename Char, typename It>
+void format_arg(fmt::BasicFormatter<Char, ArgFormatter> &f,
+    const Char *&format_str, const ArgJoin<Char, It>& e) {  // Formats each element of e from the same spec, writing e.sep between elements.
+  const Char* end = format_str;
+  if (*end == ':')
+    ++end;
+  while (*end && *end != '}')  // scan forward to the first '}'
+    ++end;
+  if (*end != '}')
+    FMT_THROW(FormatError("missing '}' in format string"));
+
+  It it = e.first;
+  if (it != e.last) {
+    const Char* save = format_str;  // spec start, restored before every further element
+    f.format(format_str, internal::MakeArg<fmt::BasicFormatter<Char, ArgFormatter> >(*it++));
+    while (it != e.last) {
+      f.writer().write(e.sep);
+      format_str = save;
+      f.format(format_str, internal::MakeArg<fmt::BasicFormatter<Char, ArgFormatter> >(*it++));
+    }
+  }
+  format_str = end + 1;  // leave the caller positioned just past the '}'
+}
+}  // namespace fmt
+
+#if FMT_USE_USER_DEFINED_LITERALS
+namespace fmt {
+namespace internal {
+
+template <typename Char>
+struct UdlFormat {
+  const Char *str;
+
+  template <typename... Args>
+  auto operator()(Args && ... args) const
+                  -> decltype(format(str, std::forward<Args>(args)...)) {
+    return format(str, std::forward<Args>(args)...);
+  }
+};
+
+template <typename Char>
+struct UdlArg {
+  const Char *str;
+
+  template <typename T>
+  NamedArgWithType<Char, T> operator=(T &&value) const {
+    return {str, std::forward<T>(value)};
+  }
+};
+
+} // namespace internal
+
+inline namespace literals {
+
+/**
+  \rst
+  C++11 literal equivalent of :func:`fmt::format`.
+
+  **Example**::
+
+    using namespace fmt::literals;
+    std::string message = "The answer is {}"_format(42);
+  \endrst
+ */
+inline internal::UdlFormat<char>
+operator"" _format(const char *s, std::size_t) { return {s}; }
+inline internal::UdlFormat<wchar_t>
+operator"" _format(const wchar_t *s, std::size_t) { return {s}; }
+
+/**
+  \rst
+  C++11 literal equivalent of :func:`fmt::arg`.
+
+  **Example**::
+
+    using namespace fmt::literals;
+    print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
+  \endrst
+ */
+inline internal::UdlArg<char>
+operator"" _a(const char *s, std::size_t) { return {s}; }
+inline internal::UdlArg<wchar_t>
+operator"" _a(const wchar_t *s, std::size_t) { return {s}; }
+
+} // inline namespace literals
+} // namespace fmt
+#endif // FMT_USE_USER_DEFINED_LITERALS
+
+// Restore warnings.
+#if FMT_GCC_VERSION >= 406
+# pragma GCC diagnostic pop
+#endif
+
+#if defined(__clang__) && !defined(FMT_ICC_VERSION)
+# pragma clang diagnostic pop
+#endif
+
+#ifdef FMT_HEADER_ONLY
+# define FMT_FUNC inline
+# include "format.cc"
+#else
+# define FMT_FUNC
+#endif
+
+#endif  // FMT_FORMAT_H_
diff --git a/include/vtkmdiy/fmt/ostream.cc b/include/vtkmdiy/fmt/ostream.cc
new file mode 100644
index 000000000..2d443f730
--- /dev/null
+++ b/include/vtkmdiy/fmt/ostream.cc
@@ -0,0 +1,35 @@
+/*
+ Formatting library for C++ - std::ostream support
+
+ Copyright (c) 2012 - 2016, Victor Zverovich
+ All rights reserved.
+
+ For the license information refer to format.h.
+ */
+
+#include "ostream.h"
+
+namespace fmt {
+
+namespace internal {
+FMT_FUNC void write(std::ostream &os, Writer &w) {  // Writes the content of w to os.
+  const char *data = w.data();
+  typedef internal::MakeUnsigned<std::streamsize>::Type UnsignedStreamSize;
+  UnsignedStreamSize size = w.size();
+  UnsignedStreamSize max_size =
+      internal::to_unsigned((std::numeric_limits<std::streamsize>::max)());
+  do {  // os.write takes a streamsize count, so pass at most max_size bytes per call
+    UnsignedStreamSize n = size <= max_size ? size : max_size;
+    os.write(data, static_cast<std::streamsize>(n));
+    data += n;
+    size -= n;
+  } while (size != 0);
+}
+}
+
+FMT_FUNC void print(std::ostream &os, CStringRef format_str, ArgList args) {  // Formats into a MemoryWriter, then writes the result to os.
+  MemoryWriter w;
+  w.write(format_str, args);
+  internal::write(os, w);
+}
+}  // namespace fmt
diff --git a/include/vtkmdiy/fmt/ostream.h b/include/vtkmdiy/fmt/ostream.h
new file mode 100644
index 000000000..84a02d173
--- /dev/null
+++ b/include/vtkmdiy/fmt/ostream.h
@@ -0,0 +1,105 @@
+/*
+ Formatting library for C++ - std::ostream support
+
+ Copyright (c) 2012 - 2016, Victor Zverovich
+ All rights reserved.
+
+ For the license information refer to format.h.
+ */
+
+#ifndef FMT_OSTREAM_H_
+#define FMT_OSTREAM_H_
+
+#include "format.h"
+#include <ostream>
+
+namespace fmt {
+
+namespace internal {
+
+template <class Char>
+class FormatBuf : public std::basic_streambuf<Char> {
+ private:
+  typedef typename std::basic_streambuf<Char>::int_type int_type;
+  typedef typename std::basic_streambuf<Char>::traits_type traits_type;
+
+  Buffer<Char> &buffer_;
+
+ public:
+  FormatBuf(Buffer<Char> &buffer) : buffer_(buffer) {}
+
+ protected:
+  // The put-area is actually always empty. This makes the implementation
+  // simpler and has the advantage that the streambuf and the buffer are always
+  // in sync and sputc never writes into uninitialized memory. The obvious
+  // disadvantage is that each call to sputc always results in a (virtual) call
+  // to overflow. There is no disadvantage here for sputn since this always
+  // results in a call to xsputn.
+
+  int_type overflow(int_type ch = traits_type::eof()) FMT_OVERRIDE {
+    if (!traits_type::eq_int_type(ch, traits_type::eof()))
+      buffer_.push_back(static_cast<Char>(ch));
+    return ch;
+  }
+
+  std::streamsize xsputn(const Char *s, std::streamsize count) FMT_OVERRIDE {
+    buffer_.append(s, s + count);
+    return count;
+  }
+};
+
+Yes &convert(std::ostream &);
+
+struct DummyStream : std::ostream {
+  DummyStream();  // Suppress a bogus warning in MSVC.
+  // Hide all operator<< overloads from std::ostream.
+  void operator<<(Null<>);
+};
+
+No &operator<<(std::ostream &, int);
+
+template<typename T>
+struct ConvertToIntImpl<T, true> {
+  // Convert to int only if T doesn't have an overloaded operator<<.
+  enum {
+    value = sizeof(convert(get<DummyStream>() << get<T>())) == sizeof(No)
+  };
+};
+
+// Write the content of w to os.
+FMT_API void write(std::ostream &os, Writer &w);
+}  // namespace internal
+
+// Formats a value by streaming it with operator<< and formatting the text.
+template <typename Char, typename ArgFormatter_, typename T>
+void format_arg(BasicFormatter<Char, ArgFormatter_> &f,
+                const Char *&format_str, const T &value) {
+  internal::MemoryBuffer<Char, internal::INLINE_BUFFER_SIZE> buffer;
+
+  internal::FormatBuf<Char> format_buf(buffer);  // streambuf that appends into buffer
+  std::basic_ostream<Char> output(&format_buf);
+  output << value;
+
+  BasicStringRef<Char> str(&buffer[0], buffer.size());
+  typedef internal::MakeArg< BasicFormatter<Char> > MakeArg;
+  format_str = f.format(format_str, MakeArg(str));  // format the captured text as a string argument
+}
+
+/**
+  \rst
+  Prints formatted data to the stream *os*.
+
+  **Example**::
+
+    print(cerr, "Don't {}!", "panic");
+  \endrst
+ */
+FMT_API void print(std::ostream &os, CStringRef format_str, ArgList args);
+FMT_VARIADIC(void, print, std::ostream &, CStringRef)
+}  // namespace fmt
+
+#ifdef FMT_HEADER_ONLY
+# include "ostream.cc"
+#endif
+
+#endif  // FMT_OSTREAM_H_
diff --git a/include/vtkmdiy/grid.hpp b/include/vtkmdiy/grid.hpp
new file mode 100644
index 000000000..cfdb72a65
--- /dev/null
+++ b/include/vtkmdiy/grid.hpp
@@ -0,0 +1,153 @@
+#ifndef DIY_GRID_HPP
+#define DIY_GRID_HPP
+
+#include "point.hpp"
+
+namespace diy
+{
+
+template<class C, unsigned D>
+struct Grid;
+
+template<class C, unsigned D>
+struct GridRef
+{
+    public:
+        typedef     C                                           Value;
+
+        typedef     Point<int, D>                               Vertex;
+        typedef     size_t                                      Index;
+
+    public:
+        template<class Int>
+                GridRef(C* data, const Point<Int,D>& shape, bool c_order = true):
+                    data_(data), shape_(shape), c_order_(c_order)   { set_stride(); }
+
+                GridRef(Grid<C,D>& g):
+                    data_(g.data()), shape_(g.shape()),
+                    c_order_(g.c_order())                       { set_stride(); }
+
+        template<class Int>
+        C       operator()(const Point<Int, D>& v) const        { return (*this)(index(v)); }
+
+        template<class Int>
+        C&      operator()(const Point<Int, D>& v)              { return (*this)(index(v)); }
+
+        C       operator()(Index i) const                       { return data_[i]; }
+        C&      operator()(Index i)                             { return data_[i]; }
+
+        const Vertex&
+                shape() const                                   { return shape_; }
+
+        const C*
+                data() const                                    { return data_; }
+        C*      data()                                          { return data_; }
+
+        // Set every element to the given value
+        GridRef&    operator=(C value)                          { Index s = size(); for (Index i = 0; i < s; ++i) data_[i] = value; return *this; }
+        GridRef&    operator/=(C value)                         { Index s = size(); for (Index i = 0; i < s; ++i) data_[i] /= value; return *this; }
+
+        Vertex      vertex(Index idx) const                     { Vertex v; for (unsigned k = 0; k < D; ++k) { unsigned i = c_order_ ? k : D - 1 - k; /* inverse of index(): peel off the largest stride first, which is dimension 0 in C order and D-1 otherwise */ v[i] = idx / stride_[i]; idx %= stride_[i]; } return v; }
+        Index       index(const Vertex& v) const                { Index idx = 0; for (unsigned i = 0; i < D; ++i) { idx += ((Index) v[i]) * ((Index) stride_[i]); } return idx; }
+
+        Index       size() const                                { return size(shape()); }
+        void        swap(GridRef& other)                        { std::swap(data_, other.data_); std::swap(shape_, other.shape_); std::swap(stride_, other.stride_); std::swap(c_order_, other.c_order_); }
+
+        bool        c_order() const                             { return c_order_; }
+
+        static constexpr
+        unsigned    dimension()                                 { return D; }
+
+    protected:
+        static Index
+                size(const Vertex& v)                           { Index res = 1; for (unsigned i = 0; i < D; ++i) res *= v[i]; return res; }
+
+        void    set_stride()
+        {
+            Index cur = 1;
+            if (c_order_)
+                for (unsigned i = D; i > 0; --i) { stride_[i-1] = cur; cur *= shape_[i-1]; }
+            else
+                for (unsigned i = 0; i < D; ++i) { stride_[i] = cur; cur *= shape_[i]; }
+
+        }
+        void    set_shape(const Vertex& v)                      { shape_ = v; set_stride(); }
+        void    set_data(C* data)                               { data_ = data; }
+        void    set_c_order(bool order)                         { c_order_ = order; }
+
+    private:
+        C*      data_;
+        Vertex  shape_;
+        Vertex  stride_;
+        bool    c_order_;
+};
+
+
+template<class C, unsigned D>
+struct Grid: public GridRef<C,D>
+{
+    public:
+        typedef     GridRef<C,D>                                Parent;
+        typedef     typename Parent::Value                      Value;
+        typedef     typename Parent::Index                      Index;
+        typedef     typename Parent::Vertex                     Vertex;
+        typedef     Parent                                      Reference;
+
+        template<class U>
+        struct rebind { typedef Grid<U,D>                       type; };
+
+    public:
+                Grid():
+                    Parent(new C[0], Vertex::zero())            {}
+        template<class Int>
+                Grid(const Point<Int, D>& shape, bool c_order = true):
+                    Parent(new C[size(shape)], shape, c_order)
+                {}
+
+                Grid(Grid&& g): Grid()                          { Parent::swap(g); }
+
+                Grid(const Parent& g):
+                    Parent(new C[size(g.shape())], g.shape(),
+                           g.c_order())                         { copy_data(g.data()); }
+
+        template<class OtherGrid>
+                Grid(const OtherGrid& g):
+                    Parent(new C[size(g.shape())],
+                           g.shape(),
+                           g.c_order())                         { copy_data(g.data()); }
+
+                ~Grid()                                         { delete[] Parent::data(); }
+
+        template<class OC>
+        Grid&   operator=(const GridRef<OC, D>& other)  // Replaces this grid with a converted copy of other's elements, shape and order
+        {
+            Index s = size(other.shape());
+            C* fresh = new C[s];                        // allocate before releasing: if this throws, the grid still owns its old array
+            for (Index i = 0; i < s; ++i) fresh[i] = other.data()[i];
+            delete[] Parent::data(); Parent::set_data(fresh);
+            Parent::set_c_order(other.c_order());       // NB: order needs to be set before the shape, to set the stride correctly
+            Parent::set_shape(other.shape());
+            return *this;
+        }
+
+        Grid&   operator=(Grid&& g)                             { Parent::swap(g); return *this; }
+
+        using Parent::data;
+        using Parent::shape;
+        using Parent::operator();
+        using Parent::operator=;
+        using Parent::size;
+
+    private:
+        template<class OC>
+        void    copy_data(const OC* data)
+        {
+            Index s = size(shape());
+            for (Index i = 0; i < s; ++i)
+                Parent::data()[i] = data[i];
+        }
+};
+
+}
+
+#endif
diff --git a/include/vtkmdiy/io/block.hpp b/include/vtkmdiy/io/block.hpp
new file mode 100644
index 000000000..05e45a800
--- /dev/null
+++ b/include/vtkmdiy/io/block.hpp
@@ -0,0 +1,396 @@
+#ifndef DIY_IO_BLOCK_HPP
+#define DIY_IO_BLOCK_HPP
+
+#include <string>
+#include <algorithm>
+#include <stdexcept>
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <dirent.h>
+
+#include "../mpi.hpp"
+#include "../assigner.hpp"
+#include "../master.hpp"
+#include "../storage.hpp"
+#include "../log.hpp"
+
+// Read and write collections of blocks using MPI-IO
+namespace diy
+{
+namespace io
+{
+  namespace detail
+  {
+    typedef mpi::io::offset                 offset_t;
+
+    struct GidOffsetCount                   // index entry: where the serialized block `gid` starts in the file and how long it is
+    {
+        int         gid;
+        offset_t    offset;
+        offset_t    count;
+
+        // a default state is required so that a vector of a given size can be created
+        GidOffsetCount(): gid(-1), offset(0), count(0) {}
+
+        GidOffsetCount(int gid_, offset_t offset_, offset_t count_):
+            gid(gid_), offset(offset_), count(count_) {}
+
+        bool operator<(const GidOffsetCount& other) const { return other.gid > gid; }   // entries are ordered by gid only
+    };
+  }
+}
+
+// Serialize GidOffsetCount explicitly, to avoid alignment and uninitialized data issues
+// (to get identical output files given the same block input)
+template<>
+struct Serialization<io::detail::GidOffsetCount>
+{
+    typedef             io::detail::GidOffsetCount                  GidOffsetCount;
+
+    static void         save(BinaryBuffer& bb, const GidOffsetCount& x)     // writes the three fields one by one: gid, offset, count
+    {
+      diy::save(bb, x.gid);
+      diy::save(bb, x.offset);
+      diy::save(bb, x.count);
+    }
+
+    static void         load(BinaryBuffer& bb, GidOffsetCount& x)           // reads the fields in the order save() wrote them
+    {
+      diy::load(bb, x.gid);
+      diy::load(bb, x.offset);
+      diy::load(bb, x.count);
+    }
+};
+
+namespace io
+{
+/**
+ * \ingroup IO
+ * \brief Write blocks to storage collectively in one shared file
+ */
+  inline
+  void
+  write_blocks(const std::string&           outfilename,           //!< output file name
+               const mpi::communicator&     comm,                  //!< communicator
+               Master&                      master,                //!< master object
+               const MemoryBuffer&          extra = MemoryBuffer(),//!< user-defined metadata for file header; meaningful only on rank == 0
+               Master::SaveBlock            save = 0)              //!< block save function in case different than or undefined in the master
+  {
+    if (!save) save = master.saver();       // save is likely to be different from master.save()
+
+    typedef detail::offset_t                offset_t;
+    typedef detail::GidOffsetCount          GidOffsetCount;
+
+    unsigned size = master.size(),          // number of blocks held by this rank
+             max_size, min_size;
+    mpi::all_reduce(comm, size, max_size, mpi::maximum<unsigned>());    // every rank takes part in max_size rounds below
+    mpi::all_reduce(comm, size, min_size, mpi::minimum<unsigned>());    // in the first min_size rounds every rank has a block to write
+
+    // truncate the file
+    if (comm.rank() == 0)
+        truncate(outfilename.c_str(), 0);   // NOTE(review): the return value is not checked
+
+    mpi::io::file f(comm, outfilename, mpi::io::file::wronly | mpi::io::file::create);
+
+    offset_t  start = 0, shift;             // start: file position where the current round begins; shift: bytes written by all ranks in one round
+    std::vector<GidOffsetCount>     offset_counts;
+    unsigned i;
+    for (i = 0; i < max_size; ++i)
+    {
+      offset_t count = 0,
+               offset;
+      if (i < size)
+      {
+        // get the block from master and serialize it
+        const void* block = master.get(i);
+        MemoryBuffer bb;
+        LinkFactory::save(bb, master.link(i));      // the link goes first, followed by the block itself
+        save(block, bb);
+        count = bb.buffer.size();
+        mpi::scan(comm, count, offset, std::plus<offset_t>());      // presumably an inclusive prefix sum over ranks (MPI_Scan semantics) - TODO confirm
+        offset += start - count;                                    // under that assumption: where this rank's block starts in the file
+        mpi::all_reduce(comm, count, shift, std::plus<offset_t>());
+        start += shift;
+
+        if (i < min_size)       // up to min_size, we can do collective IO
+          f.write_at_all(offset, bb.buffer);
+        else
+          f.write_at(offset, bb.buffer);
+
+        offset_counts.push_back(GidOffsetCount(master.gid(i), offset, count));
+      } else
+      {
+        // matching global operations
+        mpi::scan(comm, count, offset, std::plus<offset_t>());
+        mpi::all_reduce(comm, count, shift, std::plus<offset_t>());
+
+        // -1 indicates that there is no block written here from this rank
+        offset_counts.push_back(GidOffsetCount(-1, offset, count));
+      }
+    }
+
+    if (comm.rank() == 0)
+    {
+      // round-about way of gather vector of vectors of GidOffsetCount to avoid registering a new mpi datatype
+      std::vector< std::vector<char> > gathered_offset_count_buffers;
+      MemoryBuffer oc_buffer; diy::save(oc_buffer, offset_counts);
+      mpi::gather(comm, oc_buffer.buffer, gathered_offset_count_buffers, 0);
+
+      std::vector<GidOffsetCount>  all_offset_counts;
+      for (unsigned i = 0; i < gathered_offset_count_buffers.size(); ++i)       // NB: i, oc_buffer and offset_counts shadow the outer variables here
+      {
+        MemoryBuffer oc_buffer; oc_buffer.buffer.swap(gathered_offset_count_buffers[i]);
+        std::vector<GidOffsetCount> offset_counts;
+        diy::load(oc_buffer, offset_counts);
+        for (unsigned j = 0; j < offset_counts.size(); ++j)
+          if (offset_counts[j].gid != -1)                                       // skip the placeholder entries of ranks that had no block in a round
+            all_offset_counts.push_back(offset_counts[j]);
+      }
+      std::sort(all_offset_counts.begin(), all_offset_counts.end());        // sorts by gid
+
+      MemoryBuffer bb;
+      diy::save(bb, all_offset_counts);
+      diy::save(bb, extra);
+      size_t footer_size = bb.size();       // saved last: read_blocks() reads it from the final sizeof(size_t) bytes of the file
+      diy::save(bb, footer_size);
+
+      // find footer_offset as the max of (offset + count)
+      offset_t footer_offset = 0;
+      for (unsigned i = 0; i < all_offset_counts.size(); ++i)
+      {
+        offset_t end = all_offset_counts[i].offset + all_offset_counts[i].count;
+        if (end > footer_offset)
+            footer_offset = end;
+      }
+      f.write_at(footer_offset, bb.buffer);
+    } else
+    {
+      MemoryBuffer oc_buffer; diy::save(oc_buffer, offset_counts);
+      mpi::gather(comm, oc_buffer.buffer, 0);       // matches the gather issued by rank 0 above
+    }
+  }
+
+/**
+ * \ingroup IO
+ * \brief Read blocks from storage collectively from one shared file
+ */
+    inline
+    void
+    read_blocks(const std::string&           infilename,     //!< input file name
+                const mpi::communicator&     comm,           //!< communicator
+                Assigner&                    assigner,       //!< assigner object
+                Master&                      master,         //!< master object
+                MemoryBuffer&                extra,          //!< user-defined metadata in file header
+                Master::LoadBlock            load = 0)       //!< load block function in case different than or undefined in the master
+    {
+        if (!load) load = master.loader();      // load is likely to be different from master.load()
+
+        typedef detail::offset_t                offset_t;
+        typedef detail::GidOffsetCount          GidOffsetCount;
+
+        mpi::io::file f(comm, infilename, mpi::io::file::rdonly);
+
+        offset_t    footer_offset = f.size() - sizeof(size_t);     // write_blocks() saves the footer size as the last item of the file
+        size_t footer_size;
+
+        // Read the size
+        f.read_at_all(footer_offset, (char*) &footer_size, sizeof(footer_size));
+
+        // Read all_offset_counts
+        footer_offset -= footer_size;
+        MemoryBuffer footer;
+        footer.buffer.resize(footer_size);
+        f.read_at_all(footer_offset, footer.buffer);
+
+        std::vector<GidOffsetCount>  all_offset_counts;
+        diy::load(footer, all_offset_counts);
+        diy::load(footer, extra);
+        extra.reset();                          // presumably rewinds the buffer so the caller reads it from the start - TODO confirm
+
+        // Get local gids from assigner
+        size_t size = all_offset_counts.size();
+        assigner.set_nblocks(size);
+        std::vector<int> gids;
+        assigner.local_gids(comm.rank(), gids);
+
+        for (unsigned i = 0; i < gids.size(); ++i)
+        {
+            if (gids[i] != all_offset_counts[gids[i]].gid)      // the table is indexed by gid: this relies on it being sorted by gid with no gaps
+                get_logger()->warn("gids don't match in diy::io::read_blocks(), {} vs {}",
+                                   gids[i], all_offset_counts[gids[i]].gid);
+
+            offset_t offset = all_offset_counts[gids[i]].offset,
+                     count  = all_offset_counts[gids[i]].count;
+            MemoryBuffer bb;
+            bb.buffer.resize(count);
+            f.read_at(offset, bb.buffer);
+            Link* l = LinkFactory::load(bb);    // the link was written ahead of the block by write_blocks()
+            l->fix(assigner);                   // recompute the neighbors' ranks for the current assignment
+            void* b = master.create();
+            load(b, bb);
+            master.add(gids[i], b, l);
+        }
+    }
+
+
+  // Functions without the extra buffer, for compatibility with the old code
+  inline
+  void
+  write_blocks(const std::string&           outfilename,
+               const mpi::communicator&     comm,
+               Master&                      master,
+               Master::SaveBlock            save)
+  {
+    // forward to the full overload with empty user metadata
+    write_blocks(outfilename, comm, master, MemoryBuffer(), save);
+  }
+
+  inline
+  void
+  read_blocks(const std::string&           infilename,
+              const mpi::communicator&     comm,
+              Assigner&                    assigner,
+              Master&                      master,
+              Master::LoadBlock            load = 0)
+  {
+    MemoryBuffer discarded_extra;   // the metadata stored in the file is read, then dropped
+    read_blocks(infilename, comm, assigner, master, discarded_extra, load);
+  }
+
+namespace split
+{
+/**
+ * \ingroup IO
+ * \brief Write blocks to storage independently, one file per block inside the directory `outfilename`; throws std::runtime_error if the directory or any file cannot be opened
+ */
+  inline
+  void
+  write_blocks(const std::string&           outfilename,           //!< output file name
+               const mpi::communicator&     comm,                  //!< communicator
+               Master&                      master,                //!< master object
+               const MemoryBuffer&          extra = MemoryBuffer(),//!< user-defined metadata for file header; meaningful only on rank == 0
+               Master::SaveBlock            save = 0)              //!< block save function in case different than or undefined in master
+  {
+    if (!save) save = master.saver();       // save is likely to be different from master.save()
+
+    bool proceed = false;
+    size_t size = 0;                        // total number of blocks over all ranks; only meaningful on rank 0
+    if (comm.rank() == 0)
+    {
+        struct stat s;
+        if (stat(outfilename.c_str(), &s) == 0)
+        {
+            if (S_ISDIR(s.st_mode))         // the path exists: usable only if it is a directory
+                proceed = true;
+        } else if (mkdir(outfilename.c_str(), 0755) == 0)
+            proceed = true;
+        mpi::broadcast(comm, proceed, 0);   // every rank learns whether the directory is usable
+        mpi::reduce(comm, (size_t) master.size(), size, 0, std::plus<size_t>());
+    } else
+    {
+        mpi::broadcast(comm, proceed, 0);
+        mpi::reduce(comm, (size_t) master.size(), 0, std::plus<size_t>());
+    }
+
+    if (!proceed)
+        throw std::runtime_error("Cannot access or create directory: " + outfilename);
+
+    for (int i = 0; i < (int)master.size(); ++i)
+    {
+        const void* block = master.get(i);
+
+        std::string filename = fmt::format("{}/{}", outfilename, master.gid(i));
+        ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "w"));
+        if (!bb.file) throw std::runtime_error("Cannot open file for writing: " + filename);   // fopen() yields a null pointer on failure
+
+        LinkFactory::save(bb, master.link(i));
+        save(block, bb);
+
+        fclose(bb.file);
+    }
+
+    if (comm.rank() == 0)                   // save the extra buffer, preceded by the total block count
+    {
+        std::string filename = outfilename + "/extra";
+        ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "w"));
+        if (!bb.file) throw std::runtime_error("Cannot open file for writing: " + filename);
+        ::diy::save(bb, size);
+        ::diy::save(bb, extra);
+        fclose(bb.file);
+    }
+  }
+
+/**
+ * \ingroup IO
+ * \brief Read blocks from storage independently, one file per block inside the directory `infilename`; throws std::runtime_error if a file cannot be opened
+ */
+  inline
+  void
+  read_blocks(const std::string&           infilename,  //!< input file name
+              const mpi::communicator&     comm,        //!< communicator
+              Assigner&                    assigner,    //!< assigner object
+              Master&                      master,      //!< master object
+              MemoryBuffer&                extra,       //!< user-defined metadata in file header
+              Master::LoadBlock            load = 0)    //!< block load function in case different than or undefined in master
+  {
+    if (!load) load = master.loader();      // load is likely to be different from master.load()
+    // load the extra buffer and size (the total number of blocks, as written by split::write_blocks)
+    size_t          size;
+    std::string filename = infilename + "/extra";
+    ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "r"));
+    if (!bb.file) throw std::runtime_error("Cannot open file for reading: " + filename);   // fopen() yields a null pointer on failure
+    ::diy::load(bb, size);
+    ::diy::load(bb, extra);
+    extra.reset();
+    fclose(bb.file);
+
+    // Get local gids from assigner
+    assigner.set_nblocks(size);
+    std::vector<int> gids;
+    assigner.local_gids(comm.rank(), gids);
+
+    // Read our blocks;
+    for (unsigned i = 0; i < gids.size(); ++i)
+    {
+        std::string filename = fmt::format("{}/{}", infilename, gids[i]);
+        ::diy::detail::FileBuffer bb(fopen(filename.c_str(), "r"));
+        if (!bb.file) throw std::runtime_error("Cannot open file for reading: " + filename);
+        Link* l = LinkFactory::load(bb);    // the link precedes the block in each file
+        l->fix(assigner);
+        void* b = master.create();
+        load(b, bb);
+        master.add(gids[i], b, l);
+
+        fclose(bb.file);
+    }
+  }
+
+  // Functions without the extra buffer, for compatibility with the old code
+  inline
+  void
+  write_blocks(const std::string&           outfilename,
+               const mpi::communicator&     comm,
+               Master&                      master,
+               Master::SaveBlock            save)
+  {
+    // forward to the full overload with empty user metadata
+    write_blocks(outfilename, comm, master, MemoryBuffer(), save);
+  }
+
+  inline
+  void
+  read_blocks(const std::string&           infilename,
+              const mpi::communicator&     comm,
+              Assigner&                    assigner,
+              Master&                      master,
+              Master::LoadBlock            load = 0)
+  {
+    MemoryBuffer discarded_extra;   // the metadata stored in the directory is read, then dropped
+    read_blocks(infilename, comm, assigner, master, discarded_extra, load);
+  }
+} // split
+} // io
+} // diy
+
+#endif
diff --git a/include/vtkmdiy/io/bov.hpp b/include/vtkmdiy/io/bov.hpp
new file mode 100644
index 000000000..bd8b24009
--- /dev/null
+++ b/include/vtkmdiy/io/bov.hpp
@@ -0,0 +1,171 @@
+#ifndef DIY_IO_BOV_HPP
+#define DIY_IO_BOV_HPP
+
+#include <vector>
+#include <algorithm>
+#include <numeric>
+
+#include "../types.hpp"
+#include "../mpi.hpp"
+
+namespace diy
+{
+namespace io
+{
+  // Reads and writes subsets of a block of values into specified block bounds
+  class BOV
+  {
+    public:
+      typedef       std::vector<int>                                    Shape;
+      // bind to file f with displacement 0; the shape stays empty until set_shape() is called
+                    BOV(mpi::io::file&    f):
+                      f_(f), offset_(0)                                 {}
+
+      template<class S>
+                    BOV(mpi::io::file&    f,
+                        const S&          shape  = S(),
+                        mpi::io::offset   offset = 0):
+                      f_(f), offset_(offset)                            { set_shape(shape); }
+
+      void          set_offset(mpi::io::offset offset)                  { offset_ = offset; }
+
+      template<class S>
+      void          set_shape(const S& shape)
+      {
+        shape_.clear();
+        stride_.clear();
+        for (unsigned d = 0; d < shape.size(); ++d)
+            shape_.push_back(shape[d]);
+        stride_.resize(shape_.size(), 1);
+        // the last axis keeps stride 1; every earlier axis gets the stride of the
+        // next axis multiplied by the extent of the next axis
+        for (int d = shape_.size() - 2; d >= 0; --d)
+          stride_[d] = stride_[d+1] * shape_[d+1];
+      }
+
+      const Shape&  shape() const                                       { return shape_; }
+
+      template<class T>
+      void          read(const DiscreteBounds& bounds, T* buffer, bool collective = false, int chunk = 1) const;
+
+      template<class T>
+      void          write(const DiscreteBounds& bounds, const T* buffer, bool collective = false, int chunk = 1);
+
+      template<class T>
+      void          write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, bool collective = false, int chunk = 1);
+
+    protected:
+      mpi::io::file&        file()                                        { return f_; }
+
+    private:
+      mpi::io::file&        f_;
+      Shape                 shape_;
+      std::vector<size_t>   stride_;
+      size_t                offset_;
+  };
+}
+}
+
+template<class T>
+void
+diy::io::BOV::
+read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const
+{
+  int dim   = shape_.size();
+  int total = 1;                        // number of T_type items in the requested box
+  std::vector<int> subsizes;
+  for (int i = 0; i < dim; ++i)
+  {
+    subsizes.push_back(bounds.max[i] - bounds.min[i] + 1);      // extent per axis: max - min + 1
+    total *= subsizes.back();
+  }
+
+  MPI_Datatype T_type;
+  if (chunk == 1)
+    T_type = mpi::detail::get_mpi_datatype<T>();
+  else
+  {
+    // create an MPI struct of size chunk to read the data in those chunks
+    // (this allows to work around MPI-IO weirdness where crucial quantities
+    // are ints, which are too narrow of a type)
+    int             array_of_blocklengths[]  = { chunk };
+    MPI_Aint        array_of_displacements[] = { 0 };
+    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
+    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
+    MPI_Type_commit(&T_type);
+  }
+
+  MPI_Datatype fileblk;                 // file view type: the `bounds` sub-box of the full shape_ array, in C order
+  MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
+  MPI_Type_commit(&fileblk);
+
+  MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
+
+  mpi::status s;                        // NOTE(review): none of the MPI return codes in this function are checked
+  if (!collective)
+      MPI_File_read(f_.handle(), buffer, total, T_type, &s.s);
+  else
+      MPI_File_read_all(f_.handle(), buffer, total, T_type, &s.s);
+
+  if (chunk != 1)                       // T_type was created above only in the chunked case
+    MPI_Type_free(&T_type);
+  MPI_Type_free(&fileblk);
+}
+
+template<class T>
+void
+diy::io::BOV::
+write(const DiscreteBounds& bounds, const T* buffer, bool collective, int chunk)
+{
+    this->write<T>(bounds, buffer, /* core = */ bounds, collective, chunk);    // the whole buffer is written
+}
+
+template<class T>
+void
+diy::io::BOV::
+write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, bool collective, int chunk)
+{
+  int dim   = shape_.size();
+  std::vector<int> subsizes;
+  std::vector<int> buffer_shape, buffer_start;
+  for (int i = 0; i < dim; ++i)
+  {
+    buffer_shape.push_back(bounds.max[i] - bounds.min[i] + 1);  // extent of the in-memory buffer, which covers `bounds`
+    buffer_start.push_back(core.min[i] - bounds.min[i]);        // where `core` begins inside that buffer
+    subsizes.push_back(core.max[i] - core.min[i] + 1);          // extent of the part that is taken from the buffer
+  }
+
+  MPI_Datatype T_type;
+  if (chunk == 1)
+    T_type = mpi::detail::get_mpi_datatype<T>();
+  else
+  {
+    // assume T is a binary block and create an MPI struct of appropriate size
+    int             array_of_blocklengths[]  = { chunk };
+    MPI_Aint        array_of_displacements[] = { 0 };
+    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
+    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
+    MPI_Type_commit(&T_type);
+  }
+
+  MPI_Datatype fileblk, subbuffer;      // fileblk selects the region in the file, subbuffer the region in memory
+  MPI_Type_create_subarray(dim, (int*) &shape_[0],       &subsizes[0], (int*) &bounds.min[0],   MPI_ORDER_C, T_type, &fileblk);    // NOTE(review): the file region starts at bounds.min, not core.min - confirm this is intended when core != bounds
+  MPI_Type_create_subarray(dim, (int*) &buffer_shape[0], &subsizes[0], (int*) &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer);
+  MPI_Type_commit(&fileblk);
+  MPI_Type_commit(&subbuffer);
+
+  MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
+
+  mpi::status s;                        // NOTE(review): none of the MPI return codes in this function are checked
+  if (!collective)
+    MPI_File_write(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
+  else
+    MPI_File_write_all(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
+
+  if (chunk != 1)                       // T_type was created above only in the chunked case
+    MPI_Type_free(&T_type);
+  MPI_Type_free(&fileblk);
+  MPI_Type_free(&subbuffer);
+}
+
+#endif
diff --git a/include/vtkmdiy/io/numpy.hpp b/include/vtkmdiy/io/numpy.hpp
new file mode 100644
index 000000000..0199a0c38
--- /dev/null
+++ b/include/vtkmdiy/io/numpy.hpp
@@ -0,0 +1,213 @@
+#ifndef DIY_IO_NMPY_HPP
+#define DIY_IO_NMPY_HPP
+
+#include <sstream>
+#include <complex>
+#include <stdexcept>
+
+#include "../serialization.hpp"
+#include "bov.hpp"
+
+namespace diy
+{
+namespace io
+{
+  class NumPy: public BOV               // BOV whose offset and shape come from (or are written as) a .npy header
+  {
+    public:
+                        NumPy(mpi::io::file& f):
+                          BOV(f), word_size_(0)                 {}  // 0: no header has been parsed yet
+
+      unsigned          word_size() const                       { return word_size_; }   // size taken from the header's 'descr' entry; 0 before read_header()
+
+      unsigned          read_header()                           // parses the header, sets offset and shape, returns the word size
+      {
+        BOV::Shape  shape;
+        bool        fortran;
+        size_t      offset = parse_npy_header(shape, fortran);
+        if (fortran)
+            throw std::runtime_error("diy::io::NumPy cannot read data in fortran order");
+        BOV::set_offset(offset);
+        BOV::set_shape(shape);
+        return word_size_;
+      }
+
+      template<class T>
+      void              write_header(int dim, const DiscreteBounds& bounds);    // shape derived from the first dim axes of bounds
+
+      template<class T, class S>
+      void              write_header(const S& shape);
+
+    private:
+      inline size_t     parse_npy_header(BOV::Shape& shape, bool& fortran_order);
+      void              save(diy::BinaryBuffer& bb, const std::string& s)               { bb.save_binary(s.c_str(), s.size()); }   // raw characters, without a terminator
+      template<class T>
+      inline void       convert_and_save(diy::BinaryBuffer& bb, const T& x)             // saves the text that operator<< produces for x
+      {
+          std::ostringstream oss;
+          oss << x;
+          save(bb, oss.str());
+      }
+
+    private:
+      unsigned          word_size_;
+  };
+
+  namespace detail
+  {
+    inline char big_endian();       // byte-order character for the header: '<' or '>' (see the definition below)
+    template<class T>
+    char map_numpy_type();          // one-letter numpy type kind for T; only the explicit specializations are defined
+  }
+}
+}
+
+// Modified from: https://github.com/rogersce/cnpy
+// Copyright (C) 2011  Carl Rogers
+// Released under MIT License
+// license available at http://www.opensource.org/licenses/mit-license.php
+size_t
+diy::io::NumPy::
+parse_npy_header(BOV::Shape& shape, bool& fortran_order)
+{
+    // Parses the textual .npy header: fills shape and fortran_order, stores the word size in
+    // word_size_, and returns the byte offset of the first data element. Only headers that fit
+    // into the first 256 bytes of the file are handled; a malformed header raises std::runtime_error.
+    char buffer[256];
+    file().read_at_all(0, buffer, 256);
+    std::string header(buffer, buffer + 256);
+    size_t nl = header.find('\n');
+    if (nl == std::string::npos || nl < 11)     // no newline, or a newline before the dictionary begins
+        throw std::runtime_error("parse_npy_header: failed to read the header");
+    header = header.substr(11, nl - 11 + 1);    // drop the 10-byte preamble and the opening '{'
+    size_t header_size = nl + 1;
+
+    // locate every field first, so that a missing key is reported instead of being used as an index
+    const size_t npos      = std::string::npos;
+    const size_t order_pos = header.find("fortran_order"), descr_pos = header.find("descr");
+    const size_t open_pos  = header.find("("),             close_pos = header.find(")");
+    if (order_pos == npos || descr_pos == npos || open_pos == npos || close_pos == npos || close_pos < open_pos)
+        throw std::runtime_error("parse_npy_header: malformed header");
+
+    //fortran order: the value starts 16 characters after the key (13 for the key itself, 3 for "': ")
+    fortran_order = (header.substr(order_pos + 16, 4) == "True");
+
+    //shape
+    std::string str_shape = header.substr(open_pos + 1, close_pos - open_pos - 1);
+    unsigned ndims;
+    if (str_shape.empty()) ndims = 0;           // "()": zero-dimensional array, nothing to index
+    else if(str_shape[str_shape.size()-1] == ',') ndims = 1;
+    else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
+    shape.resize(ndims);
+    for(unsigned int i = 0;i < ndims;i++) {
+        size_t comma = str_shape.find(",");     // npos for the last entry: both substr() calls then use the whole string
+        shape[i] = atoi(str_shape.substr(0,comma).c_str());
+        str_shape = str_shape.substr(comma+1);
+    }
+
+    //endian, word size, data type
+    //byte order code | stands for not applicable.
+    //not sure when this applies except for byte array
+    //the value starts 9 characters after "descr": byte-order char and type char (neither is validated), then the word size
+    std::string str_ws = header.substr(descr_pos + 9 + 2);
+    size_t quote = str_ws.find("'");
+    word_size_ = atoi(str_ws.substr(0,quote).c_str());
+
+    return header_size;
+}
+
+template<class T>
+void
+diy::io::NumPy::
+write_header(int dim, const DiscreteBounds& bounds)
+{
+    // extent per axis is max - min + 1; the result is passed on to the shape-based overload
+    std::vector<int> extents;
+    for (int axis = 0; axis < dim; ++axis)
+        extents.push_back(bounds.max[axis] - bounds.min[axis] + 1);
+    write_header< T, std::vector<int> >(extents);
+}
+
+
+template<class T, class S>
+void
+diy::io::NumPy::
+write_header(const S& shape)
+{
+    BOV::set_shape(shape);
+
+    diy::MemoryBuffer dict;
+    save(dict, "{'descr': '");
+    diy::save(dict, detail::big_endian());          // byte-order character of this host ('<' or '>')
+    diy::save(dict, detail::map_numpy_type<T>());   // type kind letter
+    convert_and_save(dict, sizeof(T));              // word size as decimal text
+    save(dict, "', 'fortran_order': False, 'shape': (");
+    convert_and_save(dict, shape[0]);               // NOTE(review): shape[0] is read unconditionally, so an empty shape is not supported
+    for (int i = 1; i < (int) shape.size(); i++)
+    {
+        save(dict, ", ");
+        convert_and_save(dict, shape[i]);
+    }
+    if(shape.size() == 1) save(dict, ",");          // one-element tuple is spelled "(n,)"
+    save(dict, "), }");
+    //pad with spaces so that preamble+dict is modulo 16 bytes. preamble is 10 bytes. dict needs to end with \n
+    int remainder = 16 - (10 + dict.position) % 16;
+    for (int i = 0; i < remainder - 1; ++i)
+        diy::save(dict, ' ');
+    diy::save(dict, '\n');
+
+    diy::MemoryBuffer header;
+    diy::save(header, (char) 0x93);
+    save(header, "NUMPY");
+    diy::save(header, (char) 0x01);  // major version of numpy format
+    diy::save(header, (char) 0x00);  // minor version of numpy format
+    diy::save(header, (unsigned short) dict.position);      // dictionary length, truncated to an unsigned short
+    header.save_binary(&dict.buffer[0], dict.buffer.size());
+
+    BOV::set_offset(header.position);               // the array data follows the header directly
+
+    if (file().comm().rank() == 0)                  // only rank 0 writes the header bytes
+        file().write_at(0, &header.buffer[0], header.buffer.size());
+}
+
+char
+diy::io::detail::big_endian()
+{
+  // despite the name, the result is a byte-order character: '<' when the byte 1 comes first in a short, '>' otherwise
+  unsigned char probe[] = {1,0};
+  void* raw = probe;
+  return (*static_cast<short*>(raw) != 1) ? '>' : '<';
+}
+
+namespace diy
+{
+namespace io
+{
+namespace detail
+{
+template<> inline char map_numpy_type<float>()                         { return 'f'; }   // floating-point types
+template<> inline char map_numpy_type<double>()                        { return 'f'; }
+template<> inline char map_numpy_type<long double>()                   { return 'f'; }
+
+template<> inline char map_numpy_type<int>()                           { return 'i'; }   // signed integers (plain char is treated as signed here)
+template<> inline char map_numpy_type<char>()                          { return 'i'; }
+template<> inline char map_numpy_type<short>()                         { return 'i'; }
+template<> inline char map_numpy_type<long>()                          { return 'i'; }
+template<> inline char map_numpy_type<long long>()                     { return 'i'; }
+
+template<> inline char map_numpy_type<unsigned int>()                  { return 'u'; }   // unsigned integers
+template<> inline char map_numpy_type<unsigned char>()                 { return 'u'; }
+template<> inline char map_numpy_type<unsigned short>()                { return 'u'; }
+template<> inline char map_numpy_type<unsigned long>()                 { return 'u'; }
+template<> inline char map_numpy_type<unsigned long long>()            { return 'u'; }
+
+template<> inline char map_numpy_type<bool>()                          { return 'b'; }   // boolean
+
+template<> inline char map_numpy_type< std::complex<float> >()         { return 'c'; }   // complex types
+template<> inline char map_numpy_type< std::complex<double> >()        { return 'c'; }
+template<> inline char map_numpy_type< std::complex<long double> >()   { return 'c'; }
+}
+}
+}
+
+#endif
diff --git a/include/vtkmdiy/link.hpp b/include/vtkmdiy/link.hpp
new file mode 100644
index 000000000..3262eef61
--- /dev/null
+++ b/include/vtkmdiy/link.hpp
@@ -0,0 +1,219 @@
+#ifndef DIY_COVER_HPP
+#define DIY_COVER_HPP
+
+#include <vector>
+#include <map>
+#include <algorithm>
+
+#include "types.hpp"
+#include "serialization.hpp"
+#include "assigner.hpp"
+
+namespace diy
+{
+  // Local view of a distributed representation of a cover, a completely unstructured link
+  class Link
+  {
+    public:
+      virtual   ~Link()                             {}  // virtual, so that derived links can be deleted through a Link*
+
+      int       size() const                        { return static_cast<int>(neighbors_.size()); }
+      inline
+      int       size_unique() const;
+      BlockID   target(int i) const                 { return neighbors_[i]; }
+      BlockID&  target(int i)                       { return neighbors_[i]; }
+      inline
+      int       find(int gid) const;
+
+      void      add_neighbor(const BlockID& block)  { neighbors_.push_back(block); }
+
+      void      fix(const Assigner& assigner)       { for (std::vector<BlockID>::iterator nb = neighbors_.begin(); nb != neighbors_.end(); ++nb) nb->proc = assigner.rank(nb->gid); }
+
+      void      swap(Link& other)                   { other.neighbors_.swap(neighbors_); }
+
+      virtual void  save(BinaryBuffer& bb) const    { diy::save(bb, neighbors_); }
+      virtual void  load(BinaryBuffer& bb)          { diy::load(bb, neighbors_); }
+
+      virtual size_t id() const                     { return 0; }   // tag under which LinkFactory recreates a plain Link
+
+    private:
+      std::vector<BlockID>  neighbors_;
+  };
+
+  template<class Bounds_>
+  class RegularLink;
+
+  typedef       RegularLink<DiscreteBounds>         RegularGridLink;
+  typedef       RegularLink<ContinuousBounds>       RegularContinuousLink;
+
+  // Selector between regular discrete and continuous links given bounds type
+  template<class Bounds_>
+  struct RegularLinkSelector;
+
+  template<>
+  struct RegularLinkSelector<DiscreteBounds>
+  {
+    typedef     RegularGridLink         type;
+    static const size_t id = 1;         // serialization tag; LinkFactory::create() maps the same value to RegularGridLink
+  };
+
+  template<>
+  struct RegularLinkSelector<ContinuousBounds>
+  {
+    typedef     RegularContinuousLink   type;
+    static const size_t id = 2;         // serialization tag; LinkFactory::create() maps the same value to RegularContinuousLink
+  };
+
+
+  // for a regular decomposition, it makes sense to address the neighbors by direction
+  // and store local and neighbor bounds
+  template<class Bounds_>
+  class RegularLink: public Link
+  {
+    public:
+      typedef   Bounds_                             Bounds;
+
+      typedef   std::map<Direction, int>            DirMap;     // direction -> neighbor index
+      typedef   std::vector<Direction>              DirVec;     // neighbor index -> direction
+
+    public:
+                RegularLink(int dim, const Bounds& core, const Bounds& bounds):
+                  dim_(dim), core_(core), bounds_(bounds)            {}
+
+      // dimension
+      int       dimension() const                       { return dim_; }
+
+      // direction
+      int       direction(Direction dir) const;         // convert direction to a neighbor (-1 if no neighbor)
+      Direction direction(int i) const                  { return dir_vec_[i]; }
+      void      add_direction(Direction dir)            { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); }   // the new direction gets the next free index
+
+      // wrap
+      void       add_wrap(Direction dir)                { wrap_.push_back(dir); }
+      Direction  wrap(int i) const                      { return wrap_[i]; }
+      Direction& wrap(int i)                            { return wrap_[i]; }
+
+      // bounds
+      const Bounds& core() const                        { return core_; }
+      Bounds&       core()                              { return core_; }
+      const Bounds& bounds() const                      { return bounds_; }
+      Bounds&       bounds()                            { return bounds_; }
+      const Bounds& bounds(int i) const                 { return nbr_bounds_[i]; }     // bounds of the i-th neighbor
+      void          add_bounds(const Bounds& bounds)    { nbr_bounds_.push_back(bounds); }
+
+      void      swap(RegularLink& other)                { Link::swap(other); dir_map_.swap(other.dir_map_); dir_vec_.swap(other.dir_vec_); nbr_bounds_.swap(other.nbr_bounds_); std::swap(dim_, other.dim_); wrap_.swap(other.wrap_); std::swap(core_, other.core_); std::swap(bounds_, other.bounds_); }
+
+      void      save(BinaryBuffer& bb) const            // base-class data first, then the members below; load() mirrors this order
+      {
+          Link::save(bb);
+          diy::save(bb, dim_);
+          diy::save(bb, dir_map_);
+          diy::save(bb, dir_vec_);
+          diy::save(bb, core_);
+          diy::save(bb, bounds_);
+          diy::save(bb, nbr_bounds_);
+          diy::save(bb, wrap_);
+      }
+
+      void      load(BinaryBuffer& bb)                  // reads the fields in exactly the order save() wrote them
+      {
+          Link::load(bb);
+          diy::load(bb, dim_);
+          diy::load(bb, dir_map_);
+          diy::load(bb, dir_vec_);
+          diy::load(bb, core_);
+          diy::load(bb, bounds_);
+          diy::load(bb, nbr_bounds_);
+          diy::load(bb, wrap_);
+      }
+
+      virtual size_t id() const                         { return RegularLinkSelector<Bounds>::id; }    // tag that LinkFactory uses to recreate this class
+
+    private:
+      int       dim_;
+
+      DirMap    dir_map_;
+      DirVec    dir_vec_;
+
+      Bounds                    core_;
+      Bounds                    bounds_;
+      std::vector<Bounds>       nbr_bounds_;
+      std::vector<Direction>    wrap_;
+  };
+
+  // Other cover candidates: KDTreeLink, AMRGridLink
+
+  struct LinkFactory
+  {
+    public:
+      // Creates an empty link of the class whose id() yields `id` (to be filled in by load());
+      // an unknown id yields a null pointer. Not pretty, but will do for now.
+      static Link*          create(size_t id)
+      {
+          switch (id)
+          {
+            case 0:                                         return new Link;
+            case RegularLinkSelector<DiscreteBounds>::id:   return new RegularGridLink(0, DiscreteBounds(), DiscreteBounds());
+            case RegularLinkSelector<ContinuousBounds>::id: return new RegularContinuousLink(0, ContinuousBounds(), ContinuousBounds());
+            default:                                        return 0;
+          }
+      }
+
+      inline static void    save(BinaryBuffer& bb, const Link* l);
+      inline static Link*   load(BinaryBuffer& bb);
+  };
+}
+
+
+void
+diy::LinkFactory::
+save(BinaryBuffer& bb, const Link* l)
+{
+    diy::save(bb, l->id());     // the type tag goes first, so that load() knows which class to create
+    l->save(bb);                // followed by the link's own data (virtual call)
+}
+
+diy::Link*
+diy::LinkFactory::
+load(BinaryBuffer& bb)
+{
+    size_t id;
+    diy::load(bb, id);          // type tag written by LinkFactory::save()
+    Link* l = create(id);       // NOTE(review): create() returns 0 for an unknown id, which the next line dereferences
+    l->load(bb);
+    return l;                   // allocated with new in create(); presumably the caller takes ownership - TODO confirm
+}
+
+int
+diy::Link::
+find(int gid) const
+{
+  // linear scan; yields the index of the first neighbor with the given gid, or -1 if there is none
+  const unsigned count = (unsigned)size();
+  for (unsigned idx = 0; idx < count; ++idx)
+    if (neighbors_[idx].gid == gid)
+      return idx;
+  return -1;
+}
+int
+diy::Link::
+size_unique() const
+{
+    std::vector<BlockID> sorted(neighbors_);        // work on a copy, the link itself stays untouched
+    std::sort(sorted.begin(), sorted.end());        // equal neighbors become adjacent
+    return std::unique(sorted.begin(), sorted.end()) - sorted.begin();     // length of the de-duplicated prefix
+}
+
+template<class Bounds>
+int
+diy::RegularLink<Bounds>::
+direction(Direction dir) const
+{
+  // dir_map_ holds the index under which add_direction() registered each direction;
+  // a direction that was never registered yields -1
+  DirMap::const_iterator found = dir_map_.find(dir);
+  const bool known = (found != dir_map_.end());
+  return known ? found->second : -1;
+}
+
+#endif
diff --git a/include/vtkmdiy/log.hpp b/include/vtkmdiy/log.hpp
new file mode 100644
index 000000000..45f202f92
--- /dev/null
+++ b/include/vtkmdiy/log.hpp
@@ -0,0 +1,103 @@
+#ifndef DIY_LOG_HPP
+#define DIY_LOG_HPP
+
+#ifndef DIY_USE_SPDLOG
+
+#include <memory>
+#include "fmt/format.h"
+#include "fmt/ostream.h"
+
+namespace diy
+{
+
+namespace spd
+{
+    struct logger
+    {
+        // No-op stand-in keeping the logger.info(cppformat_string, arg1, arg2, ...) call style; all arguments are discarded
+        template <typename... Args> void trace(const char*, const Args&...)     {}
+        template <typename... Args> void debug(const char*, const Args&...)     {}
+        template <typename... Args> void info(const char*, const Args&...)      {}
+        template <typename... Args> void warn(const char*, const Args&...)      {}
+        template <typename... Args> void error(const char*, const Args&...)     {}
+        template <typename... Args> void critical(const char*, const Args&...)  {}
+    };
+}
+
+// Without spdlog there is no registry: every call returns a fresh spd::logger.
+inline std::shared_ptr<spd::logger> get_logger()
+{
+    std::shared_ptr<spd::logger> null_log = std::make_shared<spd::logger>();
+    return null_log;
+}
+
+// The string argument is accepted but not used; a fresh spd::logger is returned.
+inline std::shared_ptr<spd::logger> create_logger(std::string /* unused */)
+{
+    std::shared_ptr<spd::logger> null_log = std::make_shared<spd::logger>();
+    return null_log;
+}
+
+// Accepts and drops any arguments; the result is always a fresh spd::logger.
+template<class... Args>
+std::shared_ptr<spd::logger> set_logger(Args...)
+{
+    return std::make_shared<spd::logger>();
+}
+
+}   // diy
+
+#else // DIY_USE_SPDLOG
+
+#include <string>
+
+#include <spdlog/spdlog.h>
+#include <spdlog/sinks/null_sink.h>
+
+#include <spdlog/fmt/bundled/format.h>
+#include <spdlog/fmt/bundled/ostream.h>
+
+namespace diy
+{
+
+namespace spd = ::spdlog;
+
+// Returns the logger that spd::get("diy") finds; if there is none, builds a
+// logger named "null_logger" on top of a null sink and returns that instead.
+inline
+std::shared_ptr<spd::logger> get_logger()
+{
+    auto registered = spd::get("diy");
+    if (registered)
+        return registered;
+
+    auto sink = std::make_shared<spd::sinks::null_sink_mt> ();
+    return std::make_shared<spd::logger>("null_logger", sink);
+}
+
+// Creates the stderr logger "diy" and sets its level to the one whose name equals
+// log_level; when no name matches, the scan stops at level::off and that is used.
+inline
+std::shared_ptr<spd::logger> create_logger(std::string log_level)
+{
+    auto log = spd::stderr_logger_mt("diy");
+    int lvl = spd::level::trace;
+    while (lvl < spd::level::off && !(spd::level::level_names[lvl] == log_level))
+        ++lvl;
+    log->set_level(static_cast<spd::level::level_enum>(lvl));
+    return log;
+}
+
+// Builds a logger named "diy", passing args to the spdlog::logger constructor after the name.
+template<class... Args>
+std::shared_ptr<spd::logger>
+set_logger(Args... args)
+{
+    return std::make_shared<spdlog::logger>("diy", args...);
+}
+
+}   // diy
+#endif
+
+
+#endif // DIY_LOG_HPP
diff --git a/include/vtkmdiy/master.hpp b/include/vtkmdiy/master.hpp
new file mode 100644
index 000000000..97ccb8724
--- /dev/null
+++ b/include/vtkmdiy/master.hpp
@@ -0,0 +1,1203 @@
+#ifndef DIY_MASTER_HPP
+#define DIY_MASTER_HPP
+
+#include <vector>
+#include <map>
+#include <list>
+#include <deque>
+#include <algorithm>
+#include <functional>
+
+#include "link.hpp"
+#include "collection.hpp"
+
+// Communicator functionality
+#include "mpi.hpp"
+#include "serialization.hpp"
+#include "detail/collectives.hpp"
+#include "time.hpp"
+
+#include "thread.hpp"
+
+#include "detail/block_traits.hpp"
+
+#include "log.hpp"
+#include "stats.hpp"
+
+namespace diy
+{
+  // Stores and manages blocks; initiates serialization and communication when necessary.
+  //
+  // Provides a foreach function, which is meant as the main entry point.
+  //
+  // Provides a conversion between global and local block ids,
+  // which is hidden from blocks via a communicator proxy.
+  class Master
+  {
+    public:
+      struct ProcessBlock;
+
+      template<class Block>
+      struct Binder;
+
+      // Commands
+      struct BaseCommand;
+
+      template<class Block>
+      struct Command;
+
+      typedef std::vector<BaseCommand*>     Commands;
+
+      // Skip: predicate(local block index, master) -> true if the block should be skipped
+      using Skip = std::function<bool(int, const Master&)>;
+
+      struct SkipNoIncoming;
+      struct NeverSkip { bool    operator()(int i, const Master& master) const   { return false; } };
+
+      // Collection
+      typedef Collection::Create            CreateBlock;
+      typedef Collection::Destroy           DestroyBlock;
+      typedef Collection::Save              SaveBlock;
+      typedef Collection::Load              LoadBlock;
+
+    public:
+      // Communicator types
+      struct Proxy;
+      struct ProxyWithLink;
+
+      // foreach callback
+      template<class Block>
+      using Callback = std::function<void(Block*, const ProxyWithLink&)>;
+      // Policy consulted before a queue of a given size is handed to external storage
+      struct QueuePolicy
+      {
+        virtual bool    unload_incoming(const Master& master, int from, int to, size_t size) const  =0;
+        virtual bool    unload_outgoing(const Master& master, int from, size_t size) const          =0;
+        virtual         ~QueuePolicy() {}
+      };
+
+      //! Move queues out of core if their size exceeds a parameter given in the constructor
+      struct QueueSizePolicy: public QueuePolicy
+      {
+                QueueSizePolicy(size_t sz): size(sz)          {}
+        bool    unload_incoming(const Master& master, int from, int to, size_t sz) const    { return sz > size; }
+        bool    unload_outgoing(const Master& master, int from, size_t sz) const            { return sz > size*master.outgoing_count(from); }
+
+        size_t  size;
+      };
+
+      struct MessageInfo
+      {
+        int from, to;
+        int round;
+      };
+
+      struct InFlightSend
+      {
+        std::shared_ptr<MemoryBuffer> message;
+        mpi::request                  request;
+
+        // for debug purposes:
+        MessageInfo info;
+      };
+
+      struct InFlightRecv
+      {
+        MemoryBuffer message;
+        MessageInfo info{ -1, -1, -1 };
+      };
+
+      struct Collective;
+      struct tags       { enum { queue, piece }; };
+
+      typedef           std::list<InFlightSend>             InFlightSendsList;
+      typedef           std::map<int, InFlightRecv>         InFlightRecvsMap;
+      typedef           std::list<int>                      ToSendList;         // [gid]
+      typedef           std::list<Collective>               CollectivesList;
+      typedef           std::map<int, CollectivesList>      CollectivesMap;     // gid          -> [collectives]
+
+      // Bookkeeping for one queue: its size and its handle in external storage (-1: not external)
+      struct QueueRecord
+      {
+                        QueueRecord(size_t s = 0, int e = -1): size(s), external(e)     {}
+        size_t          size;
+        int             external;
+      };
+
+      typedef           std::map<int,     QueueRecord>      InQueueRecords;     //  gid         -> (size, external)
+      typedef           std::map<int,     MemoryBuffer>     IncomingQueues;     //  gid         -> queue
+      typedef           std::map<BlockID, MemoryBuffer>     OutgoingQueues;     // (gid, proc)  -> queue
+      typedef           std::map<BlockID, QueueRecord>      OutQueueRecords;    // (gid, proc)  -> (size, external)
+      struct IncomingQueuesRecords
+      {
+        InQueueRecords  records;
+        IncomingQueues  queues;
+      };
+      struct OutgoingQueuesRecord
+      {
+                        OutgoingQueuesRecord(int e = -1): external(e)       {}
+        int             external;
+        OutQueueRecords external_local;
+        OutgoingQueues  queues;
+      };
+      typedef           std::map<int,     IncomingQueuesRecords>    IncomingQueuesMap;  //  gid         -> {  gid       -> queue }
+      typedef           std::map<int,     OutgoingQueuesRecord>     OutgoingQueuesMap;  //  gid         -> { (gid,proc) -> queue }
+      // Incoming queues of one exchange round, together with its `received` counter
+      struct IncomingRound
+      {
+        IncomingQueuesMap map;
+        int received{0};
+      };
+      typedef std::map<int, IncomingRound> IncomingRoundMap;
+
+
+    public:
+     /**
+      * \ingroup Initialization
+      * \brief The main DIY object
+      *
+      * Helper functions specify how to:
+           * create an empty block,
+           * destroy a block (a function that's expected to upcast and delete),
+           * serialize a block
+      */
+                    Master(mpi::communicator    comm,          //!< communicator
+                           int                  threads  = 1,  //!< number of threads DIY can use
+                           int                  limit    = -1, //!< number of blocks to store in memory
+                           CreateBlock          create   = 0,  //!< block create function; master manages creation if create != 0
+                           DestroyBlock         destroy  = 0,  //!< block destroy function; master manages destruction if destroy != 0
+                           ExternalStorage*     storage  = 0,  //!< storage object (path, method, etc.) for storing temporary blocks being shuffled in/out of core
+                           SaveBlock            save     = 0,  //!< block save function; master manages saving if save != 0
+                           LoadBlock            load     = 0,  //!< block load function; master manages loading if load != 0
+                           QueuePolicy*         q_policy = new QueueSizePolicy(4096)): //!< policy for managing message queues specifies maximum size of message queues to keep in memory
+                      blocks_(create, destroy, storage, save, load),
+                      queue_policy_(q_policy),
+                      limit_(limit),
+                      threads_(threads == -1 ? thread::hardware_concurrency() : threads),
+                      storage_(storage),
+                      // Communicator functionality
+                      comm_(comm),
+                      expected_(0),
+                      exchange_round_(-1),
+                      immediate_(true)
+                                                        {}
+                    ~Master()                           { set_immediate(true); clear(); delete queue_policy_; }
+      inline void   clear();
+      inline void   destroy(int i)                      { if (blocks_.own()) blocks_.destroy(i); }
+
+      inline int    add(int gid, void* b, Link* l);     //!< add a block
+      inline void*  release(int i);                     //!< release ownership of the block
+
+      //! return the `i`-th block (callers in this file treat 0 as "not in memory")
+      inline void*  block(int i) const                  { return blocks_.find(i); }
+      template<class Block>
+      Block*        block(int i) const                  { return static_cast<Block*>(block(i)); }
+      inline Link*  link(int i) const                   { return links_[i]; }
+      inline int    loaded_block() const                { return blocks_.available(); }
+
+      inline void   unload(int i);
+      inline void   load(int i);
+      void          unload(std::vector<int>& loaded)    { for(unsigned i = 0; i < loaded.size(); ++i) unload(loaded[i]); loaded.clear(); }
+      void          unload_all()                        { for(unsigned i = 0; i < size(); ++i) if (block(i) != 0) unload(i); }
+      inline bool   has_incoming(int i) const;
+
+      inline void   unload_queues(int i);
+      inline void   unload_incoming(int gid);
+      inline void   unload_outgoing(int gid);
+      inline void   load_queues(int i);
+      inline void   load_incoming(int gid);
+      inline void   load_outgoing(int gid);
+
+      //! return the MPI communicator
+      const mpi::communicator&  communicator() const    { return comm_; }
+      //! return the MPI communicator (mutable overload)
+      mpi::communicator&        communicator()          { return comm_; }
+
+      //! return the `i`-th block, loading it if necessary
+      void*         get(int i)                          { return blocks_.get(i); }
+      //! return gid of the `i`-th block
+      int           gid(int i) const                    { return gids_[i]; }
+      //! return the local id of the local block with global id gid, or -1 if not local
+      int           lid(int gid) const                  { return local(gid) ?  lids_.find(gid)->second : -1; }
+      //! whether the block with global id gid is local
+      bool          local(int gid) const                { return lids_.find(gid) != lids_.end(); }
+
+      //! exchange the queues between all the blocks (collective operation)
+      inline void   exchange();
+      inline void   process_collectives();
+
+      inline
+      ProxyWithLink proxy(int i) const;
+
+      //! return the number of local blocks
+      unsigned      size() const                        { return blocks_.size(); }
+      void*         create() const                      { return blocks_.create(); }
+
+      // accessors
+      int           limit() const                       { return limit_; }
+      int           threads() const                     { return threads_; }
+      int           in_memory() const                   { return *blocks_.in_memory().const_access(); }
+
+      void          set_threads(int threads)            { threads_ = threads; }
+
+      CreateBlock   creator() const                     { return blocks_.creator(); }
+      DestroyBlock  destroyer() const                   { return blocks_.destroyer(); }
+      LoadBlock     loader() const                      { return blocks_.loader(); }
+      SaveBlock     saver() const                       { return blocks_.saver(); }
+
+      //! call `f` with every block
+      template<class Block>
+      void          foreach_(const Callback<Block>& f, const Skip& s = NeverSkip());
+
+      template<class F>
+      void          foreach(const F& f, const Skip& s = NeverSkip())
+      {
+          using Block = typename detail::block_traits<F>::type;
+          foreach_<Block>(f, s);
+      }
+
+      inline void   execute();
+
+      bool          immediate() const                   { return immediate_; }
+      void          set_immediate(bool i)               { if (i && !immediate_) execute(); immediate_ = i; }
+
+    public:
+      // Communicator functionality
+      IncomingQueues&   incoming(int gid)               { return incoming_[exchange_round_].map[gid].queues; }
+      OutgoingQueues&   outgoing(int gid)               { return outgoing_[gid].queues; }
+      CollectivesList&  collectives(int gid)            { return collectives_[gid]; }
+      size_t            incoming_count(int gid) const
+      {
+        IncomingRoundMap::const_iterator round_it = incoming_.find(exchange_round_);
+        if (round_it == incoming_.end())
+          return 0;
+        IncomingQueuesMap::const_iterator queue_it = round_it->second.map.find(gid);
+        if (queue_it == round_it->second.map.end())
+          return 0;
+        return queue_it->second.queues.size();
+      }
+      size_t            outgoing_count(int gid) const   { OutgoingQueuesMap::const_iterator it = outgoing_.find(gid); if (it == outgoing_.end()) return 0; return it->second.queues.size(); }
+
+      void              set_expected(int expected)      { expected_ = expected; }
+      void              add_expected(int i)             { expected_ += i; }
+      int               expected() const                { return expected_; }
+      void              replace_link(int i, Link* link) { expected_ -= links_[i]->size_unique(); delete links_[i]; links_[i] = link; expected_ += links_[i]->size_unique(); }
+
+    public:
+      // Communicator functionality
+      inline void       flush();            // makes sure all the serialized queues migrate to their target processors
+
+    private:
+      // Communicator functionality
+      inline void       comm_exchange(ToSendList& to_send, int out_queues_limit);     // possibly called in between block computations
+      inline bool       nudge();
+
+      void              cancel_requests();              // TODO
+
+      // debug
+      inline void       show_incoming_records() const;
+
+    private:
+      std::vector<Link*>    links_;
+      Collection            blocks_;
+      std::vector<int>      gids_;
+      std::map<int, int>    lids_;
+
+      QueuePolicy*          queue_policy_;
+
+      int                   limit_;
+      int                   threads_;
+      ExternalStorage*      storage_;
+
+    private:
+      // Communicator
+      mpi::communicator     comm_;
+      IncomingRoundMap      incoming_;
+      OutgoingQueuesMap     outgoing_;
+      InFlightSendsList     inflight_sends_;
+      InFlightRecvsMap      inflight_recvs_;
+      CollectivesMap        collectives_;
+      int                   expected_;
+      int                   exchange_round_;
+      bool                  immediate_;
+      Commands              commands_;
+
+    private:
+      fast_mutex            add_mutex_;         // held by add() while it appends a block
+
+    public:
+      std::shared_ptr<spd::logger>  log = get_logger();
+      stats::Profiler               prof;
+  };
+
+  struct Master::BaseCommand    // untyped interface through which Master stores and runs queued commands
+  {
+      virtual       ~BaseCommand()                                                  {}      // to delete derived classes
+      virtual void  execute(void* b, const ProxyWithLink& cp) const                 =0;     // b is the block, passed as void*
+      virtual bool  skip(int i, const Master& master) const                         =0;     // i is the local block index
+  };
+
+  // Binds a typed callback and a skip predicate to the untyped BaseCommand interface.
+  template<class Block>
+  struct Master::Command: public BaseCommand
+  {
+      Command(Callback<Block> callback, const Skip& skip_fn): f(callback), s(skip_fn) {}
+      // b arrives untyped; it is cast back to Block* before the callback runs
+      void  execute(void* b, const ProxyWithLink& cp) const override
+      { Block* typed = static_cast<Block*>(b); f(typed, cp); }
+      bool  skip(int i, const Master& m) const override     { return s(i, m); }
+      Callback<Block>   f;
+      Skip              s;
+  };
+
+  struct Master::SkipNoIncoming     // skip predicate: true when has_incoming(i) reports nothing for block i
+  { bool operator()(int i, const Master& master) const   { const bool pending = master.has_incoming(i); return !pending; } };
+
+  struct Master::Collective
+  {     // Wraps a detail::CollectiveOp*; the destructor deletes it
+            Collective():
+              cop_(0)                           {}
+            Collective(detail::CollectiveOp* cop):
+              cop_(cop)                         {}
+            // this copy constructor is very ugly (it takes other's cop_ via swap, leaving other with 0), but need it to insert Collectives into a list
+            Collective(const Collective& other):
+              cop_(0)                           { swap(const_cast<Collective&>(other)); }
+            ~Collective()                       { delete cop_; }
+
+    void    init()                              { cop_->init(); }
+    void    swap(Collective& other)             { std::swap(cop_, other.cop_); }
+    void    update(const Collective& other)     { cop_->update(*other.cop_); }
+    void    global(const mpi::communicator& c)  { cop_->global(c); }
+    void    copy_from(Collective& other) const  { cop_->copy_from(*other.cop_); }
+    void    result_out(void* x) const           { cop_->result_out(x); }
+
+    detail::CollectiveOp*                       cop_;
+
+    private:
+    Collective& operator=(const Collective& other);     // private, no body in this class: not meant to be called
+  };
+}
+
+#include "proxy.hpp"
+
+// --- ProcessBlock ---
+struct diy::Master::ProcessBlock
+{
+          ProcessBlock(Master&                    master_,
+                       const std::deque<int>&     blocks_,
+                       int                        local_limit_,
+                       critical_resource<int>&    idx_):
+              master(master_),
+              blocks(blocks_),
+              local_limit(local_limit_),
+              idx(idx_)
+          {}
+  // Worker loop: keeps taking positions from the shared counter idx until blocks is exhausted
+  void    process()
+  {
+    master.log->debug("Processing with thread: {}",  this_thread::get_id());
+
+    std::vector<int>      local;        // blocks this worker loaded or found loaded; unloaded in bulk at local_limit
+    do
+    {
+      int cur = (*idx.access())++;      // take the next position in blocks
+
+      if ((size_t)cur >= blocks.size())
+          return;
+
+      int i = blocks[cur];
+      if (master.block(i))
+      {
+          if (local.size() == (size_t)local_limit)
+              master.unload(local);
+          local.push_back(i);
+      }
+
+      master.log->debug("Processing block: {}", master.gid(i));
+      // the block is skipped only if every queued command's skip() returns true for it
+      bool skip_block = true;
+      for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
+      {
+          if (!master.commands_[cmd]->skip(i, master))
+          {
+              skip_block = false;
+              break;
+          }
+      }
+
+      IncomingQueuesMap &current_incoming = master.incoming_[master.exchange_round_].map;
+      if (skip_block)
+      {
+          if (master.block(i) == 0)
+              master.load_queues(i);      // even though we are skipping the block, the queues might be necessary
+
+          for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
+          {
+              master.commands_[cmd]->execute(0, master.proxy(i));  // 0 signals that we are skipping the block (even if it's loaded)
+
+              // no longer need them, so get rid of them, rather than risk reloading
+              current_incoming[master.gid(i)].queues.clear();
+              current_incoming[master.gid(i)].records.clear();
+          }
+
+          if (master.block(i) == 0)
+              master.unload_queues(i);    // even though we are skipping the block, the queues might be necessary
+      }
+      else
+      {
+          if (master.block(i) == 0)                             // block unloaded
+          {
+              if (local.size() == (size_t)local_limit)                    // reached the local limit
+                  master.unload(local);
+
+              master.load(i);
+              local.push_back(i);
+          }
+
+          for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
+          {
+              master.commands_[cmd]->execute(master.block(i), master.proxy(i));
+
+              // no longer need them, so get rid of them
+              current_incoming[master.gid(i)].queues.clear();
+              current_incoming[master.gid(i)].records.clear();
+          }
+      }
+    } while(true);
+
+    // TODO: invoke opportunistic communication
+    //       don't forget to adjust Master::exchange()
+  }
+
+  static void run(void* bf)                   { static_cast<ProcessBlock*>(bf)->process(); }
+
+  Master&                 master;
+  const std::deque<int>&  blocks;
+  int                     local_limit;
+  critical_resource<int>& idx;
+};
+// --------------------
+
+// Deletes every block's link, empties the block collection and the link/gid/lid tables, sets expected_ to 0.
+void diy::Master::clear()
+{
+  for (unsigned b = 0; b < size(); ++b)     // size() is the block count, queried from blocks_
+    delete links_[b];
+
+  blocks_.clear();
+  links_.clear();
+  gids_.clear();
+  lids_.clear();
+  expected_ = 0;
+}
+
+// Asks the block collection to unload block i, then runs unload_queues()
+// so the block's queues get the same treatment.
+void diy::Master::unload(int i)
+{
+  const int block_gid = gid(i);
+  log->debug("Unloading block: {}", block_gid);
+  blocks_.unload(i);
+  unload_queues(i);
+}
+
+// Applies unload_incoming() and then unload_outgoing() to the gid of local block i.
+void diy::Master::unload_queues(int i)
+{
+  const int block_gid = gid(i);
+  unload_incoming(block_gid);
+  unload_outgoing(block_gid);
+}
+
+// For every exchange round that holds incoming queues for gid, passes each queue the
+// policy selects to storage_->put() and stores the returned handle in its record.
+void diy::Master::unload_incoming(int gid)
+{
+  for (auto& round : incoming_)
+  {
+    IncomingQueuesMap& round_map = round.second.map;
+    IncomingQueuesMap::iterator found = round_map.find(gid);
+    if (found == round_map.end())
+      continue;
+    IncomingQueuesRecords& in_qrs = found->second;
+    for (auto& rec : in_qrs.records)
+    {
+      const int    from = rec.first;
+      QueueRecord& qr   = rec.second;
+      if (!queue_policy_->unload_incoming(*this, from, gid, qr.size))
+        continue;
+
+      log->debug("Unloading queue: {} <- {}", gid, from);
+      qr.external = storage_->put(in_qrs.queues[from]);
+    }
+  }
+}
+
+void
+diy::Master::
+unload_outgoing(int gid)
+{   // If the policy agrees, packs gid's outgoing queues for other ranks into one buffer and puts it into storage_
+  OutgoingQueuesRecord& out_qr = outgoing_[gid];
+  // add up the serialized size of the queues whose target is on another rank
+  size_t out_queues_size = sizeof(size_t);   // map size
+  size_t count = 0;
+  for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end(); ++it)
+  {
+    if (it->first.proc == comm_.rank()) continue;     // queues for this rank are not part of the packed buffer
+
+    out_queues_size += sizeof(BlockID);     // target
+    out_queues_size += sizeof(size_t);      // buffer.position
+    out_queues_size += sizeof(size_t);      // buffer.size
+    out_queues_size += it->second.size();   // buffer contents
+    ++count;
+  }
+  if (queue_policy_->unload_outgoing(*this, gid, out_queues_size - sizeof(size_t)))
+  {
+      log->debug("Unloading outgoing queues: {} -> ...; size = {}\n", gid, out_queues_size);
+      MemoryBuffer  bb;     bb.reserve(out_queues_size);
+      diy::save(bb, count);
+
+      for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end();)
+      {
+        if (it->first.proc == comm_.rank())
+        {
+          // treat as incoming
+          if (queue_policy_->unload_incoming(*this, gid, it->first.gid, it->second.size()))
+          {
+            QueueRecord& qr = out_qr.external_local[it->first];
+            qr.size = it->second.size();
+            qr.external = storage_->put(it->second);
+
+            out_qr.queues.erase(it++);      // post-increment: advance before the erased entry goes away
+            continue;
+          } // else keep in memory
+        } else
+        {
+          diy::save(bb, it->first);
+          diy::save(bb, it->second);
+
+          out_qr.queues.erase(it++);
+          continue;
+        }
+        ++it;
+      }
+
+      // TODO: this mechanism could be adjusted for direct saving to disk
+      //       (without intermediate binary buffer serialization)
+      out_qr.external = storage_->put(bb);
+  }
+}
+
+// Asks the block collection to load block i, then runs load_queues()
+// to bring the block's queues back as well.
+void diy::Master::load(int i)
+{
+  const int block_gid = gid(i);
+  log->debug("Loading block: {}", block_gid);
+  blocks_.load(i);
+  load_queues(i);
+}
+
+// Applies load_incoming() and then load_outgoing() to the gid of local block i.
+void diy::Master::load_queues(int i)
+{
+  const int block_gid = gid(i);
+  load_incoming(block_gid);
+  load_outgoing(block_gid);
+}
+
+// For the current exchange round, fetches from storage_ every incoming queue of gid whose
+// record carries an external handle, and resets that handle to -1.
+void diy::Master::load_incoming(int gid)
+{
+  IncomingQueuesRecords& in_qrs = incoming_[exchange_round_].map[gid];
+  for (auto& rec : in_qrs.records)
+  {
+    QueueRecord& qr = rec.second;
+    if (qr.external == -1)
+      continue;                     // no external handle recorded for this queue
+
+    log->debug("Loading queue: {} <- {}", gid, rec.first);
+    storage_->get(qr.external, in_qrs.queues[rec.first]);
+    qr.external = -1;
+  }
+}
+
+// Counterpart of unload_outgoing(): fetches the packed buffer recorded for gid from
+// storage_ and rebuilds out_qr.queues from the (target, queue) pairs it contains.
+void diy::Master::load_outgoing(int gid)
+{
+  // TODO: we could adjust this mechanism to read directly from storage,
+  //       bypassing an intermediate MemoryBuffer
+  OutgoingQueuesRecord& out_qr = outgoing_[gid];
+  if (out_qr.external == -1)
+    return;                             // no packed buffer recorded for this gid
+
+  MemoryBuffer packed;
+  storage_->get(out_qr.external, packed);
+  out_qr.external = -1;
+
+  size_t n_queues;
+  diy::load(packed, n_queues);
+  for (size_t q = 0; q != n_queues; ++q)
+  {
+    BlockID target;
+    diy::load(packed, target);
+    diy::load(packed, out_qr.queues[target]);
+  }
+}
+
+// Bundles a Proxy built for block i's gid with the block pointer and the block's link.
+diy::Master::ProxyWithLink diy::Master::proxy(int i) const {
+  return ProxyWithLink(Proxy(const_cast<Master*>(this), gid(i)), block(i), link(i));
+}
+
+
+// Registers block b with global id gid and link l; returns the block's new local id.
+// If the in-memory block count equals limit_, unload_all() runs first.
+int diy::Master::add(int gid, void* b, Link* l)
+{
+  if (in_memory() == limit_)
+    unload_all();
+
+  lock_guard<fast_mutex>    lock(add_mutex_);       // allow to add blocks from multiple threads
+
+  blocks_.add(b);
+  links_.push_back(l);
+  gids_.push_back(gid);
+
+  const int new_lid = gids_.size() - 1;             // index of the entry just appended
+  lids_[gid] = new_lid;
+  add_expected(l->size_unique()); // NB: at every iteration we expect a message from each unique neighbor
+
+  return new_lid;
+}
+
+// Releases block i to the caller; deletes its link (slot set to 0) and erases its gid from lids_.
+void* diy::Master::release(int i)
+{
+  void* released = blocks_.release(i);
+  delete links_[i];
+  links_[i] = 0;
+  lids_.erase(gids_[i]);
+  return released;
+}
+
+// Reports whether local block i has an incoming queue record of non-zero size in the current
+// exchange round. Lookup-only, like incoming_count(): no map entry is created, no const_cast.
+bool diy::Master::has_incoming(int i) const
+{
+  IncomingRoundMap::const_iterator round_it = incoming_.find(exchange_round_);
+  if (round_it == incoming_.end()) return false;
+  IncomingQueuesMap::const_iterator in_it = round_it->second.map.find(gid(i));
+  if (in_it == round_it->second.map.end()) return false;
+
+  for (const auto& rec : in_it->second.records)
+    if (rec.second.size != 0) return true;
+  return false;
+}
+
+// Queues f, together with its skip predicate, as a command. In immediate mode
+// execute() is called right away; otherwise the command stays queued.
+template<class Block>
+void diy::Master::foreach_(const Callback<Block>& f, const Skip& skip)
+{
+    auto scoped = prof.scoped("foreach");
+    BaseCommand* cmd = new Command<Block>(f, skip);
+    commands_.push_back(cmd);
+    if (immediate_)
+        execute();
+}
+
+void
+diy::Master::
+execute()
+{   // Runs every queued command over the local blocks (possibly on several threads), then deletes the commands
+  log->debug("Entered execute()");
+  auto scoped = prof.scoped("execute");
+  //show_incoming_records();
+
+  // touch the outgoing and incoming queues as well as collectives to make sure they exist
+  for (unsigned i = 0; i < size(); ++i)
+  {
+    outgoing(gid(i));
+    incoming(gid(i));           // implicitly touches queue records
+    collectives(gid(i));
+  }
+
+  if (commands_.empty())
+      return;
+
+  // Order the blocks, so the loaded ones come first
+  std::deque<int>   blocks;
+  for (unsigned i = 0; i < size(); ++i)
+    if (block(i) == 0)
+        blocks.push_back(i);
+    else
+        blocks.push_front(i);
+
+  // don't use more threads than we can have blocks in memory
+  int num_threads;
+  int blocks_per_thread;
+  if (limit_ == -1)
+  {
+    num_threads = threads_;
+    blocks_per_thread = size();
+  }
+  else
+  {
+    num_threads = std::min(threads_, limit_);
+    blocks_per_thread = limit_/num_threads;
+  }
+
+  // idx is shared
+  critical_resource<int> idx(0);
+  // with more than one thread, each gets its own functor; otherwise the functor runs on the calling thread
+  typedef                 ProcessBlock                                   BlockFunctor;
+  if (num_threads > 1)
+  {
+    // launch the threads
+    typedef               std::pair<thread*, BlockFunctor*>               ThreadFunctorPair;
+    typedef               std::list<ThreadFunctorPair>                    ThreadFunctorList;
+    ThreadFunctorList     threads;
+    for (unsigned i = 0; i < (unsigned)num_threads; ++i)
+    {
+        BlockFunctor* bf = new BlockFunctor(*this, blocks, blocks_per_thread, idx);
+        threads.push_back(ThreadFunctorPair(new thread(&BlockFunctor::run, bf), bf));
+    }
+
+    // join the threads
+    for(ThreadFunctorList::iterator it = threads.begin(); it != threads.end(); ++it)
+    {
+        thread*           t  = it->first;
+        BlockFunctor*     bf = it->second;
+        t->join();
+        delete t;
+        delete bf;
+    }
+  } else
+  {
+      BlockFunctor bf(*this, blocks, blocks_per_thread, idx);
+      BlockFunctor::run(&bf);
+  }
+
+  // clear incoming queues
+  incoming_[exchange_round_].map.clear();
+
+  if (limit() != -1 && in_memory() > limit())
+      throw std::runtime_error(fmt::format("Fatal: {} blocks in memory, with limit {}", in_memory(), limit()));
+
+  // clear commands
+  for (size_t i = 0; i < commands_.size(); ++i)
+      delete commands_[i];
+  commands_.clear();
+}
+
+// Runs execute(), makes sure every link target has an outgoing queue
+// (unless it is recorded in external_local), then calls flush().
+void diy::Master::exchange()
+{
+  auto scoped = prof.scoped("exchange");
+  execute();
+
+  log->debug("Starting exchange");
+
+  // make sure there is a queue for each neighbor
+  for (int b = 0; b < (int)size(); ++b)
+  {
+    OutgoingQueuesRecord& out_qr = outgoing_[gid(b)];
+    Link*                 lnk    = link(b);
+    if (out_qr.queues.size() >= (size_t)lnk->size())
+      continue;                                     // already at least as many queues as link entries
+
+    for (unsigned j = 0; j < (unsigned)lnk->size(); ++j)
+      if (out_qr.external_local.find(lnk->target(j)) == out_qr.external_local.end())
+        out_qr.queues[lnk->target(j)];              // touch the outgoing queue, creating it if necessary
+  }
+
+  flush();
+  log->debug("Finished exchange");
+}
+
+namespace diy
+{
+namespace detail
+{
+  template <typename T>
+  struct VectorWindow       // pointer to a first element plus an element count
+  {
+    T *begin;
+    size_t count;
+  };
+} // namespace detail
+
+namespace mpi
+{
+namespace detail
+{
+  template<typename T>  struct is_mpi_datatype< diy::detail::VectorWindow<T> > { typedef true_type type; };
+  // Describes a VectorWindow<T> to the mpi layer as `count` elements of T's datatype starting at `begin`
+  template <typename T>
+  struct mpi_datatype< diy::detail::VectorWindow<T> >
+  {
+    typedef diy::detail::VectorWindow<T> VecWin;
+    static MPI_Datatype         datatype()                { return get_mpi_datatype<T>(); }
+    static const void*          address(const VecWin& x)  { return x.begin; }
+    static void*                address(VecWin& x)        { return x.begin; }
+    static int                  count(const VecWin& x)    { return static_cast<int>(x.count); }
+  };
+} // namespace detail
+} // namespace mpi
+
+} // namespace diy
+
+/* Communicator: comm_exchange() makes one pass -- it posts sends for blocks taken from to_send, then drains every message MPI already reports as pending */
+void
+diy::Master::
+comm_exchange(ToSendList& to_send, int out_queues_limit)
+{
+  static const size_t MAX_MPI_MESSAGE_COUNT = INT_MAX;    // larger buffers are split into pieces of at most this many bytes
+
+  IncomingRound &current_incoming = incoming_[exchange_round_];
+  // isend outgoing queues, up to the out_queues_limit
+  while(inflight_sends_.size() < (size_t)out_queues_limit && !to_send.empty())
+  {
+    int from = to_send.front();     // gid of the sending block; popped from to_send further down
+
+    // deal with external_local queues
+    for (OutQueueRecords::iterator it = outgoing_[from].external_local.begin(); it != outgoing_[from].external_local.end(); ++it)
+    {
+      int to = it->first.gid;
+
+      log->debug("Processing local queue: {} <- {} of size {}", to, from, it->second.size);
+
+      QueueRecord& in_qr  = current_incoming.map[to].records[from];
+      bool in_external  = block(lid(to)) == 0;      // destination block pointer is null (presumably the block is unloaded -- confirm)
+
+      if (in_external)
+          in_qr = it->second;                       // hand the existing record over unchanged
+      else
+      {
+          // load the queue
+          in_qr.size     = it->second.size;
+          in_qr.external = -1;                      // -1 marks the queue as held in memory
+
+          MemoryBuffer bb;
+          storage_->get(it->second.external, bb);
+
+          current_incoming.map[to].queues[from].swap(bb);
+      }
+      ++current_incoming.received;
+    }
+    outgoing_[from].external_local.clear();
+
+    if (outgoing_[from].external != -1)
+      load_outgoing(from);
+    to_send.pop_front();
+
+    OutgoingQueues& outgoing = outgoing_[from].queues;
+    for (OutgoingQueues::iterator it = outgoing.begin(); it != outgoing.end(); ++it)
+    {
+      BlockID to_proc = it->first;
+      int     to      = to_proc.gid;
+      int     proc    = to_proc.proc;
+
+      log->debug("Processing queue:      {} <- {} of size {}", to, from, outgoing_[from].queues[to_proc].size());
+
+      // There may be local outgoing queues that remained in memory
+      if (proc == comm_.rank())     // sending to ourselves: simply swap buffers
+      {
+        log->debug("Moving queue in-place: {} <- {}", to, from);
+
+        QueueRecord& in_qr  = current_incoming.map[to].records[from];
+        bool in_external  = block(lid(to)) == 0;    // same null-block test as for the external_local queues above
+        if (in_external)
+        {
+          log->debug("Unloading outgoing directly as incoming: {} <- {}", to, from);
+          MemoryBuffer& bb = it->second;
+          in_qr.size = bb.size();
+          if (queue_policy_->unload_incoming(*this, from, to, in_qr.size))
+            in_qr.external = storage_->put(bb);
+          else
+          {
+            MemoryBuffer& in_bb = current_incoming.map[to].queues[from];
+            in_bb.swap(bb);
+            in_bb.reset();
+            in_qr.external = -1;
+          }
+        } else        // !in_external
+        {
+          log->debug("Swapping in memory:    {} <- {}", to, from);
+          MemoryBuffer& bb = current_incoming.map[to].queues[from];
+          bb.swap(it->second);
+          bb.reset();
+          in_qr.size = bb.size();
+          in_qr.external = -1;
+        }
+
+        ++current_incoming.received;
+        continue;     // local delivery is complete; nothing goes through MPI
+      }
+
+      std::shared_ptr<MemoryBuffer> buffer = std::make_shared<MemoryBuffer>();
+      buffer->swap(it->second);     // take the queue's contents; the map entry keeps whatever `buffer` held before
+
+      MessageInfo info{from, to, exchange_round_};
+      if (buffer->size() <= (MAX_MPI_MESSAGE_COUNT - sizeof(info)))   // still fits in one message once info is appended
+      {
+        diy::save(*buffer, info);   // info goes after the payload; the receiver reads it with load_back
+
+        inflight_sends_.emplace_back();
+        inflight_sends_.back().info = info;
+        inflight_sends_.back().request = comm_.isend(proc, tags::queue, buffer->buffer);
+        inflight_sends_.back().message = buffer;
+      }
+      else
+      {
+        int npieces = static_cast<int>((buffer->size() + MAX_MPI_MESSAGE_COUNT - 1)/MAX_MPI_MESSAGE_COUNT);   // ceiling division
+
+        // first send the head
+        std::shared_ptr<MemoryBuffer> hb = std::make_shared<MemoryBuffer>();
+        diy::save(*hb, buffer->size());
+        diy::save(*hb, info);
+
+        inflight_sends_.emplace_back();
+        inflight_sends_.back().info = info;
+        inflight_sends_.back().request = comm_.isend(proc, tags::piece, hb->buffer);
+        inflight_sends_.back().message = hb;
+
+        // send the message pieces
+        size_t msg_buff_idx = 0;
+        for (int i = 0; i < npieces; ++i, msg_buff_idx += MAX_MPI_MESSAGE_COUNT)
+        {
+          int tag = (i == (npieces - 1)) ? tags::queue : tags::piece;   // the final piece carries tags::queue, which the receive side treats as "message complete"
+
+          detail::VectorWindow<char> window;
+          window.begin = &buffer->buffer[msg_buff_idx];
+          window.count = std::min(MAX_MPI_MESSAGE_COUNT, buffer->size() - msg_buff_idx);
+
+          inflight_sends_.emplace_back();
+          inflight_sends_.back().info = info;
+          inflight_sends_.back().request = comm_.isend(proc, tag, window);
+          inflight_sends_.back().message = buffer;    // every piece's record shares ownership of the one payload buffer
+        }
+      }
+    }
+  }
+
+  // kick requests
+  while(nudge());
+
+  // check incoming queues
+  mpi::optional<mpi::status> ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
+  while(ostatus)
+  {
+    InFlightRecv &ir = inflight_recvs_[ostatus->source()];    // receive state is tracked per source rank
+
+    if (ir.info.from == -1) // uninitialized
+    {
+      MemoryBuffer bb;
+      comm_.recv(ostatus->source(), ostatus->tag(), bb.buffer);
+
+      if (ostatus->tag() == tags::piece)    // head of a multi-piece message: total size followed by info
+      {
+        size_t msg_size;
+        diy::load(bb, msg_size);
+        diy::load(bb, ir.info);
+
+        ir.message.buffer.reserve(msg_size);
+      }
+      else // tags::queue
+      {
+        diy::load_back(bb, ir.info);        // whole message in one go; info sits at the end of the payload
+        ir.message.swap(bb);
+      }
+    }
+    else
+    {
+      // a further piece of a message whose head was already received: append it in place
+      size_t start_idx = ir.message.buffer.size();
+      size_t count = ostatus->count<char>();
+      ir.message.buffer.resize(start_idx + count);
+
+      detail::VectorWindow<char> window;
+      window.begin = &ir.message.buffer[start_idx];
+      window.count = count;
+
+      comm_.recv(ostatus->source(), ostatus->tag(), window);
+    }
+
+    if (ostatus->tag() == tags::queue)      // message complete (single message or last piece)
+    {
+      size_t size  = ir.message.size();
+      int from = ir.info.from;
+      int to = ir.info.to;
+      int external = -1;
+
+      assert(ir.info.round >= exchange_round_);
+      IncomingRound *in = &incoming_[ir.info.round];    // filed under the round recorded in the message, not necessarily the current one
+
+      bool unload_queue = ((ir.info.round == exchange_round_) ? (block(lid(to)) == 0) : (limit_ != -1)) &&
+                          queue_policy_->unload_incoming(*this, from, to, size);
+      if (unload_queue)
+      {
+        log->debug("Directly unloading queue {} <- {}", to, from);
+        external = storage_->put(ir.message); // unload directly
+      }
+      else
+      {
+        in->map[to].queues[from].swap(ir.message);
+        in->map[to].queues[from].reset();     // buffer position = 0
+      }
+      in->map[to].records[from] = QueueRecord(size, external);
+
+      ++(in->received);
+      ir = InFlightRecv(); // reset
+    }
+
+    ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
+  }
+}
+
+void
+diy::Master::
+flush()
+{
+#ifdef DEBUG
+  time_type start = get_time();
+  unsigned wait = 1;
+#endif
+  // Starts a new exchange round, then calls comm_exchange() until nothing is left to send, no send is in flight and `expected_` queues have been received.
+  // prepare for next round
+  incoming_.erase(exchange_round_);
+  ++exchange_round_;
+
+  // make a list of outgoing queues to send (the ones in memory come first)
+  ToSendList    to_send;
+  for (OutgoingQueuesMap::iterator it = outgoing_.begin(); it != outgoing_.end(); ++it)
+  {
+    OutgoingQueuesRecord& out = it->second;
+    if (out.external == -1)                 // -1: queues are in memory, so they go to the front
+        to_send.push_front(it->first);
+    else
+        to_send.push_back(it->first);
+  }
+  log->debug("to_send.size(): {}", to_send.size());
+
+  // XXX: we probably want a cleverer limit than block limit times average number of queues per block
+  // XXX: with queues we could easily maintain a specific space limit
+  int out_queues_limit;
+  if (limit_ == -1 || size() == 0)          // no limit configured, or no blocks to average over
+    out_queues_limit = to_send.size();
+  else
+    out_queues_limit = std::max((size_t) 1, to_send.size()/size()*limit_);      // average number of queues per block * in-memory block limit; NOTE(review): the integer division truncates before the multiply -- confirm intended
+
+  do
+  {
+    comm_exchange(to_send, out_queues_limit);
+
+#ifdef DEBUG
+    time_type cur = get_time();
+    if (cur - start > wait*1000)
+    {
+        log->warn("Waiting in flush [{}]: {} - {} out of {}",
+                  comm_.rank(), inflight_sends_.size(), incoming_[exchange_round_].received, expected_);
+        wait *= 2;        // double the threshold so the warning is emitted less and less often
+    }
+#endif
+  } while (!inflight_sends_.empty() || incoming_[exchange_round_].received < expected_ || !to_send.empty());
+
+  outgoing_.clear();
+
+  log->debug("Done in flush");
+  //show_incoming_records();
+
+  process_collectives();
+}
+
+void
+diy::Master::
+process_collectives()
+{
+  auto scoped = prof.scoped("collectives");
+  // Walks the queued collectives of all local blocks in lock-step, combining them through the first block's entry.
+  if (collectives_.empty())
+      return;
+
+  // One cursor per block, positioned at that block's first queued collective.
+  // NOTE(review): the loop below is bounded by the first block's list only, so every
+  // block is presumably expected to queue the same number of collectives -- confirm.
+  typedef       CollectivesList::iterator       CollectivesIterator;
+  std::vector<CollectivesIterator>  iters;
+  for (CollectivesMap::iterator cur = collectives_.begin(); cur != collectives_.end(); ++cur)
+    iters.push_back(cur->second.begin());
+
+  // `first` aliases iters[0]; iters is not resized past this point, so the reference stays valid.
+  for (CollectivesIterator& first = iters[0]; first != collectives_.begin()->second.end(); ++first)
+  {
+    // Fold every other block's entry into the first block's entry.
+    first->init();
+    for (size_t j = 1; j < iters.size(); ++j)
+    {
+      // NB: this assumes that the operations are commutative
+      first->update(*iters[j]);
+    }
+    first->global(comm_);           // do the mpi collective
+
+    // Copy the combined result back to every other block and advance its cursor.
+    for (size_t j = 1; j < iters.size(); ++j)
+    {
+      iters[j]->copy_from(*first);
+      ++iters[j];
+    }
+  }
+}
+
+bool
+diy::Master::
+nudge()
+{
+  // Tests every in-flight send once and drops the records whose request has completed;
+  // returns true if at least one record was dropped during this pass.
+  bool success = false;
+  InFlightSendsList::iterator it = inflight_sends_.begin();
+  while (it != inflight_sends_.end())
+  {
+    mpi::optional<mpi::status> ostatus = it->request.test();
+    if (!ostatus) { ++it; continue; }     // still pending: leave the record in place
+    // Continue from the iterator erase() returns: the previous `--it` stepped before
+    // begin() whenever the first record completed, which is undefined behaviour.
+    success = true;
+    it = inflight_sends_.erase(it);
+  }
+  return success;
+}
+
+void
+diy::Master::
+show_incoming_records() const     // debugging aid: logs every incoming queue record and in-memory queue size, for all rounds
+{
+  for (IncomingRoundMap::const_iterator rounds_itr = incoming_.begin(); rounds_itr != incoming_.end(); ++rounds_itr)
+  {
+    for (IncomingQueuesMap::const_iterator it = rounds_itr->second.map.begin(); it != rounds_itr->second.map.end(); ++it)
+    {
+      const IncomingQueuesRecords& in_qrs = it->second;
+      for (InQueueRecords::const_iterator cur = in_qrs.records.begin(); cur != in_qrs.records.end(); ++cur)
+      {
+        const QueueRecord& qr = cur->second;
+        log->info("round: {}, {} <- {}: (size,external) = ({},{})",
+                  rounds_itr->first,
+                  it->first, cur->first,
+                  qr.size,
+                  qr.external);
+      }
+      for (IncomingQueues::const_iterator cur = in_qrs.queues.begin(); cur != in_qrs.queues.end(); ++cur)
+      {
+        log->info("round: {}, {} <- {}: queue.size() = {}",
+                  rounds_itr->first,
+                  it->first, cur->first,
+                  const_cast<IncomingQueuesRecords&>(in_qrs).queues[cur->first].size());    // const_cast only to use operator[]; the key comes from this same map, so nothing is inserted
+      }
+    }
+  }
+}
+
+#endif
diff --git a/include/vtkmdiy/mpi.hpp b/include/vtkmdiy/mpi.hpp
new file mode 100644
index 000000000..28502002f
--- /dev/null
+++ b/include/vtkmdiy/mpi.hpp
@@ -0,0 +1,32 @@
+#ifndef DIY_MPI_HPP
+#define DIY_MPI_HPP
+
+#include <mpi.h>
+
+#include "mpi/constants.hpp"
+#include "mpi/datatypes.hpp"
+#include "mpi/optional.hpp"
+#include "mpi/status.hpp"
+#include "mpi/request.hpp"
+#include "mpi/point-to-point.hpp"
+#include "mpi/communicator.hpp"
+#include "mpi/collectives.hpp"
+#include "mpi/io.hpp"
+
+namespace diy
+{
+namespace mpi
+{
+
+//! \ingroup MPI
+struct environment      // calls MPI_Init on construction and MPI_Finalize on destruction
+{
+  environment()                           { int argc = 0; char** argv = nullptr; MPI_Init(&argc, &argv); }   // argv was uninitialized before; now a null argument vector
+  environment(int argc, char* argv[])     { MPI_Init(&argc, &argv); }     // forwards the program's command line to MPI
+  ~environment()                          { MPI_Finalize(); }
+};
+
+}
+}
+
+#endif
diff --git a/include/vtkmdiy/mpi/collectives.hpp b/include/vtkmdiy/mpi/collectives.hpp
new file mode 100644
index 000000000..8d70bcf01
--- /dev/null
+++ b/include/vtkmdiy/mpi/collectives.hpp
@@ -0,0 +1,328 @@
+#include <vector>
+
+#include "operations.hpp"
+
+namespace diy
+{
+namespace mpi
+{
+  //!\addtogroup MPI
+  //!@{
+
+  template<class T, class Op>
+  struct Collectives
+  {
+    typedef   detail::mpi_datatype<T>     Datatype;
+
+    static void broadcast(const communicator& comm, T& x, int root)
+    {
+      MPI_Bcast(Datatype::address(x),
+                Datatype::count(x),
+                Datatype::datatype(), root, comm);
+    }
+
+    static void broadcast(const communicator& comm, std::vector<T>& x, int root)   // broadcasts root's vector; other ranks are resized to match
+    {
+      size_t sz = x.size();                                     // only root's value survives the next call
+      Collectives<size_t, void*>::broadcast(comm, sz, root);
+      if (comm.rank() != root)
+          x.resize(sz);
+      if (sz == 0)                                              // every rank now holds the same sz, so all of them skip together;
+          return;                                               // this avoids evaluating x[0] on an empty vector
+      MPI_Bcast(Datatype::address(x[0]),
+                static_cast<int>(sz),                           // x.size() == sz on every rank at this point
+                Datatype::datatype(), root, comm);
+    }
+
+    static request ibroadcast(const communicator& comm, T& x, int root)
+    {
+      request r;
+      MPI_Ibcast(Datatype::address(x),
+                 Datatype::count(x),
+                 Datatype::datatype(), root, comm, &r.r);
+      return r;
+    }
+
+    static void gather(const communicator& comm, const T& in, std::vector<T>& out, int root)
+    {
+      size_t s  = comm.size();
+             s *= Datatype::count(in);
+      out.resize(s);
+      MPI_Gather(Datatype::address(const_cast<T&>(in)),
+                 Datatype::count(in),
+                 Datatype::datatype(),
+                 Datatype::address(out[0]),
+                 Datatype::count(in),
+                 Datatype::datatype(),
+                 root, comm);
+    }
+
+    static void gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out, int root)
+    {
+      std::vector<int>  counts(comm.size());
+      Collectives<int,void*>::gather(comm, (int) in.size(), counts, root);
+
+      std::vector<int>  offsets(comm.size(), 0);
+      for (unsigned i = 1; i < offsets.size(); ++i)
+        offsets[i] = offsets[i-1] + counts[i-1];
+
+      std::vector<T> buffer(offsets.back() + counts.back());
+      MPI_Gatherv(Datatype::address(const_cast<T&>(in[0])),
+                  in.size(),
+                  Datatype::datatype(),
+                  Datatype::address(buffer[0]),
+                  &counts[0],
+                  &offsets[0],
+                  Datatype::datatype(),
+                  root, comm);
+
+      out.resize(comm.size());
+      size_t cur = 0;
+      for (unsigned i = 0; i < (unsigned)comm.size(); ++i)
+      {
+          out[i].reserve(counts[i]);
+          for (unsigned j = 0; j < (unsigned)counts[i]; ++j)
+              out[i].push_back(buffer[cur++]);
+      }
+    }
+
+    static void gather(const communicator& comm, const T& in, int root)
+    {
+      MPI_Gather(Datatype::address(const_cast<T&>(in)),
+                 Datatype::count(in),
+                 Datatype::datatype(),
+                 Datatype::address(const_cast<T&>(in)),
+                 Datatype::count(in),
+                 Datatype::datatype(),
+                 root, comm);
+    }
+
+    static void gather(const communicator& comm, const std::vector<T>& in, int root)
+    {
+      Collectives<int,void*>::gather(comm, (int) in.size(), root);
+
+      MPI_Gatherv(Datatype::address(const_cast<T&>(in[0])),
+                  in.size(),
+                  Datatype::datatype(),
+                  0, 0, 0,
+                  Datatype::datatype(),
+                  root, comm);
+    }
+
+    static void all_gather(const communicator& comm, const T& in, std::vector<T>& out)
+    {
+      size_t s  = comm.size();
+             s *= Datatype::count(in);
+      out.resize(s);
+      MPI_Allgather(Datatype::address(const_cast<T&>(in)),
+                    Datatype::count(in),
+                    Datatype::datatype(),
+                    Datatype::address(out[0]),
+                    Datatype::count(in),
+                    Datatype::datatype(),
+                    comm);
+    }
+
+    static void all_gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out)
+    {
+      std::vector<int>  counts(comm.size());
+      Collectives<int,void*>::all_gather(comm, (int) in.size(), counts);
+
+      std::vector<int>  offsets(comm.size(), 0);
+      for (unsigned i = 1; i < offsets.size(); ++i)
+        offsets[i] = offsets[i-1] + counts[i-1];
+
+      std::vector<T> buffer(offsets.back() + counts.back());
+      MPI_Allgatherv(Datatype::address(const_cast<T&>(in[0])),
+                     in.size(),
+                     Datatype::datatype(),
+                     Datatype::address(buffer[0]),
+                     &counts[0],
+                     &offsets[0],
+                     Datatype::datatype(),
+                     comm);
+
+      out.resize(comm.size());
+      size_t cur = 0;
+      for (int i = 0; i < comm.size(); ++i)
+      {
+          out[i].reserve(counts[i]);
+          for (int j = 0; j < counts[i]; ++j)
+              out[i].push_back(buffer[cur++]);
+      }
+    }
+
+    static void reduce(const communicator& comm, const T& in, T& out, int root, const Op&)
+    {
+      MPI_Reduce(Datatype::address(const_cast<T&>(in)),
+                 Datatype::address(out),
+                 Datatype::count(in),
+                 Datatype::datatype(),
+                 detail::mpi_op<Op>::get(),
+                 root, comm);
+    }
+
+    static void reduce(const communicator& comm, const T& in, int root, const Op& op)
+    {
+      MPI_Reduce(Datatype::address(const_cast<T&>(in)),
+                 Datatype::address(const_cast<T&>(in)),
+                 Datatype::count(in),
+                 Datatype::datatype(),
+                 detail::mpi_op<Op>::get(),
+                 root, comm);
+    }
+
+    static void all_reduce(const communicator& comm, const T& in, T& out, const Op&)
+    {
+      MPI_Allreduce(Datatype::address(const_cast<T&>(in)),
+                    Datatype::address(out),
+                    Datatype::count(in),
+                    Datatype::datatype(),
+                    detail::mpi_op<Op>::get(),
+                    comm);
+    }
+
+    static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op&)
+    {
+      out.resize(in.size());
+      MPI_Allreduce(Datatype::address(const_cast<T&>(in[0])),
+                    Datatype::address(out[0]),
+                    in.size(),
+                    Datatype::datatype(),
+                    detail::mpi_op<Op>::get(),
+                    comm);
+    }
+
+    static void scan(const communicator& comm, const T& in, T& out, const Op&)
+    {
+      MPI_Scan(Datatype::address(const_cast<T&>(in)),
+               Datatype::address(out),
+               Datatype::count(in),
+               Datatype::datatype(),
+               detail::mpi_op<Op>::get(),
+               comm);
+    }
+
+    static void all_to_all(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, int n = 1)
+    {
+      // NB: this will fail if T is a vector
+      MPI_Alltoall(Datatype::address(const_cast<T&>(in[0])), n,
+                   Datatype::datatype(),
+                   Datatype::address(out[0]), n,
+                   Datatype::datatype(),
+                   comm);
+    }
+  };
+
+  //! Broadcast to all processes in `comm`.
+  template<class T>
+  void      broadcast(const communicator& comm, T& x, int root)
+  {
+    Collectives<T,void*>::broadcast(comm, x, root);
+  }
+
+  //! Broadcast for vectors
+  template<class T>
+  void      broadcast(const communicator& comm, std::vector<T>& x, int root)
+  {
+    Collectives<T,void*>::broadcast(comm, x, root);
+  }
+
+  //! iBroadcast to all processes in `comm`.
+  template<class T>
+  request   ibroadcast(const communicator& comm, T& x, int root)
+  {
+    return Collectives<T,void*>::ibroadcast(comm, x, root);
+  }
+
+  //! Gather from all processes in `comm`.
+  //!  On `root` process, `out` is resized to `comm.size()` and filled with
+  //! elements from the respective ranks.
+  template<class T>
+  void      gather(const communicator& comm, const T& in, std::vector<T>& out, int root)
+  {
+    Collectives<T,void*>::gather(comm, in, out, root);
+  }
+
+  //! Same as above, but for vectors.
+  template<class T>
+  void      gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out, int root)
+  {
+    Collectives<T,void*>::gather(comm, in, out, root);
+  }
+
+  //! Simplified version (without `out`) for use on non-root processes.
+  template<class T>
+  void      gather(const communicator& comm, const T& in, int root)
+  {
+    Collectives<T,void*>::gather(comm, in, root);
+  }
+
+  //! Simplified version (without `out`) for use on non-root processes.
+  template<class T>
+  void      gather(const communicator& comm, const std::vector<T>& in, int root)
+  {
+    Collectives<T,void*>::gather(comm, in, root);
+  }
+
+  //! all_gather from all processes in `comm`.
+  //! `out` is resized to `comm.size()` and filled with
+  //! elements from the respective ranks.
+  template<class T>
+  void      all_gather(const communicator& comm, const T& in, std::vector<T>& out)
+  {
+    Collectives<T,void*>::all_gather(comm, in, out);
+  }
+
+  //! Same as above, but for vectors.
+  template<class T>
+  void      all_gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out)
+  {
+    Collectives<T,void*>::all_gather(comm, in, out);
+  }
+
+  //! Reduce `in` from all processes in `comm` with `op` (MPI_Reduce); the result is stored in `out` on `root`.
+  template<class T, class Op>
+  void      reduce(const communicator& comm, const T& in, T& out, int root, const Op& op)
+  {
+    Collectives<T, Op>::reduce(comm, in, out, root, op);
+  }
+
+  //! Simplified version (without `out`) for use on non-root processes.
+  template<class T, class Op>
+  void      reduce(const communicator& comm, const T& in, int root, const Op& op)
+  {
+    Collectives<T, Op>::reduce(comm, in, root, op);
+  }
+
+  //! Reduce `in` from all processes in `comm` with `op` (MPI_Allreduce); every process gets the result in `out`.
+  template<class T, class Op>
+  void      all_reduce(const communicator& comm, const T& in, T& out, const Op& op)
+  {
+    Collectives<T, Op>::all_reduce(comm, in, out, op);
+  }
+
+  //! Same as above, but for vectors.
+  template<class T, class Op>
+  void      all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op& op)
+  {
+    Collectives<T, Op>::all_reduce(comm, in, out, op);
+  }
+
+  //! Prefix reduction over the ranks of `comm` with `op` (MPI_Scan); each process gets its partial result in `out`.
+  template<class T, class Op>
+  void      scan(const communicator& comm, const T& in, T& out, const Op& op)
+  {
+    Collectives<T, Op>::scan(comm, in, out, op);
+  }
+
+  //! Exchange `n` elements with every process in `comm` (MPI_Alltoall); `out` is not resized here, so the caller must size it beforehand.
+  template<class T>
+  void      all_to_all(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, int n = 1)
+  {
+    Collectives<T, void*>::all_to_all(comm, in, out, n);
+  }
+
+  //!@}
+}
+}
diff --git a/include/vtkmdiy/mpi/communicator.hpp b/include/vtkmdiy/mpi/communicator.hpp
new file mode 100644
index 000000000..c29b6d033
--- /dev/null
+++ b/include/vtkmdiy/mpi/communicator.hpp
@@ -0,0 +1,72 @@
+namespace diy
+{
+namespace mpi
+{
+
+  //! \ingroup MPI
+  //! Simple wrapper around `MPI_Comm`; rank and size are queried once at construction and cached.
+  class communicator
+  {
+    public:
+                communicator(MPI_Comm comm = MPI_COMM_WORLD):
+                  comm_(comm), rank_(0), size_(1)   { if (comm != MPI_COMM_NULL) { MPI_Comm_rank(comm_, &rank_); MPI_Comm_size(comm_, &size_); } }
+
+      int       rank() const                        { return rank_; }
+      int       size() const                        { return size_; }
+
+      //void      send(int dest,
+      //               int tag,
+      //               const void* buf,
+      //               MPI_Datatype datatype) const   { }
+
+      //! Send `x` to processor `dest` using `tag` (blocking).
+      template<class T>
+      void      send(int dest, int tag, const T& x) const   { detail::send<T>()(comm_, dest, tag, x); }
+
+      //! Receive `x` from `source` using `tag` (blocking).
+      //! If `T` is an `std::vector<...>`, `recv` will resize it to fit exactly the sent number of values.
+      template<class T>
+      status    recv(int source, int tag, T& x) const       { return detail::recv<T>()(comm_, source, tag, x); }
+
+      //! Non-blocking version of `send()`.
+      template<class T>
+      request   isend(int dest, int tag, const T& x) const  { return detail::isend<T>()(comm_, dest, tag, x); }
+
+      //! Non-blocking version of `recv()`.
+      //! If `T` is an `std::vector<...>`, its size must be big enough to accommodate the sent values.
+      template<class T>
+      request   irecv(int source, int tag, T& x) const      { return detail::irecv<T>()(comm_, source, tag, x); }
+
+      //! Blocking probe (MPI_Probe): returns the status of a message matching `source` and `tag`.
+      status    probe(int source, int tag) const            { status s; MPI_Probe(source, tag, comm_, &s.s); return s; }
+
+      //! Non-blocking probe: the status of a matching message if one is pending, an empty optional otherwise.
+      inline
+      optional<status>
+                iprobe(int source, int tag) const;
+
+      //! Barrier over all processes of the communicator (MPI_Barrier).
+      void      barrier() const                             { MPI_Barrier(comm_); }
+
+                operator MPI_Comm() const                   { return comm_; }   // lets a communicator be passed where an MPI_Comm is expected
+
+    private:
+      MPI_Comm  comm_;    // the wrapped handle, stored as given
+      int       rank_;    // stays 0 when constructed from MPI_COMM_NULL
+      int       size_;    // stays 1 when constructed from MPI_COMM_NULL
+  };
+}
+}
+
+diy::mpi::optional<diy::mpi::status>
+diy::mpi::communicator::
+iprobe(int source, int tag) const
+{
+  status    probed;
+  int       pending;        // written by MPI_Iprobe: non-zero when a matching message is waiting
+  MPI_Iprobe(source, tag, comm_, &pending, &probed.s);
+  if (!pending)
+    return optional<status>();    // nothing matched: empty optional
+  return probed;
+}
+
diff --git a/include/vtkmdiy/mpi/constants.hpp b/include/vtkmdiy/mpi/constants.hpp
new file mode 100644
index 000000000..7668e418f
--- /dev/null
+++ b/include/vtkmdiy/mpi/constants.hpp
@@ -0,0 +1,13 @@
+#ifndef DIY_MPI_CONSTANTS_HPP
+#define DIY_MPI_CONSTANTS_HPP
+
+namespace diy
+{
+namespace mpi
+{
+  const int any_source  = MPI_ANY_SOURCE;   // wildcard for the `source` argument of recv/probe calls
+  const int any_tag     = MPI_ANY_TAG;      // wildcard for the `tag` argument of recv/probe calls
+}
+}
+
+#endif
diff --git a/include/vtkmdiy/mpi/datatypes.hpp b/include/vtkmdiy/mpi/datatypes.hpp
new file mode 100644
index 000000000..7d8e3a448
--- /dev/null
+++ b/include/vtkmdiy/mpi/datatypes.hpp
@@ -0,0 +1,63 @@
+#ifndef DIY_MPI_DATATYPES_HPP
+#define DIY_MPI_DATATYPES_HPP
+
+#include <vector>
+
+namespace diy
+{
+namespace mpi
+{
+namespace detail
+{
+  template<class T> MPI_Datatype  get_mpi_datatype();
+
+  struct true_type  {};
+  struct false_type {};
+
+  /* is_mpi_datatype */
+  template<class T>
+  struct is_mpi_datatype        { typedef false_type    type; };
+
+#define DIY_MPI_DATATYPE_MAP(cpp_type, mpi_type) \
+  template<>  inline MPI_Datatype  get_mpi_datatype<cpp_type>() { return mpi_type; }  \
+  template<>  struct is_mpi_datatype<cpp_type>                  { typedef true_type type; };    \
+  template<>  struct is_mpi_datatype< std::vector<cpp_type> >   { typedef true_type type; };
+
+  DIY_MPI_DATATYPE_MAP(char,                  MPI_BYTE);
+  DIY_MPI_DATATYPE_MAP(unsigned char,         MPI_BYTE);
+  DIY_MPI_DATATYPE_MAP(bool,                  MPI_BYTE);
+  DIY_MPI_DATATYPE_MAP(int,                   MPI_INT);
+  DIY_MPI_DATATYPE_MAP(unsigned,              MPI_UNSIGNED);
+  DIY_MPI_DATATYPE_MAP(long,                  MPI_LONG);
+  DIY_MPI_DATATYPE_MAP(unsigned long,         MPI_UNSIGNED_LONG);
+  DIY_MPI_DATATYPE_MAP(long long,             MPI_LONG_LONG_INT);
+  DIY_MPI_DATATYPE_MAP(unsigned long long,    MPI_UNSIGNED_LONG_LONG);
+  DIY_MPI_DATATYPE_MAP(float,                 MPI_FLOAT);
+  DIY_MPI_DATATYPE_MAP(double,                MPI_DOUBLE);
+
+  /* mpi_datatype: maps a value to the (datatype, address, count) triple handed to MPI; this primary template treats T as a single element, std::vector<...> is specialized below */
+  template<class T>
+  struct mpi_datatype
+  {
+    static MPI_Datatype         datatype()              { return get_mpi_datatype<T>(); }
+    static const void*          address(const T& x)     { return &x; }
+    static void*                address(T& x)           { return &x; }
+    static int                  count(const T& x)       { return 1; }       // x is unused: a single value is always one element
+  };
+
+  template<class U>
+  struct mpi_datatype< std::vector<U> >       // a vector is passed to MPI as x.size() elements of U's datatype
+  {
+    typedef     std::vector<U>      VecU;
+
+    static MPI_Datatype         datatype()              { return get_mpi_datatype<U>(); }
+    static const void*          address(const VecU& x)  { return x.data(); }                      // data() is well-defined for an empty vector, &x[0] is not
+    static void*                address(VecU& x)        { return x.data(); }
+    static int                  count(const VecU& x)    { return static_cast<int>(x.size()); }    // explicit narrowing, as in the VectorWindow specialization
+  };
+
+}
+}
+}
+
+#endif
diff --git a/include/vtkmdiy/mpi/io.hpp b/include/vtkmdiy/mpi/io.hpp
new file mode 100644
index 000000000..ebe6a2e17
--- /dev/null
+++ b/include/vtkmdiy/mpi/io.hpp
@@ -0,0 +1,137 @@
+#ifndef DIY_MPI_IO_HPP
+#define DIY_MPI_IO_HPP
+
+#include <vector>
+#include <string>
+
+namespace diy
+{
+namespace mpi
+{
+namespace io
+{
+  typedef               MPI_Offset              offset;
+
+  //! Wraps MPI file IO. \ingroup MPI
+  class file
+  {
+    public:
+      enum    // open-mode flags, each mapped to the MPI_MODE_* constant of the same meaning
+      {
+        rdonly          = MPI_MODE_RDONLY,
+        rdwr            = MPI_MODE_RDWR,
+        wronly          = MPI_MODE_WRONLY,
+        create          = MPI_MODE_CREATE,
+        exclusive       = MPI_MODE_EXCL,
+        delete_on_close = MPI_MODE_DELETE_ON_CLOSE,
+        unique_open     = MPI_MODE_UNIQUE_OPEN,
+        sequential      = MPI_MODE_SEQUENTIAL,
+        append          = MPI_MODE_APPEND
+      };
+
+    public:
+                    file(const communicator&    comm,       // opens `filename` on `comm` with `mode`; the result of MPI_File_open is not checked
+                         const std::string&     filename,
+                         int                    mode):
+                        comm_(comm)                         { MPI_File_open(comm, const_cast<char*>(filename.c_str()), mode, MPI_INFO_NULL, &fh); }
+                    ~file()                                 { close(); }
+      void          close()                                 { if (fh != MPI_FILE_NULL) MPI_File_close(&fh); }
+
+      offset        size() const                            { offset sz; MPI_File_get_size(fh, &sz); return sz; }
+      void          resize(offset size)                     { MPI_File_set_size(fh, size); }
+
+      inline void   read_at(offset o, char* buffer, size_t size);               // `size` bytes at offset `o`
+      inline void   read_at_all(offset o, char* buffer, size_t size);           // same, via MPI_File_read_at_all
+      inline void   write_at(offset o, const char* buffer, size_t size);
+      inline void   write_at_all(offset o, const char* buffer, size_t size);    // same, via MPI_File_write_at_all
+
+      template<class T>
+      inline void   read_at(offset o, std::vector<T>& data);            // vector overloads transfer data.size()*sizeof(T) bytes
+
+      template<class T>
+      inline void   read_at_all(offset o, std::vector<T>& data);
+
+      template<class T>
+      inline void   write_at(offset o, const std::vector<T>& data);
+
+      template<class T>
+      inline void   write_at_all(offset o, const std::vector<T>& data);
+
+      const communicator&
+                    comm() const                            { return comm_; }
+
+      MPI_File&     handle()                                { return fh; }
+
+    private:
+      const communicator&   comm_;    // a reference, not a copy: the communicator given to the constructor must outlive this object
+      MPI_File              fh;       // written only by MPI_File_open / MPI_File_close
+  };
+}
+}
+}
+
+void
+diy::mpi::io::file::
+read_at(offset o, char* buffer, size_t size)
+{
+  status s;
+  MPI_File_read_at(fh, o, buffer, size, detail::get_mpi_datatype<char>(), &s.s);
+}
+
+template<class T>
+void
+diy::mpi::io::file::
+read_at(offset o, std::vector<T>& data)     // fills the already-sized `data` with data.size()*sizeof(T) bytes read at offset `o`
+{
+  read_at(o, reinterpret_cast<char*>(data.data()), data.size()*sizeof(T));    // byte cast (as read_at_all does): without it no overload matches for T != char
+}
+
+void
+diy::mpi::io::file::
+read_at_all(offset o, char* buffer, size_t size)
+{
+  status s;
+  MPI_File_read_at_all(fh, o, buffer, size, detail::get_mpi_datatype<char>(), &s.s);
+}
+
+template<class T>
+void
+diy::mpi::io::file::
+read_at_all(offset o, std::vector<T>& data)
+{
+  read_at_all(o, (char*) &data[0], data.size()*sizeof(T));
+}
+
+void
+diy::mpi::io::file::
+write_at(offset o, const char* buffer, size_t size)
+{
+  status s;
+  MPI_File_write_at(fh, o, (void *)buffer, size, detail::get_mpi_datatype<char>(), &s.s);
+}
+
+template<class T>
+void
+diy::mpi::io::file::
+write_at(offset o, const std::vector<T>& data)
+{
+  write_at(o, (const char*) &data[0], data.size()*sizeof(T));
+}
+
+void
+diy::mpi::io::file::
+write_at_all(offset o, const char* buffer, size_t size)
+{
+  status s;
+  MPI_File_write_at_all(fh, o, (void *)buffer, size, detail::get_mpi_datatype<char>(), &s.s);
+}
+
+template<class T>
+void
+diy::mpi::io::file::
+write_at_all(offset o, const std::vector<T>& data)      // writes data.size()*sizeof(T) bytes of `data` at offset `o` through the char* overload
+{
+  write_at_all(o, reinterpret_cast<const char*>(data.data()), data.size()*sizeof(T));   // byte cast (as write_at does): without it no overload matches for T != char
+}
+
+#endif
diff --git a/include/vtkmdiy/mpi/operations.hpp b/include/vtkmdiy/mpi/operations.hpp
new file mode 100644
index 000000000..2f95c0a72
--- /dev/null
+++ b/include/vtkmdiy/mpi/operations.hpp
@@ -0,0 +1,26 @@
+#include <functional>
+
+namespace diy
+{
+namespace mpi
+{
+  //! \addtogroup MPI
+  //!@{
+  template<class U>   // Functor returning the larger argument (x when neither is less); same result as std::max, without needing <algorithm>.
+  struct maximum { const U& operator()(const U& x, const U& y) const { return (x < y) ? y : x; } };
+  template<class U>   // Functor returning the smaller argument (x when neither is less); same result as std::min, without needing <algorithm>.
+  struct minimum { const U& operator()(const U& x, const U& y) const { return (y < x) ? y : x; } };
+  //!@}
+
+namespace detail
+{ // mpi_op<Op>::get() maps a binary functor type to the matching predefined MPI_Op constant.
+  template<class T> struct mpi_op                           { static MPI_Op  get(); };   // primary template: get() is declared only, no definition here
+  template<class U> struct mpi_op< maximum<U> >             { static MPI_Op  get() { return MPI_MAX; }  };   // maximum      -> MPI_MAX
+  template<class U> struct mpi_op< minimum<U> >             { static MPI_Op  get() { return MPI_MIN; }  };   // minimum      -> MPI_MIN
+  template<class U> struct mpi_op< std::plus<U> >           { static MPI_Op  get() { return MPI_SUM; }  };   // plus         -> MPI_SUM
+  template<class U> struct mpi_op< std::multiplies<U> >     { static MPI_Op  get() { return MPI_PROD; }  };  // multiplies   -> MPI_PROD
+  template<class U> struct mpi_op< std::logical_and<U> >    { static MPI_Op  get() { return MPI_LAND; }  };  // logical_and  -> MPI_LAND
+  template<class U> struct mpi_op< std::logical_or<U> >     { static MPI_Op  get() { return MPI_LOR; }  };   // logical_or   -> MPI_LOR
+}
+}
+}
diff --git a/include/vtkmdiy/mpi/optional.hpp b/include/vtkmdiy/mpi/optional.hpp
new file mode 100644
index 000000000..ab58aaf81
--- /dev/null
+++ b/include/vtkmdiy/mpi/optional.hpp
@@ -0,0 +1,55 @@
+namespace diy
+{
+namespace mpi
+{
+  template<class T>   // Minimal optional: in-place storage for at most one T plus a flag saying whether it is held.
+  struct optional
+  {
+                optional():
+                  init_(false)                  {}
+                // stores a copy of v
+                optional(const T& v):
+                  init_(true)                   { new(buf_) T(v); }
+                // copies o's value only if o holds one
+                optional(const optional& o):
+                  init_(o.init_)                { if (init_) new(buf_) T(*o);  }
+                // destroys the held value, if any
+                ~optional()                     { if (init_) clear(); }
+
+    inline
+    optional&   operator=(const optional& o);
+                // true when a value is held
+                operator bool() const           { return init_; }
+                // unchecked access: only valid while a value is held
+    T&          operator*()                     { return *static_cast<T*>(address()); }
+    const T&    operator*() const               { return *static_cast<const T*>(address()); }
+
+    T*          operator->()                    { return &(operator*()); }
+    const T*    operator->() const              { return &(operator*()); }
+
+    private:
+      void      clear()                         { static_cast<T*>(address())->~T(); }
+
+      void*         address()                   { return buf_; }
+      const void*   address() const             { return buf_; }
+
+    private:
+      bool init_;
+      alignas(T) char buf_[sizeof(T)];          // aligned for T; a plain char array after a bool is not suitably aligned for placement-new
+  };
+}
+}
+
+template<class T>   // Copy-assignment: drops the current value (if any), then copies o's value when o holds one.
+diy::mpi::optional<T>&
+diy::mpi::optional<T>::
+operator=(const optional& o)
+{
+  if (this == &o)               // self-assignment: clearing first would destroy the value we are about to copy
+    return *this;
+  if (init_) clear();
+  init_ = false;                // stay empty if T's copy constructor throws below
+  if (o.init_) { new (buf_) T(*o); init_ = true; }
+
+  return *this;
+}
diff --git a/include/vtkmdiy/mpi/point-to-point.hpp b/include/vtkmdiy/mpi/point-to-point.hpp
new file mode 100644
index 000000000..dc8a341dc
--- /dev/null
+++ b/include/vtkmdiy/mpi/point-to-point.hpp
@@ -0,0 +1,98 @@
+#include <vector>
+
+namespace diy
+{
+namespace mpi
+{
+namespace detail
+{
+  // send: functor wrapping MPI_Send; only the specialization for MPI datatypes (true_type) is defined here
+  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
+  struct send;
+
+  template<class T>
+  struct send<T, true_type>
+  {
+    // Hands x's address, element count and datatype (as reported by mpi_datatype<T>) to MPI_Send.
+    void operator()(MPI_Comm comm, int dest, int tag, const T& x) const
+    {
+      using Traits = mpi_datatype<T>;
+      void* const payload = (void*) Traits::address(x);
+      MPI_Send(payload, Traits::count(x), Traits::datatype(),
+               dest, tag, comm);
+    }
+  };
+
+  // recv: functor wrapping MPI_Recv; only the specialization for MPI datatypes (true_type) is defined here
+  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
+  struct recv;
+
+  template<class T>
+  struct recv<T, true_type>
+  {
+    // Receives into x using the address/count/datatype reported by mpi_datatype<T>; returns the status MPI_Recv filled in.
+    status operator()(MPI_Comm comm, int source, int tag, T& x) const
+    {
+      using Traits = mpi_datatype<T>;
+      status result;
+      void* const buffer = (void*) Traits::address(x);
+      MPI_Recv(buffer, Traits::count(x), Traits::datatype(),
+               source, tag, comm, &result.s);
+      return result;
+    }
+  };
+
+  template<class U>   // recv into a std::vector: the vector is resized to the element count of the pending message
+  struct recv<std::vector<U>, true_type>
+  {
+    status operator()(MPI_Comm comm, int source, int tag, std::vector<U>& x) const
+    {
+      status s;
+      MPI_Probe(source, tag, comm, &s.s);           // learn the size and the actual source/tag of the pending message
+      x.resize(s.count<U>());
+      // receive exactly the probed message, even if source/tag were wildcards; data() is valid for an empty vector
+      MPI_Recv(x.data(), static_cast<int>(x.size()), get_mpi_datatype<U>(), s.source(), s.tag(), comm, &s.s);
+      return s;
+    }
+  };
+
+  // isend: functor wrapping MPI_Isend; only the specialization for MPI datatypes (true_type) is defined here
+  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
+  struct isend;
+
+  template<class T>
+  struct isend<T, true_type>
+  {
+    // Starts the send of x and returns the request that MPI_Isend filled in.
+    request operator()(MPI_Comm comm, int dest, int tag, const T& x) const
+    {
+      using Traits = mpi_datatype<T>;
+      request pending;
+      void* const payload = (void*) Traits::address(x);
+      MPI_Isend(payload, Traits::count(x), Traits::datatype(),
+                dest, tag, comm, &pending.r);
+      return pending;
+    }
+  };
+
+  // irecv: functor wrapping MPI_Irecv; only the specialization for MPI datatypes (true_type) is defined here
+  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
+  struct irecv;
+
+  template<class T>
+  struct irecv<T, true_type>
+  {
+    // Starts the receive into x and returns the request that MPI_Irecv filled in.
+    request operator()(MPI_Comm comm, int source, int tag, T& x) const
+    {
+      using Traits = mpi_datatype<T>;
+      request pending;
+      MPI_Irecv(Traits::address(x),
+                Traits::count(x), Traits::datatype(),
+                source, tag, comm, &pending.r);
+      return pending;
+    }
+  };
+}
+}
+}
diff --git a/include/vtkmdiy/mpi/request.hpp b/include/vtkmdiy/mpi/request.hpp
new file mode 100644
index 000000000..23b11816e
--- /dev/null
+++ b/include/vtkmdiy/mpi/request.hpp
@@ -0,0 +1,26 @@
+namespace diy
+{
+namespace mpi
+{
+  struct request   // Thin wrapper around an MPI_Request handle (member r).
+  {
+    // Calls MPI_Wait on r and returns the resulting status.
+    status              wait()              { status done; MPI_Wait(&r, &done.s); return done; }
+    inline optional<status>    test();      // defined out of class
+    // Calls MPI_Cancel on r.
+    void                cancel()            { MPI_Cancel(&r); }
+    MPI_Request         r;
+  };
+}
+}
+
+diy::mpi::optional<diy::mpi::status>
+diy::mpi::request::test()
+{ // Calls MPI_Test once: yields the status when the completion flag is set, otherwise an empty optional.
+  int completed;
+  status result;
+  MPI_Test(&r, &completed, &result.s);
+  if (!completed)
+    return optional<status>();
+  return result;
+}
diff --git a/include/vtkmdiy/mpi/status.hpp b/include/vtkmdiy/mpi/status.hpp
new file mode 100644
index 000000000..aab500c31
--- /dev/null
+++ b/include/vtkmdiy/mpi/status.hpp
@@ -0,0 +1,30 @@
+namespace diy
+{
+namespace mpi
+{
+  struct status   // Wrapper over MPI_Status exposing its fields and conversions to the raw struct.
+  {
+    int             source() const          { return s.MPI_SOURCE; }
+    int             tag() const             { return s.MPI_TAG; }
+    int             error() const           { return s.MPI_ERROR; }
+    // const_cast: s is const inside this member, while a plain MPI_Status* is handed to the call
+    bool            cancelled() const       { int was_cancelled; MPI_Test_cancelled(const_cast<MPI_Status*>(&s), &was_cancelled); return was_cancelled; }
+    template<class T>
+    int             count() const;          // defined out of class
+
+                    operator MPI_Status&()              { return s; }
+                    operator const MPI_Status&() const  { return s; }
+
+    MPI_Status      s;
+  };
+}
+}
+
+template<class T>   // Element count of this status, as reported by MPI_Get_count for T's MPI datatype.
+int
+diy::mpi::status::count() const
+{
+  int n_elements;
+  MPI_Get_count(const_cast<MPI_Status*>(&s), detail::get_mpi_datatype<T>(), &n_elements);
+  return n_elements;
+}
diff --git a/include/vtkmdiy/no-thread.hpp b/include/vtkmdiy/no-thread.hpp
new file mode 100644
index 000000000..fd7af88ae
--- /dev/null
+++ b/include/vtkmdiy/no-thread.hpp
@@ -0,0 +1,38 @@
+#ifndef DIY_NO_THREAD_HPP
+#define DIY_NO_THREAD_HPP
+
+// replicates only the parts of the threading interface that we use
+// executes everything in a single thread
+
+namespace diy
+{
+  struct thread
+  {
+                        thread(void (*f)(void *), void* args):
+                            f_(f), args_(args)                    {}
+    // the stored function is not run at construction; it runs in the calling thread when join() is called
+    void                join()                                    { f_(args_); }
+
+    static unsigned     hardware_concurrency()                    { return 1; }
+
+    void (*f_)(void*);          // function to run
+    void*   args_;              // argument passed to f_
+  };
+  // empty placeholder lock types
+  struct mutex {};
+  struct fast_mutex {};
+  struct recursive_mutex {};
+
+  template<class T>
+  struct lock_guard
+  {
+      lock_guard(T&)        {}  // takes the lock object and does nothing with it
+  };
+
+  namespace this_thread
+  {
+      inline unsigned long int  get_id()    { return 0; }   // always 0
+  }
+}
+
+#endif
diff --git a/include/vtkmdiy/partners/all-reduce.hpp b/include/vtkmdiy/partners/all-reduce.hpp
new file mode 100644
index 000000000..e34066595
--- /dev/null
+++ b/include/vtkmdiy/partners/all-reduce.hpp
@@ -0,0 +1,72 @@
+#ifndef DIY_PARTNERS_ALL_REDUCE_HPP
+#define DIY_PARTNERS_ALL_REDUCE_HPP
+
+#include "merge.hpp"
+
+namespace diy
+{
+
+class Master;
+
+//! Allreduce (reduction with results broadcasted to all blocks) is
+//! implemented as two merge reductions, with incoming and outgoing items swapped in second one.
+//! Ie, follows merge reduction up and down the merge tree
+
+/**
+ * \ingroup Communication
+ * \brief Partners for all-reduce
+ *
+ */
+struct RegularAllReducePartners: public RegularMergePartners
+{
+  using Parent = RegularMergePartners;                                  //!< base class merge reduction
+
+                //! contiguous selects how partners are matched: contiguously or round-robin;
+                //! contiguous matching suits data that needs to be united,
+                //! round-robin matching suits vector-"halving"
+  template<class Decomposer>
+                RegularAllReducePartners(const Decomposer& decomposer,  //!< domain decomposition
+                                         int k,                         //!< target k value
+                                         bool contiguous = true         //!< distance doubling (true) or halving (false)
+                    ):
+                  Parent(decomposer, k, contiguous)         {}
+                RegularAllReducePartners(const DivisionVector&   divs,//!< explicit division vector
+                                         const KVSVector&        kvs, //!< explicit k vector
+                                         bool  contiguous = true      //!< distance doubling (true) or halving (false)
+                    ):
+                  Parent(divs, kvs, contiguous)               {}
+
+  //! total number of rounds: twice the rounds of the underlying merge
+  size_t        rounds() const                                  { return Parent::rounds() * 2; }
+  //! group size of the merge round that `round` maps to
+  int           size(int round) const                           { return Parent::size(parent_round(round)); }
+  //! dimension (direction of partners in a regular grid) of the merge round that `round` maps to
+  int           dim(int round) const                            { return Parent::dim(parent_round(round)); }
+  //! whether gid is still active in the merge round that `round` maps to
+  inline bool   active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); }
+  //! maps an all-reduce round to a merge round: unchanged in the first half, rounds() - round afterwards
+  int           parent_round(int round) const                   { return round >= (int) Parent::rounds() ? rounds() - round : round; }
+
+  // incoming is only valid for an active gid; it will only be called with an active gid
+  inline void   incoming(int round, int gid, std::vector<int>& partners, const Master& m) const
+  {
+      if (round > (int) Parent::rounds())
+          Parent::outgoing(parent_round(round), gid, partners, m);
+      else
+          Parent::incoming(round, gid, partners, m);
+  }
+
+  inline void   outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const
+  {
+      if (round >= (int) Parent::rounds())
+          Parent::incoming(parent_round(round), gid, partners, m);
+      else
+          Parent::outgoing(round, gid, partners, m);
+  }
+};
+
+} // diy
+
+#endif
+
+
diff --git a/include/vtkmdiy/partners/broadcast.hpp b/include/vtkmdiy/partners/broadcast.hpp
new file mode 100644
index 000000000..d3f565f82
--- /dev/null
+++ b/include/vtkmdiy/partners/broadcast.hpp
@@ -0,0 +1,62 @@
+#ifndef DIY_PARTNERS_BROADCAST_HPP
+#define DIY_PARTNERS_BROADCAST_HPP
+
+#include "merge.hpp"
+
+namespace diy
+{
+
+class Master;
+
+/**
+ * \ingroup Communication
+ * \brief Partners for broadcast
+ *
+ */
+struct RegularBroadcastPartners: public RegularMergePartners
+{
+  using Parent = RegularMergePartners;                                  //!< base class merge reduction
+
+                //! contiguous selects how partners are matched: contiguously or round-robin;
+                //! contiguous matching suits data that needs to be united,
+                //! round-robin matching suits vector-"halving"
+  template<class Decomposer>
+                RegularBroadcastPartners(const Decomposer& decomposer,  //!< domain decomposition
+                                         int k,                         //!< target k value
+                                         bool contiguous = true         //!< distance doubling (true) or halving (false)
+                    ):
+                  Parent(decomposer, k, contiguous)         {}
+                RegularBroadcastPartners(const DivisionVector&   divs,//!< explicit division vector
+                                         const KVSVector&        kvs, //!< explicit k vector
+                                         bool  contiguous = true      //!< distance doubling (true) or halving (false)
+                    ):
+                  Parent(divs, kvs, contiguous)               {}
+
+  //! total number of rounds: same as the underlying merge
+  size_t        rounds() const                                  { return Parent::rounds(); }
+  //! group size of the merge round that `round` maps to
+  int           size(int round) const                           { return Parent::size(parent_round(round)); }
+  //! dimension (direction of partners in a regular grid) of the merge round that `round` maps to
+  int           dim(int round) const                            { return Parent::dim(parent_round(round)); }
+  //! whether gid is active in the merge round that `round` maps to
+  inline bool   active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); }
+  //! maps a broadcast round to the merge round rounds() - round
+  int           parent_round(int round) const                   { return rounds() - round; }
+
+  // incoming is only valid for an active gid; it will only be called with an active gid
+  inline void   incoming(int round, int gid, std::vector<int>& partners, const Master& m) const {
+      const int merge_round = parent_round(round);      // receive from whoever the merge would send to
+      Parent::outgoing(merge_round, gid, partners, m);
+  }
+
+  inline void   outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const {
+      const int merge_round = parent_round(round);      // send to whoever the merge would receive from
+      Parent::incoming(merge_round, gid, partners, m);
+  }
+};
+
+} // diy
+
+#endif
+
+
diff --git a/include/vtkmdiy/partners/common.hpp b/include/vtkmdiy/partners/common.hpp
new file mode 100644
index 000000000..43f8297a0
--- /dev/null
+++ b/include/vtkmdiy/partners/common.hpp
@@ -0,0 +1,204 @@
+#ifndef DIY_PARTNERS_COMMON_HPP
+#define DIY_PARTNERS_COMMON_HPP
+
+#include "../decomposition.hpp"
+#include "../types.hpp"
+
+namespace diy
+{
+
+struct RegularPartners
+{
+  // The record of group size per round in a dimension
+  struct DimK
+  {
+            DimK(int dim_, int k_):
+                dim(dim_), size(k_)               {}
+
+    int dim;            // dimension index this round works along
+    int size;           // group size
+  };
+
+  typedef       std::vector<int>                    CoordVector;
+  typedef       std::vector<int>                    DivisionVector;
+  typedef       std::vector<DimK>                   KVSVector;
+
+  // The part of RegularDecomposer that we need works the same with either Bounds (so we fix them arbitrarily)
+  typedef       DiscreteBounds                      Bounds;
+  typedef       RegularDecomposer<Bounds>           Decomposer;
+  // Takes the divisions from the decomposer and derives the per-round (dim, size) list by factoring them with k.
+  template<class Decomposer_>
+                RegularPartners(const Decomposer_& decomposer, int k, bool contiguous = true):
+                  divisions_(decomposer.divisions),
+                  contiguous_(contiguous)                       { factor(k, divisions_, kvs_); fill_steps(); }
+                RegularPartners(const DivisionVector&   divs,
+                                const KVSVector&        kvs,
+                                bool  contiguous = true):
+                  divisions_(divs), kvs_(kvs),
+                  contiguous_(contiguous)                       { fill_steps(); }
+  // One round per kvs_ entry; size/dim/step index the per-round tables without bounds checks.
+  size_t        rounds() const                                  { return kvs_.size(); }
+  int           size(int round) const                           { return kvs_[round].size; }
+  int           dim(int round) const                            { return kvs_[round].dim; }
+
+  int           step(int round) const                           { return steps_[round]; }
+
+  const DivisionVector&     divisions() const                   { return divisions_; }
+  const KVSVector&          kvs() const                         { return kvs_; }
+  bool                      contiguous() const                  { return contiguous_; }
+  // Factors every dimension's division count and appends the interleaved result to kvs.
+  static
+  inline void   factor(int k, const DivisionVector& divisions, KVSVector& kvs);
+  // fill appends the gids of gid's group in the given round; group_position returns a coordinate's position in its group.
+  inline void   fill(int round, int gid, std::vector<int>& partners) const;
+  inline int    group_position(int round, int c, int step) const;
+
+  private:
+    inline void fill_steps();           // computes steps_ from kvs_, divisions_ and contiguous_
+    static
+    inline void factor(int k, int tot_b, std::vector<int>& kvs);   // factors a single dimension
+
+    DivisionVector      divisions_;
+    KVSVector           kvs_;
+    bool                contiguous_;
+    std::vector<int>    steps_;         // one entry per round, filled by fill_steps()
+};
+
+}
+
+void
+diy::RegularPartners::
+fill_steps()
+{
+  // Running stride per dimension: starts at 1 and grows when contiguous,
+  // starts at the dimension's division count and shrinks otherwise.
+  std::vector<int>  stride = contiguous_ ? std::vector<int>(divisions_.size(), 1)
+                                         : std::vector<int>(divisions_.begin(), divisions_.end());
+  for (size_t r = 0; r < kvs_.size(); ++r)
+  {
+    const DimK& kv  = kvs_[r];
+    int&        cur = stride[kv.dim];
+    if (contiguous_)
+    {
+      steps_.push_back(cur);      // record the stride, then widen it by this round's group size
+      cur *= kv.size;
+    } else
+    {
+      cur /= kv.size;             // narrow the stride first, then record it
+      steps_.push_back(cur);
+    }
+  }
+}
+
+void
+diy::RegularPartners::
+fill(int round, int gid, std::vector<int>& partners) const
+{
+  // Appends to `partners` the gids of gid's group in `round`: same coordinates as gid except
+  // along the round's dimension, where members lie `stride` apart starting from the group's first member.
+  const DimK&   kv     = kvs_[round];
+  const int     axis   = kv.dim;
+  const int     stride = steps_[round];     // gids jump by this much in the current round
+
+  partners.reserve(kv.size);
+
+  CoordVector   coords;
+  Decomposer::gid_to_coords(gid, coords, divisions_);
+
+  const int     own    = coords[axis];
+  const int     first  = own - group_position(round, own, stride) * stride;
+
+  // do/while: the first member is always emitted, whatever kv.size is
+  int member = 0;
+  do
+  {
+    coords[axis] = first + member * stride;
+    partners.push_back(Decomposer::coords_to_gid(coords, divisions_));
+  }
+  while (++member < kv.size);
+}
+
+// Tom's GetGrpPos: position of coordinate c within its group for the given round and step
+int
+diy::RegularPartners::
+group_position(int round, int c, int step) const
+{
+  const int group_size = kvs_[round].size;
+
+  // Group number. The second term must not be simplified: the integer division
+  // c / (step * group_size) truncates on purpose before being scaled back by step.
+  // Only the position is returned; the group number is computed but otherwise unused.
+  const int group = c % step + c / (step * group_size) * step;
+  (void) group;                // silence the unused-variable warning
+
+  // Position within the group.
+  const int position = c / step % group_size;
+  return position;
+}
+
+void
+diy::RegularPartners::
+factor(int k, const DivisionVector& divisions, KVSVector& kvs)
+{
+  const unsigned ndims = divisions.size();
+
+  // factor each dimension on its own
+  std::vector< std::vector<int> >       per_dim(ndims);
+  for (unsigned d = 0; d < ndims; ++d)
+    factor(k, divisions[d], per_dim[d]);
+
+  // interleave: pass r takes the r-th factor of every dimension that still has one,
+  // in dimension order; stop after the first pass that appends nothing
+  // TODO: not the most efficient way to do this
+  bool appended = true;
+  for (size_t r = 0; appended; ++r)
+  {
+    appended = false;
+    for (unsigned d = 0; d < ndims; ++d)
+    {
+      if (r >= per_dim[d].size()) continue;
+      kvs.push_back(DimK(d, per_dim[d][r]));
+      appended = true;
+    }
+  }
+}
+
+// Tom's FactorK: appends to kv factors of tot_b, each at most k whenever such a factor exists
+void
+diy::RegularPartners::
+factor(int k, int tot_b, std::vector<int>& kv)
+{
+  int rem = tot_b; // unfactored remaining portion of tot_b
+  int j;
+  // NOTE(review): k < 2 is not handled (k == 1 never reduces rem); callers must pass k >= 2
+  while (rem > 1)
+  {
+    // remainder is divisible by k
+    if (rem % k == 0)
+    {
+      kv.push_back(k);
+      rem /= k;
+    }
+    // if not, start at k and linearly look for smaller factors down to 2
+    else
+    {
+      for (j = k - 1; j > 1; j--)
+      {
+        if (rem % j == 0)
+        {
+          kv.push_back(j);
+          rem /= j;             // divide by the factor just recorded (j), not by k, so the factors multiply back to tot_b
+          break;
+        }
+      }
+      if (j == 1)               // no factor in [2, k) divides rem: take the whole remainder as the last factor
+      {
+        kv.push_back(rem);
+        rem = 1;
+      }
+    } // else
+  } // while
+}
+
+
+#endif
diff --git a/include/vtkmdiy/partners/merge.hpp b/include/vtkmdiy/partners/merge.hpp
new file mode 100644
index 000000000..c6be42533
--- /dev/null
+++ b/include/vtkmdiy/partners/merge.hpp
@@ -0,0 +1,60 @@
+#ifndef DIY_PARTNERS_MERGE_HPP
+#define DIY_PARTNERS_MERGE_HPP
+
+#include "common.hpp"
+
+namespace diy
+{
+
+class Master;
+
+/**
+ * \ingroup Communication
+ * \brief Partners for merge-reduce
+ *
+ */
+struct RegularMergePartners: public RegularPartners
+{
+  using Parent = RegularPartners;
+
+                // contiguous selects how partners are matched: contiguously or round-robin;
+                // contiguous matching suits data that needs to be united,
+                // round-robin matching suits vector-"halving"
+  template<class Decomposer>
+                RegularMergePartners(const Decomposer& decomposer,  //!< domain decomposition
+                                     int k,                         //!< target k value
+                                     bool contiguous = true         //!< distance doubling (true) or halving (false)
+                    ):
+                    Parent(decomposer, k, contiguous)           {}
+                RegularMergePartners(const DivisionVector&   divs, //!< explicit division vector
+                                     const KVSVector&        kvs,  //!< explicit k vector
+                                     bool  contiguous = true       //!< distance doubling (true) or halving (false)
+                    ):
+                    Parent(divs, kvs, contiguous)               {}
+
+  inline bool   active(int round, int gid, const Master&) const;
+
+  // incoming is only valid for an active gid; it will only be called with an active gid (uses the previous round's group)
+  inline void   incoming(int round, int gid, std::vector<int>& partners, const Master&) const    { const int prev_round = round - 1; Parent::fill(prev_round, gid, partners); }
+  // lazy outgoing: builds the whole group with the existing fill() and keeps only its first member
+  inline void   outgoing(int round, int gid, std::vector<int>& partners, const Master&) const    { std::vector<int> group; Parent::fill(round, gid, group); partners.push_back(group.front()); }
+};
+
+} // diy
+
+bool
+diy::RegularMergePartners::
+active(int round, int gid, const Master&) const
+{ // A gid is active in `round` iff it sat at position 0 of its group in every earlier round.
+  CoordVector   coords;
+  Decomposer::gid_to_coords(gid, coords, divisions());
+
+  int r = 0;
+  while (r < round && Parent::group_position(r, coords[kvs()[r].dim], step(r)) == 0)
+      ++r;
+
+  return r >= round;
+}
+
+#endif
+
diff --git a/include/vtkmdiy/partners/swap.hpp b/include/vtkmdiy/partners/swap.hpp
new file mode 100644
index 000000000..cc3b3e494
--- /dev/null
+++ b/include/vtkmdiy/partners/swap.hpp
@@ -0,0 +1,43 @@
+#ifndef DIY_PARTNERS_SWAP_HPP
+#define DIY_PARTNERS_SWAP_HPP
+
+#include "common.hpp"
+
+namespace diy
+{
+
+class Master;
+
+/**
+ * \ingroup Communication
+ * \brief Partners for swap-reduce
+ *
+ */
+struct RegularSwapPartners: public RegularPartners
+{
+  typedef       RegularPartners                                 Parent;
+
+                // contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
+                // contiguous is useful when data needs to be united;
+                // round-robin is useful for vector-"halving"
+  template<class Decomposer>
+                RegularSwapPartners(const Decomposer& decomposer,   //!< domain decomposition
+                                    int k,                          //!< target k value
+                                    bool contiguous = true          //!< partner step grows each round (true) or shrinks (false), see RegularPartners::fill_steps
+                    ):
+                    Parent(decomposer, k, contiguous)         {}
+                RegularSwapPartners(const DivisionVector&   divs, //!< explicit division vector
+                                    const KVSVector&        kvs,  //!< explicit k vector
+                                    bool  contiguous = true       //!< partner step grows each round (true) or shrinks (false), see RegularPartners::fill_steps
+                    ):
+                    Parent(divs, kvs, contiguous)               {}
+
+  bool          active(int round, int gid, const Master&) const                                 { return true; }    // in swap-reduce every block is always active
+  // incoming uses the previous round's group, outgoing the current round's group
+  void          incoming(int round, int gid, std::vector<int>& partners, const Master&) const   { Parent::fill(round - 1, gid, partners); }
+  void          outgoing(int round, int gid, std::vector<int>& partners, const Master&) const   { Parent::fill(round, gid, partners); }
+};
+
+} // diy
+
+#endif
diff --git a/include/vtkmdiy/pick.hpp b/include/vtkmdiy/pick.hpp
new file mode 100644
index 000000000..5f9d8d0e8
--- /dev/null
+++ b/include/vtkmdiy/pick.hpp
@@ -0,0 +1,137 @@
+#ifndef DIY_PICK_HPP
+#define DIY_PICK_HPP
+
+#include <cmath>
+#include "link.hpp"
+namespace diy
+{   // forward declarations; the definitions follow below in this header
+    template<class Bounds, class Point, class OutIter>
+    void near(const RegularLink<Bounds>& link, const Point& p, float r, OutIter out,
+              const Bounds& domain);    // neighbors within radius r of p
+
+    template<class Bounds, class Point, class OutIter>
+    void in(const RegularLink<Bounds>& link, const Point& p, OutIter out, const Bounds& domain);   // neighbors at distance 0 from p
+
+    template<class Point, class Bounds>
+    float distance(int dim, const Bounds& bounds, const Point& p);      // point-to-box distance
+
+    template<class Bounds>
+    inline
+    float distance(int dim, const Bounds& bounds1, const Bounds& bounds2);      // box-to-box distance
+
+    template<class Bounds>
+    void wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim);   // shifts bounds in place by wrap_dir times the domain extent
+}
+
+//! Writes to `out` the link indices of all neighbors whose (wrapped) bounds lie within distance r of p.
+template<class Bounds, class Point, class OutIter>
+void
+diy::
+near(const RegularLink<Bounds>& link,  //!< neighbors
+     const Point& p,                   //!< target point (must be in current block)
+     float r,                          //!< target radius (>= 0.0)
+     OutIter out,                      //!< insert iterator for output set of neighbors
+     const Bounds& domain)             //!< global domain bounds
+{
+  Bounds candidate; // bounds of the neighbor under test, after wrapping
+
+  for (int nbr = 0; nbr < link.size(); ++nbr)
+  {
+    // start from the neighbor's own bounds and shift them by its wrap direction
+    candidate = link.bounds(nbr);
+    wrap_bounds(candidate, link.wrap(nbr), domain, link.dimension());
+    // skip neighbors that are not within r of p
+    if (!(distance(link.dimension(), candidate, p) <= r))
+        continue;
+    *out++ = nbr;
+  }
+}
+
+//! Find the Euclidean distance between point `p` and box `bounds` over the first `dim` coordinates
+//! (0 when no coordinate of `p` lies outside [bounds.min, bounds.max]).
+template<class Point, class Bounds>
+float
+diy::
+distance(int dim, const Bounds& bounds, const Point& p)
+{
+    float sum_sq = 0;
+    for (int i = 0; i < dim; ++i)
+    {
+        // per-axis gap = max(bounds.min[i] - p[i], 0, p[i] - bounds.max[i]), without case logic
+        const float below = bounds.min[i] - p[i];
+        const float above = p[i] - bounds.max[i];
+
+        float gap = 0;
+        if (below > gap) gap = below;
+        if (above > gap) gap = above;
+
+        sum_sq += gap*gap;
+    }
+    return std::sqrt(sum_sq);     // qualified call picks the float overload from <cmath>
+}
+
+//! Find the Euclidean distance between two boxes over the first `dim` axes
+//! (0 when they overlap along every axis).
+template<class Bounds>
+float
+diy::
+distance(int dim, const Bounds& bounds1, const Bounds& bounds2)
+{
+    float res = 0;
+    for (int i = 0; i < dim; ++i)
+    {
+        // signed overlaps along axis i; a non-positive value means the boxes are apart (or just touch) on that side
+        const float d1 = bounds1.max[i] - bounds2.min[i];
+        const float d2 = bounds2.max[i] - bounds1.min[i];
+
+        float diff = 0;               // stays 0 when the boxes overlap on this axis
+        if (d1 <= 0)
+            diff = -d1;
+        else if (d2 <= 0)
+            diff = -d2;
+
+        res += diff*diff;
+    }
+    return std::sqrt(res);            // qualified call picks the float overload from <cmath>
+}
+
+//! Writes to `out` the link indices of all neighbors whose (wrapped) bounds are at distance 0 from p.
+template<class Bounds, class Point, class OutIter>
+void
+diy::
+in(const RegularLink<Bounds>& link,  //!< neighbors
+   const Point& p,                   //!< target point
+   OutIter out,                      //!< insert iterator for output set of neighbors
+   const Bounds& domain)             //!< global domain bounds
+{
+  Bounds candidate; // bounds of the neighbor under test, after wrapping
+
+  for (int nbr = 0; nbr < link.size(); ++nbr)
+  {
+    // start from the neighbor's own bounds and shift them by its wrap direction
+    candidate = link.bounds(nbr);
+    wrap_bounds(candidate, link.wrap(nbr), domain, link.dimension());
+    // skip neighbors whose bounds do not contain p
+    if (distance(link.dimension(), candidate, p) != 0)
+        continue;
+    *out++ = nbr;
+  }
+}
+
+// Shifts block bounds in place, axis by axis, by wrap_dir[axis] times the domain extent on that axis.
+// wrap_dir is the wrapping direction from the original block to the wrapped neighbor block;
+// dim is the number of axes to process.
+template<class Bounds>
+void
+diy::
+wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim)
+{
+  for (int axis = 0; axis < dim; ++axis)
+  {
+    bounds.min[axis] += wrap_dir[axis] * (domain.max[axis] - domain.min[axis]);
+    bounds.max[axis] += wrap_dir[axis] * (domain.max[axis] - domain.min[axis]);
+  }
+}
+
+
+#endif
diff --git a/include/vtkmdiy/point.hpp b/include/vtkmdiy/point.hpp
new file mode 100644
index 000000000..cafbe784c
--- /dev/null
+++ b/include/vtkmdiy/point.hpp
@@ -0,0 +1,120 @@
+#ifndef DIY_POINT_HPP
+#define DIY_POINT_HPP
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <sstream>
+
+#include <array>
+
+namespace diy
+{
+
+template<class Coordinate_, unsigned D>
+class Point: public std::array<Coordinate_, D>
+{   // D-dimensional point: a std::array of coordinates plus arithmetic, dot-product and stream helpers
+    public:
+        typedef             Coordinate_                             Coordinate;
+        typedef             std::array<Coordinate, D>               ArrayParent;
+        // Points with one fewer / one more dimension (the results of drop() and lift()).
+        typedef             Point<Coordinate, D-1>                  LPoint;
+        typedef             Point<Coordinate, D+1>                  UPoint;
+        // rebind<U>::type is the point of the same dimension with coordinate type U.
+        template<class U>
+        struct rebind       { typedef Point<U,D> type; };
+        // Constructors. The pointer/vector forms read exactly D elements from their argument; the initializer-list form ignores values beyond D and zero-fills missing ones.
+    public:
+                            Point()                                 { for (unsigned i = 0; i < D; ++i) (*this)[i] = 0; }
+                            Point(const ArrayParent& a):
+                                ArrayParent(a)                      {}
+        template<class T>   Point(const Point<T, D>& p)             { for (size_t i = 0; i < D; ++i) (*this)[i] = p[i]; }
+        template<class T>   Point(const T* a)                       { for (unsigned i = 0; i < D; ++i) (*this)[i] = a[i]; }
+        template<class T>   Point(const std::vector<T>& a)          { for (unsigned i = 0; i < D; ++i) (*this)[i] = a[i]; }
+                            Point(std::initializer_list<Coordinate> lst)   { unsigned i = 0; for (Coordinate x : lst) { if (i == D) break; (*this)[i++] = x; } for (; i < D; ++i) (*this)[i] = 0; }
+
+                            Point(Point&&)                          =default;
+                            Point(const Point&)                     =default;
+        Point&              operator=(const Point&)                 =default;
+
+        static constexpr
+        unsigned            dimension()                             { return D; }
+
+        static Point        zero()                                  { return Point(); }
+        static Point        one()                                   { Point p; for (unsigned i = 0; i < D; ++i) p[i] = 1; return p; }
+        // drop(dim) removes coordinate `dim`; lift(dim, x) inserts x at position `dim`.
+        LPoint              drop(int dim) const                     { LPoint p; unsigned c = 0; for (unsigned i = 0; i < D;   ++i) { if (i == dim) continue; p[c++] = (*this)[i]; } return p; }
+        UPoint              lift(int dim, Coordinate x) const       { UPoint p; for (unsigned i = 0; i < D+1; ++i) { if (i < dim) p[i] = (*this)[i]; else if (i == dim) p[i] = x; else if (i > dim) p[i] = (*this)[i-1]; } return p; }
+
+        using ArrayParent::operator[];
+
+        Point&              operator+=(const Point& y)              { for (unsigned i = 0; i < D; ++i) (*this)[i] += y[i];  return *this; }
+        Point&              operator-=(const Point& y)              { for (unsigned i = 0; i < D; ++i) (*this)[i] -= y[i];  return *this; }
+        Point&              operator*=(Coordinate a)                { for (unsigned i = 0; i < D; ++i) (*this)[i] *= a;     return *this; }
+        Point&              operator/=(Coordinate a)                { for (unsigned i = 0; i < D; ++i) (*this)[i] /= a;     return *this; }
+        // NOTE: despite its name this returns the dot product of the point with itself, i.e. the squared norm.
+        Coordinate          norm() const                            { return (*this)*(*this); }
+        // Member forms of the stream operators; operator<< writes the coordinates separated by single spaces.
+        std::ostream&       operator<<(std::ostream& out) const     { out << (*this)[0]; for (unsigned i = 1; i < D; ++i) out << " " << (*this)[i]; return out; }
+        std::istream&       operator>>(std::istream& in);
+
+        friend
+        Point               operator+(Point x, const Point& y)       { x += y; return x; }
+
+        friend
+        Point               operator-(Point x, const Point& y)       { x -= y; return x; }
+
+        friend
+        Point               operator/(Point x, Coordinate y)         { x /= y; return x; }
+
+        friend
+        Point               operator*(Point x, Coordinate y)         { x *= y; return x; }
+
+        friend
+        Point               operator*(Coordinate y, Point x)         { x *= y; return x; }
+        // Dot product of two points of the same type.
+        friend
+        Coordinate          operator*(const Point& x, const Point& y)   { Coordinate n = 0; for (size_t i = 0; i < D; ++i) n += x[i] * y[i]; return n; }
+        // Dot product with a point of another coordinate type T; the result is accumulated in this class's Coordinate.
+        template<class T>
+        friend
+        Coordinate          operator*(const Point<T,D>& x, const Point& y)   { Coordinate n = 0; for (size_t i = 0; i < D; ++i) n += x[i] * y[i]; return n; }
+};
+
+template<class C, unsigned D>
+std::istream&
+Point<C,D>::
+operator>>(std::istream& in)
+{
+    std::string point_str;
+    in >> point_str;        // read one whitespace-delimited token from the stream
+    std::stringstream ps(point_str);
+    // parse dimension() coordinates out of that token; after each one a single character is read and discarded
+    char x;
+    for (unsigned i = 0; i < dimension(); ++i)
+    {
+        ps >> (*this)[i];
+        ps >> x;            // separator character; its value is never checked
+    }
+    // parse failures on `ps` are not propagated: the original stream is returned as-is
+    return in;
+}
+
+
+template<class Coordinate, unsigned D>
+Coordinate norm2(const Point<Coordinate,D>& p)        // squared Euclidean norm: the sum of the squared coordinates
+{ Coordinate sum = 0; for (const Coordinate& c : p) sum += c*c; return sum; }
+
+template<class C, unsigned D>
+std::ostream&
+operator<<(std::ostream& out, const Point<C,D>& p)     // free-function form: forwards to the member Point::operator<<
+{ return p.operator<<(out); }
+
+template<class C, unsigned D>
+std::istream&
+operator>>(std::istream& in, Point<C,D>& p)            // free-function form: forwards to the member Point::operator>>
+{ return p.operator>>(in); }
+
+}
+
+#endif // DIY_POINT_HPP
diff --git a/include/vtkmdiy/proxy.hpp b/include/vtkmdiy/proxy.hpp
new file mode 100644
index 000000000..0160e0605
--- /dev/null
+++ b/include/vtkmdiy/proxy.hpp
@@ -0,0 +1,228 @@
+#ifndef DIY_PROXY_HPP
+#define DIY_PROXY_HPP
+
+
+namespace diy
+{
+  //! Communication proxy, used for enqueueing and dequeueing items for future exchange.
+  struct Master::Proxy
+  {
+    template <class T>
+    struct EnqueueIterator;
+    // The constructor looks up this block's incoming queues, outgoing queues and collectives list in `master` once and keeps pointers to them.
+                        Proxy(Master* master, int gid):
+                          gid_(gid),
+                          master_(master),
+                          incoming_(&master->incoming(gid)),
+                          outgoing_(&master->outgoing(gid)),
+                          collectives_(&master->collectives(gid))       {}
+
+    int                 gid() const                                     { return gid_; }
+
+    //! Enqueue data whose size can be determined automatically, e.g., an STL vector.
+    template<class T>
+    void                enqueue(const BlockID&  to,                                     //!< target block (gid,proc)
+                                const T&        x,                                      //!< data (eg. STL vector)
+                                void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function
+                               ) const
+    { OutgoingQueues& out = *outgoing_; save(out[to], x); }
+
+    //! Enqueue data whose size is given explicitly by the user, e.g., an array.
+    template<class T>
+    void                enqueue(const BlockID&  to,                                     //!< target block (gid,proc)
+                                const T*        x,                                      //!< pointer to the data (eg. address of start of vector)
+                                size_t          n,                                      //!< size in data elements (eg. ints)
+                                void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function
+                               ) const;
+
+    //! Dequeue data whose size can be determined automatically (e.g., STL vector) and that was
+    //! previously enqueued so that diy knows its size when it is received.
+    //! In this case, diy will allocate the receive buffer; the user does not need to do so.
+    template<class T>
+    void                dequeue(int             from,                                   //!< source block gid (key into the incoming queues)
+                                T&              x,                                      //!< data (eg. STL vector)
+                                void (*load)(BinaryBuffer&, T&) = &::diy::load<T>       //!< optional deserialization function
+                               ) const
+    { IncomingQueues& in  = *incoming_; load(in[from], x); }
+
+    //! Dequeue an array of data whose size is given explicitly by the user.
+    //! In this case, the user needs to allocate the receive buffer prior to calling dequeue.
+    template<class T>
+    void                dequeue(int             from,                                   //!< source block gid (key into the incoming queues)
+                                T*              x,                                      //!< pointer to the data (eg. address of start of vector)
+                                size_t          n,                                      //!< size in data elements (eg. ints)
+                                void (*load)(BinaryBuffer&, T&) = &::diy::load<T>       //!< optional deserialization function
+                               ) const;
+    //! Returns an EnqueueIterator bound to this proxy, `x` and `save`.
+    template<class T>
+    EnqueueIterator<T>  enqueuer(const T& x,
+                                 void (*save)(BinaryBuffer&, const T&) = &::diy::save<T>) const
+    { return EnqueueIterator<T>(this, x, save); }
+
+    IncomingQueues*     incoming() const                                { return incoming_; }
+    MemoryBuffer&       incoming(int from) const                        { return (*incoming_)[from]; }
+    inline void         incoming(std::vector<int>& v) const;            // fill v with every gid from which we have a message
+
+    OutgoingQueues*     outgoing() const                                { return outgoing_; }
+    MemoryBuffer&       outgoing(const BlockID& to) const               { return (*outgoing_)[to]; }
+
+/**
+ * \ingroup Communication
+ * \brief Post an all-reduce collective using an existing communication proxy.
+ * Available operators are:
+ * maximum<T>, minimum<T>, std::plus<T>, std::multiplies<T>, std::logical_and<T>, and
+ * std::logical_or<T>.
+ */
+    template<class T, class Op>
+    inline void         all_reduce(const T& in,                  //!< local value being reduced
+                                   Op op                         //!< operator
+                                   ) const;
+/**
+ * \ingroup Communication
+ * \brief Return the result of a proxy collective without popping it off the collectives list (same result would be returned multiple times). The list can be cleared with collectives()->clear().
+ */
+    template<class T>
+    inline T            read() const;
+/**
+ * \ingroup Communication
+ * \brief Return the result of a proxy collective; result is popped off the collectives list.
+ */
+    template<class T>
+    inline T            get() const;
+    //! Appends a collective built from `in` to the collectives list (see the out-of-class definition).
+    template<class T>
+    inline void         scratch(const T& in) const;
+
+/**
+ * \ingroup Communication
+ * \brief Return the list of proxy collectives (values and operations)
+ */
+    CollectivesList*    collectives() const                             { return collectives_; }
+
+    Master*             master() const                                  { return master_; }
+
+    private:
+      int               gid_;               // gid of the block this proxy serves
+      Master*           master_;            // not owned
+      IncomingQueues*   incoming_;          // points into master_, obtained in the constructor
+      OutgoingQueues*   outgoing_;          // points into master_, obtained in the constructor
+      CollectivesList*  collectives_;       // points into master_, obtained in the constructor
+  };
+
+  template<class T>
+  struct Master::Proxy::EnqueueIterator     //!< output iterator: assigning a BlockID enqueues the bound value to that block
+  {
+    typedef     std::output_iterator_tag    iterator_category;      // traits formerly inherited from std::iterator (deprecated since C++17)
+    typedef     void                        value_type, difference_type, pointer, reference;
+    typedef     void (*SaveT)(BinaryBuffer&, const T&);
+
+                        EnqueueIterator(const Proxy* proxy, const T& x,
+                                        SaveT save = &::diy::save<T>):
+                            proxy_(proxy), x_(x), save_(save)               {}
+
+    EnqueueIterator&    operator=(const BlockID& to)                        { proxy_->enqueue(to, x_, save_); return *this; }
+    EnqueueIterator&    operator*()                                         { return *this; }
+    EnqueueIterator&    operator++()                                        { return *this; }
+    EnqueueIterator&    operator++(int)                                     { return *this; }
+
+    private:
+      const Proxy*  proxy_;     // not owned
+      const T&      x_;         // held by reference: the value must outlive the iterator
+      SaveT         save_;      // serialization function passed on to Proxy::enqueue
+  };
+
+  struct Master::ProxyWithLink: public Master::Proxy        // a Proxy copy bundled with a block pointer and its Link
+  {
+            ProxyWithLink(const Proxy&    proxy,
+                          void*           block,
+                          Link*           link):
+              Proxy(proxy),
+              block_(block),
+              link_(link)                                           {}
+
+      Link*   link() const                                          { return link_; }
+      void*   block() const                                         { return block_; }
+
+    private:
+      void*   block_;       // untyped block pointer, stored and returned as given
+      Link*   link_;        // stored and returned as given; not deleted by this class
+  };
+}
+
+
+void
+diy::Master::Proxy::
+incoming(std::vector<int>& v) const
+{
+  for (const auto& gid_and_queue : *incoming_)      // append the key of every entry in the incoming queues
+    v.push_back(gid_and_queue.first);
+}
+
+template<class T, class Op>
+void
+diy::Master::Proxy::
+all_reduce(const T& in, Op op) const
+{
+  collectives_->push_back(Collective(new detail::AllReduceOp<T,Op>(in, op)));    // NOTE(review): Collective presumably takes ownership of the new'ed op - confirm
+}
+
+template<class T>
+T
+diy::Master::Proxy::
+read() const
+{
+  T res;                                        // default-initialized; filled in by result_out below
+  collectives_->front().result_out(&res);       // reads the first collective in the list without removing it
+  return res;
+}
+
+template<class T>
+T
+diy::Master::Proxy::
+get() const
+{
+  T res = read<T>();            // result of the first collective in the list
+  collectives_->pop_front();    // then remove that collective
+  return res;
+}
+
+template<class T>
+void
+diy::Master::Proxy::
+scratch(const T& in) const
+{
+  collectives_->push_back(Collective(new detail::Scratch<T>(in)));    // appends a detail::Scratch<T> built from `in` to the collectives list
+}
+
+template<class T>
+void
+diy::Master::Proxy::
+enqueue(const BlockID& to, const T* x, size_t n,
+        void (*save)(BinaryBuffer&, const T&)) const
+{
+    OutgoingQueues& out = *outgoing_;
+    BinaryBuffer&   bb  = out[to];      // queue addressed to block `to`
+    if (save == (void (*)(BinaryBuffer&, const T&)) &::diy::save<T>)    // caller kept the default serializer
+        diy::save(bb, x, n);       // optimized for unspecialized types
+    else                                // custom serializer: apply it to each of the n elements in order
+        for (size_t i = 0; i < n; ++i)
+            save(bb, x[i]);
+}
+
+template<class T>
+void
+diy::Master::Proxy::
+dequeue(int from, T* x, size_t n,
+        void (*load)(BinaryBuffer&, T&)) const
+{
+    IncomingQueues& in = *incoming_;
+    BinaryBuffer&   bb = in[from];      // queue keyed by the gid `from`
+    if (load == (void (*)(BinaryBuffer&, T&)) &::diy::load<T>)          // caller kept the default deserializer
+        diy::load(bb, x, n);       // optimized for unspecialized types
+    else                                // custom deserializer: apply it to each of the n elements in order
+        for (size_t i = 0; i < n; ++i)
+            load(bb, x[i]);
+}
+
+
+#endif
diff --git a/include/vtkmdiy/reduce-operations.hpp b/include/vtkmdiy/reduce-operations.hpp
new file mode 100644
index 000000000..629824da5
--- /dev/null
+++ b/include/vtkmdiy/reduce-operations.hpp
@@ -0,0 +1,32 @@
+#ifndef DIY_REDUCE_OPERATIONS_HPP
+#define DIY_REDUCE_OPERATIONS_HPP
+
+#include "reduce.hpp"
+#include "partners/swap.hpp"
+#include "detail/reduce/all-to-all.hpp"
+
+namespace diy
+{
+
+/**
+ * \ingroup Communication
+ * \brief all to all reduction
+ * Builds a 1-D decomposition of the gid range [0, nblocks-1] into nblocks blocks, derives swap partners with fanout k from it, and runs diy::reduce with detail::AllToAllReduce wrapping op.
+ */
+template<class Op>
+void
+all_to_all(Master&              master,     //!< block owner
+           const Assigner&      assigner,   //!< global block locator (maps gid to proc)
+           const Op&            op,         //!< user-defined operation called to enqueue and dequeue items
+           int                  k = 2       //!< reduction fanout
+          )
+{
+  auto scoped = master.prof.scoped("all_to_all");    // presumably a scope guard for profiling this call - confirm against master.prof
+  RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()-1), assigner.nblocks());
+  RegularSwapPartners  partners(decomposer, k, false);
+  reduce(master, assigner, partners, detail::AllToAllReduce<Op>(op, assigner), detail::SkipIntermediate(partners.rounds()));
+}
+
+}
+
+#endif
diff --git a/include/vtkmdiy/reduce.hpp b/include/vtkmdiy/reduce.hpp
new file mode 100644
index 000000000..6d47d7930
--- /dev/null
+++ b/include/vtkmdiy/reduce.hpp
@@ -0,0 +1,216 @@
+#ifndef DIY_REDUCE_HPP
+#define DIY_REDUCE_HPP
+
+#include <vector>
+#include "master.hpp"
+#include "assigner.hpp"
+#include "detail/block_traits.hpp"
+#include "log.hpp"
+
+namespace diy
+{
+//! Enables communication within a group during a reduction.
+//! DIY creates the ReduceProxy for you in diy::reduce()
+//! and provides a reference to ReduceProxy each time the user's reduction function is called
+struct ReduceProxy: public Master::Proxy
+{
+    typedef     std::vector<int>                            GIDVector;
+    // Builds the in/out links from lists of gids; each neighbor's proc is obtained from assigner.rank(gid).
+    ReduceProxy(const Master::Proxy&    proxy, //!< parent proxy
+                void*                   block, //!< diy block
+                unsigned                round, //!< current round
+                const Assigner&         assigner, //!< assigner
+                const GIDVector&        incoming_gids, //!< incoming gids in this group
+                const GIDVector&        outgoing_gids): //!< outgoing gids in this group
+      Master::Proxy(proxy),
+      block_(block),
+      round_(round),
+      assigner_(assigner)
+    {
+      // setup in_link
+      for (unsigned i = 0; i < incoming_gids.size(); ++i)
+      {
+        BlockID nbr;
+        nbr.gid  = incoming_gids[i];
+        nbr.proc = assigner.rank(nbr.gid);
+        in_link_.add_neighbor(nbr);
+      }
+
+      // setup out_link
+      for (unsigned i = 0; i < outgoing_gids.size(); ++i)
+      {
+        BlockID nbr;
+        nbr.gid  = outgoing_gids[i];
+        nbr.proc = assigner.rank(nbr.gid);
+        out_link_.add_neighbor(nbr);
+      }
+    }
+    // Alternative constructor: copies ready-made in/out links instead of building them from gids.
+    ReduceProxy(const Master::Proxy&    proxy, //!< parent proxy
+                void*                   block, //!< diy block
+                unsigned                round, //!< current round
+                const Assigner&         assigner,
+                const Link&             in_link,
+                const Link&             out_link):
+      Master::Proxy(proxy),
+      block_(block),
+      round_(round),
+      assigner_(assigner),
+      in_link_(in_link),
+      out_link_(out_link)
+    {}
+
+    //! returns pointer to block
+    void*         block() const                           { return block_; }
+    //! returns current round number
+    unsigned      round() const                           { return round_; }
+    //! returns incoming link
+    const Link&   in_link() const                         { return in_link_; }
+    //! returns outgoing link
+    const Link&   out_link() const                        { return out_link_; }
+    //! returns total number of blocks
+    int           nblocks() const                         { return assigner_.nblocks(); }
+    //! returns the assigner
+    const Assigner& assigner() const                      { return assigner_; }
+
+    //! advanced: change current round number
+    void          set_round(unsigned r)                   { round_ = r; }
+
+  private:
+    void*         block_;
+    unsigned      round_;
+    const Assigner& assigner_;      // held by reference: the assigner must outlive this proxy
+
+    Link          in_link_;         // stored by value
+    Link          out_link_;        // stored by value
+};
+
+namespace detail
+{
+  template<class Block, class Partners>
+  struct ReductionFunctor;
+
+  template<class Partners, class Skip>
+  struct SkipInactiveOr;
+  // Default skip predicate for reduce(): never asks for a block to be skipped. Parameter names are commented out because none is used.
+  struct ReduceNeverSkip
+  {
+    bool operator()(int /*round*/, int /*lid*/, const Master& /*master*/) const  { return false; }
+  };
+}
+
+/**
+ * \ingroup Communication
+ * \brief Implementation of the reduce communication pattern (includes
+ *        swap-reduce, merge-reduce, and any other global communication).
+ *        Runs partners.rounds() exchange rounds plus one final callback pass; master's expected count is saved on entry and restored on exit.
+ */
+template<class Reduce, class Partners, class Skip>
+void reduce(Master&                    master,        //!< master object
+            const Assigner&            assigner,      //!< assigner object
+            const Partners&            partners,      //!< partners object
+            const Reduce&              reduce,        //!< reduction callback function
+            const Skip&                skip)          //!< object determining whether a block should be skipped
+{
+  auto log = get_logger();
+  // remembered so that it can be restored at the end of the function
+  int original_expected = master.expected();
+  // block type taken from the callback via detail::block_traits
+  using Block = typename detail::block_traits<Reduce>::type;
+
+  unsigned round;
+  for (round = 0; round < partners.rounds(); ++round)
+  {
+    log->debug("Round {}", round);
+    master.foreach(detail::ReductionFunctor<Block,Partners>(round, reduce, partners, assigner),
+                   detail::SkipInactiveOr<Partners,Skip>(round, partners, skip));
+    master.execute();
+    // count the messages the local blocks that are active in the next round will receive, and clear those blocks' incoming queues
+    int expected = 0;
+    for (unsigned i = 0; i < master.size(); ++i)
+    {
+      if (partners.active(round + 1, master.gid(i), master))
+      {
+        std::vector<int> incoming_gids;
+        partners.incoming(round + 1, master.gid(i), incoming_gids, master);
+        expected += incoming_gids.size();
+        master.incoming(master.gid(i)).clear();
+      }
+    }
+    master.set_expected(expected);
+    master.flush();
+  }
+  // final round
+  log->debug("Round {}", round);
+  master.foreach(detail::ReductionFunctor<Block,Partners>(round, reduce, partners, assigner),
+                 detail::SkipInactiveOr<Partners,Skip>(round, partners, skip));
+  // note: no master.execute() / master.flush() call follows the final foreach inside this function
+  master.set_expected(original_expected);
+}
+
+/**
+ * \ingroup Communication
+ * \brief Overload of the reduce communication pattern without a skip object:
+ *        forwards to the five-argument reduce() with detail::ReduceNeverSkip,
+ *        whose predicate always returns false.
+ */
+template<class Reduce, class Partners>
+void reduce(Master&                    master,        //!< master object
+            const Assigner&            assigner,      //!< assigner object
+            const Partners&            partners,      //!< partners object
+            const Reduce&              reducer)       //!< reduction callback function
+{
+  reduce(master, assigner, partners, reducer, detail::ReduceNeverSkip());
+}
+
+namespace detail
+{
+  template<class Block, class Partners>
+  struct ReductionFunctor
+  {
+    using Callback = std::function<void(Block*, const ReduceProxy&, const Partners&)>;
+    // Captures the round number, the user callback, a copy of the partners object and a reference to the assigner.
+                ReductionFunctor(unsigned round_, const Callback& reduce_, const Partners& partners_, const Assigner& assigner_):
+                    round(round_), reduce(reduce_), partners(partners_), assigner(assigner_)        {}
+    // Per-block entry point: builds a ReduceProxy for this round and invokes the user callback on it.
+    void        operator()(Block* b, const Master::ProxyWithLink& cp) const
+    {
+      if (!partners.active(round, cp.gid(), *cp.master())) return;
+      // round 0 has no incoming gids; the round equal to partners.rounds() has no outgoing gids
+      std::vector<int> incoming_gids, outgoing_gids;
+      if (round > 0)
+          partners.incoming(round, cp.gid(), incoming_gids, *cp.master());        // receive from the previous round
+      if (round < partners.rounds())
+          partners.outgoing(round, cp.gid(), outgoing_gids, *cp.master());        // send to the next round
+
+      ReduceProxy   rp(cp, b, round, assigner, incoming_gids, outgoing_gids);
+      reduce(b, rp, partners);
+
+      // touch the outgoing queues to make sure they exist
+      Master::OutgoingQueues& outgoing = *cp.outgoing();
+      if (outgoing.size() < (size_t) rp.out_link().size())
+        for (int j = 0; j < rp.out_link().size(); ++j)
+          outgoing[rp.out_link().target(j)];       // touch the outgoing queue, creating it if necessary
+    }
+
+    unsigned        round;
+    Callback        reduce;
+    Partners        partners;       // stored by value (a copy of the constructor argument)
+    const Assigner& assigner;       // stored by reference: the assigner must outlive the functor
+  };
+
+  template<class Partners, class Skip>
+  struct SkipInactiveOr     // skip predicate: true when the block is inactive in `round` or when the wrapped `skip` says so
+  {
+                    SkipInactiveOr(int round_, const Partners& partners_, const Skip& skip_):
+                        round(round_), partners(partners_), skip(skip_)         {}
+    bool            operator()(int i, const Master& master) const               { return !partners.active(round, master.gid(i), master) || skip(round, i, master); }
+    int             round;
+    const Partners& partners;   // stored by reference: the partners object must outlive this predicate
+    Skip            skip;       // stored by value (a copy of the constructor argument)
+  };
+}
+
+} // diy
+
+#endif // DIY_REDUCE_HPP
diff --git a/include/vtkmdiy/serialization.hpp b/include/vtkmdiy/serialization.hpp
new file mode 100644
index 000000000..25640255d
--- /dev/null
+++ b/include/vtkmdiy/serialization.hpp
@@ -0,0 +1,456 @@
+#ifndef DIY_SERIALIZATION_HPP
+#define DIY_SERIALIZATION_HPP
+
+#include <vector>
+#include <valarray>
+#include <map>
+#include <set>
+#include <string>
+#include <fstream>
+
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <type_traits>              // this is used for a safety check for default serialization
+
+namespace diy
+{
+  //! A serialization buffer. \ingroup Serialization
+  struct BinaryBuffer
+  {
+    virtual             ~BinaryBuffer()                             =default;   //!< virtual so that derived buffers are destroyed correctly through a BinaryBuffer pointer
+    virtual void        save_binary(const char* x, size_t count)    =0;   //!< copy `count` bytes from `x` into the buffer
+    virtual void        load_binary(char* x, size_t count)          =0;   //!< copy `count` bytes into `x` from the buffer
+    virtual void        load_binary_back(char* x, size_t count)     =0;   //!< copy `count` bytes into `x` from the back of the buffer
+  };
+  struct MemoryBuffer: public BinaryBuffer      //!< BinaryBuffer backed by a std::vector<char> plus a cursor (`position`)
+  {
+                        MemoryBuffer(size_t position_ = 0):
+                          position(position_)                       {}
+
+    virtual inline void save_binary(const char* x, size_t count) override;   //!< copy `count` bytes from `x` into the buffer
+    virtual inline void load_binary(char* x, size_t count) override;         //!< copy `count` bytes into `x` from the buffer
+    virtual inline void load_binary_back(char* x, size_t count) override;    //!< copy `count` bytes into `x` from the back of the buffer
+
+    void                clear()                                     { buffer.clear(); reset(); }                            //!< empty the buffer and rewind
+    void                wipe()                                      { std::vector<char>().swap(buffer); reset(); }         //!< like clear(), but swaps in a fresh vector so the old storage is released
+    void                reset()                                     { position = 0; }                                       //!< rewind the cursor
+    void                skip(size_t s)                              { position += s; }                                      //!< advance the cursor by `s` bytes
+    void                swap(MemoryBuffer& o)                       { std::swap(position, o.position); buffer.swap(o.buffer); }
+    bool                empty() const                               { return buffer.empty(); }
+    size_t              size() const                                { return buffer.size(); }
+    void                reserve(size_t s)                           { buffer.reserve(s); }
+                        operator bool() const                       { return position < buffer.size(); }                   //!< true while the cursor is before the end of the data
+
+    //! copy a memory buffer from one buffer to another, bypassing making a temporary copy first
+    inline static void  copy(MemoryBuffer& from, MemoryBuffer& to);
+
+    //! multiplier used for the geometric growth of the container
+    static float        growth_multiplier()                         { return 1.5; }
+
+    // simple file IO; both directions use binary mode, and data() keeps an empty buffer from being indexed
+    void                write(const std::string& fn) const          { std::ofstream out(fn.c_str(), std::ios::binary); out.write(buffer.data(), size()); }
+    void                read(const std::string& fn)
+    {
+        std::ifstream in(fn.c_str(), std::ios::binary | std::ios::ate);     // opened at the end so tellg() yields the file size
+        buffer.resize(in.tellg());      // NOTE(review): if the open failed tellg() is -1 and this resize request cannot succeed - confirm callers check the file first
+        in.seekg(0);
+        in.read(buffer.data(), size());
+        position = 0;
+    }
+    // position: cursor into `buffer`; buffer: the stored bytes
+    size_t              position;
+    std::vector<char>   buffer;
+  };
+
+  namespace detail
+  {
+    struct Default {};      // tag base of the unspecialized Serialization<T>; is_default<> tests for it
+  }
+
+  //!\addtogroup Serialization
+  //!@{
+
+  /**
+   * \brief Main interface to serialization, meant to be specialized for the
+   * types that require special handling.  `diy::save()` and `diy::load()` call
+   * the static member functions of this class.
+   *
+   * The default (unspecialized) version copies
+   * `sizeof(T)` bytes from `&x` to or from `bb` via
+   * its `diy::BinaryBuffer::save_binary()` and `diy::BinaryBuffer::load_binary()`
+   * functions.  This works out perfectly for plain old data (e.g., simple structs).
+   * To save a more complicated type, one has to specialize
+   * `diy::Serialization<T>` for that type. Specializations are already provided for
+   * `std::vector<T>`, `std::map<K,V>`, and `std::pair<T,U>`.
+   * As a result one can quickly add a specialization of one's own.
+   * The unspecialized version derives from `detail::Default`, which is how `detail::is_default` recognizes it.
+   */
+  template<class T>
+  struct Serialization: public detail::Default
+  {
+#if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5)     // the safety check is compiled only for clang and GCC >= 5
+    static_assert(std::is_trivially_copyable<T>::value, "Default serialization works only for trivially copyable types");
+#endif
+
+    static void         save(BinaryBuffer& bb, const T& x)          { bb.save_binary((const char*)  &x, sizeof(T)); }
+    static void         load(BinaryBuffer& bb, T& x)                { bb.load_binary((char*)        &x, sizeof(T)); }
+  };
+
+  //! Saves `x` to `bb` by calling `diy::Serialization<T>::save(bb,x)`.
+  template<class T>
+  void                  save(BinaryBuffer& bb, const T& x)          { Serialization<T>::save(bb, x); }
+
+  //! Loads `x` from `bb` by calling `diy::Serialization<T>::load(bb,x)`.
+  template<class T>
+  void                  load(BinaryBuffer& bb, T& x)                { Serialization<T>::load(bb, x); }
+
+  //! Optimization for arrays. If `diy::Serialization` is not specialized for `T`,
+  //! the array of `n` elements will be copied all at once. Otherwise, it's copied element by element.
+  template<class T>
+  void                  save(BinaryBuffer& bb, const T* x, size_t n);
+
+  //! Optimization for arrays. If `diy::Serialization` is not specialized for `T`,
+  //! the array of `n` elements will be filled all at once. Otherwise, it's filled element by element.
+  template<class T>
+  void                  load(BinaryBuffer& bb, T* x, size_t n);
+
+  //! Supports only binary data copying (meant for simple footers): reads `sizeof(T)` bytes via `load_binary_back`, bypassing `Serialization<T>`.
+  template<class T>
+  void                  load_back(BinaryBuffer& bb, T& x)           { bb.load_binary_back((char*) &x, sizeof(T)); }
+
+  //@}
+
+
+  namespace detail
+  {
+    template<typename T>
+    struct is_default       // value is non-zero when T* converts to Default*, i.e. when T derives from (or is) Default
+    {
+        typedef char    yes;
+        typedef int     no;
+        // overload resolution picks test(Default*) when the conversion exists and the variadic fallback otherwise
+        static yes      test(Default*);
+        static no       test(...);
+        // only the return type's size is inspected; test() is never actually called
+        enum { value = (sizeof(test((T*) 0)) == sizeof(yes)) };
+    };
+  }
+
+  template<class T>
+  void                  save(BinaryBuffer& bb, const T* x, size_t n)
+  {
+    if (!detail::is_default< Serialization<T> >::value)     // Serialization<T> is specialized: save the n elements one by one
+      for (size_t i = 0; i < n; ++i)
+        diy::save(bb, x[i]);
+    else        // if Serialization is not specialized for T, just save the binary data
+      bb.save_binary((const char*) &x[0], sizeof(T)*n);
+  }
+
+  template<class T>
+  void                  load(BinaryBuffer& bb, T* x, size_t n)
+  {
+    if (!detail::is_default< Serialization<T> >::value)     // Serialization<T> is specialized: load the n elements one by one
+      for (size_t i = 0; i < n; ++i)
+        diy::load(bb, x[i]);
+    else      // if Serialization is not specialized for T, just load the binary data
+      bb.load_binary((char*) &x[0], sizeof(T)*n);
+  }
+
+
+  // save/load for MemoryBuffer: the format is `position` followed by the first `position` bytes of the buffer
+  template<>
+  struct Serialization< MemoryBuffer >
+  {
+    static void         save(BinaryBuffer& bb, const MemoryBuffer& x)
+    {
+      diy::save(bb, x.position);
+      if (x.position > 0) diy::save(bb, &x.buffer[0], x.position);     // nothing to write at position 0, and an empty buffer must not be indexed
+    }
+    // load() sizes the buffer to the stored count and leaves `position` equal to it
+    static void         load(BinaryBuffer& bb, MemoryBuffer& x)
+    {
+      diy::load(bb, x.position);
+      x.buffer.resize(x.position);
+      if (x.position > 0) diy::load(bb, &x.buffer[0], x.position);     // same guard: buffer is empty when the stored count is 0
+    }
+  };
+
+  // save/load for std::vector<U>: the format is the element count (size_t) followed by the elements
+  template<class U>
+  struct Serialization< std::vector<U> >
+  {
+    typedef             std::vector<U>          Vector;
+
+    static void         save(BinaryBuffer& bb, const Vector& v)
+    {
+      size_t s = v.size();
+      diy::save(bb, s);
+      if (s > 0) diy::save(bb, &v[0], s);       // indexing element 0 of an empty vector is undefined, so empty vectors store only the count
+    }
+
+    static void         load(BinaryBuffer& bb, Vector& v)
+    {
+      size_t s;
+      diy::load(bb, s);
+      v.resize(s);
+      if (s > 0) diy::load(bb, &v[0], s);       // same guard on the way back in
+    }
+  };
+
+  template<class U>
+  struct Serialization< std::valarray<U> >      // format: element count (size_t) followed by the elements
+  {
+    typedef             std::valarray<U>        ValArray;
+
+    static void         save(BinaryBuffer& bb, const ValArray& v)
+    {
+      size_t s = v.size();
+      diy::save(bb, s);
+      if (s > 0) diy::save(bb, &v[0], s);       // indexing element 0 of an empty valarray is undefined, so empty ones store only the count
+    }
+
+    static void         load(BinaryBuffer& bb, ValArray& v)
+    {
+      size_t s;
+      diy::load(bb, s);
+      v.resize(s);
+      if (s > 0) diy::load(bb, &v[0], s);       // same guard on the way back in
+    }
+  };
+
+  // save/load for std::string: the format is the length (size_t) followed by that many characters, without a terminator
+  template<>
+  struct Serialization< std::string >
+  {
+    typedef             std::string             String;
+
+    static void         save(BinaryBuffer& bb, const String& s)
+    {
+      size_t sz = s.size();
+      diy::save(bb, sz);
+      diy::save(bb, s.c_str(), sz);     // array overload of save: sz chars starting at c_str()
+    }
+    // load() reads the characters back one at a time, whereas save() writes them with a single array call
+    static void         load(BinaryBuffer& bb, String& s)
+    {
+      size_t sz;
+      diy::load(bb, sz);
+      s.resize(sz);
+      for (size_t i = 0; i < sz; ++i)
+      {
+          char c;
+          diy::load(bb, c);
+          s[i] = c;
+      }
+    }
+  };
+
+  // save/load for std::pair<X,Y>: `first` is serialized before `second`
+  template<class X, class Y>
+  struct Serialization< std::pair<X,Y> >
+  {
+    typedef             std::pair<X,Y>          Pair;
+    // each member goes through diy::save / diy::load for its own type
+    static void         save(BinaryBuffer& bb, const Pair& p)
+    {
+      diy::save(bb, p.first);
+      diy::save(bb, p.second);
+    }
+    // members are read back in the order in which save() wrote them
+    static void         load(BinaryBuffer& bb, Pair& p)
+    {
+      diy::load(bb, p.first);
+      diy::load(bb, p.second);
+    }
+  };
+
+  // Serialization of std::map<K,V>: the entry count, then every (key, value) entry in iteration order
+  template<class K, class V>
+  struct Serialization< std::map<K,V> >
+  {
+    typedef             std::map<K,V>           Map;
+
+    static void         save(BinaryBuffer& bb, const Map& m)
+    {
+      size_t count = m.size();
+      diy::save(bb, count);
+      for (const auto& entry : m)
+        diy::save(bb, entry);
+    }
+
+    static void         load(BinaryBuffer& bb, Map& m)
+    {
+      size_t remaining;
+      diy::load(bb, remaining);
+      while (remaining-- > 0)
+      {
+        K key;
+        diy::load(bb, key);           // the key comes first ...
+        diy::load(bb, m[key]);        // ... and its value is loaded in place (m[key] creates the entry if needed)
+      }
+    }
+  };
+
+  // Serialization of std::set<T>: the element count, then every element in iteration order
+  template<class T>
+  struct Serialization< std::set<T> >
+  {
+    typedef             std::set<T>             Set;
+
+    static void         save(BinaryBuffer& bb, const Set& m)
+    {
+      size_t count = m.size();
+      diy::save(bb, count);
+      for (const auto& element : m)
+        diy::save(bb, element);
+    }
+
+    static void         load(BinaryBuffer& bb, Set& m)
+    {
+      size_t remaining;
+      diy::load(bb, remaining);
+      while (remaining-- > 0)
+      {
+        T element;
+        diy::load(bb, element);
+        m.insert(element);
+      }
+    }
+  };
+
+  // Serialization of std::unordered_map<K,V,H,E,A>: the entry count, then every (key, value) pair
+  template<class K, class V, class H, class E, class A>
+  struct Serialization< std::unordered_map<K,V,H,E,A> >
+  {
+    typedef             std::unordered_map<K,V,H,E,A>   Map;
+
+    static void         save(BinaryBuffer& bb, const Map& m)
+    {
+      size_t count = m.size();
+      diy::save(bb, count);
+      for (typename Map::const_iterator it = m.begin(); it != m.end(); ++it)
+        diy::save(bb, *it);
+    }
+
+    static void         load(BinaryBuffer& bb, Map& m)
+    {
+      size_t remaining;
+      diy::load(bb, remaining);
+      for (; remaining > 0; --remaining)
+      {
+        std::pair<K,V> entry;           // loaded as a pair with a non-const key, then moved into the map
+        diy::load(bb, entry);
+        m.emplace(std::move(entry));
+      }
+    }
+  };
+
+  // Serialization of std::unordered_set<T,H,E,A>: the element count, then every element
+  template<class T, class H, class E, class A>
+  struct Serialization< std::unordered_set<T,H,E,A> >
+  {
+    typedef             std::unordered_set<T,H,E,A>     Set;
+
+    static void         save(BinaryBuffer& bb, const Set& m)
+    {
+      size_t count = m.size();
+      diy::save(bb, count);
+      for (typename Set::const_iterator it = m.begin(); it != m.end(); ++it)
+        diy::save(bb, *it);
+    }
+
+    static void         load(BinaryBuffer& bb, Set& m)
+    {
+      size_t remaining;
+      diy::load(bb, remaining);
+      for (; remaining > 0; --remaining)
+      {
+        T element;
+        diy::load(bb, element);
+        m.emplace(std::move(element));
+      }
+    }
+  };
+
+  // save/load for std::tuple<...>: the elements are serialized one after another, in index order
+  // TODO: this ought to be default (copying) serialization
+  //       if all arguments are default
+  template<class... Args>
+  struct Serialization< std::tuple<Args...> >
+  {
+    typedef             std::tuple<Args...>     Tuple;
+
+    static void         save(BinaryBuffer& bb, const Tuple& t)          { save<0>(bb, t); }
+    // end of the recursion: enabled when I equals the number of elements, saves nothing
+    template<std::size_t I = 0>
+    static
+    typename std::enable_if<I == sizeof...(Args), void>::type
+                        save(BinaryBuffer&, const Tuple&)               {}
+    // recursive step: save element I, then continue with element I+1
+    template<std::size_t I = 0>
+    static
+    typename std::enable_if<I < sizeof...(Args), void>::type
+                        save(BinaryBuffer& bb, const Tuple& t)          { diy::save(bb, std::get<I>(t)); save<I+1>(bb, t); }
+
+    static void         load(BinaryBuffer& bb, Tuple& t)                { load<0>(bb, t); }
+    // end of the recursion: enabled when I equals the number of elements, loads nothing
+    template<std::size_t I = 0>
+    static
+    typename std::enable_if<I == sizeof...(Args), void>::type
+                        load(BinaryBuffer&, Tuple&)                     {}
+    // recursive step: load element I, then continue with element I+1
+    template<std::size_t I = 0>
+    static
+    typename std::enable_if<I < sizeof...(Args), void>::type
+                        load(BinaryBuffer& bb, Tuple& t)                { diy::load(bb, std::get<I>(t)); load<I+1>(bb, t); }
+
+  };
+}
+
+// Append `count` bytes from x at the current position, growing the buffer when needed.
+void
+diy::MemoryBuffer::
+save_binary(const char* x, size_t count)
+{
+  if (position + count > buffer.capacity())
+    buffer.reserve((position + count) * growth_multiplier());           // if we have to grow, grow geometrically
+  if (position + count > buffer.size())
+    buffer.resize(position + count);
+
+  // iterator arithmetic stays valid when position == buffer.size() (zero-byte save); &buffer[position] does not
+  std::copy(x, x + count, buffer.begin() + position);
+  position += count;
+}
+
+// Copy `count` bytes starting at the current position into x and advance the position.
+void
+diy::MemoryBuffer::load_binary(char* x, size_t count)
+{
+  // iterators instead of &buffer[position + count], which indexes out of range when the read ends at the last byte
+  std::copy(buffer.begin() + position, buffer.begin() + position + count, x);
+  position += count;
+}
+
+// Copy the last `count` bytes of the buffer into x and shrink the buffer by that many bytes.
+void
+diy::MemoryBuffer::load_binary_back(char* x, size_t count)
+{
+  // end() replaces &buffer[buffer.size()], which indexed one element past the last valid one
+  std::copy(buffer.end() - count, buffer.end(), x);
+  buffer.resize(buffer.size() - count);
+}
+
+// Copy one size-prefixed chunk (a size_t length followed by that many bytes) from `from` to `to`.
+void
+diy::MemoryBuffer::copy(MemoryBuffer& from, MemoryBuffer& to)
+{
+  size_t sz;
+  diy::load(from, sz);
+  from.position -= sizeof(size_t);      // rewind: the length prefix is copied along with the payload
+
+  size_t total = sizeof(size_t) + sz;
+  to.buffer.resize(to.position + total);
+  // iterators instead of &from.buffer[from.position + total], which is out of range when the chunk ends the buffer
+  std::copy(from.buffer.begin() + from.position, from.buffer.begin() + from.position + total, to.buffer.begin() + to.position);
+  to.position += total;
+  from.position += total;
+}
+
+#endif
diff --git a/include/vtkmdiy/stats.hpp b/include/vtkmdiy/stats.hpp
new file mode 100644
index 000000000..4866ccfb1
--- /dev/null
+++ b/include/vtkmdiy/stats.hpp
@@ -0,0 +1,115 @@
+#ifndef DIY_STATS_HPP
+#define DIY_STATS_HPP
+
+#include <chrono>
+#include <string>
+#include <vector>
+
+#include "log.hpp"      // need this for format
+
+namespace diy
+{
+namespace stats
+{
+
+#if defined(DIY_PROFILE)
+struct Profiler     // records named begin/end events with time stamps; this version is compiled when DIY_PROFILE is defined
+{
+    using   Clock = std::chrono::high_resolution_clock;
+    using   Time  = Clock::time_point;
+
+    struct Event
+    {
+            Event(const std::string& name_, bool begin_):
+                name(name_),
+                begin(begin_),
+                stamp(Clock::now())
+                                                        {}
+
+        std::string     name;
+        bool            begin;      // true for an enter() event, false for an exit() event
+        Time            stamp;      // taken when the event object is constructed
+    };
+
+    using   EventsVector = std::vector<Event>;
+    // Scope guard: records the begin event when constructed and the matching end event when destroyed.
+    struct  Scoped
+    {
+            Scoped(Profiler& prof_, std::string name_):
+                prof(prof_), name(name_), active(true)  { prof << name; }
+            ~Scoped()                                   { if (active) prof >> name; }
+            // a moved-from guard is deactivated, so the end event is recorded only once
+            Scoped(Scoped&& other):
+                prof(other.prof),
+                name(other.name),
+                active(other.active)                    { other.active = false; }
+
+        Scoped&
+            operator=(Scoped&& other) = delete;
+            Scoped(const Scoped&) = delete;
+        Scoped&
+            operator=(const Scoped&) = delete;
+
+        Profiler&   prof;
+        std::string name;
+        bool        active;
+    };
+
+            Profiler()                                  { reset_time(); }
+
+    void    reset_time()                                { start = Clock::now(); }
+    // stream-style shorthands for enter() and exit()
+    void    operator<<(std::string name)                { enter(name); }
+    void    operator>>(std::string name)                { exit(name); }
+
+    void    enter(std::string name)                     { events.push_back(Event(name, true)); }
+    void    exit(std::string name)                      { events.push_back(Event(name, false)); }
+    // Print one line per event: time elapsed since `start` as hh:mm:ss.microseconds, '<' (begin) or '>' (end), then the name.
+    void    output(std::ostream& out)
+    {
+        for (size_t i = 0; i < events.size(); ++i)
+        {
+            const Event& e = events[i];
+            auto time = std::chrono::duration_cast<std::chrono::microseconds>(e.stamp - start).count();
+
+            fmt::print(out, "{:02d}:{:02d}:{:02d}.{:06d} {}{}\n",
+                            time/1000000/60/60,
+                            time/1000000/60 % 60,
+                            time/1000000 % 60,
+                            time % 1000000,
+                            (e.begin ? '<' : '>'),
+                            e.name);
+        }
+    }
+
+    Scoped  scoped(std::string name)                    { return Scoped(*this, name); }
+
+    void    clear()                                     { events.clear(); }
+
+    private:
+        Time            start;      // reference point for the times printed by output()
+        EventsVector    events;     // events in the order in which they were recorded
+};
+#else
+struct Profiler     // stand-in used when DIY_PROFILE is not defined: same member names, every operation does nothing
+{
+    struct Scoped {};
+
+    void    reset_time()                                {}
+
+    void    operator<<(std::string)                     {}
+    void    operator>>(std::string)                     {}
+
+    void    enter(const std::string&)                   {}
+    void    exit(const std::string&)                    {}
+
+    void    output(std::ostream&)                       {}
+    void    clear()                                     {}
+    // returns an empty placeholder object; nothing is recorded
+    Scoped  scoped(std::string)                         { return Scoped(); }
+};
+#endif
+}
+}
+
+#endif
diff --git a/include/vtkmdiy/storage.hpp b/include/vtkmdiy/storage.hpp
new file mode 100644
index 000000000..62213b2c5
--- /dev/null
+++ b/include/vtkmdiy/storage.hpp
@@ -0,0 +1,228 @@
+#ifndef DIY_STORAGE_HPP
+#define DIY_STORAGE_HPP
+
+#include <string>
+#include <map>
+#include <fstream>
+
+#include <unistd.h>     // mkstemp() on Mac
+#include <cstdlib>      // mkstemp() on Linux
+#include <cstdio>       // remove()
+#include <fcntl.h>
+
+#include "serialization.hpp"
+#include "thread.hpp"
+#include "log.hpp"
+
+namespace diy
+{
+  namespace detail
+  {
+    typedef       void  (*Save)(const void*, BinaryBuffer& buf);
+    typedef       void  (*Load)(void*,       BinaryBuffer& buf);
+
+    struct FileBuffer: public BinaryBuffer
+    {
+                          FileBuffer(FILE* file_): file(file_), head(0), tail(0)    {}
+      // BinaryBuffer interface on top of a stdio stream; the stream is not closed by this struct.
+      // TODO: add error checking (the results of fwrite/fread/fseek are ignored)
+      virtual inline void save_binary(const char* x, size_t count) override   { fwrite(x, 1, count, file); head += count; }
+      virtual inline void load_binary(char* x, size_t count) override         { fread(x, 1, count, file); }
+      virtual inline void load_binary_back(char* x, size_t count) override    { fseek(file, tail, SEEK_END); fread(x, 1, count, file); tail += count; fseek(file, head, SEEK_SET); }
+      // NOTE(review): load_binary_back seeks to the non-negative offset `tail` from SEEK_END (at or past end-of-file); reading from the back presumably needs a negative offset -- confirm before using it
+      size_t              size() const                                { return head; }
+      // head counts the bytes written through save_binary; load_binary does not change it
+      FILE*  file;
+      size_t head, tail;  // tail is used to support reading from the back;
+                          // the mechanism is a little awkward and unused, but should work if needed
+    };
+  }
+
+  class ExternalStorage     // NOTE(review): no virtual destructor is declared; confirm implementations are never deleted through an ExternalStorage*
+  {
+    public:
+      virtual int   put(MemoryBuffer& bb)                               =0;     // store the contents of a buffer; the returned int is the index passed to get()/destroy()
+      virtual int   put(const void* x, detail::Save save)               =0;     // store the object x by running `save` on it
+      virtual void  get(int i, MemoryBuffer& bb, size_t extra = 0)      =0;     // fetch record i into a buffer
+      virtual void  get(int i, void* x, detail::Load load)              =0;     // fetch record i into the object x by running `load`
+      virtual void  destroy(int i)                                      =0;     // drop record i without fetching it
+  };
+
+  // ExternalStorage that keeps every record in its own temporary file and identifies it by an integer id.
+  class FileStorage: public ExternalStorage
+  {
+    private:
+      struct FileRecord
+      {
+        size_t          size;       // number of bytes accounted for this record
+        std::string     name;       // name of the file that holds it
+      };
+
+    public:
+                    FileStorage(const std::string& filename_template = "/tmp/DIY.XXXXXX"):
+                      filename_templates_(1, filename_template),
+                      count_(0), current_size_(0), max_size_(0)         {}
+
+                    FileStorage(const std::vector<std::string>& filename_templates):
+                      filename_templates_(filename_templates),
+                      count_(0), current_size_(0), max_size_(0)         {}
+
+      // Write the whole buffer to a new file, wipe the buffer and return the id of the new record.
+      virtual int   put(MemoryBuffer& bb) override
+      {
+        auto log = get_logger();
+        std::string     filename;
+        int fh = open_random(filename);
+
+        log->debug("FileStorage::put(): {}; buffer size: {}", filename, bb.size());
+
+        size_t sz = bb.buffer.size();
+        size_t written = (sz == 0) ? 0 : write(fh, &bb.buffer[0], sz);    // an empty buffer has no element 0 to point at
+        if (written < sz || written == (size_t)-1)
+          log->warn("Could not write the full buffer to {}: written = {}; size = {}", filename, written, sz);
+        fsync(fh);
+        close(fh);
+        bb.wipe();
+
+#if 0       // double-check the written file size: only for extreme debugging
+        FILE* fp = fopen(filename.c_str(), "r");
+        fseek(fp, 0L, SEEK_END);
+        int fsz = ftell(fp);
+        if (fsz != sz)
+            log->warn("file size doesn't match the buffer size, {} vs {}", fsz, sz);
+        fclose(fp);
+#endif
+
+        return make_file_record(filename, sz);
+      }
+
+      // Serialize x through `save` into a new file and return the id of the new record.
+      virtual int    put(const void* x, detail::Save save) override
+      {
+        std::string     filename;
+        int fh = open_random(filename);
+        detail::FileBuffer fb(fdopen(fh, "w"));
+        save(x, fb);
+        size_t sz = fb.size();
+        // fclose() closes fh as well, so flush and sync while the descriptor is still open
+        fflush(fb.file);
+        fsync(fh);
+        fclose(fb.file);
+
+        return make_file_record(filename, sz);
+      }
+
+      // Read record i back into bb, with capacity reserved for `extra` more bytes, and delete its file.
+      virtual void   get(int i, MemoryBuffer& bb, size_t extra) override
+      {
+        FileRecord fr = extract_file_record(i);
+
+        get_logger()->debug("FileStorage::get(): {}", fr.name);
+
+        bb.buffer.reserve(fr.size + extra);
+        bb.buffer.resize(fr.size);
+        int fh = open(fr.name.c_str(), O_RDONLY | O_SYNC, 0600);
+        size_t got = (fr.size == 0) ? 0 : read(fh, &bb.buffer[0], fr.size);
+        if (got != fr.size)     // failed open or read (-1 converts to SIZE_MAX), or a short read
+          get_logger()->warn("Could not read the full buffer from {}: read = {}; size = {}", fr.name, got, fr.size);
+        close(fh);
+        remove_file(fr);
+      }
+
+      // Deserialize record i into x through `load` and delete its file.
+      virtual void   get(int i, void* x, detail::Load load) override
+      {
+        FileRecord fr = extract_file_record(i);
+
+        //int fh = open(fr.name.c_str(), O_RDONLY | O_SYNC, 0600);
+        int fh = open(fr.name.c_str(), O_RDONLY, 0600);
+        detail::FileBuffer fb(fdopen(fh, "r"));
+        load(x, fb);
+        fclose(fb.file);
+
+        remove_file(fr);
+      }
+
+      // Discard record i without reading it back.
+      virtual void  destroy(int i) override
+      {
+        remove_file(extract_file_record(i));
+      }
+
+      int           count() const               { return (*count_.const_access()); }
+      size_t        current_size() const        { return (*current_size_.const_access()); }
+      size_t        max_size() const            { return (*max_size_.const_access()); }
+
+                    ~FileStorage()              // unlink the files of all records that are still stored
+      {
+        for (FileRecordMap::const_iterator it =  filenames_.const_access()->begin();
+                                           it != filenames_.const_access()->end();
+                                         ++it)
+          remove(it->second.name.c_str());
+      }
+
+    private:
+      // Create a unique file from one of the templates; returns its descriptor and stores its name in `filename`.
+      int           open_random(std::string& filename) const
+      {
+        if (filename_templates_.size() == 1)
+            filename = filename_templates_[0].c_str();
+        else
+        {
+            // pick a template at random (very basic load balancing mechanism)
+            filename  = filename_templates_[std::rand() % filename_templates_.size()].c_str();
+        }
+        // mkstemp() rewrites the template in place: pass the string's own writable storage, not a const_cast of c_str()
+#ifdef __MACH__
+        // TODO: figure out how to open with O_SYNC
+        int fh = mkstemp(&filename[0]);
+#else
+        int fh = mkostemp(&filename[0], O_WRONLY | O_SYNC);
+#endif
+        return fh;
+      }
+
+      int           make_file_record(const std::string& filename, size_t sz)
+      {
+        int res = (*count_.access())++;
+        FileRecord  fr = { sz, filename };
+        (*filenames_.access())[res] = fr;
+
+        // keep track of sizes: the running total and the largest total seen so far
+        critical_resource<size_t>::accessor     cur = current_size_.access();
+        *cur += sz;
+        critical_resource<size_t>::accessor     max = max_size_.access();
+        if (*cur > *max)
+            *max = *cur;
+
+        return res;
+      }
+
+      FileRecord    extract_file_record(int i)
+      {
+        CriticalMapAccessor accessor = filenames_.access();
+        FileRecord fr = (*accessor)[i];
+        accessor->erase(i);
+        return fr;
+      }
+
+      void          remove_file(const FileRecord& fr)
+      {
+        remove(fr.name.c_str());
+        (*current_size_.access()) -= fr.size;
+      }
+
+    private:
+      typedef           std::map<int, FileRecord>                   FileRecordMap;
+      typedef           critical_resource<FileRecordMap>            CriticalMap;
+      typedef           CriticalMap::accessor                       CriticalMapAccessor;
+
+    private:
+      std::vector<std::string>      filename_templates_;
+      CriticalMap                   filenames_;
+      critical_resource<int>        count_;
+      critical_resource<size_t>     current_size_, max_size_;
+  };
+}
+
+#endif
diff --git a/include/vtkmdiy/thread.hpp b/include/vtkmdiy/thread.hpp
new file mode 100644
index 000000000..1c9149a42
--- /dev/null
+++ b/include/vtkmdiy/thread.hpp
@@ -0,0 +1,31 @@
+#ifndef DIY_THREAD_H
+#define DIY_THREAD_H
+
+#ifdef DIY_NO_THREADS
+#include "no-thread.hpp"
+#else
+
+#include "thread/fast_mutex.h"
+
+#include <thread>
+#include <mutex>
+
+namespace diy       // threading names used by DIY when DIY_NO_THREADS is not defined
+{
+    using std::thread;
+    using std::mutex;
+    using std::recursive_mutex;
+    namespace this_thread = std::this_thread;
+
+    // TODO: replace with our own implementation using std::atomic_flag
+    using fast_mutex = tthread::fast_mutex;
+    // note: diy::lock_guard is an alias of std::unique_lock, not of std::lock_guard
+    template<class Mutex>
+    using lock_guard = std::unique_lock<Mutex>;
+}
+
+#endif
+
+#include "critical-resource.hpp"
+
+#endif
diff --git a/include/vtkmdiy/thread/fast_mutex.h b/include/vtkmdiy/thread/fast_mutex.h
new file mode 100644
index 000000000..4d4b7cc43
--- /dev/null
+++ b/include/vtkmdiy/thread/fast_mutex.h
@@ -0,0 +1,248 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
+Copyright (c) 2010-2012 Marcus Geelnard
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+#ifndef _FAST_MUTEX_H_
+#define _FAST_MUTEX_H_
+
+/// @file
+
+// Which platform are we on?
+#if !defined(_TTHREAD_PLATFORM_DEFINED_)
+  #if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
+    #define _TTHREAD_WIN32_
+  #else
+    #define _TTHREAD_POSIX_
+  #endif
+  #define _TTHREAD_PLATFORM_DEFINED_
+#endif
+
+// Check if we can support the assembly language level implementation (otherwise
+// revert to the system API)
+#if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
+    (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || \
+    (defined(__GNUC__) && (defined(__ppc__)))
+  #define _FAST_MUTEX_ASM_
+#else
+  #define _FAST_MUTEX_SYS_
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+  #ifndef WIN32_LEAN_AND_MEAN
+    #define WIN32_LEAN_AND_MEAN
+    #define __UNDEF_LEAN_AND_MEAN
+  #endif
+  #include <windows.h>
+  #ifdef __UNDEF_LEAN_AND_MEAN
+    #undef WIN32_LEAN_AND_MEAN
+    #undef __UNDEF_LEAN_AND_MEAN
+  #endif
+#else
+  #ifdef _FAST_MUTEX_ASM_
+    #include <sched.h>
+  #else
+    #include <pthread.h>
+  #endif
+#endif
+
+namespace tthread {
+
+/// Fast mutex class.
+/// This is a mutual exclusion object for synchronizing access to shared
+/// memory areas for several threads. It is similar to the tthread::mutex class,
+/// but instead of using system level functions, it is implemented as an atomic
+/// spin lock with very low CPU overhead.
+///
+/// The \c fast_mutex class is NOT compatible with the \c condition_variable
+/// class (however, it IS compatible with the \c lock_guard class). It should
+/// also be noted that the \c fast_mutex class typically does not provide
+/// as accurate thread scheduling as the standard \c mutex class does.
+///
+/// Because of the limitations of the class, it should only be used in
+/// situations where the mutex needs to be locked/unlocked very frequently.
+///
+/// @note The "fast" version of this class relies on inline assembler language,
+/// which is currently only supported for 32/64-bit Intel x86/AMD64 and
+/// PowerPC architectures on a limited number of compilers (GNU g++ and MS
+/// Visual C++).
+/// For other architectures/compilers, system functions are used instead.
+class fast_mutex {
+  public:
+    /// Constructor.
+#if defined(_FAST_MUTEX_ASM_)
+    fast_mutex() : mLock(0) {}   // 0 means unlocked (see try_lock() and unlock())
+#else
+    fast_mutex()
+    {
+  #if defined(_TTHREAD_WIN32_)
+      InitializeCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_init(&mHandle, NULL);
+  #endif
+    }
+#endif
+
+#if !defined(_FAST_MUTEX_ASM_)
+    /// Destructor.
+    ~fast_mutex()
+    {
+  #if defined(_TTHREAD_WIN32_)
+      DeleteCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_destroy(&mHandle);
+  #endif
+    }
+#endif
+
+    /// Lock the mutex.
+    /// The method will block the calling thread until a lock on the mutex can
+    /// be obtained. The mutex remains locked until \c unlock() is called.
+    /// @see lock_guard
+    inline void lock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+      bool gotLock;
+      do {
+        gotLock = try_lock();
+        if(!gotLock)
+        {
+          // the lock is taken: give up the rest of the time slice before trying again
+  #if defined(_TTHREAD_WIN32_)
+          Sleep(0);
+  #elif defined(_TTHREAD_POSIX_)
+          sched_yield();
+  #endif
+        }
+      } while(!gotLock);
+#else
+  #if defined(_TTHREAD_WIN32_)
+      EnterCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_lock(&mHandle);
+  #endif
+#endif
+    }
+
+    /// Try to lock the mutex.
+    /// The method will try to lock the mutex. If it fails, the function will
+    /// return immediately (non-blocking).
+    /// @return \c true if the lock was acquired, or \c false if the lock could
+    /// not be acquired.
+    inline bool try_lock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+      int oldLock;
+  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+      asm volatile (
+        "movl $1,%%eax\n\t"     // eax = 1
+        "xchg %%eax,%0\n\t"     // swap eax with mLock
+        "movl %%eax,%1\n\t"     // oldLock = the value mLock held before the swap
+        : "=m" (mLock), "=m" (oldLock)
+        :
+        : "%eax", "memory"
+      );
+  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+      int *ptrLock = &mLock;
+      __asm {
+        mov eax,1
+        mov ecx,ptrLock
+        xchg eax,[ecx]
+        mov oldLock,eax
+      }
+  #elif defined(__GNUC__) && (defined(__ppc__))
+      int newLock = 1;
+      asm volatile (
+        "\n1:\n\t"
+        "lwarx  %0,0,%1\n\t"
+        "cmpwi  0,%0,0\n\t"
+        "bne-   2f\n\t"
+        "stwcx. %2,0,%1\n\t"
+        "bne-   1b\n\t"
+        "isync\n"
+        "2:\n\t"
+        : "=&r" (oldLock)
+        : "r" (&mLock), "r" (newLock)
+        : "cr0", "memory"
+      );
+  #endif
+      return (oldLock == 0);    // acquired exactly when the previous value was 0 (unlocked)
+#else
+  #if defined(_TTHREAD_WIN32_)
+      return TryEnterCriticalSection(&mHandle) ? true : false;
+  #elif defined(_TTHREAD_POSIX_)
+      return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
+  #endif
+#endif
+    }
+
+    /// Unlock the mutex.
+    /// If any threads are waiting for the lock on this mutex, one of them will
+    /// be unblocked.
+    inline void unlock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+      asm volatile (
+        "movl $0,%%eax\n\t"     // eax = 0
+        "xchg %%eax,%0\n\t"     // store 0 (unlocked) into mLock
+        : "=m" (mLock)
+        :
+        : "%eax", "memory"
+      );
+  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+      int *ptrLock = &mLock;
+      __asm {
+        mov eax,0
+        mov ecx,ptrLock
+        xchg eax,[ecx]
+      }
+  #elif defined(__GNUC__) && (defined(__ppc__))
+      asm volatile (
+        "sync\n\t"  // Replace with lwsync where possible?
+        : : : "memory"
+      );
+      mLock = 0;
+  #endif
+#else
+  #if defined(_TTHREAD_WIN32_)
+      LeaveCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_unlock(&mHandle);
+  #endif
+#endif
+    }
+
+  private:
+#if defined(_FAST_MUTEX_ASM_)
+    int mLock;      // 0 when unlocked, 1 while held
+#else
+  #if defined(_TTHREAD_WIN32_)
+    CRITICAL_SECTION mHandle;
+  #elif defined(_TTHREAD_POSIX_)
+    pthread_mutex_t mHandle;
+  #endif
+#endif
+};
+
+}
+
+#endif // _FAST_MUTEX_H_
+
diff --git a/include/vtkmdiy/time.hpp b/include/vtkmdiy/time.hpp
new file mode 100644
index 000000000..d6b44c2e1
--- /dev/null
+++ b/include/vtkmdiy/time.hpp
@@ -0,0 +1,33 @@
+#ifndef DIY_TIME_HPP
+#define DIY_TIME_HPP
+
+#include <sys/time.h>
+
+#ifdef __MACH__
+#include <mach/clock.h>
+#include <mach/mach.h>
+#endif
+
+namespace diy
+{
+
+typedef     unsigned long       time_type;
+
+inline time_type get_time()     // calendar (wall-clock) time in milliseconds
+{
+#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
+    clock_serv_t cclock;
+    mach_timespec_t ts;
+    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+    clock_get_time(cclock, &ts);
+    mach_port_deallocate(mach_task_self(), cclock);
+#else
+    timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+#endif  // widen before scaling: tv_sec may be narrower than time_type (presumably so for mach_timespec_t), and tv_sec*1000 would wrap
+    return static_cast<time_type>(ts.tv_sec)*1000 + static_cast<time_type>(ts.tv_nsec/1000000);
+}
+
+}
+
+#endif
diff --git a/include/vtkmdiy/types.hpp b/include/vtkmdiy/types.hpp
new file mode 100644
index 000000000..d52e75030
--- /dev/null
+++ b/include/vtkmdiy/types.hpp
@@ -0,0 +1,85 @@
+#ifndef DIY_TYPES_HPP
+#define DIY_TYPES_HPP
+
+#include <iostream>
+#include "constants.h"
+#include "point.hpp"
+
+namespace diy
+{
+    struct BlockID                  // ordered and compared by gid alone, see the operators at the end of this namespace
+    {
+        int gid, proc;
+    };
+
+    // A pair of corner points, min and max, with DIY_MAX_DIM coordinates each.
+    template<class Coordinate_>
+    struct Bounds
+    {
+        using Coordinate = Coordinate_;
+        Point<Coordinate, DIY_MAX_DIM>    min, max;
+    };
+    using DiscreteBounds   = Bounds<int>;
+    using ContinuousBounds = Bounds<float>;
+
+    //! Helper to create a 1-dimensional discrete domain with the specified extents
+    inline
+    diy::DiscreteBounds
+    interval(int from, int to)            { DiscreteBounds domain; domain.min[0] = from; domain.max[0] = to; return domain; }
+
+    struct Direction: public Point<int,DIY_MAX_DIM>
+    {
+              Direction()                 { for (int axis = 0; axis < DIY_MAX_DIM; ++axis) (*this)[axis] = 0; }
+              Direction(int dir)
+      {
+          // start from the zero direction, then move one step along each axis whose flag is set in dir
+          for (int axis = 0; axis < DIY_MAX_DIM; ++axis) (*this)[axis] = 0;
+          if (dir & DIY_X0) (*this)[0] -= 1;
+          if (dir & DIY_X1) (*this)[0] += 1;
+          if (dir & DIY_Y0) (*this)[1] -= 1;
+          if (dir & DIY_Y1) (*this)[1] += 1;
+          if (dir & DIY_Z0) (*this)[2] -= 1;
+          if (dir & DIY_Z1) (*this)[2] += 1;
+          if (dir & DIY_T0) (*this)[3] -= 1;
+          if (dir & DIY_T1) (*this)[3] += 1;
+      }
+
+      bool
+      operator==(const diy::Direction& y) const
+      {
+        int axis = 0;
+        while (axis < DIY_MAX_DIM && (*this)[axis] == y[axis])
+            ++axis;
+        return axis == DIY_MAX_DIM;     // equal only if no coordinate differed
+      }
+
+      // lexicographic comparison
+      bool
+      operator<(const diy::Direction& y) const
+      {
+        for (int axis = 0; axis < DIY_MAX_DIM; ++axis)
+            if ((*this)[axis] != y[axis])
+                return (*this)[axis] < y[axis];     // the first differing coordinate decides
+        return false;
+      }
+    };
+
+    // Selector of bounds value type
+    template<class Bounds_>
+    struct BoundsValue
+    {
+        using type = typename Bounds_::Coordinate;
+    };
+
+    inline
+    bool
+    operator<(const diy::BlockID& x, const diy::BlockID& y)
+    { return x.gid < y.gid; }
+
+    inline
+    bool
+    operator==(const diy::BlockID& x, const diy::BlockID& y)
+    { return x.gid == y.gid; }
+}
+
+#endif
diff --git a/include/vtkmdiy/vertices.hpp b/include/vtkmdiy/vertices.hpp
new file mode 100644
index 000000000..423209fd6
--- /dev/null
+++ b/include/vtkmdiy/vertices.hpp
@@ -0,0 +1,54 @@
+#ifndef DIY_VERTICES_HPP
+#define DIY_VERTICES_HPP
+
+#include <iterator>
+
+namespace diy
+{
+namespace detail
+{
+    // true when I is the last coordinate index of Vertex
+    template<class Vertex, size_t I>
+    struct IsLast
+    {
+        static constexpr bool value = (Vertex::dimension() - 1 == I);
+    };
+    // general case: loop over coordinate I and recurse into coordinate I+1
+    template<class Vertex, class Callback, size_t I, bool P>
+    struct ForEach
+    {
+        void operator()(Vertex& pos, const Vertex& from, const Vertex& to, const Callback& callback) const
+        {
+            for (pos[I] = from[I]; pos[I] <= to[I]; ++pos[I])
+                ForEach<Vertex, Callback, I+1, IsLast<Vertex,I+1>::value>()(pos, from, to, callback);
+        }
+    };
+    // last coordinate (P == true): loop over it and invoke the callback for every position
+    template<class Vertex, class Callback, size_t I>
+    struct ForEach<Vertex,Callback,I,true>
+    {
+        void operator()(Vertex& pos, const Vertex& from, const Vertex& to, const Callback& callback) const
+        {
+            for (pos[I] = from[I]; pos[I] <= to[I]; ++pos[I])
+                callback(pos);
+        }
+    };
+}
+// Call callback(pos) for every vertex pos with from[i] <= pos[i] <= to[i] in each coordinate i.
+template<class Vertex, class Callback>
+void for_each(const Vertex& from, const Vertex& to, const Callback& callback)
+{
+    Vertex pos;
+    diy::detail::ForEach<Vertex, Callback, 0, detail::IsLast<Vertex,0>::value>()(pos, from, to, callback);
+}
+// Visit every vertex from Vertex::zero() to shape - Vertex::one(), both inclusive.
+template<class Vertex, class Callback>
+void for_each(const Vertex& shape, const Callback& callback)
+{
+    // specify the diy namespace (these templates live in diy, not grid) to disambiguate with std::for_each(...)
+    diy::for_each(Vertex::zero(), shape - Vertex::one(), callback);
+}
+
+}
+
+#endif

From ce193592c52fce9442e87fc99f029a78f0aaf903 Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 14:35:00 -0500
Subject: [PATCH 19/24] diy: update cmake rules.

DIY Thirdparty module now builds and also generates a
`vtkm/thirdparty/diy/Configure.h` which can in future support external
DIY.
---
 vtkm/thirdparty/diy/CMakeLists.txt | 67 ++++++++++++++++++++++++++++++
 vtkm/thirdparty/diy/Configure.h.in | 35 ++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 vtkm/thirdparty/diy/CMakeLists.txt
 create mode 100644 vtkm/thirdparty/diy/Configure.h.in

diff --git a/vtkm/thirdparty/diy/CMakeLists.txt b/vtkm/thirdparty/diy/CMakeLists.txt
new file mode 100644
index 000000000..9d84d87b4
--- /dev/null
+++ b/vtkm/thirdparty/diy/CMakeLists.txt
@@ -0,0 +1,67 @@
+##=============================================================================
+##
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##
+##  Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+##  Copyright 2017 UT-Battelle, LLC.
+##  Copyright 2017 Los Alamos National Security.
+##
+##  Under the terms of Contract DE-NA0003525 with NTESS,
+##  the U.S. Government retains certain rights in this software.
+##  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+##  Laboratory (LANL), the U.S. Government retains certain rights in
+##  this software.
+##
+##=============================================================================
+add_library(diy INTERFACE)
+
+vtkm_get_kit_name(kit_name kit_dir)
+
+# diy needs C++11
+target_compile_features(diy INTERFACE cxx_auto_type)
+
+# placeholder to support external DIY
+set(VTKM_USE_EXTERNAL_DIY OFF)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Configure.h.in
+  ${VTKm_BINARY_INCLUDE_DIR}/${kit_dir}/Configure.h)
+
+target_include_directories(diy INTERFACE
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+  $<INSTALL_INTERFACE:${VTKm_INSTALL_INCLUDE_DIR}>)
+
+# presently, this dependency is required. Make it optional in the future.
+set(arg)
+foreach(apath IN LISTS MPI_C_INCLUDE_PATH MPI_CXX_INCLUDE_PATH)
+  list(APPEND arg $<BUILD_INTERFACE:${apath}>)
+endforeach()
+target_include_directories(diy INTERFACE ${arg})
+
+target_link_libraries(diy INTERFACE
+  $<BUILD_INTERFACE:${MPI_C_LIBRARIES}>
+  $<BUILD_INTERFACE:${MPI_CXX_LIBRARIES}>)
+
+if(MPI_C_COMPILE_DEFINITIONS)
+  target_compile_definitions(diy INTERFACE
+    $<$<COMPILE_LANGUAGE:C>:${MPI_C_COMPILE_DEFINITIONS}>)
+endif()
+if(MPI_CXX_COMPILE_DEFINITIONS)  # FindMPI result variable; mirrors the MPI_C stanza
+  target_compile_definitions(diy INTERFACE
+    $<$<COMPILE_LANGUAGE:CXX>:${MPI_CXX_COMPILE_DEFINITIONS}>)
+endif()
+
+install(TARGETS diy
+  EXPORT ${VTKm_EXPORT_NAME})
+
+## Install headers
+install(DIRECTORY vtkmdiy
+  DESTINATION ${VTKm_INSTALL_INCLUDE_DIR}/${kit_dir}/)
+install(FILES ${VTKm_BINARY_INCLUDE_DIR}/${kit_dir}/Configure.h
+  DESTINATION ${VTKm_INSTALL_INCLUDE_DIR}/${kit_dir}/)
diff --git a/vtkm/thirdparty/diy/Configure.h.in b/vtkm/thirdparty/diy/Configure.h.in
new file mode 100644
index 000000000..9aafe6e00
--- /dev/null
+++ b/vtkm/thirdparty/diy/Configure.h.in
@@ -0,0 +1,35 @@
+//=============================================================================
+//
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2015 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2015 UT-Battelle, LLC.
+//  Copyright 2015 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//
+//=============================================================================
+#ifndef vtkm_diy_h
+#define vtkm_diy_h
+
+/* Use the diy library configured for VTK-m. */
+#cmakedefine01 VTKM_USE_EXTERNAL_DIY
+
+#if VTKM_USE_EXTERNAL_DIY
+# define VTKM_DIY(header) <header>
+#else
+# define VTKM_DIY(header) <vtkmdiy/include/header>
+# define diy vtkmdiy // mangle namespace diy
+#endif
+
+#endif

From 92cf8bf6d450e2ccae1004562a59dde4db5d457d Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 14:37:23 -0500
Subject: [PATCH 20/24] cmake: update cmake rules to use `vtkm/thirdparty/diy`.

---
 CMakeLists.txt      | 5 -----
 vtkm/CMakeLists.txt | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f57e3194..51fdb0e11 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,11 +221,6 @@ find_package(Pyexpander)
 
 #-----------------------------------------------------------------------------
 # Add subdirectories
-if(VTKm_ENABLE_MPI)
-  # This `if` is temporary and will be removed once `diy` supports building
-  # without MPI.
-  add_subdirectory(diy)
-endif()
 add_subdirectory(vtkm)
 
 #-----------------------------------------------------------------------------
diff --git a/vtkm/CMakeLists.txt b/vtkm/CMakeLists.txt
index 1edc1a9b6..436d07569 100644
--- a/vtkm/CMakeLists.txt
+++ b/vtkm/CMakeLists.txt
@@ -67,6 +67,11 @@ vtkm_declare_headers(${headers})
 
 #-----------------------------------------------------------------------------
 #first add all the components vtkm that are shared between control and exec
+if(VTKm_ENABLE_MPI)
+  # This `if` is temporary and will be removed once `diy` supports building
+  # without MPI.
+  add_subdirectory(thirdparty/diy)
+endif()
 add_subdirectory(testing)
 add_subdirectory(internal)
 

From 37969e9602fe4e9ff7c84cbbf5416ae9d79a13fa Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 14:37:54 -0500
Subject: [PATCH 21/24] Use `VTKM_DIY()` to include diy headers.

This makes it easier to mangle diy and support using external diy.
---
 vtkm/cont/AssignerMultiBlock.cxx         |  4 +++-
 vtkm/cont/AssignerMultiBlock.h           |  8 ++++++--
 vtkm/cont/EnvironmentTracker.cxx         |  7 ++++++-
 vtkm/cont/EnvironmentTracker.h           |  5 +++++
 vtkm/cont/MultiBlock.cxx                 | 13 ++++++++-----
 vtkm/cont/testing/UnitTestMultiBlock.cxx |  7 ++++++-
 6 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/vtkm/cont/AssignerMultiBlock.cxx b/vtkm/cont/AssignerMultiBlock.cxx
index 629fcabf0..ecb34ce8f 100644
--- a/vtkm/cont/AssignerMultiBlock.cxx
+++ b/vtkm/cont/AssignerMultiBlock.cxx
@@ -21,8 +21,10 @@
 
 #if defined(VTKM_ENABLE_MPI)
 
-#include <diy/mpi.hpp>
+// clang-format off
 #include <vtkm/cont/EnvironmentTracker.h>
+#include VTKM_DIY(diy/mpi.hpp)
+// clang-format on
 
 #include <algorithm> // std::lower_bound
 #include <numeric>   // std::iota
diff --git a/vtkm/cont/AssignerMultiBlock.h b/vtkm/cont/AssignerMultiBlock.h
index fb7440609..bbd32a513 100644
--- a/vtkm/cont/AssignerMultiBlock.h
+++ b/vtkm/cont/AssignerMultiBlock.h
@@ -21,11 +21,15 @@
 #define vtk_m_cont_AssignerMultiBlock_h
 
 #include <vtkm/internal/Configure.h>
-#if defined(VTKM_ENABLE_MPI)
 
-#include <diy/assigner.hpp>
+#if defined(VTKM_ENABLE_MPI)
 #include <vtkm/cont/MultiBlock.h>
 
+// clang-format off
+#include <vtkm/thirdparty/diy/Configure.h>
+#include VTKM_DIY(diy/assigner.hpp)
+// clang-format on
+
 namespace vtkm
 {
 namespace cont
diff --git a/vtkm/cont/EnvironmentTracker.cxx b/vtkm/cont/EnvironmentTracker.cxx
index 942ea4255..9767d90bc 100644
--- a/vtkm/cont/EnvironmentTracker.cxx
+++ b/vtkm/cont/EnvironmentTracker.cxx
@@ -20,7 +20,12 @@
 #include <vtkm/cont/EnvironmentTracker.h>
 
 #if defined(VTKM_ENABLE_MPI)
-#include <diy/mpi.hpp>
+
+// clang-format off
+#include <vtkm/thirdparty/diy/Configure.h>
+#include VTKM_DIY(diy/mpi.hpp)
+// clang-format on
+
 #else
 namespace diy
 {
diff --git a/vtkm/cont/EnvironmentTracker.h b/vtkm/cont/EnvironmentTracker.h
index a046f8c77..d26785abc 100644
--- a/vtkm/cont/EnvironmentTracker.h
+++ b/vtkm/cont/EnvironmentTracker.h
@@ -25,6 +25,11 @@
 #include <vtkm/internal/Configure.h>
 #include <vtkm/internal/ExportMacros.h>
 
+#if defined(VTKM_ENABLE_MPI)
+// needed for diy mangling.
+#include <vtkm/thirdparty/diy/Configure.h>
+#endif
+
 namespace diy
 {
 namespace mpi
diff --git a/vtkm/cont/MultiBlock.cxx b/vtkm/cont/MultiBlock.cxx
index f2bdd66ad..07287912a 100644
--- a/vtkm/cont/MultiBlock.cxx
+++ b/vtkm/cont/MultiBlock.cxx
@@ -32,11 +32,14 @@
 #include <vtkm/cont/MultiBlock.h>
 
 #if defined(VTKM_ENABLE_MPI)
-#include <diy/decomposition.hpp>
-#include <diy/master.hpp>
-#include <diy/partners/all-reduce.hpp>
-#include <diy/partners/swap.hpp>
-#include <diy/reduce.hpp>
+// clang-format off
+#include <vtkm/thirdparty/diy/Configure.h>
+#include VTKM_DIY(diy/decomposition.hpp)
+#include VTKM_DIY(diy/master.hpp)
+#include VTKM_DIY(diy/partners/all-reduce.hpp)
+#include VTKM_DIY(diy/partners/swap.hpp)
+#include VTKM_DIY(diy/reduce.hpp)
+// clang-format on
 
 namespace vtkm
 {
diff --git a/vtkm/cont/testing/UnitTestMultiBlock.cxx b/vtkm/cont/testing/UnitTestMultiBlock.cxx
index b341b9449..460d9fd59 100644
--- a/vtkm/cont/testing/UnitTestMultiBlock.cxx
+++ b/vtkm/cont/testing/UnitTestMultiBlock.cxx
@@ -36,7 +36,12 @@
 #include <vtkm/exec/ConnectivityStructured.h>
 
 #if defined(VTKM_ENABLE_MPI)
-#include <diy/master.hpp>
+
+// clang-format off
+#include <vtkm/thirdparty/diy/Configure.h>
+#include VTKM_DIY(diy/master.hpp)
+// clang-format on
+
 #endif
 
 void DataSet_Compare(vtkm::cont::DataSet& LeftDateSet, vtkm::cont::DataSet& RightDateSet);

From 4b4e43ae830f45295d3e82eedf9265674b3d1557 Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 14:38:33 -0500
Subject: [PATCH 22/24] update VTKmCheckCopyright for new diy header locations.

---
 CMake/VTKmCheckCopyright.cmake | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/CMake/VTKmCheckCopyright.cmake b/CMake/VTKmCheckCopyright.cmake
index cddeb3f73..d201a570c 100644
--- a/CMake/VTKmCheckCopyright.cmake
+++ b/CMake/VTKmCheckCopyright.cmake
@@ -39,9 +39,7 @@ set(FILES_TO_CHECK
 set(EXCEPTIONS
   LICENSE.txt
   README.txt
-  diy/include/diy
-  diy/LEGAL.txt
-  diy/LICENSE.txt
+  vtkm/thirdparty/diy/vtkmdiy
   )
 
 if (NOT VTKm_SOURCE_DIR)

From 4a05277296a185f82b8b58a62be326592de5f03c Mon Sep 17 00:00:00 2001
From: Utkarsh Ayachit <utkarsh.ayachit@kitware.com>
Date: Wed, 3 Jan 2018 15:08:22 -0500
Subject: [PATCH 23/24] Exclude thirdparty/diy from VTKmCheckSourceInBuild
 check.

---
 CMake/VTKmCheckSourceInBuild.cmake | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/CMake/VTKmCheckSourceInBuild.cmake b/CMake/VTKmCheckSourceInBuild.cmake
index f727ef6ab..f0f7af500 100644
--- a/CMake/VTKmCheckSourceInBuild.cmake
+++ b/CMake/VTKmCheckSourceInBuild.cmake
@@ -39,11 +39,21 @@ set(FILES_TO_CHECK
 set(EXCEPTIONS
   )
 
+set(DIRECTORY_EXCEPTIONS
+  ${VTKm_SOURCE_DIR}/vtkm/thirdparty/diy/vtkmdiy
+  )
+
 if (NOT VTKm_SOURCE_DIR)
   message(SEND_ERROR "VTKm_SOURCE_DIR not defined.")
 endif (NOT VTKm_SOURCE_DIR)
 
 function(check_directory directory parent_CMakeLists_contents)
+  foreach(exception IN LISTS DIRECTORY_EXCEPTIONS)  # excluded directories are skipped entirely
+    if("${directory}" STREQUAL "${exception}")  # literal compare: a path must not be used as a regex
+      return()
+    endif()
+  endforeach(exception)
+
   message("Checking directory ${directory}...")
 
   get_filename_component(directory_name "${directory}" NAME)

From a8415d8e37bd095be8ddb38381d1c3aa8155b456 Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Wed, 3 Jan 2018 10:24:17 -0500
Subject: [PATCH 24/24] VTK-m now widens result type for
 UInt8/Int8/UInt16/Int16 input.

When using vtkm::dot on narrow types the result can easily roll over.
Instead, the result type of vtkm::dot is now wide enough (32 bits) to store
the result for such inputs.

Fixes #193
---
 vtkm/Types.h                                  | 123 ++++++++++--------
 .../testing/UnitTestArrayHandleTransform.cxx  |  14 +-
 vtkm/testing/UnitTestTypes.cxx                |  23 +++-
 vtkm/testing/VecTraitsTests.h                 |   8 +-
 4 files changed, 94 insertions(+), 74 deletions(-)

diff --git a/vtkm/Types.h b/vtkm/Types.h
index 5bdab9783..85dc460bb 100644
--- a/vtkm/Types.h
+++ b/vtkm/Types.h
@@ -516,22 +516,6 @@ public:
   VTKM_EXEC_CONT
   bool operator!=(const DerivedClass& other) const { return !(this->operator==(other)); }
 
-  VTKM_EXEC_CONT
-  ComponentType Dot(const VecBaseCommon<ComponentType, DerivedClass>& other) const
-  {
-    // Why the static_cast here and below? Because * on small integers (char,
-    // short) promotes the result to a 32-bit int. After helpfully promoting
-    // the width of the result, some compilers then warn you about casting it
-    // back to the type you were expecting in the first place. The static_cast
-    // suppresses this warning.
-    ComponentType result = static_cast<ComponentType>(this->Component(0) * other.Component(0));
-    for (vtkm::IdComponent i = 1; i < this->NumComponents(); ++i)
-    {
-      result = static_cast<ComponentType>(result + this->Component(i) * other.Component(i));
-    }
-    return result;
-  }
-
 #if (!(defined(VTKM_CUDA) && (__CUDACC_VER_MAJOR__ < 8)))
 #if (defined(VTKM_GCC) || defined(VTKM_CLANG))
 #pragma GCC diagnostic push
@@ -1241,46 +1225,85 @@ VTKM_EXEC_CONT static inline vtkm::VecCConst<T> make_VecC(const T* array, vtkm::
   return vtkm::VecCConst<T>(array, size);
 }
 
-// A pre-declaration of vtkm::Pair so that classes templated on them can refer
-// to it. The actual implementation is in vtkm/Pair.h.
-template <typename U, typename V>
-struct Pair;
-
-template <typename T, vtkm::IdComponent Size>
-static inline VTKM_EXEC_CONT T dot(const vtkm::Vec<T, Size>& a, const vtkm::Vec<T, Size>& b)
+namespace detail
 {
-  T result = T(a[0] * b[0]);
-  for (vtkm::IdComponent i = 1; i < Size; ++i)
+template <typename T>
+struct DotType // maps a component type T to the type used for dot-product results
+{
+  // A T narrower than vtkm::Float32 is widened: integral T becomes Int32 or UInt32 (matching its
+  // signedness), any other narrow T (e.g. a float16) becomes Float32. Otherwise T is kept as is.
+  using float_type = vtkm::Float32;
+  using integer_type =
+    typename std::conditional<std::is_signed<T>::value, vtkm::Int32, vtkm::UInt32>::type;
+  using promote_type =
+    typename std::conditional<std::is_integral<T>::value, integer_type, float_type>::type;
+  using type =
+    typename std::conditional<(sizeof(T) < sizeof(vtkm::Float32)), promote_type, T>::type;
+};
+
+template <typename T>
+static inline VTKM_EXEC_CONT typename DotType<typename T::ComponentType>::type vec_dot(const T& a,
+                                                                                       const T& b)
+{
+  using U = typename DotType<typename T::ComponentType>::type;
+  U result = a[0] * b[0];
+  for (vtkm::IdComponent i = 1; i < a.GetNumberOfComponents(); ++i)
   {
-    result = T(result + a[i] * b[i]);
+    result = result + a[i] * b[i];
   }
   return result;
 }
-
-template <typename T>
-static inline VTKM_EXEC_CONT T dot(const vtkm::Vec<T, 2>& a, const vtkm::Vec<T, 2>& b)
+template <typename T, vtkm::IdComponent Size>
+static inline VTKM_EXEC_CONT typename DotType<T>::type vec_dot(const vtkm::Vec<T, Size>& a,
+                                                               const vtkm::Vec<T, Size>& b)
 {
-  return T((a[0] * b[0]) + (a[1] * b[1]));
+  using U = typename DotType<T>::type;
+  U result = a[0] * b[0];
+  for (vtkm::IdComponent i = 1; i < Size; ++i)
+  {
+    result = result + a[i] * b[i];
+  }
+  return result;
+}
 }
 
 template <typename T>
-static inline VTKM_EXEC_CONT T dot(const vtkm::Vec<T, 3>& a, const vtkm::Vec<T, 3>& b)
+static inline VTKM_EXEC_CONT auto dot(const T& a, const T& b) -> decltype(detail::vec_dot(a, b))
 {
-  return T((a[0] * b[0]) + (a[1] * b[1]) + (a[2] * b[2]));
+  return detail::vec_dot(a, b);
 }
-
 template <typename T>
-static inline VTKM_EXEC_CONT T dot(const vtkm::Vec<T, 4>& a, const vtkm::Vec<T, 4>& b)
+static inline VTKM_EXEC_CONT typename detail::DotType<T>::type dot(const vtkm::Vec<T, 2>& a,
+                                                                   const vtkm::Vec<T, 2>& b)
 {
-  return T((a[0] * b[0]) + (a[1] * b[1]) + (a[2] * b[2]) + (a[3] * b[3]));
+  return (a[0] * b[0]) + (a[1] * b[1]);
 }
-
-template <typename T, typename VecType>
-static inline VTKM_EXEC_CONT T dot(const vtkm::detail::VecBaseCommon<T, VecType>& a,
-                                   const vtkm::detail::VecBaseCommon<T, VecType>& b)
+template <typename T>
+static inline VTKM_EXEC_CONT typename detail::DotType<T>::type dot(const vtkm::Vec<T, 3>& a,
+                                                                   const vtkm::Vec<T, 3>& b)
 {
-  return a.Dot(b);
+  return (a[0] * b[0]) + (a[1] * b[1]) + (a[2] * b[2]);
 }
+template <typename T>
+static inline VTKM_EXEC_CONT typename detail::DotType<T>::type dot(const vtkm::Vec<T, 4>& a,
+                                                                   const vtkm::Vec<T, 4>& b)
+{
+  return (a[0] * b[0]) + (a[1] * b[1]) + (a[2] * b[2]) + (a[3] * b[3]);
+}
+// Scalar dot: cast to the result type *before* multiplying (UInt16 * UInt16 can overflow int).
+#define VTK_M_SCALAR_DOT(stype)                                                                    \
+  static inline VTKM_EXEC_CONT detail::DotType<stype>::type dot(stype a, stype b)                  \
+  { using R = detail::DotType<stype>::type; return static_cast<R>(a) * static_cast<R>(b); }
+VTK_M_SCALAR_DOT(vtkm::Int8)
+VTK_M_SCALAR_DOT(vtkm::UInt8)
+VTK_M_SCALAR_DOT(vtkm::Int16)
+VTK_M_SCALAR_DOT(vtkm::UInt16)
+VTK_M_SCALAR_DOT(vtkm::Int32)
+VTK_M_SCALAR_DOT(vtkm::UInt32)
+VTK_M_SCALAR_DOT(vtkm::Int64)
+VTK_M_SCALAR_DOT(vtkm::UInt64)
+VTK_M_SCALAR_DOT(vtkm::Float32)
+VTK_M_SCALAR_DOT(vtkm::Float64)
 
 template <typename T, vtkm::IdComponent Size>
 VTKM_EXEC_CONT T ReduceSum(const vtkm::Vec<T, Size>& a)
@@ -1340,22 +1363,10 @@ VTKM_EXEC_CONT T ReduceProduct(const vtkm::Vec<T, 4>& a)
   return a[0] * a[1] * a[2] * a[3];
 }
 
-// Integer types of a width less than an integer get implicitly casted to
-// an integer when doing a multiplication.
-#define VTK_M_INTEGER_PROMOTION_SCALAR_DOT(type)                                                   \
-  static inline VTKM_EXEC_CONT type dot(type a, type b) { return static_cast<type>(a * b); }
-VTK_M_INTEGER_PROMOTION_SCALAR_DOT(vtkm::Int8)
-VTK_M_INTEGER_PROMOTION_SCALAR_DOT(vtkm::UInt8)
-VTK_M_INTEGER_PROMOTION_SCALAR_DOT(vtkm::Int16)
-VTK_M_INTEGER_PROMOTION_SCALAR_DOT(vtkm::UInt16)
-#define VTK_M_SCALAR_DOT(type)                                                                     \
-  static inline VTKM_EXEC_CONT type dot(type a, type b) { return a * b; }
-VTK_M_SCALAR_DOT(vtkm::Int32)
-VTK_M_SCALAR_DOT(vtkm::UInt32)
-VTK_M_SCALAR_DOT(vtkm::Int64)
-VTK_M_SCALAR_DOT(vtkm::UInt64)
-VTK_M_SCALAR_DOT(vtkm::Float32)
-VTK_M_SCALAR_DOT(vtkm::Float64)
+// A pre-declaration of vtkm::Pair so that classes templated on them can refer
+// to it. The actual implementation is in vtkm/Pair.h.
+template <typename U, typename V>
+struct Pair;
 
 } // End of namespace vtkm
 
diff --git a/vtkm/cont/testing/UnitTestArrayHandleTransform.cxx b/vtkm/cont/testing/UnitTestArrayHandleTransform.cxx
index 309f8289c..edccc0e8a 100644
--- a/vtkm/cont/testing/UnitTestArrayHandleTransform.cxx
+++ b/vtkm/cont/testing/UnitTestArrayHandleTransform.cxx
@@ -37,11 +37,10 @@ namespace
 
 const vtkm::Id ARRAY_SIZE = 10;
 
-template <typename ValueType>
 struct MySquare
 {
   template <typename U>
-  VTKM_EXEC ValueType operator()(U u) const
+  VTKM_EXEC auto operator()(U u) const -> decltype(vtkm::dot(u, u))
   {
     return vtkm::dot(u, u);
   }
@@ -59,7 +58,7 @@ struct CheckTransformFunctor : vtkm::exec::FunctorBase
     using T = typename TransformedPortalType::ValueType;
     typename OriginalPortalType::ValueType original = this->OriginalPortal.Get(index);
     T transformed = this->TransformedPortal.Get(index);
-    if (!test_equal(transformed, MySquare<T>()(original)))
+    if (!test_equal(transformed, MySquare{}(original)))
     {
       this->RaiseError("Encountered bad transformed value.");
     }
@@ -107,7 +106,7 @@ VTKM_CONT void CheckControlPortals(const OriginalArrayHandleType& originalArray,
     using T = typename TransformedPortalType::ValueType;
     typename OriginalPortalType::ValueType original = originalPortal.Get(index);
     T transformed = transformedPortal.Get(index);
-    VTKM_TEST_ASSERT(test_equal(transformed, MySquare<T>()(original)), "Bad transform value.");
+    VTKM_TEST_ASSERT(test_equal(transformed, MySquare{}(original)), "Bad transform value.");
   }
 }
 
@@ -115,20 +114,19 @@ template <typename InputValueType>
 struct TransformTests
 {
   using OutputValueType = typename vtkm::VecTraits<InputValueType>::ComponentType;
-  using FunctorType = MySquare<OutputValueType>;
 
   using TransformHandle =
-    vtkm::cont::ArrayHandleTransform<vtkm::cont::ArrayHandle<InputValueType>, FunctorType>;
+    vtkm::cont::ArrayHandleTransform<vtkm::cont::ArrayHandle<InputValueType>, MySquare>;
 
   using CountingTransformHandle =
-    vtkm::cont::ArrayHandleTransform<vtkm::cont::ArrayHandleCounting<InputValueType>, FunctorType>;
+    vtkm::cont::ArrayHandleTransform<vtkm::cont::ArrayHandleCounting<InputValueType>, MySquare>;
 
   using Device = VTKM_DEFAULT_DEVICE_ADAPTER_TAG;
   using Algorithm = vtkm::cont::DeviceAdapterAlgorithm<Device>;
 
   void operator()() const
   {
-    FunctorType functor;
+    MySquare functor;
 
     std::cout << "Test a transform handle with a counting handle as the values" << std::endl;
     vtkm::cont::ArrayHandleCounting<InputValueType> counting = vtkm::cont::make_ArrayHandleCounting(
diff --git a/vtkm/testing/UnitTestTypes.cxx b/vtkm/testing/UnitTestTypes.cxx
index 9a8be0a1e..e5dcb0030 100644
--- a/vtkm/testing/UnitTestTypes.cxx
+++ b/vtkm/testing/UnitTestTypes.cxx
@@ -189,7 +189,7 @@ void GeneralVecCTypeTest(const vtkm::Vec<ComponentType, Size>&)
   div = aSrc / b;
   VTKM_TEST_ASSERT(test_equal(div, correct_div), "Tuples not divided correctly.");
 
-  ComponentType d = vtkm::dot(a, b);
+  ComponentType d = static_cast<ComponentType>(vtkm::dot(a, b));
   ComponentType correct_d = 0;
   for (vtkm::IdComponent i = 0; i < Size; ++i)
   {
@@ -286,7 +286,7 @@ void GeneralVecCConstTypeTest(const vtkm::Vec<ComponentType, Size>&)
   div = aSrc / b;
   VTKM_TEST_ASSERT(test_equal(div, correct_div), "Tuples not divided correctly.");
 
-  ComponentType d = vtkm::dot(a, b);
+  ComponentType d = static_cast<ComponentType>(vtkm::dot(a, b));
   ComponentType correct_d = 0;
   for (vtkm::IdComponent i = 0; i < Size; ++i)
   {
@@ -403,7 +403,7 @@ void GeneralVecTypeTest(const vtkm::Vec<ComponentType, Size>&)
   div = a / ComponentType(2);
   VTKM_TEST_ASSERT(test_equal(div, b), "Tuple does not divide by Scalar correctly.");
 
-  ComponentType d = vtkm::dot(a, b);
+  ComponentType d = static_cast<ComponentType>(vtkm::dot(a, b));
   ComponentType correct_d = 0;
   for (vtkm::IdComponent i = 0; i < T::NUM_COMPONENTS; ++i)
   {
@@ -477,7 +477,7 @@ void TypeTest(const vtkm::Vec<Scalar, 2>&)
   VTKM_TEST_ASSERT(test_equal(div, vtkm::make_Vec(1, 2)),
                    "Vector does not divide by Scalar correctly.");
 
-  Scalar d = vtkm::dot(a, b);
+  Scalar d = static_cast<Scalar>(vtkm::dot(a, b));
   VTKM_TEST_ASSERT(test_equal(d, Scalar(10)), "dot(Vector2) wrong");
 
   VTKM_TEST_ASSERT(!(a < b), "operator< wrong");
@@ -539,7 +539,7 @@ void TypeTest(const vtkm::Vec<Scalar, 3>&)
   div = a / Scalar(2);
   VTKM_TEST_ASSERT(test_equal(div, b), "Vector does not divide by Scalar correctly.");
 
-  Scalar d = vtkm::dot(a, b);
+  Scalar d = static_cast<Scalar>(vtkm::dot(a, b));
   VTKM_TEST_ASSERT(test_equal(d, Scalar(28)), "dot(Vector3) wrong");
 
   VTKM_TEST_ASSERT(!(a < b), "operator< wrong");
@@ -601,7 +601,7 @@ void TypeTest(const vtkm::Vec<Scalar, 4>&)
   div = a / Scalar(2);
   VTKM_TEST_ASSERT(test_equal(div, b), "Vector does not divide by Scalar correctly.");
 
-  Scalar d = vtkm::dot(a, b);
+  Scalar d = static_cast<Scalar>(vtkm::dot(a, b));
   VTKM_TEST_ASSERT(test_equal(d, Scalar(60)), "dot(Vector4) wrong");
 
   VTKM_TEST_ASSERT(!(a < b), "operator< wrong");
@@ -672,6 +672,17 @@ void TypeTest(Scalar)
   {
     VTKM_TEST_FAIL("dot(Scalar) wrong");
   }
+
+  //verify we don't roll over
+  Scalar c = 128;
+  Scalar d = 32;
+  auto r = vtkm::dot(c, d);
+  VTKM_TEST_ASSERT((sizeof(r) >= sizeof(int)),
+                   "dot(Scalar) didn't promote smaller than 32bit types");
+  if (r != 4096)
+  {
+    VTKM_TEST_FAIL("dot(Scalar) wrong");
+  }
 }
 
 struct TypeTestFunctor
diff --git a/vtkm/testing/VecTraitsTests.h b/vtkm/testing/VecTraitsTests.h
index eff54626d..b32846618 100644
--- a/vtkm/testing/VecTraitsTests.h
+++ b/vtkm/testing/VecTraitsTests.h
@@ -135,14 +135,14 @@ static void TestVecTypeImpl(const typename std::remove_const<T>::type& inVector,
   VTKM_TEST_ASSERT(test_equal(vectorCopy, inVector), "CopyInto does not work.");
 
   {
-    ComponentType result = 0;
+    auto expected = vtkm::dot(vectorCopy, vectorCopy);
+    decltype(expected) result = 0;
     for (vtkm::IdComponent i = 0; i < NUM_COMPONENTS; i++)
     {
       ComponentType component = Traits::GetComponent(inVector, i);
-      result = ComponentType(result + (component * component));
+      result = result + (component * component);
     }
-    VTKM_TEST_ASSERT(test_equal(result, vtkm::dot(vectorCopy, vectorCopy)),
-                     "Got bad result for dot product");
+    VTKM_TEST_ASSERT(test_equal(result, expected), "Got bad result for dot product");
   }
 
   // This will fail to compile if the tags are wrong.