From 82c40a6374deb408ff485376e9768c6ea5992043 Mon Sep 17 00:00:00 2001
From: Christopher Sewell <csewell@lanl.gov>
Date: Wed, 18 Jan 2017 11:43:49 -0700
Subject: [PATCH 1/3] First support for unified memory

---
 CMakeLists.txt                                |  12 +
 examples/CMakeLists.txt                       |   2 +-
 examples/unified_memory/CMakeLists.txt        |  36 +++
 examples/unified_memory/UnifiedMemory.cu      | 223 ++++++++++++++++++
 .../ArrayManagerExecutionThrustDevice.h       | 108 ++++++++-
 5 files changed, 372 insertions(+), 9 deletions(-)
 create mode 100644 examples/unified_memory/CMakeLists.txt
 create mode 100644 examples/unified_memory/UnifiedMemory.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ca6f4eb0..1df2fe8b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,6 +104,18 @@ option(VTKm_USE_DOUBLE_PRECISION
   )
 option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON)
 
+if (VTKm_ENABLE_CUDA)
+  option(VTKm_USE_UNIFIED_MEMORY
+    "Use CUDA unified memory"
+    OFF
+    )
+endif (VTKm_ENABLE_CUDA)
+
+if (VTKm_USE_UNIFIED_MEMORY)
+  set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}")
+endif()
+
+
 option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON)
 set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c993fcd02..dd49bb824 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,7 +24,6 @@
 set(CMAKE_PREFIX_PATH ${VTKm_BINARY_DIR}/${VTKm_INSTALL_CONFIG_DIR})
 
 add_subdirectory(clipping)
-add_subdirectory(contour_tree)
 add_subdirectory(demo)
 add_subdirectory(dynamic_dispatcher)
 add_subdirectory(hello_world)
@@ -35,4 +34,5 @@ add_subdirectory(tetrahedra)
 if(VTKm_ENABLE_RENDERING)
   add_subdirectory(rendering)
 endif()
+add_subdirectory(unified_memory)
 
diff --git a/examples/unified_memory/CMakeLists.txt b/examples/unified_memory/CMakeLists.txt
new file mode 100644
index 000000000..7f7519ba7
--- /dev/null
+++ b/examples/unified_memory/CMakeLists.txt
@@ -0,0 +1,36 @@
+##=============================================================================
+##
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##
+##  Copyright 2015 Sandia Corporation.
+##  Copyright 2015 UT-Battelle, LLC.
+##  Copyright 2015 Los Alamos National Security.
+##
+##  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+##  the U.S. Government retains certain rights in this software.
+##  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+##  Laboratory (LANL), the U.S. Government retains certain rights in
+##  this software.
+##
+##=============================================================================
+
+#Find the VTK-m package
+find_package(VTKm REQUIRED QUIET
+  OPTIONAL_COMPONENTS Serial CUDA TBB OpenGL GLUT
+  )
+
+if(VTKm_CUDA_FOUND)
+  vtkm_disable_troublesome_thrust_warnings()
+  # Cuda compiles do not respect target_include_directories
+  cuda_include_directories(${VTKm_INCLUDE_DIRS})
+  cuda_add_executable(UnifiedMemory_CUDA UnifiedMemory.cu)
+  target_link_libraries(UnifiedMemory_CUDA ${VTKm_LIBRARIES})
+  target_compile_options(UnifiedMemory_CUDA PRIVATE ${VTKm_COMPILE_OPTIONS})
+endif()
+
diff --git a/examples/unified_memory/UnifiedMemory.cu b/examples/unified_memory/UnifiedMemory.cu
new file mode 100644
index 000000000..826f9aa9f
--- /dev/null
+++ b/examples/unified_memory/UnifiedMemory.cu
@@ -0,0 +1,223 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 Sandia Corporation.
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_CUDA
+
+#include <vtkm/cont/ArrayHandleStreaming.h>
+#include <vtkm/worklet/DispatcherStreamingMapField.h>
+#include <vtkm/filter/MarchingCubes.h>
+#include <vtkm/worklet/DispatcherMapField.h>
+
+#include <vtkm/Math.h>
+#include <vtkm/cont/ArrayHandleCounting.h>
+#include <vtkm/cont/CellSetExplicit.h>
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/Timer.h>
+
+
+namespace {
+
+// Define the tangle field for the input data
+class TangleField : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<IdType> vertexId, FieldOut<Scalar> v);
+  typedef void ExecutionSignature(_1, _2);
+  typedef _1 InputDomain;
+
+  const vtkm::Id xdim, ydim, zdim;
+  const vtkm::Float32 xmin, ymin, zmin, xmax, ymax, zmax;
+  const vtkm::Id cellsPerLayer;
+
+  VTKM_CONT
+  TangleField(const vtkm::Id3 dims, const vtkm::Float32 mins[3], const vtkm::Float32 maxs[3]) : xdim(dims[0]), ydim(dims[1]), zdim(dims[2]),
+              xmin(mins[0]), ymin(mins[1]), zmin(mins[2]), xmax(maxs[0]), ymax(maxs[1]), zmax(maxs[2]), cellsPerLayer((xdim) * (ydim)) { };
+
+  VTKM_EXEC
+  void operator()(const vtkm::Id &vertexId, vtkm::Float32 &v) const
+  {
+    const vtkm::Id x = vertexId % (xdim);
+    const vtkm::Id y = (vertexId / (xdim)) % (ydim);
+    const vtkm::Id z = vertexId / cellsPerLayer;
+
+    const vtkm::Float32 fx = static_cast<vtkm::Float32>(x) / static_cast<vtkm::Float32>(xdim-1);
+    const vtkm::Float32 fy = static_cast<vtkm::Float32>(y) / static_cast<vtkm::Float32>(xdim-1);
+    const vtkm::Float32 fz = static_cast<vtkm::Float32>(z) / static_cast<vtkm::Float32>(xdim-1);
+
+    const vtkm::Float32 xx = 3.0f*(xmin+(xmax-xmin)*(fx));
+    const vtkm::Float32 yy = 3.0f*(ymin+(ymax-ymin)*(fy));
+    const vtkm::Float32 zz = 3.0f*(zmin+(zmax-zmin)*(fz));
+
+    v = (xx*xx*xx*xx - 5.0f*xx*xx + yy*yy*yy*yy - 5.0f*yy*yy + zz*zz*zz*zz - 5.0f*zz*zz + 11.8f) * 0.2f + 0.5f;
+  }
+};
+
+
+// Construct an input data set using the tangle field worklet
+vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
+{
+  vtkm::cont::DataSet dataSet;
+
+  const vtkm::Id3 vdims(dims[0] + 1, dims[1] + 1, dims[2] + 1);
+
+  vtkm::Float32 mins[3] = {-1.0f, -1.0f, -1.0f};
+  vtkm::Float32 maxs[3] = {1.0f, 1.0f, 1.0f};
+
+  vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
+  vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(0, 1, vdims[0]*vdims[1]*vdims[2]);
+  vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(TangleField(vdims, mins, maxs));
+  tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
+
+  vtkm::Vec<vtkm::FloatDefault,3> origin(0.0f, 0.0f, 0.0f);
+  vtkm::Vec<vtkm::FloatDefault,3> spacing(
+        1.0f/static_cast<vtkm::FloatDefault>(dims[0]),
+        1.0f/static_cast<vtkm::FloatDefault>(dims[2]),
+        1.0f/static_cast<vtkm::FloatDefault>(dims[1]));
+
+  vtkm::cont::ArrayHandleUniformPointCoordinates
+      coordinates(vdims, origin, spacing);
+  dataSet.AddCoordinateSystem(
+          vtkm::cont::CoordinateSystem("coordinates", coordinates));
+
+  dataSet.AddField(vtkm::cont::Field("nodevar", vtkm::cont::Field::ASSOC_POINTS, fieldArray));
+
+  static const vtkm::IdComponent ndim = 3;
+  vtkm::cont::CellSetStructured<ndim> cellSet("cells");
+  cellSet.SetPointDimensions(vdims);
+  dataSet.AddCellSet(cellSet);
+
+  return dataSet;
+}
+
+}
+
+
+namespace vtkm
+{
+namespace worklet
+{
+class SineWorklet : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<>, FieldOut<>);
+  typedef _2 ExecutionSignature(_1, WorkIndex);
+
+  VTKM_EXEC
+  vtkm::Float32 operator()(vtkm::Int64 x, vtkm::Id& index) const {
+    return (vtkm::Sin(1.0*x)); 
+  }
+};
+}
+}
+
+
+// Run a simple worklet, and compute an isosurface
+int main(int argc, char* argv[])
+{
+  vtkm::Int64 N = 1024*1024*1024;
+  if (argc > 1) N = N*atoi(argv[1]);
+  else N = N*4;
+  std::cout << "Testing streaming worklet with size " << N << std::endl;
+
+  vtkm::cont::ArrayHandle<vtkm::Int64> input;
+  vtkm::cont::ArrayHandle<vtkm::Float32> output;
+  std::vector<vtkm::Int64> data(N); 
+  for (vtkm::Int64 i=0; i<N; i++) data[i] = i; 
+  input = vtkm::cont::make_ArrayHandle(data);
+
+  typedef vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> DeviceAlgorithms;
+  vtkm::worklet::SineWorklet sineWorklet;
+
+#ifdef VTKM_USE_UNIFIED_MEMORY
+  std::cout << "Testing with unified memory" << std::endl;
+
+  vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet>
+      dispatcher(sineWorklet);
+
+  vtkm::cont::Timer<> timer;
+
+  dispatcher.Invoke(input, output);
+  std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues()-1) << std::endl;
+
+  vtkm::Float64 elapsedTime = timer.GetElapsedTime();
+  std::cout << "Time: " << elapsedTime << std::endl;
+
+#else
+
+  vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet>
+      dispatcher(sineWorklet);
+  vtkm::Id NBlocks = N/(1024*1024*1024);
+  NBlocks *= 2;
+  dispatcher.SetNumberOfBlocks(NBlocks);
+  std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks" << std::endl;
+
+  vtkm::cont::Timer<> timer;
+
+  dispatcher.Invoke(input, output);
+  std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues()-1) << std::endl;
+
+  vtkm::Float64 elapsedTime = timer.GetElapsedTime();
+  std::cout << "Time: " << elapsedTime << std::endl;
+
+#endif
+
+  int dim = 128;
+  if (argc > 2) dim = atoi(argv[2]);
+  std::cout << "Testing Marching Cubes with size " << dim << "x" << dim << "x" << dim << std::endl;
+
+  vtkm::Id3 dims(dim, dim, dim);
+  vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32,3> > verticesArray, normalsArray;
+  vtkm::cont::ArrayHandle<vtkm::Float32> scalarsArray;
+  vtkm::cont::DataSet dataSet = MakeIsosurfaceTestDataSet(dims);
+
+  vtkm::filter::MarchingCubes filter;
+  filter.SetGenerateNormals(true);
+  filter.SetMergeDuplicatePoints( false );
+  filter.SetIsoValue( 0.5 );
+  vtkm::filter::ResultDataSet result =
+      filter.Execute( dataSet, dataSet.GetField("nodevar") );
+
+  filter.MapFieldOntoOutput(result, dataSet.GetField("nodevar"));
+
+  //need to extract vertices, normals, and scalars
+  vtkm::cont::DataSet& outputData = result.GetDataSet();
+
+  typedef vtkm::cont::ArrayHandle< vtkm::Vec<vtkm::Float32,3> > VertType;
+  vtkm::cont::CoordinateSystem coords = outputData.GetCoordinateSystem();
+
+  verticesArray = coords.GetData().Cast<VertType>();
+  normalsArray = outputData.GetField("normals").GetData().Cast<VertType>();
+  scalarsArray = outputData.GetField("nodevar").GetData().Cast< vtkm::cont::ArrayHandle<vtkm::Float32> >();
+
+  std::cout << "Number of output vertices: " << verticesArray.GetNumberOfValues() << std::endl;
+
+  std::cout << "vertices: ";
+  vtkm::cont::printSummary_ArrayHandle(verticesArray, std::cout);
+  std::cout << std::endl;
+  std::cout << "normals: ";
+  vtkm::cont::printSummary_ArrayHandle(normalsArray, std::cout);
+  std::cout << std::endl;
+  std::cout << "scalars: ";
+  vtkm::cont::printSummary_ArrayHandle(scalarsArray, std::cout);
+  std::cout << std::endl;
+
+  return 0;
+}
+
diff --git a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
index ae8c86ce5..526a62431 100644
--- a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
+++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
@@ -77,8 +77,13 @@ public:
   typedef vtkm::exec::cuda::internal::ConstArrayPortalFromThrust< T > PortalConstType;
 
   VTKM_CONT
+#ifdef VTKM_USE_UNIFIED_MEMORY
+  ArrayManagerExecutionThrustDevice(StorageType *storage)
+    : Storage(storage), Pointer(0), Length(0) 
+#else
   ArrayManagerExecutionThrustDevice(StorageType *storage)
     : Storage(storage), Array()
+#endif
   {
 
   }
@@ -93,7 +98,11 @@ public:
   ///
   VTKM_CONT
   vtkm::Id GetNumberOfValues() const {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    return this->Length;
+#else
     return static_cast<vtkm::Id>(this->Array.size());
+#endif
   }
 
   /// Allocates the appropriate size of the array and copies the given data
@@ -111,9 +120,14 @@ public:
       // The data in this->Array should already be valid.
     }
 
-
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
+    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
+    return PortalType(first, last);
+#else
     return PortalConstType(this->Array.data(),
                            this->Array.data() + static_cast<difference_type>(this->Array.size()));
+#endif
   }
 
   /// Workaround for nvcc 7.5 compiler warning bug.
@@ -139,8 +153,14 @@ public:
       // The data in this->Array should already be valid.
     }
 
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
+    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
+    return PortalType(first, last);
+#else
     return PortalType(this->Array.data(),
                       this->Array.data() + static_cast<difference_type>(this->Array.size()));
+#endif
   }
 
   /// Workaround for nvcc 7.5 compiler warning bug.
@@ -160,20 +180,43 @@ public:
     {
       // Resize to 0 first so that you don't have to copy data when resizing
       // to a larger size.
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      if (this->Pointer) { cudaFree(this->Pointer); this->Pointer = 0;  this->Length = 0; }
+#else
       this->Array.clear();
+#endif
     }
 
     try
       {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      if (numberOfValues != this->GetNumberOfValues())
+      {
+        ValueType* temp;
+        cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
+        if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
+        if (numberOfValues <= this->Length) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
+        if (this->Pointer) { cudaFree(this->Pointer);  this->Pointer = 0;  this->Length = 0; }
+        this->Pointer = temp;
+  
+        this->Length = numberOfValues;
+      }
+#else
       this->Array.resize(static_cast<std::size_t>(numberOfValues));
+#endif
       }
     catch (std::bad_alloc error)
       {
       throw vtkm::cont::ErrorControlBadAllocation(error.what());
       }
-
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
+    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
+    return PortalType(first, last);
+#else
     return PortalType(this->Array.data(),
                       this->Array.data() + static_cast<difference_type>(this->Array.size()));
+#endif
   }
 
   /// Workaround for nvcc 7.5 compiler warning bug.
@@ -190,14 +233,23 @@ public:
   VTKM_CONT
   void RetrieveOutputData(StorageType *storage) const
   {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    storage->Allocate(this->Length);
+#else
     storage->Allocate(static_cast<vtkm::Id>(this->Array.size()));
+#endif
     try
-      {
+    {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      cudaDeviceSynchronize();
+      ::thrust::copy(this->Pointer, this->Pointer + this->Length, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
+#else
       ::thrust::copy(
-          this->Array.data(),
-          this->Array.data() + static_cast<difference_type>(this->Array.size()),
-          vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
-      }
+        this->Array.data(),
+        this->Array.data() + static_cast<difference_type>(this->Array.size()),
+        vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
+#endif
+    }
     catch (...)
     {
       vtkm::cont::cuda::internal::throwAsVTKmException();
@@ -211,10 +263,14 @@ public:
   template <class IteratorTypeControl>
   VTKM_CONT void CopyInto(IteratorTypeControl dest) const
   {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::copy(this->Pointer, this->Pointer + this->Length, dest);
+#else
     ::thrust::copy(
           this->Array.data(),
           this->Array.data() + static_cast<difference_type>(this->Array.size()),
           dest);
+#endif
   }
 
   /// Resizes the device vector.
@@ -223,9 +279,27 @@ public:
   {
     // The operation will succeed even if this assertion fails, but this
     // is still supposed to be a precondition to Shrink.
-    VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Array.size()));
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Length));
 
+    try
+    {
+      ValueType* temp;
+      cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
+      if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
+      if (this->Length > 0) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
+      if (this->Pointer) cudaFree(this->Pointer);
+      this->Pointer = temp;
+      this->Length = numberOfValues;
+    }
+    catch (std::bad_alloc error)
+    {
+      throw vtkm::cont::ErrorControlBadAllocation(error.what());
+    }
+#else
+    VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Array.size()));
     this->Array.resize(static_cast<std::size_t>(numberOfValues));
+#endif
   }
 
 
@@ -233,8 +307,12 @@ public:
   ///
   VTKM_CONT void ReleaseResources()
   {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    if (this->Pointer) { cudaFree(this->Pointer);  this->Pointer = 0;  this->Length = 0; }
+#else
     this->Array.clear();
     this->Array.shrink_to_fit();
+#endif
   }
 
 private:
@@ -246,17 +324,31 @@ private:
 
   StorageType *Storage;
 
+#ifdef VTKM_USE_UNIFIED_MEMORY
+  ValueType *Pointer;
+  vtkm::Id Length;
+#else
   ::thrust::system::cuda::vector<ValueType,
                                  UninitializedAllocator<ValueType> > Array;
+#endif
 
   VTKM_CONT
   void CopyToExecution()
   {
     try
     {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      cudaError_t r = cudaMallocManaged(&(this->Pointer), (this->Storage->GetNumberOfValues())*sizeof(ValueType));
+      if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
+      ::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
+                     vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
+                     this->Pointer);  
+      this->Length = this->Storage->GetNumberOfValues();
+#else
       this->Array.assign(
             vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
             vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()));
+#endif
     }
     catch (...)
     {

From dfae8f5e3e00519169c4c02ab9e29546562777c8 Mon Sep 17 00:00:00 2001
From: Christopher Sewell <csewell@lanl.gov>
Date: Wed, 18 Jan 2017 11:47:52 -0700
Subject: [PATCH 2/3] Fixing merge issue with contour_tree

---
 examples/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index dd49bb824..95edf8f5c 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,6 +24,7 @@
 set(CMAKE_PREFIX_PATH ${VTKm_BINARY_DIR}/${VTKm_INSTALL_CONFIG_DIR})
 
 add_subdirectory(clipping)
+add_subdirectory(contour_tree)
 add_subdirectory(demo)
 add_subdirectory(dynamic_dispatcher)
 add_subdirectory(hello_world)

From 835073dae2cb8a5fe898e57ac2328f1de9927a4a Mon Sep 17 00:00:00 2001
From: Li-Ta Lo - 194699 <ollie@darwin-fe1.lanl.gov>
Date: Mon, 13 Feb 2017 11:45:17 -0700
Subject: [PATCH 3/3] clean up with custom allocator

---
 .../ArrayManagerExecutionThrustDevice.h       | 116 ++++--------------
 1 file changed, 21 insertions(+), 95 deletions(-)

diff --git a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
index 526a62431..3dea48334 100644
--- a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
+++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
@@ -56,6 +56,21 @@ template<typename T>
   {
     // no-op
   }
+
+#ifdef VTKM_USE_UNIFIED_MEMORY
+  thrust::device_ptr<T> allocate(std::size_t num)
+  {
+    T* temp;
+    cudaError_t r = cudaMallocManaged(&(temp), num*sizeof(T));
+    if (r == cudaErrorMemoryAllocation)
+      throw std::bad_alloc();
+    return thrust::device_ptr<T>(temp);
+  }
+  void deallocate(thrust::device_ptr<T> p, std::size_t vtkmNotUsed(num))
+  {
+    cudaFree(p.get());
+  }
+#endif
 };
 
 /// \c ArrayManagerExecutionThrustDevice provides an implementation for a \c
@@ -77,13 +92,9 @@ public:
   typedef vtkm::exec::cuda::internal::ConstArrayPortalFromThrust< T > PortalConstType;
 
   VTKM_CONT
-#ifdef VTKM_USE_UNIFIED_MEMORY
-  ArrayManagerExecutionThrustDevice(StorageType *storage)
-    : Storage(storage), Pointer(0), Length(0) 
-#else
   ArrayManagerExecutionThrustDevice(StorageType *storage)
     : Storage(storage), Array()
-#endif
+
   {
 
   }
@@ -98,11 +109,7 @@ public:
   ///
   VTKM_CONT
   vtkm::Id GetNumberOfValues() const {
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    return this->Length;
-#else
     return static_cast<vtkm::Id>(this->Array.size());
-#endif
   }
 
   /// Allocates the appropriate size of the array and copies the given data
@@ -119,15 +126,8 @@ public:
     {
       // The data in this->Array should already be valid.
     }
-
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
-    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
-    return PortalType(first, last);
-#else
     return PortalConstType(this->Array.data(),
                            this->Array.data() + static_cast<difference_type>(this->Array.size()));
-#endif
   }
 
   /// Workaround for nvcc 7.5 compiler warning bug.
@@ -152,15 +152,8 @@ public:
     {
       // The data in this->Array should already be valid.
     }
-
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
-    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
-    return PortalType(first, last);
-#else
     return PortalType(this->Array.data(),
                       this->Array.data() + static_cast<difference_type>(this->Array.size()));
-#endif
   }
 
   /// Workaround for nvcc 7.5 compiler warning bug.
@@ -180,43 +173,19 @@ public:
     {
       // Resize to 0 first so that you don't have to copy data when resizing
       // to a larger size.
-#ifdef VTKM_USE_UNIFIED_MEMORY
-      if (this->Pointer) { cudaFree(this->Pointer); this->Pointer = 0;  this->Length = 0; }
-#else
       this->Array.clear();
-#endif
     }
 
     try
       {
-#ifdef VTKM_USE_UNIFIED_MEMORY
-      if (numberOfValues != this->GetNumberOfValues())
-      {
-        ValueType* temp;
-        cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
-        if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
-        if (numberOfValues <= this->Length) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
-        if (this->Pointer) { cudaFree(this->Pointer);  this->Pointer = 0;  this->Length = 0; }
-        this->Pointer = temp;
-  
-        this->Length = numberOfValues;
-      }
-#else
       this->Array.resize(static_cast<std::size_t>(numberOfValues));
-#endif
       }
     catch (std::bad_alloc error)
       {
       throw vtkm::cont::ErrorControlBadAllocation(error.what());
       }
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
-    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
-    return PortalType(first, last);
-#else
     return PortalType(this->Array.data(),
                       this->Array.data() + static_cast<difference_type>(this->Array.size()));
-#endif
   }
 
   /// Workaround for nvcc 7.5 compiler warning bug.
@@ -233,22 +202,17 @@ public:
   VTKM_CONT
   void RetrieveOutputData(StorageType *storage) const
   {
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    storage->Allocate(this->Length);
-#else
     storage->Allocate(static_cast<vtkm::Id>(this->Array.size()));
-#endif
     try
     {
 #ifdef VTKM_USE_UNIFIED_MEMORY
       cudaDeviceSynchronize();
-      ::thrust::copy(this->Pointer, this->Pointer + this->Length, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
-#else
+#endif
       ::thrust::copy(
         this->Array.data(),
         this->Array.data() + static_cast<difference_type>(this->Array.size()),
         vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
-#endif
+
     }
     catch (...)
     {
@@ -263,14 +227,10 @@ public:
   template <class IteratorTypeControl>
   VTKM_CONT void CopyInto(IteratorTypeControl dest) const
   {
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    ::thrust::copy(this->Pointer, this->Pointer + this->Length, dest);
-#else
     ::thrust::copy(
           this->Array.data(),
           this->Array.data() + static_cast<difference_type>(this->Array.size()),
           dest);
-#endif
   }
 
   /// Resizes the device vector.
@@ -279,27 +239,8 @@ public:
   {
     // The operation will succeed even if this assertion fails, but this
     // is still supposed to be a precondition to Shrink.
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Length));
-
-    try
-    {
-      ValueType* temp;
-      cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
-      if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
-      if (this->Length > 0) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
-      if (this->Pointer) cudaFree(this->Pointer);
-      this->Pointer = temp;
-      this->Length = numberOfValues;
-    }
-    catch (std::bad_alloc error)
-    {
-      throw vtkm::cont::ErrorControlBadAllocation(error.what());
-    }
-#else
     VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Array.size()));
     this->Array.resize(static_cast<std::size_t>(numberOfValues));
-#endif
   }
 
 
@@ -307,12 +248,8 @@ public:
   ///
   VTKM_CONT void ReleaseResources()
   {
-#ifdef VTKM_USE_UNIFIED_MEMORY
-    if (this->Pointer) { cudaFree(this->Pointer);  this->Pointer = 0;  this->Length = 0; }
-#else
     this->Array.clear();
     this->Array.shrink_to_fit();
-#endif
   }
 
 private:
@@ -324,31 +261,20 @@ private:
 
   StorageType *Storage;
 
-#ifdef VTKM_USE_UNIFIED_MEMORY
-  ValueType *Pointer;
-  vtkm::Id Length;
-#else
   ::thrust::system::cuda::vector<ValueType,
                                  UninitializedAllocator<ValueType> > Array;
-#endif
+
 
   VTKM_CONT
   void CopyToExecution()
   {
     try
     {
-#ifdef VTKM_USE_UNIFIED_MEMORY
-      cudaError_t r = cudaMallocManaged(&(this->Pointer), (this->Storage->GetNumberOfValues())*sizeof(ValueType));
-      if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
-      ::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
-                     vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
-                     this->Pointer);  
-      this->Length = this->Storage->GetNumberOfValues();
-#else
+
       this->Array.assign(
             vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
             vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()));
-#endif
+
     }
     catch (...)
     {