First support for unified memory

2024-09-08 21:33:55 +00:00 · 2017-01-18 11:43:49 -07:00 · 2017-01-18 11:43:49 -07:00 · 82c40a6374
commit 82c40a6374
parent 503bebda54
5 changed files with 372 additions and 9 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -104,6 +104,18 @@ option(VTKm_USE_DOUBLE_PRECISION
  )
 option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON)

+if (VTKm_ENABLE_CUDA)
+  option(VTKm_USE_UNIFIED_MEMORY
+    "Use CUDA unified memory"
+    OFF
+    )
+endif (VTKm_ENABLE_CUDA)
+
+if (VTKm_USE_UNIFIED_MEMORY)
+  set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}")
+endif()
+
+
 option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON)
 set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -24,7 +24,6 @@
 set(CMAKE_PREFIX_PATH ${VTKm_BINARY_DIR}/${VTKm_INSTALL_CONFIG_DIR})

 add_subdirectory(clipping)
-add_subdirectory(contour_tree)
 add_subdirectory(demo)
 add_subdirectory(dynamic_dispatcher)
 add_subdirectory(hello_world)
@ -35,4 +34,5 @@ add_subdirectory(tetrahedra)
 if(VTKm_ENABLE_RENDERING)
  add_subdirectory(rendering)
 endif()
+add_subdirectory(unified_memory)

--- a/examples/unified_memory/CMakeLists.txt
+++ b/examples/unified_memory/CMakeLists.txt
@ -0,0 +1,36 @@
+##=============================================================================
+##
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##
+##  Copyright 2015 Sandia Corporation.
+##  Copyright 2015 UT-Battelle, LLC.
+##  Copyright 2015 Los Alamos National Security.
+##
+##  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+##  the U.S. Government retains certain rights in this software.
+##  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+##  Laboratory (LANL), the U.S. Government retains certain rights in
+##  this software.
+##
+##=============================================================================
+
+#Find the VTK-m package
+find_package(VTKm REQUIRED QUIET
+  OPTIONAL_COMPONENTS Serial CUDA TBB OpenGL GLUT
+  )
+
+if(VTKm_CUDA_FOUND)
+  vtkm_disable_troublesome_thrust_warnings()
+  # Cuda compiles do not respect target_include_directories
+  cuda_include_directories(${VTKm_INCLUDE_DIRS})
+  cuda_add_executable(UnifiedMemory_CUDA UnifiedMemory.cu)
+  target_link_libraries(UnifiedMemory_CUDA ${VTKm_LIBRARIES})
+  target_compile_options(UnifiedMemory_CUDA PRIVATE ${VTKm_COMPILE_OPTIONS})
+endif()
+
--- a/examples/unified_memory/UnifiedMemory.cu
+++ b/examples/unified_memory/UnifiedMemory.cu
@ -0,0 +1,223 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2014 Sandia Corporation.
+//  Copyright 2014 UT-Battelle, LLC.
+//  Copyright 2014 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_CUDA
+
+#include <vtkm/cont/ArrayHandleStreaming.h>
+#include <vtkm/worklet/DispatcherStreamingMapField.h>
+#include <vtkm/filter/MarchingCubes.h>
+#include <vtkm/worklet/DispatcherMapField.h>
+
+#include <vtkm/Math.h>
+#include <vtkm/cont/ArrayHandleCounting.h>
+#include <vtkm/cont/CellSetExplicit.h>
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/Timer.h>
+
+
+namespace {
+
+// Define the tangle field for the input data
+class TangleField : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<IdType> vertexId, FieldOut<Scalar> v);
+  typedef void ExecutionSignature(_1, _2);
+  typedef _1 InputDomain;
+
+  const vtkm::Id xdim, ydim, zdim;
+  const vtkm::Float32 xmin, ymin, zmin, xmax, ymax, zmax;
+  const vtkm::Id cellsPerLayer;
+
+  VTKM_CONT
+  TangleField(const vtkm::Id3 dims, const vtkm::Float32 mins[3], const vtkm::Float32 maxs[3]) : xdim(dims[0]), ydim(dims[1]), zdim(dims[2]),
+              xmin(mins[0]), ymin(mins[1]), zmin(mins[2]), xmax(maxs[0]), ymax(maxs[1]), zmax(maxs[2]), cellsPerLayer((xdim) * (ydim)) { };
+
+  VTKM_EXEC
+  void operator()(const vtkm::Id &vertexId, vtkm::Float32 &v) const
+  {
+    const vtkm::Id x = vertexId % (xdim);
+    const vtkm::Id y = (vertexId / (xdim)) % (ydim);
+    const vtkm::Id z = vertexId / cellsPerLayer;
+
+    const vtkm::Float32 fx = static_cast<vtkm::Float32>(x) / static_cast<vtkm::Float32>(xdim-1);
+    const vtkm::Float32 fy = static_cast<vtkm::Float32>(y) / static_cast<vtkm::Float32>(xdim-1);
+    const vtkm::Float32 fz = static_cast<vtkm::Float32>(z) / static_cast<vtkm::Float32>(xdim-1);
+
+    const vtkm::Float32 xx = 3.0f*(xmin+(xmax-xmin)*(fx));
+    const vtkm::Float32 yy = 3.0f*(ymin+(ymax-ymin)*(fy));
+    const vtkm::Float32 zz = 3.0f*(zmin+(zmax-zmin)*(fz));
+
+    v = (xx*xx*xx*xx - 5.0f*xx*xx + yy*yy*yy*yy - 5.0f*yy*yy + zz*zz*zz*zz - 5.0f*zz*zz + 11.8f) * 0.2f + 0.5f;
+  }
+};
+
+
+// Construct an input data set using the tangle field worklet
+vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
+{
+  vtkm::cont::DataSet dataSet;
+
+  const vtkm::Id3 vdims(dims[0] + 1, dims[1] + 1, dims[2] + 1);
+
+  vtkm::Float32 mins[3] = {-1.0f, -1.0f, -1.0f};
+  vtkm::Float32 maxs[3] = {1.0f, 1.0f, 1.0f};
+
+  vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
+  vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(0, 1, vdims[0]*vdims[1]*vdims[2]);
+  vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(TangleField(vdims, mins, maxs));
+  tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
+
+  vtkm::Vec<vtkm::FloatDefault,3> origin(0.0f, 0.0f, 0.0f);
+  vtkm::Vec<vtkm::FloatDefault,3> spacing(
+        1.0f/static_cast<vtkm::FloatDefault>(dims[0]),
+        1.0f/static_cast<vtkm::FloatDefault>(dims[2]),
+        1.0f/static_cast<vtkm::FloatDefault>(dims[1]));
+
+  vtkm::cont::ArrayHandleUniformPointCoordinates
+      coordinates(vdims, origin, spacing);
+  dataSet.AddCoordinateSystem(
+          vtkm::cont::CoordinateSystem("coordinates", coordinates));
+
+  dataSet.AddField(vtkm::cont::Field("nodevar", vtkm::cont::Field::ASSOC_POINTS, fieldArray));
+
+  static const vtkm::IdComponent ndim = 3;
+  vtkm::cont::CellSetStructured<ndim> cellSet("cells");
+  cellSet.SetPointDimensions(vdims);
+  dataSet.AddCellSet(cellSet);
+
+  return dataSet;
+}
+
+}
+
+
+namespace vtkm
+{
+namespace worklet
+{
+class SineWorklet : public vtkm::worklet::WorkletMapField
+{
+public:
+  typedef void ControlSignature(FieldIn<>, FieldOut<>);
+  typedef _2 ExecutionSignature(_1, WorkIndex);
+
+  VTKM_EXEC
+  vtkm::Float32 operator()(vtkm::Int64 x, vtkm::Id& index) const {
+    return (vtkm::Sin(1.0*x)); 
+  }
+};
+}
+}
+
+
+// Run a simple worklet, and compute an isosurface
+int main(int argc, char* argv[])
+{
+  vtkm::Int64 N = 1024*1024*1024;
+  if (argc > 1) N = N*atoi(argv[1]);
+  else N = N*4;
+  std::cout << "Testing streaming worklet with size " << N << std::endl;
+
+  vtkm::cont::ArrayHandle<vtkm::Int64> input;
+  vtkm::cont::ArrayHandle<vtkm::Float32> output;
+  std::vector<vtkm::Int64> data(N); 
+  for (vtkm::Int64 i=0; i<N; i++) data[i] = i; 
+  input = vtkm::cont::make_ArrayHandle(data);
+
+  typedef vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> DeviceAlgorithms;
+  vtkm::worklet::SineWorklet sineWorklet;
+
+#ifdef VTKM_USE_UNIFIED_MEMORY
+  std::cout << "Testing with unified memory" << std::endl;
+
+  vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet>
+      dispatcher(sineWorklet);
+
+  vtkm::cont::Timer<> timer;
+
+  dispatcher.Invoke(input, output);
+  std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues()-1) << std::endl;
+
+  vtkm::Float64 elapsedTime = timer.GetElapsedTime();
+  std::cout << "Time: " << elapsedTime << std::endl;
+
+#else
+
+  vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet>
+      dispatcher(sineWorklet);
+  vtkm::Id NBlocks = N/(1024*1024*1024);
+  NBlocks *= 2;
+  dispatcher.SetNumberOfBlocks(NBlocks);
+  std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks" << std::endl;
+
+  vtkm::cont::Timer<> timer;
+
+  dispatcher.Invoke(input, output);
+  std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues()-1) << std::endl;
+
+  vtkm::Float64 elapsedTime = timer.GetElapsedTime();
+  std::cout << "Time: " << elapsedTime << std::endl;
+
+#endif
+
+  int dim = 128;
+  if (argc > 2) dim = atoi(argv[2]);
+  std::cout << "Testing Marching Cubes with size " << dim << "x" << dim << "x" << dim << std::endl;
+
+  vtkm::Id3 dims(dim, dim, dim);
+  vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32,3> > verticesArray, normalsArray;
+  vtkm::cont::ArrayHandle<vtkm::Float32> scalarsArray;
+  vtkm::cont::DataSet dataSet = MakeIsosurfaceTestDataSet(dims);
+
+  vtkm::filter::MarchingCubes filter;
+  filter.SetGenerateNormals(true);
+  filter.SetMergeDuplicatePoints( false );
+  filter.SetIsoValue( 0.5 );
+  vtkm::filter::ResultDataSet result =
+      filter.Execute( dataSet, dataSet.GetField("nodevar") );
+
+  filter.MapFieldOntoOutput(result, dataSet.GetField("nodevar"));
+
+  //need to extract vertices, normals, and scalars
+  vtkm::cont::DataSet& outputData = result.GetDataSet();
+
+  typedef vtkm::cont::ArrayHandle< vtkm::Vec<vtkm::Float32,3> > VertType;
+  vtkm::cont::CoordinateSystem coords = outputData.GetCoordinateSystem();
+
+  verticesArray = coords.GetData().Cast<VertType>();
+  normalsArray = outputData.GetField("normals").GetData().Cast<VertType>();
+  scalarsArray = outputData.GetField("nodevar").GetData().Cast< vtkm::cont::ArrayHandle<vtkm::Float32> >();
+
+  std::cout << "Number of output vertices: " << verticesArray.GetNumberOfValues() << std::endl;
+
+  std::cout << "vertices: ";
+  vtkm::cont::printSummary_ArrayHandle(verticesArray, std::cout);
+  std::cout << std::endl;
+  std::cout << "normals: ";
+  vtkm::cont::printSummary_ArrayHandle(normalsArray, std::cout);
+  std::cout << std::endl;
+  std::cout << "scalars: ";
+  vtkm::cont::printSummary_ArrayHandle(scalarsArray, std::cout);
+  std::cout << std::endl;
+
+  return 0;
+}
+
--- a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
+++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
@ -77,8 +77,13 @@ public:
  typedef vtkm::exec::cuda::internal::ConstArrayPortalFromThrust< T > PortalConstType;

  VTKM_CONT
+#ifdef VTKM_USE_UNIFIED_MEMORY
+  ArrayManagerExecutionThrustDevice(StorageType *storage)
+    : Storage(storage), Pointer(0), Length(0) 
+#else
  ArrayManagerExecutionThrustDevice(StorageType *storage)
    : Storage(storage), Array()
+#endif
  {

  }
@ -93,7 +98,11 @@ public:
  ///
  VTKM_CONT
  vtkm::Id GetNumberOfValues() const {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    return this->Length;
+#else
    return static_cast<vtkm::Id>(this->Array.size());
+#endif
  }

  /// Allocates the appropriate size of the array and copies the given data
@ -111,9 +120,14 @@ public:
      // The data in this->Array should already be valid.
    }

-
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
+    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
+    return PortalType(first, last);
+#else
    return PortalConstType(this->Array.data(),
                           this->Array.data() + static_cast<difference_type>(this->Array.size()));
+#endif
  }

  /// Workaround for nvcc 7.5 compiler warning bug.
@ -139,8 +153,14 @@ public:
      // The data in this->Array should already be valid.
    }

+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
+    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
+    return PortalType(first, last);
+#else
    return PortalType(this->Array.data(),
                      this->Array.data() + static_cast<difference_type>(this->Array.size()));
+#endif
  }

  /// Workaround for nvcc 7.5 compiler warning bug.
@ -160,20 +180,43 @@ public:
    {
      // Resize to 0 first so that you don't have to copy data when resizing
      // to a larger size.
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      if (this->Pointer) { cudaFree(this->Pointer); this->Pointer = 0;  this->Length = 0; }
+#else
      this->Array.clear();
+#endif
    }

    try
      {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      if (numberOfValues != this->GetNumberOfValues())
+      {
+        ValueType* temp;
+        cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
+        if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
+        if (numberOfValues <= this->Length) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
+        if (this->Pointer) { cudaFree(this->Pointer);  this->Pointer = 0;  this->Length = 0; }
+        this->Pointer = temp;
+  
+        this->Length = numberOfValues;
+      }
+#else
      this->Array.resize(static_cast<std::size_t>(numberOfValues));
+#endif
      }
    catch (std::bad_alloc error)
      {
      throw vtkm::cont::ErrorControlBadAllocation(error.what());
      }
-
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::cuda::pointer<ValueType> first(this->Pointer);
+    ::thrust::cuda::pointer<ValueType> last(this->Pointer + this->Length);
+    return PortalType(first, last);
+#else
    return PortalType(this->Array.data(),
                      this->Array.data() + static_cast<difference_type>(this->Array.size()));
+#endif
  }

  /// Workaround for nvcc 7.5 compiler warning bug.
@ -190,14 +233,23 @@ public:
  VTKM_CONT
  void RetrieveOutputData(StorageType *storage) const
  {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    storage->Allocate(this->Length);
+#else
    storage->Allocate(static_cast<vtkm::Id>(this->Array.size()));
+#endif
    try
-      {
+    {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      cudaDeviceSynchronize();
+      ::thrust::copy(this->Pointer, this->Pointer + this->Length, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
+#else
      ::thrust::copy(
-          this->Array.data(),
-          this->Array.data() + static_cast<difference_type>(this->Array.size()),
-          vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
-      }
+        this->Array.data(),
+        this->Array.data() + static_cast<difference_type>(this->Array.size()),
+        vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
+#endif
+    }
    catch (...)
    {
      vtkm::cont::cuda::internal::throwAsVTKmException();
@ -211,10 +263,14 @@ public:
  template <class IteratorTypeControl>
  VTKM_CONT void CopyInto(IteratorTypeControl dest) const
  {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    ::thrust::copy(this->Pointer, this->Pointer + this->Length, dest);
+#else
    ::thrust::copy(
          this->Array.data(),
          this->Array.data() + static_cast<difference_type>(this->Array.size()),
          dest);
+#endif
  }

  /// Resizes the device vector.
@ -223,9 +279,27 @@ public:
  {
    // The operation will succeed even if this assertion fails, but this
    // is still supposed to be a precondition to Shrink.
-    VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Array.size()));
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Length));

+    try
+    {
+      ValueType* temp;
+      cudaError_t r = cudaMallocManaged(&(temp), numberOfValues*sizeof(ValueType));
+      if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
+      if (this->Length > 0) ::thrust::copy(this->Pointer, this->Pointer + numberOfValues, temp);
+      if (this->Pointer) cudaFree(this->Pointer);
+      this->Pointer = temp;
+      this->Length = numberOfValues;
+    }
+    catch (std::bad_alloc error)
+    {
+      throw vtkm::cont::ErrorControlBadAllocation(error.what());
+    }
+#else
+    VTKM_ASSERT(numberOfValues <= static_cast<vtkm::Id>(this->Array.size()));
    this->Array.resize(static_cast<std::size_t>(numberOfValues));
+#endif
  }


@ -233,8 +307,12 @@ public:
  ///
  VTKM_CONT void ReleaseResources()
  {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+    if (this->Pointer) { cudaFree(this->Pointer);  this->Pointer = 0;  this->Length = 0; }
+#else
    this->Array.clear();
    this->Array.shrink_to_fit();
+#endif
  }

 private:
@ -246,17 +324,31 @@ private:

  StorageType *Storage;

+#ifdef VTKM_USE_UNIFIED_MEMORY
+  ValueType *Pointer;
+  vtkm::Id Length;
+#else
  ::thrust::system::cuda::vector<ValueType,
                                 UninitializedAllocator<ValueType> > Array;
+#endif

  VTKM_CONT
  void CopyToExecution()
  {
    try
    {
+#ifdef VTKM_USE_UNIFIED_MEMORY
+      cudaError_t r = cudaMallocManaged(&(this->Pointer), (this->Storage->GetNumberOfValues())*sizeof(ValueType));
+      if (r == cudaErrorMemoryAllocation) throw std::bad_alloc();
+      ::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
+                     vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
+                     this->Pointer);  
+      this->Length = this->Storage->GetNumberOfValues();
+#else
      this->Array.assign(
            vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
            vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()));
+#endif
    }
    catch (...)
    {