Merge topic 'simplify_unified_memory_example'

8ca24baec Update the UnifiedMemory example to properly disable managed memory
718caaaea CudaAllocator allows managed memory to be explicitly disabled

Acked-by: Kitware Robot <kwrobot@kitware.com>
Acked-by: Allison Vacanti <allison.vacanti@kitware.com>
Merge-request: !1492
Robert Maynard 2018-12-28 18:40:55 +00:00 committed by Kitware Robot
commit c4bf46ecf0
4 changed files with 119 additions and 161 deletions

@@ -0,0 +1,6 @@
# CudaAllocator Managed Memory can be disabled from C++
Previously it was impossible for calling code to explicitly
disable managed memory. Being able to do so is desirable for
projects that know they don't need managed memory and are
highly performance critical.
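
A minimal sketch of the new call (the include path matches the one used by the updated example below):

```cpp
#include <vtkm/cont/cuda/internal/CudaAllocator.h>

int main()
{
  // Explicitly opt out of managed memory before the first CUDA allocation.
  // On hardware without managed memory support this logs a warning and is a no-op.
  vtkm::cont::cuda::internal::CudaAllocator::ForceManagedMemoryOff();
  // ... the rest of the program allocates with cudaMalloc ...
  return 0;
}
```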

@@ -18,107 +18,18 @@
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_CUDA
#include <vtkm/cont/ArrayHandleStreaming.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/Logging.h> //for GetHumanReadableSize
#include <vtkm/filter/MarchingCubes.h>
#include <vtkm/worklet/DispatcherMapField.h>
#include <vtkm/worklet/DispatcherStreamingMapField.h>
#include <vtkm/Math.h>
#include <vtkm/cont/ArrayHandleCounting.h>
#include <vtkm/cont/CellSetExplicit.h>
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
namespace
{
// Define the tangle field for the input data
class TangleField : public vtkm::worklet::WorkletMapField
{
public:
using ControlSignature = void(FieldIn<IdType> vertexId, FieldOut<Scalar> v);
using ExecutionSignature = void(_1, _2);
using InputDomain = _1;
const vtkm::Id xdim, ydim, zdim;
const vtkm::Float32 xmin, ymin, zmin, xmax, ymax, zmax;
const vtkm::Id cellsPerLayer;
VTKM_CONT
TangleField(const vtkm::Id3 dims, const vtkm::Float32 mins[3], const vtkm::Float32 maxs[3])
: xdim(dims[0])
, ydim(dims[1])
, zdim(dims[2])
, xmin(mins[0])
, ymin(mins[1])
, zmin(mins[2])
, xmax(maxs[0])
, ymax(maxs[1])
, zmax(maxs[2])
, cellsPerLayer((xdim) * (ydim)) {}
VTKM_EXEC
void operator()(const vtkm::Id& vertexId, vtkm::Float32& v) const
{
const vtkm::Id x = vertexId % (xdim);
const vtkm::Id y = (vertexId / (xdim)) % (ydim);
const vtkm::Id z = vertexId / cellsPerLayer;
const vtkm::Float32 fx = static_cast<vtkm::Float32>(x) / static_cast<vtkm::Float32>(xdim - 1);
const vtkm::Float32 fy = static_cast<vtkm::Float32>(y) / static_cast<vtkm::Float32>(ydim - 1);
const vtkm::Float32 fz = static_cast<vtkm::Float32>(z) / static_cast<vtkm::Float32>(zdim - 1);
const vtkm::Float32 xx = 3.0f * (xmin + (xmax - xmin) * (fx));
const vtkm::Float32 yy = 3.0f * (ymin + (ymax - ymin) * (fy));
const vtkm::Float32 zz = 3.0f * (zmin + (zmax - zmin) * (fz));
v = (xx * xx * xx * xx - 5.0f * xx * xx + yy * yy * yy * yy - 5.0f * yy * yy +
zz * zz * zz * zz - 5.0f * zz * zz + 11.8f) *
0.2f +
0.5f;
}
};
// Construct an input data set using the tangle field worklet
vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
{
vtkm::cont::DataSet dataSet;
const vtkm::Id3 vdims(dims[0] + 1, dims[1] + 1, dims[2] + 1);
vtkm::Float32 mins[3] = { -1.0f, -1.0f, -1.0f };
vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f };
vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(
0, 1, vdims[0] * vdims[1] * vdims[2]);
vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(
TangleField(vdims, mins, maxs));
tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
vtkm::Vec<vtkm::FloatDefault, 3> origin(0.0f, 0.0f, 0.0f);
vtkm::Vec<vtkm::FloatDefault, 3> spacing(1.0f / static_cast<vtkm::FloatDefault>(dims[0]),
1.0f / static_cast<vtkm::FloatDefault>(dims[1]),
1.0f / static_cast<vtkm::FloatDefault>(dims[2]));
vtkm::cont::ArrayHandleUniformPointCoordinates coordinates(vdims, origin, spacing);
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));
dataSet.AddField(
vtkm::cont::Field("nodevar", vtkm::cont::Field::Association::POINTS, fieldArray));
static const vtkm::IdComponent ndim = 3;
vtkm::cont::CellSetStructured<ndim> cellSet("cells");
cellSet.SetPointDimensions(vdims);
dataSet.AddCellSet(cellSet);
return dataSet;
}
}
namespace vtkm
{
namespace worklet
@@ -141,94 +52,82 @@ public:
// Run a simple worklet, and compute an isosurface
int main(int argc, char* argv[])
{
vtkm::Int64 N = 1024 * 1024 * 1024;
if (argc > 1)
N = N * atoi(argv[1]);
else
N = N * 4;
std::cout << "Testing streaming worklet with size " << N << std::endl;
vtkm::cont::Initialize(argc, argv);
vtkm::Int64 N = 4 * 512 * 512 * 512;
if (argc > 1)
{
N = atoi(argv[1]);
}
std::cout << "Testing streaming worklet on "
<< vtkm::cont::GetHumanReadableSize(N * sizeof(vtkm::Int64)) << std::endl;
vtkm::cont::ArrayHandle<vtkm::Int64> input;
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::vector<vtkm::Int64> data(N);
for (vtkm::Int64 i = 0; i < N; i++)
data[i] = i;
input = vtkm::cont::make_ArrayHandle(data);
using DeviceAlgorithms = vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG>;
using DeviceTag = vtkm::cont::DeviceAdapterTagCuda;
const bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
vtkm::worklet::SineWorklet sineWorklet;
bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
if (usingManagedMemory)
{
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::cout << "Testing with unified memory" << std::endl;
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.SetDevice(DeviceTag{});
vtkm::cont::Timer<> timer;
//run once to get the CUDA code warmed up
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::cont::Timer<DeviceTag> timer;
for (int i = 0; i < 3; ++i)
{
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
}
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
std::cout << "Time for 3 iterations with managed memory: " << elapsedTime << std::endl;
}
else
if (usingManagedMemory)
{ //disable managed memory if it is enabled to get
//the correct performance numbers on GPUs that support managed memory
vtkm::cont::cuda::internal::CudaAllocator::ForceManagedMemoryOff();
}
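// Split the input into streaming blocks of roughly 32 MiB (1 << 25 bytes) each,
// clamped to at least one block for small inputs.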
vtkm::Id NBlocks = (N * sizeof(vtkm::Int64)) / (1 << 25);
NBlocks = std::max(vtkm::Id(1), NBlocks);
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.SetNumberOfBlocks(NBlocks);
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
//run once to get the CUDA code warmed up
dispatcher.Invoke(input, output);
vtkm::cont::Timer<DeviceTag> timer;
for (int i = 0; i < 3; ++i)
{
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
vtkm::cont::Timer<> timer;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
}
int dim = 128;
if (argc > 2)
dim = atoi(argv[2]);
std::cout << "Testing Marching Cubes with size " << dim << "x" << dim << "x" << dim << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time for 3 iterations: " << elapsedTime << std::endl;
vtkm::Id3 dims(dim, dim, dim);
vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>> verticesArray, normalsArray;
vtkm::cont::ArrayHandle<vtkm::Float32> scalarsArray;
vtkm::cont::DataSet dataSet = MakeIsosurfaceTestDataSet(dims);
vtkm::filter::MarchingCubes filter;
filter.SetGenerateNormals(true);
filter.SetMergeDuplicatePoints(false);
filter.SetActiveField("nodevar");
filter.SetIsoValue(0.5);
auto outputData = filter.Execute(dataSet);
//need to extract vertices, normals, and scalars
using VertType = vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>>;
vtkm::cont::CoordinateSystem coords = outputData.GetCoordinateSystem();
verticesArray = coords.GetData().Cast<VertType>();
normalsArray = outputData.GetField("normals").GetData().Cast<VertType>();
scalarsArray =
outputData.GetField("nodevar").GetData().Cast<vtkm::cont::ArrayHandle<vtkm::Float32>>();
std::cout << "Number of output vertices: " << verticesArray.GetNumberOfValues() << std::endl;
std::cout << "vertices: ";
vtkm::cont::printSummary_ArrayHandle(verticesArray, std::cout);
std::cout << std::endl;
std::cout << "normals: ";
vtkm::cont::printSummary_ArrayHandle(normalsArray, std::cout);
std::cout << std::endl;
std::cout << "scalars: ";
vtkm::cont::printSummary_ArrayHandle(scalarsArray, std::cout);
std::cout << std::endl;
return 0;
}
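
As a worked example of the block sizing above: with the default N = 4 * 512 * 512 * 512 = 536,870,912 vtkm::Int64 values, the input occupies 4 GiB, so the streaming path runs with 4 GiB / 32 MiB = 128 blocks.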

@@ -40,10 +40,17 @@ namespace
static std::once_flag IsInitialized;
#endif
// True if concurrent pageable managed memory is not disabled by the user via a system
// environment variable and all devices support it.
// Holds how VTK-m currently allocates memory.
// When VTK-m is initialized we set this based on the hardware support (HardwareSupportsManagedMemory).
// The user can explicitly disable managed memory through an environment variable
// or by calling a function on the CudaAllocator.
// Likewise, managed memory can be re-enabled by calling a function on CudaAllocator,
// but only if the underlying hardware supports pageable managed memory.
static bool ManagedMemoryEnabled = false;
// True if concurrent pageable managed memory is supported by the machine's hardware.
static bool HardwareSupportsManagedMemory = false;
// Avoid the overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
// This value (1 MiB) must be > 0 or else these functions will error out.
static std::size_t Threshold = 1 << 20;
@@ -64,6 +71,35 @@ bool CudaAllocator::UsingManagedMemory()
return ManagedMemoryEnabled;
}
void CudaAllocator::ForceManagedMemoryOff()
{
if (HardwareSupportsManagedMemory)
{
ManagedMemoryEnabled = false;
VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator disabling managed memory");
}
else
{
VTKM_LOG_F(
vtkm::cont::LogLevel::Warn,
"CudaAllocator trying to disable managed memory on hardware that doesn't support it");
}
}
void CudaAllocator::ForceManagedMemoryOn()
{
if (HardwareSupportsManagedMemory)
{
ManagedMemoryEnabled = true;
VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator enabling managed memory");
}
else
{
VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
"CudaAllocator trying to enable managed memory on hardware that doesn't support it");
}
}
bool CudaAllocator::IsDevicePointer(const void* ptr)
{
CudaAllocator::Initialize();
@@ -273,6 +309,13 @@ void CudaAllocator::Initialize()
managedMemorySupported = managedMemorySupported && prop.concurrentManagedAccess;
}
HardwareSupportsManagedMemory = managedMemorySupported;
ManagedMemoryEnabled = managedMemorySupported;
VTKM_LOG_F(vtkm::cont::LogLevel::Info,
"CudaAllocator hardware %s managed memory",
HardwareSupportsManagedMemory ? "supports" : "doesn't support");
// Check if users want to disable managed memory
#pragma warning(push)
// getenv is not thread safe on windows, but it is safe here since it's inside a call_once block
@@ -283,9 +326,11 @@ void CudaAllocator::Initialize()
if (managedMemorySupported && buf != nullptr)
{ //only makes sense to disable managed memory if the hardware supports it
//in the first place
managedMemorySupported = (std::string(buf) != "1");
ManagedMemoryEnabled = false;
VTKM_LOG_F(
vtkm::cont::LogLevel::Info,
"CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
}
ManagedMemoryEnabled = managedMemorySupported;
});
#endif
}
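
For completeness, a sketch of the environment-variable path (NO_VTKM_MANAGED_MEMORY is the name given in the log message above; setenv is POSIX-only, so treat this as illustrative):

```cpp
#include <cstdlib>

int main()
{
  // Must be set before the first CUDA allocation: Initialize() reads the
  // environment exactly once inside a std::call_once block. The diff above
  // only checks that the variable is set, so the value "1" is conventional.
  setenv("NO_VTKM_MANAGED_MEMORY", "1", /*overwrite=*/1);

  // ... the rest of the program now allocates with cudaMalloc ...
  return 0;
}
```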

@@ -42,6 +42,14 @@ struct VTKM_CONT_EXPORT CudaAllocator
/// that can be accessed concurrently by the CPU and GPUs.
static VTKM_CONT bool UsingManagedMemory();
/// Force CUDA allocations to occur with unmanaged memory (aka cudaMalloc).
static VTKM_CONT void ForceManagedMemoryOff();
/// Force CUDA allocations to occur with pageable managed memory.
/// If the current hardware doesn't support pageable managed memory
/// VTK-m will ignore the request and continue to use unmanaged memory (aka cudaMalloc).
static VTKM_CONT void ForceManagedMemoryOn();
/// Returns true if the pointer is accessible from a CUDA device.
static VTKM_CONT bool IsDevicePointer(const void* ptr);
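
A short usage sketch of this API in a CUDA-enabled build (the ConfigureCudaMemory helper is illustrative, not part of VTK-m):

```cpp
#include <vtkm/cont/cuda/internal/CudaAllocator.h>

#include <iostream>

using vtkm::cont::cuda::internal::CudaAllocator;

// Hypothetical helper: request a memory mode and report what took effect.
// On hardware without pageable managed memory both Force* calls log a
// warning and VTK-m keeps allocating with cudaMalloc.
void ConfigureCudaMemory(bool wantManaged)
{
  if (wantManaged)
  {
    CudaAllocator::ForceManagedMemoryOn();
  }
  else
  {
    CudaAllocator::ForceManagedMemoryOff();
  }
  std::cout << "Managed memory in use: " << CudaAllocator::UsingManagedMemory() << "\n";
}
```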