mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-16 17:22:55 +00:00
Merge topic 'simplify_unified_memory_example'
8ca24baec Update the UnifiedMemory example to properly disable managed memory 718caaaea CudaAllocator allows managed memory to be explicitly disabled Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Allison Vacanti <allison.vacanti@kitware.com> Merge-request: !1492
This commit is contained in:
commit
c4bf46ecf0
@ -0,0 +1,6 @@
|
||||
# CudaAllocator Managed Memory can be disabled from C++
|
||||
|
||||
Previously it was impossible for calling code to explicitly
|
||||
disable managed memory. This can be desirable for projects
|
||||
that know they don't need managed memory and are super
|
||||
performance critical.
|
@ -18,107 +18,18 @@
|
||||
// this software.
|
||||
//============================================================================
|
||||
|
||||
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_CUDA
|
||||
|
||||
#include <vtkm/cont/ArrayHandleStreaming.h>
|
||||
#include <vtkm/cont/Initialize.h>
|
||||
#include <vtkm/cont/Logging.h> //for GetHumanReadableSize
|
||||
#include <vtkm/filter/MarchingCubes.h>
|
||||
#include <vtkm/worklet/DispatcherMapField.h>
|
||||
#include <vtkm/worklet/DispatcherStreamingMapField.h>
|
||||
|
||||
#include <vtkm/Math.h>
|
||||
#include <vtkm/cont/ArrayHandleCounting.h>
|
||||
#include <vtkm/cont/CellSetExplicit.h>
|
||||
#include <vtkm/cont/DataSet.h>
|
||||
#include <vtkm/cont/Timer.h>
|
||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Define the tangle field for the input data
|
||||
class TangleField : public vtkm::worklet::WorkletMapField
|
||||
{
|
||||
public:
|
||||
using ControlSignature = void(FieldIn<IdType> vertexId, FieldOut<Scalar> v);
|
||||
using ExecutionSignature = void(_1, _2);
|
||||
using InputDomain = _1;
|
||||
|
||||
const vtkm::Id xdim, ydim, zdim;
|
||||
const vtkm::Float32 xmin, ymin, zmin, xmax, ymax, zmax;
|
||||
const vtkm::Id cellsPerLayer;
|
||||
|
||||
VTKM_CONT
|
||||
TangleField(const vtkm::Id3 dims, const vtkm::Float32 mins[3], const vtkm::Float32 maxs[3])
|
||||
: xdim(dims[0])
|
||||
, ydim(dims[1])
|
||||
, zdim(dims[2])
|
||||
, xmin(mins[0])
|
||||
, ymin(mins[1])
|
||||
, zmin(mins[2])
|
||||
, xmax(maxs[0])
|
||||
, ymax(maxs[1])
|
||||
, zmax(maxs[2])
|
||||
, cellsPerLayer((xdim) * (ydim)){};
|
||||
|
||||
VTKM_EXEC
|
||||
void operator()(const vtkm::Id& vertexId, vtkm::Float32& v) const
|
||||
{
|
||||
const vtkm::Id x = vertexId % (xdim);
|
||||
const vtkm::Id y = (vertexId / (xdim)) % (ydim);
|
||||
const vtkm::Id z = vertexId / cellsPerLayer;
|
||||
|
||||
const vtkm::Float32 fx = static_cast<vtkm::Float32>(x) / static_cast<vtkm::Float32>(xdim - 1);
|
||||
const vtkm::Float32 fy = static_cast<vtkm::Float32>(y) / static_cast<vtkm::Float32>(xdim - 1);
|
||||
const vtkm::Float32 fz = static_cast<vtkm::Float32>(z) / static_cast<vtkm::Float32>(xdim - 1);
|
||||
|
||||
const vtkm::Float32 xx = 3.0f * (xmin + (xmax - xmin) * (fx));
|
||||
const vtkm::Float32 yy = 3.0f * (ymin + (ymax - ymin) * (fy));
|
||||
const vtkm::Float32 zz = 3.0f * (zmin + (zmax - zmin) * (fz));
|
||||
|
||||
v = (xx * xx * xx * xx - 5.0f * xx * xx + yy * yy * yy * yy - 5.0f * yy * yy +
|
||||
zz * zz * zz * zz - 5.0f * zz * zz + 11.8f) *
|
||||
0.2f +
|
||||
0.5f;
|
||||
}
|
||||
};
|
||||
|
||||
// Construct an input data set using the tangle field worklet
|
||||
// Builds a uniform structured data set with `dims` cells per axis
// (dims + 1 points per axis). It carries:
//   - a coordinate system "coordinates" with origin (0,0,0) and spacing
//     1/dims along each axis,
//   - a point field "nodevar" filled by the TangleField worklet,
//   - a 3D structured cell set named "cells".
vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
{
  vtkm::cont::DataSet dataSet;

  // Point dimensions are one larger than cell dimensions on every axis.
  const vtkm::Id3 vdims(dims[0] + 1, dims[1] + 1, dims[2] + 1);

  vtkm::Float32 mins[3] = { -1.0f, -1.0f, -1.0f };
  vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f };

  // Evaluate the tangle field once per vertex; the counting array supplies
  // the flat vertex ids 0 .. (number of points - 1).
  vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
  vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(
    0, 1, vdims[0] * vdims[1] * vdims[2]);
  vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(
    TangleField(vdims, mins, maxs));
  tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);

  vtkm::Vec<vtkm::FloatDefault, 3> origin(0.0f, 0.0f, 0.0f);
  // Spacing components are listed in x, y, z order. The previous code passed
  // dims[2] for y and dims[1] for z, which is only right for cubic grids.
  vtkm::Vec<vtkm::FloatDefault, 3> spacing(1.0f / static_cast<vtkm::FloatDefault>(dims[0]),
                                           1.0f / static_cast<vtkm::FloatDefault>(dims[1]),
                                           1.0f / static_cast<vtkm::FloatDefault>(dims[2]));

  vtkm::cont::ArrayHandleUniformPointCoordinates coordinates(vdims, origin, spacing);
  dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));

  dataSet.AddField(
    vtkm::cont::Field("nodevar", vtkm::cont::Field::Association::POINTS, fieldArray));

  static const vtkm::IdComponent ndim = 3;
  vtkm::cont::CellSetStructured<ndim> cellSet("cells");
  cellSet.SetPointDimensions(vdims);
  dataSet.AddCellSet(cellSet);

  return dataSet;
}
|
||||
}
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace worklet
|
||||
@ -141,94 +52,82 @@ public:
|
||||
// Run a simple worklet, and compute an isosurface
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
vtkm::Int64 N = 1024 * 1024 * 1024;
|
||||
if (argc > 1)
|
||||
N = N * atoi(argv[1]);
|
||||
else
|
||||
N = N * 4;
|
||||
std::cout << "Testing streaming worklet with size " << N << std::endl;
|
||||
vtkm::cont::Initialize(argc, argv);
|
||||
|
||||
|
||||
vtkm::Int64 N = 4 * 512 * 512 * 512;
|
||||
if (argc > 1)
|
||||
{
|
||||
N = atoi(argv[1]);
|
||||
}
|
||||
|
||||
std::cout << "Testing streaming worklet on "
|
||||
<< vtkm::cont::GetHumanReadableSize(N * sizeof(vtkm::Int64)) << std::endl;
|
||||
|
||||
vtkm::cont::ArrayHandle<vtkm::Int64> input;
|
||||
vtkm::cont::ArrayHandle<vtkm::Float32> output;
|
||||
std::vector<vtkm::Int64> data(N);
|
||||
for (vtkm::Int64 i = 0; i < N; i++)
|
||||
data[i] = i;
|
||||
input = vtkm::cont::make_ArrayHandle(data);
|
||||
|
||||
using DeviceAlgorithms = vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG>;
|
||||
using DeviceTag = vtkm::cont::DeviceAdapterTagCuda;
|
||||
const bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
|
||||
vtkm::worklet::SineWorklet sineWorklet;
|
||||
|
||||
bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
|
||||
|
||||
if (usingManagedMemory)
|
||||
{
|
||||
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
|
||||
vtkm::cont::ArrayHandle<vtkm::Float32> output;
|
||||
|
||||
std::cout << "Testing with unified memory" << std::endl;
|
||||
|
||||
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
|
||||
dispatcher.SetDevice(DeviceTag{});
|
||||
|
||||
vtkm::cont::Timer<> timer;
|
||||
|
||||
//run once to get the CUDA code warmed up
|
||||
dispatcher.Invoke(input, output);
|
||||
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
|
||||
|
||||
vtkm::cont::Timer<DeviceTag> timer;
|
||||
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
dispatcher.Invoke(input, output);
|
||||
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
|
||||
}
|
||||
|
||||
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
|
||||
std::cout << "Time: " << elapsedTime << std::endl;
|
||||
std::cout << "Time for 3 iterations with managed memory: " << elapsedTime << std::endl;
|
||||
}
|
||||
else
|
||||
|
||||
if (usingManagedMemory)
|
||||
{ //disable managed memory if it is enabled to get
|
||||
//the correct performance numbers on GPUs that support managed memory
|
||||
vtkm::cont::cuda::internal::CudaAllocator::ForceManagedMemoryOff();
|
||||
}
|
||||
|
||||
vtkm::Id NBlocks = (N * sizeof(vtkm::Int64)) / (1 << 25);
|
||||
NBlocks = std::max(vtkm::Id(1), NBlocks);
|
||||
|
||||
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
|
||||
dispatcher.SetNumberOfBlocks(NBlocks);
|
||||
|
||||
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
|
||||
vtkm::cont::ArrayHandle<vtkm::Float32> output;
|
||||
|
||||
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
|
||||
<< std::endl;
|
||||
|
||||
//run once to get the CUDA code warmed up
|
||||
dispatcher.Invoke(input, output);
|
||||
|
||||
vtkm::cont::Timer<DeviceTag> timer;
|
||||
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
|
||||
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
|
||||
NBlocks *= 2;
|
||||
dispatcher.SetNumberOfBlocks(NBlocks);
|
||||
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
|
||||
<< std::endl;
|
||||
|
||||
vtkm::cont::Timer<> timer;
|
||||
|
||||
dispatcher.Invoke(input, output);
|
||||
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
|
||||
|
||||
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
|
||||
std::cout << "Time: " << elapsedTime << std::endl;
|
||||
}
|
||||
|
||||
int dim = 128;
|
||||
if (argc > 2)
|
||||
dim = atoi(argv[2]);
|
||||
std::cout << "Testing Marching Cubes with size " << dim << "x" << dim << "x" << dim << std::endl;
|
||||
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
|
||||
std::cout << "Time for 3 iterations: " << elapsedTime << std::endl;
|
||||
|
||||
vtkm::Id3 dims(dim, dim, dim);
|
||||
vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>> verticesArray, normalsArray;
|
||||
vtkm::cont::ArrayHandle<vtkm::Float32> scalarsArray;
|
||||
vtkm::cont::DataSet dataSet = MakeIsosurfaceTestDataSet(dims);
|
||||
|
||||
vtkm::filter::MarchingCubes filter;
|
||||
filter.SetGenerateNormals(true);
|
||||
filter.SetMergeDuplicatePoints(false);
|
||||
filter.SetActiveField("nodevar");
|
||||
filter.SetIsoValue(0.5);
|
||||
auto outputData = filter.Execute(dataSet);
|
||||
|
||||
//need to extract vertices, normals, and scalars
|
||||
using VertType = vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>>;
|
||||
vtkm::cont::CoordinateSystem coords = outputData.GetCoordinateSystem();
|
||||
|
||||
verticesArray = coords.GetData().Cast<VertType>();
|
||||
normalsArray = outputData.GetField("normals").GetData().Cast<VertType>();
|
||||
scalarsArray =
|
||||
outputData.GetField("nodevar").GetData().Cast<vtkm::cont::ArrayHandle<vtkm::Float32>>();
|
||||
|
||||
std::cout << "Number of output vertices: " << verticesArray.GetNumberOfValues() << std::endl;
|
||||
|
||||
std::cout << "vertices: ";
|
||||
vtkm::cont::printSummary_ArrayHandle(verticesArray, std::cout);
|
||||
std::cout << std::endl;
|
||||
std::cout << "normals: ";
|
||||
vtkm::cont::printSummary_ArrayHandle(normalsArray, std::cout);
|
||||
std::cout << std::endl;
|
||||
std::cout << "scalars: ";
|
||||
vtkm::cont::printSummary_ArrayHandle(scalarsArray, std::cout);
|
||||
std::cout << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -40,10 +40,17 @@ namespace
|
||||
static std::once_flag IsInitialized;
|
||||
#endif
|
||||
|
||||
// True if concurrent pageable managed memory is not disabled by user via a system
|
||||
// environment variable and all devices support it.
|
||||
// Holds how VTK-m currently allocates memory.
|
||||
// When VTK-m is initialized we set this based on the hardware support ( HardwareSupportsManagedMemory ).
|
||||
// The user can explicitly disable managed memory through an environment variable
|
||||
// or by calling a function on the CudaAllocator.
|
||||
// Likewise managed memory can be re-enabled by calling a function on CudaAllocator
|
||||
// if and only if the underlying hardware supports pageable managed memory
|
||||
static bool ManagedMemoryEnabled = false;
|
||||
|
||||
// True if concurrent pageable managed memory is supported by the machine's hardware.
|
||||
static bool HardwareSupportsManagedMemory = false;
|
||||
|
||||
// Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
|
||||
// This value should be > 0 or else these functions will error out.
|
||||
static std::size_t Threshold = 1 << 20;
|
||||
@ -64,6 +71,35 @@ bool CudaAllocator::UsingManagedMemory()
|
||||
return ManagedMemoryEnabled;
|
||||
}
|
||||
|
||||
void CudaAllocator::ForceManagedMemoryOff()
|
||||
{
|
||||
if (HardwareSupportsManagedMemory)
|
||||
{
|
||||
ManagedMemoryEnabled = false;
|
||||
VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator disabling managed memory");
|
||||
}
|
||||
else
|
||||
{
|
||||
VTKM_LOG_F(
|
||||
vtkm::cont::LogLevel::Warn,
|
||||
"CudaAllocator trying to disable managed memory on hardware that doesn't support it");
|
||||
}
|
||||
}
|
||||
|
||||
void CudaAllocator::ForceManagedMemoryOn()
|
||||
{
|
||||
if (HardwareSupportsManagedMemory)
|
||||
{
|
||||
ManagedMemoryEnabled = true;
|
||||
VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator enabling managed memory");
|
||||
}
|
||||
else
|
||||
{
|
||||
VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
|
||||
"CudaAllocator trying to enable managed memory on hardware that doesn't support it");
|
||||
}
|
||||
}
|
||||
|
||||
bool CudaAllocator::IsDevicePointer(const void* ptr)
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
@ -273,6 +309,13 @@ void CudaAllocator::Initialize()
|
||||
managedMemorySupported = managedMemorySupported && prop.concurrentManagedAccess;
|
||||
}
|
||||
|
||||
HardwareSupportsManagedMemory = managedMemorySupported;
|
||||
ManagedMemoryEnabled = managedMemorySupported;
|
||||
|
||||
VTKM_LOG_F(vtkm::cont::LogLevel::Info,
|
||||
"CudaAllocator hardware %s managed memory",
|
||||
HardwareSupportsManagedMemory ? "supports" : "doesn't support");
|
||||
|
||||
// Check if users want to disable managed memory
|
||||
#pragma warning(push)
|
||||
// getenv is not thread safe on windows but since it's inside a call_once block so
|
||||
@ -283,9 +326,11 @@ void CudaAllocator::Initialize()
|
||||
if (managedMemorySupported && buf != nullptr)
|
||||
{ //only makes sense to disable managed memory if the hardware supports it
|
||||
//in the first place
|
||||
managedMemorySupported = (std::string(buf) != "1");
|
||||
ManagedMemoryEnabled = false;
|
||||
VTKM_LOG_F(
|
||||
vtkm::cont::LogLevel::Info,
|
||||
"CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
|
||||
}
|
||||
ManagedMemoryEnabled = managedMemorySupported;
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
@ -42,6 +42,14 @@ struct VTKM_CONT_EXPORT CudaAllocator
|
||||
/// that can be accessed concurrently by the CPU and GPUs.
|
||||
static VTKM_CONT bool UsingManagedMemory();
|
||||
|
||||
/// Force CUDA allocations to occur with unmanaged memory (aka cudaMalloc).
|
||||
static VTKM_CONT void ForceManagedMemoryOff();
|
||||
|
||||
/// Force CUDA allocations to occur with pageable managed memory.
|
||||
/// If the current hardware doesn't support pageable managed memory
|
||||
/// VTK-m will ignore the request and continue to use unmanaged memory (aka cudaMalloc).
|
||||
static VTKM_CONT void ForceManagedMemoryOn();
|
||||
|
||||
/// Returns true if the pointer is accessible from a CUDA device.
|
||||
static VTKM_CONT bool IsDevicePointer(const void* ptr);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user