Merge topic 'simplify_unified_memory_example'

8ca24baec Update the UnifiedMemory example to properly disable managed memory
718caaaea CudaAllocator allows managed memory to be explicitly disabled

Acked-by: Kitware Robot <kwrobot@kitware.com>
Acked-by: Allison Vacanti <allison.vacanti@kitware.com>
Merge-request: !1492
Robert Maynard 2018-12-28 18:40:55 +00:00 committed by Kitware Robot
commit c4bf46ecf0
4 changed files with 119 additions and 161 deletions

@@ -0,0 +1,6 @@
# CudaAllocator Managed Memory can be disabled from C++
Previously it was impossible for calling code to explicitly
disable managed memory. Being able to do so is desirable for
projects that know they don't need managed memory and are
highly performance critical.
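
A minimal sketch of the new call (the include path matches the one used by the updated example below):

```cpp
#include <vtkm/cont/cuda/internal/CudaAllocator.h>

int main()
{
  // Explicitly opt out of managed memory before the first CUDA allocation.
  // On hardware without managed memory support this logs a warning and is a no-op.
  vtkm::cont::cuda::internal::CudaAllocator::ForceManagedMemoryOff();
  // ... the rest of the program allocates with cudaMalloc ...
  return 0;
}
```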

@@ -18,107 +18,18 @@
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_CUDA
#include <vtkm/cont/ArrayHandleStreaming.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/Logging.h> //for GetHumanReadableSize
#include <vtkm/filter/MarchingCubes.h>
#include <vtkm/worklet/DispatcherMapField.h>
#include <vtkm/worklet/DispatcherStreamingMapField.h>
#include <vtkm/Math.h>
#include <vtkm/cont/ArrayHandleCounting.h>
#include <vtkm/cont/CellSetExplicit.h>
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
namespace
{
// Define the tangle field for the input data
class TangleField : public vtkm::worklet::WorkletMapField
{
public:
using ControlSignature = void(FieldIn<IdType> vertexId, FieldOut<Scalar> v);
using ExecutionSignature = void(_1, _2);
using InputDomain = _1;
const vtkm::Id xdim, ydim, zdim;
const vtkm::Float32 xmin, ymin, zmin, xmax, ymax, zmax;
const vtkm::Id cellsPerLayer;
VTKM_CONT
TangleField(const vtkm::Id3 dims, const vtkm::Float32 mins[3], const vtkm::Float32 maxs[3])
: xdim(dims[0])
, ydim(dims[1])
, zdim(dims[2])
, xmin(mins[0])
, ymin(mins[1])
, zmin(mins[2])
, xmax(maxs[0])
, ymax(maxs[1])
, zmax(maxs[2])
, cellsPerLayer((xdim) * (ydim)) {}
VTKM_EXEC
void operator()(const vtkm::Id& vertexId, vtkm::Float32& v) const
{
const vtkm::Id x = vertexId % (xdim);
const vtkm::Id y = (vertexId / (xdim)) % (ydim);
const vtkm::Id z = vertexId / cellsPerLayer;
const vtkm::Float32 fx = static_cast<vtkm::Float32>(x) / static_cast<vtkm::Float32>(xdim - 1);
const vtkm::Float32 fy = static_cast<vtkm::Float32>(y) / static_cast<vtkm::Float32>(ydim - 1);
const vtkm::Float32 fz = static_cast<vtkm::Float32>(z) / static_cast<vtkm::Float32>(zdim - 1);
const vtkm::Float32 xx = 3.0f * (xmin + (xmax - xmin) * (fx));
const vtkm::Float32 yy = 3.0f * (ymin + (ymax - ymin) * (fy));
const vtkm::Float32 zz = 3.0f * (zmin + (zmax - zmin) * (fz));
v = (xx * xx * xx * xx - 5.0f * xx * xx + yy * yy * yy * yy - 5.0f * yy * yy +
zz * zz * zz * zz - 5.0f * zz * zz + 11.8f) *
0.2f +
0.5f;
}
};
// Construct an input data set using the tangle field worklet
vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
{
vtkm::cont::DataSet dataSet;
const vtkm::Id3 vdims(dims[0] + 1, dims[1] + 1, dims[2] + 1);
vtkm::Float32 mins[3] = { -1.0f, -1.0f, -1.0f };
vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f };
vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(
0, 1, vdims[0] * vdims[1] * vdims[2]);
vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(
TangleField(vdims, mins, maxs));
tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
vtkm::Vec<vtkm::FloatDefault, 3> origin(0.0f, 0.0f, 0.0f);
vtkm::Vec<vtkm::FloatDefault, 3> spacing(1.0f / static_cast<vtkm::FloatDefault>(dims[0]),
1.0f / static_cast<vtkm::FloatDefault>(dims[1]),
1.0f / static_cast<vtkm::FloatDefault>(dims[2]));
vtkm::cont::ArrayHandleUniformPointCoordinates coordinates(vdims, origin, spacing);
dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));
dataSet.AddField(
vtkm::cont::Field("nodevar", vtkm::cont::Field::Association::POINTS, fieldArray));
static const vtkm::IdComponent ndim = 3;
vtkm::cont::CellSetStructured<ndim> cellSet("cells");
cellSet.SetPointDimensions(vdims);
dataSet.AddCellSet(cellSet);
return dataSet;
}
}
namespace vtkm
{
namespace worklet
@@ -141,94 +52,82 @@ public:
// Run a simple worklet, and compute an isosurface
int main(int argc, char* argv[])
{
vtkm::Int64 N = 1024 * 1024 * 1024;
if (argc > 1)
N = N * atoi(argv[1]);
else
N = N * 4;
std::cout << "Testing streaming worklet with size " << N << std::endl;
vtkm::cont::Initialize(argc, argv);
vtkm::Int64 N = 4 * 512 * 512 * 512;
if (argc > 1)
{
N = atoi(argv[1]);
}
std::cout << "Testing streaming worklet on "
<< vtkm::cont::GetHumanReadableSize(N * sizeof(vtkm::Int64)) << std::endl;
vtkm::cont::ArrayHandle<vtkm::Int64> input;
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::vector<vtkm::Int64> data(N);
for (vtkm::Int64 i = 0; i < N; i++)
data[i] = i;
input = vtkm::cont::make_ArrayHandle(data);
using DeviceAlgorithms = vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG>;
using DeviceTag = vtkm::cont::DeviceAdapterTagCuda;
const bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
vtkm::worklet::SineWorklet sineWorklet;
bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
if (usingManagedMemory)
{
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::cout << "Testing with unified memory" << std::endl;
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.SetDevice(DeviceTag{});
vtkm::cont::Timer<> timer;
//run once to get the CUDA code warmed up
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::cont::Timer<DeviceTag> timer;
for (int i = 0; i < 3; ++i)
{
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
}
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
std::cout << "Time for 3 iterations with managed memory: " << elapsedTime << std::endl;
}
else
if (usingManagedMemory)
{ //disable managed memory if it is enabled to get
//the correct performance numbers on GPUs that support managed memory
vtkm::cont::cuda::internal::CudaAllocator::ForceManagedMemoryOff();
}
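// Split the input into streaming blocks of roughly 32 MiB (1 << 25 bytes) each,
// clamped to at least one block for small inputs.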
vtkm::Id NBlocks = (N * sizeof(vtkm::Int64)) / (1 << 25);
NBlocks = std::max(vtkm::Id(1), NBlocks);
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.SetNumberOfBlocks(NBlocks);
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
//run once to get the CUDA code warmed up
dispatcher.Invoke(input, output);
vtkm::cont::Timer<DeviceTag> timer;
for (int i = 0; i < 3; ++i)
{
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
vtkm::cont::Timer<> timer;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
}
int dim = 128;
if (argc > 2)
dim = atoi(argv[2]);
std::cout << "Testing Marching Cubes with size " << dim << "x" << dim << "x" << dim << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time for 3 iterations: " << elapsedTime << std::endl;
vtkm::Id3 dims(dim, dim, dim);
vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>> verticesArray, normalsArray;
vtkm::cont::ArrayHandle<vtkm::Float32> scalarsArray;
vtkm::cont::DataSet dataSet = MakeIsosurfaceTestDataSet(dims);
vtkm::filter::MarchingCubes filter;
filter.SetGenerateNormals(true);
filter.SetMergeDuplicatePoints(false);
filter.SetActiveField("nodevar");
filter.SetIsoValue(0.5);
auto outputData = filter.Execute(dataSet);
//need to extract vertices, normals, and scalars
using VertType = vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>>;
vtkm::cont::CoordinateSystem coords = outputData.GetCoordinateSystem();
verticesArray = coords.GetData().Cast<VertType>();
normalsArray = outputData.GetField("normals").GetData().Cast<VertType>();
scalarsArray =
outputData.GetField("nodevar").GetData().Cast<vtkm::cont::ArrayHandle<vtkm::Float32>>();
std::cout << "Number of output vertices: " << verticesArray.GetNumberOfValues() << std::endl;
std::cout << "vertices: ";
vtkm::cont::printSummary_ArrayHandle(verticesArray, std::cout);
std::cout << std::endl;
std::cout << "normals: ";
vtkm::cont::printSummary_ArrayHandle(normalsArray, std::cout);
std::cout << std::endl;
std::cout << "scalars: ";
vtkm::cont::printSummary_ArrayHandle(scalarsArray, std::cout);
std::cout << std::endl;
return 0;
}
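
As a worked example of the block sizing above: with the default N = 4 * 512 * 512 * 512 = 536,870,912 vtkm::Int64 values, the input occupies 4 GiB, so the streaming path runs with 4 GiB / 32 MiB = 128 blocks.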

@@ -40,10 +40,17 @@ namespace
static std::once_flag IsInitialized;
#endif
// True if concurrent pageable managed memory is not disabled by the user via a system
// environment variable and all devices support it.
// Holds how VTK-m currently allocates memory.
// When VTK-m is initialized we set this based on the hardware support (HardwareSupportsManagedMemory).
// The user can explicitly disable managed memory through an environment variable
// or by calling a function on the CudaAllocator.
// Likewise, managed memory can be re-enabled by calling a function on CudaAllocator,
// but only if the underlying hardware supports pageable managed memory.
static bool ManagedMemoryEnabled = false;
// True if concurrent pageable managed memory is supported by the machine's hardware.
static bool HardwareSupportsManagedMemory = false;
// Avoid the overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
// This value (1 MiB) must be > 0 or else these functions will error out.
static std::size_t Threshold = 1 << 20;
@@ -64,6 +71,35 @@ bool CudaAllocator::UsingManagedMemory()
return ManagedMemoryEnabled;
}
void CudaAllocator::ForceManagedMemoryOff()
{
if (HardwareSupportsManagedMemory)
{
ManagedMemoryEnabled = false;
VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator disabling managed memory");
}
else
{
VTKM_LOG_F(
vtkm::cont::LogLevel::Warn,
"CudaAllocator trying to disable managed memory on hardware that doesn't support it");
}
}
void CudaAllocator::ForceManagedMemoryOn()
{
if (HardwareSupportsManagedMemory)
{
ManagedMemoryEnabled = true;
VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator enabling managed memory");
}
else
{
VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
"CudaAllocator trying to enable managed memory on hardware that doesn't support it");
}
}
bool CudaAllocator::IsDevicePointer(const void* ptr)
{
CudaAllocator::Initialize();
@@ -273,6 +309,13 @@ void CudaAllocator::Initialize()
managedMemorySupported = managedMemorySupported && prop.concurrentManagedAccess;
}
HardwareSupportsManagedMemory = managedMemorySupported;
ManagedMemoryEnabled = managedMemorySupported;
VTKM_LOG_F(vtkm::cont::LogLevel::Info,
"CudaAllocator hardware %s managed memory",
HardwareSupportsManagedMemory ? "supports" : "doesn't support");
// Check if users want to disable managed memory
#pragma warning(push)
// getenv is not thread safe on windows, but it is safe here since it's inside a call_once block
@@ -283,9 +326,11 @@ void CudaAllocator::Initialize()
if (managedMemorySupported && buf != nullptr)
{ //only makes sense to disable managed memory if the hardware supports it
//in the first place
managedMemorySupported = (std::string(buf) != "1");
ManagedMemoryEnabled = false;
VTKM_LOG_F(
vtkm::cont::LogLevel::Info,
"CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
}
ManagedMemoryEnabled = managedMemorySupported;
});
#endif
}
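
For completeness, a sketch of the environment-variable path (NO_VTKM_MANAGED_MEMORY is the name given in the log message above; setenv is POSIX-only, so treat this as illustrative):

```cpp
#include <cstdlib>

int main()
{
  // Must be set before the first CUDA allocation: Initialize() reads the
  // environment exactly once inside a std::call_once block. The diff above
  // only checks that the variable is set, so the value "1" is conventional.
  setenv("NO_VTKM_MANAGED_MEMORY", "1", /*overwrite=*/1);

  // ... the rest of the program now allocates with cudaMalloc ...
  return 0;
}
```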

@@ -42,6 +42,14 @@ struct VTKM_CONT_EXPORT CudaAllocator
/// that can be accessed concurrently by the CPU and GPUs.
static VTKM_CONT bool UsingManagedMemory();
/// Force CUDA allocations to occur with unmanaged memory (aka cudaMalloc).
static VTKM_CONT void ForceManagedMemoryOff();
/// Force CUDA allocations to occur with pageable managed memory.
/// If the current hardware doesn't support pageable managed memory
/// VTK-m will ignore the request and continue to use unmanaged memory (aka cudaMalloc).
static VTKM_CONT void ForceManagedMemoryOn();
/// Returns true if the pointer is accessible from a CUDA device.
static VTKM_CONT bool IsDevicePointer(const void* ptr);
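
A short usage sketch of this API in a CUDA-enabled build (the ConfigureCudaMemory helper is illustrative, not part of VTK-m):

```cpp
#include <vtkm/cont/cuda/internal/CudaAllocator.h>

#include <iostream>

using vtkm::cont::cuda::internal::CudaAllocator;

// Hypothetical helper: request a memory mode and report what took effect.
// On hardware without pageable managed memory both Force* calls log a
// warning and VTK-m keeps allocating with cudaMalloc.
void ConfigureCudaMemory(bool wantManaged)
{
  if (wantManaged)
  {
    CudaAllocator::ForceManagedMemoryOn();
  }
  else
  {
    CudaAllocator::ForceManagedMemoryOff();
  }
  std::cout << "Managed memory in use: " << CudaAllocator::UsingManagedMemory() << "\n";
}
```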