Add CudaAllocator to encapsulate runtime managed memory logic.

Unified memory is now used when we detect that the hardware supports it.
This commit is contained in:
David C. Lonie 2017-07-14 10:36:15 -04:00 committed by Allison Vacanti
parent 938db09ad5
commit bd042ec567
7 changed files with 268 additions and 66 deletions

@ -108,18 +108,6 @@ option(VTKm_USE_DOUBLE_PRECISION
)
option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON)
if (VTKm_ENABLE_CUDA)
option(VTKm_USE_UNIFIED_MEMORY
"Use CUDA unified memory"
OFF
)
endif (VTKm_ENABLE_CUDA)
if (VTKm_USE_UNIFIED_MEMORY)
set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}")
endif()
option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON)
set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})

@ -30,6 +30,7 @@
#include <vtkm/cont/CellSetExplicit.h>
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
namespace
{
@ -92,8 +93,8 @@ vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f };
vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(0, 1, vdims[0] * vdims[1] *
vdims[2]);
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(
0, 1, vdims[0] * vdims[1] * vdims[2]);
vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(
TangleField(vdims, mins, maxs));
tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
@ -153,37 +154,39 @@ int main(int argc, char* argv[])
typedef vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> DeviceAlgorithms;
vtkm::worklet::SineWorklet sineWorklet;
#ifdef VTKM_USE_UNIFIED_MEMORY
std::cout << "Testing with unified memory" << std::endl;
bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
if (usingManagedMemory)
{
std::cout << "Testing with unified memory" << std::endl;
vtkm::cont::Timer<> timer;
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::cont::Timer<> timer;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
#else
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
}
else
{
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
vtkm::cont::Timer<> timer;
vtkm::cont::Timer<> timer;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
#endif
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
}
int dim = 128;
if (argc > 2)

@ -21,6 +21,9 @@
#define vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_cu
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
using vtkm::cont::cuda::internal::CudaAllocator;
namespace vtkm
{
@ -64,18 +67,8 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::Allocate(TypelessExecut
// Attempt to allocate:
try
{
char* tmp;
#ifdef VTKM_USE_UNIFIED_MEMORY
int dev;
VTKM_CUDA_CALL(cudaGetDevice(&dev));
VTKM_CUDA_CALL(cudaMallocManaged(&tmp, static_cast<std::size_t>(numBytes)));
VTKM_CUDA_CALL(cudaMemAdvise(tmp, numBytes, cudaMemAdviseSetPreferredLocation, dev));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, numBytes, dev, 0));
VTKM_CUDA_CALL(cudaStreamSynchronize(0));
#else
VTKM_CUDA_CALL(cudaMalloc(&tmp, static_cast<std::size_t>(numBytes)));
#endif
// Cast to char* so that the pointer math below will work.
char* tmp = static_cast<char*>(CudaAllocator::Allocate(static_cast<size_t>(numBytes)));
execArray.Array = tmp;
execArray.ArrayEnd = tmp + numBytes;
execArray.ArrayCapacity = tmp + numBytes;
@ -93,7 +86,7 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::Free(
{
if (execArray.Array != nullptr)
{
VTKM_CUDA_CALL(cudaFree(execArray.Array));
CudaAllocator::Free(execArray.Array);
execArray.Array = nullptr;
execArray.ArrayEnd = nullptr;
execArray.ArrayCapacity = nullptr;

@ -35,6 +35,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
VTKM_THIRDPARTY_POST_INCLUDE
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
@ -172,17 +173,8 @@ public:
// Attempt to allocate:
try
{
ValueType* tmp;
#ifdef VTKM_USE_UNIFIED_MEMORY
int dev;
VTKM_CUDA_CALL(cudaGetDevice(&dev));
VTKM_CUDA_CALL(cudaMallocManaged(&tmp, bufferSize));
VTKM_CUDA_CALL(cudaMemAdvise(tmp, bufferSize, cudaMemAdviseSetPreferredLocation, dev));
VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, bufferSize, dev, 0));
VTKM_CUDA_CALL(cudaStreamSynchronize(0));
#else
VTKM_CUDA_CALL(cudaMalloc(&tmp, bufferSize));
#endif
ValueType* tmp =
static_cast<ValueType*>(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize));
this->Begin = PointerType(tmp);
}
catch (const std::exception& error)
@ -214,9 +206,6 @@ public:
storage->Allocate(this->GetNumberOfValues());
try
{
#ifdef VTKM_USE_UNIFIED_MEMORY
cudaDeviceSynchronize();
#endif
::thrust::copy(
this->Begin, this->End, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
}
@ -254,7 +243,7 @@ public:
{
if (this->Begin.get() != nullptr)
{
VTKM_CUDA_CALL(cudaFree(this->Begin.get()));
vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin.get());
this->Begin = PointerType(static_cast<ValueType*>(nullptr));
this->End = PointerType(static_cast<ValueType*>(nullptr));
this->Capacity = PointerType(static_cast<ValueType*>(nullptr));

@ -21,6 +21,7 @@
set(headers
ArrayManagerExecutionCuda.h
ArrayManagerExecutionThrustDevice.h
CudaAllocator.h
DeviceAdapterAlgorithmCuda.h
DeviceAdapterAlgorithmThrust.h
DeviceAdapterTagCuda.h
@ -30,6 +31,11 @@ set(headers
VirtualObjectTransferCuda.h
)
set(sources
ArrayManagerExecutionCuda.cu
CudaAllocator.cu
)
vtkm_declare_headers(CUDA ${headers} TESTABLE ${VTKm_ENABLE_CUDA})
#-----------------------------------------------------------------------------
@ -49,7 +55,7 @@ if (VTKm_ENABLE_CUDA)
list(APPEND compile_options -Xcompiler=-fPIC)
endif()
cuda_compile(vtkm_cont_cuda_object_files ArrayManagerExecutionCuda.cu
cuda_compile(vtkm_cont_cuda_object_files ${sources}
OPTIONS "${compile_options}")
#Setup the dependency chain for the custom object build so that

@ -0,0 +1,164 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2017 Sandia Corporation.
// Copyright 2017 UT-Battelle, LLC.
// Copyright 2017 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
#include <cuda_runtime.h>
// These static vars are in an anon namespace to work around MSVC linker issues.
namespace
{
// Set once CudaAllocator::Initialize has run; guards against re-detection.
static bool IsInitialized = false;
// True if all devices support concurrent pageable managed memory.
static bool ManagedMemorySupported = false;
}
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
// Reports whether allocations from Allocate() use CUDA managed (unified)
// memory. True only when every visible device supports concurrent managed
// access (see Initialize).
bool CudaAllocator::UsingManagedMemory()
{
  // Capability detection is lazy; ensure it has run before reading the flag.
  Initialize();
  return ManagedMemorySupported;
}
// Allocates numBytes of device-accessible memory. Managed (unified) memory is
// used when all devices support it; otherwise plain device memory. Release the
// returned pointer with CudaAllocator::Free.
void* CudaAllocator::Allocate(std::size_t numBytes)
{
  Initialize();

  void* buffer = nullptr;
  if (!ManagedMemorySupported)
  {
    // Device-only allocation: the pointer must not be dereferenced on the host.
    VTKM_CUDA_CALL(cudaMalloc(&buffer, numBytes));
  }
  else
  {
    // Unified memory: the pointer is usable from both host and device.
    VTKM_CUDA_CALL(cudaMallocManaged(&buffer, numBytes));
  }
  return buffer;
}
// Releases memory previously obtained from CudaAllocator::Allocate.
// cudaFree handles both managed and device allocations.
void CudaAllocator::Free(void* ptr)
{
  Initialize();
  VTKM_CUDA_CALL(cudaFree(ptr));
}
// Hints the driver that [ptr, ptr + numBytes) is about to be accessed from the
// control (host) environment: prefer CPU residency and start migrating the
// pages there. No-op for non-managed allocations.
void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  // TODO these hints need to be benchmarked and adjusted once we start
  // sharing the pointers between cont/exec
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
  // Asynchronous on the default stream; no sync here, migration overlaps work.
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
}
// Hints the driver that [ptr, ptr + numBytes) will be read (not written) on
// the current device: prefer device residency, mark read-mostly, and start
// migrating the pages. No-op for non-managed allocations.
void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  int device;
  VTKM_CUDA_CALL(cudaGetDevice(&device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, device));
  // Read-mostly allows the driver to replicate read-only pages across processors.
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, device));
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, device, 0));
}
// Hints the driver that [ptr, ptr + numBytes) will be written on the current
// device: prefer device residency, clear any read-mostly duplication, and
// start migrating the pages. No-op for non-managed allocations.
void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  int device;
  VTKM_CUDA_CALL(cudaGetDevice(&device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, device));
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, device, 0));
}
// Hints the driver that [ptr, ptr + numBytes) will be both read and written on
// the current device. Same advice as PrepareForOutput: device-preferred,
// writable (not read-mostly), prefetched. No-op for non-managed allocations.
void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
{
  Initialize();
  if (!ManagedMemorySupported)
  {
    return;
  }

  int device;
  VTKM_CUDA_CALL(cudaGetDevice(&device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, device));
  VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, device));
  VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, device, 0));
}
// One-time capability detection: decides whether managed (unified) memory may
// be used by checking every visible CUDA device. Subsequent calls return
// immediately.
//
// NOTE(review): the IsInitialized/ManagedMemorySupported statics are not
// synchronized — this assumes the first call completes before any concurrent
// use. TODO confirm callers, or guard with std::call_once.
void CudaAllocator::Initialize()
{
  if (IsInitialized)
  {
    return;
  }

  int deviceCount;
  VTKM_CUDA_CALL(cudaGetDeviceCount(&deviceCount));

  // No devices: managed memory is trivially unavailable.
  if (deviceCount == 0)
  {
    ManagedMemorySupported = false;
    IsInitialized = true;
    return;
  }

  // Check all devices, use the feature set supported by all.
  // We require concurrentManagedAccess (not just managedAccess), as devices
  // with only the latter have extra synchronization requirements.
  bool allDevicesManaged = true;
  for (int device = 0; device < deviceCount && allDevicesManaged; ++device)
  {
    cudaDeviceProp properties;
    VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, device));
    allDevicesManaged = allDevicesManaged && properties.concurrentManagedAccess;
  }

  ManagedMemorySupported = allDevicesManaged;
  IsInitialized = true;
}
}
}
}
} // end namespace vtkm::cont::cuda::internal

@ -0,0 +1,59 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2017 Sandia Corporation.
// Copyright 2017 UT-Battelle, LLC.
// Copyright 2017 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_CudaAllocator_h
#define vtk_m_cont_cuda_internal_CudaAllocator_h
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/internal/ExportMacros.h>
#include <cstddef>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
// Encapsulates CUDA runtime memory management. Allocations use CUDA managed
// (unified) memory when every visible device supports concurrent managed
// access, and plain device memory otherwise.
struct VTKM_CONT_EXPORT CudaAllocator
{
  // True when Allocate() returns managed (unified) memory, i.e. all devices
  // report concurrent managed access.
  static VTKM_CONT bool UsingManagedMemory();

  // Allocate numBytes (managed when supported, device-only otherwise).
  // Release the result with Free(). Device-only pointers must not be
  // dereferenced on the host.
  static VTKM_CONT void* Allocate(std::size_t numBytes);

  // Release memory obtained from Allocate().
  static VTKM_CONT void Free(void* ptr);

  // Hint that [ptr, ptr + numBytes) is about to be used from the control
  // (host) environment. No-op for non-managed memory.
  static VTKM_CONT void PrepareForControl(const void* ptr, std::size_t numBytes);

  // Hint that [ptr, ptr + numBytes) is about to be used on the device as
  // read-only input, write-only output, or read-write in-place data,
  // respectively. No-ops for non-managed memory.
  static VTKM_CONT void PrepareForInput(const void* ptr, std::size_t numBytes);
  static VTKM_CONT void PrepareForOutput(const void* ptr, std::size_t numBytes);
  static VTKM_CONT void PrepareForInPlace(const void* ptr, std::size_t numBytes);

private:
  // Detects device capabilities on first call; later calls are no-ops.
  static VTKM_CONT void Initialize();
};
}
}
}
} // end namespace vtkm::cont::cuda::internal
#endif // vtk_m_cont_cuda_internal_CudaAllocator_h