mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-08 21:33:55 +00:00
Add CudaAllocator to encapsulate runtime managed memory logic.
Unified memory is now used when we detect that the hardware supports it.
This commit is contained in:
parent
938db09ad5
commit
bd042ec567
@ -108,18 +108,6 @@ option(VTKm_USE_DOUBLE_PRECISION
|
||||
)
|
||||
option(VTKm_USE_64BIT_IDS "Use 64-bit indices." ON)
|
||||
|
||||
if (VTKm_ENABLE_CUDA)
|
||||
option(VTKm_USE_UNIFIED_MEMORY
|
||||
"Use CUDA unified memory"
|
||||
OFF
|
||||
)
|
||||
endif (VTKm_ENABLE_CUDA)
|
||||
|
||||
if (VTKm_USE_UNIFIED_MEMORY)
|
||||
set(CMAKE_CXX_FLAGS "-DVTKM_USE_UNIFIED_MEMORY ${CMAKE_CXX_FLAGS}")
|
||||
endif()
|
||||
|
||||
|
||||
option(BUILD_SHARED_LIBS "Build VTK-m with shared libraries" ON)
|
||||
set(VTKm_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include <vtkm/cont/CellSetExplicit.h>
|
||||
#include <vtkm/cont/DataSet.h>
|
||||
#include <vtkm/cont/Timer.h>
|
||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -92,8 +93,8 @@ vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
|
||||
vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f };
|
||||
|
||||
vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
|
||||
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(0, 1, vdims[0] * vdims[1] *
|
||||
vdims[2]);
|
||||
vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(
|
||||
0, 1, vdims[0] * vdims[1] * vdims[2]);
|
||||
vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(
|
||||
TangleField(vdims, mins, maxs));
|
||||
tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);
|
||||
@ -153,37 +154,39 @@ int main(int argc, char* argv[])
|
||||
typedef vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG> DeviceAlgorithms;
|
||||
vtkm::worklet::SineWorklet sineWorklet;
|
||||
|
||||
#ifdef VTKM_USE_UNIFIED_MEMORY
|
||||
std::cout << "Testing with unified memory" << std::endl;
|
||||
bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
|
||||
|
||||
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
|
||||
if (usingManagedMemory)
|
||||
{
|
||||
std::cout << "Testing with unified memory" << std::endl;
|
||||
|
||||
vtkm::cont::Timer<> timer;
|
||||
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
|
||||
|
||||
dispatcher.Invoke(input, output);
|
||||
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
|
||||
vtkm::cont::Timer<> timer;
|
||||
|
||||
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
|
||||
std::cout << "Time: " << elapsedTime << std::endl;
|
||||
dispatcher.Invoke(input, output);
|
||||
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
|
||||
|
||||
#else
|
||||
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
|
||||
std::cout << "Time: " << elapsedTime << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
|
||||
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
|
||||
NBlocks *= 2;
|
||||
dispatcher.SetNumberOfBlocks(NBlocks);
|
||||
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
|
||||
<< std::endl;
|
||||
|
||||
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
|
||||
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
|
||||
NBlocks *= 2;
|
||||
dispatcher.SetNumberOfBlocks(NBlocks);
|
||||
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
|
||||
<< std::endl;
|
||||
vtkm::cont::Timer<> timer;
|
||||
|
||||
vtkm::cont::Timer<> timer;
|
||||
dispatcher.Invoke(input, output);
|
||||
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
|
||||
|
||||
dispatcher.Invoke(input, output);
|
||||
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
|
||||
|
||||
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
|
||||
std::cout << "Time: " << elapsedTime << std::endl;
|
||||
|
||||
#endif
|
||||
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
|
||||
std::cout << "Time: " << elapsedTime << std::endl;
|
||||
}
|
||||
|
||||
int dim = 128;
|
||||
if (argc > 2)
|
||||
|
@ -21,6 +21,9 @@
|
||||
#define vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_cu
|
||||
|
||||
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h>
|
||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
||||
|
||||
using vtkm::cont::cuda::internal::CudaAllocator;
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
@ -64,18 +67,8 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::Allocate(TypelessExecut
|
||||
// Attempt to allocate:
|
||||
try
|
||||
{
|
||||
char* tmp;
|
||||
#ifdef VTKM_USE_UNIFIED_MEMORY
|
||||
int dev;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMallocManaged(&tmp, static_cast<std::size_t>(numBytes)));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(tmp, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, numBytes, dev, 0));
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(0));
|
||||
#else
|
||||
VTKM_CUDA_CALL(cudaMalloc(&tmp, static_cast<std::size_t>(numBytes)));
|
||||
#endif
|
||||
|
||||
// Cast to char* so that the pointer math below will work.
|
||||
char* tmp = static_cast<char*>(CudaAllocator::Allocate(static_cast<size_t>(numBytes)));
|
||||
execArray.Array = tmp;
|
||||
execArray.ArrayEnd = tmp + numBytes;
|
||||
execArray.ArrayCapacity = tmp + numBytes;
|
||||
@ -93,7 +86,7 @@ void ExecutionArrayInterfaceBasic<DeviceAdapterTagCuda>::Free(
|
||||
{
|
||||
if (execArray.Array != nullptr)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaFree(execArray.Array));
|
||||
CudaAllocator::Free(execArray.Array);
|
||||
execArray.Array = nullptr;
|
||||
execArray.ArrayEnd = nullptr;
|
||||
execArray.ArrayCapacity = nullptr;
|
||||
|
@ -35,6 +35,7 @@ VTKM_THIRDPARTY_PRE_INCLUDE
|
||||
VTKM_THIRDPARTY_POST_INCLUDE
|
||||
|
||||
#include <vtkm/cont/cuda/ErrorCuda.h>
|
||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
||||
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
|
||||
#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
|
||||
|
||||
@ -172,17 +173,8 @@ public:
|
||||
// Attempt to allocate:
|
||||
try
|
||||
{
|
||||
ValueType* tmp;
|
||||
#ifdef VTKM_USE_UNIFIED_MEMORY
|
||||
int dev;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMallocManaged(&tmp, bufferSize));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(tmp, bufferSize, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(tmp, bufferSize, dev, 0));
|
||||
VTKM_CUDA_CALL(cudaStreamSynchronize(0));
|
||||
#else
|
||||
VTKM_CUDA_CALL(cudaMalloc(&tmp, bufferSize));
|
||||
#endif
|
||||
ValueType* tmp =
|
||||
static_cast<ValueType*>(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize));
|
||||
this->Begin = PointerType(tmp);
|
||||
}
|
||||
catch (const std::exception& error)
|
||||
@ -214,9 +206,6 @@ public:
|
||||
storage->Allocate(this->GetNumberOfValues());
|
||||
try
|
||||
{
|
||||
#ifdef VTKM_USE_UNIFIED_MEMORY
|
||||
cudaDeviceSynchronize();
|
||||
#endif
|
||||
::thrust::copy(
|
||||
this->Begin, this->End, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
|
||||
}
|
||||
@ -254,7 +243,7 @@ public:
|
||||
{
|
||||
if (this->Begin.get() != nullptr)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaFree(this->Begin.get()));
|
||||
vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin.get());
|
||||
this->Begin = PointerType(static_cast<ValueType*>(nullptr));
|
||||
this->End = PointerType(static_cast<ValueType*>(nullptr));
|
||||
this->Capacity = PointerType(static_cast<ValueType*>(nullptr));
|
||||
|
@ -21,6 +21,7 @@
|
||||
set(headers
|
||||
ArrayManagerExecutionCuda.h
|
||||
ArrayManagerExecutionThrustDevice.h
|
||||
CudaAllocator.h
|
||||
DeviceAdapterAlgorithmCuda.h
|
||||
DeviceAdapterAlgorithmThrust.h
|
||||
DeviceAdapterTagCuda.h
|
||||
@ -30,6 +31,11 @@ set(headers
|
||||
VirtualObjectTransferCuda.h
|
||||
)
|
||||
|
||||
set(sources
|
||||
ArrayManagerExecutionCuda.cu
|
||||
CudaAllocator.cu
|
||||
)
|
||||
|
||||
vtkm_declare_headers(CUDA ${headers} TESTABLE ${VTKm_ENABLE_CUDA})
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
@ -49,7 +55,7 @@ if (VTKm_ENABLE_CUDA)
|
||||
list(APPEND compile_options -Xcompiler=-fPIC)
|
||||
endif()
|
||||
|
||||
cuda_compile(vtkm_cont_cuda_object_files ArrayManagerExecutionCuda.cu
|
||||
cuda_compile(vtkm_cont_cuda_object_files ${sources}
|
||||
OPTIONS "${compile_options}")
|
||||
|
||||
#Setup the dependency chain for the custom object build so that
|
||||
|
164
vtkm/cont/cuda/internal/CudaAllocator.cu
Normal file
164
vtkm/cont/cuda/internal/CudaAllocator.cu
Normal file
@ -0,0 +1,164 @@
|
||||
//============================================================================
|
||||
// Copyright (c) Kitware, Inc.
|
||||
// All rights reserved.
|
||||
// See LICENSE.txt for details.
|
||||
// This software is distributed WITHOUT ANY WARRANTY; without even
|
||||
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
// PURPOSE. See the above copyright notice for more information.
|
||||
//
|
||||
// Copyright 2017 Sandia Corporation.
|
||||
// Copyright 2017 UT-Battelle, LLC.
|
||||
// Copyright 2017 Los Alamos National Security.
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
||||
// Laboratory (LANL), the U.S. Government retains certain rights in
|
||||
// this software.
|
||||
//============================================================================
|
||||
|
||||
#include <vtkm/cont/cuda/ErrorCuda.h>
|
||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// These static vars are in an anon namespace to work around MSVC linker issues.
|
||||
namespace
|
||||
{
|
||||
// Has CudaAllocator::Initialize been called?
|
||||
static bool IsInitialized = false;
|
||||
|
||||
// True if all devices support concurrent pagable managed memory.
|
||||
static bool ManagedMemorySupported = false;
|
||||
}
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
namespace internal
|
||||
{
|
||||
|
||||
bool CudaAllocator::UsingManagedMemory()
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
return ManagedMemorySupported;
|
||||
}
|
||||
|
||||
void* CudaAllocator::Allocate(std::size_t numBytes)
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
|
||||
void* ptr = nullptr;
|
||||
if (ManagedMemorySupported)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMallocManaged(&ptr, numBytes));
|
||||
}
|
||||
else
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void CudaAllocator::Free(void* ptr)
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
|
||||
VTKM_CUDA_CALL(cudaFree(ptr));
|
||||
}
|
||||
|
||||
void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
|
||||
if (ManagedMemorySupported)
|
||||
{
|
||||
// TODO these hints need to be benchmarked and adjusted once we start
|
||||
// sharing the pointers between cont/exec
|
||||
VTKM_CUDA_CALL(
|
||||
cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, cudaCpuDeviceId));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, 0));
|
||||
}
|
||||
}
|
||||
|
||||
void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
|
||||
if (ManagedMemorySupported)
|
||||
{
|
||||
int dev;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
||||
}
|
||||
}
|
||||
|
||||
void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
|
||||
if (ManagedMemorySupported)
|
||||
{
|
||||
int dev;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
||||
}
|
||||
}
|
||||
|
||||
void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
|
||||
{
|
||||
CudaAllocator::Initialize();
|
||||
|
||||
if (ManagedMemorySupported)
|
||||
{
|
||||
int dev;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
|
||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, 0));
|
||||
}
|
||||
}
|
||||
|
||||
void CudaAllocator::Initialize()
|
||||
{
|
||||
if (!IsInitialized)
|
||||
{
|
||||
int numDevices;
|
||||
VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
|
||||
|
||||
if (numDevices == 0)
|
||||
{
|
||||
ManagedMemorySupported = false;
|
||||
IsInitialized = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Check all devices, use the feature set supported by all
|
||||
bool managed = true;
|
||||
cudaDeviceProp prop;
|
||||
for (int i = 0; i < numDevices && managed; ++i)
|
||||
{
|
||||
VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
|
||||
// We check for concurrentManagedAccess, as devices with only the
|
||||
// managedAccess property have extra synchronization requirements.
|
||||
managed = managed && prop.concurrentManagedAccess;
|
||||
}
|
||||
|
||||
ManagedMemorySupported = managed;
|
||||
IsInitialized = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // end namespace vtkm::cont::cuda::internal
|
59
vtkm/cont/cuda/internal/CudaAllocator.h
Normal file
59
vtkm/cont/cuda/internal/CudaAllocator.h
Normal file
@ -0,0 +1,59 @@
|
||||
//============================================================================
|
||||
// Copyright (c) Kitware, Inc.
|
||||
// All rights reserved.
|
||||
// See LICENSE.txt for details.
|
||||
// This software is distributed WITHOUT ANY WARRANTY; without even
|
||||
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
// PURPOSE. See the above copyright notice for more information.
|
||||
//
|
||||
// Copyright 2017 Sandia Corporation.
|
||||
// Copyright 2017 UT-Battelle, LLC.
|
||||
// Copyright 2017 Los Alamos National Security.
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
||||
// Laboratory (LANL), the U.S. Government retains certain rights in
|
||||
// this software.
|
||||
//============================================================================
|
||||
|
||||
#ifndef vtk_m_cont_cuda_internal_CudaAllocator_h
|
||||
#define vtk_m_cont_cuda_internal_CudaAllocator_h
|
||||
|
||||
#include <vtkm/cont/vtkm_cont_export.h>
|
||||
#include <vtkm/internal/ExportMacros.h>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
namespace internal
|
||||
{
|
||||
|
||||
struct VTKM_CONT_EXPORT CudaAllocator
|
||||
{
|
||||
static VTKM_CONT bool UsingManagedMemory();
|
||||
|
||||
static VTKM_CONT void* Allocate(std::size_t numBytes);
|
||||
static VTKM_CONT void Free(void* ptr);
|
||||
|
||||
static VTKM_CONT void PrepareForControl(const void* ptr, std::size_t numBytes);
|
||||
|
||||
static VTKM_CONT void PrepareForInput(const void* ptr, std::size_t numBytes);
|
||||
static VTKM_CONT void PrepareForOutput(const void* ptr, std::size_t numBytes);
|
||||
static VTKM_CONT void PrepareForInPlace(const void* ptr, std::size_t numBytes);
|
||||
|
||||
private:
|
||||
static VTKM_CONT void Initialize();
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
} // end namespace vtkm::cont::cuda::internal
|
||||
|
||||
#endif // vtk_m_cont_cuda_internal_CudaAllocator_h
|
Loading…
Reference in New Issue
Block a user