Put sync memory into RuntimeDeviceTracker

Dave Pugmire 2022-08-15 14:30:13 -04:00
parent 8455972a09
commit baac7e1fb9
7 changed files with 47 additions and 116 deletions
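This commit removes CudaAllocator's ForceSyncMemoryAllocator/ForceAsyncMemoryAllocator entry points and the thread_local UseSyncMemoryAlloc flag, and instead drives the choice between plain cudaMalloc/cudaFree and the stream-ordered cudaMallocAsync/cudaFreeAsync through RuntimeDeviceTracker's ThreadFriendlyMemAlloc flag, which is also reduced from a per-device array to a single bool. A minimal usage sketch under that reading of the diff (the helper name EnableThreadFriendlyCudaAlloc is illustrative, not part of this commit):

#include <vtkm/cont/RuntimeDeviceTracker.h>

// Illustrative helper: opt the current tracker in to thread-friendly
// (stream-ordered) CUDA allocations before running filters.
void EnableThreadFriendlyCudaAlloc()
{
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  tracker.SetThreadFriendlyMemAlloc(true);
  // CudaAllocator::Allocate/AllocateUnManaged/Free consult this flag when
  // built against CUDA >= 11.3; otherwise they fall back to cudaMalloc/cudaFree.
}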

@@ -9,7 +9,6 @@
 //============================================================================
 #include <vtkm/cont/Initialize.h>
-#include <vtkm/cont/cuda/internal/CudaAllocator.h>
 #include <vtkm/filter/contour/Contour.h>
 #include <vtkm/io/VTKDataSetReader.h>
 #include <vtkm/io/VTKDataSetWriter.h>
@@ -211,12 +210,12 @@ int main(int argc, char** argv)
   {
     if (opts.SyncMemAlloc)
     {
-      vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
+      //vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
       std::cout << " Task: Sync memory alloc = ON" << std::endl;
     }
     else
     {
-      vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator();
+      //vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator();
       std::cout << " Task: Sync memory alloc = OFF" << std::endl;
     }
   }
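With the Force*MemoryAllocator calls commented out, the example's SyncMemAlloc option now only prints its status message. A sketch of the tracker-based equivalent, assuming opts.SyncMemAlloc keeps its current meaning (synchronous allocation = thread-friendly allocation off):

  if (opts.SyncMemAlloc)
  {
    // Synchronous path: keep the default cudaMalloc/cudaFree behavior.
    vtkm::cont::GetRuntimeDeviceTracker().SetThreadFriendlyMemAlloc(false);
  }
  else
  {
    // Asynchronous path: request stream-ordered allocations (CUDA >= 11.3).
    vtkm::cont::GetRuntimeDeviceTracker().SetThreadFriendlyMemAlloc(true);
  }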

@@ -41,30 +41,15 @@ struct RuntimeDeviceTrackerInternals
   bool GetRuntimeAllowed(std::size_t deviceId) const { return this->RuntimeAllowed[deviceId]; }
   void SetRuntimeAllowed(std::size_t deviceId, bool flag) { this->RuntimeAllowed[deviceId] = flag; }

-  bool GetThreadFriendlyMemAlloc(std::size_t deviceId) const
-  {
-    return this->ThreadFriendlyMemAlloc[deviceId];
-  }
-  void SetThreadFriendlyMemAlloc(std::size_t deviceId, bool flag)
-  {
-    this->ThreadFriendlyMemAlloc[deviceId] = flag;
-  }
+  bool GetThreadFriendlyMemAlloc() const { return this->ThreadFriendlyMemAlloc; }
+  void SetThreadFriendlyMemAlloc(bool flag) { this->ThreadFriendlyMemAlloc = flag; }

   void ResetRuntimeAllowed()
   {
     std::fill_n(this->RuntimeAllowed, VTKM_MAX_DEVICE_ADAPTER_ID, false);
   }

-  void ResetThreadFriendlyMemAlloc()
-  {
-    std::fill_n(this->ThreadFriendlyMemAlloc, VTKM_MAX_DEVICE_ADAPTER_ID, false);
-  }
-
-  void Reset()
-  {
-    this->ResetRuntimeAllowed();
-    this->ResetThreadFriendlyMemAlloc();
-  }
+  void Reset() { this->ResetRuntimeAllowed(); }

 private:
   void CopyFrom(const RuntimeDeviceTrackerInternals* v)
@@ -72,14 +57,13 @@ private:
     std::copy(std::cbegin(v->RuntimeAllowed),
               std::cend(v->RuntimeAllowed),
               std::begin(this->RuntimeAllowed));
-    std::copy(std::cbegin(v->ThreadFriendlyMemAlloc),
-              std::cend(v->ThreadFriendlyMemAlloc),
-              std::begin(this->ThreadFriendlyMemAlloc));
+    this->SetThreadFriendlyMemAlloc(v->GetThreadFriendlyMemAlloc());
   }

   bool RuntimeAllowed[VTKM_MAX_DEVICE_ADAPTER_ID];
-  bool ThreadFriendlyMemAlloc[VTKM_MAX_DEVICE_ADAPTER_ID];
+  bool ThreadFriendlyMemAlloc = false;
 };
 }

 VTKM_CONT
@@ -130,24 +114,9 @@ bool RuntimeDeviceTracker::CanRunOn(vtkm::cont::DeviceAdapterId deviceId) const
 }

 VTKM_CONT
-bool RuntimeDeviceTracker::GetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId) const
+bool RuntimeDeviceTracker::GetThreadFriendlyMemAlloc() const
 {
-  if (deviceId == vtkm::cont::DeviceAdapterTagAny{})
-  { //If at least a single device is enabled, than any device is enabled
-    for (vtkm::Int8 i = 1; i < VTKM_MAX_DEVICE_ADAPTER_ID; ++i)
-    {
-      if (this->Internals->GetThreadFriendlyMemAlloc(static_cast<std::size_t>(i)))
-      {
-        return true;
-      }
-    }
-    return false;
-  }
-  else
-  {
-    this->CheckDevice(deviceId);
-    return this->Internals->GetThreadFriendlyMemAlloc(deviceId.GetValue());
-  }
+  return this->Internals->GetThreadFriendlyMemAlloc();
 }

 VTKM_CONT
@@ -159,12 +128,9 @@ void RuntimeDeviceTracker::SetDeviceState(vtkm::cont::DeviceAdapterId deviceId,
 }

 VTKM_CONT
-void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId,
-                                                     bool state)
+void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(bool state)
 {
-  this->CheckDevice(deviceId);
-  this->Internals->SetThreadFriendlyMemAlloc(deviceId.GetValue(), state);
+  this->Internals->SetThreadFriendlyMemAlloc(state);
 }

 VTKM_CONT void RuntimeDeviceTracker::ResetDevice(vtkm::cont::DeviceAdapterId deviceId)

@@ -113,8 +113,8 @@ public:
   /// \brief Get/Set use of thread-friendly memory allocation for a device.
   ///
   ///
-  VTKM_CONT bool GetThreadFriendlyMemAlloc(DeviceAdapterId deviceId) const;
-  VTKM_CONT void SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId, bool state);
+  VTKM_CONT bool GetThreadFriendlyMemAlloc() const;
+  VTKM_CONT void SetThreadFriendlyMemAlloc(bool state);

   /// \brief Copies the state from the given device.
   ///

@@ -12,6 +12,7 @@
 #include <mutex>
 #include <vtkm/cont/Logging.h>
 #include <vtkm/cont/RuntimeDeviceInformation.h>
+#include <vtkm/cont/RuntimeDeviceTracker.h>
 #include <vtkm/cont/cuda/ErrorCuda.h>
 #include <vtkm/cont/cuda/internal/CudaAllocator.h>
 #include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
@@ -47,9 +48,6 @@ static bool ManagedMemoryEnabled = false;
 // True if concurrent pagable managed memory is supported by the machines hardware.
 static bool HardwareSupportsManagedMemory = false;

-// True if using syncronous memory allocator. Managed memory must be off to use this.
-static thread_local bool UseSyncMemoryAlloc = true;
-
 // Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
 // This value should be > 0 or else these functions will error out.
 static std::size_t Threshold = 1 << 20;
@@ -106,29 +104,6 @@ void CudaAllocator::ForceManagedMemoryOn()
   }
 }

-void CudaAllocator::ForceSyncMemoryAllocator()
-{
-  UseSyncMemoryAlloc = true;
-  VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using synchronous memory allocator.");
-}
-
-void CudaAllocator::ForceAsyncMemoryAllocator()
-{
-#if CUDART_VERSION >= 11030
-  if (ManagedMemoryEnabled)
-  {
-    CudaAllocator::ForceManagedMemoryOff();
-    VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
-               "CudaAllocator turning off managed memory for asynchronous memory allocation");
-  }
-  UseSyncMemoryAlloc = false;
-  VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using asynchronous memory allocator.");
-#else
-  VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
-             "Asynchronous memory allocator not supported for cuda version < 11.3");
-#endif
-}
-
 bool CudaAllocator::IsDevicePointer(const void* ptr)
 {
   CudaAllocator::Initialize();
@@ -186,11 +161,11 @@ void* CudaAllocator::Allocate(std::size_t numBytes)
   void* ptr = nullptr;

 #if CUDART_VERSION >= 11030
-  if (!UseSyncMemoryAlloc)
+  const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
+  if (tracker.GetThreadFriendlyMemAlloc())
   {
     VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread));
   }
   else
 #endif
   if (ManagedMemoryEnabled)
@@ -215,7 +190,18 @@ void* CudaAllocator::Allocate(std::size_t numBytes)
 void* CudaAllocator::AllocateUnManaged(std::size_t numBytes)
 {
   void* ptr = nullptr;
-  VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
+#if CUDART_VERSION >= 11030
+  const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
+  if (tracker.GetThreadFriendlyMemAlloc())
+  {
+    VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread));
+  }
+  else
+#endif
+  {
+    VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
+  }
   {
     VTKM_LOG_F(vtkm::cont::LogLevel::MemExec,
                "Allocated CUDA array of %s at %p.",
@@ -237,17 +223,16 @@ void CudaAllocator::Free(void* ptr)
   VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr);

-  if (UseSyncMemoryAlloc)
+#if CUDART_VERSION >= 11030
+  const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
+  if (tracker.GetThreadFriendlyMemAlloc())
   {
-    VTKM_CUDA_CALL(cudaFree(ptr));
+    VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread));
   }
   else
-  {
-#if CUDART_VERSION >= 11030
-    VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread));
-#else
-    VTKM_CUDA_CALL(cudaFree(ptr));
 #endif
+  {
+    VTKM_CUDA_CALL(cudaFree(ptr));
   }
 }
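Note that Allocate, AllocateUnManaged, and Free each read the tracker flag at call time, so toggling the flag between an allocation and its matching free routes the two calls through different CUDA APIs for the same pointer; whether such a mismatch is benign is a question of the CUDA stream-ordered allocator's semantics, not something this diff establishes. A sketch of a sequence where the pair diverges (hypothetical calling code, not from this commit):

  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  tracker.SetThreadFriendlyMemAlloc(true);
  // Allocation goes through the cudaMallocAsync path (CUDA >= 11.3).
  void* buf = vtkm::cont::cuda::internal::CudaAllocator::AllocateUnManaged(1024);
  tracker.SetThreadFriendlyMemAlloc(false);
  // The free now takes the cudaFree path rather than cudaFreeAsync.
  vtkm::cont::cuda::internal::CudaAllocator::Free(buf);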

@@ -21,14 +21,14 @@ namespace vtkm
 namespace filter
 {

-NewFilter::~NewFilter() = default;
-
-void NewFilter::RunFilter(NewFilter* self,
-                          vtkm::filter::DataSetQueue& input,
-                          vtkm::filter::DataSetQueue& output)
+namespace
+{
+void RunFilter(NewFilter* self,
+               vtkm::filter::DataSetQueue& input,
+               vtkm::filter::DataSetQueue& output)
 {
   auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
-  tracker.SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterTagAny{}, true);
+  tracker.SetThreadFriendlyMemAlloc(true);

   std::pair<vtkm::Id, vtkm::cont::DataSet> task;
   while (input.GetTask(task))
@@ -40,6 +40,9 @@ void NewFilter::RunFilter(NewFilter* self,
   vtkm::cont::Algorithm::Synchronize();
 }
+}
+
+NewFilter::~NewFilter() = default;

 bool NewFilter::CanThread() const
 {
@@ -63,11 +66,8 @@ vtkm::cont::PartitionedDataSet NewFilter::DoExecutePartitions(
   std::vector<std::future<void>> futures(static_cast<std::size_t>(numThreads));
   for (std::size_t i = 0; i < static_cast<std::size_t>(numThreads); i++)
   {
-    auto f = std::async(std::launch::async,
-                        vtkm::filter::NewFilter::RunFilter,
-                        this,
-                        std::ref(inputQueue),
-                        std::ref(outputQueue));
+    auto f = std::async(
+      std::launch::async, RunFilter, this, std::ref(inputQueue), std::ref(outputQueue));
     futures[i] = std::move(f);
   }
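RunFilter is now a free function in an anonymous namespace, so std::async can take it directly, and each worker still opts itself in with SetThreadFriendlyMemAlloc(true) before draining its queue. A sketch of that per-thread opt-in pattern for callers spawning their own workers (the lambda is illustrative, not from this commit, and it assumes GetRuntimeDeviceTracker() returns the calling thread's tracker):

    auto worker = [&inputQueue, &outputQueue]() {
      // Opt this worker thread in to thread-friendly CUDA allocations.
      vtkm::cont::GetRuntimeDeviceTracker().SetThreadFriendlyMemAlloc(true);
      // ... drain inputQueue, run the filter, push results to outputQueue ...
    };
    futures[i] = std::async(std::launch::async, worker);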

@@ -446,11 +446,6 @@
     }
   }

-  VTKM_CONT
-  static void RunFilter(NewFilter* self,
-                        vtkm::filter::DataSetQueue& input,
-                        vtkm::filter::DataSetQueue& output);
-
   VTKM_CONT
   virtual vtkm::Id DetermineNumberOfThreads(const vtkm::cont::PartitionedDataSet& input);

@@ -10,7 +10,6 @@
 #include <vtkm/Math.h>
 #include <vtkm/cont/DataSet.h>
-#include <vtkm/cont/cuda/internal/CudaAllocator.h>
 #include <vtkm/cont/testing/MakeTestDataSet.h>
 #include <vtkm/cont/testing/Testing.h>
@@ -87,7 +86,7 @@ void ValidateResults(const vtkm::cont::PartitionedDataSet& truth,
 } //namespace

-void RunMBTests()
+void TestMultiBlockFilter()
 {
   vtkm::cont::PartitionedDataSet pds;
@@ -160,19 +159,6 @@ RunMBTests()
   ValidateResults(results[0], results[1], "gradient", false);
 }

-void TestMultiBlockFilter()
-{
-  //For cuda, run it with and without async memory allocation.
-#ifdef VTKM_CUDA
-  vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
-  RunMBTests();
-  vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator();
-  RunMBTests();
-#else
-  RunMBTests();
-#endif
-}
-
 int UnitTestMultiBlockFilter(int argc, char* argv[])
 {
   return vtkm::cont::testing::Testing::Run(TestMultiBlockFilter, argc, argv);