From baac7e1fb9dd8c38e87385b9122dd326964833a1 Mon Sep 17 00:00:00 2001 From: Dave Pugmire Date: Mon, 15 Aug 2022 14:30:13 -0400 Subject: [PATCH] Put sync memory into RuntimeDeviceTracker --- .../threaded_multiblock_filter/MultiBlock.cxx | 5 +- vtkm/cont/RuntimeDeviceTracker.cxx | 54 ++++-------------- vtkm/cont/RuntimeDeviceTracker.h | 4 +- vtkm/cont/cuda/internal/CudaAllocator.cu | 57 +++++++------------ vtkm/filter/NewFilter.cxx | 22 +++---- vtkm/filter/NewFilter.h | 5 -- .../testing/UnitTestMultiBlockFilter.cxx | 16 +----- 7 files changed, 47 insertions(+), 116 deletions(-) diff --git a/examples/threaded_multiblock_filter/MultiBlock.cxx b/examples/threaded_multiblock_filter/MultiBlock.cxx index fca9f9a8f..de14c21c1 100644 --- a/examples/threaded_multiblock_filter/MultiBlock.cxx +++ b/examples/threaded_multiblock_filter/MultiBlock.cxx @@ -9,7 +9,6 @@ //============================================================================ #include -#include #include #include #include @@ -211,12 +210,12 @@ int main(int argc, char** argv) { if (opts.SyncMemAlloc) { - vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator(); + //vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator(); std::cout << " Task: Sync memory alloc = ON" << std::endl; } else { - vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator(); + //vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator(); std::cout << " Task: Sync memory alloc = OFF" << std::endl; } } diff --git a/vtkm/cont/RuntimeDeviceTracker.cxx b/vtkm/cont/RuntimeDeviceTracker.cxx index 384851190..63d333b27 100644 --- a/vtkm/cont/RuntimeDeviceTracker.cxx +++ b/vtkm/cont/RuntimeDeviceTracker.cxx @@ -41,30 +41,15 @@ struct RuntimeDeviceTrackerInternals bool GetRuntimeAllowed(std::size_t deviceId) const { return this->RuntimeAllowed[deviceId]; } void SetRuntimeAllowed(std::size_t deviceId, bool flag) { this->RuntimeAllowed[deviceId] = flag; } - bool GetThreadFriendlyMemAlloc(std::size_t deviceId) const - { - return this->ThreadFriendlyMemAlloc[deviceId]; - } - void SetThreadFriendlyMemAlloc(std::size_t deviceId, bool flag) - { - this->ThreadFriendlyMemAlloc[deviceId] = flag; - } + bool GetThreadFriendlyMemAlloc() const { return this->ThreadFriendlyMemAlloc; } + void SetThreadFriendlyMemAlloc(bool flag) { this->ThreadFriendlyMemAlloc = flag; } void ResetRuntimeAllowed() { std::fill_n(this->RuntimeAllowed, VTKM_MAX_DEVICE_ADAPTER_ID, false); } - void ResetThreadFriendlyMemAlloc() - { - std::fill_n(this->ThreadFriendlyMemAlloc, VTKM_MAX_DEVICE_ADAPTER_ID, false); - } - - void Reset() - { - this->ResetRuntimeAllowed(); - this->ResetThreadFriendlyMemAlloc(); - } + void Reset() { this->ResetRuntimeAllowed(); } private: void CopyFrom(const RuntimeDeviceTrackerInternals* v) @@ -72,14 +57,13 @@ private: std::copy(std::cbegin(v->RuntimeAllowed), std::cend(v->RuntimeAllowed), std::begin(this->RuntimeAllowed)); - std::copy(std::cbegin(v->ThreadFriendlyMemAlloc), - std::cend(v->ThreadFriendlyMemAlloc), - std::begin(this->ThreadFriendlyMemAlloc)); + this->SetThreadFriendlyMemAlloc(v->GetThreadFriendlyMemAlloc()); } bool RuntimeAllowed[VTKM_MAX_DEVICE_ADAPTER_ID]; - bool ThreadFriendlyMemAlloc[VTKM_MAX_DEVICE_ADAPTER_ID]; + bool ThreadFriendlyMemAlloc = false; }; + } VTKM_CONT @@ -130,24 +114,9 @@ bool RuntimeDeviceTracker::CanRunOn(vtkm::cont::DeviceAdapterId deviceId) const } VTKM_CONT -bool RuntimeDeviceTracker::GetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId) const +bool 
RuntimeDeviceTracker::GetThreadFriendlyMemAlloc() const { - if (deviceId == vtkm::cont::DeviceAdapterTagAny{}) - { //If at least a single device is enabled, than any device is enabled - for (vtkm::Int8 i = 1; i < VTKM_MAX_DEVICE_ADAPTER_ID; ++i) - { - if (this->Internals->GetThreadFriendlyMemAlloc(static_cast(i))) - { - return true; - } - } - return false; - } - else - { - this->CheckDevice(deviceId); - return this->Internals->GetThreadFriendlyMemAlloc(deviceId.GetValue()); - } + return this->Internals->GetThreadFriendlyMemAlloc(); } VTKM_CONT @@ -159,12 +128,9 @@ void RuntimeDeviceTracker::SetDeviceState(vtkm::cont::DeviceAdapterId deviceId, } VTKM_CONT -void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId, - bool state) +void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(bool state) { - this->CheckDevice(deviceId); - - this->Internals->SetThreadFriendlyMemAlloc(deviceId.GetValue(), state); + this->Internals->SetThreadFriendlyMemAlloc(state); } VTKM_CONT void RuntimeDeviceTracker::ResetDevice(vtkm::cont::DeviceAdapterId deviceId) diff --git a/vtkm/cont/RuntimeDeviceTracker.h b/vtkm/cont/RuntimeDeviceTracker.h index 8dc5189db..5de7c22bc 100644 --- a/vtkm/cont/RuntimeDeviceTracker.h +++ b/vtkm/cont/RuntimeDeviceTracker.h @@ -113,8 +113,8 @@ public: /// \brief Get/Set use of thread-friendly memory allocation for a device. /// /// - VTKM_CONT bool GetThreadFriendlyMemAlloc(DeviceAdapterId deviceId) const; - VTKM_CONT void SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId, bool state); + VTKM_CONT bool GetThreadFriendlyMemAlloc() const; + VTKM_CONT void SetThreadFriendlyMemAlloc(bool state); /// \brief Copies the state from the given device. /// diff --git a/vtkm/cont/cuda/internal/CudaAllocator.cu b/vtkm/cont/cuda/internal/CudaAllocator.cu index 83cf6b1fd..c786844aa 100644 --- a/vtkm/cont/cuda/internal/CudaAllocator.cu +++ b/vtkm/cont/cuda/internal/CudaAllocator.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -47,9 +48,6 @@ static bool ManagedMemoryEnabled = false; // True if concurrent pagable managed memory is supported by the machines hardware. static bool HardwareSupportsManagedMemory = false; -// True if using syncronous memory allocator. Managed memory must be off to use this. -static thread_local bool UseSyncMemoryAlloc = true; - // Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers. // This value should be > 0 or else these functions will error out. 
static std::size_t Threshold = 1 << 20; @@ -106,29 +104,6 @@ void CudaAllocator::ForceManagedMemoryOn() } } -void CudaAllocator::ForceSyncMemoryAllocator() -{ - UseSyncMemoryAlloc = true; - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using synchronous memory allocator."); -} - -void CudaAllocator::ForceAsyncMemoryAllocator() -{ -#if CUDART_VERSION >= 11030 - if (ManagedMemoryEnabled) - { - CudaAllocator::ForceManagedMemoryOff(); - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, - "CudaAllocator turning off managed memory for asynchronous memory allocation"); - } - UseSyncMemoryAlloc = false; - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using asynchronous memory allocator."); -#else - VTKM_LOG_F(vtkm::cont::LogLevel::Warn, - "Asynchronous memory allocator not supported for cuda version < 11.3"); -#endif -} - bool CudaAllocator::IsDevicePointer(const void* ptr) { CudaAllocator::Initialize(); @@ -186,11 +161,11 @@ void* CudaAllocator::Allocate(std::size_t numBytes) void* ptr = nullptr; #if CUDART_VERSION >= 11030 - if (!UseSyncMemoryAlloc) + const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); + if (tracker.GetThreadFriendlyMemAlloc()) { VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread)); } - else #endif if (ManagedMemoryEnabled) @@ -215,7 +190,18 @@ void* CudaAllocator::Allocate(std::size_t numBytes) void* CudaAllocator::AllocateUnManaged(std::size_t numBytes) { void* ptr = nullptr; - VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes)); +#if CUDART_VERSION >= 11030 + const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); + if (tracker.GetThreadFriendlyMemAlloc()) + { + VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread)); + } + else +#endif + { + VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes)); + } + { VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Allocated CUDA array of %s at %p.", @@ -237,17 +223,16 @@ void CudaAllocator::Free(void* ptr) VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr); - if (UseSyncMemoryAlloc) +#if CUDART_VERSION >= 11030 + const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); + if (tracker.GetThreadFriendlyMemAlloc()) { - VTKM_CUDA_CALL(cudaFree(ptr)); + VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread)); } else - { -#if CUDART_VERSION >= 11030 - VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread)); -#else - VTKM_CUDA_CALL(cudaFree(ptr)); #endif + { + VTKM_CUDA_CALL(cudaFree(ptr)); } } diff --git a/vtkm/filter/NewFilter.cxx b/vtkm/filter/NewFilter.cxx index 7365910e9..847b591dd 100644 --- a/vtkm/filter/NewFilter.cxx +++ b/vtkm/filter/NewFilter.cxx @@ -21,14 +21,14 @@ namespace vtkm namespace filter { -NewFilter::~NewFilter() = default; - -void NewFilter::RunFilter(NewFilter* self, - vtkm::filter::DataSetQueue& input, - vtkm::filter::DataSetQueue& output) +namespace +{ +void RunFilter(NewFilter* self, + vtkm::filter::DataSetQueue& input, + vtkm::filter::DataSetQueue& output) { auto& tracker = vtkm::cont::GetRuntimeDeviceTracker(); - tracker.SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterTagAny{}, true); + tracker.SetThreadFriendlyMemAlloc(true); std::pair task; while (input.GetTask(task)) @@ -40,6 +40,9 @@ void NewFilter::RunFilter(NewFilter* self, vtkm::cont::Algorithm::Synchronize(); } +} + +NewFilter::~NewFilter() = default; bool NewFilter::CanThread() const { @@ -63,11 +66,8 @@ vtkm::cont::PartitionedDataSet NewFilter::DoExecutePartitions( std::vector> futures(static_cast(numThreads)); for (std::size_t i = 0; i < static_cast(numThreads); i++) { - auto f = 
std::async(std::launch::async, - vtkm::filter::NewFilter::RunFilter, - this, - std::ref(inputQueue), - std::ref(outputQueue)); + auto f = std::async( + std::launch::async, RunFilter, this, std::ref(inputQueue), std::ref(outputQueue)); futures[i] = std::move(f); } diff --git a/vtkm/filter/NewFilter.h b/vtkm/filter/NewFilter.h index dc21c259b..b312d8da2 100644 --- a/vtkm/filter/NewFilter.h +++ b/vtkm/filter/NewFilter.h @@ -446,11 +446,6 @@ private: } } - VTKM_CONT - static void RunFilter(NewFilter* self, - vtkm::filter::DataSetQueue& input, - vtkm::filter::DataSetQueue& output); - VTKM_CONT virtual vtkm::Id DetermineNumberOfThreads(const vtkm::cont::PartitionedDataSet& input); diff --git a/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx b/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx index 8353b3a4d..6ee313c1b 100644 --- a/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx +++ b/vtkm/filter/testing/UnitTestMultiBlockFilter.cxx @@ -10,7 +10,6 @@ #include #include -#include #include #include @@ -87,7 +86,7 @@ void ValidateResults(const vtkm::cont::PartitionedDataSet& truth, } //namespace -void RunMBTests() +void TestMultiBlockFilter() { vtkm::cont::PartitionedDataSet pds; @@ -160,19 +159,6 @@ void RunMBTests() ValidateResults(results[0], results[1], "gradient", false); } -void TestMultiBlockFilter() -{ - //For cuda, run it with and without async memory allocation. -#ifdef VTKM_CUDA - vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator(); - RunMBTests(); - vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator(); - RunMBTests(); -#else - RunMBTests(); -#endif -} - int UnitTestMultiBlockFilter(int argc, char* argv[]) { return vtkm::cont::testing::Testing::Run(TestMultiBlockFilter, argc, argv);
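
With this change, the choice between synchronous and asynchronous (stream-ordered) CUDA allocation is driven by the per-thread RuntimeDeviceTracker instead of the removed CudaAllocator::ForceSyncMemoryAllocator()/ForceAsyncMemoryAllocator() switches. Below is a minimal sketch of how calling code (for example, the threaded multiblock example program) might toggle the new flag; the ConfigureAllocation helper and its syncMemAlloc parameter are illustrative names and not part of this patch.

#include <vtkm/cont/RuntimeDeviceTracker.h>

#include <iostream>

// Hypothetical helper (not from this patch): enable or disable thread-friendly
// (asynchronous, per-thread-stream) allocation for the calling thread.
void ConfigureAllocation(bool syncMemAlloc)
{
  // GetRuntimeDeviceTracker() returns the calling thread's tracker, which is
  // why RunFilter sets this flag inside each worker thread.
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();

  // After this patch the setting is a single bool on the tracker rather than a
  // per-device flag or a CudaAllocator-level switch.
  tracker.SetThreadFriendlyMemAlloc(!syncMemAlloc);

  if (tracker.GetThreadFriendlyMemAlloc())
  {
    // For CUDA >= 11.3 builds, CudaAllocator::Allocate, AllocateUnManaged, and
    // Free will use cudaMallocAsync / cudaFreeAsync on cudaStreamPerThread.
    std::cout << " Task: Sync memory alloc = OFF" << std::endl;
  }
  else
  {
    std::cout << " Task: Sync memory alloc = ON" << std::endl;
  }
}

Note that the tracker flag only changes behavior in the allocator paths guarded by CUDART_VERSION >= 11030; on older CUDA toolkits the allocator falls back to cudaMalloc / cudaFree (or the managed-memory path) regardless of the setting.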