mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-16 17:22:55 +00:00
Put sync memory into RuntimeDeviceTracker
This commit is contained in:
parent
8455972a09
commit
baac7e1fb9
@ -9,7 +9,6 @@
|
|||||||
//============================================================================
|
//============================================================================
|
||||||
|
|
||||||
#include <vtkm/cont/Initialize.h>
|
#include <vtkm/cont/Initialize.h>
|
||||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
|
||||||
#include <vtkm/filter/contour/Contour.h>
|
#include <vtkm/filter/contour/Contour.h>
|
||||||
#include <vtkm/io/VTKDataSetReader.h>
|
#include <vtkm/io/VTKDataSetReader.h>
|
||||||
#include <vtkm/io/VTKDataSetWriter.h>
|
#include <vtkm/io/VTKDataSetWriter.h>
|
||||||
@ -211,12 +210,12 @@ int main(int argc, char** argv)
|
|||||||
{
|
{
|
||||||
if (opts.SyncMemAlloc)
|
if (opts.SyncMemAlloc)
|
||||||
{
|
{
|
||||||
vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
|
//vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
|
||||||
std::cout << " Task: Sync memory alloc = ON" << std::endl;
|
std::cout << " Task: Sync memory alloc = ON" << std::endl;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator();
|
//vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator();
|
||||||
std::cout << " Task: Sync memory alloc = OFF" << std::endl;
|
std::cout << " Task: Sync memory alloc = OFF" << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -41,30 +41,15 @@ struct RuntimeDeviceTrackerInternals
|
|||||||
bool GetRuntimeAllowed(std::size_t deviceId) const { return this->RuntimeAllowed[deviceId]; }
|
bool GetRuntimeAllowed(std::size_t deviceId) const { return this->RuntimeAllowed[deviceId]; }
|
||||||
void SetRuntimeAllowed(std::size_t deviceId, bool flag) { this->RuntimeAllowed[deviceId] = flag; }
|
void SetRuntimeAllowed(std::size_t deviceId, bool flag) { this->RuntimeAllowed[deviceId] = flag; }
|
||||||
|
|
||||||
bool GetThreadFriendlyMemAlloc(std::size_t deviceId) const
|
bool GetThreadFriendlyMemAlloc() const { return this->ThreadFriendlyMemAlloc; }
|
||||||
{
|
void SetThreadFriendlyMemAlloc(bool flag) { this->ThreadFriendlyMemAlloc = flag; }
|
||||||
return this->ThreadFriendlyMemAlloc[deviceId];
|
|
||||||
}
|
|
||||||
void SetThreadFriendlyMemAlloc(std::size_t deviceId, bool flag)
|
|
||||||
{
|
|
||||||
this->ThreadFriendlyMemAlloc[deviceId] = flag;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ResetRuntimeAllowed()
|
void ResetRuntimeAllowed()
|
||||||
{
|
{
|
||||||
std::fill_n(this->RuntimeAllowed, VTKM_MAX_DEVICE_ADAPTER_ID, false);
|
std::fill_n(this->RuntimeAllowed, VTKM_MAX_DEVICE_ADAPTER_ID, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ResetThreadFriendlyMemAlloc()
|
void Reset() { this->ResetRuntimeAllowed(); }
|
||||||
{
|
|
||||||
std::fill_n(this->ThreadFriendlyMemAlloc, VTKM_MAX_DEVICE_ADAPTER_ID, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Reset()
|
|
||||||
{
|
|
||||||
this->ResetRuntimeAllowed();
|
|
||||||
this->ResetThreadFriendlyMemAlloc();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void CopyFrom(const RuntimeDeviceTrackerInternals* v)
|
void CopyFrom(const RuntimeDeviceTrackerInternals* v)
|
||||||
@ -72,14 +57,13 @@ private:
|
|||||||
std::copy(std::cbegin(v->RuntimeAllowed),
|
std::copy(std::cbegin(v->RuntimeAllowed),
|
||||||
std::cend(v->RuntimeAllowed),
|
std::cend(v->RuntimeAllowed),
|
||||||
std::begin(this->RuntimeAllowed));
|
std::begin(this->RuntimeAllowed));
|
||||||
std::copy(std::cbegin(v->ThreadFriendlyMemAlloc),
|
this->SetThreadFriendlyMemAlloc(v->GetThreadFriendlyMemAlloc());
|
||||||
std::cend(v->ThreadFriendlyMemAlloc),
|
|
||||||
std::begin(this->ThreadFriendlyMemAlloc));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool RuntimeAllowed[VTKM_MAX_DEVICE_ADAPTER_ID];
|
bool RuntimeAllowed[VTKM_MAX_DEVICE_ADAPTER_ID];
|
||||||
bool ThreadFriendlyMemAlloc[VTKM_MAX_DEVICE_ADAPTER_ID];
|
bool ThreadFriendlyMemAlloc = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CONT
|
VTKM_CONT
|
||||||
@ -130,24 +114,9 @@ bool RuntimeDeviceTracker::CanRunOn(vtkm::cont::DeviceAdapterId deviceId) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CONT
|
VTKM_CONT
|
||||||
bool RuntimeDeviceTracker::GetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId) const
|
bool RuntimeDeviceTracker::GetThreadFriendlyMemAlloc() const
|
||||||
{
|
{
|
||||||
if (deviceId == vtkm::cont::DeviceAdapterTagAny{})
|
return this->Internals->GetThreadFriendlyMemAlloc();
|
||||||
{ //If at least a single device is enabled, than any device is enabled
|
|
||||||
for (vtkm::Int8 i = 1; i < VTKM_MAX_DEVICE_ADAPTER_ID; ++i)
|
|
||||||
{
|
|
||||||
if (this->Internals->GetThreadFriendlyMemAlloc(static_cast<std::size_t>(i)))
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
this->CheckDevice(deviceId);
|
|
||||||
return this->Internals->GetThreadFriendlyMemAlloc(deviceId.GetValue());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CONT
|
VTKM_CONT
|
||||||
@ -159,12 +128,9 @@ void RuntimeDeviceTracker::SetDeviceState(vtkm::cont::DeviceAdapterId deviceId,
|
|||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CONT
|
VTKM_CONT
|
||||||
void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId,
|
void RuntimeDeviceTracker::SetThreadFriendlyMemAlloc(bool state)
|
||||||
bool state)
|
|
||||||
{
|
{
|
||||||
this->CheckDevice(deviceId);
|
this->Internals->SetThreadFriendlyMemAlloc(state);
|
||||||
|
|
||||||
this->Internals->SetThreadFriendlyMemAlloc(deviceId.GetValue(), state);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CONT void RuntimeDeviceTracker::ResetDevice(vtkm::cont::DeviceAdapterId deviceId)
|
VTKM_CONT void RuntimeDeviceTracker::ResetDevice(vtkm::cont::DeviceAdapterId deviceId)
|
||||||
|
@ -113,8 +113,8 @@ public:
|
|||||||
/// \brief Get/Set use of thread-friendly memory allocation for a device.
|
/// \brief Get/Set use of thread-friendly memory allocation for a device.
|
||||||
///
|
///
|
||||||
///
|
///
|
||||||
VTKM_CONT bool GetThreadFriendlyMemAlloc(DeviceAdapterId deviceId) const;
|
VTKM_CONT bool GetThreadFriendlyMemAlloc() const;
|
||||||
VTKM_CONT void SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterId deviceId, bool state);
|
VTKM_CONT void SetThreadFriendlyMemAlloc(bool state);
|
||||||
|
|
||||||
/// \brief Copies the state from the given device.
|
/// \brief Copies the state from the given device.
|
||||||
///
|
///
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <vtkm/cont/Logging.h>
|
#include <vtkm/cont/Logging.h>
|
||||||
#include <vtkm/cont/RuntimeDeviceInformation.h>
|
#include <vtkm/cont/RuntimeDeviceInformation.h>
|
||||||
|
#include <vtkm/cont/RuntimeDeviceTracker.h>
|
||||||
#include <vtkm/cont/cuda/ErrorCuda.h>
|
#include <vtkm/cont/cuda/ErrorCuda.h>
|
||||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
||||||
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
|
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
|
||||||
@ -47,9 +48,6 @@ static bool ManagedMemoryEnabled = false;
|
|||||||
// True if concurrent pagable managed memory is supported by the machines hardware.
|
// True if concurrent pagable managed memory is supported by the machines hardware.
|
||||||
static bool HardwareSupportsManagedMemory = false;
|
static bool HardwareSupportsManagedMemory = false;
|
||||||
|
|
||||||
// True if using syncronous memory allocator. Managed memory must be off to use this.
|
|
||||||
static thread_local bool UseSyncMemoryAlloc = true;
|
|
||||||
|
|
||||||
// Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
|
// Avoid overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
|
||||||
// This value should be > 0 or else these functions will error out.
|
// This value should be > 0 or else these functions will error out.
|
||||||
static std::size_t Threshold = 1 << 20;
|
static std::size_t Threshold = 1 << 20;
|
||||||
@ -106,29 +104,6 @@ void CudaAllocator::ForceManagedMemoryOn()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void CudaAllocator::ForceSyncMemoryAllocator()
|
|
||||||
{
|
|
||||||
UseSyncMemoryAlloc = true;
|
|
||||||
VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using synchronous memory allocator.");
|
|
||||||
}
|
|
||||||
|
|
||||||
void CudaAllocator::ForceAsyncMemoryAllocator()
|
|
||||||
{
|
|
||||||
#if CUDART_VERSION >= 11030
|
|
||||||
if (ManagedMemoryEnabled)
|
|
||||||
{
|
|
||||||
CudaAllocator::ForceManagedMemoryOff();
|
|
||||||
VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
|
|
||||||
"CudaAllocator turning off managed memory for asynchronous memory allocation");
|
|
||||||
}
|
|
||||||
UseSyncMemoryAlloc = false;
|
|
||||||
VTKM_LOG_F(vtkm::cont::LogLevel::Warn, "CudaAllocator using asynchronous memory allocator.");
|
|
||||||
#else
|
|
||||||
VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
|
|
||||||
"Asynchronous memory allocator not supported for cuda version < 11.3");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CudaAllocator::IsDevicePointer(const void* ptr)
|
bool CudaAllocator::IsDevicePointer(const void* ptr)
|
||||||
{
|
{
|
||||||
CudaAllocator::Initialize();
|
CudaAllocator::Initialize();
|
||||||
@ -186,11 +161,11 @@ void* CudaAllocator::Allocate(std::size_t numBytes)
|
|||||||
|
|
||||||
void* ptr = nullptr;
|
void* ptr = nullptr;
|
||||||
#if CUDART_VERSION >= 11030
|
#if CUDART_VERSION >= 11030
|
||||||
if (!UseSyncMemoryAlloc)
|
const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
|
||||||
|
if (tracker.GetThreadFriendlyMemAlloc())
|
||||||
{
|
{
|
||||||
VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread));
|
VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
|
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
if (ManagedMemoryEnabled)
|
if (ManagedMemoryEnabled)
|
||||||
@ -215,7 +190,18 @@ void* CudaAllocator::Allocate(std::size_t numBytes)
|
|||||||
void* CudaAllocator::AllocateUnManaged(std::size_t numBytes)
|
void* CudaAllocator::AllocateUnManaged(std::size_t numBytes)
|
||||||
{
|
{
|
||||||
void* ptr = nullptr;
|
void* ptr = nullptr;
|
||||||
VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
|
#if CUDART_VERSION >= 11030
|
||||||
|
const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
|
||||||
|
if (tracker.GetThreadFriendlyMemAlloc())
|
||||||
|
{
|
||||||
|
VTKM_CUDA_CALL(cudaMallocAsync(&ptr, numBytes, cudaStreamPerThread));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
VTKM_LOG_F(vtkm::cont::LogLevel::MemExec,
|
VTKM_LOG_F(vtkm::cont::LogLevel::MemExec,
|
||||||
"Allocated CUDA array of %s at %p.",
|
"Allocated CUDA array of %s at %p.",
|
||||||
@ -237,17 +223,16 @@ void CudaAllocator::Free(void* ptr)
|
|||||||
|
|
||||||
VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr);
|
VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr);
|
||||||
|
|
||||||
if (UseSyncMemoryAlloc)
|
#if CUDART_VERSION >= 11030
|
||||||
|
const auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
|
||||||
|
if (tracker.GetThreadFriendlyMemAlloc())
|
||||||
{
|
{
|
||||||
VTKM_CUDA_CALL(cudaFree(ptr));
|
VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
|
||||||
#if CUDART_VERSION >= 11030
|
|
||||||
VTKM_CUDA_CALL(cudaFreeAsync(ptr, cudaStreamPerThread));
|
|
||||||
#else
|
|
||||||
VTKM_CUDA_CALL(cudaFree(ptr));
|
|
||||||
#endif
|
#endif
|
||||||
|
{
|
||||||
|
VTKM_CUDA_CALL(cudaFree(ptr));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -21,14 +21,14 @@ namespace vtkm
|
|||||||
namespace filter
|
namespace filter
|
||||||
{
|
{
|
||||||
|
|
||||||
NewFilter::~NewFilter() = default;
|
namespace
|
||||||
|
{
|
||||||
void NewFilter::RunFilter(NewFilter* self,
|
void RunFilter(NewFilter* self,
|
||||||
vtkm::filter::DataSetQueue& input,
|
vtkm::filter::DataSetQueue& input,
|
||||||
vtkm::filter::DataSetQueue& output)
|
vtkm::filter::DataSetQueue& output)
|
||||||
{
|
{
|
||||||
auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
|
auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
|
||||||
tracker.SetThreadFriendlyMemAlloc(vtkm::cont::DeviceAdapterTagAny{}, true);
|
tracker.SetThreadFriendlyMemAlloc(true);
|
||||||
|
|
||||||
std::pair<vtkm::Id, vtkm::cont::DataSet> task;
|
std::pair<vtkm::Id, vtkm::cont::DataSet> task;
|
||||||
while (input.GetTask(task))
|
while (input.GetTask(task))
|
||||||
@ -40,6 +40,9 @@ void NewFilter::RunFilter(NewFilter* self,
|
|||||||
vtkm::cont::Algorithm::Synchronize();
|
vtkm::cont::Algorithm::Synchronize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
NewFilter::~NewFilter() = default;
|
||||||
|
|
||||||
bool NewFilter::CanThread() const
|
bool NewFilter::CanThread() const
|
||||||
{
|
{
|
||||||
@ -63,11 +66,8 @@ vtkm::cont::PartitionedDataSet NewFilter::DoExecutePartitions(
|
|||||||
std::vector<std::future<void>> futures(static_cast<std::size_t>(numThreads));
|
std::vector<std::future<void>> futures(static_cast<std::size_t>(numThreads));
|
||||||
for (std::size_t i = 0; i < static_cast<std::size_t>(numThreads); i++)
|
for (std::size_t i = 0; i < static_cast<std::size_t>(numThreads); i++)
|
||||||
{
|
{
|
||||||
auto f = std::async(std::launch::async,
|
auto f = std::async(
|
||||||
vtkm::filter::NewFilter::RunFilter,
|
std::launch::async, RunFilter, this, std::ref(inputQueue), std::ref(outputQueue));
|
||||||
this,
|
|
||||||
std::ref(inputQueue),
|
|
||||||
std::ref(outputQueue));
|
|
||||||
futures[i] = std::move(f);
|
futures[i] = std::move(f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -446,11 +446,6 @@ private:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
VTKM_CONT
|
|
||||||
static void RunFilter(NewFilter* self,
|
|
||||||
vtkm::filter::DataSetQueue& input,
|
|
||||||
vtkm::filter::DataSetQueue& output);
|
|
||||||
|
|
||||||
VTKM_CONT
|
VTKM_CONT
|
||||||
virtual vtkm::Id DetermineNumberOfThreads(const vtkm::cont::PartitionedDataSet& input);
|
virtual vtkm::Id DetermineNumberOfThreads(const vtkm::cont::PartitionedDataSet& input);
|
||||||
|
|
||||||
|
@ -10,7 +10,6 @@
|
|||||||
|
|
||||||
#include <vtkm/Math.h>
|
#include <vtkm/Math.h>
|
||||||
#include <vtkm/cont/DataSet.h>
|
#include <vtkm/cont/DataSet.h>
|
||||||
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
|
|
||||||
#include <vtkm/cont/testing/MakeTestDataSet.h>
|
#include <vtkm/cont/testing/MakeTestDataSet.h>
|
||||||
#include <vtkm/cont/testing/Testing.h>
|
#include <vtkm/cont/testing/Testing.h>
|
||||||
|
|
||||||
@ -87,7 +86,7 @@ void ValidateResults(const vtkm::cont::PartitionedDataSet& truth,
|
|||||||
} //namespace
|
} //namespace
|
||||||
|
|
||||||
|
|
||||||
void RunMBTests()
|
void TestMultiBlockFilter()
|
||||||
{
|
{
|
||||||
vtkm::cont::PartitionedDataSet pds;
|
vtkm::cont::PartitionedDataSet pds;
|
||||||
|
|
||||||
@ -160,19 +159,6 @@ void RunMBTests()
|
|||||||
ValidateResults(results[0], results[1], "gradient", false);
|
ValidateResults(results[0], results[1], "gradient", false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void TestMultiBlockFilter()
|
|
||||||
{
|
|
||||||
//For cuda, run it with and without async memory allocation.
|
|
||||||
#ifdef VTKM_CUDA
|
|
||||||
vtkm::cont::cuda::internal::CudaAllocator::ForceSyncMemoryAllocator();
|
|
||||||
RunMBTests();
|
|
||||||
vtkm::cont::cuda::internal::CudaAllocator::ForceAsyncMemoryAllocator();
|
|
||||||
RunMBTests();
|
|
||||||
#else
|
|
||||||
RunMBTests();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
int UnitTestMultiBlockFilter(int argc, char* argv[])
|
int UnitTestMultiBlockFilter(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
return vtkm::cont::testing::Testing::Run(TestMultiBlockFilter, argc, argv);
|
return vtkm::cont::testing::Testing::Run(TestMultiBlockFilter, argc, argv);
|
||||||
|
Loading…
Reference in New Issue
Block a user