mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-16 17:22:55 +00:00
Merge topic 'deallocate-after-finalize'
9855db096 Add test for array and datas that are cleaned up after finalize Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Sujin Philip <sujin.philip@kitware.com> Merge-request: !2801
This commit is contained in:
commit
c7053f584c
14
docs/changelog/deallocate-after-initialize.md
Normal file
14
docs/changelog/deallocate-after-initialize.md
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Add test for arrays and data sets that are cleaned up after finalize
|
||||||
|
|
||||||
|
Arrays might be deallocated from a device after the
|
||||||
|
device is closed. This can happen, for example, when an `ArrayHandle` is
|
||||||
|
declared globally. It gets constructed before VTK-m is initialized. This
|
||||||
|
is OK as long as you do not otherwise use it until VTK-m is initialized.
|
||||||
|
However, if you use that `ArrayHandle` to move data to a device and that
|
||||||
|
data is left on the device when the device closes, then the
|
||||||
|
`ArrayHandle` will be left holding a reference to invalid device memory
|
||||||
|
once the device is shut down. This can cause problems when the
|
||||||
|
`ArrayHandle` destructs itself and attempts to release this memory.
|
||||||
|
|
||||||
|
The VTK-m devices should gracefully handle deallocations that happen
|
||||||
|
after device shutdown.
|
@ -18,6 +18,7 @@
|
|||||||
#include <vtkm/cont/cuda/internal/RuntimeDeviceConfigurationCuda.h>
|
#include <vtkm/cont/cuda/internal/RuntimeDeviceConfigurationCuda.h>
|
||||||
#define NO_VTKM_MANAGED_MEMORY "NO_VTKM_MANAGED_MEMORY"
|
#define NO_VTKM_MANAGED_MEMORY "NO_VTKM_MANAGED_MEMORY"
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -28,10 +29,12 @@ VTKM_THIRDPARTY_POST_INCLUDE
|
|||||||
// These static vars are in an anon namespace to work around MSVC linker issues.
|
// These static vars are in an anon namespace to work around MSVC linker issues.
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
#if CUDART_VERSION >= 8000
|
|
||||||
// Has CudaAllocator::Initialize been called by any thread?
|
// Has CudaAllocator::Initialize been called by any thread?
|
||||||
static std::once_flag IsInitialized;
|
static std::once_flag IsInitializedFlag;
|
||||||
#endif
|
|
||||||
|
// Used to keep track of whether the CUDA allocator has been initialized and CUDA has not
|
||||||
|
// been finalized (since CUDA does not seem to track that for us).
|
||||||
|
static bool IsInitialized = false;
|
||||||
|
|
||||||
// Holds how VTK-m currently allocates memory.
|
// Holds how VTK-m currently allocates memory.
|
||||||
// When VTK-m is initialized we set this based on the hardware support ( HardwareSupportsManagedMemory ).
|
// When VTK-m is initialized we set this based on the hardware support ( HardwareSupportsManagedMemory ).
|
||||||
@ -183,12 +186,28 @@ void* CudaAllocator::AllocateUnManaged(std::size_t numBytes)
|
|||||||
|
|
||||||
void CudaAllocator::Free(void* ptr)
|
void CudaAllocator::Free(void* ptr)
|
||||||
{
|
{
|
||||||
|
if (!IsInitialized)
|
||||||
|
{
|
||||||
|
// Since the data was successfully allocated, it is a fair assumption that the CUDA
|
||||||
|
// runtime has been finalized and a global object is trying to destroy itself. Since
|
||||||
|
// CUDA already cleaned up all memory for program exit, we can ignore this free.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr);
|
VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr);
|
||||||
VTKM_CUDA_CALL(cudaFree(ptr));
|
VTKM_CUDA_CALL(cudaFree(ptr));
|
||||||
}
|
}
|
||||||
|
|
||||||
void CudaAllocator::FreeDeferred(void* ptr, std::size_t numBytes)
|
void CudaAllocator::FreeDeferred(void* ptr, std::size_t numBytes)
|
||||||
{
|
{
|
||||||
|
if (!IsInitialized)
|
||||||
|
{
|
||||||
|
// Since the data was successfully allocated, it is a fair assumption that the CUDA
|
||||||
|
// runtime has been finalized and a global object is trying to destroy itself. Since
|
||||||
|
// CUDA already cleaned up all memory for program exit, we can ignore this free.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
static std::mutex deferredMutex;
|
static std::mutex deferredMutex;
|
||||||
static std::vector<void*> deferredPointers;
|
static std::vector<void*> deferredPointers;
|
||||||
static std::size_t deferredSize = 0;
|
static std::size_t deferredSize = 0;
|
||||||
@ -225,12 +244,10 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
|
|||||||
{
|
{
|
||||||
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
||||||
{
|
{
|
||||||
#if CUDART_VERSION >= 8000
|
|
||||||
// TODO these hints need to be benchmarked and adjusted once we start
|
// TODO these hints need to be benchmarked and adjusted once we start
|
||||||
// sharing the pointers between cont/exec
|
// sharing the pointers between cont/exec
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
|
||||||
#endif // CUDA >= 8.0
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -238,7 +255,6 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
|
|||||||
{
|
{
|
||||||
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
||||||
{
|
{
|
||||||
#if CUDART_VERSION >= 8000
|
|
||||||
vtkm::Id dev;
|
vtkm::Id dev;
|
||||||
vtkm::cont::RuntimeDeviceInformation()
|
vtkm::cont::RuntimeDeviceInformation()
|
||||||
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
|
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
|
||||||
@ -247,7 +263,6 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
|
|||||||
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
|
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||||
#endif // CUDA >= 8.0
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -255,7 +270,6 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
|
|||||||
{
|
{
|
||||||
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
||||||
{
|
{
|
||||||
#if CUDART_VERSION >= 8000
|
|
||||||
vtkm::Id dev;
|
vtkm::Id dev;
|
||||||
vtkm::cont::RuntimeDeviceInformation()
|
vtkm::cont::RuntimeDeviceInformation()
|
||||||
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
|
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
|
||||||
@ -264,7 +278,6 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
|
|||||||
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||||
#endif // CUDA >= 8.0
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -272,7 +285,6 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
|
|||||||
{
|
{
|
||||||
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
if (IsManagedPointer(ptr) && numBytes >= Threshold)
|
||||||
{
|
{
|
||||||
#if CUDART_VERSION >= 8000
|
|
||||||
vtkm::Id dev;
|
vtkm::Id dev;
|
||||||
vtkm::cont::RuntimeDeviceInformation()
|
vtkm::cont::RuntimeDeviceInformation()
|
||||||
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
|
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
|
||||||
@ -281,14 +293,12 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
|
|||||||
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
|
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
|
||||||
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
|
||||||
#endif // CUDA >= 8.0
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void CudaAllocator::Initialize()
|
void CudaAllocator::Initialize()
|
||||||
{
|
{
|
||||||
#if CUDART_VERSION >= 8000
|
std::call_once(IsInitializedFlag, []() {
|
||||||
std::call_once(IsInitialized, []() {
|
|
||||||
auto cudaDeviceConfig = dynamic_cast<
|
auto cudaDeviceConfig = dynamic_cast<
|
||||||
vtkm::cont::internal::RuntimeDeviceConfiguration<vtkm::cont::DeviceAdapterTagCuda>&>(
|
vtkm::cont::internal::RuntimeDeviceConfiguration<vtkm::cont::DeviceAdapterTagCuda>&>(
|
||||||
vtkm::cont::RuntimeDeviceInformation{}.GetRuntimeConfiguration(
|
vtkm::cont::RuntimeDeviceInformation{}.GetRuntimeConfiguration(
|
||||||
@ -334,8 +344,17 @@ void CudaAllocator::Initialize()
|
|||||||
vtkm::cont::LogLevel::Info,
|
vtkm::cont::LogLevel::Info,
|
||||||
"CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
|
"CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CUDA does not give any indication of whether it is still running, but we have found from
|
||||||
|
// experience that it finalizes itself during program termination. However, the user might
|
||||||
|
// have their own objects being cleaned up during termination after CUDA. We need a flag
|
||||||
|
// to catch if this happens after CUDA finalizes itself. We will set this flag to true now
|
||||||
|
// and false on termination. Because we are creating the atexit call here (after CUDA must
|
||||||
|
// have initialized itself), C++ will require our function that unsets the flag to run
|
||||||
|
// before CUDA finalizes.
|
||||||
|
IsInitialized = true;
|
||||||
|
std::atexit([]() { IsInitialized = false; });
|
||||||
});
|
});
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -39,8 +39,16 @@ void* Allocate(std::size_t size)
|
|||||||
|
|
||||||
void Free(void* ptr)
|
void Free(void* ptr)
|
||||||
{
|
{
|
||||||
GetExecutionSpaceInstance().fence();
|
if (Kokkos::is_initialized())
|
||||||
Kokkos::kokkos_free<ExecutionSpace::memory_space>(ptr);
|
{
|
||||||
|
GetExecutionSpaceInstance().fence();
|
||||||
|
Kokkos::kokkos_free<ExecutionSpace::memory_space>(ptr);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// It is possible that a Buffer instance might try to free its Kokkos data after
|
||||||
|
// Kokkos has been finalized. If that is the case, silently do nothing.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void* Reallocate(void* ptr, std::size_t newSize)
|
void* Reallocate(void* ptr, std::size_t newSize)
|
||||||
|
@ -61,6 +61,7 @@ set(unit_tests
|
|||||||
UnitTestError.cxx
|
UnitTestError.cxx
|
||||||
UnitTestFieldRangeCompute.cxx
|
UnitTestFieldRangeCompute.cxx
|
||||||
UnitTestInitialize.cxx
|
UnitTestInitialize.cxx
|
||||||
|
UnitTestLateDeallocate.cxx
|
||||||
UnitTestLogging.cxx
|
UnitTestLogging.cxx
|
||||||
UnitTestMergePartitionedDataSet.cxx
|
UnitTestMergePartitionedDataSet.cxx
|
||||||
UnitTestMoveConstructors.cxx
|
UnitTestMoveConstructors.cxx
|
||||||
|
72
vtkm/cont/testing/UnitTestLateDeallocate.cxx
Normal file
72
vtkm/cont/testing/UnitTestLateDeallocate.cxx
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
//============================================================================
|
||||||
|
// Copyright (c) Kitware, Inc.
|
||||||
|
// All rights reserved.
|
||||||
|
// See LICENSE.txt for details.
|
||||||
|
//
|
||||||
|
// This software is distributed WITHOUT ANY WARRANTY; without even
|
||||||
|
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||||
|
// PURPOSE. See the above copyright notice for more information.
|
||||||
|
//============================================================================
|
||||||
|
|
||||||
|
#include <vtkm/cont/ArrayCopy.h>
|
||||||
|
#include <vtkm/cont/ArrayHandle.h>
|
||||||
|
#include <vtkm/cont/ArrayHandleIndex.h>
|
||||||
|
#include <vtkm/cont/CellSetExplicit.h>
|
||||||
|
#include <vtkm/cont/DataSet.h>
|
||||||
|
#include <vtkm/cont/DeviceAdapterTag.h>
|
||||||
|
#include <vtkm/cont/RuntimeDeviceTracker.h>
|
||||||
|
#include <vtkm/cont/Token.h>
|
||||||
|
|
||||||
|
#include <vtkm/cont/testing/MakeTestDataSet.h>
|
||||||
|
#include <vtkm/cont/testing/Testing.h>
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
// These should be constructed early in program startup and destroyed late on
|
||||||
|
// program shutdown. They will likely be destroyed after any device is cleaned up.
|
||||||
|
struct Data
|
||||||
|
{
|
||||||
|
vtkm::cont::ArrayHandle<vtkm::Id> Array;
|
||||||
|
vtkm::cont::DataSet DataSet;
|
||||||
|
|
||||||
|
~Data() { std::cout << "Destroying global data." << std::endl; }
|
||||||
|
};
|
||||||
|
Data Globals;
|
||||||
|
|
||||||
|
void AllocateDeviceMemory()
|
||||||
|
{
|
||||||
|
// Load data.
|
||||||
|
vtkm::cont::ArrayCopy(vtkm::cont::ArrayHandleIndex(10), Globals.Array);
|
||||||
|
Globals.DataSet = vtkm::cont::testing::MakeTestDataSet{}.Make3DExplicitDataSet0();
|
||||||
|
|
||||||
|
vtkm::cont::CellSetExplicit<> cellSet;
|
||||||
|
Globals.DataSet.GetCellSet().AsCellSet(cellSet);
|
||||||
|
|
||||||
|
// Put data on devices.
|
||||||
|
auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
|
||||||
|
for (vtkm::Int8 deviceIndex = 0; deviceIndex < VTKM_MAX_DEVICE_ADAPTER_ID; ++deviceIndex)
|
||||||
|
{
|
||||||
|
vtkm::cont::DeviceAdapterId device = vtkm::cont::make_DeviceAdapterId(deviceIndex);
|
||||||
|
if (device.IsValueValid() && tracker.CanRunOn(device))
|
||||||
|
{
|
||||||
|
std::cout << "Loading data on " << device.GetName() << std::endl;
|
||||||
|
|
||||||
|
vtkm::cont::Token token;
|
||||||
|
Globals.Array.PrepareForInput(device, token);
|
||||||
|
cellSet.PrepareForInput(
|
||||||
|
device, vtkm::TopologyElementTagPoint{}, vtkm::TopologyElementTagCell{}, token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
|
int UnitTestLateDeallocate(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
return vtkm::cont::testing::Testing::Run(AllocateDeviceMemory, argc, argv);
|
||||||
|
|
||||||
|
// After this test returns, the global data structures will be deallocated. This will likely
|
||||||
|
// happen after all the devices are deallocated. You may get a warning, but you should not
|
||||||
|
// get a crash.
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user