diff --git a/docs/changelog/deallocate-after-initialize.md b/docs/changelog/deallocate-after-initialize.md
new file mode 100644
index 000000000..a8f171362
--- /dev/null
+++ b/docs/changelog/deallocate-after-initialize.md
@@ -0,0 +1,14 @@
+# Add test for arrays and data that are cleaned up after finalize
+
+Arrays might be deallocated from a device after the device is closed.
+This can happen, for example, when an `ArrayHandle` is declared
+globally. It gets constructed before VTK-m is initialized, which is OK
+as long as you do not otherwise use it until VTK-m is initialized.
+However, if you use that `ArrayHandle` to move data to a device and
+that data is still on the device when the device shuts down, the
+`ArrayHandle` is left holding a reference to invalid device memory.
+This can cause problems when the `ArrayHandle` destructs itself and
+attempts to release this memory.
+
+The VTK-m devices should gracefully handle deallocations that happen
+after device shutdown.
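To make the failure mode concrete, here is a minimal sketch of the pattern the changelog describes. It is illustrative only, not part of the patch, and it mirrors the device loop used by the new test at the end of this change:

```cpp
#include <vtkm/cont/ArrayCopy.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ArrayHandleIndex.h>
#include <vtkm/cont/DeviceAdapterTag.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/Token.h>

// Constructed during static initialization, before main() and before VTK-m is
// initialized; destroyed during static destruction, potentially after the
// device runtime has already shut itself down.
vtkm::cont::ArrayHandle<vtkm::Id> globalArray;

int main(int argc, char* argv[])
{
  vtkm::cont::Initialize(argc, argv);

  // Fill the global handle, then stage its data on every available device.
  vtkm::cont::ArrayCopy(vtkm::cont::ArrayHandleIndex(10), globalArray);
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  for (vtkm::Int8 index = 0; index < VTKM_MAX_DEVICE_ADAPTER_ID; ++index)
  {
    vtkm::cont::DeviceAdapterId device = vtkm::cont::make_DeviceAdapterId(index);
    if (device.IsValueValid() && tracker.CanRunOn(device))
    {
      vtkm::cont::Token token;
      globalArray.PrepareForInput(device, token);
    }
  }

  return 0;
  // globalArray destructs after main() returns, likely after the devices have
  // finalized. The changes below make that late deallocation a harmless no-op.
}
```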
diff --git a/vtkm/cont/cuda/internal/CudaAllocator.cu b/vtkm/cont/cuda/internal/CudaAllocator.cu
index 244d045b7..8167ea694 100644
--- a/vtkm/cont/cuda/internal/CudaAllocator.cu
+++ b/vtkm/cont/cuda/internal/CudaAllocator.cu
@@ -18,6 +18,7 @@
 #include <vtkm/cont/RuntimeDeviceInformation.h>
 
 #define NO_VTKM_MANAGED_MEMORY "NO_VTKM_MANAGED_MEMORY"
 
+#include <cstdlib>
 #include <mutex>
 #include <vector>
@@ -28,10 +29,12 @@ VTKM_THIRDPARTY_POST_INCLUDE
 // These static vars are in an anon namespace to work around MSVC linker issues.
 namespace
 {
-#if CUDART_VERSION >= 8000
 // Has CudaAllocator::Initialize been called by any thread?
-static std::once_flag IsInitialized;
-#endif
+static std::once_flag IsInitializedFlag;
+
+// Used to keep track of whether the CUDA allocator has been initialized and CUDA has not
+// been finalized (since CUDA does not seem to track that for us).
+static bool IsInitialized = false;
 
 // Holds how VTK-m currently allocates memory.
 // When VTK-m is initialized we set this based on the hardware support ( HardwareSupportsManagedMemory ).
@@ -183,12 +186,28 @@ void* CudaAllocator::AllocateUnManaged(std::size_t numBytes)
 
 void CudaAllocator::Free(void* ptr)
 {
+  if (!IsInitialized)
+  {
+    // Since the data was successfully allocated, it is a fair assumption that the CUDA
+    // runtime has been finalized and a global object is trying to destroy itself. Since
+    // CUDA already cleaned up all memory for program exit, we can ignore this free.
+    return;
+  }
+
   VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr);
   VTKM_CUDA_CALL(cudaFree(ptr));
 }
 
 void CudaAllocator::FreeDeferred(void* ptr, std::size_t numBytes)
 {
+  if (!IsInitialized)
+  {
+    // Since the data was successfully allocated, it is a fair assumption that the CUDA
+    // runtime has been finalized and a global object is trying to destroy itself. Since
+    // CUDA already cleaned up all memory for program exit, we can ignore this free.
+    return;
+  }
+
   static std::mutex deferredMutex;
   static std::vector<void*> deferredPointers;
   static std::size_t deferredSize = 0;
@@ -225,12 +244,10 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     // TODO these hints need to be benchmarked and adjusted once we start
     // sharing the pointers between cont/exec
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
@@ -238,7 +255,6 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     vtkm::Id dev;
     vtkm::cont::RuntimeDeviceInformation()
       .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
@@ -247,7 +263,6 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
     // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
@@ -255,7 +270,6 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     vtkm::Id dev;
     vtkm::cont::RuntimeDeviceInformation()
       .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
@@ -264,7 +278,6 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
     // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
@@ -272,7 +285,6 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     vtkm::Id dev;
     vtkm::cont::RuntimeDeviceInformation()
       .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
@@ -281,14 +293,12 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
     // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
 void CudaAllocator::Initialize()
 {
-#if CUDART_VERSION >= 8000
-  std::call_once(IsInitialized, []() {
+  std::call_once(IsInitializedFlag, []() {
     auto cudaDeviceConfig = dynamic_cast<
       vtkm::cont::internal::RuntimeDeviceConfiguration<vtkm::cont::DeviceAdapterTagCuda>&>(
       vtkm::cont::RuntimeDeviceInformation{}.GetRuntimeConfiguration(
@@ -334,8 +344,17 @@ void CudaAllocator::Initialize()
       vtkm::cont::LogLevel::Info,
       "CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
     }
+
+    // CUDA does not give any indication of whether it is still running, but we have found from
+    // experience that it finalizes itself during program termination. However, the user might
+    // have their own objects being cleaned up during termination after CUDA. We need a flag
+    // to catch deallocations that happen after CUDA finalizes itself. We set the flag to true
+    // now and register an atexit handler to clear it on termination. Because the handler is
+    // registered here, after CUDA must have initialized itself, it is guaranteed to run before
+    // CUDA's own teardown (atexit handlers run in reverse order of registration).
+    IsInitialized = true;
+    std::atexit([]() { IsInitialized = false; });
   });
-#endif
 }
 }
 }
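The interplay of `std::call_once`, the `IsInitialized` flag, and `std::atexit` is easy to miss across the hunks above. The following self-contained sketch shows the same guard in isolation; the names (`RuntimeAlive`, `InitializeRuntimeOnce`, `FreeDeviceMemory`) are illustrative, not VTK-m API:

```cpp
#include <cstdio>
#include <cstdlib>
#include <mutex>

namespace
{
std::once_flag InitFlag;
bool RuntimeAlive = false; // set on first use, cleared at process exit
}

void InitializeRuntimeOnce()
{
  std::call_once(InitFlag, []() {
    // First use of the device runtime happens here, which is also when the
    // runtime registers its own atexit teardown.
    RuntimeAlive = true;
    // Our handler is registered after the runtime's, so it runs before it:
    // atexit handlers execute in reverse order of registration.
    std::atexit([]() { RuntimeAlive = false; });
  });
}

void FreeDeviceMemory(void* ptr)
{
  if (!RuntimeAlive)
  {
    // The runtime has already torn down and reclaimed device memory for
    // process exit, so a late free from a global destructor is safely ignored.
    return;
  }
  std::printf("freeing %p\n", ptr); // real deallocation would go here
}
```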
diff --git a/vtkm/cont/kokkos/internal/KokkosAlloc.cxx b/vtkm/cont/kokkos/internal/KokkosAlloc.cxx
index b9e9c9f8a..37ba6f114 100644
--- a/vtkm/cont/kokkos/internal/KokkosAlloc.cxx
+++ b/vtkm/cont/kokkos/internal/KokkosAlloc.cxx
@@ -39,8 +39,16 @@ void* Allocate(std::size_t size)
 
 void Free(void* ptr)
 {
-  GetExecutionSpaceInstance().fence();
-  Kokkos::kokkos_free(ptr);
+  if (Kokkos::is_initialized())
+  {
+    GetExecutionSpaceInstance().fence();
+    Kokkos::kokkos_free(ptr);
+  }
+  else
+  {
+    // It is possible that a Buffer instance might try to free its Kokkos data after
+    // Kokkos has been finalized. If that is the case, silently do nothing.
+  }
 }
 
 void* Reallocate(void* ptr, std::size_t newSize)
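Kokkos, unlike CUDA, exposes its runtime state directly, so no extra flag is needed. A standalone sketch of the same idea; the `SafeFree` helper and the `main` driver are hypothetical, not VTK-m code:

```cpp
#include <Kokkos_Core.hpp>

// Only call into Kokkos while the runtime is alive, mirroring the guard above.
void SafeFree(void* ptr)
{
  if (Kokkos::is_initialized())
  {
    Kokkos::fence(); // make sure no outstanding work still touches ptr
    Kokkos::kokkos_free(ptr);
  }
  // After Kokkos::finalize(), device memory is already reclaimed; freeing now
  // would abort the program, so silently do nothing.
}

int main(int argc, char* argv[])
{
  Kokkos::initialize(argc, argv);
  void* ptr = Kokkos::kokkos_malloc(16);
  SafeFree(ptr); // runtime alive: really frees
  Kokkos::finalize();
  SafeFree(nullptr); // runtime finalized: no-op instead of an abort
  return 0;
}
```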
diff --git a/vtkm/cont/testing/CMakeLists.txt b/vtkm/cont/testing/CMakeLists.txt
index 1bd3499be..c7c1fab64 100644
--- a/vtkm/cont/testing/CMakeLists.txt
+++ b/vtkm/cont/testing/CMakeLists.txt
@@ -80,6 +80,7 @@ set(unit_tests
   UnitTestError.cxx
   UnitTestFieldRangeCompute.cxx
   UnitTestInitialize.cxx
+  UnitTestLateDeallocate.cxx
   UnitTestLogging.cxx
   UnitTestMergePartitionedDataSet.cxx
   UnitTestMoveConstructors.cxx
diff --git a/vtkm/cont/testing/UnitTestLateDeallocate.cxx b/vtkm/cont/testing/UnitTestLateDeallocate.cxx
new file mode 100644
index 000000000..c57245f43
--- /dev/null
+++ b/vtkm/cont/testing/UnitTestLateDeallocate.cxx
@@ -0,0 +1,72 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+
+#include <vtkm/cont/ArrayCopy.h>
+#include <vtkm/cont/ArrayHandle.h>
+#include <vtkm/cont/ArrayHandleIndex.h>
+#include <vtkm/cont/CellSetExplicit.h>
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/DeviceAdapterTag.h>
+#include <vtkm/cont/RuntimeDeviceTracker.h>
+#include <vtkm/cont/Token.h>
+
+#include <vtkm/cont/testing/MakeTestDataSet.h>
+#include <vtkm/cont/testing/Testing.h>
+
+namespace
+{
+
+// These should be constructed early in program startup and destroyed late on
+// program shutdown. They will likely be destroyed after any device is cleaned up.
+struct Data
+{
+  vtkm::cont::ArrayHandle<vtkm::Id> Array;
+  vtkm::cont::DataSet DataSet;
+
+  ~Data() { std::cout << "Destroying global data." << std::endl; }
+};
+Data Globals;
+
+void AllocateDeviceMemory()
+{
+  // Load data.
+  vtkm::cont::ArrayCopy(vtkm::cont::ArrayHandleIndex(10), Globals.Array);
+  Globals.DataSet = vtkm::cont::testing::MakeTestDataSet{}.Make3DExplicitDataSet0();
+
+  vtkm::cont::CellSetExplicit<> cellSet;
+  Globals.DataSet.GetCellSet().AsCellSet(cellSet);
+
+  // Put data on devices.
+  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
+  for (vtkm::Int8 deviceIndex = 0; deviceIndex < VTKM_MAX_DEVICE_ADAPTER_ID; ++deviceIndex)
+  {
+    vtkm::cont::DeviceAdapterId device = vtkm::cont::make_DeviceAdapterId(deviceIndex);
+    if (device.IsValueValid() && tracker.CanRunOn(device))
+    {
+      std::cout << "Loading data on " << device.GetName() << std::endl;
+
+      vtkm::cont::Token token;
+      Globals.Array.PrepareForInput(device, token);
+      cellSet.PrepareForInput(
+        device, vtkm::TopologyElementTagPoint{}, vtkm::TopologyElementTagCell{}, token);
+    }
+  }
+}
+
+} // anonymous namespace
+
+int UnitTestLateDeallocate(int argc, char* argv[])
+{
+  return vtkm::cont::testing::Testing::Run(AllocateDeviceMemory, argc, argv);
+
+  // After this test returns, the global data structures will be deallocated. This will likely
+  // happen after all the devices are deallocated. You may get a warning, but you should not
+  // get a crash.
+}
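The test works because of C++ teardown ordering: atexit handlers and static destructors run in reverse order of registration and construction. `Globals` finishes constructing before any device initializes (and registers its teardown), so its destructor necessarily runs after the devices shut down. A minimal standalone illustration of that ordering, independent of VTK-m:

```cpp
#include <cstdio>
#include <cstdlib>

struct Global
{
  ~Global() { std::puts("3. global destroyed (late, like Globals in the test)"); }
};
Global TheGlobal; // construction completes before main() runs

int main()
{
  std::puts("1. main running; a device runtime would initialize here");
  // Registered after TheGlobal finished constructing, so this handler runs
  // before TheGlobal's destructor.
  std::atexit([]() { std::puts("2. device-style atexit teardown"); });
  return 0;
}
```

This prints 1, 2, 3 in that order, matching the sequence the test relies on: the devices tear down first, and the global data is deallocated last.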