diff --git a/docs/changelog/deallocate-after-initialize.md b/docs/changelog/deallocate-after-initialize.md
new file mode 100644
index 000000000..a8f171362
--- /dev/null
+++ b/docs/changelog/deallocate-after-initialize.md
@@ -0,0 +1,14 @@
+# Add test for arrays and data that are cleaned up after finalize
+
+Arrays might be deallocated from a device after the device is closed.
+This can happen, for example, when an `ArrayHandle` is declared
+globally. It gets constructed before VTK-m is initialized, which is OK
+as long as you do not otherwise use it until VTK-m is initialized.
+However, if you use that `ArrayHandle` to move data to a device and
+that data is still on the device when the device shuts down, the
+`ArrayHandle` is left holding a reference to invalid device memory.
+This can cause problems when the `ArrayHandle` destructs itself and
+attempts to release this memory.
+
+The VTK-m devices should gracefully handle deallocations that happen
+after device shutdown.
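To make the failure mode concrete, here is a minimal sketch of the pattern the changelog describes. It is illustrative only, not part of the patch, and it mirrors the device loop used by the new test at the end of this change:

```cpp
#include <vtkm/cont/ArrayCopy.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/ArrayHandleIndex.h>
#include <vtkm/cont/DeviceAdapterTag.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/Token.h>

// Constructed during static initialization, before main() and before VTK-m is
// initialized; destroyed during static destruction, potentially after the
// device runtime has already shut itself down.
vtkm::cont::ArrayHandle<vtkm::Id> globalArray;

int main(int argc, char* argv[])
{
  vtkm::cont::Initialize(argc, argv);

  // Fill the global handle, then stage its data on every available device.
  vtkm::cont::ArrayCopy(vtkm::cont::ArrayHandleIndex(10), globalArray);
  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
  for (vtkm::Int8 index = 0; index < VTKM_MAX_DEVICE_ADAPTER_ID; ++index)
  {
    vtkm::cont::DeviceAdapterId device = vtkm::cont::make_DeviceAdapterId(index);
    if (device.IsValueValid() && tracker.CanRunOn(device))
    {
      vtkm::cont::Token token;
      globalArray.PrepareForInput(device, token);
    }
  }

  return 0;
  // globalArray destructs after main() returns, likely after the devices have
  // finalized. The changes below make that late deallocation a harmless no-op.
}
```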
diff --git a/vtkm/cont/cuda/internal/CudaAllocator.cu b/vtkm/cont/cuda/internal/CudaAllocator.cu
index 244d045b7..8167ea694 100644
--- a/vtkm/cont/cuda/internal/CudaAllocator.cu
+++ b/vtkm/cont/cuda/internal/CudaAllocator.cu
@@ -18,6 +18,7 @@
 #include <vtkm/cont/RuntimeDeviceInformation.h>
 
 #define NO_VTKM_MANAGED_MEMORY "NO_VTKM_MANAGED_MEMORY"
 
+#include <cstdlib>
 #include <mutex>
 #include <vector>
@@ -28,10 +29,12 @@ VTKM_THIRDPARTY_POST_INCLUDE
 // These static vars are in an anon namespace to work around MSVC linker issues.
 namespace
 {
-#if CUDART_VERSION >= 8000
 // Has CudaAllocator::Initialize been called by any thread?
-static std::once_flag IsInitialized;
-#endif
+static std::once_flag IsInitializedFlag;
+
+// Used to keep track of whether the CUDA allocator has been initialized and CUDA has not
+// been finalized (since CUDA does not seem to track that for us).
+static bool IsInitialized = false;
 
 // Holds how VTK-m currently allocates memory.
 // When VTK-m is initialized we set this based on the hardware support ( HardwareSupportsManagedMemory ).
@@ -183,12 +186,28 @@ void* CudaAllocator::AllocateUnManaged(std::size_t numBytes)
 
 void CudaAllocator::Free(void* ptr)
 {
+  if (!IsInitialized)
+  {
+    // Since the data was successfully allocated, it is a fair assumption that the CUDA
+    // runtime has been finalized and a global object is trying to destroy itself. Since
+    // CUDA already cleaned up all memory for program exit, we can ignore this free.
+    return;
+  }
+
   VTKM_LOG_F(vtkm::cont::LogLevel::MemExec, "Freeing CUDA allocation at %p.", ptr);
   VTKM_CUDA_CALL(cudaFree(ptr));
 }
 
 void CudaAllocator::FreeDeferred(void* ptr, std::size_t numBytes)
 {
+  if (!IsInitialized)
+  {
+    // Since the data was successfully allocated, it is a fair assumption that the CUDA
+    // runtime has been finalized and a global object is trying to destroy itself. Since
+    // CUDA already cleaned up all memory for program exit, we can ignore this free.
+    return;
+  }
+
   static std::mutex deferredMutex;
   static std::vector<void*> deferredPointers;
   static std::size_t deferredSize = 0;
@@ -225,12 +244,10 @@ void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     // TODO these hints need to be benchmarked and adjusted once we start
     // sharing the pointers between cont/exec
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
@@ -238,7 +255,6 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     vtkm::Id dev;
     vtkm::cont::RuntimeDeviceInformation()
       .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
@@ -247,7 +263,6 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
     // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
@@ -255,7 +270,6 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     vtkm::Id dev;
     vtkm::cont::RuntimeDeviceInformation()
       .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
@@ -264,7 +278,6 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
     // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
@@ -272,7 +285,6 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
 {
   if (IsManagedPointer(ptr) && numBytes >= Threshold)
   {
-#if CUDART_VERSION >= 8000
     vtkm::Id dev;
     vtkm::cont::RuntimeDeviceInformation()
       .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
@@ -281,14 +293,12 @@ void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
     // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
     VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
     VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
-#endif // CUDA >= 8.0
   }
 }
 
 void CudaAllocator::Initialize()
 {
-#if CUDART_VERSION >= 8000
-  std::call_once(IsInitialized, []() {
+  std::call_once(IsInitializedFlag, []() {
     auto cudaDeviceConfig = dynamic_cast<
       vtkm::cont::internal::RuntimeDeviceConfiguration<vtkm::cont::DeviceAdapterTagCuda>&>(
       vtkm::cont::RuntimeDeviceInformation{}.GetRuntimeConfiguration(
@@ -334,8 +344,17 @@ void CudaAllocator::Initialize()
       vtkm::cont::LogLevel::Info,
       "CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
     }
+
+    // CUDA does not give any indication of whether it is still running, but we have found from
+    // experience that it finalizes itself during program termination. However, the user might
+    // have their own objects being cleaned up during termination after CUDA. We need a flag
+    // to catch deallocations that happen after CUDA finalizes itself. We set the flag to true
+    // now and register an atexit handler to clear it on termination. Because the handler is
+    // registered here, after CUDA must have initialized itself, it is guaranteed to run before
+    // CUDA's own teardown (atexit handlers run in reverse order of registration).
+    IsInitialized = true;
+    std::atexit([]() { IsInitialized = false; });
   });
-#endif
 }
 }
 }
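The interplay of `std::call_once`, the `IsInitialized` flag, and `std::atexit` is easy to miss across the hunks above. The following self-contained sketch shows the same guard in isolation; the names (`RuntimeAlive`, `InitializeRuntimeOnce`, `FreeDeviceMemory`) are illustrative, not VTK-m API:

```cpp
#include <cstdio>
#include <cstdlib>
#include <mutex>

namespace
{
std::once_flag InitFlag;
bool RuntimeAlive = false; // set on first use, cleared at process exit
}

void InitializeRuntimeOnce()
{
  std::call_once(InitFlag, []() {
    // First use of the device runtime happens here, which is also when the
    // runtime registers its own atexit teardown.
    RuntimeAlive = true;
    // Our handler is registered after the runtime's, so it runs before it:
    // atexit handlers execute in reverse order of registration.
    std::atexit([]() { RuntimeAlive = false; });
  });
}

void FreeDeviceMemory(void* ptr)
{
  if (!RuntimeAlive)
  {
    // The runtime has already torn down and reclaimed device memory for
    // process exit, so a late free from a global destructor is safely ignored.
    return;
  }
  std::printf("freeing %p\n", ptr); // real deallocation would go here
}
```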
diff --git a/vtkm/cont/kokkos/internal/KokkosAlloc.cxx b/vtkm/cont/kokkos/internal/KokkosAlloc.cxx
index b9e9c9f8a..37ba6f114 100644
--- a/vtkm/cont/kokkos/internal/KokkosAlloc.cxx
+++ b/vtkm/cont/kokkos/internal/KokkosAlloc.cxx
@@ -39,8 +39,16 @@ void* Allocate(std::size_t size)
 
 void Free(void* ptr)
 {
-  GetExecutionSpaceInstance().fence();
-  Kokkos::kokkos_free(ptr);
+  if (Kokkos::is_initialized())
+  {
+    GetExecutionSpaceInstance().fence();
+    Kokkos::kokkos_free(ptr);
+  }
+  else
+  {
+    // It is possible that a Buffer instance might try to free its Kokkos data after
+    // Kokkos has been finalized. If that is the case, silently do nothing.
+  }
 }
 
 void* Reallocate(void* ptr, std::size_t newSize)
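Kokkos, unlike CUDA, exposes its runtime state directly, so no extra flag is needed. A standalone sketch of the same idea; the `SafeFree` helper and the `main` driver are hypothetical, not VTK-m code:

```cpp
#include <Kokkos_Core.hpp>

// Only call into Kokkos while the runtime is alive, mirroring the guard above.
void SafeFree(void* ptr)
{
  if (Kokkos::is_initialized())
  {
    Kokkos::fence(); // make sure no outstanding work still touches ptr
    Kokkos::kokkos_free(ptr);
  }
  // After Kokkos::finalize(), device memory is already reclaimed; freeing now
  // would abort the program, so silently do nothing.
}

int main(int argc, char* argv[])
{
  Kokkos::initialize(argc, argv);
  void* ptr = Kokkos::kokkos_malloc(16);
  SafeFree(ptr); // runtime alive: really frees
  Kokkos::finalize();
  SafeFree(nullptr); // runtime finalized: no-op instead of an abort
  return 0;
}
```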
diff --git a/vtkm/cont/testing/CMakeLists.txt b/vtkm/cont/testing/CMakeLists.txt
index 1bd3499be..c7c1fab64 100644
--- a/vtkm/cont/testing/CMakeLists.txt
+++ b/vtkm/cont/testing/CMakeLists.txt
@@ -80,6 +80,7 @@ set(unit_tests
   UnitTestError.cxx
   UnitTestFieldRangeCompute.cxx
   UnitTestInitialize.cxx
+  UnitTestLateDeallocate.cxx
   UnitTestLogging.cxx
   UnitTestMergePartitionedDataSet.cxx
   UnitTestMoveConstructors.cxx
diff --git a/vtkm/cont/testing/UnitTestLateDeallocate.cxx b/vtkm/cont/testing/UnitTestLateDeallocate.cxx
new file mode 100644
index 000000000..c57245f43
--- /dev/null
+++ b/vtkm/cont/testing/UnitTestLateDeallocate.cxx
@@ -0,0 +1,72 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//============================================================================
+
+#include <vtkm/cont/ArrayCopy.h>
+#include <vtkm/cont/ArrayHandle.h>
+#include <vtkm/cont/ArrayHandleIndex.h>
+#include <vtkm/cont/CellSetExplicit.h>
+#include <vtkm/cont/DataSet.h>
+#include <vtkm/cont/DeviceAdapterTag.h>
+#include <vtkm/cont/RuntimeDeviceTracker.h>
+#include <vtkm/cont/Token.h>
+
+#include <vtkm/cont/testing/MakeTestDataSet.h>
+#include <vtkm/cont/testing/Testing.h>
+
+namespace
+{
+
+// These should be constructed early in program startup and destroyed late on
+// program shutdown. They will likely be destroyed after any device is cleaned up.
+struct Data
+{
+  vtkm::cont::ArrayHandle<vtkm::Id> Array;
+  vtkm::cont::DataSet DataSet;
+
+  ~Data() { std::cout << "Destroying global data." << std::endl; }
+};
+Data Globals;
+
+void AllocateDeviceMemory()
+{
+  // Load data.
+  vtkm::cont::ArrayCopy(vtkm::cont::ArrayHandleIndex(10), Globals.Array);
+  Globals.DataSet = vtkm::cont::testing::MakeTestDataSet{}.Make3DExplicitDataSet0();
+
+  vtkm::cont::CellSetExplicit<> cellSet;
+  Globals.DataSet.GetCellSet().AsCellSet(cellSet);
+
+  // Put data on devices.
+  auto& tracker = vtkm::cont::GetRuntimeDeviceTracker();
+  for (vtkm::Int8 deviceIndex = 0; deviceIndex < VTKM_MAX_DEVICE_ADAPTER_ID; ++deviceIndex)
+  {
+    vtkm::cont::DeviceAdapterId device = vtkm::cont::make_DeviceAdapterId(deviceIndex);
+    if (device.IsValueValid() && tracker.CanRunOn(device))
+    {
+      std::cout << "Loading data on " << device.GetName() << std::endl;
+
+      vtkm::cont::Token token;
+      Globals.Array.PrepareForInput(device, token);
+      cellSet.PrepareForInput(
+        device, vtkm::TopologyElementTagPoint{}, vtkm::TopologyElementTagCell{}, token);
+    }
+  }
+}
+
+} // anonymous namespace
+
+int UnitTestLateDeallocate(int argc, char* argv[])
+{
+  return vtkm::cont::testing::Testing::Run(AllocateDeviceMemory, argc, argv);
+
+  // After this test returns, the global data structures will be deallocated. This will likely
+  // happen after all the devices are deallocated. You may get a warning, but you should not
+  // get a crash.
+}
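The test works because of C++ teardown ordering: atexit handlers and static destructors run in reverse order of registration and construction. `Globals` finishes constructing before any device initializes (and registers its teardown), so its destructor necessarily runs after the devices shut down. A minimal standalone illustration of that ordering, independent of VTK-m:

```cpp
#include <cstdio>
#include <cstdlib>

struct Global
{
  ~Global() { std::puts("3. global destroyed (late, like Globals in the test)"); }
};
Global TheGlobal; // construction completes before main() runs

int main()
{
  std::puts("1. main running; a device runtime would initialize here");
  // Registered after TheGlobal finished constructing, so this handler runs
  // before TheGlobal's destructor.
  std::atexit([]() { std::puts("2. device-style atexit teardown"); });
  return 0;
}
```

This prints 1, 2, 3 in that order, matching the sequence the test relies on: the devices tear down first, and the global data is deallocated last.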