Merge topic '228-consolidate_cuda_internals'

1f6a662c0 Merge DevAdaptAlgoThrust --> DevAdaptAlgoCuda. be0c6a17a Move DevAdaptAtomicArrayImplementation to its own file. 3af9f6608 Merge ArrayManagerExecutionThrustDevice into AMECuda. Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Robert Maynard <robert.maynard@kitware.com> Merge-request: !1228
2024-09-16 17:22:55 +00:00 · 2018-05-29 19:59:07 +00:00 · 2018-05-29 19:59:07 +00:00 · 9cfc9112cd
commit 9cfc9112cd
parent c94422d2ed 1f6a662c0a
10 changed files with 1673 additions and 1750 deletions
--- a/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h
+++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h
@ -20,16 +20,30 @@
 #ifndef vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_h
 #define vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_h

+#include <vtkm/cont/cuda/ErrorCuda.h>
+#include <vtkm/cont/cuda/internal/CudaAllocator.h>
 #include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
+#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
+#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>

-#include <vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h>
 #include <vtkm/cont/internal/ArrayExportMacros.h>
 #include <vtkm/cont/internal/ArrayManagerExecution.h>

+#include <vtkm/cont/ArrayPortalToIterators.h>
+#include <vtkm/cont/ErrorBadAllocation.h>
+#include <vtkm/cont/Storage.h>
+
 //This is in a separate header so that ArrayHandleBasicImpl can include
 //the interface without getting any CUDA headers
 #include <vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.h>

+VTKM_THIRDPARTY_PRE_INCLUDE
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+VTKM_THIRDPARTY_POST_INCLUDE
+
+#include <limits>
+
 // These must be placed in the vtkm::cont::internal namespace so that
 // the template can be found.

@ -42,29 +56,43 @@ namespace internal

 template <typename T, class StorageTag>
 class ArrayManagerExecution<T, StorageTag, vtkm::cont::DeviceAdapterTagCuda>
-  : public vtkm::cont::cuda::internal::ArrayManagerExecutionThrustDevice<T, StorageTag>
 {
 public:
-  using Superclass = vtkm::cont::cuda::internal::ArrayManagerExecutionThrustDevice<T, StorageTag>;
-  using ValueType = typename Superclass::ValueType;
-  using PortalType = typename Superclass::PortalType;
-  using PortalConstType = typename Superclass::PortalConstType;
-  using StorageType = typename Superclass::StorageType;
+  using ValueType = T;
+  using PointerType = T*;
+  using PortalType = vtkm::exec::cuda::internal::ArrayPortalFromThrust<T>;
+  using PortalConstType = vtkm::exec::cuda::internal::ConstArrayPortalFromThrust<T>;
+  using StorageType = vtkm::cont::internal::Storage<ValueType, StorageTag>;
+  using difference_type = std::ptrdiff_t;

  VTKM_CONT
  ArrayManagerExecution(StorageType* storage)
-    : Superclass(storage)
+    : Storage(storage)
+    , Begin(nullptr)
+    , End(nullptr)
+    , Capacity(nullptr)
  {
  }

+  /// Releases whatever buffer this manager still holds by delegating to
+  /// ReleaseResources().
+  ///
+  VTKM_CONT
+  ~ArrayManagerExecution()
+  {
+    this->ReleaseResources();
+  }
+
+  /// Reports how many values the array currently exposes, computed as the
+  /// distance between the \c Begin and \c End pointers.
+  ///
+  VTKM_CONT
+  vtkm::Id GetNumberOfValues() const
+  {
+    const difference_type length = this->End - this->Begin;
+    return static_cast<vtkm::Id>(length);
+  }
+
  VTKM_CONT
  PortalConstType PrepareForInput(bool updateData)
  {
    try
    {
-      // This alternate form of PrepareForInput works around an issue
-      // with nvcc 7.5.
-      return this->Superclass::template _PrepareForInput<void>(updateData);
+      if (updateData)
+      {
+        this->CopyToExecution();
+      }
+
+      return PortalConstType(this->Begin, this->End);
    }
    catch (vtkm::cont::ErrorBadAllocation& error)
    {
@ -83,9 +111,12 @@ public:
  {
    try
    {
-      // This alternate form of PrepareForInPlace works around an issue
-      // with nvcc 7.5.
-      return this->Superclass::template _PrepareForInPlace<void>(updateData);
+      if (updateData)
+      {
+        this->CopyToExecution();
+      }
+
+      return PortalType(this->Begin, this->End);
    }
    catch (vtkm::cont::ErrorBadAllocation& error)
    {
@ -104,9 +135,49 @@ public:
  {
    try
    {
-      // This alternate form of PrepareForOutput works around an issue
-      // with nvcc 7.5.
-      return this->Superclass::template _PrepareForOutput<void>(numberOfValues);
+      // Can we reuse the existing buffer?
+      vtkm::Id curCapacity =
+        this->Begin != nullptr ? static_cast<vtkm::Id>(this->Capacity - this->Begin) : 0;
+
+      // Just mark a new end if we don't need to increase the allocation:
+      if (curCapacity >= numberOfValues)
+      {
+        this->End = this->Begin + static_cast<difference_type>(numberOfValues);
+
+        return PortalType(this->Begin, this->End);
+      }
+
+      const std::size_t maxNumVals = (std::numeric_limits<std::size_t>::max() / sizeof(ValueType));
+
+      if (static_cast<std::size_t>(numberOfValues) > maxNumVals)
+      {
+        std::ostringstream err;
+        err << "Failed to allocate " << numberOfValues << " values on device: "
+            << "Number of bytes is not representable by std::size_t.";
+        throw vtkm::cont::ErrorBadAllocation(err.str());
+      }
+
+      this->ReleaseResources();
+
+      const std::size_t bufferSize = static_cast<std::size_t>(numberOfValues) * sizeof(ValueType);
+
+      // Attempt to allocate:
+      try
+      {
+        this->Begin =
+          static_cast<ValueType*>(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize));
+      }
+      catch (const std::exception& error)
+      {
+        std::ostringstream err;
+        err << "Failed to allocate " << bufferSize << " bytes on device: " << error.what();
+        throw vtkm::cont::ErrorBadAllocation(err.str());
+      }
+
+      this->Capacity = this->Begin + static_cast<difference_type>(numberOfValues);
+      this->End = this->Capacity;
+
+      return PortalType(this->Begin, this->End);
    }
    catch (vtkm::cont::ErrorBadAllocation& error)
    {
@ -119,6 +190,75 @@ public:
      throw error;
    }
  }
+
+  /// Resizes \c storage to GetNumberOfValues() entries and then copies the
+  /// range [Begin, End) into the portal returned by \c storage->GetPortal().
+  ///
+  /// Anything thrown while copying is handed to throwAsVTKmException().
+  ///
+  VTKM_CONT
+  void RetrieveOutputData(StorageType* storage) const
+  {
+    storage->Allocate(this->GetNumberOfValues());
+    try
+    {
+      using DevicePointer = thrust::cuda::pointer<ValueType>;
+      const DevicePointer deviceBegin(this->Begin);
+      const DevicePointer deviceEnd(this->End);
+
+      ::thrust::copy(
+        deviceBegin, deviceEnd, vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
+    }
+    catch (...)
+    {
+      vtkm::cont::cuda::internal::throwAsVTKmException();
+    }
+  }
+
+  /// Reduces the logical size of the array to \c numberOfValues.
+  ///
+  /// Only \c End is moved; \c Begin and \c Capacity are left untouched, so no
+  /// memory is released or reallocated.
+  ///
+  /// \param numberOfValues New size. Precondition:
+  ///        0 <= numberOfValues <= GetNumberOfValues().
+  ///
+  VTKM_CONT void Shrink(vtkm::Id numberOfValues)
+  {
+    // The assignment below is performed even when assertions are compiled
+    // out, but the range check is still the documented precondition.
+    // Counts are compared instead of forming Begin + numberOfValues inside
+    // the assertion, so an out-of-range request is rejected before any
+    // pointer arithmetic is done on it, and a negative request (which would
+    // place End in front of Begin) is rejected as well.
+    VTKM_ASSERT(this->Begin != nullptr && numberOfValues >= 0 &&
+                numberOfValues <= this->GetNumberOfValues());
+
+    this->End = this->Begin + static_cast<difference_type>(numberOfValues);
+  }
+
+  /// Returns the buffer to CudaAllocator and resets all three pointers.
+  /// Does nothing when no buffer is held, so repeated calls are harmless.
+  ///
+  VTKM_CONT void ReleaseResources()
+  {
+    if (this->Begin == nullptr)
+    {
+      // Nothing was allocated (or it has already been released).
+      return;
+    }
+
+    vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin);
+    this->Begin = nullptr;
+    this->End = nullptr;
+    this->Capacity = nullptr;
+  }
+
+private:
+  // Copying is disabled: the destructor frees Begin through
+  // ReleaseResources(), so a member-wise copy would free the same buffer twice.
+  ArrayManagerExecution(ArrayManagerExecution&) = delete;
+  void operator=(ArrayManagerExecution&) = delete;
+
+  // Storage object supplied to the constructor; CopyToExecution() reads its
+  // size and contents.
+  StorageType* Storage;
+
+  // [Begin, End) is the range handed out through the portals, while
+  // [Begin, Capacity) is the whole buffer obtained from
+  // CudaAllocator::Allocate(). All three are nullptr when nothing is allocated.
+  PointerType Begin;
+  PointerType End;
+  PointerType Capacity;
+
+  /// Sizes the execution buffer to match \c Storage via PrepareForOutput()
+  /// and copies the contents of \c Storage into it. Anything thrown on the
+  /// way is handed to throwAsVTKmException().
+  ///
+  VTKM_CONT
+  void CopyToExecution()
+  {
+    try
+    {
+      const vtkm::Id numValues = this->Storage->GetNumberOfValues();
+      this->PrepareForOutput(numValues);
+
+      // Begin is only valid after PrepareForOutput() has (re)allocated.
+      const thrust::cuda::pointer<ValueType> deviceBegin(this->Begin);
+      ::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
+                     vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
+                     deviceBegin);
+    }
+    catch (...)
+    {
+      vtkm::cont::cuda::internal::throwAsVTKmException();
+    }
+  }
 };

 template <typename T>
--- a/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
+++ b/vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h
@ -1,262 +0,0 @@
-//============================================================================
-//  Copyright (c) Kitware, Inc.
-//  All rights reserved.
-//  See LICENSE.txt for details.
-//  This software is distributed WITHOUT ANY WARRANTY; without even
-//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
-//  PURPOSE.  See the above copyright notice for more information.
-//
-//  Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
-//  Copyright 2014 UT-Battelle, LLC.
-//  Copyright 2014 Los Alamos National Security.
-//
-//  Under the terms of Contract DE-NA0003525 with NTESS,
-//  the U.S. Government retains certain rights in this software.
-//
-//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
-//  Laboratory (LANL), the U.S. Government retains certain rights in
-//  this software.
-//============================================================================
-#ifndef vtk_m_cont_cuda_internal_ArrayManagerExecutionThrustDevice_h
-#define vtk_m_cont_cuda_internal_ArrayManagerExecutionThrustDevice_h
-
-#include <vtkm/cont/ArrayPortalToIterators.h>
-#include <vtkm/cont/ErrorBadAllocation.h>
-#include <vtkm/cont/Storage.h>
-
-#include <vtkm/cont/cuda/ErrorCuda.h>
-#include <vtkm/cont/cuda/internal/CudaAllocator.h>
-#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
-#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
-
-VTKM_THIRDPARTY_PRE_INCLUDE
-#include <thrust/copy.h>
-#include <thrust/device_ptr.h>
-VTKM_THIRDPARTY_POST_INCLUDE
-
-#include <limits>
-
-namespace vtkm
-{
-namespace cont
-{
-namespace cuda
-{
-namespace internal
-{
-
-/// \c ArrayManagerExecutionThrustDevice provides an implementation for a \c
-/// ArrayManagerExecution class for a thrust device adapter that is designed
-/// for the cuda backend which has separate memory spaces for host and device.
-template <typename T, class StorageTag>
-class ArrayManagerExecutionThrustDevice
-{
-public:
-  using ValueType = T;
-  using PointerType = T*;
-  using difference_type = std::ptrdiff_t;
-
-  using StorageType = vtkm::cont::internal::Storage<ValueType, StorageTag>;
-
-  using PortalType = vtkm::exec::cuda::internal::ArrayPortalFromThrust<T>;
-  using PortalConstType = vtkm::exec::cuda::internal::ConstArrayPortalFromThrust<T>;
-
-  VTKM_CONT
-  ArrayManagerExecutionThrustDevice(StorageType* storage)
-    : Storage(storage)
-    , Begin(nullptr)
-    , End(nullptr)
-    , Capacity(nullptr)
-  {
-  }
-
-  VTKM_CONT
-  ~ArrayManagerExecutionThrustDevice() { this->ReleaseResources(); }
-
-  /// Returns the size of the array.
-  ///
-  VTKM_CONT
-  vtkm::Id GetNumberOfValues() const { return static_cast<vtkm::Id>(this->End - this->Begin); }
-
-  /// Allocates the appropriate size of the array and copies the given data
-  /// into the array.
-  ///
-  VTKM_CONT
-  PortalConstType PrepareForInput(bool updateData)
-  {
-    if (updateData)
-    {
-      this->CopyToExecution();
-    }
-    else // !updateData
-    {
-      // The data in this->Array should already be valid.
-    }
-
-    return PortalConstType(this->Begin, this->End);
-  }
-
-  /// Workaround for nvcc 7.5 compiler warning bug.
-  template <typename DummyType>
-  VTKM_CONT PortalConstType _PrepareForInput(bool updateData)
-  {
-    return this->PrepareForInput(updateData);
-  }
-
-  /// Allocates the appropriate size of the array and copies the given data
-  /// into the array.
-  ///
-  VTKM_CONT
-  PortalType PrepareForInPlace(bool updateData)
-  {
-    if (updateData)
-    {
-      this->CopyToExecution();
-    }
-    else // !updateData
-    {
-      // The data in this->Array should already be valid.
-    }
-
-    return PortalType(this->Begin, this->End);
-  }
-
-  /// Workaround for nvcc 7.5 compiler warning bug.
-  template <typename DummyType>
-  VTKM_CONT PortalType _PrepareForInPlace(bool updateData)
-  {
-    return this->PrepareForInPlace(updateData);
-  }
-
-  /// Allocates the array to the given size.
-  ///
-  VTKM_CONT
-  PortalType PrepareForOutput(vtkm::Id numberOfValues)
-  {
-    // Can we reuse the existing buffer?
-    vtkm::Id curCapacity =
-      this->Begin != nullptr ? static_cast<vtkm::Id>(this->Capacity - this->Begin) : 0;
-
-    // Just mark a new end if we don't need to increase the allocation:
-    if (curCapacity >= numberOfValues)
-    {
-      this->End = this->Begin + static_cast<difference_type>(numberOfValues);
-
-      return PortalType(this->Begin, this->End);
-    }
-
-    const std::size_t maxNumVals = (std::numeric_limits<std::size_t>::max() / sizeof(ValueType));
-
-    if (static_cast<std::size_t>(numberOfValues) > maxNumVals)
-    {
-      std::ostringstream err;
-      err << "Failed to allocate " << numberOfValues << " values on device: "
-          << "Number of bytes is not representable by std::size_t.";
-      throw vtkm::cont::ErrorBadAllocation(err.str());
-    }
-
-    this->ReleaseResources();
-
-    const std::size_t bufferSize = static_cast<std::size_t>(numberOfValues) * sizeof(ValueType);
-
-    // Attempt to allocate:
-    try
-    {
-      this->Begin =
-        static_cast<ValueType*>(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize));
-    }
-    catch (const std::exception& error)
-    {
-      std::ostringstream err;
-      err << "Failed to allocate " << bufferSize << " bytes on device: " << error.what();
-      throw vtkm::cont::ErrorBadAllocation(err.str());
-    }
-
-    this->Capacity = this->Begin + static_cast<difference_type>(numberOfValues);
-    this->End = this->Capacity;
-
-    return PortalType(this->Begin, this->End);
-  }
-
-  /// Workaround for nvcc 7.5 compiler warning bug.
-  template <typename DummyType>
-  VTKM_CONT PortalType _PrepareForOutput(vtkm::Id numberOfValues)
-  {
-    return this->PrepareForOutput(numberOfValues);
-  }
-
-  /// Allocates enough space in \c storage and copies the data in the
-  /// device vector into it.
-  ///
-  VTKM_CONT
-  void RetrieveOutputData(StorageType* storage) const
-  {
-    storage->Allocate(this->GetNumberOfValues());
-    try
-    {
-      ::thrust::copy(thrust::cuda::pointer<ValueType>(this->Begin),
-                     thrust::cuda::pointer<ValueType>(this->End),
-                     vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
-    }
-    catch (...)
-    {
-      vtkm::cont::cuda::internal::throwAsVTKmException();
-    }
-  }
-
-  /// Resizes the device vector.
-  ///
-  VTKM_CONT void Shrink(vtkm::Id numberOfValues)
-  {
-    // The operation will succeed even if this assertion fails, but this
-    // is still supposed to be a precondition to Shrink.
-    VTKM_ASSERT(this->Begin != nullptr && this->Begin + numberOfValues <= this->End);
-
-    this->End = this->Begin + static_cast<difference_type>(numberOfValues);
-  }
-
-  /// Frees all memory.
-  ///
-  VTKM_CONT void ReleaseResources()
-  {
-    if (this->Begin != nullptr)
-    {
-      vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin);
-      this->Begin = nullptr;
-      this->End = nullptr;
-      this->Capacity = nullptr;
-    }
-  }
-
-private:
-  ArrayManagerExecutionThrustDevice(ArrayManagerExecutionThrustDevice<T, StorageTag>&) = delete;
-  void operator=(ArrayManagerExecutionThrustDevice<T, StorageTag>&) = delete;
-
-  StorageType* Storage;
-
-  PointerType Begin;
-  PointerType End;
-  PointerType Capacity;
-
-  VTKM_CONT
-  void CopyToExecution()
-  {
-    try
-    {
-      this->PrepareForOutput(this->Storage->GetNumberOfValues());
-      ::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
-                     vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
-                     thrust::cuda::pointer<ValueType>(this->Begin));
-    }
-    catch (...)
-    {
-      vtkm::cont::cuda::internal::throwAsVTKmException();
-    }
-  }
-};
-}
-}
-}
-} // namespace vtkm::cont::cuda::internal
-
-#endif // vtk_m_cont_cuda_internal_ArrayManagerExecutionThrustDevice_h
--- a/vtkm/cont/cuda/internal/CMakeLists.txt
+++ b/vtkm/cont/cuda/internal/CMakeLists.txt
@ -20,10 +20,9 @@

 set(headers
  ArrayManagerExecutionCuda.h
-  ArrayManagerExecutionThrustDevice.h
  CudaAllocator.h
  DeviceAdapterAlgorithmCuda.h
-  DeviceAdapterAlgorithmThrust.h
+  DeviceAdapterAtomicArrayImplementationCuda.h
  DeviceAdapterRuntimeDetectorCuda.h
  DeviceAdapterTagCuda.h
  DeviceAdapterTimerImplementationCuda.h
@ -49,7 +48,7 @@ endif()
 target_sources(vtkm_cont PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmCuda.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterRuntimeDetectorCuda.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterTimerImplementationCuda.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.cu
@ -18,7 +18,7 @@
 //  this software.
 //============================================================================

-#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
+#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>

 #include <atomic>
 #include <mutex>
@ -65,11 +65,12 @@ VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
  }
  return numSMs[index];
 }
+}
+} // end namespace cuda::internal

 // we use cuda pinned memory to reduce the amount of synchronization
 // and mem copies between the host and device.
-template <>
-char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
+char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
  vtkm::Id& arraySize,
  char** hostPointer)
 {
@ -92,8 +93,7 @@ char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedE
  return devicePtr;
 }

-template <>
-char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
+char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
  vtkm::exec::cuda::internal::TaskStrided& functor)
 {
  //since the memory is pinned we can access it safely on the host
@ -111,8 +111,7 @@ char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupError
  return hostErrorPtr;
 }

-template <>
-void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
+void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
  vtkm::UInt32& grids,
  vtkm::UInt32& blocks,
  vtkm::Id size)
@ -120,19 +119,18 @@ void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAnd
  (void)size;
  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
-  grids = 32 * getNumSMs(deviceId);
+  grids = 32 * cuda::internal::getNumSMs(deviceId);
  blocks = 128;
 }

-template <>
-void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
+void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
  vtkm::UInt32& grids,
  dim3& blocks,
  const dim3& size)
 {
  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
-  grids = 32 * getNumSMs(deviceId);
+  grids = 32 * cuda::internal::getNumSMs(deviceId);

  if (size.x == 0)
  { //grids that have no x dimension
@ -154,6 +152,4 @@ void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAnd
  }
 }
 }
-}
-}
-}
+} // end namespace vtkm::cont
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
--- a/vtkm/cont/cuda/internal/DeviceAdapterAtomicArrayImplementationCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAtomicArrayImplementationCuda.h
@ -0,0 +1,104 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2018 UT-Battelle, LLC.
+//  Copyright 2018 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#ifndef vtk_m_cont_internal_DeviceAdapterAtomicArrayImplementationCuda_h
+#define vtk_m_cont_internal_DeviceAdapterAtomicArrayImplementationCuda_h
+
+#include <vtkm/Types.h>
+
+#include <vtkm/cont/ArrayHandle.h>
+#include <vtkm/cont/DeviceAdapterAlgorithm.h>
+#include <vtkm/cont/StorageBasic.h>
+
+#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
+
+// Disable warnings we check vtkm for but Thrust does not.
+VTKM_THIRDPARTY_PRE_INCLUDE
+#include <thrust/device_ptr.h>
+VTKM_THIRDPARTY_POST_INCLUDE
+
+namespace vtkm
+{
+namespace cont
+{
+
+/// \brief CUDA specialization of the atomic array implementation.
+///
+/// Wraps the in-place execution portal of a basic-storage ArrayHandle and
+/// forwards \c Add and \c CompareAndSwap to the CUDA \c atomicAdd and
+/// \c atomicCAS intrinsics. Overloads are only provided for vtkm::Int32 and
+/// vtkm::Int64, so those are the only value types that will compile.
+///
+template <typename T>
+class DeviceAdapterAtomicArrayImplementation<T, vtkm::cont::DeviceAdapterTagCuda>
+{
+public:
+  /// Prepares \c handle for in-place access on the CUDA device and keeps the
+  /// resulting portal.
+  VTKM_CONT
+  DeviceAdapterAtomicArrayImplementation(
+    vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> handle)
+    : Portal(handle.PrepareForInPlace(vtkm::cont::DeviceAdapterTagCuda()))
+  {
+  }
+
+  /// Atomically adds \c value to the element at \c index.
+  ///
+  /// \return The result of \c atomicAdd, i.e. the value stored at \c index
+  ///         before the addition.
+  VTKM_EXEC T Add(vtkm::Id index, const T& value) const
+  {
+    T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
+    return this->vtkmAtomicAdd(lockedValue, value);
+  }
+
+  /// Atomically replaces the element at \c index with \c newValue if it
+  /// currently equals \c oldValue.
+  ///
+  /// The value parameters are typed as \c T, matching the generic
+  /// implementation, so a 32-bit array is never fed through a 64-bit
+  /// temporary that has to be narrowed again.
+  ///
+  /// \return The result of \c atomicCAS, i.e. the value stored at \c index
+  ///         before the operation.
+  VTKM_EXEC T CompareAndSwap(vtkm::Id index, const T& newValue, const T& oldValue) const
+  {
+    T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
+    return this->vtkmCompareAndSwap(lockedValue, newValue, oldValue);
+  }
+
+private:
+  using PortalType =
+    typename vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>::template ExecutionTypes<
+      vtkm::cont::DeviceAdapterTagCuda>::Portal;
+  PortalType Portal;
+
+  // CUDA has no signed 64-bit atomicAdd overload, so the operation is done on
+  // the unsigned representation and converted back on return.
+  __device__ vtkm::Int64 vtkmAtomicAdd(vtkm::Int64* address, const vtkm::Int64& value) const
+  {
+    return atomicAdd((unsigned long long*)address, (unsigned long long)value);
+  }
+
+  __device__ vtkm::Int32 vtkmAtomicAdd(vtkm::Int32* address, const vtkm::Int32& value) const
+  {
+    return atomicAdd(address, value);
+  }
+
+  // Note the argument order: atomicCAS takes (address, compare, value).
+  __device__ vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32* address,
+                                            const vtkm::Int32& newValue,
+                                            const vtkm::Int32& oldValue) const
+  {
+    return atomicCAS(address, oldValue, newValue);
+  }
+
+  // 64-bit atomicCAS is likewise only available for unsigned long long.
+  __device__ vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64* address,
+                                            const vtkm::Int64& newValue,
+                                            const vtkm::Int64& oldValue) const
+  {
+    return atomicCAS((unsigned long long int*)address,
+                     (unsigned long long int)oldValue,
+                     (unsigned long long int)newValue);
+  }
+};
+}
+} // end namespace vtkm::cont
+
+#endif // vtk_m_cont_internal_DeviceAdapterAtomicArrayImplementationCuda_h
--- a/vtkm/cont/internal/CMakeLists.txt
+++ b/vtkm/cont/internal/CMakeLists.txt
@ -30,6 +30,7 @@ set(headers
  ArrayTransfer.h
  ConnectivityExplicitInternals.h
  DeviceAdapterAlgorithmGeneral.h
+  DeviceAdapterAtomicArrayImplementation.h
  DeviceAdapterDefaultSelection.h
  DeviceAdapterError.h
  DeviceAdapterListHelpers.h
--- a/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
+++ b/vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h
@ -27,6 +27,7 @@
 #include <vtkm/cont/ArrayHandleIndex.h>
 #include <vtkm/cont/ArrayHandleStreaming.h>
 #include <vtkm/cont/ArrayHandleZip.h>
+#include <vtkm/cont/internal/DeviceAdapterAtomicArrayImplementation.h>
 #include <vtkm/cont/internal/FunctorsGeneral.h>

 #include <vtkm/exec/internal/ErrorMessageBuffer.h>
@ -874,125 +875,8 @@ public:
      input, values_output, values_output);
  }
 };
-}
-}
-} // namespace vtkm::cont::internal

-namespace vtkm
-{
-namespace cont
-{
-/// \brief Class providing a device-specific atomic interface.
-///
-/// The class provide the actual implementation used by vtkm::exec::AtomicArray.
-/// A serial default implementation is provided. But each device will have a different
-/// implementation.
-///
-/// Serial requires no form of atomicity
-///
-template <typename T, typename DeviceTag>
-class DeviceAdapterAtomicArrayImplementation
-{
-public:
-  VTKM_CONT
-  DeviceAdapterAtomicArrayImplementation(
-    vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> handle)
-    : Iterators(IteratorsType(handle.PrepareForInPlace(DeviceTag())))
-  {
-  }
-
-  T Add(vtkm::Id index, const T& value) const
-  {
-    T* lockedValue;
-#if defined(_ITERATOR_DEBUG_LEVEL) && _ITERATOR_DEBUG_LEVEL > 0
-    using IteratorType = typename vtkm::cont::ArrayPortalToIterators<PortalType>::IteratorType;
-    typename IteratorType::pointer temp =
-      &(*(Iterators.GetBegin() + static_cast<std::ptrdiff_t>(index)));
-    lockedValue = temp;
-    return vtkmAtomicAdd(lockedValue, value);
-#else
-    lockedValue = (Iterators.GetBegin() + index);
-    return vtkmAtomicAdd(lockedValue, value);
-#endif
-  }
-
-  T CompareAndSwap(vtkm::Id index, const T& newValue, const T& oldValue) const
-  {
-    T* lockedValue;
-#if defined(_ITERATOR_DEBUG_LEVEL) && _ITERATOR_DEBUG_LEVEL > 0
-    using IteratorType = typename vtkm::cont::ArrayPortalToIterators<PortalType>::IteratorType;
-    typename IteratorType::pointer temp =
-      &(*(Iterators.GetBegin() + static_cast<std::ptrdiff_t>(index)));
-    lockedValue = temp;
-    return vtkmCompareAndSwap(lockedValue, newValue, oldValue);
-#else
-    lockedValue = (Iterators.GetBegin() + index);
-    return vtkmCompareAndSwap(lockedValue, newValue, oldValue);
-#endif
-  }
-
-private:
-  using PortalType =
-    typename vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>::template ExecutionTypes<
-      DeviceTag>::Portal;
-  using IteratorsType = vtkm::cont::ArrayPortalToIterators<PortalType>;
-  IteratorsType Iterators;
-
-#if defined(VTKM_MSVC) //MSVC atomics
-  vtkm::Int32 vtkmAtomicAdd(vtkm::Int32* address, const vtkm::Int32& value) const
-  {
-    return InterlockedExchangeAdd(reinterpret_cast<volatile long*>(address), value);
-  }
-
-  vtkm::Int64 vtkmAtomicAdd(vtkm::Int64* address, const vtkm::Int64& value) const
-  {
-    return InterlockedExchangeAdd64(reinterpret_cast<volatile long long*>(address), value);
-  }
-
-  vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32* address,
-                                 const vtkm::Int32& newValue,
-                                 const vtkm::Int32& oldValue) const
-  {
-    return InterlockedCompareExchange(
-      reinterpret_cast<volatile long*>(address), newValue, oldValue);
-  }
-
-  vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64* address,
-                                 const vtkm::Int64& newValue,
-                                 const vtkm::Int64& oldValue) const
-  {
-    return InterlockedCompareExchange64(
-      reinterpret_cast<volatile long long*>(address), newValue, oldValue);
-  }
-
-#else //gcc built-in atomics
-
-  vtkm::Int32 vtkmAtomicAdd(vtkm::Int32* address, const vtkm::Int32& value) const
-  {
-    return __sync_fetch_and_add(address, value);
-  }
-
-  vtkm::Int64 vtkmAtomicAdd(vtkm::Int64* address, const vtkm::Int64& value) const
-  {
-    return __sync_fetch_and_add(address, value);
-  }
-
-  vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32* address,
-                                 const vtkm::Int32& newValue,
-                                 const vtkm::Int32& oldValue) const
-  {
-    return __sync_val_compare_and_swap(address, oldValue, newValue);
-  }
-
-  vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64* address,
-                                 const vtkm::Int64& newValue,
-                                 const vtkm::Int64& oldValue) const
-  {
-    return __sync_val_compare_and_swap(address, oldValue, newValue);
-  }
-
-#endif
-};
+} // namespace internal

 /// \brief Class providing a device-specific support for selecting the optimal
 /// Task type for a given worklet.
--- a/vtkm/cont/internal/DeviceAdapterAtomicArrayImplementation.h
+++ b/vtkm/cont/internal/DeviceAdapterAtomicArrayImplementation.h
@ -0,0 +1,152 @@
+//============================================================================
+//  Copyright (c) Kitware, Inc.
+//  All rights reserved.
+//  See LICENSE.txt for details.
+//  This software is distributed WITHOUT ANY WARRANTY; without even
+//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+//  PURPOSE.  See the above copyright notice for more information.
+//
+//  Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+//  Copyright 2018 UT-Battelle, LLC.
+//  Copyright 2018 Los Alamos National Security.
+//
+//  Under the terms of Contract DE-NA0003525 with NTESS,
+//  the U.S. Government retains certain rights in this software.
+//
+//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
+//  Laboratory (LANL), the U.S. Government retains certain rights in
+//  this software.
+//============================================================================
+
+#ifndef vtk_m_cont_internal_DeviceAdapterAtomicArrayImplementation_h
+#define vtk_m_cont_internal_DeviceAdapterAtomicArrayImplementation_h
+
+#include <vtkm/cont/ArrayHandle.h>
+#include <vtkm/cont/ArrayPortalToIterators.h>
+#include <vtkm/cont/StorageBasic.h>
+
+#include <vtkm/internal/Configure.h>
+#include <vtkm/internal/Windows.h>
+
+#include <vtkm/Types.h>
+
+namespace vtkm
+{
+namespace cont
+{
+
+/// \brief Class providing a device-specific atomic interface.
+///
+/// The class provides the actual implementation used by vtkm::exec::AtomicArray.
+/// This primary template is the default implementation; a device can supply
+/// its own specialization.
+///
+/// The default is built on compiler intrinsics: the Interlocked* functions
+/// when VTKM_MSVC is defined, and the __sync_* builtins otherwise. Overloads
+/// exist only for vtkm::Int32 and vtkm::Int64.
+///
+template <typename T, typename DeviceTag>
+class DeviceAdapterAtomicArrayImplementation
+{
+  // Portal type returned by PrepareForInPlace() for DeviceTag, and the
+  // iterator wrapper used to get at the underlying elements.
+  using PortalType =
+    typename vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>::template ExecutionTypes<
+      DeviceTag>::Portal;
+  using IteratorsType = vtkm::cont::ArrayPortalToIterators<PortalType>;
+  IteratorsType Iterators;
+
+public:
+  /// Prepares \c handle for in-place access on \c DeviceTag and keeps
+  /// iterators over the resulting portal.
+  VTKM_CONT
+  DeviceAdapterAtomicArrayImplementation(
+    vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> handle)
+    : Iterators(IteratorsType(handle.PrepareForInPlace(DeviceTag())))
+  {
+  }
+
+  /// Atomically adds \c value to the element at \c index and returns the
+  /// result of the underlying fetch-and-add intrinsic (the value held before
+  /// the addition).
+  T Add(vtkm::Id index, const T& value) const
+  {
+    T* lockedValue;
+#if defined(_ITERATOR_DEBUG_LEVEL) && _ITERATOR_DEBUG_LEVEL > 0
+    // NOTE(review): presumably the iterators are checked (non-pointer)
+    // iterators in this configuration, hence the address-of-dereference.
+    using IteratorType = typename vtkm::cont::ArrayPortalToIterators<PortalType>::IteratorType;
+    typename IteratorType::pointer temp =
+      &(*(Iterators.GetBegin() + static_cast<std::ptrdiff_t>(index)));
+    lockedValue = temp;
+    return this->vtkmAtomicAdd(lockedValue, value);
+#else
+    lockedValue = (Iterators.GetBegin() + index);
+    return this->vtkmAtomicAdd(lockedValue, value);
+#endif
+  }
+
+  /// Atomically stores \c newValue at \c index if the element currently
+  /// equals \c oldValue, and returns the result of the underlying
+  /// compare-and-swap intrinsic (the value held before the operation).
+  T CompareAndSwap(vtkm::Id index, const T& newValue, const T& oldValue) const
+  {
+    T* lockedValue;
+#if defined(_ITERATOR_DEBUG_LEVEL) && _ITERATOR_DEBUG_LEVEL > 0
+    // Same address extraction as in Add() for the iterator-debug build.
+    using IteratorType = typename vtkm::cont::ArrayPortalToIterators<PortalType>::IteratorType;
+    typename IteratorType::pointer temp =
+      &(*(Iterators.GetBegin() + static_cast<std::ptrdiff_t>(index)));
+    lockedValue = temp;
+    return this->vtkmCompareAndSwap(lockedValue, newValue, oldValue);
+#else
+    lockedValue = (Iterators.GetBegin() + index);
+    return this->vtkmCompareAndSwap(lockedValue, newValue, oldValue);
+#endif
+  }
+
+private:
+#if defined(VTKM_MSVC) //MSVC atomics
+  // The Interlocked* functions operate on long / long long, so the address is
+  // reinterpreted to the matching volatile pointer type.
+  vtkm::Int32 vtkmAtomicAdd(vtkm::Int32* address, const vtkm::Int32& value) const
+  {
+    return InterlockedExchangeAdd(reinterpret_cast<volatile long*>(address), value);
+  }
+
+  vtkm::Int64 vtkmAtomicAdd(vtkm::Int64* address, const vtkm::Int64& value) const
+  {
+    return InterlockedExchangeAdd64(reinterpret_cast<volatile long long*>(address), value);
+  }
+
+  // InterlockedCompareExchange takes (destination, exchange, comparand), i.e.
+  // the new value comes before the expected old value.
+  vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32* address,
+                                 const vtkm::Int32& newValue,
+                                 const vtkm::Int32& oldValue) const
+  {
+    return InterlockedCompareExchange(
+      reinterpret_cast<volatile long*>(address), newValue, oldValue);
+  }
+
+  vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64* address,
+                                 const vtkm::Int64& newValue,
+                                 const vtkm::Int64& oldValue) const
+  {
+    return InterlockedCompareExchange64(
+      reinterpret_cast<volatile long long*>(address), newValue, oldValue);
+  }
+
+#else //gcc built-in atomics
+
+  vtkm::Int32 vtkmAtomicAdd(vtkm::Int32* address, const vtkm::Int32& value) const
+  {
+    return __sync_fetch_and_add(address, value);
+  }
+
+  vtkm::Int64 vtkmAtomicAdd(vtkm::Int64* address, const vtkm::Int64& value) const
+  {
+    return __sync_fetch_and_add(address, value);
+  }
+
+  // __sync_val_compare_and_swap takes (address, expected old value, new
+  // value), the reverse of the MSVC argument order above.
+  vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32* address,
+                                 const vtkm::Int32& newValue,
+                                 const vtkm::Int32& oldValue) const
+  {
+    return __sync_val_compare_and_swap(address, oldValue, newValue);
+  }
+
+  vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64* address,
+                                 const vtkm::Int64& newValue,
+                                 const vtkm::Int64& oldValue) const
+  {
+    return __sync_val_compare_and_swap(address, oldValue, newValue);
+  }
+
+#endif
+};
+}
+} // end namespace vtkm::cont
+
+#endif // vtk_m_cont_internal_DeviceAdapterAtomicArrayImplementation_h