From ebbebd7369257b0783e0eb38a519f6de6711ca0f Mon Sep 17 00:00:00 2001 From: Kenneth Moreland Date: Tue, 18 Aug 2020 13:28:02 -0600 Subject: [PATCH] Add atomic free functions Previously, all atomic functions were stored in classes named `AtomicInterfaceControl` and `AtomicInterfaceExecution`, which required you to know at compile time which device was using the methods. That in turn means that anything using an atomic needed to be templated on the device it is running on. That can be a big hassle (and is problematic for some code structure). Instead, these methods are moved to free functions in the `vtkm` namespace. These functions operate like those in `Math.h`. Using compiler directives, an appropriate version of the function is compiled for the current device the compiler is using. --- docs/changelog/free-atomic-functions.md | 14 + vtkm/Atomic.h | 516 ++++++++++++++++++++++++ vtkm/CMakeLists.txt | 1 + vtkm/testing/CMakeLists.txt | 1 + vtkm/testing/UnitTestAtomic.cxx | 359 +++++++++++++++++ 5 files changed, 891 insertions(+) create mode 100644 docs/changelog/free-atomic-functions.md create mode 100644 vtkm/Atomic.h create mode 100644 vtkm/testing/UnitTestAtomic.cxx diff --git a/docs/changelog/free-atomic-functions.md b/docs/changelog/free-atomic-functions.md new file mode 100644 index 000000000..632b1cc1f --- /dev/null +++ b/docs/changelog/free-atomic-functions.md @@ -0,0 +1,14 @@ +# Add atomic free functions + +Previously, all atomic functions were stored in classes named +`AtomicInterfaceControl` and `AtomicInterfaceExecution`, which required +you to know at compile time which device was using the methods. That in +turn means that anything using an atomic needed to be templated on the +device it is running on. + +That can be a big hassle (and is problematic for some code structure). +Instead, these methods are moved to free functions in the `vtkm` +namespace. These functions operate like those in `Math.h`. Using +compiler directives, an appropriate version of the function is compiled +for the current device the compiler is using. + diff --git a/vtkm/Atomic.h b/vtkm/Atomic.h new file mode 100644 index 000000000..7008c2b05 --- /dev/null +++ b/vtkm/Atomic.h @@ -0,0 +1,516 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. 
+//============================================================================ +#ifndef vtk_m_Atomic_h +#define vtk_m_Atomic_h + +#include + +#include + + +#if defined(VTKM_ENABLE_KOKKOS) + +VTKM_THIRDPARTY_PRE_INCLUDE +#include +VTKM_THIRDPARTY_POST_INCLUDE + +namespace vtkm +{ +namespace detail +{ + +template +VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr) +{ + return Kokkos::Impl::atomic_load(addr); +} + +template +VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value) +{ + Kokkos::Impl::atomic_store(addr, value); +} + +template +VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg) +{ + return Kokkos::atomic_fetch_add(addr, arg); +} + +template +VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask) +{ + return Kokkos::atomic_fetch_and(addr, mask); +} + +template +VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask) +{ + return Kokkos::atomic_fetch_or(addr, mask); +} + +template +VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask) +{ + return Kokkos::atomic_fetch_xor(addr, mask); +} + +template +VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr) +{ + return Kokkos::atomic_fetch_xor(addr, static_cast(~T{ 0u })); +} + +template +VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr, T expected, T desired) +{ + return Kokkos::atomic_compare_exchange(addr, expected, desired); +} +} +} // namespace vtkm::detail + +#elif defined(VTKM_CUDA_DEVICE_PASS) + +namespace vtkm +{ +namespace detail +{ + +template +VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr) +{ + const volatile T* vaddr = addr; /* volatile to bypass cache*/ + const T value = *vaddr; + /* fence to ensure that dependent reads are correctly ordered */ + __threadfence(); + return value; +} + +template +VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value) +{ + volatile T* vaddr = addr; /* volatile to bypass cache */ + /* fence to ensure that previous non-atomic stores are visible to other threads */ + __threadfence(); + *vaddr = value; +} + +template +VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg) +{ + __threadfence(); + auto result = atomicAdd(addr, arg); + __threadfence(); + return result; +} + +template +VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask) +{ + __threadfence(); + auto result = atomicAnd(addr, mask); + __threadfence(); + return result; +} + +template +VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask) +{ + __threadfence(); + auto result = atomicOr(addr, mask); + __threadfence(); + return result; +} + +template +VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask) +{ + __threadfence(); + auto result = atomicXor(addr, mask); + __threadfence(); + return result; +} + +template +VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr) +{ + return AtomicXorImpl(addr, static_cast(~T{ 0u })); +} + +template +VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr, T expected, T desired) +{ + __threadfence(); + auto result = atomicCAS(addr, expected, desired); + __threadfence(); + return result; +} +} +} // namespace vtkm::detail + +#elif defined(VTKM_MSVC) + +// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64 + +#include +#include +#include +#include // For MSVC atomics + +namespace vtkm +{ +namespace detail +{ + +template +VTKM_EXEC_CONT inline To BitCast(const From& src) +{ + // The memcpy should be removed by the compiler when possible, but this + // works around a host of issues with bitcasting using reinterpret_cast. 
+ VTKM_STATIC_ASSERT(sizeof(From) == sizeof(To)); + To dst; + std::memcpy(&dst, &src, sizeof(From)); + return dst; +} + +template +VTKM_EXEC_CONT inline T BitCast(T&& src) +{ + return std::forward(src); +} + +// Note about Load and Store implementations: +// +// "Simple reads and writes to properly-aligned 32-bit variables are atomic +// operations" +// +// "Simple reads and writes to properly aligned 64-bit variables are atomic on +// 64-bit Windows. Reads and writes to 64-bit values are not guaranteed to be +// atomic on 32-bit Windows." +// +// "Reads and writes to variables of other sizes [than 32 or 64 bits] are not +// guaranteed to be atomic on any platform." +// +// https://docs.microsoft.com/en-us/windows/desktop/sync/interlocked-variable-access + +VTKM_EXEC_CONT inline vtkm::UInt8 AtomicLoadImpl(const vtkm::UInt8* addr) +{ + // This assumes that the memory interface is smart enough to load a 32-bit + // word atomically and a properly aligned 8-bit word from it. + // We could build address masks and do shifts to perform this manually if + // this assumption is incorrect. + auto result = *static_cast(addr); + std::atomic_thread_fence(std::memory_order_acquire); + return result; +} +VTKM_EXEC_CONT inline vtkm::UInt16 AtomicLoadImpl(const vtkm::UInt16* addr) +{ + // This assumes that the memory interface is smart enough to load a 32-bit + // word atomically and a properly aligned 16-bit word from it. + // We could build address masks and do shifts to perform this manually if + // this assumption is incorrect. + auto result = *static_cast(addr); + std::atomic_thread_fence(std::memory_order_acquire); + return result; +} +VTKM_EXEC_CONT inline vtkm::UInt32 AtomicLoadImpl(const vtkm::UInt32* addr) +{ + auto result = *static_cast(addr); + std::atomic_thread_fence(std::memory_order_acquire); + return result; +} +VTKM_EXEC_CONT inline vtkm::UInt64 AtomicLoadImpl(const vtkm::UInt64* addr) +{ + auto result = *static_cast(addr); + std::atomic_thread_fence(std::memory_order_acquire); + return result; +} + +VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt8* addr, vtkm::UInt8 val) +{ + // There doesn't seem to be an atomic store instruction in the windows + // API, so just exchange and discard the result. + _InterlockedExchange8(reinterpret_cast(addr), BitCast(val)); +} +VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt16* addr, vtkm::UInt16 val) +{ + // There doesn't seem to be an atomic store instruction in the windows + // API, so just exchange and discard the result. 
+ _InterlockedExchange16(reinterpret_cast(addr), BitCast(val)); +} +VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt32* addr, vtkm::UInt32 val) +{ + std::atomic_thread_fence(std::memory_order_release); + *addr = val; +} +VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt64* addr, vtkm::UInt64 val) +{ + std::atomic_thread_fence(std::memory_order_release); + *addr = val; +} + +#define VTKM_ATOMIC_OPS_FOR_TYPE(vtkmType, winType, suffix) \ + VTKM_EXEC_CONT inline vtkmType AtomicAddImpl(vtkmType* addr, vtkmType arg) \ + { \ + return BitCast(_InterlockedExchangeAdd##suffix( \ + reinterpret_cast(addr), BitCast(arg))); \ + } \ + VTKM_EXEC_CONT inline vtkmType AtomicAndImpl(vtkmType* addr, vtkmType mask) \ + { \ + return BitCast( \ + _InterlockedAnd##suffix(reinterpret_cast(addr), BitCast(mask))); \ + } \ + VTKM_EXEC_CONT inline vtkmType AtomicOrImpl(vtkmType* addr, vtkmType mask) \ + { \ + return BitCast( \ + _InterlockedOr##suffix(reinterpret_cast(addr), BitCast(mask))); \ + } \ + VTKM_EXEC_CONT inline vtkmType AtomicXorImpl(vtkmType* addr, vtkmType mask) \ + { \ + return BitCast( \ + _InterlockedXor##suffix(reinterpret_cast(addr), BitCast(mask))); \ + } \ + VTKM_EXEC_CONT inline vtkmType AtomicNotImpl(vtkmType* addr) \ + { \ + return AtomicXorImpl(addr, static_cast(~vtkmType{ 0u })); \ + } \ + VTKM_EXEC_CONT inline vtkmType AtomicCompareAndSwapImpl( \ + vtkmType* addr, vtkmType expected, vtkmType desired) \ + { \ + return BitCast( \ + _InterlockedCompareExchange##suffix(reinterpret_cast(addr), \ + BitCast(desired), \ + BitCast(expected))); \ + } + +VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt8, CHAR, 8) +VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt16, SHORT, 16) +VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt32, LONG, ) +VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt64, LONG64, 64) + +#undef VTKM_ATOMIC_OPS_FOR_TYPE +} +} // namespace vtkm::detail + +#else // gcc/clang for CPU + +// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64 + +#include +#include +#include + +namespace vtkm +{ +namespace detail +{ + +template +VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr) +{ + return __atomic_load_n(addr, __ATOMIC_ACQUIRE); +} + +template +VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value) +{ + return __atomic_store_n(addr, value, __ATOMIC_RELEASE); +} + +template +VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg) +{ + return __atomic_fetch_add(addr, arg, __ATOMIC_SEQ_CST); +} + +template +VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask) +{ + return __atomic_fetch_and(addr, mask, __ATOMIC_SEQ_CST); +} + +template +VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask) +{ + return __atomic_fetch_or(addr, mask, __ATOMIC_SEQ_CST); +} + +template +VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask) +{ + return __atomic_fetch_xor(addr, mask, __ATOMIC_SEQ_CST); +} + +template +VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr) +{ + return AtomicXorImpl(addr, static_cast(~T{ 0u })); +} + +template +VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr, T expected, T desired) +{ + __atomic_compare_exchange_n(addr, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + return expected; +} +} +} // namespace vtkm::detail + +#endif // gcc/clang + + +namespace vtkm +{ + +/// \brief The preferred type to use for atomic operations. +/// +using AtomicTypePreferred = vtkm::UInt32; + +/// \brief A list of types that can be used with atomic operations. +/// +/// TODO: Adjust based on devices being compiled. 
+///
+/// BUG: vtkm::UInt64 is provided in this list even though it is not supported on CUDA
+/// before compute capability 3.5.
+///
+using AtomicTypesSupported = vtkm::List<vtkm::UInt32, vtkm::UInt64>;
+
+/// \brief Atomic function to load a value from a shared memory location.
+///
+/// Given a pointer, returns the value in that pointer. If other threads are writing to
+/// that same location, the returned value will be consistent with what was present before
+/// or after that write.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicLoad(const T* pointer)
+{
+  return detail::AtomicLoadImpl(pointer);
+}
+
+/// \brief Atomic function to save a value to a shared memory location.
+///
+/// Given a pointer and a value, stores that value at the pointer's location. If two
+/// threads are simultaneously using `AtomicStore` at the same location, the resulting
+/// value will be one of the values or the other (as opposed to a mix of bits).
+///
+template <typename T>
+VTKM_EXEC_CONT inline void AtomicStore(T* pointer, T value)
+{
+  detail::AtomicStoreImpl(pointer, value);
+}
+
+/// \brief Atomic function to add a value to a shared memory location.
+///
+/// Given a pointer and an operand, adds the operand to the value at the given memory
+/// location. The result of the addition is put into that memory location and the
+/// _old_ value that was originally in the memory is returned. For example, if you
+/// call `AtomicAdd` on a memory location that holds a 5 with an operand of 3, the
+/// value of 8 is stored in the memory location and the value of 5 is returned.
+///
+/// If multiple threads call `AtomicAdd` simultaneously, they will not interfere with
+/// each other. The result will be consistent as if one was called before the other
+/// (although it is indeterminate which will be applied first).
+///
+template <typename T, typename U>
+VTKM_EXEC_CONT inline T AtomicAdd(T* pointer, U operand)
+{
+  return detail::AtomicAddImpl(pointer, static_cast<T>(operand));
+}
+
+/// \brief Atomic function to AND bits to a shared memory location.
+///
+/// Given a pointer and an operand, performs a bitwise AND of the operand and the value at the given
+/// memory location. The result of the AND is put into that memory location and the _old_ value
+/// that was originally in the memory is returned. For example, if you call `AtomicAnd` on a memory
+/// location that holds a 0x6 with an operand of 0x3, the value of 0x2 is stored in the memory
+/// location and the value of 0x6 is returned.
+///
+/// If multiple threads call `AtomicAnd` simultaneously, they will not interfere with
+/// each other. The result will be consistent as if one was called before the other
+/// (although it is indeterminate which will be applied first).
+///
+template <typename T, typename U>
+VTKM_EXEC_CONT inline T AtomicAnd(T* pointer, U operand)
+{
+  return detail::AtomicAndImpl(pointer, static_cast<T>(operand));
+}
+
+/// \brief Atomic function to OR bits to a shared memory location.
+///
+/// Given a pointer and an operand, performs a bitwise OR of the operand and the value at the given
+/// memory location. The result of the OR is put into that memory location and the _old_ value
+/// that was originally in the memory is returned. For example, if you call `AtomicOr` on a memory
+/// location that holds a 0x6 with an operand of 0x3, the value of 0x7 is stored in the memory
+/// location and the value of 0x6 is returned.
+///
+/// If multiple threads call `AtomicOr` simultaneously, they will not interfere with
+/// each other. The result will be consistent as if one was called before the other
+/// (although it is indeterminate which will be applied first).
+///
+template <typename T, typename U>
+VTKM_EXEC_CONT inline T AtomicOr(T* pointer, U operand)
+{
+  return detail::AtomicOrImpl(pointer, static_cast<T>(operand));
+}
+
+/// \brief Atomic function to XOR bits to a shared memory location.
+///
+/// Given a pointer and an operand, performs a bitwise exclusive-OR of the operand and the value at
+/// the given memory location. The result of the XOR is put into that memory location and the _old_
+/// value that was originally in the memory is returned. For example, if you call `AtomicXor` on a
+/// memory location that holds a 0x6 with an operand of 0x3, the value of 0x5 is stored in the
+/// memory location and the value of 0x6 is returned.
+///
+/// If multiple threads call `AtomicXor` simultaneously, they will not interfere with
+/// each other. The result will be consistent as if one was called before the other.
+///
+template <typename T, typename U>
+VTKM_EXEC_CONT inline T AtomicXor(T* pointer, U operand)
+{
+  return detail::AtomicXorImpl(pointer, static_cast<T>(operand));
+}
+
+/// \brief Atomic function to NOT bits to a shared memory location.
+///
+/// Given a pointer, performs a bitwise NOT of the value at the given
+/// memory location. The result of the NOT is put into that memory location and the _old_ value
+/// that was originally in the memory is returned.
+///
+/// If multiple threads call `AtomicNot` simultaneously, they will not interfere with
+/// each other. The result will be consistent as if one was called before the other.
+///
+template <typename T>
+VTKM_EXEC_CONT inline T AtomicNot(T* pointer)
+{
+  return detail::AtomicNotImpl(pointer);
+}
+
+/// \brief Atomic function that replaces a value given a condition.
+///
+/// Given a pointer, an expected value, and a new desired value, replaces the value at the
+/// pointer if it is the same as the expected value with the new desired value. If the original
+/// value in the pointer does not equal the expected value, then the memory at the pointer
+/// remains unchanged. In either case, the function returns the _old_ original value that
+/// was at the pointer.
+///
+/// If multiple threads call `AtomicCompareAndSwap` simultaneously, the result will be consistent
+/// as if one was called before the other (although it is indeterminate which will be applied
+/// first).
+/// +template +VTKM_EXEC_CONT inline T AtomicCompareAndSwap(T* pointer, U expected, V desired) +{ + return detail::AtomicCompareAndSwapImpl( + pointer, static_cast(expected), static_cast(desired)); +} + +} // namespace vtkm + +#endif //vtk_m_Atomic_h diff --git a/vtkm/CMakeLists.txt b/vtkm/CMakeLists.txt index e6d54c639..531aafe14 100644 --- a/vtkm/CMakeLists.txt +++ b/vtkm/CMakeLists.txt @@ -19,6 +19,7 @@ vtkm_install_headers( set(headers Algorithms.h Assert.h + Atomic.h BinaryPredicates.h BinaryOperators.h Bitset.h diff --git a/vtkm/testing/CMakeLists.txt b/vtkm/testing/CMakeLists.txt index 6bdc3abe8..dbc547c66 100644 --- a/vtkm/testing/CMakeLists.txt +++ b/vtkm/testing/CMakeLists.txt @@ -49,6 +49,7 @@ set(unit_tests # Unit tests that have device-specific code to be tested set(unit_tests_device UnitTestAlgorithms.cxx + UnitTestAtomic.cxx UnitTestGeometry.cxx UnitTestMath.cxx ) diff --git a/vtkm/testing/UnitTestAtomic.cxx b/vtkm/testing/UnitTestAtomic.cxx new file mode 100644 index 000000000..83ff8a79d --- /dev/null +++ b/vtkm/testing/UnitTestAtomic.cxx @@ -0,0 +1,359 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +//============================================================================ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace +{ + +constexpr vtkm::Id ARRAY_SIZE = 100; + +template +struct AtomicTests +{ + vtkm::cont::Invoker Invoke; + + static constexpr vtkm::Id OVERLAP = sizeof(T) * CHAR_BIT; + static constexpr vtkm::Id EXTENDED_SIZE = ARRAY_SIZE * OVERLAP; + + VTKM_EXEC_CONT static T TestValue(vtkm::Id index) { return ::TestValue(index, T{}); } + + struct ArrayToRawPointer : vtkm::cont::ExecutionObjectBase + { + vtkm::cont::ArrayHandleBasic Array; + VTKM_CONT ArrayToRawPointer(const vtkm::cont::ArrayHandleBasic& array) + : Array(array) + { + } + + VTKM_CONT T* PrepareForExecution(vtkm::cont::DeviceAdapterId device, + vtkm::cont::Token& token) const + { + return reinterpret_cast(this->Array.GetBuffers()[0].WritePointerDevice(device, token)); + } + }; + + struct LoadFunctor : vtkm::worklet::WorkletMapField + { + using ControlSignature = void(FieldIn ignored, ExecObject); + using ExecutionSignature = void(WorkIndex, _2); + + VTKM_EXEC void operator()(vtkm::Id index, T* data) const + { + if (!test_equal(vtkm::AtomicLoad(data + index), TestValue(index))) + { + this->RaiseError("Bad AtomicLoad"); + } + } + }; + + VTKM_CONT void TestLoad() + { + std::cout << "AtomicLoad" << std::endl; + vtkm::cont::ArrayHandleBasic array; + array.Allocate(ARRAY_SIZE); + SetPortal(array.WritePortal()); + + this->Invoke(LoadFunctor{}, array, ArrayToRawPointer(array)); + } + + struct StoreFunctor : vtkm::worklet::WorkletMapField + { + using ControlSignature = void(FieldIn ignored, ExecObject); + using ExecutionSignature = void(WorkIndex, _2); + + VTKM_EXEC void operator()(vtkm::Id index, T* data) const + { + vtkm::AtomicStore(data + (index % ARRAY_SIZE), TestValue(index)); + } + }; + + VTKM_CONT void TestStore() + { + std::cout << "AtomicStore" << std::endl; + vtkm::cont::ArrayHandleBasic array; + array.Allocate(ARRAY_SIZE); + + this->Invoke( + StoreFunctor{}, 
vtkm::cont::ArrayHandleIndex(EXTENDED_SIZE), ArrayToRawPointer(array));
+
+    auto portal = array.ReadPortal();
+    for (vtkm::Id arrayIndex = 0; arrayIndex < ARRAY_SIZE; ++arrayIndex)
+    {
+      bool foundExpected = false;
+      T foundValue = portal.Get(arrayIndex);
+      for (vtkm::Id overlapIndex = 0; overlapIndex < OVERLAP; ++overlapIndex)
+      {
+        if (test_equal(foundValue, TestValue(arrayIndex + (overlapIndex * ARRAY_SIZE))))
+        {
+          foundExpected = true;
+          break;
+        }
+      }
+      VTKM_TEST_ASSERT(
+        foundExpected, "Wrong value (", foundValue, ") stored in index ", arrayIndex);
+    }
+  }
+
+  struct AddFunctor : vtkm::worklet::WorkletMapField
+  {
+    using ControlSignature = void(FieldIn ignored, ExecObject);
+    using ExecutionSignature = void(WorkIndex, _2);
+
+    VTKM_EXEC void operator()(vtkm::Id index, T* data) const
+    {
+      vtkm::AtomicAdd(data + (index % ARRAY_SIZE), 2);
+      vtkm::AtomicAdd(data + (index % ARRAY_SIZE), -1);
+    }
+  };
+
+  VTKM_CONT void TestAdd()
+  {
+    std::cout << "AtomicAdd" << std::endl;
+    vtkm::cont::ArrayHandleBasic<T> array;
+    vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleConstant(T(0), ARRAY_SIZE), array);
+    array.Allocate(ARRAY_SIZE);
+
+    this->Invoke(
+      AddFunctor{}, vtkm::cont::ArrayHandleIndex(EXTENDED_SIZE), ArrayToRawPointer(array));
+
+    auto portal = array.ReadPortal();
+    T expectedValue = T(OVERLAP);
+    for (vtkm::Id arrayIndex = 0; arrayIndex < ARRAY_SIZE; ++arrayIndex)
+    {
+      T foundValue = portal.Get(arrayIndex);
+      VTKM_TEST_ASSERT(test_equal(foundValue, expectedValue), foundValue, " != ", expectedValue);
+    }
+  }
+
+  struct AndFunctor : vtkm::worklet::WorkletMapField
+  {
+    using ControlSignature = void(FieldIn ignored, ExecObject);
+    using ExecutionSignature = void(WorkIndex, _2);
+
+    VTKM_EXEC void operator()(vtkm::Id index, T* data) const
+    {
+      vtkm::Id arrayIndex = index % ARRAY_SIZE;
+      vtkm::Id offsetIndex = index / ARRAY_SIZE;
+      vtkm::AtomicAnd(data + arrayIndex, ~(0x1u << offsetIndex));
+    }
+  };
+
+  VTKM_CONT void TestAnd()
+  {
+    std::cout << "AtomicAnd" << std::endl;
+    vtkm::cont::ArrayHandleBasic<T> array;
+    vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleConstant(T(-1), ARRAY_SIZE), array);
+    array.Allocate(ARRAY_SIZE);
+
+    this->Invoke(
+      AndFunctor{}, vtkm::cont::ArrayHandleIndex(EXTENDED_SIZE), ArrayToRawPointer(array));
+
+    auto portal = array.ReadPortal();
+    for (vtkm::Id arrayIndex = 0; arrayIndex < ARRAY_SIZE; ++arrayIndex)
+    {
+      T foundValue = portal.Get(arrayIndex);
+      VTKM_TEST_ASSERT(test_equal(foundValue, 0), foundValue, " != 0");
+    }
+  }
+
+  struct OrFunctor : vtkm::worklet::WorkletMapField
+  {
+    using ControlSignature = void(FieldIn ignored, ExecObject);
+    using ExecutionSignature = void(WorkIndex, _2);
+
+    VTKM_EXEC void operator()(vtkm::Id index, T* data) const
+    {
+      vtkm::Id arrayIndex = index % ARRAY_SIZE;
+      vtkm::Id offsetIndex = index / ARRAY_SIZE;
+      vtkm::AtomicOr(data + arrayIndex, 0x1u << offsetIndex);
+    }
+  };
+
+  VTKM_CONT void TestOr()
+  {
+    std::cout << "AtomicOr" << std::endl;
+    vtkm::cont::ArrayHandleBasic<T> array;
+    vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleConstant(T(0), ARRAY_SIZE), array);
+    array.Allocate(ARRAY_SIZE);
+
+    this->Invoke(
+      OrFunctor{}, vtkm::cont::ArrayHandleIndex(EXTENDED_SIZE), ArrayToRawPointer(array));
+
+    auto portal = array.ReadPortal();
+    T expectedValue = T(-1);
+    for (vtkm::Id arrayIndex = 0; arrayIndex < ARRAY_SIZE; ++arrayIndex)
+    {
+      T foundValue = portal.Get(arrayIndex);
+      VTKM_TEST_ASSERT(test_equal(foundValue, expectedValue), foundValue, " != ", expectedValue);
+    }
+  }
+
+  struct XorFunctor : vtkm::worklet::WorkletMapField
+  {
+    using ControlSignature = void(FieldIn ignored, ExecObject);
+    using ExecutionSignature = void(WorkIndex, _2);
+
+    VTKM_EXEC void operator()(vtkm::Id index, T* data) const
+    {
+      vtkm::Id arrayIndex = index % ARRAY_SIZE;
+      vtkm::Id offsetIndex = index / ARRAY_SIZE;
+      vtkm::AtomicXor(data + arrayIndex, 0x3u << offsetIndex);
+    }
+  };
+
+  VTKM_CONT void TestXor()
+  {
+    std::cout << "AtomicXor" << std::endl;
+    vtkm::cont::ArrayHandleBasic<T> array;
+    vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleConstant(T(0), ARRAY_SIZE), array);
+    array.Allocate(ARRAY_SIZE);
+
+    this->Invoke(
+      XorFunctor{}, vtkm::cont::ArrayHandleIndex(EXTENDED_SIZE), ArrayToRawPointer(array));
+
+    auto portal = array.ReadPortal();
+    T expectedValue = T(1);
+    for (vtkm::Id arrayIndex = 0; arrayIndex < ARRAY_SIZE; ++arrayIndex)
+    {
+      T foundValue = portal.Get(arrayIndex);
+      VTKM_TEST_ASSERT(test_equal(foundValue, expectedValue), foundValue, " != ", expectedValue);
+    }
+  }
+
+  struct NotFunctor : vtkm::worklet::WorkletMapField
+  {
+    using ControlSignature = void(FieldIn ignored, ExecObject);
+    using ExecutionSignature = void(WorkIndex, _2);
+
+    VTKM_EXEC void operator()(vtkm::Id index, T* data) const
+    {
+      vtkm::Id arrayIndex = index % ARRAY_SIZE;
+      vtkm::Id offsetIndex = index / ARRAY_SIZE;
+      if (offsetIndex < arrayIndex)
+      {
+        vtkm::AtomicNot(data + arrayIndex);
+      }
+    }
+  };
+
+  VTKM_CONT void TestNot()
+  {
+    std::cout << "AtomicNot" << std::endl;
+    vtkm::cont::ArrayHandleBasic<T> array;
+    vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleConstant(T(0xA), ARRAY_SIZE), array);
+    array.Allocate(ARRAY_SIZE);
+
+    this->Invoke(
+      NotFunctor{}, vtkm::cont::ArrayHandleIndex(EXTENDED_SIZE), ArrayToRawPointer(array));
+
+    auto portal = array.ReadPortal();
+    T expectedValue = T(0xA);
+    for (vtkm::Id arrayIndex = 0; arrayIndex < ARRAY_SIZE; ++arrayIndex)
+    {
+      T foundValue = portal.Get(arrayIndex);
+      VTKM_TEST_ASSERT(test_equal(foundValue, expectedValue), foundValue, " != ", expectedValue);
+      expectedValue = static_cast<T>(~expectedValue);
+    }
+  }
+
+  struct CompareAndSwapFunctor : vtkm::worklet::WorkletMapField
+  {
+    using ControlSignature = void(FieldIn ignored, ExecObject);
+    using ExecutionSignature = void(WorkIndex, _2);
+
+    VTKM_EXEC void operator()(vtkm::Id index, T* data) const
+    {
+      vtkm::Id arrayIndex = index % ARRAY_SIZE;
+      bool success = false;
+      for (T overlapIndex = 0; overlapIndex < static_cast<T>(OVERLAP); ++overlapIndex)
+      {
+        T oldValue = vtkm::AtomicCompareAndSwap(data + arrayIndex, overlapIndex, overlapIndex + 1);
+        if (oldValue == overlapIndex)
+        {
+          success = true;
+          break;
+        }
+      }
+
+      if (!success)
+      {
+        this->RaiseError("No compare succeeded");
+      }
+    }
+  };
+
+  VTKM_CONT void TestCompareAndSwap()
+  {
+    std::cout << "AtomicCompareAndSwap" << std::endl;
+    vtkm::cont::ArrayHandleBasic<T> array;
+    vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleConstant(T(0), ARRAY_SIZE), array);
+    array.Allocate(ARRAY_SIZE);
+
+    this->Invoke(
+      CompareAndSwapFunctor{}, vtkm::cont::ArrayHandleIndex(EXTENDED_SIZE), ArrayToRawPointer(array));
+
+    auto portal = array.ReadPortal();
+    T expectedValue = T(OVERLAP);
+    for (vtkm::Id arrayIndex = 0; arrayIndex < ARRAY_SIZE; ++arrayIndex)
+    {
+      T foundValue = portal.Get(arrayIndex);
+      VTKM_TEST_ASSERT(test_equal(foundValue, expectedValue), foundValue, " != ", expectedValue);
+    }
+  }
+
+  VTKM_CONT void TestAll()
+  {
+    TestLoad();
+    TestStore();
+    TestAdd();
+    TestAnd();
+    TestOr();
+    TestXor();
+    TestNot();
+    TestCompareAndSwap();
+  }
+};
+
+struct TestFunctor
+{
+  template <typename T>
+  VTKM_CONT void operator()(T) const
+  {
+    AtomicTests<T>().TestAll();
+  }
+};
+
+void Run()
+{
+  VTKM_TEST_ASSERT(vtkm::ListHas<vtkm::AtomicTypesSupported, vtkm::AtomicTypePreferred>::value);
+
+  vtkm::testing::Testing::TryTypes(TestFunctor{}, vtkm::AtomicTypesSupported{});
+}
+
+} // anonymous namespace
+
+int UnitTestAtomic(int argc, char* argv[])
+{
+  return vtkm::cont::testing::Testing::Run(Run, argc, argv);
+}
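
Note for reviewers: a minimal sketch of how the new free functions are intended to be used from a worklet, following the ExecObject-to-raw-pointer pattern used in UnitTestAtomic.cxx. The worklet name BinCountWorklet and its parameters are illustrative only and are not part of this patch; the patch itself only adds the vtkm::Atomic* functions and their tests.

    #include <vtkm/Atomic.h>
    #include <vtkm/worklet/WorkletMapField.h>

    // Hypothetical worklet that counts how many input values fall into each bin.
    // Because vtkm::AtomicAdd is a free function, the worklet needs no
    // device-specific AtomicInterfaceExecution template parameter; the
    // appropriate implementation is selected by the compiler directives in
    // vtkm/Atomic.h.
    struct BinCountWorklet : vtkm::worklet::WorkletMapField
    {
      using ControlSignature = void(FieldIn bin, ExecObject counts);
      using ExecutionSignature = void(_1, _2);

      VTKM_EXEC void operator()(vtkm::Id bin, vtkm::UInt32* counts) const
      {
        // Safe even when many threads increment the same bin concurrently.
        vtkm::AtomicAdd(counts + bin, vtkm::UInt32{ 1 });
      }
    };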