Merge branch 'master' of https://gitlab.kitware.com/vtk/vtk-m into ghost_streamlines

2020-10-21 12:01:31 -04:00 · 2020-10-21 12:01:31 -04:00 · ec37279b01
commit ec37279b01
parent 381886c947 0277ff1440
23 changed files with 294 additions and 196 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -32,9 +32,10 @@
 #     - tbb
 #     - openmp
 #     - mpich2
-#   * .gitlab/ci/docker/ubuntu1804/cuda10.1/
+#   * .gitlab/ci/docker/ubuntu1804/cuda11.1/
 #     - cuda
-#     - gcc 7.4
+#     - gcc 7
+#     - gcc 8
 #     - tbb
 #     - openmp
 #     - mpich2
@ -49,52 +50,52 @@
    GIT_CLONE_PATH: $CI_BUILDS_DIR/gitlab-kitware-sciviz-ci

 .centos7: &centos7
-  image: "kitware/vtkm:ci-centos7_cuda10.2-20200820"
+  image: "kitware/vtkm:ci-centos7_cuda10.2-20201016"
  extends:
    - .docker_image

 .centos8: &centos8
-  image: "kitware/vtkm:ci-centos8-20200820"
+  image: "kitware/vtkm:ci-centos8-20201016"
  extends:
    - .docker_image

 .rhel8: &rhel8
-  image: "kitware/vtkm:ci-rhel8_cuda10.2-20200820"
+  image: "kitware/vtkm:ci-rhel8_cuda10.2-20201016"
  extends:
    - .docker_image

 .ubuntu1604: &ubuntu1604
-  image: "kitware/vtkm:ci-ubuntu1604-20200820"
+  image: "kitware/vtkm:ci-ubuntu1604-20201016"
  extends:
    - .docker_image

 .ubuntu1604_cuda: &ubuntu1604_cuda
-  image: "kitware/vtkm:ci-ubuntu1604_cuda9.2-20200820"
+  image: "kitware/vtkm:ci-ubuntu1604_cuda9.2-20201016"
  extends:
    - .docker_image

 .ubuntu1804: &ubuntu1804
-  image: "kitware/vtkm:ci-ubuntu1804-20200820"
+  image: "kitware/vtkm:ci-ubuntu1804-20201016"
  extends:
    - .docker_image

 .ubuntu1804_cuda: &ubuntu1804_cuda
-  image: "kitware/vtkm:ci-ubuntu1804_cuda10.1-20200820"
+  image: "kitware/vtkm:ci-ubuntu1804_cuda11.1-20201016"
  extends:
    - .docker_image

 .ubuntu1804_cuda_kokkos: &ubuntu1804_cuda_kokkos
-  image: "kitware/vtkm:ci-ubuntu1804_cuda11_kokkos-20200820"
+  image: "kitware/vtkm:ci-ubuntu1804_cuda11_kokkos-20201016"
  extends:
    - .docker_image

 .ubuntu2004_doxygen: &ubuntu2004_doxygen
-  image: "kitware/vtkm:ci-doxygen-20200820"
+  image: "kitware/vtkm:ci-doxygen-20201016"
  extends:
    - .docker_image

 .ubuntu2004_kokkos: &ubuntu2004_kokkos
-  image: "kitware/vtkm:ci-ubuntu2004_kokkos-20200820"
+  image: "kitware/vtkm:ci-ubuntu2004_kokkos-20201016"
  extends:
    - .docker_image

--- a/.gitlab/ci/docker/ubuntu1804/cuda11.1/Dockerfile
+++ b/.gitlab/ci/docker/ubuntu1804/cuda11.1/Dockerfile
@ -1,10 +1,10 @@
-FROM nvidia/cuda:10.1-devel-ubuntu18.04
+FROM nvidia/cuda:11.1-devel-ubuntu18.04
 LABEL maintainer "Robert Maynard<robert.maynard@kitware.com>"

 # Base dependencies for building VTK-m projects
 RUN apt-get update && apt-get install -y --no-install-recommends \
      curl \
-      g++ \
+      g++-8 \
      clang-8 \
      git \
      git-lfs \
--- a/.gitlab/ci/docker/update_all.sh
+++ b/.gitlab/ci/docker/update_all.sh
@ -30,8 +30,8 @@ cd ubuntu1804/base
 sudo docker build -t kitware/vtkm:ci-ubuntu1804-$date .
 cd ../..

-cd ubuntu1804/cuda10.1
-sudo docker build -t kitware/vtkm:ci-ubuntu1804_cuda10.1-$date .
+cd ubuntu1804/cuda11.1
+sudo docker build -t kitware/vtkm:ci-ubuntu1804_cuda11.1-$date .
 cd ../..

 cd ubuntu1804/kokkos-cuda
--- a/.gitlab/ci/ubuntu1804.yml
+++ b/.gitlab/ci/ubuntu1804.yml
@ -38,7 +38,7 @@ test:ubuntu1804_gcc9:
    - build:ubuntu1804_gcc9

 # Build on ubuntu1804 with CUDA + MPI and test on ubuntu1804
-# Uses gcc 7.4
+# Uses gcc 7
 # Uses MPICH2
 build:ubuntu1804_gcc7:
  tags:
@ -55,6 +55,7 @@ build:ubuntu1804_gcc7:
  variables:
    CC: "gcc-7"
    CXX: "g++-7"
+    CUDAHOSTCXX: "g++-7"
    VTKM_SETTINGS: "cuda+turing+mpi+64bit_floats+no_virtual"

 test:ubuntu1804_gcc7:
--- a/README.md
+++ b/README.md
@ -76,8 +76,8 @@ VTK-m Requires:
 Optional dependencies are:

  + CUDA Device Adapter
-      + [Cuda Toolkit 9.2+](https://developer.nvidia.com/cuda-toolkit)
-      + Note CUDA >= 10.1 is required on Windows
+      + [Cuda Toolkit 9.2, >= 10.2](https://developer.nvidia.com/cuda-toolkit)
+      + Note CUDA >= 10.2 is required on Windows
  + TBB Device Adapter
      + [TBB](https://www.threadingbuildingblocks.org/)
  + OpenMP Device Adapter
@ -105,12 +105,12 @@ VTK-m has been tested on the following configurations:c
  + On Linux
      + GCC 4.8.5, 5.4, 6.5, 7.4, 8.2, 9.2; Clang 5, 8; Intel 17.0.4; 19.0.0
      + CMake 3.12, 3.13, 3.16, 3.17
-      + CUDA 9.2.148, 10.0.130, 10.1.105, 10.2.89
+      + CUDA 9.2, 10.2, 11.0, 11.1 
      + TBB 4.4 U2, 2017 U7
  + On Windows
      + Visual Studio 2015, 2017
      + CMake 3.12, 3.17
-      + CUDA 10.1
+      + CUDA 10.2
      + TBB 2017 U3, 2018 U2
  + On MacOS
      + AppleClang 9.1
--- a/benchmarking/BenchmarkAtomicArray.cxx
+++ b/benchmarking/BenchmarkAtomicArray.cxx
@ -260,7 +260,7 @@ VTKM_BENCHMARK_TEMPLATES_OPTS(
    ->ArgNames({ "Values", "Ops", "Stride" }),
  vtkm::cont::AtomicArrayTypeList);

-// Benchmarks AtomicArray::CompareAndSwap such that each work index writes to adjacent
+// Benchmarks AtomicArray::CompareExchange such that each work index writes to adjacent
 // indices.
 struct CASSeqWorker : public vtkm::worklet::WorkletMapField
 {
@ -273,12 +273,8 @@ struct CASSeqWorker : public vtkm::worklet::WorkletMapField
    const vtkm::Id idx = i % portal.GetNumberOfValues();
    const T val = static_cast<T>(i) + in;
    T oldVal = portal.Get(idx);
-    T assumed = static_cast<T>(0);
-    do
-    {
-      assumed = oldVal;
-      oldVal = portal.CompareAndSwap(idx, assumed + val, assumed);
-    } while (assumed != oldVal);
+    while (!portal.CompareExchange(idx, &oldVal, oldVal + val))
+      ;
  }
 };

@ -371,7 +367,7 @@ VTKM_BENCHMARK_TEMPLATES_OPTS(BenchCASSeqBaseline,
                                ->ArgNames({ "Values", "Ops" }),
                              vtkm::cont::AtomicArrayTypeList);

-// Benchmarks AtomicArray::CompareAndSwap such that each work index writes to
+// Benchmarks AtomicArray::CompareExchange such that each work index writes to
 // a strided index:
 // ( floor(i / stride) + stride * (i % stride)
 struct CASStrideWorker : public vtkm::worklet::WorkletMapField
@ -393,12 +389,8 @@ struct CASStrideWorker : public vtkm::worklet::WorkletMapField
    const vtkm::Id idx = (i / this->Stride + this->Stride * (i % this->Stride)) % numVals;
    const T val = static_cast<T>(i) + in;
    T oldVal = portal.Get(idx);
-    T assumed = static_cast<T>(0);
-    do
-    {
-      assumed = oldVal;
-      oldVal = portal.CompareAndSwap(idx, assumed + val, assumed);
-    } while (assumed != oldVal);
+    while (!portal.CompareExchange(idx, &oldVal, oldVal + val))
+      ;
  }
 };

--- a/vtkm/Algorithms.h
+++ b/vtkm/Algorithms.h
@ -90,7 +90,7 @@ VTKM_EXEC_CONT vtkm::Id BinarySearch(const PortalT& portal, const T& val)
 template <typename IterT, typename T, typename Comp>
 VTKM_EXEC_CONT IterT LowerBound(IterT first, IterT last, const T& val, Comp comp)
 {
-#ifdef VTKM_CUDA
+#if defined(VTKM_CUDA) || defined(VTKM_HIP)
  auto len = last - first;
  while (len != 0)
  {
@ -107,9 +107,9 @@ VTKM_EXEC_CONT IterT LowerBound(IterT first, IterT last, const T& val, Comp comp
    }
  }
  return first;
-#else  // VTKM_CUDA
+#else  // VTKM_CUDA || VTKM_HIP
  return std::lower_bound(first, last, val, std::move(comp));
-#endif // VTKM_CUDA
+#endif // VTKM_CUDA || VTKM_HIP
 }

 template <typename IterT, typename T>
@ -139,7 +139,7 @@ VTKM_EXEC_CONT vtkm::Id LowerBound(const PortalT& portal, const T& val)
 template <typename IterT, typename T, typename Comp>
 VTKM_EXEC_CONT IterT UpperBound(IterT first, IterT last, const T& val, Comp comp)
 {
-#ifdef VTKM_CUDA
+#if defined(VTKM_CUDA) || defined(VTKM_HIP)
  auto len = last - first;
  while (len != 0)
  {
@ -156,9 +156,9 @@ VTKM_EXEC_CONT IterT UpperBound(IterT first, IterT last, const T& val, Comp comp
    }
  }
  return first;
-#else  // VTKM_CUDA
+#else  // VTKM_CUDA || VTKM_HIP
  return std::upper_bound(first, last, val, std::move(comp));
-#endif // VTKM_CUDA
+#endif // VTKM_CUDA || VTKM_HIP
 }

 template <typename IterT, typename T>
--- a/vtkm/Atomic.h
+++ b/vtkm/Atomic.h
@ -142,9 +142,9 @@ VTKM_EXEC_CONT inline void AtomicLoadFence(vtkm::MemoryOrder order)
 }

 template <typename T>
-VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr, vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline T AtomicLoadImpl(T* const addr, vtkm::MemoryOrder order)
 {
-  const volatile T* vaddr = addr; /* volatile to bypass cache*/
+  volatile T* const vaddr = addr; /* volatile to bypass cache*/
  if (order == vtkm::MemoryOrder::SequentiallyConsistent)
  {
    __threadfence();
@ -207,15 +207,23 @@ VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
 }

 template <typename T>
-VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr,
-                                                 T desired,
-                                                 T expected,
-                                                 vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl(T* addr,
+                                                     T* expected,
+                                                     T desired,
+                                                     vtkm::MemoryOrder order)
 {
  AtomicStoreFence(order);
-  auto result = atomicCAS(addr, expected, desired);
+  auto result = atomicCAS(addr, *expected, desired);
  AtomicLoadFence(order);
-  return result;
+  if (result == *expected)
+  {
+    return true;
+  }
+  else
+  {
+    *expected = result;
+    return false;
+  }
 }
 }
 } // namespace vtkm::detail
@ -239,9 +247,14 @@ VTKM_THIRDPARTY_PRE_INCLUDE
 #if defined(KOKKOS_ENABLE_CUDA) && !defined(VTKM_CUDA)
 #undef KOKKOS_ENABLE_CUDA
 #endif
+
+#if defined(KOKKOS_ENABLE_HIP) && !defined(VTKM_HIP)
+#undef KOKKOS_ENABLE_HIP
+#endif
+
 #endif //KOKKOS_MACROS_HPP not loaded

-#include <Kokkos_Core.hpp>
+#include <Kokkos_Atomic.hpp>
 VTKM_THIRDPARTY_POST_INCLUDE

 namespace vtkm
@ -270,7 +283,7 @@ VTKM_EXEC_CONT inline void AtomicLoadFence(vtkm::MemoryOrder order)
 }

 template <typename T>
-VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr, vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline T AtomicLoadImpl(T* const addr, vtkm::MemoryOrder order)
 {
  switch (order)
  {
@ -350,15 +363,23 @@ VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
 }

 template <typename T>
-VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr,
-                                                 T desired,
-                                                 T expected,
-                                                 vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl(T* addr,
+                                                     T* expected,
+                                                     T desired,
+                                                     vtkm::MemoryOrder order)
 {
  AtomicStoreFence(order);
-  T result = Kokkos::atomic_compare_exchange(addr, expected, desired);
+  T oldValue = Kokkos::atomic_compare_exchange(addr, *expected, desired);
  AtomicLoadFence(order);
-  return result;
+  if (oldValue == *expected)
+  {
+    return true;
+  }
+  else
+  {
+    *expected = oldValue;
+    return false;
+  }
 }
 }
 } // namespace vtkm::detail
@ -407,35 +428,35 @@ VTKM_EXEC_CONT inline T BitCast(T&& src)
 //
 // https://docs.microsoft.com/en-us/windows/desktop/sync/interlocked-variable-access

-VTKM_EXEC_CONT inline vtkm::UInt8 AtomicLoadImpl(const vtkm::UInt8* addr, vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline vtkm::UInt8 AtomicLoadImpl(vtkm::UInt8* const addr, vtkm::MemoryOrder order)
 {
  // This assumes that the memory interface is smart enough to load a 32-bit
  // word atomically and a properly aligned 8-bit word from it.
  // We could build address masks and do shifts to perform this manually if
  // this assumption is incorrect.
-  auto result = *static_cast<volatile const vtkm::UInt8*>(addr);
+  auto result = *static_cast<volatile vtkm::UInt8* const>(addr);
  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
  return result;
 }
-VTKM_EXEC_CONT inline vtkm::UInt16 AtomicLoadImpl(const vtkm::UInt16* addr, vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline vtkm::UInt16 AtomicLoadImpl(vtkm::UInt16* const addr, vtkm::MemoryOrder order)
 {
  // This assumes that the memory interface is smart enough to load a 32-bit
  // word atomically and a properly aligned 16-bit word from it.
  // We could build address masks and do shifts to perform this manually if
  // this assumption is incorrect.
-  auto result = *static_cast<volatile const vtkm::UInt16*>(addr);
+  auto result = *static_cast<volatile vtkm::UInt16* const>(addr);
  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
  return result;
 }
-VTKM_EXEC_CONT inline vtkm::UInt32 AtomicLoadImpl(const vtkm::UInt32* addr, vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline vtkm::UInt32 AtomicLoadImpl(vtkm::UInt32* const addr, vtkm::MemoryOrder order)
 {
-  auto result = *static_cast<volatile const vtkm::UInt32*>(addr);
+  auto result = *static_cast<volatile vtkm::UInt32* const>(addr);
  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
  return result;
 }
-VTKM_EXEC_CONT inline vtkm::UInt64 AtomicLoadImpl(const vtkm::UInt64* addr, vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline vtkm::UInt64 AtomicLoadImpl(vtkm::UInt64* const addr, vtkm::MemoryOrder order)
 {
-  auto result = *static_cast<volatile const vtkm::UInt64*>(addr);
+  auto result = *static_cast<volatile vtkm::UInt64* const>(addr);
  std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
  return result;
 }
@ -487,13 +508,22 @@ VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt64* addr,
  {                                                                                     \
    return AtomicXorImpl(addr, static_cast<vtkmType>(~vtkmType{ 0u }), order);          \
  }                                                                                     \
-  VTKM_EXEC_CONT inline vtkmType AtomicCompareAndSwapImpl(                              \
-    vtkmType* addr, vtkmType desired, vtkmType expected, vtkm::MemoryOrder order)       \
+  VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl(                                 \
+    vtkmType* addr, vtkmType* expected, vtkmType desired, vtkm::MemoryOrder order)      \
  {                                                                                     \
-    return BitCast<vtkmType>(                                                           \
+    vtkmType result = BitCast<vtkmType>(                                                \
      _InterlockedCompareExchange##suffix(reinterpret_cast<volatile winType*>(addr),    \
                                          BitCast<winType>(desired),                    \
-                                          BitCast<winType>(expected)));                 \
+                                          BitCast<winType>(*expected)));                \
+    if (result == *expected)                                                            \
+    {                                                                                   \
+      return true;                                                                      \
+    }                                                                                   \
+    else                                                                                \
+    {                                                                                   \
+      *expected = result;                                                               \
+      return false;                                                                     \
+    }                                                                                   \
  }

 VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt8, CHAR, 8)
@ -538,7 +568,7 @@ VTKM_EXEC_CONT inline int GccAtomicMemOrder(vtkm::MemoryOrder order)
 }

 template <typename T>
-VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr, vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline T AtomicLoadImpl(T* const addr, vtkm::MemoryOrder order)
 {
  return __atomic_load_n(addr, GccAtomicMemOrder(order));
 }
@ -580,14 +610,13 @@ VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
 }

 template <typename T>
-VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr,
-                                                 T desired,
-                                                 T expected,
-                                                 vtkm::MemoryOrder order)
+VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl(T* addr,
+                                                     T* expected,
+                                                     T desired,
+                                                     vtkm::MemoryOrder order)
 {
-  __atomic_compare_exchange_n(
-    addr, &expected, desired, false, GccAtomicMemOrder(order), GccAtomicMemOrder(order));
-  return expected;
+  return __atomic_compare_exchange_n(
+    addr, expected, desired, false, GccAtomicMemOrder(order), GccAtomicMemOrder(order));
 }
 }
 } // namespace vtkm::detail
@ -627,7 +656,7 @@ using AtomicTypesSupported = vtkm::List<vtkm::UInt32, vtkm::UInt64>;
 /// or after that write.
 ///
 template <typename T>
-VTKM_EXEC_CONT inline T AtomicLoad(const T* pointer,
+VTKM_EXEC_CONT inline T AtomicLoad(T* const pointer,
                                   vtkm::MemoryOrder order = vtkm::MemoryOrder::Acquire)
 {
  return detail::AtomicLoadImpl(pointer, order);
@ -796,24 +825,34 @@ VTKM_EXEC_CONT inline T AtomicNot(

 /// \brief Atomic function that replaces a value given a condition.
 ///
-/// Given a pointer, a new desired value, and an expected value, replaces the value at the
-/// pointer if it is the same as the expected value with the new desired value. If the original
-/// value in the pointer does not equal the expected value, then the memory at the pointer
-/// remains unchanged. In either case, the function returns the _old_ original value that
-/// was at the pointer.
+/// Given a pointer to a `shared` value, a pointer holding the `expected` value at that shared
+/// location, and a new `desired` value, `AtomicCompareExchange` compares the existing `shared`
+/// value to the `expected` value, and then conditionally replaces the `shared` value with
+/// the provided `desired` value. Otherwise, the `expected` value gets replaced with the
+/// `shared` value. Note that in either case, the function returns with `expected` replaced
+/// with the value _originally_ in `shared` at the start of the call.
 ///
-/// If multiple threads call `AtomicCompareAndSwap` simultaneously, the result will be consistent
-/// as if one was called before the other (although it is indeterminate which will be applied
-/// first).
+/// If the `shared` value and `expected` value are the same, then `shared` gets set to
+/// `desired`, and `AtomicCompareAndExchange` returns `true`.
+///
+/// If the `shared` value and `expected` value are different, then `expected` gets set
+/// to `shared`, and `AtomicCompareAndExchange` returns `false`. The value at `shared`
+/// is _not_ changed in this case.
+///
+/// If multiple threads call `AtomicCompareExchange` simultaneously with the same `shared`
+/// pointer, the result will be consistent as if one was called before the other (although
+/// it is indeterminate which will be applied first). Note that the `expected` pointer should
+/// _not_ be shared among threads. The `expected` pointer should be thread-local (often
+/// pointing to an object on the stack).
 ///
 template <typename T>
-VTKM_EXEC_CONT inline T AtomicCompareAndSwap(
-  T* pointer,
+VTKM_EXEC_CONT inline T AtomicCompareExchange(
+  T* shared,
+  T* expected,
  T desired,
-  T expected,
  vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
 {
-  return detail::AtomicCompareAndSwapImpl(pointer, desired, expected, order);
+  return detail::AtomicCompareExchangeImpl(shared, expected, desired, order);
 }

 } // namespace vtkm
--- a/vtkm/Math.h
+++ b/vtkm/Math.h
@ -2381,7 +2381,8 @@ static inline VTKM_EXEC_CONT vtkm::Float32 RemainderQuotient(vtkm::Float32 numer
                                                             QType& quotient)
 {
  int iQuotient;
-#ifdef VTKM_CUDA
+  // See: https://github.com/ROCm-Developer-Tools/HIP/issues/2169
+#if defined(VTKM_CUDA) || defined(VTKM_HIP)
  const vtkm::Float32 result =
    VTKM_CUDA_MATH_FUNCTION_32(remquo)(numerator, denominator, &iQuotient);
 #else
@ -2411,11 +2412,20 @@ static inline VTKM_EXEC_CONT vtkm::Float64 RemainderQuotient(vtkm::Float64 numer
 ///
 static inline VTKM_EXEC_CONT vtkm::Float32 ModF(vtkm::Float32 x, vtkm::Float32& integral)
 {
+  // See: https://github.com/ROCm-Developer-Tools/HIP/issues/2169
+#if defined(VTKM_CUDA) || defined(VTKM_HIP)
+  return VTKM_CUDA_MATH_FUNCTION_32(modf)(x, &integral);
+#else
  return std::modf(x, &integral);
+#endif
 }
 static inline VTKM_EXEC_CONT vtkm::Float64 ModF(vtkm::Float64 x, vtkm::Float64& integral)
 {
+#if defined(VTKM_CUDA)
+  return VTKM_CUDA_MATH_FUNCTION_64(modf)(x, &integral);
+#else
  return std::modf(x, &integral);
+#endif
 }

 //-----------------------------------------------------------------------------
--- a/vtkm/Math.h.in
+++ b/vtkm/Math.h.in
@ -1011,7 +1011,8 @@ static inline VTKM_EXEC_CONT vtkm::Float32 RemainderQuotient(vtkm::Float32 numer
                                                             QType& quotient)
 {
  int iQuotient;
-#ifdef VTKM_CUDA
+  // See: https://github.com/ROCm-Developer-Tools/HIP/issues/2169
+#if defined(VTKM_CUDA) || defined(VTKM_HIP)
  const vtkm::Float32 result =
    VTKM_CUDA_MATH_FUNCTION_32(remquo)(numerator, denominator, &iQuotient);
 #else
@ -1041,11 +1042,20 @@ static inline VTKM_EXEC_CONT vtkm::Float64 RemainderQuotient(vtkm::Float64 numer
 ///
 static inline VTKM_EXEC_CONT vtkm::Float32 ModF(vtkm::Float32 x, vtkm::Float32& integral)
 {
+  // See: https://github.com/ROCm-Developer-Tools/HIP/issues/2169
+#if defined(VTKM_CUDA) || defined(VTKM_HIP)
+  return VTKM_CUDA_MATH_FUNCTION_32(modf)(x, &integral);
+#else
  return std::modf(x, &integral);
+#endif
 }
 static inline VTKM_EXEC_CONT vtkm::Float64 ModF(vtkm::Float64 x, vtkm::Float64& integral)
 {
+#if defined(VTKM_CUDA)
+  return VTKM_CUDA_MATH_FUNCTION_64(modf)(x, &integral);
+#else
  return std::modf(x, &integral);
+#endif
 }

 //-----------------------------------------------------------------------------
--- a/vtkm/cont/BitField.h
+++ b/vtkm/cont/BitField.h
@ -430,56 +430,74 @@ public:
  }

  /// Perform an atomic compare-and-swap operation on the bit at @a bitIdx.
-  /// If the value in memory is equal to @a expectedBit, it is replaced with
-  /// the value of @a newBit and the original value of the bit is returned as a
-  /// boolean. This method implements a full memory barrier around the atomic
+  /// If the value in memory is equal to @a oldBit, it is replaced with
+  /// the value of @a newBit and true is returned. If the value in memory is
+  /// not equal to @oldBit, @oldBit is changed to that value and false is
+  /// returned. This method implements a full memory barrier around the atomic
  /// operation.
  VTKM_EXEC_CONT
-  bool CompareAndSwapBitAtomic(vtkm::Id bitIdx, bool newBit, bool expectedBit) const
+  bool CompareExchangeBitAtomic(vtkm::Id bitIdx, bool* oldBit, bool newBit) const
  {
    VTKM_STATIC_ASSERT_MSG(!IsConst, "Attempt to modify const BitField portal.");
    using WordType = WordTypePreferred;
    const auto coord = this->GetBitCoordinateFromIndex<WordType>(bitIdx);
    const auto bitmask = WordType(1) << coord.BitOffset;

-    WordType oldWord;
-    WordType newWord;
+    WordType oldWord = this->GetWord<WordType>(coord.WordIndex);
    do
    {
-      oldWord = this->GetWord<WordType>(coord.WordIndex);
-      bool oldBitSet = (oldWord & bitmask) != WordType(0);
-      if (oldBitSet != expectedBit)
+      bool actualBit = (oldWord & bitmask) != WordType(0);
+      if (actualBit != *oldBit)
      { // The bit-of-interest does not match what we expected.
-        return oldBitSet;
+        *oldBit = actualBit;
+        return false;
      }
-      else if (oldBitSet == newBit)
+      else if (actualBit == newBit)
      { // The bit hasn't changed, but also already matches newVal. We're done.
-        return expectedBit;
+        return true;
      }

-      // Compute the new word
-      newWord = oldWord ^ bitmask;
-    } // CAS loop to resolve any conflicting changes to other bits in the word.
-    while (this->CompareAndSwapWordAtomic(coord.WordIndex, newWord, oldWord) != oldWord);
+      // Attempt to update the word with a compare-exchange in the loop condition.
+      // If the old word changed since last queried, oldWord will get updated and
+      // the loop will continue until it succeeds.
+    } while (!this->CompareExchangeWordAtomic(coord.WordIndex, &oldWord, oldWord ^ bitmask));

+    return true;
+  }
+
+  VTKM_DEPRECATED(1.6, "Use CompareExchangeBitAtomic. (Note the changed interface.)")
+  VTKM_EXEC_CONT bool CompareAndSwapBitAtomic(vtkm::Id bitIdx, bool newBit, bool expectedBit) const
+  {
+    this->CompareExchangeBitAtomic(bitIdx, &expectedBit, newBit);
    return expectedBit;
  }

-  /// Perform an atomic compare-and-swap operation on the word at @a wordIdx.
-  /// If the word in memory is equal to @a expectedWord, it is replaced with
-  /// the value of @a newWord and the original word is returned. This method
-  /// implements a full memory barrier around the atomic operation.
+  /// Perform an atomic compare-exchange operation on the word at @a wordIdx.
+  /// If the word in memory is equal to @a oldWord, it is replaced with
+  /// the value of @a newWord and true returned. If the word in memory is not
+  /// equal to @oldWord, @oldWord is set to the word in memory and false is
+  /// returned. This method implements a full memory barrier around the atomic
+  /// operation.
  template <typename WordType = WordTypePreferred>
-  VTKM_EXEC_CONT WordType CompareAndSwapWordAtomic(vtkm::Id wordIdx,
-                                                   WordType newWord,
-                                                   WordType expected) const
+  VTKM_EXEC_CONT bool CompareExchangeWordAtomic(vtkm::Id wordIdx,
+                                                WordType* oldWord,
+                                                WordType newWord) const
  {
    VTKM_STATIC_ASSERT_MSG(!IsConst, "Attempt to modify const BitField portal.");
    VTKM_STATIC_ASSERT_MSG(IsValidWordTypeAtomic<WordType>::value,
                           "Requested WordType does not support atomic"
                           " operations on target execution platform.");
    WordType* addr = this->GetWordAddress<WordType>(wordIdx);
-    return vtkm::AtomicCompareAndSwap(addr, newWord, expected);
+    return vtkm::AtomicCompareExchange(addr, oldWord, newWord);
+  }
+
+  template <typename WordType = WordTypePreferred>
+  VTKM_DEPRECATED(1.6, "Use CompareExchangeWordAtomic. (Note the changed interface.)")
+  VTKM_EXEC_CONT WordType
+    CompareAndSwapWordAtomic(vtkm::Id wordIdx, WordType newWord, WordType expected) const
+  {
+    this->CompareExchangeWordAtomic(wordIdx, &expected, newWord);
+    return expected;
  }

 private:
--- a/vtkm/cont/internal/AtomicInterfaceControl.h
+++ b/vtkm/cont/internal/AtomicInterfaceControl.h
@ -70,7 +70,8 @@ struct VTKM_DEPRECATED(1.6, "Use the functions in vtkm/Atomic.h.") AtomicInterfa
  template <typename T>
  VTKM_EXEC_CONT static T CompareAndSwap(T* addr, T newWord, T expected)
  {
-    return vtkm::AtomicCompareAndSwap(addr, newWord, expected);
+    vtkm::AtomicCompareExchange(addr, &expected, newWord);
+    return expected;
  }
 };
 }
--- a/vtkm/cont/internal/AtomicInterfaceExecution.h
+++ b/vtkm/cont/internal/AtomicInterfaceExecution.h
@ -71,7 +71,8 @@ struct VTKM_DEPRECATED(1.6, "Use the functions in vtkm/Atomic.h.") AtomicInterfa
  template <typename T>
  VTKM_EXEC_CONT static T CompareAndSwap(T* addr, T newWord, T expected)
  {
-    return vtkm::AtomicCompareAndSwap(addr, newWord, expected);
+    vtkm::AtomicCompareExchange(addr, &expected, newWord);
+    return expected;
  }
 };
 }
--- a/vtkm/cont/internal/FunctorsGeneral.h
+++ b/vtkm/cont/internal/FunctorsGeneral.h
@ -657,6 +657,23 @@ VTKM_EXEC static inline vtkm::Id IteratorDistanceImpl(const Iterator& from,
  return static_cast<vtkm::Id>(to - from);
 }

+#if defined(VTKM_HIP)
+
+template <typename Iterator>
+__host__ static inline vtkm::Id IteratorDistance(const Iterator& from, const Iterator& to)
+{
+  return static_cast<vtkm::Id>(std::distance(from, to));
+}
+
+template <typename Iterator>
+__device__ static inline vtkm::Id IteratorDistance(const Iterator& from, const Iterator& to)
+{
+  return IteratorDistanceImpl(
+    from, to, typename std::iterator_traits<Iterator>::iterator_category{});
+}
+
+#else
+
 template <typename Iterator>
 VTKM_EXEC static inline vtkm::Id IteratorDistance(const Iterator& from, const Iterator& to)
 {
@ -668,6 +685,8 @@ VTKM_EXEC static inline vtkm::Id IteratorDistance(const Iterator& from, const It
 #endif
 }

+#endif
+
 template <class InputPortalType, class ValuesPortalType, class OutputPortalType>
 struct LowerBoundsKernel
 {
--- a/vtkm/cont/testing/TestingBitField.h
+++ b/vtkm/cont/testing/TestingBitField.h
@ -203,11 +203,19 @@ struct TestingBitField
    DEVICE_ASSERT(testValues("XorBitAtomic"));

    const auto notBit = !bit;
-    bool casResult = portal.CompareAndSwapBitAtomic(i, bit, notBit);
-    DEVICE_ASSERT(casResult == bit);
+    // A compare-exchange that should fail
+    auto expectedBit = notBit;
+    bool cxResult = portal.CompareExchangeBitAtomic(i, &expectedBit, bit);
+    DEVICE_ASSERT(!cxResult);
+    DEVICE_ASSERT(expectedBit != notBit);
+    DEVICE_ASSERT(portal.GetBit(i) == expectedBit);
    DEVICE_ASSERT(portal.GetBit(i) == bit);
-    casResult = portal.CompareAndSwapBitAtomic(i, notBit, bit);
-    DEVICE_ASSERT(casResult == bit);
+
+    // A compare-exchange that should succeed.
+    expectedBit = bit;
+    cxResult = portal.CompareExchangeBitAtomic(i, &expectedBit, notBit);
+    DEVICE_ASSERT(cxResult);
+    DEVICE_ASSERT(expectedBit == bit);
    DEVICE_ASSERT(portal.GetBit(i) == notBit);

    return true;
@ -258,12 +266,20 @@ struct TestingBitField
    portal.XorWordAtomic(i, mod);
    DEVICE_ASSERT(testValues("XorWordAtomic"));

+    // Compare-exchange that should fail
    const WordType notWord = static_cast<WordType>(~word);
-    auto casResult = portal.CompareAndSwapWordAtomic(i, word, notWord);
-    DEVICE_ASSERT(casResult == word);
+    WordType expectedWord = notWord;
+    bool cxResult = portal.CompareExchangeWordAtomic(i, &expectedWord, word);
+    DEVICE_ASSERT(!cxResult);
+    DEVICE_ASSERT(expectedWord != notWord);
+    DEVICE_ASSERT(portal.template GetWord<WordType>(i) == expectedWord);
    DEVICE_ASSERT(portal.template GetWord<WordType>(i) == word);
-    casResult = portal.CompareAndSwapWordAtomic(i, notWord, word);
-    DEVICE_ASSERT(casResult == word);
+
+    // Compare-exchange that should succeed
+    expectedWord = word;
+    cxResult = portal.CompareExchangeWordAtomic(i, &expectedWord, notWord);
+    DEVICE_ASSERT(cxResult);
+    DEVICE_ASSERT(expectedWord == word);
    DEVICE_ASSERT(portal.template GetWord<WordType>(i) == notWord);

    return true;
--- a/vtkm/cont/testing/TestingDeviceAdapter.h
+++ b/vtkm/cont/testing/TestingDeviceAdapter.h
@ -327,14 +327,9 @@ public:
      T value = (T)index;
      //Get the old value from the array
      T oldValue = this->AArray.Get(0);
-      //This creates an atomic add using the CAS operatoin
-      T assumed = T(0);
-      do
-      {
-        assumed = oldValue;
-        oldValue = this->AArray.CompareAndSwap(0, (assumed + value), assumed);
-
-      } while (assumed != oldValue);
+      //Use atomic compare-exchange to atomically add value
+      while (!this->AArray.CompareExchange(0, &oldValue, oldValue + value))
+        ;
    }

    VTKM_CONT void SetErrorMessageBuffer(const vtkm::exec::internal::ErrorMessageBuffer&) {}
--- a/vtkm/exec/AtomicArrayExecutionObject.h
+++ b/vtkm/exec/AtomicArrayExecutionObject.h
@ -106,7 +106,7 @@ public:
    // We'll cast the signed types to unsigned to work around this.
    using APIType = typename detail::MakeUnsigned<ValueType>::type;

-    return static_cast<T>(vtkm::AtomicLoad(reinterpret_cast<const APIType*>(this->Data + index)));
+    return static_cast<T>(vtkm::AtomicLoad(reinterpret_cast<APIType*>(this->Data + index)));
  }

  /// \brief Peform an atomic addition with sequentially consistent memory
@ -158,47 +158,42 @@ public:
    vtkm::AtomicStore(reinterpret_cast<APIType*>(this->Data + index), static_cast<APIType>(value));
  }

-  /// \brief Perform an atomic CAS operation with sequentially consistent
+  /// \brief Perform an atomic compare and exchange operation with sequentially consistent
  /// memory ordering.
  /// \param index The index of the array element that will be atomically
  /// modified.
+  /// \param oldValue A pointer to the expected value of the indexed element.
  /// \param newValue The value to replace the indexed element with.
-  /// \param oldValue The expected value of the indexed element.
-  /// \return If the operation is successful, \a oldValue is returned. Otherwise
-  /// the current value of the indexed element is returned, and the element is
-  /// not modified.
+  /// \return If the operation is successful, \a true is returned. Otherwise,
+  /// \a oldValue is replaced with the current value of the indexed element,
+  /// the element is not modified, and \a false is returned. In either case, \a oldValue
+  /// becomes the value that was originally in the indexed element.
  ///
  /// This operation is typically used in a loop. For example usage,
-  /// an atomic multiplication may be implemented using CAS as follows:
+  /// an atomic multiplication may be implemented using compare-exchange as follows:
  ///
  /// ```
  /// AtomicArrayExecutionObject<vtkm::Int32, ...> arr = ...;
  ///
-  /// // CAS multiplication:
-  /// vtkm::Int32 cur = arr->Get(idx); // Load the current value at idx
-  /// vtkm::Int32 newVal; // will hold the result of the multiplication
-  /// vtkm::Int32 expect; // will hold the expected value before multiplication
+  /// // Compare-exchange multiplication:
+  /// vtkm::Int32 current = arr->Get(idx); // Load the current value at idx
  /// do {
-  ///   expect = cur; // Used to ensure the value hasn't changed since reading
-  ///   newVal = cur * multFactor; // the actual multiplication
-  /// }
-  /// while ((cur = arr->CompareAndSwap(idx, newVal, expect)) == expect);
+  ///   vtkm::Int32 newVal = current * multFactor; // the actual multiplication
+  /// } while (!arr->CompareExchange(idx, &current, newVal));
  /// ```
  ///
-  /// The while condition here updates \a cur with the pre-CAS value of the
-  /// operation (the return from CompareAndSwap), and compares this to the
-  /// expected value. If the values match, the operation was successful and the
+  /// The while condition here updates \a newVal what the proper multiplication
+  /// is given the expected current value. It then compares this to the
+  /// value in the array. If the values match, the operation was successful and the
  /// loop exits. If the values do not match, the value at \a idx was changed
-  /// by another thread since the initial Get, and the CAS operation failed --
-  /// the target element was not modified by the CAS call. If this happens, the
-  /// loop body re-executes using the new value of \a cur and tries again until
+  /// by another thread since the initial Get, and the compare-exchange operation failed --
+  /// the target element was not modified by the compare-exchange call. If this happens, the
+  /// loop body re-executes using the new value of \a current and tries again until
  /// it succeeds.
  ///
  VTKM_SUPPRESS_EXEC_WARNINGS
  VTKM_EXEC
-  ValueType CompareAndSwap(vtkm::Id index,
-                           const ValueType& newValue,
-                           const ValueType& oldValue) const
+  bool CompareExchange(vtkm::Id index, ValueType* oldValue, const ValueType& newValue) const
  {
    // We only support 32/64 bit signed/unsigned ints, and vtkm::Atomic
    // currently only provides API for unsigned types.
@ -207,9 +202,18 @@ public:
    // is how overflow works, and signed overflow is already undefined.
    using APIType = typename detail::MakeUnsigned<ValueType>::type;

-    return static_cast<T>(vtkm::AtomicCompareAndSwap(reinterpret_cast<APIType*>(this->Data + index),
-                                                     static_cast<APIType>(newValue),
-                                                     static_cast<APIType>(oldValue)));
+    return vtkm::AtomicCompareExchange(reinterpret_cast<APIType*>(this->Data + index),
+                                       reinterpret_cast<APIType*>(oldValue),
+                                       static_cast<APIType>(newValue));
+  }
+
+  VTKM_DEPRECATED(1.6, "Use CompareExchange. (Note the changed interface.)")
+  VTKM_EXEC ValueType CompareAndSwap(vtkm::Id index,
+                                     const ValueType& newValue,
+                                     ValueType oldValue) const
+  {
+    this->CompareExchange(index, &oldValue, newValue);
+    return oldValue;
  }

 private:
--- a/vtkm/filter/GhostCellRemove.hxx
+++ b/vtkm/filter/GhostCellRemove.hxx
@ -106,21 +106,21 @@ public:
  template <typename Atomic>
  VTKM_EXEC void Max(Atomic& atom, const vtkm::Id& val, const vtkm::Id& index) const
  {
-    vtkm::Id old = -1;
-    do
+    vtkm::Id old = atom.Get(index);
+    while (old < val)
    {
-      old = atom.CompareAndSwap(index, val, old);
-    } while (old < val);
+      atom.CompareExchange(index, &old, val);
+    }
  }

  template <typename Atomic>
  VTKM_EXEC void Min(Atomic& atom, const vtkm::Id& val, const vtkm::Id& index) const
  {
-    vtkm::Id old = 1000000000;
-    do
+    vtkm::Id old = atom.Get(index);
+    while (old > val)
    {
-      old = atom.CompareAndSwap(index, val, old);
-    } while (old > val);
+      atom.CompareExchange(index, &old, val);
+    }
  }

  template <typename T, typename AtomicType>
--- a/vtkm/rendering/Wireframer.h
+++ b/vtkm/rendering/Wireframer.h
@ -362,7 +362,7 @@ private:
      blendedColor[2] = color[2] * intensity + srcColor[2] * alpha;
      blendedColor[3] = alpha + intensity;
      next.Ints.Color = PackColor(blendedColor);
-      current.Raw = FrameBuffer.CompareAndSwap(index, next.Raw, current.Raw);
+      FrameBuffer.CompareExchange(index, &current.Raw, next.Raw);
    } while (current.Floats.Depth > next.Floats.Depth);
  }

--- a/vtkm/testing/UnitTestAtomic.cxx
+++ b/vtkm/testing/UnitTestAtomic.cxx
@ -277,7 +277,7 @@ struct AtomicTests
    }
  }

-  struct CompareAndSwapFunctor : vtkm::worklet::WorkletMapField
+  struct CompareExchangeFunctor : vtkm::worklet::WorkletMapField
  {
    using ControlSignature = void(FieldIn ignored, ExecObject);
    using ExecutionSignature = void(WorkIndex, _2);
@ -288,8 +288,8 @@ struct AtomicTests
      bool success = false;
      for (T overlapIndex = 0; overlapIndex < static_cast<T>(OVERLAP); ++overlapIndex)
      {
-        T oldValue = vtkm::AtomicCompareAndSwap(data + arrayIndex, overlapIndex + 1, overlapIndex);
-        if (oldValue == overlapIndex)
+        T expectedValue = overlapIndex;
+        if (vtkm::AtomicCompareExchange(data + arrayIndex, &expectedValue, overlapIndex + 1))
        {
          success = true;
          break;
@ -303,9 +303,9 @@ struct AtomicTests
    }
  };

-  VTKM_CONT void TestCompareAndSwap()
+  VTKM_CONT void TestCompareExchange()
  {
-    std::cout << "AtomicCompareAndSwap" << std::endl;
+    std::cout << "AtomicCompareExchange" << std::endl;
    vtkm::cont::ArrayHandleBasic<T> array;
    vtkm::cont::ArrayCopy(vtkm::cont::make_ArrayHandleConstant<T>(0, ARRAY_SIZE), array);
    array.Allocate(ARRAY_SIZE);
@ -331,7 +331,7 @@ struct AtomicTests
    TestOr();
    TestXor();
    TestNot();
-    TestCompareAndSwap();
+    TestCompareExchange();
  }
 };

--- a/vtkm/worklet/OrientPointNormals.h
+++ b/vtkm/worklet/OrientPointNormals.h
@ -151,9 +151,8 @@ public:
      for (vtkm::IdComponent c = 0; c < cells.GetNumberOfComponents(); ++c)
      {
        const vtkm::Id cellId = cells[c];
-        const bool alreadyVisited = visitedCells.CompareAndSwapBitAtomic(cellId, true, false);
-
-        if (!alreadyVisited)
+        bool checkNotVisited = false;
+        if (visitedCells.CompareExchangeBitAtomic(cellId, &checkNotVisited, true))
        { // This thread is first to visit cell
          activeCells.SetBitAtomic(cellId, true);
        }
@ -216,8 +215,8 @@ public:
        const bool alreadyVisited = visitedPoints.GetBit(pointId);
        if (!alreadyVisited)
        {
-          const bool alreadyActive = activePoints.CompareAndSwapBitAtomic(pointId, true, false);
-          if (!alreadyActive)
+          bool checkNotActive = false;
+          if (activePoints.CompareExchangeBitAtomic(pointId, &checkNotActive, true))
          { // If we're the first thread to mark point active, set ref point:
            refPoints.Set(pointId, refPtId);
          }
--- a/vtkm/worklet/connectivities/UnionFind.h
+++ b/vtkm/worklet/connectivities/UnionFind.h
@ -106,19 +106,16 @@ public:
    // We can use this return "new root" as is without calling findRoot() to
    // find the "new root". The while loop terminates when both u and v have
    // the same root (thus united).
-    auto root_u = UnionFind::findRoot(parents, u);
-    auto root_v = UnionFind::findRoot(parents, v);
+    vtkm::Id root_u = UnionFind::findRoot(parents, u);
+    vtkm::Id root_v = UnionFind::findRoot(parents, v);

    while (root_u != root_v)
    {
      // FIXME: we might be executing the loop one extra time than necessary.
-      // Nota Bene: VTKm's CompareAndSwap has a different order of parameters
-      // than common practice, it is (index, new, expected) rather than
-      // (index, expected, new).
      if (root_u < root_v)
-        root_v = parents.CompareAndSwap(root_v, root_u, root_v);
+        parents.CompareExchange(root_v, &root_v, root_u);
      else if (root_u > root_v)
-        root_u = parents.CompareAndSwap(root_u, root_v, root_u);
+        parents.CompareExchange(root_u, &root_u, root_v);
    }
  }

--- a/vtkm/worklet/contourtree_augmented/processcontourtree/HypersweepWorklets.h
+++ b/vtkm/worklet/contourtree_augmented/processcontourtree/HypersweepWorklets.h
@ -264,14 +264,9 @@ public:
    //Id writeValue = op(vertexValue, parentValue);

    auto cur = minMaxIndexPortal.Get(parent); // Load the current value at idx
-    vtkm::Id newVal;                          // will hold the result of the multiplication
-    vtkm::Id expect;                          // will hold the expected value before multiplication
-
-    do
-    {
-      expect = cur;                        // Used to ensure the value hasn't changed since reading
-      newVal = this->Op(cur, vertexValue); // the actual multiplication
-    } while ((cur = minMaxIndexPortal.CompareAndSwap(parent, newVal, expect)) != expect);
+    // Use a compare-exchange loop to ensure the operation gets applied atomically
+    while (!minMaxIndexPortal.CompareExchange(parent, &cur, this->Op(cur, vertexValue)))
+      ;

    //minMaxIndexPortal.Set(parent, writeValue);
  }