//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#ifndef vtk_m_Atomic_h
#define vtk_m_Atomic_h
#include <vtkm/List.h>
#include <vtkm/internal/Windows.h>
#include <atomic>
namespace vtkm
{
/// \brief Specifies memory order semantics for atomic operations.
///
/// The memory order parameter controls how all other memory operations are
/// ordered around a specific atomic instruction.
///
/// Memory access is complicated. Compilers can reorder instructions to optimize
/// scheduling, processors can speculatively read memory, and caches make
/// assumptions about coherency that we may not normally be aware of. Because of
/// this complexity, the order in which multiple updates to shared memory become
/// visible to other threads is not guaranteed, nor is it guaranteed that each
/// thread will see memory updates occur in the same order as any other thread.
/// This can lead to surprising behavior and cause problems when using atomics
/// to communicate between threads.
///
/// These problems are solved by using a standard set of memory orderings which
/// describe common access patterns used for shared memory programming. Their
/// goal is to provide guarantees that changes made in one thread will be visible
/// to another thread at a specific and predictable point in execution, regardless
/// of any hardware or compiler optimizations.
///
/// If unsure, use `SequentiallyConsistent` memory orderings. It will "do the right
/// thing", but at the cost of increased and possibly unnecessary memory ordering
/// restrictions. The other orderings are optimizations that are only applicable
/// in very specific situations.
///
/// See https://en.cppreference.com/w/cpp/atomic/memory_order for a detailed
/// description of the different orderings and their usage.
///
/// The memory order semantics follow those of other common atomic operations such as
/// the `std::memory_order` identifiers used for `std::atomic`.
///
/// Note that when a memory order is specified, the enforced memory order is guaranteed
/// to be as good or better than that requested.
///
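/// The `Acquire` and `Release` orders are typically used as a matched pair, for example to
/// publish data computed in one thread and consume it in another. A minimal sketch using the
/// `vtkm::AtomicStore` and `vtkm::AtomicLoad` functions declared later in this header (the
/// `data` and `readyFlag` variables are illustrative and reside in memory shared among threads):
///
/// \code{.cpp}
/// // Thread A: write the payload, then publish it by setting the flag with Release order.
/// data = 42;
/// vtkm::AtomicStore(&readyFlag, vtkm::UInt32{ 1 }, vtkm::MemoryOrder::Release);
///
/// // Thread B: wait for the flag with Acquire order. Once the flag is observed as set,
/// // the write to `data` made before the Release store is guaranteed to be visible.
/// while (vtkm::AtomicLoad(&readyFlag, vtkm::MemoryOrder::Acquire) == 0)
/// {
/// }
/// const vtkm::UInt32 payload = data;
/// \endcode
///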
enum class MemoryOrder
{
/// An atomic operation with `Relaxed` memory order enforces no synchronization or ordering
/// constraints on local reads and writes. That is, a read or write to a local, non-atomic
/// variable may be moved to before or after an atomic operation with `Relaxed` memory order.
///
Relaxed,
/// A load operation with `Acquire` memory order will enforce that any local read or write
/// operations listed in the program after the atomic will happen after the atomic.
///
Acquire,
/// A store operation with `Release` memory order will enforce that any local read or write
/// operations listed in the program before the atomic will happen before the atomic.
///
Release,
/// A read-modify-write operation with `AcquireAndRelease` memory order will enforce that any
/// local read or write operations listed in the program before the atomic will happen before the
/// atomic and likewise any read or write operations listed in the program after the atomic will
/// happen after the atomic.
///
AcquireAndRelease,
/// An atomic with `SequentiallyConsistent` memory order will enforce the same semantics as
/// `Acquire`, `Release`, or `AcquireAndRelease`, whichever is appropriate for the operation.
/// Additionally, `SequentiallyConsistent` will enforce a consistent ordering of atomic
/// operations across all threads. That is, all threads observe the modifications in the same order.
///
SequentiallyConsistent
};
namespace internal
{
VTKM_EXEC_CONT inline std::memory_order StdAtomicMemOrder(vtkm::MemoryOrder order)
{
switch (order)
{
case vtkm::MemoryOrder::Relaxed:
return std::memory_order_relaxed;
case vtkm::MemoryOrder::Acquire:
return std::memory_order_acquire;
case vtkm::MemoryOrder::Release:
return std::memory_order_release;
case vtkm::MemoryOrder::AcquireAndRelease:
return std::memory_order_acq_rel;
case vtkm::MemoryOrder::SequentiallyConsistent:
return std::memory_order_seq_cst;
}
// Should never reach here, but avoid compiler warnings
return std::memory_order_seq_cst;
}
} // namespace internal
} // namespace vtkm
#if defined(VTKM_CUDA_DEVICE_PASS)
namespace vtkm
{
namespace detail
{
// Fence to ensure that previous non-atomic stores are visible to other threads.
VTKM_EXEC_CONT inline void AtomicStoreFence(vtkm::MemoryOrder order)
{
if ((order == vtkm::MemoryOrder::Release) || (order == vtkm::MemoryOrder::AcquireAndRelease) ||
(order == vtkm::MemoryOrder::SequentiallyConsistent))
{
__threadfence();
}
}
// Fence to ensure that reads and writes issued after the atomic are not reordered before it.
VTKM_EXEC_CONT inline void AtomicLoadFence(vtkm::MemoryOrder order)
{
if ((order == vtkm::MemoryOrder::Acquire) || (order == vtkm::MemoryOrder::AcquireAndRelease) ||
(order == vtkm::MemoryOrder::SequentiallyConsistent))
{
__threadfence();
}
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoadImpl(T* const addr, vtkm::MemoryOrder order)
{
volatile T* const vaddr = addr; /* volatile to bypass cache */
if (order == vtkm::MemoryOrder::SequentiallyConsistent)
{
__threadfence();
}
const T value = *vaddr;
/* fence to ensure that dependent reads are correctly ordered */
AtomicLoadFence(order);
return value;
}
template <typename T>
VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value, vtkm::MemoryOrder order)
{
volatile T* vaddr = addr; /* volatile to bypass cache */
/* fence to ensure that previous non-atomic stores are visible to other threads */
AtomicStoreFence(order);
*vaddr = value;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
auto result = atomicAdd(addr, arg);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
auto result = atomicAnd(addr, mask);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
auto result = atomicOr(addr, mask);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
auto result = atomicXor(addr, mask);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
{
return AtomicXorImpl(addr, static_cast<T>(~T{ 0u }), order);
}
template <typename T>
VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl(T* addr,
T* expected,
T desired,
vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
auto result = atomicCAS(addr, *expected, desired);
AtomicLoadFence(order);
if (result == *expected)
{
return true;
}
else
{
*expected = result;
return false;
}
}
#if __CUDA_ARCH__ < 200
VTKM_EXEC_CONT inline vtkm::Float32 AtomicAddImpl(vtkm::Float32* address,
vtkm::Float32 value,
vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
vtkm::UInt32 assumed;
vtkm::UInt32 old = __float_as_int(*address);
do
{
assumed = old;
old = atomicCAS(reinterpret_cast<vtkm::UInt32*>(address),
assumed,
__float_as_int(__int_as_float(assumed) + value));
} while (assumed != old);
AtomicLoadFence(order);
return __int_as_float(old);
}
#endif
#if __CUDA_ARCH__ < 600
VTKM_EXEC_CONT inline vtkm::Float64 AtomicAddImpl(vtkm::Float64* address,
vtkm::Float64 value,
vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
vtkm::UInt64 assumed;
vtkm::UInt64 old = __double_as_longlong(*address);
do
{
assumed = old;
old = atomicCAS(reinterpret_cast<vtkm::UInt64*>(address),
assumed,
__double_as_longlong(__longlong_as_double(assumed) + value));
} while (assumed != old);
AtomicLoadFence(order);
return __longlong_as_double(old);
}
#endif
}
} // namespace vtkm::detail
#elif defined(VTKM_ENABLE_KOKKOS)
VTKM_THIRDPARTY_PRE_INCLUDE
// Superhack! Kokkos_Macros.hpp defines macros to include modifiers like __device__.
// However, we don't want to actually use those if compiling this with a standard
// C++ compiler (because this particular code does not run on a device). Thus,
// we want to disable that behavior when not using the device compiler. To do that,
// we are going to have to load the KokkosCore_config.h file (which you are not
// supposed to do), then undefine the device enables if necessary, then load
// Kokkos_Macros.hpp to finish the state.
#ifndef KOKKOS_MACROS_HPP
#define KOKKOS_MACROS_HPP
#include <KokkosCore_config.h>
#undef KOKKOS_MACROS_HPP
#define KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
#if defined(KOKKOS_ENABLE_CUDA) && !defined(VTKM_CUDA)
#undef KOKKOS_ENABLE_CUDA
// In later versions we need to directly deactivate Kokkos_Setup_Cuda.hpp
#if KOKKOS_VERSION >= 30401
#define KOKKOS_CUDA_SETUP_HPP_
#endif
#endif
#if defined(KOKKOS_ENABLE_HIP) && !defined(VTKM_HIP)
#undef KOKKOS_ENABLE_HIP
#endif
#endif //KOKKOS_MACROS_HPP not loaded
#include <Kokkos_Atomic.hpp>
VTKM_THIRDPARTY_POST_INCLUDE
namespace vtkm
{
namespace detail
{
// Fence to ensure that previous non-atomic stores are visible to other threads.
VTKM_EXEC_CONT inline void AtomicStoreFence(vtkm::MemoryOrder order)
{
if ((order == vtkm::MemoryOrder::Release) || (order == vtkm::MemoryOrder::AcquireAndRelease) ||
(order == vtkm::MemoryOrder::SequentiallyConsistent))
{
Kokkos::memory_fence();
}
}
// Fence to ensure that reads and writes issued after the atomic are not reordered before it.
VTKM_EXEC_CONT inline void AtomicLoadFence(vtkm::MemoryOrder order)
{
if ((order == vtkm::MemoryOrder::Acquire) || (order == vtkm::MemoryOrder::AcquireAndRelease) ||
(order == vtkm::MemoryOrder::SequentiallyConsistent))
{
Kokkos::memory_fence();
}
}
#ifdef KOKKOS_INTERNAL_NOT_PARALLEL
#define VTKM_DESUL_MEM_SCOPE desul::MemoryScopeCaller()
#else
#define VTKM_DESUL_MEM_SCOPE desul::MemoryScopeDevice()
#endif
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoadImpl(T* const addr, vtkm::MemoryOrder order)
{
switch (order)
{
case vtkm::MemoryOrder::Relaxed:
return desul::atomic_load(addr, desul::MemoryOrderRelaxed(), VTKM_DESUL_MEM_SCOPE);
case vtkm::MemoryOrder::Acquire:
case vtkm::MemoryOrder::Release: // Release doesn't make sense. Use Acquire.
case vtkm::MemoryOrder::AcquireAndRelease: // Release doesn't make sense. Use Acquire.
return desul::atomic_load(addr, desul::MemoryOrderAcquire(), VTKM_DESUL_MEM_SCOPE);
case vtkm::MemoryOrder::SequentiallyConsistent:
return desul::atomic_load(addr, desul::MemoryOrderSeqCst(), VTKM_DESUL_MEM_SCOPE);
}
// Should never reach here, but avoid compiler warnings
return desul::atomic_load(addr, desul::MemoryOrderSeqCst(), VTKM_DESUL_MEM_SCOPE);
}
template <typename T>
VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value, vtkm::MemoryOrder order)
{
switch (order)
{
case vtkm::MemoryOrder::Relaxed:
desul::atomic_store(addr, value, desul::MemoryOrderRelaxed(), VTKM_DESUL_MEM_SCOPE);
break;
case vtkm::MemoryOrder::Acquire: // Acquire doesn't make sense. Use Release.
case vtkm::MemoryOrder::Release:
case vtkm::MemoryOrder::AcquireAndRelease: // Acquire doesn't make sense. Use Release.
desul::atomic_store(addr, value, desul::MemoryOrderRelease(), VTKM_DESUL_MEM_SCOPE);
break;
case vtkm::MemoryOrder::SequentiallyConsistent:
desul::atomic_store(addr, value, desul::MemoryOrderSeqCst(), VTKM_DESUL_MEM_SCOPE);
break;
}
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
T result = Kokkos::atomic_fetch_add(addr, arg);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
T result = Kokkos::atomic_fetch_and(addr, mask);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
T result = Kokkos::atomic_fetch_or(addr, mask);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
T result = Kokkos::atomic_fetch_xor(addr, mask);
AtomicLoadFence(order);
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
{
return AtomicXorImpl(addr, static_cast<T>(~T{ 0u }), order);
}
template <typename T>
VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl(T* addr,
T* expected,
T desired,
vtkm::MemoryOrder order)
{
AtomicStoreFence(order);
T oldValue = Kokkos::atomic_compare_exchange(addr, *expected, desired);
AtomicLoadFence(order);
if (oldValue == *expected)
{
return true;
}
else
{
*expected = oldValue;
return false;
}
}
}
} // namespace vtkm::detail
#elif defined(VTKM_MSVC)
// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64
#include <cstdint>
#include <cstring>
#include <intrin.h> // For MSVC atomics
namespace vtkm
{
namespace detail
{
template <typename To, typename From>
VTKM_EXEC_CONT inline To BitCast(const From& src)
{
// The memcpy should be removed by the compiler when possible, but this
// works around a host of issues with bitcasting using reinterpret_cast.
VTKM_STATIC_ASSERT(sizeof(From) == sizeof(To));
To dst;
std::memcpy(&dst, &src, sizeof(From));
return dst;
}
template <typename T>
VTKM_EXEC_CONT inline T BitCast(T&& src)
{
return std::forward<T>(src);
}
// Note about Load and Store implementations:
//
// "Simple reads and writes to properly-aligned 32-bit variables are atomic
// operations"
//
// "Simple reads and writes to properly aligned 64-bit variables are atomic on
// 64-bit Windows. Reads and writes to 64-bit values are not guaranteed to be
// atomic on 32-bit Windows."
//
// "Reads and writes to variables of other sizes [than 32 or 64 bits] are not
// guaranteed to be atomic on any platform."
//
// https://docs.microsoft.com/en-us/windows/desktop/sync/interlocked-variable-access
VTKM_EXEC_CONT inline vtkm::UInt8 AtomicLoadImpl(vtkm::UInt8* const addr, vtkm::MemoryOrder order)
{
// This assumes that the memory interface is smart enough to load a 32-bit
// word atomically and a properly aligned 8-bit word from it.
// We could build address masks and do shifts to perform this manually if
// this assumption is incorrect.
auto result = *static_cast<volatile vtkm::UInt8* const>(addr);
std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
return result;
}
VTKM_EXEC_CONT inline vtkm::UInt16 AtomicLoadImpl(vtkm::UInt16* const addr, vtkm::MemoryOrder order)
{
// This assumes that the memory interface is smart enough to load a 32-bit
// word atomically and a properly aligned 16-bit word from it.
// We could build address masks and do shifts to perform this manually if
// this assumption is incorrect.
auto result = *static_cast<volatile vtkm::UInt16* const>(addr);
std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
return result;
}
VTKM_EXEC_CONT inline vtkm::UInt32 AtomicLoadImpl(vtkm::UInt32* const addr, vtkm::MemoryOrder order)
{
auto result = *static_cast<volatile vtkm::UInt32* const>(addr);
std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
return result;
}
VTKM_EXEC_CONT inline vtkm::UInt64 AtomicLoadImpl(vtkm::UInt64* const addr, vtkm::MemoryOrder order)
{
auto result = *static_cast<volatile vtkm::UInt64* const>(addr);
std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
return result;
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt8* addr,
vtkm::UInt8 val,
vtkm::MemoryOrder vtkmNotUsed(order))
{
// There doesn't seem to be an atomic store instruction in the windows
// API, so just exchange and discard the result.
_InterlockedExchange8(reinterpret_cast<volatile CHAR*>(addr), BitCast<CHAR>(val));
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt16* addr,
vtkm::UInt16 val,
vtkm::MemoryOrder vtkmNotUsed(order))
{
// There doesn't seem to be an atomic store instruction in the windows
// API, so just exchange and discard the result.
_InterlockedExchange16(reinterpret_cast<volatile SHORT*>(addr), BitCast<SHORT>(val));
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt32* addr,
vtkm::UInt32 val,
vtkm::MemoryOrder order)
{
std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
*addr = val;
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt64* addr,
vtkm::UInt64 val,
vtkm::MemoryOrder order)
{
std::atomic_thread_fence(internal::StdAtomicMemOrder(order));
*addr = val;
}
#define VTKM_ATOMIC_OP(vtkmName, winName, vtkmType, winType, suffix) \
VTKM_EXEC_CONT inline vtkmType vtkmName(vtkmType* addr, vtkmType arg, vtkm::MemoryOrder order) \
{ \
return BitCast<vtkmType>( \
winName##suffix(reinterpret_cast<volatile winType*>(addr), BitCast<winType>(arg))); \
}
#define VTKM_ATOMIC_OPS_FOR_TYPE(vtkmType, winType, suffix) \
VTKM_ATOMIC_OP(AtomicAddImpl, _InterlockedExchangeAdd, vtkmType, winType, suffix) \
VTKM_ATOMIC_OP(AtomicAndImpl, _InterlockedAnd, vtkmType, winType, suffix) \
VTKM_ATOMIC_OP(AtomicOrImpl, _InterlockedOr, vtkmType, winType, suffix) \
VTKM_ATOMIC_OP(AtomicXorImpl, _InterlockedXor, vtkmType, winType, suffix) \
VTKM_EXEC_CONT inline vtkmType AtomicNotImpl(vtkmType* addr, vtkm::MemoryOrder order) \
{ \
return AtomicXorImpl(addr, static_cast<vtkmType>(~vtkmType{ 0u }), order); \
} \
VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl( \
vtkmType* addr, vtkmType* expected, vtkmType desired, vtkm::MemoryOrder vtkmNotUsed(order)) \
{ \
vtkmType result = BitCast<vtkmType>( \
_InterlockedCompareExchange##suffix(reinterpret_cast<volatile winType*>(addr), \
BitCast<winType>(desired), \
BitCast<winType>(*expected))); \
if (result == *expected) \
{ \
return true; \
} \
else \
{ \
*expected = result; \
return false; \
} \
}
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt8, CHAR, 8)
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt16, SHORT, 16)
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt32, LONG, )
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt64, LONG64, 64)
#undef VTKM_ATOMIC_OPS_FOR_TYPE
#undef VTKM_ATOMIC_OP
VTKM_EXEC_CONT inline vtkm::Float32 AtomicAddImpl(vtkm::Float32* address,
vtkm::Float32 value,
vtkm::MemoryOrder vtkmNotUsed(order))
{
LONG assumed;
LONG old = BitCast<LONG>(*address);
do
{
assumed = old;
old = _InterlockedCompareExchange(reinterpret_cast<volatile LONG*>(address),
BitCast<LONG>(BitCast<vtkm::Float32>(assumed) + value),
assumed);
} while (assumed != old);
return BitCast<vtkm::Float32>(old);
}
VTKM_EXEC_CONT inline vtkm::Float64 AtomicAddImpl(vtkm::Float64* address,
vtkm::Float64 value,
vtkm::MemoryOrder vtkmNotUsed(order))
{
LONG64 assumed;
LONG64 old = BitCast<LONG64>(*address);
do
{
assumed = old;
old = _InterlockedCompareExchange64(reinterpret_cast<volatile LONG64*>(address),
BitCast<LONG64>(BitCast<vtkm::Float64>(assumed) + value),
assumed);
} while (assumed != old);
return BitCast<vtkm::Float64>(old);
}
}
} // namespace vtkm::detail
#else // gcc/clang for CPU
// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64
#include <cstdint>
#include <cstring>
#include <vtkmstd/bit_cast.h>
namespace vtkm
{
namespace detail
{
VTKM_EXEC_CONT inline int GccAtomicMemOrder(vtkm::MemoryOrder order)
{
switch (order)
{
case vtkm::MemoryOrder::Relaxed:
return __ATOMIC_RELAXED;
case vtkm::MemoryOrder::Acquire:
return __ATOMIC_ACQUIRE;
case vtkm::MemoryOrder::Release:
return __ATOMIC_RELEASE;
case vtkm::MemoryOrder::AcquireAndRelease:
return __ATOMIC_ACQ_REL;
case vtkm::MemoryOrder::SequentiallyConsistent:
return __ATOMIC_SEQ_CST;
}
// Should never reach here, but avoid compiler warnings
return __ATOMIC_SEQ_CST;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoadImpl(T* const addr, vtkm::MemoryOrder order)
{
return __atomic_load_n(addr, GccAtomicMemOrder(order));
}
template <typename T>
VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value, vtkm::MemoryOrder order)
{
return __atomic_store_n(addr, value, GccAtomicMemOrder(order));
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg, vtkm::MemoryOrder order)
{
return __atomic_fetch_add(addr, arg, GccAtomicMemOrder(order));
}
// TODO: Use enable_if to write one version for both Float32 and Float64.
VTKM_EXEC_CONT inline vtkm::Float32 AtomicAddImpl(vtkm::Float32* addr,
vtkm::Float32 arg,
vtkm::MemoryOrder order)
{
vtkm::UInt32 expected = vtkmstd::bit_cast<vtkm::UInt32>(*addr);
vtkm::UInt32 desired;
do
{
desired = vtkmstd::bit_cast<vtkm::UInt32>(vtkmstd::bit_cast<vtkm::Float32>(expected) + arg);
} while (
!__atomic_compare_exchange_n(reinterpret_cast<vtkm::UInt32*>(addr),
&expected, // reloads expected with *addr prior to the operation
desired,
false,
GccAtomicMemOrder(order),
GccAtomicMemOrder(order)));
// return the "old" value that was in the memory.
return vtkmstd::bit_cast<vtkm::Float32>(expected);
}
// TODO: Use enable_if to write one version for both Float32 and Float64.
VTKM_EXEC_CONT inline vtkm::Float64 AtomicAddImpl(vtkm::Float64* addr,
vtkm::Float64 arg,
vtkm::MemoryOrder order)
{
vtkm::UInt64 expected = vtkmstd::bit_cast<vtkm::UInt64>(*addr);
vtkm::UInt64 desired;
do
{
desired = vtkmstd::bit_cast<vtkm::UInt64>(vtkmstd::bit_cast<vtkm::Float64>(expected) + arg);
} while (
!__atomic_compare_exchange_n(reinterpret_cast<vtkm::UInt64*>(addr),
&expected, // reloads expected with *addr prior to the operation
desired,
false,
GccAtomicMemOrder(order),
GccAtomicMemOrder(order)));
// return the "old" value that was in the memory.
return vtkmstd::bit_cast<vtkm::Float64>(expected);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
return __atomic_fetch_and(addr, mask, GccAtomicMemOrder(order));
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
return __atomic_fetch_or(addr, mask, GccAtomicMemOrder(order));
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask, vtkm::MemoryOrder order)
{
return __atomic_fetch_xor(addr, mask, GccAtomicMemOrder(order));
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr, vtkm::MemoryOrder order)
{
return AtomicXorImpl(addr, static_cast<T>(~T{ 0u }), order);
}
template <typename T>
VTKM_EXEC_CONT inline bool AtomicCompareExchangeImpl(T* addr,
T* expected,
T desired,
vtkm::MemoryOrder order)
{
return __atomic_compare_exchange_n(
addr, expected, desired, false, GccAtomicMemOrder(order), GccAtomicMemOrder(order));
}
}
} // namespace vtkm::detail
#endif // gcc/clang
namespace vtkm
{
namespace detail
{
template <typename T>
using OppositeSign = typename std::conditional<std::is_signed<T>::value,
typename std::make_unsigned<T>::type,
typename std::make_signed<T>::type>::type;
} // namespace detail
/// \brief The preferred type to use for atomic operations.
///
using AtomicTypePreferred = vtkm::UInt32;
/// \brief A list of types that can be used with atomic operations.
///
/// TODO: Adjust based on devices being compiled.
///
/// BUG: vtkm::UInt64 is provided in this list even though it is not supported on CUDA
/// before compute capability 3.5.
///
using AtomicTypesSupported = vtkm::List<vtkm::UInt32, vtkm::UInt64>;
/// \brief Atomic function to load a value from a shared memory location.
///
/// Given a pointer, returns the value in that pointer. If other threads are writing to
/// that same location, the returned value will be consistent with the value either before
/// or after that write (never a partially written mix of the two).
///
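/// A minimal usage sketch (the `flag` variable is illustrative; in practice the pointer refers
/// to memory shared among threads):
///
/// \code{.cpp}
/// vtkm::UInt32 flag = 0;
/// vtkm::UInt32 current = vtkm::AtomicLoad(&flag); // defaults to Acquire ordering
/// \endcode
///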
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoad(T* const pointer,
vtkm::MemoryOrder order = vtkm::MemoryOrder::Acquire)
{
return detail::AtomicLoadImpl(pointer, order);
}
///@{
/// \brief Atomic function to save a value to a shared memory location.
///
/// Given a pointer and a value, stores that value at the pointer's location. If two
/// threads are simultaneously using `AtomicStore` at the same location, the resulting
/// value will be one of the values or the other (as opposed to a mix of bits).
///
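/// A minimal usage sketch (the `flag` variable is illustrative; in practice the pointer refers
/// to memory shared among threads):
///
/// \code{.cpp}
/// vtkm::UInt32 flag = 0;
/// vtkm::AtomicStore(&flag, vtkm::UInt32{ 1 }); // defaults to Release ordering
/// \endcode
///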
template <typename T>
VTKM_EXEC_CONT inline void AtomicStore(T* pointer,
T value,
vtkm::MemoryOrder order = vtkm::MemoryOrder::Release)
{
detail::AtomicStoreImpl(pointer, value, order);
}
template <typename T>
VTKM_EXEC_CONT inline void AtomicStore(T* pointer,
detail::OppositeSign<T> value,
vtkm::MemoryOrder order = vtkm::MemoryOrder::Release)
{
detail::AtomicStoreImpl(pointer, static_cast<T>(value), order);
}
///@}
///@{
/// \brief Atomic function to add a value to a shared memory location.
///
/// Given a pointer and an operand, adds the operand to the value at the given memory
/// location. The result of the addition is put into that memory location and the
/// _old_ value that was originally in the memory is returned. For example, if you
/// call `AtomicAdd` on a memory location that holds a 5 with an operand of 3, the
/// value of 8 is stored in the memory location and the value of 5 is returned.
///
/// If multiple threads call `AtomicAdd` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other
/// (although it is indeterminate which will be applied first).
///
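/// A minimal sketch of the example above (the `counter` variable is illustrative; in practice
/// the pointer refers to memory shared among threads):
///
/// \code{.cpp}
/// vtkm::UInt32 counter = 5;
/// vtkm::UInt32 old = vtkm::AtomicAdd(&counter, 3u);
/// // old == 5, counter now holds 8
/// \endcode
///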
template <typename T>
VTKM_EXEC_CONT inline T AtomicAdd(
T* pointer,
T operand,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicAddImpl(pointer, operand, order);
}
template <typename T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
VTKM_EXEC_CONT inline T AtomicAdd(
T* pointer,
detail::OppositeSign<T> operand,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicAddImpl(pointer, static_cast<T>(operand), order);
}
///@}
///@{
/// \brief Atomic function to AND bits to a shared memory location.
///
/// Given a pointer and an operand, performs a bitwise AND of the operand and the value at the given
/// memory location. The result of the AND is put into that memory location and the _old_ value
/// that was originally in the memory is returned. For example, if you call `AtomicAnd` on a memory
/// location that holds a 0x6 with an operand of 0x3, the value of 0x2 is stored in the memory
/// location and the value of 0x6 is returned.
///
/// If multiple threads call `AtomicAnd` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other
/// (although it is indeterminate which will be applied first).
///
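/// A minimal sketch of the example above (the `bits` variable is illustrative; in practice
/// the pointer refers to memory shared among threads):
///
/// \code{.cpp}
/// vtkm::UInt32 bits = 0x6;
/// vtkm::UInt32 old = vtkm::AtomicAnd(&bits, 0x3u);
/// // old == 0x6, bits now holds 0x2
/// \endcode
///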
template <typename T>
VTKM_EXEC_CONT inline T AtomicAnd(
T* pointer,
T operand,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicAndImpl(pointer, operand, order);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAnd(
T* pointer,
detail::OppositeSign<T> operand,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicAndImpl(pointer, static_cast<T>(operand), order);
}
///@}
///@{
/// \brief Atomic function to OR bits to a shared memory location.
///
/// Given a pointer and an operand, performs a bitwise OR of the operand and the value at the given
/// memory location. The result of the OR is put into that memory location and the _old_ value
/// that was originally in the memory is returned. For example, if you call `AtomicOr` on a memory
/// location that holds a 0x6 with an operand of 0x3, the value of 0x7 is stored in the memory
/// location and the value of 0x6 is returned.
///
/// If multiple threads call `AtomicOr` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other
/// (although it is indeterminate which will be applied first).
///
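/// A minimal sketch of the example above (the `bits` variable is illustrative; in practice
/// the pointer refers to memory shared among threads):
///
/// \code{.cpp}
/// vtkm::UInt32 bits = 0x6;
/// vtkm::UInt32 old = vtkm::AtomicOr(&bits, 0x3u);
/// // old == 0x6, bits now holds 0x7
/// \endcode
///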
template <typename T>
VTKM_EXEC_CONT inline T
AtomicOr(T* pointer, T operand, vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicOrImpl(pointer, operand, order);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicOr(
T* pointer,
detail::OppositeSign<T> operand,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicOrImpl(pointer, static_cast<T>(operand), order);
}
///@}
///@{
/// \brief Atomic function to XOR bits to a shared memory location.
///
/// Given a pointer and an operand, performs a bitwise exclusive-OR of the operand and the value at
/// the given memory location. The result of the XOR is put into that memory location and the _old_
/// value that was originally in the memory is returned. For example, if you call `AtomicXor` on a
/// memory location that holds a 0x6 with an operand of 0x3, the value of 0x5 is stored in the
/// memory location and the value of 0x6 is returned.
///
/// If multiple threads call `AtomicXor` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other.
///
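/// A minimal sketch of the example above (the `bits` variable is illustrative; in practice
/// the pointer refers to memory shared among threads):
///
/// \code{.cpp}
/// vtkm::UInt32 bits = 0x6;
/// vtkm::UInt32 old = vtkm::AtomicXor(&bits, 0x3u);
/// // old == 0x6, bits now holds 0x5
/// \endcode
///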
template <typename T>
VTKM_EXEC_CONT inline T AtomicXor(
T* pointer,
T operand,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicXorImpl(pointer, operand, order);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicXor(
T* pointer,
detail::OppositeSign<T> operand,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicXorImpl(pointer, static_cast<T>(operand), order);
}
///@}
/// \brief Atomic function to NOT bits to a shared memory location.
///
/// Given a pointer, performs a bitwise NOT of the value at the given
/// memory location. The result of the NOT is put into that memory location and the _old_ value
/// that was originally in the memory is returned.
///
/// If multiple threads call `AtomicNot` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other.
///
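/// A minimal usage sketch (the `bits` variable is illustrative; in practice the pointer refers
/// to memory shared among threads):
///
/// \code{.cpp}
/// vtkm::UInt32 bits = 0x0000FFFF;
/// vtkm::UInt32 old = vtkm::AtomicNot(&bits);
/// // old == 0x0000FFFF, bits now holds 0xFFFF0000
/// \endcode
///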
template <typename T>
VTKM_EXEC_CONT inline T AtomicNot(
T* pointer,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicNotImpl(pointer, order);
}
/// \brief Atomic function that replaces a value given a condition.
///
/// Given a pointer to a `shared` value, a pointer holding the `expected` value at that shared
/// location, and a new `desired` value, `AtomicCompareExchange` compares the existing `shared`
/// value to the `expected` value. If they are the same, it replaces the `shared` value with
/// the provided `desired` value. Otherwise, the `expected` value gets replaced with the
/// current `shared` value. Note that in either case, the function returns with `expected` holding
/// the value that was _originally_ in `shared` at the start of the call.
///
/// If the `shared` value and `expected` value are the same, then `shared` gets set to
/// `desired`, and `AtomicCompareAndExchange` returns `true`.
///
/// If the `shared` value and `expected` value are different, then `expected` gets set
/// to `shared`, and `AtomicCompareAndExchange` returns `false`. The value at `shared`
/// is _not_ changed in this case.
///
/// If multiple threads call `AtomicCompareExchange` simultaneously with the same `shared`
/// pointer, the result will be consistent as if one was called before the other (although
/// it is indeterminate which will be applied first). Note that the `expected` pointer should
/// _not_ be shared among threads. The `expected` pointer should be thread-local (often
/// pointing to an object on the stack).
///
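/// `AtomicCompareExchange` is commonly used in a loop to build read-modify-write operations
/// that are not provided directly, such as an atomic maximum. A minimal sketch (the
/// `AtomicMaxSketch` function and its variables are illustrative, not part of this header):
///
/// \code{.cpp}
/// VTKM_EXEC_CONT inline vtkm::UInt32 AtomicMaxSketch(vtkm::UInt32* shared, vtkm::UInt32 value)
/// {
///   vtkm::UInt32 expected = vtkm::AtomicLoad(shared);
///   while (!vtkm::AtomicCompareExchange(shared, &expected, (value > expected) ? value : expected))
///   {
///     // The exchange failed, and `expected` now holds the value another thread stored; retry.
///   }
///   return expected; // the value that was in `shared` just before our update took effect
/// }
/// \endcode
///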
template <typename T>
VTKM_EXEC_CONT inline bool AtomicCompareExchange(
T* shared,
T* expected,
T desired,
vtkm::MemoryOrder order = vtkm::MemoryOrder::SequentiallyConsistent)
{
return detail::AtomicCompareExchangeImpl(shared, expected, desired, order);
}
} // namespace vtkm
#endif //vtk_m_Atomic_h