vtk-m2/vtkm/Atomic.h
Kenneth Moreland 82d6ca9854 Allow Kokkos atomic functions to work without cuda device compiler
If the Kokkos device is enabled, we use the Kokkos atomic functions for
implementation since Kokkos has already ported all of these to each
device.

If Kokkos is compiled with CUDA, then the functions are marked with
`__device__` and `__host__`. Makes sense, right?

Unless we are trying to use the atomic functions on host code compiled
only for the host. In that case, modifiers like `__device__` will cause
compiler errors.

So we want to disable Kokkos from using `__device__`, but only if CUDA
is enabled and we are not using the CUDA compiler. Had to hack things up
to get that to work.
2020-08-20 13:40:44 -06:00


//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#ifndef vtk_m_Atomic_h
#define vtk_m_Atomic_h
#include <vtkm/List.h>
#include <vtkm/internal/Windows.h>
#if defined(VTKM_ENABLE_KOKKOS)
VTKM_THIRDPARTY_PRE_INCLUDE
// Superhack! Kokkos_Macros.hpp defines macros to include modifiers like __device__.
// However, we don't want to actually use those if compiling this with a standard
// C++ compiler (because this particular code does not run on a device). Thus,
// we want to disable that behavior when not using the device compiler. To do that,
// we are going to have to load the KokkosCore_config.h file (which you are not
// supposed to do), then undefine the device enables if necessary, then load
// Kokkos_Macros.hpp to finish the state.
#ifndef KOKKOS_MACROS_HPP
#define KOKKOS_MACROS_HPP
#include <KokkosCore_config.h>
#undef KOKKOS_MACROS_HPP
#define KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
#if defined(KOKKOS_ENABLE_CUDA) && !defined(VTKM_CUDA)
#undef KOKKOS_ENABLE_CUDA
#endif
#endif //KOKKOS_MACROS_HPP not loaded
#include <Kokkos_Core.hpp>
VTKM_THIRDPARTY_POST_INCLUDE
namespace vtkm
{
namespace detail
{
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr)
{
return Kokkos::Impl::atomic_load(addr);
}
template <typename T>
VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value)
{
Kokkos::Impl::atomic_store(addr, value);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg)
{
return Kokkos::atomic_fetch_add(addr, arg);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask)
{
return Kokkos::atomic_fetch_and(addr, mask);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask)
{
return Kokkos::atomic_fetch_or(addr, mask);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask)
{
return Kokkos::atomic_fetch_xor(addr, mask);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr)
{
return Kokkos::atomic_fetch_xor(addr, static_cast<T>(~T{ 0u }));
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr, T expected, T desired)
{
return Kokkos::atomic_compare_exchange(addr, expected, desired);
}
}
} // namespace vtkm::detail
#elif defined(VTKM_CUDA_DEVICE_PASS)
namespace vtkm
{
namespace detail
{
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr)
{
const volatile T* vaddr = addr; /* volatile to bypass cache */
const T value = *vaddr;
/* fence to ensure that dependent reads are correctly ordered */
__threadfence();
return value;
}
template <typename T>
VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value)
{
volatile T* vaddr = addr; /* volatile to bypass cache */
/* fence to ensure that previous non-atomic stores are visible to other threads */
__threadfence();
*vaddr = value;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg)
{
__threadfence();
auto result = atomicAdd(addr, arg);
__threadfence();
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask)
{
__threadfence();
auto result = atomicAnd(addr, mask);
__threadfence();
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask)
{
__threadfence();
auto result = atomicOr(addr, mask);
__threadfence();
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask)
{
__threadfence();
auto result = atomicXor(addr, mask);
__threadfence();
return result;
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr)
{
return AtomicXorImpl(addr, static_cast<T>(~T{ 0u }));
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr, T expected, T desired)
{
__threadfence();
auto result = atomicCAS(addr, expected, desired);
__threadfence();
return result;
}
}
} // namespace vtkm::detail
#elif defined(VTKM_MSVC)
// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64
#include <atomic>
#include <cstdint>
#include <cstring>
#include <intrin.h> // For MSVC atomics
namespace vtkm
{
namespace detail
{
template <typename To, typename From>
VTKM_EXEC_CONT inline To BitCast(const From& src)
{
// The memcpy should be removed by the compiler when possible, but this
// works around a host of issues with bitcasting using reinterpret_cast.
VTKM_STATIC_ASSERT(sizeof(From) == sizeof(To));
To dst;
std::memcpy(&dst, &src, sizeof(From));
return dst;
}
template <typename T>
VTKM_EXEC_CONT inline T BitCast(T&& src)
{
return std::forward<T>(src);
}
// Note about Load and Store implementations:
//
// "Simple reads and writes to properly-aligned 32-bit variables are atomic
// operations"
//
// "Simple reads and writes to properly aligned 64-bit variables are atomic on
// 64-bit Windows. Reads and writes to 64-bit values are not guaranteed to be
// atomic on 32-bit Windows."
//
// "Reads and writes to variables of other sizes [than 32 or 64 bits] are not
// guaranteed to be atomic on any platform."
//
// https://docs.microsoft.com/en-us/windows/desktop/sync/interlocked-variable-access
VTKM_EXEC_CONT inline vtkm::UInt8 AtomicLoadImpl(const vtkm::UInt8* addr)
{
// This assumes that the memory interface is smart enough to load a 32-bit
// word atomically and a properly aligned 8-bit word from it.
// We could build address masks and do shifts to perform this manually if
// this assumption is incorrect.
auto result = *static_cast<volatile const vtkm::UInt8*>(addr);
std::atomic_thread_fence(std::memory_order_acquire);
return result;
}
VTKM_EXEC_CONT inline vtkm::UInt16 AtomicLoadImpl(const vtkm::UInt16* addr)
{
// This assumes that the memory interface is smart enough to load a 32-bit
// word atomically and a properly aligned 16-bit word from it.
// We could build address masks and do shifts to perform this manually if
// this assumption is incorrect.
auto result = *static_cast<volatile const vtkm::UInt16*>(addr);
std::atomic_thread_fence(std::memory_order_acquire);
return result;
}
VTKM_EXEC_CONT inline vtkm::UInt32 AtomicLoadImpl(const vtkm::UInt32* addr)
{
auto result = *static_cast<volatile const vtkm::UInt32*>(addr);
std::atomic_thread_fence(std::memory_order_acquire);
return result;
}
VTKM_EXEC_CONT inline vtkm::UInt64 AtomicLoadImpl(const vtkm::UInt64* addr)
{
auto result = *static_cast<volatile const vtkm::UInt64*>(addr);
std::atomic_thread_fence(std::memory_order_acquire);
return result;
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt8* addr, vtkm::UInt8 val)
{
// There doesn't seem to be an atomic store instruction in the windows
// API, so just exchange and discard the result.
_InterlockedExchange8(reinterpret_cast<volatile CHAR*>(addr), BitCast<CHAR>(val));
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt16* addr, vtkm::UInt16 val)
{
// There doesn't seem to be an atomic store instruction in the windows
// API, so just exchange and discard the result.
_InterlockedExchange16(reinterpret_cast<volatile SHORT*>(addr), BitCast<SHORT>(val));
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt32* addr, vtkm::UInt32 val)
{
std::atomic_thread_fence(std::memory_order_release);
*addr = val;
}
VTKM_EXEC_CONT inline void AtomicStoreImpl(vtkm::UInt64* addr, vtkm::UInt64 val)
{
std::atomic_thread_fence(std::memory_order_release);
*addr = val;
}
#define VTKM_ATOMIC_OPS_FOR_TYPE(vtkmType, winType, suffix) \
VTKM_EXEC_CONT inline vtkmType AtomicAddImpl(vtkmType* addr, vtkmType arg) \
{ \
return BitCast<vtkmType>(_InterlockedExchangeAdd##suffix( \
reinterpret_cast<volatile winType*>(addr), BitCast<winType>(arg))); \
} \
VTKM_EXEC_CONT inline vtkmType AtomicAndImpl(vtkmType* addr, vtkmType mask) \
{ \
return BitCast<vtkmType>( \
_InterlockedAnd##suffix(reinterpret_cast<volatile winType*>(addr), BitCast<winType>(mask))); \
} \
VTKM_EXEC_CONT inline vtkmType AtomicOrImpl(vtkmType* addr, vtkmType mask) \
{ \
return BitCast<vtkmType>( \
_InterlockedOr##suffix(reinterpret_cast<volatile winType*>(addr), BitCast<winType>(mask))); \
} \
VTKM_EXEC_CONT inline vtkmType AtomicXorImpl(vtkmType* addr, vtkmType mask) \
{ \
return BitCast<vtkmType>( \
_InterlockedXor##suffix(reinterpret_cast<volatile winType*>(addr), BitCast<winType>(mask))); \
} \
VTKM_EXEC_CONT inline vtkmType AtomicNotImpl(vtkmType* addr) \
{ \
return AtomicXorImpl(addr, static_cast<vtkmType>(~vtkmType{ 0u })); \
} \
VTKM_EXEC_CONT inline vtkmType AtomicCompareAndSwapImpl( \
vtkmType* addr, vtkmType expected, vtkmType desired) \
{ \
return BitCast<vtkmType>( \
_InterlockedCompareExchange##suffix(reinterpret_cast<volatile winType*>(addr), \
BitCast<winType>(desired), \
BitCast<winType>(expected))); \
}
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt8, CHAR, 8)
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt16, SHORT, 16)
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt32, LONG, )
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt64, LONG64, 64)
#undef VTKM_ATOMIC_OPS_FOR_TYPE
}
} // namespace vtkm::detail
#else // gcc/clang for CPU
// Supports vtkm::UInt8, vtkm::UInt16, vtkm::UInt32, vtkm::UInt64
#include <atomic>
#include <cstdint>
#include <cstring>
namespace vtkm
{
namespace detail
{
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoadImpl(const T* addr)
{
return __atomic_load_n(addr, __ATOMIC_ACQUIRE);
}
template <typename T>
VTKM_EXEC_CONT inline void AtomicStoreImpl(T* addr, T value)
{
return __atomic_store_n(addr, value, __ATOMIC_RELEASE);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAddImpl(T* addr, T arg)
{
return __atomic_fetch_add(addr, arg, __ATOMIC_SEQ_CST);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicAndImpl(T* addr, T mask)
{
return __atomic_fetch_and(addr, mask, __ATOMIC_SEQ_CST);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicOrImpl(T* addr, T mask)
{
return __atomic_fetch_or(addr, mask, __ATOMIC_SEQ_CST);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicXorImpl(T* addr, T mask)
{
return __atomic_fetch_xor(addr, mask, __ATOMIC_SEQ_CST);
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicNotImpl(T* addr)
{
return AtomicXorImpl(addr, static_cast<T>(~T{ 0u }));
}
template <typename T>
VTKM_EXEC_CONT inline T AtomicCompareAndSwapImpl(T* addr, T expected, T desired)
{
__atomic_compare_exchange_n(addr, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
return expected;
}
}
} // namespace vtkm::detail
#endif // gcc/clang
namespace vtkm
{
/// \brief The preferred type to use for atomic operations.
///
using AtomicTypePreferred = vtkm::UInt32;
/// \brief A list of types that can be used with atomic operations.
///
/// TODO: Adjust based on devices being compiled.
///
/// BUG: vtkm::UInt64 is provided in this list even though it is not supported on CUDA
/// before compute capability 3.5.
///
using AtomicTypesSupported = vtkm::List<vtkm::UInt32, vtkm::UInt64>;
/// \brief Atomic function to load a value from a shared memory location.
///
/// Given a pointer, returns the value in that pointer. If other threads are writing to
/// that same location, the returned value will be consistent with what was present before
/// or after that write.
///
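/// A minimal usage sketch, assuming `flag` is a hypothetical `const vtkm::UInt32*`
/// that points to memory shared among threads:
///
/// \code
/// vtkm::UInt32 current = vtkm::AtomicLoad(flag);
/// \endcode
///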
template <typename T>
VTKM_EXEC_CONT inline T AtomicLoad(const T* pointer)
{
return detail::AtomicLoadImpl(pointer);
}
/// \brief Atomic function to save a value to a shared memory location.
///
/// Given a pointer and a value, stores that value at the pointer's location. If two
/// threads are simultaneously using `AtomicStore` at the same location, the resulting
/// value will be one of the values or the other (as opposed to a mix of bits).
///
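/// A minimal usage sketch, assuming `flag` is a hypothetical `vtkm::UInt32*` that
/// points to memory shared among threads:
///
/// \code
/// vtkm::AtomicStore(flag, vtkm::UInt32{ 1 });
/// \endcode
///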
template <typename T>
VTKM_EXEC_CONT inline void AtomicStore(T* pointer, T value)
{
detail::AtomicStoreImpl(pointer, value);
}
/// \brief Atomic function to add a value to a shared memory location.
///
/// Given a pointer and an operand, adds the operand to the value at the given memory
/// location. The result of the addition is put into that memory location and the
/// _old_ value that was originally in the memory is returned. For example, if you
/// call `AtomicAdd` on a memory location that holds a 5 with an operand of 3, the
/// value of 8 is stored in the memory location and the value of 5 is returned.
///
/// If multiple threads call `AtomicAdd` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other
/// (although it is indeterminate which will be applied first).
///
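/// A minimal sketch of the worked example above, assuming `counter` is a hypothetical
/// `vtkm::UInt32*` whose pointee currently holds 5:
///
/// \code
/// vtkm::UInt32 oldValue = vtkm::AtomicAdd(counter, 3u);
/// // oldValue == 5 and *counter now holds 8.
/// \endcode
///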
template <typename T, typename U>
VTKM_EXEC_CONT inline T AtomicAdd(T* pointer, U operand)
{
return detail::AtomicAddImpl(pointer, static_cast<T>(operand));
}
/// \brief Atomic function to AND bits to a shared memory location.
///
/// Given a pointer and an operand, performs a bitwise AND of the operand and the value at the given
/// memory location. The result of the AND is put into that memory location and the _old_ value
/// that was originally in the memory is returned. For example, if you call `AtomicAnd` on a memory
/// location that holds a 0x6 with an operand of 0x3, the value of 0x2 is stored in the memory
/// location and the value of 0x6 is returned.
///
/// If multiple threads call `AtomicAnd` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other
/// (although it is indeterminate which will be applied first).
///
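/// A minimal sketch of the worked example above, assuming `bits` is a hypothetical
/// `vtkm::UInt32*` whose pointee currently holds 0x6:
///
/// \code
/// vtkm::UInt32 oldValue = vtkm::AtomicAnd(bits, 0x3u);
/// // oldValue == 0x6 and *bits now holds 0x2.
/// \endcode
///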
template <typename T, typename U>
VTKM_EXEC_CONT inline T AtomicAnd(T* pointer, U operand)
{
return detail::AtomicAndImpl(pointer, static_cast<T>(operand));
}
/// \brief Atomic function to OR bits to a shared memory location.
///
/// Given a pointer and an operand, performs a bitwise OR of the operand and the value at the given
/// memory location. The result of the OR is put into that memory location and the _old_ value
/// that was originally in the memory is returned. For example, if you call `AtomicOr` on a memory
/// location that holds a 0x6 with an operand of 0x3, the value of 0x7 is stored in the memory
/// location and the value of 0x6 is returned.
///
/// If multiple threads call `AtomicOr` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other
/// (although it is indeterminate which will be applied first).
///
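/// A minimal sketch of the worked example above, assuming `bits` is a hypothetical
/// `vtkm::UInt32*` whose pointee currently holds 0x6:
///
/// \code
/// vtkm::UInt32 oldValue = vtkm::AtomicOr(bits, 0x3u);
/// // oldValue == 0x6 and *bits now holds 0x7.
/// \endcode
///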
template <typename T, typename U>
VTKM_EXEC_CONT inline T AtomicOr(T* pointer, U operand)
{
return detail::AtomicOrImpl(pointer, static_cast<T>(operand));
}
/// \brief Atomic function to XOR bits to a shared memory location.
///
/// Given a pointer and an operand, performs a bitwise exclusive-OR of the operand and the value at
/// the given memory location. The result of the XOR is put into that memory location and the _old_
/// value that was originally in the memory is returned. For example, if you call `AtomicXor` on a
/// memory location that holds a 0x6 with an operand of 0x3, the value of 0x5 is stored in the
/// memory location and the value of 0x6 is returned.
///
/// If multiple threads call `AtomicXor` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other.
///
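/// A minimal sketch of the worked example above, assuming `bits` is a hypothetical
/// `vtkm::UInt32*` whose pointee currently holds 0x6:
///
/// \code
/// vtkm::UInt32 oldValue = vtkm::AtomicXor(bits, 0x3u);
/// // oldValue == 0x6 and *bits now holds 0x5.
/// \endcode
///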
template <typename T, typename U>
VTKM_EXEC_CONT inline T AtomicXor(T* pointer, U operand)
{
return detail::AtomicXorImpl(pointer, static_cast<T>(operand));
}
/// \brief Atomic function to NOT bits to a shared memory location.
///
/// Given a pointer, performs a bitwise NOT of the value at the given
/// memory location. The result of the NOT is put into that memory location and the _old_ value
/// that was originally in the memory is returned.
///
/// If multiple threads call `AtomicNot` simultaneously, they will not interfere with
/// each other. The result will be consistent as if one was called before the other.
///
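/// A minimal usage sketch, assuming `bits` is a hypothetical `vtkm::UInt32*` whose
/// pointee currently holds 0x6:
///
/// \code
/// vtkm::UInt32 oldValue = vtkm::AtomicNot(bits);
/// // oldValue == 0x6 and *bits now holds 0xFFFFFFF9 (the bitwise complement).
/// \endcode
///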
template <typename T>
VTKM_EXEC_CONT inline T AtomicNot(T* pointer)
{
return detail::AtomicNotImpl(pointer);
}
/// \brief Atomic function that replaces a value given a condition.
///
/// Given a pointer, an expected value, and a new desired value, replaces the value at the
/// pointer if it is the same as the expected value with the new desired value. If the original
/// value in the pointer does not equal the expected value, then the memory at the pointer
/// remains unchanged. In either case, the function returns the _old_ value that
/// was at the pointer.
///
/// If multiple threads call `AtomicCompareAndSwap` simultaneously, the result will be consistent
/// as if one was called before the other (although it is indeterminate which will be applied
/// first).
///
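/// A minimal sketch of a lock-style acquisition, assuming `lock` is a hypothetical
/// `vtkm::UInt32*` where 0 means unlocked and 1 means locked:
///
/// \code
/// vtkm::UInt32 oldValue = vtkm::AtomicCompareAndSwap(lock, 0u, 1u);
/// bool acquired = (oldValue == 0u);
/// \endcode
///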
template <typename T, typename U, typename V>
VTKM_EXEC_CONT inline T AtomicCompareAndSwap(T* pointer, U expected, V desired)
{
return detail::AtomicCompareAndSwapImpl(
pointer, static_cast<T>(expected), static_cast<T>(desired));
}
} // namespace vtkm
#endif //vtk_m_Atomic_h