Add VTKM_CUDA_DEVICE_PASS preprocessor definition.

This is only set while compiling device code, and is useful
for code that needs a different implementation on the device
(e.g. code that calls CUDA device intrinsics).
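
For illustration, a minimal sketch of the kind of code this enables (the
function below is hypothetical and not part of this commit; it assumes
VTK-m's VTKM_EXEC_CONT export macro):

// Pick a CUDA device intrinsic on the device pass, with a portable
// fallback for the host pass and for non-CUDA builds.
#include <cmath>
#include <vtkm/internal/ExportMacros.h>

VTKM_EXEC_CONT inline float FastRSqrt(float x)
{
#ifdef VTKM_CUDA_DEVICE_PASS
  return __frsqrt_rn(x); // device intrinsic, only valid in __device__ code
#else
  return 1.0f / std::sqrt(x); // host fallback
#endif
}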
Allison Vacanti 2019-01-24 11:08:37 -05:00
parent ef0054eeb7
commit 03fc7b66d0
9 changed files with 16 additions and 10 deletions

@@ -36,7 +36,7 @@
 #include <stdlib.h>
 #endif // !VTKM_CUDA
 
-#if !defined(__CUDA_ARCH__)
+#if !defined(VTKM_CUDA_DEVICE_PASS)
 #define VTKM_USE_STL
 #include <algorithm>
 #endif

@@ -48,7 +48,7 @@ $# Ignore the following comment. It is meant for the generated file.
 #include <stdlib.h>
 #endif // !VTKM_CUDA
 
-#if !defined(__CUDA_ARCH__)
+#if !defined(VTKM_CUDA_DEVICE_PASS)
 #define VTKM_USE_STL
 #include <algorithm>
 #endif

@@ -61,7 +61,7 @@ public:
 // We work around this by calling the __device__ function inside of a
 // __CUDA_ARCH__ guard, as nvcc is smart enough to recognize that this is a
 // safe usage of a __device__ function in a __host__ __device__ context.
-#ifdef __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
 return this->vtkmAtomicAdd(lockedValue, value);
 #else
@@ -85,7 +85,7 @@ public:
 // We work around this by calling the __device__ function inside of a
 // __CUDA_ARCH__ guard, as nvcc is smart enough to recognize that this is a
 // safe usage of a __device__ function in a __host__ __device__ context.
-#ifdef __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
 return this->vtkmCompareAndSwap(lockedValue, newValue, oldValue);
 #else
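
The pattern in the two hunks above, reduced to a self-contained sketch
(DoAtomicAdd and its host branch are illustrative only, not the portal's
actual code):

// nvcc accepts a __device__-only call inside a __host__ __device__
// function as long as it is fenced off to the device pass.
__host__ __device__ inline int DoAtomicAdd(int* addr, int value)
{
#ifdef VTKM_CUDA_DEVICE_PASS
  return atomicAdd(addr, value); // CUDA device intrinsic
#else
  int old = *addr; // illustrative host fallback; not thread-safe
  *addr += value;
  return old;
#endif
}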

@@ -48,7 +48,7 @@ struct TriggerICE : public vtkm::worklet::WorkletMapField
 using ControlSignature = void(FieldIn, FieldIn, FieldOut);
 using ExecutionSignature = _3(_1, _2, WorkIndex);
 
-#if __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 template <class ValueType>
 __device__ ValueType operator()(const ValueType& bad,
 const ValueType& sane,

@@ -325,12 +325,12 @@ public:
 return static_cast<vtkm::Id>((this->EndIterator - this->BeginIterator));
 }
 
-//The __CUDA_ARCH__ define makes sure that the device only signature
+//The VTKM_CUDA_DEVICE_PASS define makes sure that the device only signature
 //only shows up for the device compilation. This allows the nvcc compiler
 //to have separate host and device code paths for the same method. This
 //solves the problem of trying to call a device only method from a
 //device/host method
-#if __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 __device__ ValueType Get(vtkm::Id index) const
 {
 return vtkm::exec::cuda::internal::load_through_texture<ValueType>::get(this->BeginIterator +
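
Condensed, the host/device signature split described in that comment looks
like this (the struct and member names are illustrative, not the portal's
actual code):

struct PortalSketch
{
  const float* Data;

#ifdef VTKM_CUDA_DEVICE_PASS
  // Device pass: a device-only signature that may use device intrinsics,
  // in the spirit of load_through_texture above.
  __device__ float Get(int index) const { return __ldg(this->Data + index); }
#else
  // Host pass / non-CUDA build: a plain host implementation.
  float Get(int index) const { return this->Data[index]; }
#endif
};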

@@ -196,7 +196,7 @@ __host__ __device__::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_key(
 binary_op);
 //only sync if we are being invoked from the host
-#ifndef __CUDA_ARCH__
+#ifndef VTKM_CUDA_DEVICE_PASS
 VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 #endif
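
The same guard also works in the negative direction, as here: host-only
operations inside a __host__ __device__ function are compiled out of the
device pass. A minimal sketch (the wrapper function is made up):

__host__ __device__ inline void SyncIfOnHost()
{
#ifndef VTKM_CUDA_DEVICE_PASS
  // Stream synchronization is a host-side operation here.
  VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
#endif
}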

@@ -45,7 +45,7 @@
 VTKM_SWALLOW_SEMICOLON_POST_BLOCK
 
 // VTKM_ASSUME_IMPL is compiler-specific:
-#if defined(__CUDA_ARCH__)
+#if defined(VTKM_CUDA_DEVICE_PASS)
 //For all versions of CUDA this is a no-op while we look
 //for a CUDA asm snippet that replicates this kind of behavior
 #define VTKM_ASSUME_IMPL(cond) (void)0 /* no-op */

@@ -20,10 +20,16 @@
 #ifndef vtk_m_internal_Configure_h
 #define vtk_m_internal_Configure_h
 
+// Defined when NVCC is compiling either __host__ or __device__ code.
 #ifdef __CUDACC__
 #define VTKM_CUDA
 #endif
 
+// Defined only when NVCC is compiling __device__ code.
+#ifdef __CUDA_ARCH__
+#define VTKM_CUDA_DEVICE_PASS
+#endif
+
 #if defined(_MSC_VER)
 //MSVC 2015+ can use a clang frontend, so we want to label it only as MSVC
 //and not MSVC and clang
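
With these definitions, the three compilation contexts can be told apart as
follows (restating the two comments added above):

- plain C++ compiler: neither VTKM_CUDA nor VTKM_CUDA_DEVICE_PASS is defined
- nvcc host pass: only VTKM_CUDA is defined
- nvcc device pass: both VTKM_CUDA and VTKM_CUDA_DEVICE_PASS are defined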

@@ -48,7 +48,7 @@
 VTKM_SWALLOW_SEMICOLON_POST_BLOCK
 
 // VTKM_UNREACHABLE_IMPL is compiler-specific:
-#if defined(__CUDA_ARCH__)
+#if defined(VTKM_CUDA_DEVICE_PASS)
 #define VTKM_UNREACHABLE_IMPL() (void)0 /* no-op, no known intrinsic */