Add VTKM_CUDA_DEVICE_PASS preprocessor definition.

This is only set while compiling device code, and is useful
for code that needs a different implementation on the device
(e.g. code that calls CUDA device intrinsics).
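
For illustration, a minimal sketch of the kind of code this enables (the
function below is hypothetical and not part of this commit; it assumes
VTK-m's VTKM_EXEC_CONT export macro):

// Pick a CUDA device intrinsic on the device pass, with a portable
// fallback for the host pass and for non-CUDA builds.
#include <cmath>
#include <vtkm/internal/ExportMacros.h>

VTKM_EXEC_CONT inline float FastRSqrt(float x)
{
#ifdef VTKM_CUDA_DEVICE_PASS
  return __frsqrt_rn(x); // device intrinsic, only valid in __device__ code
#else
  return 1.0f / std::sqrt(x); // host fallback
#endif
}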
Allison Vacanti 2019-01-24 11:08:37 -05:00
parent ef0054eeb7
commit 03fc7b66d0
9 changed files with 16 additions and 10 deletions

@@ -36,7 +36,7 @@
 #include <stdlib.h>
 #endif // !VTKM_CUDA
 
-#if !defined(__CUDA_ARCH__)
+#if !defined(VTKM_CUDA_DEVICE_PASS)
 #define VTKM_USE_STL
 #include <algorithm>
 #endif

@@ -48,7 +48,7 @@ $# Ignore the following comment. It is meant for the generated file.
 #include <stdlib.h>
 #endif // !VTKM_CUDA
 
-#if !defined(__CUDA_ARCH__)
+#if !defined(VTKM_CUDA_DEVICE_PASS)
 #define VTKM_USE_STL
 #include <algorithm>
 #endif

@@ -61,7 +61,7 @@ public:
 // We work around this by calling the __device__ function inside of a
 // __CUDA_ARCH__ guard, as nvcc is smart enough to recognize that this is a
 // safe usage of a __device__ function in a __host__ __device__ context.
-#ifdef __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
 return this->vtkmAtomicAdd(lockedValue, value);
 #else
@@ -85,7 +85,7 @@ public:
 // We work around this by calling the __device__ function inside of a
 // __CUDA_ARCH__ guard, as nvcc is smart enough to recognize that this is a
 // safe usage of a __device__ function in a __host__ __device__ context.
-#ifdef __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
 return this->vtkmCompareAndSwap(lockedValue, newValue, oldValue);
 #else
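
The pattern in the two hunks above, reduced to a self-contained sketch
(DoAtomicAdd and its host branch are illustrative only, not the portal's
actual code):

// nvcc accepts a __device__-only call inside a __host__ __device__
// function as long as it is fenced off to the device pass.
__host__ __device__ inline int DoAtomicAdd(int* addr, int value)
{
#ifdef VTKM_CUDA_DEVICE_PASS
  return atomicAdd(addr, value); // CUDA device intrinsic
#else
  int old = *addr; // illustrative host fallback; not thread-safe
  *addr += value;
  return old;
#endif
}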

@@ -48,7 +48,7 @@ struct TriggerICE : public vtkm::worklet::WorkletMapField
 using ControlSignature = void(FieldIn, FieldIn, FieldOut);
 using ExecutionSignature = _3(_1, _2, WorkIndex);
 
-#if __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 template <class ValueType>
 __device__ ValueType operator()(const ValueType& bad,
 const ValueType& sane,

@@ -325,12 +325,12 @@ public:
 return static_cast<vtkm::Id>((this->EndIterator - this->BeginIterator));
 }
 
-//The __CUDA_ARCH__ define makes sure that the device only signature
+//The VTKM_CUDA_DEVICE_PASS define makes sure that the device only signature
 //only shows up for the device compilation. This allows the nvcc compiler
 //to have separate host and device code paths for the same method. This
 //solves the problem of trying to call a device only method from a
 //device/host method
-#if __CUDA_ARCH__
+#ifdef VTKM_CUDA_DEVICE_PASS
 __device__ ValueType Get(vtkm::Id index) const
 {
 return vtkm::exec::cuda::internal::load_through_texture<ValueType>::get(this->BeginIterator +
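
Condensed, the host/device signature split described in that comment looks
like this (the struct and member names are illustrative, not the portal's
actual code):

struct PortalSketch
{
  const float* Data;

#ifdef VTKM_CUDA_DEVICE_PASS
  // Device pass: a device-only signature that may use device intrinsics,
  // in the spirit of load_through_texture above.
  __device__ float Get(int index) const { return __ldg(this->Data + index); }
#else
  // Host pass / non-CUDA build: a plain host implementation.
  float Get(int index) const { return this->Data[index]; }
#endif
};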

@@ -196,7 +196,7 @@ __host__ __device__::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_key(
 binary_op);
 //only sync if we are being invoked from the host
-#ifndef __CUDA_ARCH__
+#ifndef VTKM_CUDA_DEVICE_PASS
 VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 #endif
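
The same guard also works in the negative direction, as here: host-only
operations inside a __host__ __device__ function are compiled out of the
device pass. A minimal sketch (the wrapper function is made up):

__host__ __device__ inline void SyncIfOnHost()
{
#ifndef VTKM_CUDA_DEVICE_PASS
  // Stream synchronization is a host-side operation here.
  VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
#endif
}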

@@ -45,7 +45,7 @@
 VTKM_SWALLOW_SEMICOLON_POST_BLOCK
 
 // VTKM_ASSUME_IMPL is compiler-specific:
-#if defined(__CUDA_ARCH__)
+#if defined(VTKM_CUDA_DEVICE_PASS)
 //For all versions of CUDA this is a no-op while we look
 //for a CUDA asm snippet that replicates this kind of behavior
 #define VTKM_ASSUME_IMPL(cond) (void)0 /* no-op */

@@ -20,10 +20,16 @@
 #ifndef vtk_m_internal_Configure_h
 #define vtk_m_internal_Configure_h
 
+// Defined when NVCC is compiling either __host__ or __device__ code.
 #ifdef __CUDACC__
 #define VTKM_CUDA
 #endif
 
+// Defined only when NVCC is compiling __device__ code.
+#ifdef __CUDA_ARCH__
+#define VTKM_CUDA_DEVICE_PASS
+#endif
+
 #if defined(_MSC_VER)
 //MSVC 2015+ can use a clang frontend, so we want to label it only as MSVC
 //and not MSVC and clang
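
With these definitions, the three compilation contexts can be told apart as
follows (restating the two comments added above):

- plain C++ compiler: neither VTKM_CUDA nor VTKM_CUDA_DEVICE_PASS is defined
- nvcc host pass: only VTKM_CUDA is defined
- nvcc device pass: both VTKM_CUDA and VTKM_CUDA_DEVICE_PASS are defined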

@@ -48,7 +48,7 @@
 VTKM_SWALLOW_SEMICOLON_POST_BLOCK
 
 // VTKM_UNREACHABLE_IMPL is compiler-specific:
-#if defined(__CUDA_ARCH__)
+#if defined(VTKM_CUDA_DEVICE_PASS)
 #define VTKM_UNREACHABLE_IMPL() (void)0 /* no-op, no known intrinsic */