vtk-m2/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
Robert Maynard b56894dd09 Move VTK-m Cuda backend over to a grid-stride iteration pattern.
This allows for easier host side logic when determining grid and block
sizes, and allows for a smaller library size by moving some logic
into compiled-in functions.
2018-04-30 17:29:26 -04:00

//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/Math.h>
// Here is the actual implementation of the algorithms.
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <cuda.h>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
static __global__ void DetermineIfValidCudaDevice()
{
//Used only to see if we can launch kernels. It is possible to have a
//CUDA capable device, but still fail to have CUDA support.
}
}
}
}
}
namespace vtkm
{
namespace cont
{
template <>
struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
: public vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
vtkm::cont::DeviceAdapterTagCuda>
{
VTKM_CONT static void Synchronize()
{
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
}
};
/// CUDA contains its own high resolution timer.
///
template <>
class DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterTimerImplementation()
{
VTKM_CUDA_CALL(cudaEventCreate(&this->StartEvent));
VTKM_CUDA_CALL(cudaEventCreate(&this->EndEvent));
this->Reset();
}
VTKM_CONT ~DeviceAdapterTimerImplementation()
{
// These aren't wrapped in VTKM_CUDA_CALL because we can't throw errors
// from destructors. We're relying on the cudaGetLastError call inside
// VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR to catch any issues from these calls
// later.
cudaEventDestroy(this->StartEvent);
cudaEventDestroy(this->EndEvent);
}
VTKM_CONT void Reset()
{
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
}
VTKM_CONT vtkm::Float64 GetElapsedTime()
{
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(
cudaEventElapsedTime(&elapsedTimeMilliseconds, this->StartEvent, this->EndEvent));
return static_cast<vtkm::Float64>(0.001f * elapsedTimeMilliseconds);
}
private:
// Copying CUDA events is problematic.
DeviceAdapterTimerImplementation(
const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) = delete;
void operator=(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) =
delete;
cudaEvent_t StartEvent;
cudaEvent_t EndEvent;
};
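// Illustrative usage sketch (not part of this header), assuming some device-side
// work to measure; RunSomeCudaAlgorithm() below is a hypothetical stand-in.
// The timer records StartEvent on construction/Reset, and GetElapsedTime records
// and synchronizes on EndEvent, returning seconds.
//
//   using CudaTag = vtkm::cont::DeviceAdapterTagCuda;
//   vtkm::cont::DeviceAdapterTimerImplementation<CudaTag> timer;
//   RunSomeCudaAlgorithm(); // hypothetical work launched on cudaStreamPerThread
//   vtkm::Float64 seconds = timer.GetElapsedTime();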
/// \brief Class providing a CUDA runtime support detector.
///
/// The class provides the actual implementation used by
/// vtkm::cont::RuntimeDeviceInformation for the CUDA backend.
///
/// We will verify at runtime that the machine has at least one CUDA
/// capable device, and that said device is from the 'Fermi' (SM_20) generation
/// or newer.
///
template <>
class DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterRuntimeDetector()
: NumberOfDevices(0)
, HighestArchSupported(0)
{
static bool deviceQueryInit = false;
static int numDevices = 0;
static int archVersion = 0;
if (!deviceQueryInit)
{
deviceQueryInit = true;
//first query for the number of devices
VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
for (vtkm::Int32 i = 0; i < numDevices; i++)
{
cudaDeviceProp prop;
VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
const vtkm::Int32 arch = (prop.major * 10) + prop.minor;
archVersion = vtkm::Max(arch, archVersion);
}
//Make sure we can actually launch a kernel. This could fail for any
//of the following reasons:
//
// 1. cudaErrorInsufficientDriver, caused by an out of date driver
// 2. cudaErrorDevicesUnavailable, caused by another process locking the
//    device or somebody disabling CUDA support on the device
// 3. cudaErrorNoKernelImageForDevice, caused by building for a compute
//    capability greater than that of the device we are running on
// Most likely there are others we are not aware of
vtkm::cont::cuda::internal::DetermineIfValidCudaDevice<<<1, 1, 0, cudaStreamPerThread>>>();
if (cudaSuccess != cudaGetLastError())
{
numDevices = 0;
archVersion = 0;
}
}
this->NumberOfDevices = numDevices;
this->HighestArchSupported = archVersion;
}
/// Returns true if the given device adapter is supported on the current
/// machine.
///
/// Only returns true if we have at least one CUDA capable device of SM_20
/// (Fermi) or greater.
///
VTKM_CONT bool Exists() const
{
return this->NumberOfDevices > 0 && this->HighestArchSupported >= 20;
}
private:
vtkm::Int32 NumberOfDevices;
vtkm::Int32 HighestArchSupported;
};
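// Illustrative usage sketch (not part of this header): querying the detector
// directly. In practice this check is normally reached through
// vtkm::cont::RuntimeDeviceInformation, as noted above.
//
//   vtkm::cont::DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda> detector;
//   if (detector.Exists())
//   {
//     // At least one SM_20+ (Fermi or newer) device exists and a trivial
//     // kernel launch succeeded.
//   }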
/// CUDA contains its own atomic operations
///
template <typename T>
class DeviceAdapterAtomicArrayImplementation<T, vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT
DeviceAdapterAtomicArrayImplementation(
vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> handle)
: Portal(handle.PrepareForInPlace(vtkm::cont::DeviceAdapterTagCuda()))
{
}
inline __device__ T Add(vtkm::Id index, const T& value) const
{
T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
return vtkmAtomicAdd(lockedValue, value);
}
inline __device__ T CompareAndSwap(vtkm::Id index,
const T& newValue,
const T& oldValue) const
{
T* lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
return vtkmCompareAndSwap(lockedValue, newValue, oldValue);
}
private:
using PortalType =
typename vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic>::template ExecutionTypes<
vtkm::cont::DeviceAdapterTagCuda>::Portal;
PortalType Portal;
inline __device__ vtkm::Int64 vtkmAtomicAdd(vtkm::Int64* address, const vtkm::Int64& value) const
{
return atomicAdd((unsigned long long*)address, (unsigned long long)value);
}
inline __device__ vtkm::Int32 vtkmAtomicAdd(vtkm::Int32* address, const vtkm::Int32& value) const
{
return atomicAdd(address, value);
}
inline __device__ vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32* address,
const vtkm::Int32& newValue,
const vtkm::Int32& oldValue) const
{
return atomicCAS(address, oldValue, newValue);
}
inline __device__ vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64* address,
const vtkm::Int64& newValue,
const vtkm::Int64& oldValue) const
{
return atomicCAS((unsigned long long int*)address,
(unsigned long long int)oldValue,
(unsigned long long int)newValue);
}
};
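// Illustrative usage sketch (not part of this header): device-side atomics on a
// basic-storage array. The handle and index values below are hypothetical; in
// VTK-m this object is normally reached through the AtomicArray execution-object
// support rather than constructed by hand.
//
//   vtkm::cont::ArrayHandle<vtkm::Int32, vtkm::cont::StorageTagBasic> counters;
//   counters.Allocate(1);
//   vtkm::cont::DeviceAdapterAtomicArrayImplementation<vtkm::Int32,
//                                                      vtkm::cont::DeviceAdapterTagCuda>
//     atomicCounters(counters);
//   // From device code:
//   //   vtkm::Int32 old = atomicCounters.Add(0, 1);                  // fetch-and-add
//   //   vtkm::Int32 prev = atomicCounters.CompareAndSwap(0, 42, 0);  // newValue=42, oldValue=0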
template <>
class DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>
{
public:
template <typename WorkletType, typename InvocationType>
static vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType> MakeTask(
WorkletType& worklet,
InvocationType& invocation,
vtkm::Id,
vtkm::Id globalIndexOffset = 0)
{
using Task = vtkm::exec::cuda::internal::TaskStrided1D<WorkletType, InvocationType>;
return Task(worklet, invocation, globalIndexOffset);
}
template <typename WorkletType, typename InvocationType>
static vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType> MakeTask(
WorkletType& worklet,
InvocationType& invocation,
vtkm::Id3,
vtkm::Id globalIndexOffset = 0)
{
using Task = vtkm::exec::cuda::internal::TaskStrided3D<WorkletType, InvocationType>;
return Task(worklet, invocation, globalIndexOffset);
}
};
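// Illustrative sketch (not part of this header) of how dispatching code obtains
// a grid-stride task for this device; myWorklet, myInvocation, and numInstances
// are hypothetical. The returned TaskStrided1D/3D is what the CUDA scheduler
// launches using the grid-stride iteration pattern.
//
//   auto task = vtkm::cont::DeviceTaskTypes<vtkm::cont::DeviceAdapterTagCuda>::MakeTask(
//     myWorklet, myInvocation, numInstances);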
}
} // namespace vtkm::cont
#endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h