vtk-m2/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
2017-02-24 09:39:22 -05:00

251 lines
7.6 KiB
C++

//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 Sandia Corporation.
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/Math.h>
// Here are the actual implementation of the algorithms.
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
#include <cuda.h>
namespace vtkm {
namespace cont {
namespace cuda {
namespace internal {
static
__global__
void DetermineIfValidCudaDevice()
{
//used only to see if we can launch kernels. It is possible to have a
//CUDA capable device, but still fail to have CUDA support.
}
}
}
}
}
namespace vtkm {
namespace cont {
template<>
struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
: public vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
vtkm::cont::DeviceAdapterTagCuda>
{
VTKM_CONT static void Synchronize()
{
VTKM_CUDA_CALL(cudaDeviceSynchronize());
}
};
/// CUDA contains its own high resolution timer.
///
template<>
class DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterTimerImplementation()
{
VTKM_CUDA_CALL(cudaEventCreate(&this->StartEvent));
VTKM_CUDA_CALL(cudaEventCreate(&this->EndEvent));
this->Reset();
}
VTKM_CONT ~DeviceAdapterTimerImplementation()
{
VTKM_CUDA_CALL(cudaEventDestroy(this->StartEvent));
VTKM_CUDA_CALL(cudaEventDestroy(this->EndEvent));
}
VTKM_CONT void Reset()
{
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, 0));
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
}
VTKM_CONT vtkm::Float64 GetElapsedTime()
{
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, 0));
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds,
this->StartEvent,
this->EndEvent));
return static_cast<vtkm::Float64>(0.001f*elapsedTimeMilliseconds);
}
private:
// Copying CUDA events is problematic.
DeviceAdapterTimerImplementation(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda> &) = delete;
void operator=(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda> &) = delete;
cudaEvent_t StartEvent;
cudaEvent_t EndEvent;
};
/// \brief Class providing a CUDA runtime support detector.
///
/// The class provide the actual implementation used by
/// vtkm::cont::RuntimeDeviceInformation for the CUDA backend.
///
/// We will verify at runtime that the machine has at least one CUDA
/// capable device, and said device is from the 'fermi' (SM_20) generation
/// or newer.
///
template<>
class DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterRuntimeDetector():
NumberOfDevices(0),
HighestArchSupported(0)
{
static bool deviceQueryInit = false;
static int numDevices = 0;
static int archVersion = 0;
if(!deviceQueryInit)
{
deviceQueryInit = true;
//first query for the number of devices
VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
for (vtkm::Int32 i = 0; i < numDevices; i++)
{
cudaDeviceProp prop;
VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
const vtkm::Int32 arch = (prop.major * 10) + prop.minor;
archVersion = vtkm::Max(arch, archVersion);
}
//Make sure we can actually launch a kernel. This could fail for any
//of the following reasons:
//
// 1. cudaErrorInsufficientDriver, caused by out of data drives
// 2. cudaErrorDevicesUnavailable, caused by another process locking the
// device or somebody disabling cuda support on the device
// 3. cudaErrorNoKernelImageForDevice we built for a compute version
// greater than the device we are running on
// Most likely others that I don't even know about
vtkm::cont::cuda::internal::DetermineIfValidCudaDevice <<<1,1>>> ();
if(cudaSuccess != cudaGetLastError())
{
numDevices = 0;
archVersion = 0;
}
}
this->NumberOfDevices = numDevices;
this->HighestArchSupported = archVersion;
}
/// Returns true if the given device adapter is supported on the current
/// machine.
///
/// Only returns true if we have at-least one CUDA capable device of SM_20 or
/// greater ( fermi ).
///
VTKM_CONT bool Exists() const
{
//
return this->NumberOfDevices > 0 && this->HighestArchSupported >= 20;
}
private:
vtkm::Int32 NumberOfDevices;
vtkm::Int32 HighestArchSupported;
};
/// CUDA contains its own atomic operations
///
template<typename T>
class DeviceAdapterAtomicArrayImplementation<T,vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT
DeviceAdapterAtomicArrayImplementation(
vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> handle):
Portal( handle.PrepareForInPlace( vtkm::cont::DeviceAdapterTagCuda()) )
{
}
inline __device__
T Add(vtkm::Id index, const T& value) const
{
T *lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
return vtkmAtomicAdd(lockedValue, value);
}
inline __device__
T CompareAndSwap(vtkm::Id index, const vtkm::Int64 &newValue, const vtkm::Int64 &oldValue) const
{
T *lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
return vtkmCompareAndSwap(lockedValue, newValue, oldValue);
}
private:
typedef typename vtkm::cont::ArrayHandle<T,vtkm::cont::StorageTagBasic>
::template ExecutionTypes<vtkm::cont::DeviceAdapterTagCuda>::Portal PortalType;
PortalType Portal;
inline __device__
vtkm::Int64 vtkmAtomicAdd(vtkm::Int64 *address, const vtkm::Int64 &value) const
{
return atomicAdd((unsigned long long *)address,(unsigned long long) value);
}
inline __device__
vtkm::Int32 vtkmAtomicAdd(vtkm::Int32 *address, const vtkm::Int32 &value) const
{
return atomicAdd(address,value);
}
inline __device__
vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32 *address, const vtkm::Int32 &newValue, const vtkm::Int32 &oldValue) const
{
return atomicCAS(address,oldValue,newValue);
}
inline __device__
vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64 *address, const vtkm::Int64 &newValue, const vtkm::Int64 &oldValue) const
{
return atomicCAS((unsigned long long int*) address,
(unsigned long long int ) oldValue,
(unsigned long long int ) newValue);
}
};
}
} // namespace vtkm::cont
#endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h