//============================================================================
|
|
// Copyright (c) Kitware, Inc.
|
|
// All rights reserved.
|
|
// See LICENSE.txt for details.
|
|
// This software is distributed WITHOUT ANY WARRANTY; without even
|
|
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
// PURPOSE. See the above copyright notice for more information.
|
|
//
|
|
// Copyright 2014 Sandia Corporation.
|
|
// Copyright 2014 UT-Battelle, LLC.
|
|
// Copyright 2014 Los Alamos National Security.
|
|
//
|
|
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
|
// the U.S. Government retains certain rights in this software.
|
|
//
|
|
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
|
// Laboratory (LANL), the U.S. Government retains certain rights in
|
|
// this software.
|
|
//============================================================================
|
|
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
|
|
#define vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
|
|
|
|
#include <vtkm/cont/cuda/ErrorCuda.h>
|
|
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
|
|
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h>
|
|
|
|
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
|
|
|
|
#include <vtkm/Math.h>
|
|
|
|
// Here are the actual implementation of the algorithms.
|
|
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
|
|
|
|
#include <cuda.h>
|
|
|
|
namespace vtkm {
|
|
namespace cont {
|
|
namespace cuda {
|
|
namespace internal {
|
|
|
|
// No-op kernel used purely as a launch probe.  Launching it and then
// checking cudaGetLastError() tells us whether kernels can actually be
// executed on this machine, which a successful device query alone does
// not guarantee.
static
__global__
void DetermineIfValidCudaDevice()
{
  //used only to see if we can launch kernels. It is possible to have a
  //CUDA capable device, but still fail to have CUDA support.
}
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
namespace vtkm {
|
|
namespace cont {
|
|
|
|
/// Specialization of DeviceAdapterAlgorithm for the CUDA backend.  All of
/// the algorithm implementations (scan, sort, schedule, ...) are inherited
/// from the Thrust-based base class; only Synchronize is provided here.
template<>
struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
    : public vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
          vtkm::cont::DeviceAdapterTagCuda>
{

  /// Blocks the host until all previously issued work on the device has
  /// completed.  Any asynchronous execution error is surfaced here through
  /// VTKM_CUDA_CALL.
  VTKM_CONT static void Synchronize()
  {
    VTKM_CUDA_CALL(cudaDeviceSynchronize());
  }

};
|
|
|
|
/// CUDA contains its own high resolution timer.
///
/// Elapsed time is measured on the device with a pair of CUDA events
/// recorded on the default stream, so the result reflects GPU execution
/// time rather than host wall-clock time.
template<>
class DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>
{
public:
  /// Creates the start/end events and immediately starts timing via Reset().
  VTKM_CONT DeviceAdapterTimerImplementation()
  {
    VTKM_CUDA_CALL(cudaEventCreate(&this->StartEvent));
    VTKM_CUDA_CALL(cudaEventCreate(&this->EndEvent));
    this->Reset();
  }
  VTKM_CONT ~DeviceAdapterTimerImplementation()
  {
    VTKM_CUDA_CALL(cudaEventDestroy(this->StartEvent));
    VTKM_CUDA_CALL(cudaEventDestroy(this->EndEvent));
  }

  /// Restarts the timer: records the start event on the default stream and
  /// blocks until it has actually been reached, so timing starts "now".
  VTKM_CONT void Reset()
  {
    VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, 0));
    VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
  }

  /// Returns the number of seconds elapsed since construction or the last
  /// call to Reset().  Blocks until all device work issued before this call
  /// has completed.
  VTKM_CONT vtkm::Float64 GetElapsedTime()
  {
    VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, 0));
    VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
    // cudaEventElapsedTime reports milliseconds as a float; convert to
    // seconds for the return value.
    float elapsedTimeMilliseconds;
    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds,
                                        this->StartEvent,
                                        this->EndEvent));
    return static_cast<vtkm::Float64>(0.001f*elapsedTimeMilliseconds);
  }

private:
  // Copying CUDA events is problematic.
  DeviceAdapterTimerImplementation(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda> &) = delete;
  void operator=(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda> &) = delete;

  cudaEvent_t StartEvent;
  cudaEvent_t EndEvent;
};
|
|
|
|
/// \brief Class providing a CUDA runtime support detector.
///
/// The class provide the actual implementation used by
/// vtkm::cont::RuntimeDeviceInformation for the CUDA backend.
///
/// We will verify at runtime that the machine has at least one CUDA
/// capable device, and said device is from the 'fermi' (SM_20) generation
/// or newer.
///
template<>
class DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>
{
public:
  /// Queries the CUDA runtime for the number of devices and the highest
  /// compute capability, and verifies that a trivial kernel can actually
  /// be launched.  The (expensive) query is performed only once per
  /// process; the results are cached in function-local statics.
  VTKM_CONT DeviceAdapterRuntimeDetector():
    NumberOfDevices(0),
    HighestArchSupported(0)
  {
    static bool deviceQueryInit = false;
    static int numDevices = 0;
    static int archVersion = 0;

    if(!deviceQueryInit)
    {
      deviceQueryInit = true;

      //first query for the number of devices. Do NOT use VTKM_CUDA_CALL
      //here: on a machine with no CUDA driver or device this call returns
      //an error (e.g. cudaErrorNoDevice or cudaErrorInsufficientDriver),
      //and we want Exists() to report false rather than have the detector
      //throw.
      if (cudaGetDeviceCount(&numDevices) != cudaSuccess)
      {
        cudaGetLastError(); //clear the sticky error state
        numDevices = 0;
      }

      for (vtkm::Int32 i = 0; i < numDevices; i++)
      {
        cudaDeviceProp prop;
        VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
        const vtkm::Int32 arch = (prop.major * 10) + prop.minor;
        archVersion = vtkm::Max(arch, archVersion);
      }

      if (numDevices > 0)
      {
        //Make sure we can actually launch a kernel. This could fail for any
        //of the following reasons:
        //
        // 1. cudaErrorInsufficientDriver, caused by out of date drivers
        // 2. cudaErrorDevicesUnavailable, caused by another process locking
        //    the device or somebody disabling cuda support on the device
        // 3. cudaErrorNoKernelImageForDevice we built for a compute version
        //    greater than the device we are running on
        // Most likely others that I don't even know about
        vtkm::cont::cuda::internal::DetermineIfValidCudaDevice <<<1,1>>> ();
        if(cudaSuccess != cudaGetLastError())
        {
          numDevices = 0;
          archVersion = 0;
        }
      }
    }

    this->NumberOfDevices = numDevices;
    this->HighestArchSupported = archVersion;
  }

  /// Returns true if the given device adapter is supported on the current
  /// machine.
  ///
  /// Only returns true if we have at-least one CUDA capable device of SM_20
  /// or greater ( fermi ).
  ///
  VTKM_CONT bool Exists() const
  {
    return this->NumberOfDevices > 0 && this->HighestArchSupported >= 20;
  }

private:
  vtkm::Int32 NumberOfDevices;      // 0 when no usable CUDA device exists
  vtkm::Int32 HighestArchSupported; // (major*10 + minor) of the best device
};
|
|
|
|
/// CUDA contains its own atomic operations
///
/// Provides device-side atomic Add and CompareAndSwap on the elements of a
/// basic-storage ArrayHandle.  Only vtkm::Int32 and vtkm::Int64 value types
/// are supported, as those are the overloads provided for CUDA's atomic
/// intrinsics below.
template<typename T>
class DeviceAdapterAtomicArrayImplementation<T,vtkm::cont::DeviceAdapterTagCuda>
{
public:
  /// Prepares the array for in-place access on the CUDA device and stores
  /// the resulting execution portal.
  VTKM_CONT
  DeviceAdapterAtomicArrayImplementation(
      vtkm::cont::ArrayHandle<T, vtkm::cont::StorageTagBasic> handle):
    Portal( handle.PrepareForInPlace( vtkm::cont::DeviceAdapterTagCuda()) )
  {
  }

  /// Atomically adds \c value to the element at \c index and returns the
  /// value stored there before the addition.
  inline __device__
  T Add(vtkm::Id index, const T& value) const
  {
    T *lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
    return vtkmAtomicAdd(lockedValue, value);
  }

  /// Atomically compares the element at \c index with \c oldValue and, if
  /// they are equal, replaces it with \c newValue.  Returns the value that
  /// was stored at \c index before the operation.
  ///
  /// The parameters are typed on T; the previous hard-coded vtkm::Int64
  /// parameters silently narrowed for 32-bit value types.
  inline __device__
  T CompareAndSwap(vtkm::Id index, const T &newValue, const T &oldValue) const
  {
    T *lockedValue = ::thrust::raw_pointer_cast(this->Portal.GetIteratorBegin() + index);
    return vtkmCompareAndSwap(lockedValue, newValue, oldValue);
  }

private:
  typedef typename vtkm::cont::ArrayHandle<T,vtkm::cont::StorageTagBasic>
      ::template ExecutionTypes<vtkm::cont::DeviceAdapterTagCuda>::Portal PortalType;
  PortalType Portal;

  // 64-bit add is implemented on top of CUDA's unsigned long long
  // atomicAdd; two's-complement arithmetic makes the reinterpretation safe.
  inline __device__
  vtkm::Int64 vtkmAtomicAdd(vtkm::Int64 *address, const vtkm::Int64 &value) const
  {
    return atomicAdd((unsigned long long *)address,(unsigned long long) value);
  }

  inline __device__
  vtkm::Int32 vtkmAtomicAdd(vtkm::Int32 *address, const vtkm::Int32 &value) const
  {
    return atomicAdd(address,value);
  }

  // CUDA's atomicCAS signature is (address, compare, val); note the
  // argument-order swap relative to our (newValue, oldValue) interface.
  inline __device__
  vtkm::Int32 vtkmCompareAndSwap(vtkm::Int32 *address, const vtkm::Int32 &newValue, const vtkm::Int32 &oldValue) const
  {
    return atomicCAS(address,oldValue,newValue);
  }

  inline __device__
  vtkm::Int64 vtkmCompareAndSwap(vtkm::Int64 *address, const vtkm::Int64 &newValue, const vtkm::Int64 &oldValue) const
  {
    return atomicCAS((unsigned long long int*) address,
                     (unsigned long long int ) oldValue,
                     (unsigned long long int ) newValue);
  }
};
|
|
|
|
}
|
|
} // namespace vtkm::cont
|
|
|
|
#endif //vtk_m_cont_cuda_internal_DeviceAdapterAlgorithmCuda_h
|