remove cudaGetDevice calls, favor runtime device config

This commit is contained in:
Nickolas Davis 2021-08-26 14:17:15 -06:00
parent adac415f15
commit 9730de8074
5 changed files with 36 additions and 29 deletions

@ -239,8 +239,10 @@ void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
if (IsManagedPointer(ptr) && numBytes >= Threshold)
{
#if CUDART_VERSION >= 8000
int dev;
VTKM_CUDA_CALL(cudaGetDevice(&dev));
vtkm::Id dev;
vtkm::cont::RuntimeDeviceInformation()
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
.GetDeviceInstance(dev);
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
@ -254,8 +256,10 @@ void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
if (IsManagedPointer(ptr) && numBytes >= Threshold)
{
#if CUDART_VERSION >= 8000
int dev;
VTKM_CUDA_CALL(cudaGetDevice(&dev));
vtkm::Id dev;
vtkm::cont::RuntimeDeviceInformation()
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
.GetDeviceInstance(dev);
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
// VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));

@ -10,6 +10,10 @@
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
#include <vtkm/cont/RuntimeDeviceInformation.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <vtkm/cont/cuda/internal/RuntimeDeviceConfigurationCuda.h>
#include <atomic>
#include <cstring>
#include <functional>
@ -133,14 +137,14 @@ VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
std::call_once(lookupBuiltFlag, []() {
ScheduleParameterBuilder builder;
//iterate over all devices
int count = 0;
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
for (int deviceId = 0; deviceId < count; ++deviceId)
auto cudaDeviceConfig = dynamic_cast<
vtkm::cont::internal::RuntimeDeviceConfiguration<vtkm::cont::DeviceAdapterTagCuda>&>(
vtkm::cont::RuntimeDeviceInformation{}.GetRuntimeConfiguration(
vtkm::cont::DeviceAdapterTagCuda()));
std::vector<cudaDeviceProp> cudaDevices;
cudaDeviceConfig.GetCudaDeviceProp(cudaDevices);
for (const auto& deviceProp : cudaDevices)
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, deviceId);
ScheduleParameters params = builder.Compute(deviceProp.name,
deviceProp.major,
deviceProp.minor,
@ -204,8 +208,10 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
(void)size;
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
vtkm::Id deviceId;
vtkm::cont::RuntimeDeviceInformation()
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
.GetDeviceInstance(deviceId);
const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
blocks = static_cast<vtkm::UInt32>(params.first);
threadsPerBlock = static_cast<vtkm::UInt32>(params.second);
@ -218,8 +224,10 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
{
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
vtkm::Id deviceId;
vtkm::cont::RuntimeDeviceInformation()
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
.GetDeviceInstance(deviceId);
if (size.z <= 1)
{ //2d images
const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];

@ -25,6 +25,11 @@ static int archVersion = 0;
void queryNumberOfDevicesandHighestArchSupported(vtkm::Int32& nod, vtkm::Int32& has)
{
// We currently cannot use RuntimeDeviceInformation{}.GetRuntimeConfiguration(
// vtkm::cont::DeviceAdapterTagCuda()) in this function due to constraints in
// Initialize, which queries device existence before the Runtime
// Configuration is initialized. Once those constraints are removed/fixed, this
// file can be updated to use that call instead of querying the CUDA device directly.
std::call_once(deviceQueryFlag, []() {
//first query for the number of devices
auto res = cudaGetDeviceCount(&numDevices);

@ -62,17 +62,8 @@ public:
<< value << " >= " << this->CudaDeviceCount);
return RuntimeDeviceConfigReturnCode::INVALID_VALUE;
}
try
{
VTKM_CUDA_CALL(cudaSetDevice(value));
return RuntimeDeviceConfigReturnCode::SUCCESS;
}
catch (const vtkm::cont::cuda::ErrorCuda& err)
{
VTKM_LOG_S(vtkm::cont::LogLevel::Error,
"Failed to set CudaDeviceInstance: " << err.GetMessage());
return RuntimeDeviceConfigReturnCode::INTERNAL_ERROR;
}
VTKM_CUDA_CALL(cudaSetDevice(value));
return RuntimeDeviceConfigReturnCode::SUCCESS;
}
VTKM_CONT virtual RuntimeDeviceConfigReturnCode GetDeviceInstance(

@ -24,14 +24,13 @@ namespace cont
namespace internal
{
enum class RuntimeDeviceConfigReturnCode : vtkm::Id
enum class RuntimeDeviceConfigReturnCode
{
SUCCESS,
OUT_OF_BOUNDS,
INVALID_FOR_DEVICE,
INVALID_VALUE,
NOT_APPLIED,
INTERNAL_ERROR
NOT_APPLIED
};
class VTKM_CONT_EXPORT RuntimeDeviceConfigurationBase