Check error codes from CUDA functions

Most functions in the CUDA runtime API return an error code that must be
checked to determine whether the operation completed successfully. Most
operations in VTK-m just called the function and assumed it completed
correctly, which could lead to further errors. This change wraps most
CUDA calls in a VTKM_CUDA_CALL macro that checks the error code and
throws an exception if the call fails.
This commit is contained in:
Kenneth Moreland 2016-12-07 15:13:02 -07:00
parent ce651e479d
commit 55c159d6f0
4 changed files with 53 additions and 44 deletions

@@ -22,6 +22,8 @@
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/cuda/ErrorControlCuda.h>
#include <cuda.h>
#include <algorithm>
#include <vector>
@@ -97,19 +99,19 @@ static int FindFastestDeviceId()
{
//get the number of devices and store information
int numberOfDevices=0;
cudaGetDeviceCount(&numberOfDevices);
VTKM_CUDA_CALL(cudaGetDeviceCount(&numberOfDevices));
std::vector<compute_info> devices;
for(int i=0; i < numberOfDevices; ++i)
{
{
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);
VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, i));
if(properties.computeMode != cudaComputeModeProhibited)
{
{
//only add devices that have compute mode allowed
devices.push_back( compute_info(properties,i) );
}
}
}
//sort from fastest to slowest
std::sort(devices.begin(),devices.end());

@@ -137,7 +137,7 @@ public:
deviceQueryInit = true;
//first query for the number of devices
cudaGetDeviceCount(&numDevices);
VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
for (vtkm::Int32 i = 0; i < numDevices; i++)
{

@@ -28,6 +28,8 @@
#include <vtkm/TypeTraits.h>
#include <vtkm/UnaryPredicates.h>
#include <vtkm/cont/cuda/ErrorControlCuda.h>
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
@@ -171,8 +173,8 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
for(vtkm::UInt32 k=0; k < 16; k++)
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(indexTable[i],indexTable[j],indexTable[k]);
dim3 gridSize3d;
@@ -191,16 +193,16 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
}
compute_block_size(ranges, blockSize3d, gridSize3d);
cudaEventRecord(start, 0);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
cudaEventRecord(stop, 0);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
cudaEventSynchronize(stop);
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
cudaEventDestroy(start);
cudaEventDestroy(stop);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
results.push_back( record );
@@ -224,22 +226,22 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
std::cout << "flat array performance " << std::endl;
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
cudaEventRecord(start, 0);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
typedef
vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
vtkm::cont::DeviceAdapterTagCuda > Algorithm;
Algorithm::Schedule(functor, numInstances);
cudaEventRecord(stop, 0);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
cudaEventSynchronize(stop);
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
cudaEventDestroy(start);
cudaEventDestroy(stop);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
}
@@ -247,23 +249,23 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
std::cout << "fixed 3d block size performance " << std::endl;
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(64,2,1);
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
cudaEventRecord(start, 0);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
cudaEventRecord(stop, 0);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
cudaEventSynchronize(stop);
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
cudaEventDestroy(start);
cudaEventDestroy(stop);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
@@ -1094,8 +1096,10 @@ private:
static char* devicePtr = nullptr;
if( !errorArrayInit )
{
cudaMallocHost( (void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped );
cudaHostGetDevicePointer(&devicePtr, hostPtr, 0);
VTKM_CUDA_CALL(cudaMallocHost( (void**)&hostPtr,
ERROR_ARRAY_SIZE,
cudaHostAllocMapped ));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
errorArrayInit = true;
}
//set the size of the array
@@ -1117,10 +1121,11 @@ private:
if( !gridQueryInit )
{
gridQueryInit = true;
int currDevice; cudaGetDevice(&currDevice); //get deviceid from cuda
int currDevice;
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, currDevice);
VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice));
maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
@@ -1134,14 +1139,16 @@ private:
//what we are going to do next, and than we will store that result
vtkm::UInt32 *dev_actual_size;
cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) );
VTKM_CUDA_CALL(
cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) )
);
DetermineProperXGridSize <<<1,1>>> (maxGridSize[0], dev_actual_size);
cudaDeviceSynchronize();
cudaMemcpy( &maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost );
cudaFree(dev_actual_size);
VTKM_CUDA_CALL(cudaDeviceSynchronize());
VTKM_CUDA_CALL(cudaMemcpy( &maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost ));
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
}
return maxGridSize;
}
@@ -1196,7 +1203,7 @@ public:
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
cudaDeviceSynchronize();
VTKM_CUDA_CALL(cudaDeviceSynchronize());
//check what the value is
if (hostErrorPtr[0] != '\0')
@@ -1255,7 +1262,7 @@ public:
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
cudaDeviceSynchronize();
VTKM_CUDA_CALL(cudaDeviceSynchronize());
//check what the value is
if (hostErrorPtr[0] != '\0')

@@ -190,7 +190,7 @@ __host__ __device__
//only sync if we are being invoked from the host
#ifndef __CUDA_ARCH__
cudaDeviceSynchronize();
VTKM_CUDA_CALL(cudaDeviceSynchronize());
#endif
return result;