Check error codes from CUDA functions

Most functions in the CUDA runtime API return an error code that must be
checked to determine whether the operation completed successfully. Most
operations in VTK-m just called the function and assumed it completed
correctly, which could lead to further errors. This change wraps most
CUDA calls in a VTKM_CUDA_CALL macro that checks the error code and
throws an exception if the call fails.
This commit is contained in:
Kenneth Moreland 2016-12-07 15:13:02 -07:00
parent ce651e479d
commit 55c159d6f0
4 changed files with 53 additions and 44 deletions

@@ -22,6 +22,8 @@
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/cuda/ErrorControlCuda.h>
#include <cuda.h>
#include <algorithm>
#include <vector>
@@ -97,19 +99,19 @@ static int FindFastestDeviceId()
{
//get the number of devices and store information
int numberOfDevices=0;
cudaGetDeviceCount(&numberOfDevices);
VTKM_CUDA_CALL(cudaGetDeviceCount(&numberOfDevices));
std::vector<compute_info> devices;
for(int i=0; i < numberOfDevices; ++i)
{
{
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);
VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, i));
if(properties.computeMode != cudaComputeModeProhibited)
{
{
//only add devices that have compute mode allowed
devices.push_back( compute_info(properties,i) );
}
}
}
//sort from fastest to slowest
std::sort(devices.begin(),devices.end());

@@ -137,7 +137,7 @@ public:
deviceQueryInit = true;
//first query for the number of devices
cudaGetDeviceCount(&numDevices);
VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
for (vtkm::Int32 i = 0; i < numDevices; i++)
{

@@ -28,6 +28,8 @@
#include <vtkm/TypeTraits.h>
#include <vtkm/UnaryPredicates.h>
#include <vtkm/cont/cuda/ErrorControlCuda.h>
#include <vtkm/cont/cuda/internal/MakeThrustIterator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
@@ -171,8 +173,8 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
for(vtkm::UInt32 k=0; k < 16; k++)
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(indexTable[i],indexTable[j],indexTable[k]);
dim3 gridSize3d;
@@ -191,16 +193,16 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
}
compute_block_size(ranges, blockSize3d, gridSize3d);
cudaEventRecord(start, 0);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
cudaEventRecord(stop, 0);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
cudaEventSynchronize(stop);
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
cudaEventDestroy(start);
cudaEventDestroy(stop);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
results.push_back( record );
@@ -224,22 +226,22 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
std::cout << "flat array performance " << std::endl;
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
cudaEventRecord(start, 0);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
typedef
vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
vtkm::cont::DeviceAdapterTagCuda > Algorithm;
Algorithm::Schedule(functor, numInstances);
cudaEventRecord(stop, 0);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
cudaEventSynchronize(stop);
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
cudaEventDestroy(start);
cudaEventDestroy(stop);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
}
@@ -247,23 +249,23 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
std::cout << "fixed 3d block size performance " << std::endl;
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
dim3 blockSize3d(64,2,1);
dim3 gridSize3d;
compute_block_size(ranges, blockSize3d, gridSize3d);
cudaEventRecord(start, 0);
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
Schedule3DIndexKernel<Functor> <<<gridSize3d, blockSize3d>>> (functor, ranges);
cudaEventRecord(stop, 0);
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
cudaEventSynchronize(stop);
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
cudaEventDestroy(start);
cudaEventDestroy(stop);
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
@@ -1094,8 +1096,10 @@ private:
static char* devicePtr = nullptr;
if( !errorArrayInit )
{
cudaMallocHost( (void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped );
cudaHostGetDevicePointer(&devicePtr, hostPtr, 0);
VTKM_CUDA_CALL(cudaMallocHost( (void**)&hostPtr,
ERROR_ARRAY_SIZE,
cudaHostAllocMapped ));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
errorArrayInit = true;
}
//set the size of the array
@@ -1117,10 +1121,11 @@ private:
if( !gridQueryInit )
{
gridQueryInit = true;
int currDevice; cudaGetDevice(&currDevice); //get deviceid from cuda
int currDevice;
VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, currDevice);
VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice));
maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
@@ -1134,14 +1139,16 @@ private:
//what we are going to do next, and than we will store that result
vtkm::UInt32 *dev_actual_size;
cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) );
VTKM_CUDA_CALL(
cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) )
);
DetermineProperXGridSize <<<1,1>>> (maxGridSize[0], dev_actual_size);
cudaDeviceSynchronize();
cudaMemcpy( &maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost );
cudaFree(dev_actual_size);
VTKM_CUDA_CALL(cudaDeviceSynchronize());
VTKM_CUDA_CALL(cudaMemcpy( &maxGridSize[0],
dev_actual_size,
sizeof(vtkm::UInt32),
cudaMemcpyDeviceToHost ));
VTKM_CUDA_CALL(cudaFree(dev_actual_size));
}
return maxGridSize;
}
@@ -1196,7 +1203,7 @@ public:
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
cudaDeviceSynchronize();
VTKM_CUDA_CALL(cudaDeviceSynchronize());
//check what the value is
if (hostErrorPtr[0] != '\0')
@@ -1255,7 +1262,7 @@ public:
//In the future I want move this before the schedule call, and throwing
//an exception if the previous schedule wrote an error. This would help
//cuda to run longer before we hard sync.
cudaDeviceSynchronize();
VTKM_CUDA_CALL(cudaDeviceSynchronize());
//check what the value is
if (hostErrorPtr[0] != '\0')

@@ -190,7 +190,7 @@ __host__ __device__
//only sync if we are being invoked from the host
#ifndef __CUDA_ARCH__
cudaDeviceSynchronize();
VTKM_CUDA_CALL(cudaDeviceSynchronize());
#endif
return result;