From 55c159d6f071feb0de24fbb13f5b1cd53d2f7ec5 Mon Sep 17 00:00:00 2001
From: Kenneth Moreland
Date: Wed, 7 Dec 2016 15:13:02 -0700
Subject: [PATCH] Check error codes from CUDA functions

Most functions in the CUDA runtime API return an error code that must be
checked to determine whether the operation completed successfully. Most
operations in VTK-m just called the function and assumed it completed
correctly, which could lead to further errors. This change wraps most CUDA
calls in a VTKM_CUDA_CALL macro that checks the error code and throws an
exception if the call fails.
---
 vtkm/cont/cuda/ChooseCudaDevice.h             | 12 +--
 .../internal/DeviceAdapterAlgorithmCuda.h     |  2 +-
 .../internal/DeviceAdapterAlgorithmThrust.h   | 81 ++++++++++---------
 vtkm/exec/cuda/internal/ExecutionPolicy.h     |  2 +-
 4 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/vtkm/cont/cuda/ChooseCudaDevice.h b/vtkm/cont/cuda/ChooseCudaDevice.h
index 73dd1c011..381373a27 100644
--- a/vtkm/cont/cuda/ChooseCudaDevice.h
+++ b/vtkm/cont/cuda/ChooseCudaDevice.h
@@ -22,6 +22,8 @@
 
 #include
 
+#include
+
 #include
 #include
 #include
@@ -97,19 +99,19 @@ static int FindFastestDeviceId()
 {
   //get the number of devices and store information
   int numberOfDevices=0;
-  cudaGetDeviceCount(&numberOfDevices);
+  VTKM_CUDA_CALL(cudaGetDeviceCount(&numberOfDevices));
 
   std::vector<compute_info> devices;
   for(int i=0; i < numberOfDevices; ++i)
-    {
+  {
     cudaDeviceProp properties;
-    cudaGetDeviceProperties(&properties, i);
+    VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, i));
     if(properties.computeMode != cudaComputeModeProhibited)
-      {
+    {
       //only add devices that have compute mode allowed
       devices.push_back( compute_info(properties,i) );
-      }
     }
+  }
 
   //sort from fastest to slowest
   std::sort(devices.begin(),devices.end());
diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
index 8589835bb..ce00058e0 100644
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@@ -137,7 +137,7 @@ public:
     deviceQueryInit = true;
 
     //first query for the number of devices
-    cudaGetDeviceCount(&numDevices);
+    VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
 
     for (vtkm::Int32 i = 0; i < numDevices; i++)
     {
diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
index cce1dc469..297412c56 100644
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
@@ -28,6 +28,8 @@
 #include
 #include
 
+#include
+
 #include
 #include
 
@@ -171,8 +173,8 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
   for(vtkm::UInt32 k=0; k < 16; k++)
   {
     cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
     dim3 blockSize3d(indexTable[i],indexTable[j],indexTable[k]);
     dim3 gridSize3d;
@@ -191,16 +193,16 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
     }
 
     compute_block_size(ranges, blockSize3d, gridSize3d);
-    cudaEventRecord(start, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
     Schedule3DIndexKernel <<<gridSize3d, blockSize3d>>> (functor, ranges);
-    cudaEventRecord(stop, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
 
-    cudaEventSynchronize(stop);
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
     float elapsedTimeMilliseconds;
-    cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
 
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
 
     PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
     results.push_back( record );
@@ -224,22 +226,22 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
   std::cout << "flat array performance " << std::endl;
   {
     cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
-    cudaEventRecord(start, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
     typedef vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
             vtkm::cont::DeviceAdapterTagCuda > Algorithm;
     Algorithm::Schedule(functor, numInstances);
-    cudaEventRecord(stop, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
 
-    cudaEventSynchronize(stop);
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
     float elapsedTimeMilliseconds;
-    cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
 
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
 
     std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
   }
@@ -247,23 +249,23 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
   std::cout << "fixed 3d block size performance " << std::endl;
   {
     cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
     dim3 blockSize3d(64,2,1);
     dim3 gridSize3d;
 
     compute_block_size(ranges, blockSize3d, gridSize3d);
-    cudaEventRecord(start, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
     Schedule3DIndexKernel <<<gridSize3d, blockSize3d>>> (functor, ranges);
-    cudaEventRecord(stop, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
 
-    cudaEventSynchronize(stop);
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
     float elapsedTimeMilliseconds;
-    cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
 
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
 
     std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
     std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
@@ -1094,8 +1096,10 @@ private:
   static char* devicePtr = nullptr;
   if( !errorArrayInit )
   {
-    cudaMallocHost( (void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped );
-    cudaHostGetDevicePointer(&devicePtr, hostPtr, 0);
+    VTKM_CUDA_CALL(cudaMallocHost( (void**)&hostPtr,
+                                   ERROR_ARRAY_SIZE,
+                                   cudaHostAllocMapped ));
+    VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
     errorArrayInit = true;
   }
   //set the size of the array
@@ -1117,10 +1121,11 @@ private:
   if( !gridQueryInit )
   {
     gridQueryInit = true;
-    int currDevice; cudaGetDevice(&currDevice); //get deviceid from cuda
+    int currDevice;
+    VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
 
     cudaDeviceProp properties;
-    cudaGetDeviceProperties(&properties, currDevice);
+    VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice));
     maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
     maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
     maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
@@ -1134,14 +1139,16 @@ private:
     //what we are going to do next, and than we will store that result
     vtkm::UInt32 *dev_actual_size;
-    cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) );
+    VTKM_CUDA_CALL(
+      cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) )
+      );
     DetermineProperXGridSize <<<1,1>>> (maxGridSize[0], dev_actual_size);
-    cudaDeviceSynchronize();
-    cudaMemcpy( &maxGridSize[0],
-                dev_actual_size,
-                sizeof(vtkm::UInt32),
-                cudaMemcpyDeviceToHost );
-    cudaFree(dev_actual_size);
+    VTKM_CUDA_CALL(cudaDeviceSynchronize());
+    VTKM_CUDA_CALL(cudaMemcpy( &maxGridSize[0],
+                               dev_actual_size,
+                               sizeof(vtkm::UInt32),
+                               cudaMemcpyDeviceToHost ));
+    VTKM_CUDA_CALL(cudaFree(dev_actual_size));
   }
 
   return maxGridSize;
 }
@@ -1196,7 +1203,7 @@ public:
     //In the future I want move this before the schedule call, and throwing
     //an exception if the previous schedule wrote an error. This would help
     //cuda to run longer before we hard sync.
-    cudaDeviceSynchronize();
+    VTKM_CUDA_CALL(cudaDeviceSynchronize());
 
     //check what the value is
     if (hostErrorPtr[0] != '\0')
@@ -1255,7 +1262,7 @@ public:
    //In the future I want move this before the schedule call, and throwing
    //an exception if the previous schedule wrote an error. This would help
    //cuda to run longer before we hard sync.
-   cudaDeviceSynchronize();
+   VTKM_CUDA_CALL(cudaDeviceSynchronize());
 
    //check what the value is
    if (hostErrorPtr[0] != '\0')
diff --git a/vtkm/exec/cuda/internal/ExecutionPolicy.h b/vtkm/exec/cuda/internal/ExecutionPolicy.h
index 794f2a2c0..9fe8098d8 100644
--- a/vtkm/exec/cuda/internal/ExecutionPolicy.h
+++ b/vtkm/exec/cuda/internal/ExecutionPolicy.h
@@ -190,7 +190,7 @@ __host__ __device__
 
   //only sync if we are being invoked from the host
 #ifndef __CUDA_ARCH__
-  cudaDeviceSynchronize();
+  VTKM_CUDA_CALL(cudaDeviceSynchronize());
 #endif
 
   return result;
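For readers unfamiliar with this error-checking pattern, the sketch below shows roughly what a wrapper like VTKM_CUDA_CALL does. It is not VTK-m's implementation: the real macro is presumably provided by the header the patch #includes in each affected file, and the commit message only says it checks the error code and throws an exception. The names CheckCudaCall and CUDA_CALL_SKETCH, the use of std::runtime_error, and the exact message format are invented for this illustration. The essential idea is to capture the cudaError_t returned by a runtime call, compare it against cudaSuccess, and throw with the failing expression and source location so the return code can no longer be silently dropped.

// Illustrative sketch only -- not the VTK-m implementation.
#include <cuda_runtime.h>

#include <sstream>
#include <stdexcept>

// Convert a failed CUDA runtime call into a C++ exception that records the
// failing expression, the source location, and CUDA's error description.
inline void CheckCudaCall(cudaError_t code, const char* expression,
                          const char* file, int line)
{
  if (code != cudaSuccess)
  {
    std::ostringstream message;
    message << "CUDA error " << cudaGetErrorName(code) << " ("
            << cudaGetErrorString(code) << ") from '" << expression
            << "' at " << file << ":" << line;
    throw std::runtime_error(message.str());
  }
}

// Wrapping the call in a macro lets #call, __FILE__, and __LINE__ refer to
// the call site, so the message names the exact statement that failed.
#define CUDA_CALL_SKETCH(call) CheckCudaCall((call), #call, __FILE__, __LINE__)

int main()
{
  int numberOfDevices = 0;
  // Same usage pattern as in the patch: the return code is always inspected.
  CUDA_CALL_SKETCH(cudaGetDeviceCount(&numberOfDevices));
  return (numberOfDevices > 0) ? 0 : 1;
}

One detail worth noting: kernel launches themselves do not return a cudaError_t, which is consistent with the patch wrapping only runtime API calls (cudaEventRecord, cudaMemcpy, cudaDeviceSynchronize, and so on) while leaving the <<<...>>> launch statements untouched. A launch failure instead surfaces through a later checked call such as the wrapped cudaDeviceSynchronize.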