From 55c159d6f071feb0de24fbb13f5b1cd53d2f7ec5 Mon Sep 17 00:00:00 2001
From: Kenneth Moreland
Date: Wed, 7 Dec 2016 15:13:02 -0700
Subject: [PATCH] Check error codes from CUDA functions

Most functions in the CUDA runtime API return an error code that must be
checked to determine whether the operation completed successfully. Most
operations in VTK-m just called the function and assumed it completed
correctly, which could lead to further errors. This change wraps most CUDA
calls in a VTKM_CUDA_CALL macro that checks the error code and throws an
exception if the call fails.
---
 vtkm/cont/cuda/ChooseCudaDevice.h             | 12 +--
 .../internal/DeviceAdapterAlgorithmCuda.h     |  2 +-
 .../internal/DeviceAdapterAlgorithmThrust.h   | 81 ++++++++++---------
 vtkm/exec/cuda/internal/ExecutionPolicy.h     |  2 +-
 4 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/vtkm/cont/cuda/ChooseCudaDevice.h b/vtkm/cont/cuda/ChooseCudaDevice.h
index 73dd1c011..381373a27 100644
--- a/vtkm/cont/cuda/ChooseCudaDevice.h
+++ b/vtkm/cont/cuda/ChooseCudaDevice.h
@@ -22,6 +22,8 @@
 
 #include
 
+#include
+
 #include
 #include
 #include
@@ -97,19 +99,19 @@ static int FindFastestDeviceId()
 {
   //get the number of devices and store information
   int numberOfDevices=0;
-  cudaGetDeviceCount(&numberOfDevices);
+  VTKM_CUDA_CALL(cudaGetDeviceCount(&numberOfDevices));
 
   std::vector<compute_info> devices;
   for(int i=0; i < numberOfDevices; ++i)
-    {
+  {
     cudaDeviceProp properties;
-    cudaGetDeviceProperties(&properties, i);
+    VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, i));
     if(properties.computeMode != cudaComputeModeProhibited)
-      {
+    {
       //only add devices that have compute mode allowed
       devices.push_back( compute_info(properties,i) );
-      }
     }
+  }
 
   //sort from fastest to slowest
   std::sort(devices.begin(),devices.end());
diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
index 8589835bb..ce00058e0 100644
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@@ -137,7 +137,7 @@ public:
     deviceQueryInit = true;
 
     //first query for the number of devices
-    cudaGetDeviceCount(&numDevices);
+    VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
 
     for (vtkm::Int32 i = 0; i < numDevices; i++)
     {
diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
index cce1dc469..297412c56 100644
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
@@ -28,6 +28,8 @@
 #include
 #include
 
+#include
+
 #include
 #include
 
@@ -171,8 +173,8 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
   for(vtkm::UInt32 k=0; k < 16; k++)
   {
     cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
     dim3 blockSize3d(indexTable[i],indexTable[j],indexTable[k]);
     dim3 gridSize3d;
@@ -191,16 +193,16 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
     }
 
     compute_block_size(ranges, blockSize3d, gridSize3d);
-    cudaEventRecord(start, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
     Schedule3DIndexKernel <<<gridSize3d, blockSize3d>>> (functor, ranges);
-    cudaEventRecord(stop, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
 
-    cudaEventSynchronize(stop);
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
     float elapsedTimeMilliseconds;
-    cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
 
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
 
     PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
     results.push_back( record );
@@ -224,22 +226,22 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
   std::cout << "flat array performance " << std::endl;
   {
     cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
-    cudaEventRecord(start, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
     typedef vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
             vtkm::cont::DeviceAdapterTagCuda > Algorithm;
     Algorithm::Schedule(functor, numInstances);
-    cudaEventRecord(stop, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
 
-    cudaEventSynchronize(stop);
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
     float elapsedTimeMilliseconds;
-    cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
 
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
 
     std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
   }
@@ -247,23 +249,23 @@ static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& range
   std::cout << "fixed 3d block size performance " << std::endl;
   {
     cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
 
     dim3 blockSize3d(64,2,1);
     dim3 gridSize3d;
 
     compute_block_size(ranges, blockSize3d, gridSize3d);
-    cudaEventRecord(start, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
     Schedule3DIndexKernel <<<gridSize3d, blockSize3d>>> (functor, ranges);
-    cudaEventRecord(stop, 0);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
 
-    cudaEventSynchronize(stop);
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
     float elapsedTimeMilliseconds;
-    cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop);
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
 
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
 
     std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
     std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z << " required: " << elapsedTimeMilliseconds << std::endl;
@@ -1094,8 +1096,10 @@ private:
   static char* devicePtr = nullptr;
   if( !errorArrayInit )
   {
-    cudaMallocHost( (void**)&hostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped );
-    cudaHostGetDevicePointer(&devicePtr, hostPtr, 0);
+    VTKM_CUDA_CALL(cudaMallocHost( (void**)&hostPtr,
+                                   ERROR_ARRAY_SIZE,
+                                   cudaHostAllocMapped ));
+    VTKM_CUDA_CALL(cudaHostGetDevicePointer(&devicePtr, hostPtr, 0));
     errorArrayInit = true;
   }
   //set the size of the array
@@ -1117,10 +1121,11 @@ private:
   if( !gridQueryInit )
   {
     gridQueryInit = true;
-    int currDevice; cudaGetDevice(&currDevice); //get deviceid from cuda
+    int currDevice;
+    VTKM_CUDA_CALL(cudaGetDevice(&currDevice)); //get deviceid from cuda
 
     cudaDeviceProp properties;
-    cudaGetDeviceProperties(&properties, currDevice);
+    VTKM_CUDA_CALL(cudaGetDeviceProperties(&properties, currDevice));
     maxGridSize[0] = static_cast<vtkm::UInt32>(properties.maxGridSize[0]);
     maxGridSize[1] = static_cast<vtkm::UInt32>(properties.maxGridSize[1]);
     maxGridSize[2] = static_cast<vtkm::UInt32>(properties.maxGridSize[2]);
@@ -1134,14 +1139,16 @@ private:
     //what we are going to do next, and than we will store that result
     vtkm::UInt32 *dev_actual_size;
-    cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) );
+    VTKM_CUDA_CALL(
+      cudaMalloc( (void**)&dev_actual_size, sizeof(vtkm::UInt32) )
+      );
     DetermineProperXGridSize <<<1,1>>> (maxGridSize[0], dev_actual_size);
-    cudaDeviceSynchronize();
-    cudaMemcpy( &maxGridSize[0],
-                dev_actual_size,
-                sizeof(vtkm::UInt32),
-                cudaMemcpyDeviceToHost );
-    cudaFree(dev_actual_size);
+    VTKM_CUDA_CALL(cudaDeviceSynchronize());
+    VTKM_CUDA_CALL(cudaMemcpy( &maxGridSize[0],
+                               dev_actual_size,
+                               sizeof(vtkm::UInt32),
+                               cudaMemcpyDeviceToHost ));
+    VTKM_CUDA_CALL(cudaFree(dev_actual_size));
   }
 
   return maxGridSize;
 }
@@ -1196,7 +1203,7 @@ public:
     //In the future I want move this before the schedule call, and throwing
     //an exception if the previous schedule wrote an error. This would help
     //cuda to run longer before we hard sync.
-    cudaDeviceSynchronize();
+    VTKM_CUDA_CALL(cudaDeviceSynchronize());
 
     //check what the value is
     if (hostErrorPtr[0] != '\0')
@@ -1255,7 +1262,7 @@ public:
    //In the future I want move this before the schedule call, and throwing
    //an exception if the previous schedule wrote an error. This would help
    //cuda to run longer before we hard sync.
-   cudaDeviceSynchronize();
+   VTKM_CUDA_CALL(cudaDeviceSynchronize());
 
    //check what the value is
    if (hostErrorPtr[0] != '\0')
diff --git a/vtkm/exec/cuda/internal/ExecutionPolicy.h b/vtkm/exec/cuda/internal/ExecutionPolicy.h
index 794f2a2c0..9fe8098d8 100644
--- a/vtkm/exec/cuda/internal/ExecutionPolicy.h
+++ b/vtkm/exec/cuda/internal/ExecutionPolicy.h
@@ -190,7 +190,7 @@ __host__ __device__
 
   //only sync if we are being invoked from the host
 #ifndef __CUDA_ARCH__
-  cudaDeviceSynchronize();
+  VTKM_CUDA_CALL(cudaDeviceSynchronize());
 #endif
 
   return result;
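For readers unfamiliar with this error-checking pattern, the sketch below shows roughly what a wrapper like VTKM_CUDA_CALL does. It is not VTK-m's implementation: the real macro is presumably provided by the header the patch #includes in each affected file, and the commit message only says it checks the error code and throws an exception. The names CheckCudaCall and CUDA_CALL_SKETCH, the use of std::runtime_error, and the exact message format are invented for this illustration. The essential idea is to capture the cudaError_t returned by a runtime call, compare it against cudaSuccess, and throw with the failing expression and source location so the return code can no longer be silently dropped.

// Illustrative sketch only -- not the VTK-m implementation.
#include <cuda_runtime.h>

#include <sstream>
#include <stdexcept>

// Convert a failed CUDA runtime call into a C++ exception that records the
// failing expression, the source location, and CUDA's error description.
inline void CheckCudaCall(cudaError_t code, const char* expression,
                          const char* file, int line)
{
  if (code != cudaSuccess)
  {
    std::ostringstream message;
    message << "CUDA error " << cudaGetErrorName(code) << " ("
            << cudaGetErrorString(code) << ") from '" << expression
            << "' at " << file << ":" << line;
    throw std::runtime_error(message.str());
  }
}

// Wrapping the call in a macro lets #call, __FILE__, and __LINE__ refer to
// the call site, so the message names the exact statement that failed.
#define CUDA_CALL_SKETCH(call) CheckCudaCall((call), #call, __FILE__, __LINE__)

int main()
{
  int numberOfDevices = 0;
  // Same usage pattern as in the patch: the return code is always inspected.
  CUDA_CALL_SKETCH(cudaGetDeviceCount(&numberOfDevices));
  return (numberOfDevices > 0) ? 0 : 1;
}

One detail worth noting: kernel launches themselves do not return a cudaError_t, which is consistent with the patch wrapping only runtime API calls (cudaEventRecord, cudaMemcpy, cudaDeviceSynchronize, and so on) while leaving the <<<...>>> launch statements untouched. A launch failure instead surfaces through a later checked call such as the wrapped cudaDeviceSynchronize.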