Introduce a log level that details kernel launch parameters

2024-09-16 17:22:55 +00:00 · 2019-05-13 16:32:43 -04:00 · 2019-05-13 16:32:43 -04:00 · 8aaf922aa4
commit 8aaf922aa4
parent 990b0241a3
5 changed files with 123 additions and 10 deletions
--- a/docs/changelog/add-cuda-kernel-details-to-logging.md
+++ b/docs/changelog/add-cuda-kernel-details-to-logging.md
@ -0,0 +1,12 @@
+# VTK-m logs details about each CUDA kernel launch
+
+The VTK-m logging infrastructure has been extended with a new log level
+`KernelLaunches` which exists between `MemTransfer` and `Cast`.
+
+This log level reports the number of blocks, threads per block, and the
+PTX version of each CUDA kernel launched.
+
+This logging level was primarily introduced to help developers that are
+tracking down issues that occur when VTK-m components have been built with
+different `sm_XX` flags and help people looking to do kernel performance
+tuning.
--- a/vtkm/cont/Logging.cxx
+++ b/vtkm/cont/Logging.cxx
@ -115,6 +115,7 @@ void InitLogging(int& argc, char* argv[])
  SetLogLevelName(vtkm::cont::LogLevel::MemCont, "MemC");
  SetLogLevelName(vtkm::cont::LogLevel::MemExec, "MemE");
  SetLogLevelName(vtkm::cont::LogLevel::MemTransfer, "MemT");
+  SetLogLevelName(vtkm::cont::LogLevel::KernelLaunches, "Kern");
  SetLogLevelName(vtkm::cont::LogLevel::Cast, "Cast");


@ -152,6 +153,16 @@ void SetStderrLogLevel(LogLevel level)
 #endif // VTKM_ENABLE_LOGGING
 }

+VTKM_CONT
+vtkm::cont::LogLevel GetStderrLogLevel()
+{
+#ifdef VTKM_ENABLE_LOGGING
+  return static_cast<vtkm::cont::LogLevel>(loguru::g_stderr_verbosity);
+#else  // VTKM_ENABLE_LOGGING
+  return vtkm::cont::LogLevel::Off;
+#endif // VTKM_ENABLE_LOGGING
+}
+
 VTKM_CONT
 void SetLogThreadName(const std::string& name)
 {
--- a/vtkm/cont/Logging.h
+++ b/vtkm/cont/Logging.h
@ -104,6 +104,8 @@
 ///   execution environments, respectively.
 /// - MemTransfer: This level logs memory transfers between the control and host
 ///   environments.
+/// - KernelLaunches: This level logs details about each device side kernel launch
+///   such as the CUDA PTX, Warps, and Grids used.
 /// - Cast: Logs details of dynamic object resolution.
 ///
 /// The log may be shared and extended by applications that use VTK-m. There
@ -310,6 +312,9 @@ enum class LogLevel
  /// Host->device / device->host data copies
  MemTransfer,

+  /// Details on Device-side Kernel Launches
+  KernelLaunches,
+
  /// When a dynamic object is (or isn't) resolved via CastAndCall, etc.
  Cast,

@ -350,6 +355,13 @@ VTKM_CONT_EXPORT
 VTKM_CONT
 void SetStderrLogLevel(vtkm::cont::LogLevel level);

+/**
+ * Get the active highest log level that will be printed to stderr.
+ */
+VTKM_CONT_EXPORT
+VTKM_CONT
+vtkm::cont::LogLevel GetStderrLogLevel();
+
 /**
 * Register a custom name to identify a log level. The name will be truncated
 * to 4 characters internally.
@ -416,20 +428,23 @@ std::string GetSizeString(vtkm::UInt64 bytes, int prec = 2);
 * enabled and the platform supports it, the type name will also be demangled.
 * @{
 */
+static inline VTKM_CONT std::string TypeToString(const std::type_info& t)
+{
+#ifdef VTKM_ENABLE_LOGGING
+  return loguru::demangle(t.name()).c_str();
+#else  // VTKM_ENABLE_LOGGING
+  return t.name();
+#endif // VTKM_ENABLE_LOGGING
+}
 template <typename T>
 static inline VTKM_CONT std::string TypeToString()
 {
-#ifdef VTKM_ENABLE_LOGGING
-  return loguru::demangle(typeid(T).name()).c_str();
-#else  // VTKM_ENABLE_LOGGING
-  return typeid(T).name();
-#endif // VTKM_ENABLE_LOGGING
+  return TypeToString(typeid(T));
 }
-
 template <typename T>
-static inline VTKM_CONT std::string TypeToString(const T&)
+static inline VTKM_CONT std::string TypeToString(const T& t)
 {
-  return TypeToString<T>();
+  return TypeToString(typeid(t));
 }
 /**@}*/
 }
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
@ -233,5 +233,45 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
    threadsPerBlock = params.second;
  }
 }
+
+void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
+  const cudaFuncAttributes& func_attrs,
+  const std::type_info& worklet_info,
+  vtkm::UInt32 blocks,
+  vtkm::UInt32 threadsPerBlock,
+  vtkm::Id)
+{
+  (void)func_attrs;
+  (void)blocks;
+  (void)threadsPerBlock;
+  std::string name = vtkm::cont::TypeToString(worklet_info);
+  VTKM_LOG_F(vtkm::cont::LogLevel::KernelLaunches,
+             "Launching 1D kernel %s on CUDA [ptx=%i, blocks=%i, threadsPerBlock=%i]",
+             name.c_str(),
+             (func_attrs.ptxVersion * 10),
+             blocks,
+             threadsPerBlock);
+}
+
+void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
+  const cudaFuncAttributes& func_attrs,
+  const std::type_info& worklet_info,
+  vtkm::UInt32 blocks,
+  dim3 threadsPerBlock,
+  const dim3&)
+{
+  (void)func_attrs;
+  (void)blocks;
+  (void)threadsPerBlock;
+  std::string name = vtkm::cont::TypeToString(worklet_info);
+  VTKM_LOG_F(vtkm::cont::LogLevel::KernelLaunches,
+             "Launching 3D kernel %s on CUDA [ptx=%i, blocks=%i, threadsPerBlock=%i, %i, %i]",
+             name.c_str(),
+             (func_attrs.ptxVersion * 10),
+             blocks,
+             threadsPerBlock.x,
+             threadsPerBlock.y,
+             threadsPerBlock.z);
+}
 }
 } // end namespace vtkm::cont
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h
@ -132,7 +132,6 @@ VTKM_CONT_EXPORT void InitScheduleParameters(
                                           int maxThreadsPerMultiProcessor,
                                           int maxThreadsPerBlock));

-
 namespace internal
 {

@ -169,7 +168,6 @@ __global__ void TaskStrided3DLaunch(TaskType task, dim3 size)
  }
 }

-
 template <typename T, typename BinaryOperationType>
 __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
 {
@ -1418,6 +1416,20 @@ public:
  VTKM_CONT_EXPORT
  static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);

+  VTKM_CONT_EXPORT
+  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
+                              const std::type_info& worklet_info,
+                              vtkm::UInt32 blocks,
+                              vtkm::UInt32 threadsPerBlock,
+                              vtkm::Id size);
+
+  VTKM_CONT_EXPORT
+  static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
+                              const std::type_info& worklet_info,
+                              vtkm::UInt32 blocks,
+                              dim3 threadsPerBlock,
+                              const dim3& size);
+
 public:
  template <typename WType, typename IType>
  static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
@ -1436,6 +1448,17 @@ public:
    vtkm::UInt32 blocks, threadsPerBlock;
    GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);

+#ifdef VTKM_ENABLE_LOGGING
+    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
+    {
+      using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>;
+      cudaFuncAttributes empty_kernel_attrs;
+      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
+                                           cuda::internal::TaskStrided1DLaunch<FunctorType>));
+      LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, numInstances);
+    }
+#endif
+
    cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
      functor, numInstances);
  }
@ -1462,6 +1485,17 @@ public:
    dim3 threadsPerBlock;
    GetBlocksAndThreads(blocks, threadsPerBlock, ranges);

+#ifdef VTKM_ENABLE_LOGGING
+    if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
+    {
+      using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>;
+      cudaFuncAttributes empty_kernel_attrs;
+      VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
+                                           cuda::internal::TaskStrided3DLaunch<FunctorType>));
+      LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, ranges);
+    }
+#endif
+
    cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
      functor, ranges);
  }
@ -1472,6 +1506,7 @@ public:
    VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);

    vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
+
    ScheduleTask(kernel, numInstances);
  }