Introduce a log level that details kernel launch parameters

This commit is contained in:
Robert Maynard 2019-05-13 16:32:43 -04:00
parent 990b0241a3
commit 8aaf922aa4
5 changed files with 123 additions and 10 deletions

@ -0,0 +1,12 @@
# VTK-m logs details about each CUDA kernel launch
The VTK-m logging infrastructure has been extended with a new log level
`KernelLaunches` which exists between `MemTransfer` and `Cast`.
This log level reports the number of blocks, threads per block, and the
PTX version of each CUDA kernel launched.
This logging level was primarily introduced to help developers that are
tracking down issues that occur when VTK-m components have been built with
different `sm_XX` flags and help people looking to do kernel performance
tuning.

@ -115,6 +115,7 @@ void InitLogging(int& argc, char* argv[])
SetLogLevelName(vtkm::cont::LogLevel::MemCont, "MemC");
SetLogLevelName(vtkm::cont::LogLevel::MemExec, "MemE");
SetLogLevelName(vtkm::cont::LogLevel::MemTransfer, "MemT");
SetLogLevelName(vtkm::cont::LogLevel::KernelLaunches, "Kern");
SetLogLevelName(vtkm::cont::LogLevel::Cast, "Cast");
@ -152,6 +153,16 @@ void SetStderrLogLevel(LogLevel level)
#endif // VTKM_ENABLE_LOGGING
}
VTKM_CONT
vtkm::cont::LogLevel GetStderrLogLevel()
{
#ifdef VTKM_ENABLE_LOGGING
return static_cast<vtkm::cont::LogLevel>(loguru::g_stderr_verbosity);
#else // VTKM_ENABLE_LOGGING
return vtkm::cont::LogLevel::Off;
#endif // VTKM_ENABLE_LOGGING
}
VTKM_CONT
void SetLogThreadName(const std::string& name)
{

@ -104,6 +104,8 @@
/// execution environments, respectively.
/// - MemTransfer: This level logs memory transfers between the control and host
/// environments.
/// - KernelLaunches: This level logs details about each device side kernel launch
/// such as the CUDA PTX, Warps, and Grids used.
/// - Cast: Logs details of dynamic object resolution.
///
/// The log may be shared and extended by applications that use VTK-m. There
@ -310,6 +312,9 @@ enum class LogLevel
/// Host->device / device->host data copies
MemTransfer,
/// Details on Device-side Kernel Launches
KernelLaunches,
/// When a dynamic object is (or isn't) resolved via CastAndCall, etc.
Cast,
@ -350,6 +355,13 @@ VTKM_CONT_EXPORT
VTKM_CONT
void SetStderrLogLevel(vtkm::cont::LogLevel level);
/**
* Get the active highest log level that will be printed to stderr.
*/
VTKM_CONT_EXPORT
VTKM_CONT
vtkm::cont::LogLevel GetStderrLogLevel();
/**
* Register a custom name to identify a log level. The name will be truncated
* to 4 characters internally.
@ -416,20 +428,23 @@ std::string GetSizeString(vtkm::UInt64 bytes, int prec = 2);
* enabled and the platform supports it, the type name will also be demangled.
* @{
*/
static inline VTKM_CONT std::string TypeToString(const std::type_info& t)
{
#ifdef VTKM_ENABLE_LOGGING
return loguru::demangle(t.name()).c_str();
#else // VTKM_ENABLE_LOGGING
return t.name();
#endif // VTKM_ENABLE_LOGGING
}
template <typename T>
static inline VTKM_CONT std::string TypeToString()
{
#ifdef VTKM_ENABLE_LOGGING
return loguru::demangle(typeid(T).name()).c_str();
#else // VTKM_ENABLE_LOGGING
return typeid(T).name();
#endif // VTKM_ENABLE_LOGGING
return TypeToString(typeid(T));
}
template <typename T>
static inline VTKM_CONT std::string TypeToString(const T&)
static inline VTKM_CONT std::string TypeToString(const T& t)
{
return TypeToString<T>();
return TypeToString(typeid(t));
}
/**@}*/
}

@ -233,5 +233,45 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThrea
threadsPerBlock = params.second;
}
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
const std::type_info& worklet_info,
vtkm::UInt32 blocks,
vtkm::UInt32 threadsPerBlock,
vtkm::Id)
{
(void)func_attrs;
(void)blocks;
(void)threadsPerBlock;
std::string name = vtkm::cont::TypeToString(worklet_info);
VTKM_LOG_F(vtkm::cont::LogLevel::KernelLaunches,
"Launching 1D kernel %s on CUDA [ptx=%i, blocks=%i, threadsPerBlock=%i]",
name.c_str(),
(func_attrs.ptxVersion * 10),
blocks,
threadsPerBlock);
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
const std::type_info& worklet_info,
vtkm::UInt32 blocks,
dim3 threadsPerBlock,
const dim3&)
{
(void)func_attrs;
(void)blocks;
(void)threadsPerBlock;
std::string name = vtkm::cont::TypeToString(worklet_info);
VTKM_LOG_F(vtkm::cont::LogLevel::KernelLaunches,
"Launching 3D kernel %s on CUDA [ptx=%i, blocks=%i, threadsPerBlock=%i, %i, %i]",
name.c_str(),
(func_attrs.ptxVersion * 10),
blocks,
threadsPerBlock.x,
threadsPerBlock.y,
threadsPerBlock.z);
}
}
} // end namespace vtkm::cont

@ -132,7 +132,6 @@ VTKM_CONT_EXPORT void InitScheduleParameters(
int maxThreadsPerMultiProcessor,
int maxThreadsPerBlock));
namespace internal
{
@ -169,7 +168,6 @@ __global__ void TaskStrided3DLaunch(TaskType task, dim3 size)
}
}
template <typename T, typename BinaryOperationType>
__global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
{
@ -1418,6 +1416,20 @@ public:
VTKM_CONT_EXPORT
static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
VTKM_CONT_EXPORT
static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
const std::type_info& worklet_info,
vtkm::UInt32 blocks,
vtkm::UInt32 threadsPerBlock,
vtkm::Id size);
VTKM_CONT_EXPORT
static void LogKernelLaunch(const cudaFuncAttributes& func_attrs,
const std::type_info& worklet_info,
vtkm::UInt32 blocks,
dim3 threadsPerBlock,
const dim3& size);
public:
template <typename WType, typename IType>
static void ScheduleTask(vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>& functor,
@ -1436,6 +1448,17 @@ public:
vtkm::UInt32 blocks, threadsPerBlock;
GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
#ifdef VTKM_ENABLE_LOGGING
if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
{
using FunctorType = vtkm::exec::cuda::internal::TaskStrided1D<WType, IType>;
cudaFuncAttributes empty_kernel_attrs;
VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
cuda::internal::TaskStrided1DLaunch<FunctorType>));
LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, numInstances);
}
#endif
cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
functor, numInstances);
}
@ -1462,6 +1485,17 @@ public:
dim3 threadsPerBlock;
GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
#ifdef VTKM_ENABLE_LOGGING
if (GetStderrLogLevel() >= vtkm::cont::LogLevel::KernelLaunches)
{
using FunctorType = vtkm::exec::cuda::internal::TaskStrided3D<WType, IType>;
cudaFuncAttributes empty_kernel_attrs;
VTKM_CUDA_CALL(cudaFuncGetAttributes(&empty_kernel_attrs,
cuda::internal::TaskStrided3DLaunch<FunctorType>));
LogKernelLaunch(empty_kernel_attrs, typeid(WType), blocks, threadsPerBlock, ranges);
}
#endif
cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
functor, ranges);
}
@ -1472,6 +1506,7 @@ public:
VTKM_LOG_SCOPE_FUNCTION(vtkm::cont::LogLevel::Perf);
vtkm::exec::cuda::internal::TaskStrided1D<Functor, vtkm::internal::NullType> kernel(functor);
ScheduleTask(kernel, numInstances);
}