vtk-m/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
Kenneth Moreland c44f686496 Add hints to device adapter scheduler
The `DeviceAdapter` provides an abstract interface to the accelerator
devices worklets and other algorithms run on. As such, the programmer has
less control about how the device launches each worklet. Each device
adapter has its own configuration parameters and other ways to attempt to
optimize how things are run, but these are always a universal set of
options that are applied to everything run on the device. There is no way
to specify launch parameters for a particular worklet.

To provide this information, VTK-m now supports `Hint`s to the device
adapter. The `DeviceAdapterAlgorithm::Schedule` method takes a templated
argument that is of the type `HintList`. This object contains a template
list of `Hint` types that provide suggestions on how to launch the parallel
execution. The device adapter will pick out hints that pertain to it and
adjust its launching accordingly.

These are called hints rather than, say, directives, because they don't
force the device adapter to do anything. The device adapter is free to
ignore any (and all) hints. The point is that the device adapter can take
into account the information to try to optimize for itself.

A provided hint can be tied to specific device adapters. In this way, an
worklet can further optimize itself. If multiple hints match a device
adapter, the last one in the list will be selected.

The `Worklet` base now has an internal type named `Hints` that points to a
`HintList` that is applied when the worklet is scheduled. Derived worklet
classes can provide hints by simply defining their own `Hints` type.
2024-02-09 10:42:23 -05:00

313 lines
11 KiB
Plaintext

//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
#include <vtkm/cont/RuntimeDeviceInformation.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <vtkm/cont/cuda/internal/RuntimeDeviceConfigurationCuda.h>
#include <atomic>
#include <cstring>
#include <functional>
#include <mutex>
#include <cuda.h>
// minwindef.h on Windows creates a preprocessor macro named PASCAL, which messes this up.
#ifdef PASCAL
#undef PASCAL
#endif
namespace vtkm
{
namespace cont
{
namespace cuda
{
static vtkm::cont::cuda::ScheduleParameters (
*ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr;
//Use the provided function as the the compute function for ScheduleParameterBuilder
VTKM_CONT_EXPORT void InitScheduleParameters(
vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int))
{
ComputeFromEnv = function;
}
namespace internal
{
//These represent the best block/threads-per for scheduling on each GPU
static std::vector<std::pair<int, int>> scheduling_1d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_2d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_3d_parameters;
struct VTKM_CONT_EXPORT ScheduleParameterBuilder
{
//This represents information that is used to compute the best
//ScheduleParameters for a given GPU
enum struct GPU_STRATA
{
ENV = 0,
OLDER = 5,
PASCAL = 6,
VOLTA = 7,
PASCAL_HPC = 6000,
VOLTA_HPC = 7000
};
std::map<GPU_STRATA, vtkm::cont::cuda::ScheduleParameters> Presets;
std::function<vtkm::cont::cuda::ScheduleParameters(const char*, int, int, int, int, int)> Compute;
// clang-format off
// The presets for [one,two,three]_d_blocks are before we multiply by the number of SMs on the hardware
ScheduleParameterBuilder()
: Presets{
{ GPU_STRATA::ENV, { 0, 0, 0, { 0, 0, 0 }, 0, { 0, 0, 0 } } }, //use env settings
{ GPU_STRATA::OLDER,
{ 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal
{ GPU_STRATA::PASCAL, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal
{ GPU_STRATA::VOLTA, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta
{ GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 64, { 8, 8, 4 } } }, //P100
{ GPU_STRATA::VOLTA_HPC, { 32, 256, 16, { 16, 16, 1 }, 64, { 8, 8, 4 } } }, //V100
}
, Compute(nullptr)
{
if (vtkm::cont::cuda::ComputeFromEnv != nullptr)
{
this->Compute = vtkm::cont::cuda::ComputeFromEnv;
}
else
{
this->Compute = [=] (const char* name, int major, int minor,
int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters {
return this->ComputeFromPreset(name, major, minor, numSMs, maxThreadsPerSM, maxThreadsPerBlock); };
}
}
// clang-format on
vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name,
int major,
int minor,
int numSMs,
int maxThreadsPerSM,
int maxThreadsPerBlock)
{
(void)minor;
(void)maxThreadsPerSM;
(void)maxThreadsPerBlock;
const constexpr int GPU_STRATA_MAX_GEN = 7;
const constexpr int GPU_STRATA_MIN_GEN = 5;
int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN);
strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN);
if (strataAsInt > GPU_STRATA_MIN_GEN)
{ //only pascal and above have fancy
//Currently the only
bool is_tesla = (0 == std::strncmp("Tesla", name, 4)); //see if the name starts with Tesla
if (is_tesla)
{
strataAsInt *= 1000; //tesla modifier
}
}
auto preset = this->Presets.find(static_cast<GPU_STRATA>(strataAsInt));
ScheduleParameters params = preset->second;
params.one_d_blocks = params.one_d_blocks * numSMs;
params.two_d_blocks = params.two_d_blocks * numSMs;
params.three_d_blocks = params.three_d_blocks * numSMs;
return params;
}
};
VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
{
//check flag
static std::once_flag lookupBuiltFlag;
std::call_once(lookupBuiltFlag, []() {
ScheduleParameterBuilder builder;
auto cudaDeviceConfig = dynamic_cast<
vtkm::cont::internal::RuntimeDeviceConfiguration<vtkm::cont::DeviceAdapterTagCuda>&>(
vtkm::cont::RuntimeDeviceInformation{}.GetRuntimeConfiguration(
vtkm::cont::DeviceAdapterTagCuda()));
std::vector<cudaDeviceProp> cudaDevices;
cudaDeviceConfig.GetCudaDeviceProp(cudaDevices);
for (const auto& deviceProp : cudaDevices)
{
ScheduleParameters params = builder.Compute(deviceProp.name,
deviceProp.major,
deviceProp.minor,
deviceProp.multiProcessorCount,
deviceProp.maxThreadsPerMultiProcessor,
deviceProp.maxThreadsPerBlock);
scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block);
scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block);
scheduling_3d_parameters.emplace_back(params.three_d_blocks,
params.three_d_threads_per_block);
}
});
}
}
} // end namespace cuda::internal
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
auto DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray()
-> const PinnedErrorArray&
{
constexpr vtkm::Id ERROR_ARRAY_SIZE = 1024;
static thread_local PinnedErrorArray local;
if (!local.HostPtr)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&local.HostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&local.DevicePtr, local.HostPtr, 0));
local.HostPtr[0] = '\0'; // clear
local.Size = ERROR_ARRAY_SIZE;
}
return local;
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
vtkm::exec::cuda::internal::TaskStrided& functor)
{
auto pinnedArray = GetPinnedErrorArray();
vtkm::exec::internal::ErrorMessageBuffer errorMessage(pinnedArray.DevicePtr, pinnedArray.Size);
functor.SetErrorMessageBuffer(errorMessage);
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
{
auto pinnedArray = GetPinnedErrorArray();
if (pinnedArray.HostPtr[0] != '\0')
{
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
auto excep = vtkm::cont::ErrorExecution(pinnedArray.HostPtr);
pinnedArray.HostPtr[0] = '\0'; // clear
throw excep;
}
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
vtkm::UInt32& blocks,
vtkm::UInt32& threadsPerBlock,
vtkm::Id size,
vtkm::IdComponent maxThreadsPerBlock)
{
(void)size;
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
vtkm::Id deviceId;
vtkm::cont::RuntimeDeviceInformation()
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
.GetDeviceInstance(deviceId);
const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
blocks = static_cast<vtkm::UInt32>(params.first);
threadsPerBlock = static_cast<vtkm::UInt32>(params.second);
if ((maxThreadsPerBlock > 0) && (threadsPerBlock < static_cast<vtkm::UInt32>(maxThreadsPerBlock)))
{
threadsPerBlock = static_cast<vtkm::UInt32>(maxThreadsPerBlock);
}
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
vtkm::UInt32& blocks,
dim3& threadsPerBlock,
const dim3& size,
vtkm::IdComponent maxThreadsPerBlock)
{
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
vtkm::Id deviceId;
vtkm::cont::RuntimeDeviceInformation()
.GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagCuda())
.GetDeviceInstance(deviceId);
if (size.z <= 1)
{ //2d images
const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];
blocks = static_cast<vtkm::UInt32>(params.first);
threadsPerBlock = params.second;
}
else
{ //3d images
const auto& params = cuda::internal::scheduling_3d_parameters[static_cast<size_t>(deviceId)];
blocks = static_cast<vtkm::UInt32>(params.first);
threadsPerBlock = params.second;
}
if (maxThreadsPerBlock > 0)
{
while ((threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z) >
static_cast<vtkm::UInt32>(maxThreadsPerBlock))
{
// Reduce largest element until threads are small enough.
if (threadsPerBlock.x > threadsPerBlock.y)
{
threadsPerBlock.x /= 2;
}
else if (threadsPerBlock.y > threadsPerBlock.z)
{
threadsPerBlock.y /= 2;
}
else
{
threadsPerBlock.z /= 2;
}
}
}
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
const std::type_info& worklet_info,
vtkm::UInt32 blocks,
vtkm::UInt32 threadsPerBlock,
vtkm::Id)
{
(void)func_attrs;
(void)blocks;
(void)threadsPerBlock;
std::string name = vtkm::cont::TypeToString(worklet_info);
VTKM_LOG_F(vtkm::cont::LogLevel::KernelLaunches,
"Launching 1D kernel %s on CUDA [ptx=%i, blocks=%i, threadsPerBlock=%i]",
name.c_str(),
(func_attrs.ptxVersion * 10),
blocks,
threadsPerBlock);
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::LogKernelLaunch(
const cudaFuncAttributes& func_attrs,
const std::type_info& worklet_info,
vtkm::UInt32 blocks,
dim3 threadsPerBlock,
const dim3&)
{
(void)func_attrs;
(void)blocks;
(void)threadsPerBlock;
std::string name = vtkm::cont::TypeToString(worklet_info);
VTKM_LOG_F(vtkm::cont::LogLevel::KernelLaunches,
"Launching 3D kernel %s on CUDA [ptx=%i, blocks=%i, threadsPerBlock=%i, %i, %i]",
name.c_str(),
(func_attrs.ptxVersion * 10),
blocks,
threadsPerBlock.x,
threadsPerBlock.y,
threadsPerBlock.z);
}
}
} // end namespace vtkm::cont