Merge topic 'support_cuda_scheduling_parameters_via_runtime'

047b64651 VTK-m now provides better scheduling parameters controls

Acked-by: Kitware Robot <kwrobot@kitware.com>
Acked-by: Kenneth Moreland <kmorel@sandia.gov>
Merge-request: !1643
This commit is contained in:
Robert Maynard 2019-04-17 14:04:02 +00:00 committed by Kitware Robot
commit 6c5c197a37
5 changed files with 260 additions and 278 deletions

@ -0,0 +1,45 @@
# VTK-m CUDA kernel scheduling, including improved defaults and user customization
VTK-m now offers a more GPU-aware set of defaults for kernel scheduling.
When VTK-m first launches a kernel, we perform system introspection to
determine which GPUs are on the machine and then match this information to a
preset table of values. The implementation is designed in a way that allows
VTK-m to offer both specific presets for a given GPU (e.g. the V100) and
presets for an entire generation of cards (e.g. Pascal).

Currently VTK-m offers preset tables for the following GPUs:
- Tesla V100
- Tesla P100
If the hardware doesn't match a specific GPU card, we then try to find the
nearest known hardware generation and use those defaults (see the sketch
after this list). Currently we offer defaults for:
- Older than Pascal Hardware
- Pascal Hardware
- Volta+ Hardware
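
For illustration, the generation lookup amounts to clamping the CUDA
compute-capability major version into the known range and then applying a
modifier for Tesla-branded HPC cards. Below is a minimal sketch of the lookup
performed by `ScheduleParameterBuilder::ComputeFromPreset`; the `pickStrata`
helper is hypothetical and exists only to illustrate the mapping:

```cpp
#include <algorithm>
#include <cstring>

// Hypothetical helper: clamp the compute-capability major version into the
// known generation range (5 = older, 6 = Pascal, 7 = Volta+), then apply the
// Tesla modifier so HPC cards (P100/V100) select their dedicated presets.
int pickStrata(const char* name, int major)
{
  int strata = std::min(std::max(major, 5), 7);
  if (strata > 5 && std::strncmp(name, "Tesla", 5) == 0)
  {
    strata *= 1000; // 6000 selects the P100 preset, 7000 the V100 preset
  }
  return strata;
}
```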

Some users have workloads that don't align with the defaults provided by
VTK-m. When that is the case, it is possible to override the defaults
by binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`,
as shown below:
```cpp
ScheduleParameters CustomScheduleValues(char const* name,
                                        int major,
                                        int minor,
                                        int multiProcessorCount,
                                        int maxThreadsPerMultiProcessor,
                                        int maxThreadsPerBlock)
{
  ScheduleParameters params{
    64 * multiProcessorCount, //1d blocks
    64,                       //1d threads per block
    64 * multiProcessorCount, //2d blocks
    { 8, 8, 1 },              //2d threads per block
    64 * multiProcessorCount, //3d blocks
    { 4, 4, 4 } };            //3d threads per block
  return params;
}

vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);
```
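
Note that `InitScheduleParameters` must be called before any worklets are
invoked, and the bound function is called once for each GPU on the machine.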

@ -29,7 +29,6 @@ set(headers
DeviceAdapterTimerImplementationCuda.h
ExecutionArrayInterfaceBasicCuda.h
MakeThrustIterator.h
TaskTuner.h
ThrustExceptionHandler.h
VirtualObjectTransferCuda.h
)

@ -21,43 +21,143 @@
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
#include <atomic>
#include <cstring>
#include <functional>
#include <mutex>
#include <cuda.h>
namespace vtkm
{
namespace cont
{
namespace cuda
{
static vtkm::cont::cuda::ScheduleParameters (
  *ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr;

//Use the provided function as the compute function for ScheduleParameterBuilder
VTKM_CONT_EXPORT void InitScheduleParameters(
vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int))
{
ComputeFromEnv = function;
}
namespace internal
{
//These represent the best block/threads-per for scheduling on each GPU
static std::vector<std::pair<int, int>> scheduling_1d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_2d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_3d_parameters;

struct VTKM_CONT_EXPORT ScheduleParameterBuilder
{
//This represents information that is used to compute the best
//ScheduleParameters for a given GPU
enum struct GPU_STRATA
{
ENV = 0,
OLDER = 5,
PASCAL = 6,
VOLTA = 7,
PASCAL_HPC = 6000,
VOLTA_HPC = 7000
};
std::map<GPU_STRATA, vtkm::cont::cuda::ScheduleParameters> Presets;
std::function<vtkm::cont::cuda::ScheduleParameters(const char*, int, int, int, int, int)> Compute;
// clang-format off
// The presets for [one,two,three]_d_blocks are before we multiply by the number of SMs on the hardware
ScheduleParameterBuilder()
: Presets{
{ GPU_STRATA::ENV, { 0, 0, 0, { 0, 0, 0 }, 0, { 0, 0, 0 } } }, //use env settings
{ GPU_STRATA::OLDER,
{ 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal
{ GPU_STRATA::PASCAL, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal
{ GPU_STRATA::VOLTA, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta
{ GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //P100
{ GPU_STRATA::VOLTA_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //V100
}
, Compute(nullptr)
{
if (vtkm::cont::cuda::ComputeFromEnv != nullptr)
{
this->Compute = vtkm::cont::cuda::ComputeFromEnv;
}
else
{
this->Compute = [=] (const char* name, int major, int minor,
int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters {
return this->ComputeFromPreset(name, major, minor, numSMs, maxThreadsPerSM, maxThreadsPerBlock); };
}
}
// clang-format on
vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name,
int major,
int minor,
int numSMs,
int maxThreadsPerSM,
int maxThreadsPerBlock)
{
(void)minor;
(void)maxThreadsPerSM;
(void)maxThreadsPerBlock;
const constexpr int GPU_STRATA_MAX_GEN = 7;
const constexpr int GPU_STRATA_MIN_GEN = 5;
int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN);
strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN);
if (strataAsInt > GPU_STRATA_MIN_GEN)
{ //only Pascal and above have dedicated HPC presets
  //currently the only HPC presets are for Tesla-branded cards (P100, V100)
  bool is_tesla = (0 == std::strncmp("Tesla", name, 5)); //name starts with "Tesla"
if (is_tesla)
{
strataAsInt *= 1000; //tesla modifier
}
}
auto preset = this->Presets.find(static_cast<GPU_STRATA>(strataAsInt));
ScheduleParameters params{ preset->second };
params.one_d_blocks = params.one_d_blocks * numSMs;
params.two_d_blocks = params.two_d_blocks * numSMs;
params.three_d_blocks = params.three_d_blocks * numSMs;
return params;
}
};
VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
{
  //build the per-device scheduling tables exactly once
  static std::once_flag lookupBuiltFlag;
  std::call_once(lookupBuiltFlag, []() {
    ScheduleParameterBuilder builder;
    //iterate over all devices
    int count = 0;
    VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
    for (int deviceId = 0; deviceId < count; ++deviceId)
    {
      cudaDeviceProp deviceProp;
      VTKM_CUDA_CALL(cudaGetDeviceProperties(&deviceProp, deviceId));
      ScheduleParameters params = builder.Compute(deviceProp.name,
                                                  deviceProp.major,
                                                  deviceProp.minor,
                                                  deviceProp.multiProcessorCount,
                                                  deviceProp.maxThreadsPerMultiProcessor,
                                                  deviceProp.maxThreadsPerBlock);
      scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block);
      scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block);
      scheduling_3d_parameters.emplace_back(params.three_d_blocks,
                                            params.three_d_threads_per_block);
    }
  });
}
}
} // end namespace cuda::internal
@ -101,44 +201,41 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
}
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  vtkm::UInt32& threadsPerBlock,
  vtkm::Id size)
{
  (void)size;
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();

  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
  const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
  blocks = params.first;
  threadsPerBlock = params.second;
}

void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  dim3& threadsPerBlock,
  const dim3& size)
{
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();

  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
  if (size.z <= 1)
  { //2d images
    const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];
    blocks = params.first;
    threadsPerBlock = params.second;
  }
  else
  { //3d images
    const auto& params = cuda::internal::scheduling_3d_parameters[static_cast<size_t>(deviceId)];
    blocks = params.first;
    threadsPerBlock = params.second;
  }
}

@ -48,12 +48,6 @@
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D
#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D)
#include <vtkm/cont/cuda/internal/TaskTuner.h>
#endif
// Disable warnings we check vtkm for but Thrust does not.
VTKM_THIRDPARTY_PRE_INCLUDE
#include <cooperative_groups.h>
@ -79,10 +73,73 @@ namespace vtkm
{
namespace cont
{
namespace cuda
{
/// \brief Represents how to schedule 1D, 2D, and 3D Cuda kernels
///
/// \c ScheduleParameters represents how VTK-m should schedule different
/// cuda kernel types. By default VTK-m uses a preset table based on the
/// GPUs found at runtime.
///
/// When these defaults are insufficient for certain projects it is possible
/// to override the defaults by using \c InitScheduleParameters.
///
///
struct VTKM_CONT_EXPORT ScheduleParameters
{
int one_d_blocks;
int one_d_threads_per_block;
int two_d_blocks;
dim3 two_d_threads_per_block;
int three_d_blocks;
dim3 three_d_threads_per_block;
};
/// \brief Specify the custom scheduling to use for VTK-m CUDA kernel launches
///
/// By default VTK-m uses a preset table based on the GPUs found at runtime to
/// determine the best scheduling parameters for a worklet. When these defaults
/// are insufficient for certain projects it is possible to override the defaults
/// by binding a custom function to \c InitScheduleParameters.
///
/// Note: This function must be called before any invocation of any worklets
/// by VTK-m.
///
/// Note: This function will be called for each GPU on a machine.
///
/// \code{.cpp}
///
/// ScheduleParameters CustomScheduleValues(char const* name,
/// int major,
/// int minor,
/// int multiProcessorCount,
/// int maxThreadsPerMultiProcessor,
/// int maxThreadsPerBlock)
/// {
///
/// ScheduleParameters params {
/// 64 * multiProcessorCount, //1d blocks
/// 64, //1d threads per block
/// 64 * multiProcessorCount, //2d blocks
/// { 8, 8, 1 }, //2d threads per block
/// 64 * multiProcessorCount, //3d blocks
/// { 4, 4, 4 } }; //3d threads per block
/// return params;
/// }
/// \endcode
///
///
VTKM_CONT_EXPORT void InitScheduleParameters(
vtkm::cont::cuda::ScheduleParameters (*)(char const* name,
int major,
int minor,
int multiProcessorCount,
int maxThreadsPerMultiProcessor,
int maxThreadsPerBlock));
namespace internal
{
@ -1358,10 +1415,12 @@ public:
static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
VTKM_CONT_EXPORT
static void GetBlocksAndThreads(vtkm::UInt32& blocks,
                                vtkm::UInt32& threadsPerBlock,
                                vtkm::Id size);
VTKM_CONT_EXPORT
static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
public:
template <typename WType, typename IType>
@ -1378,15 +1437,11 @@ public:
CheckForErrors();
SetupErrorBuffer(functor);
vtkm::UInt32 blocks, threadsPerBlock;
GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);

cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
  functor, numInstances);
}
template <typename WType, typename IType>
@ -1407,15 +1462,12 @@ public:
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]));
vtkm::UInt32 blocks;
dim3 threadsPerBlock;
GetBlocksAndThreads(blocks, threadsPerBlock, ranges);

cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
  functor, ranges);
}
template <class Functor>

@ -1,211 +0,0 @@
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
#define vtk_m_cont_cuda_internal_TaskTuner_h
#include <vtkm/Types.h>
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <cuda.h>
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
vtkm::UInt32 getNumSMs(int dId);
template <typename TaskType>
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
template <typename TaskType>
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
struct PerfRecord1d
{
PerfRecord1d(float elapsedT, int g, int b)
: elapsedTime(elapsedT)
, grid(g)
, block(b)
{
}
bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
float elapsedTime;
int grid;
int block;
};
inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
{
os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
<< ">>> required: " << record.elapsedTime << "\n";
return os;
}
struct PerfRecord3d
{
PerfRecord3d(float elapsedT, int g, dim3 b)
: elapsedTime(elapsedT)
, grid(g)
, block(b)
{
}
bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
float elapsedTime;
int grid;
dim3 block;
};
inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
{
os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
<< "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
return os;
}
template <typename TaskT>
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
{
std::vector<PerfRecord1d> results;
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192 };
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
for (vtkm::UInt32 g = 0; g < 12; g++)
{
vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
for (vtkm::UInt32 b = 0; b < 12; b++)
{
vtkm::UInt32 blocks = blockIndexTable[b];
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
}
}
std::sort(results.begin(), results.end());
for (auto&& i : results)
{
std::cout << i << std::endl;
}
}
template <typename TaskT>
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
{
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
static_cast<vtkm::UInt32>(rangeMax[1]),
static_cast<vtkm::UInt32>(rangeMax[2]));
std::vector<PerfRecord3d> results;
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24,
28, 30, 32, 64, 128, 256, 512, 1024 };
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
for (vtkm::UInt32 g = 0; g < 12; g++)
{
vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
for (vtkm::UInt32 i = 0; i < 16; i++)
{
for (vtkm::UInt32 j = 0; j < 16; j++)
{
for (vtkm::UInt32 k = 0; k < 16; k++)
{
dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);

if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
    blocks.z >= 64)
{
  //cuda can't handle more than 1024 threads per block
  //so don't try if we compute higher than that
  //also don't try stupidly low numbers
  //cuda can't handle more than 64 threads in the z direction
  continue;
}

//create the timing events only after the skip check so they aren't leaked
cudaEvent_t start, stop;
VTKM_CUDA_CALL(cudaEventCreate(&start));
VTKM_CUDA_CALL(cudaEventCreate(&stop));
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
VTKM_CUDA_CALL(cudaEventDestroy(start));
VTKM_CUDA_CALL(cudaEventDestroy(stop));
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
}
}
}
}
std::sort(results.begin(), results.end());
for (auto&& i : results)
{
std::cout << i << std::endl;
}
}
}
}
}
}
#endif