vtk-m/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu
Robert Maynard d1ce4a0bca Fix the default launch sizes for Tesla hardware.
An 8x8x8 block is a better launch strategy for most VTK-m kernels.
The current problem is that a couple of VTK-m kernels use a high
number of registers per thread, and with that many threads per
block the total register requirement is too high for the hardware.

What we should do in the longer run is offer control over kernel
launches on a per-kernel basis. This will require VTK-m to extract
the number of registers used by each kernel.
2019-05-06 16:12:15 -04:00
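The per-kernel register extraction mentioned above is already exposed by the
CUDA runtime; a minimal sketch of how it could look (the kernel MyKernel and
helper PickLaunchBounds are hypothetical, not part of this change):

    #include <cuda_runtime.h>

    __global__ void MyKernel(double* data, int size)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < size) { data[i] *= 2.0; }
    }

    void PickLaunchBounds()
    {
      // Registers per thread that the compiler assigned to this kernel.
      cudaFuncAttributes attrs;
      cudaFuncGetAttributes(&attrs, MyKernel);
      const int regsPerThread = attrs.numRegs;
      (void)regsPerThread; // would feed a per-kernel threads-per-block choice

      // Or let the occupancy API suggest a block size that already accounts
      // for the kernel's register and shared-memory usage.
      int minGridSize = 0;
      int blockSize = 0;
      cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, MyKernel);
    }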


//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
#include <atomic>
#include <cstring>
#include <functional>
#include <mutex>
#include <cuda.h>
namespace vtkm
{
namespace cont
{
namespace cuda
{
static vtkm::cont::cuda::ScheduleParameters (*ComputeFromEnv)(
  const char*, int, int, int, int, int) = nullptr;
//Use the provided function as the compute function for ScheduleParameterBuilder
VTKM_CONT_EXPORT void InitScheduleParameters(
vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int))
{
ComputeFromEnv = function;
}
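//Example (illustrative; MyScheduleOverride is a hypothetical application
//function). A custom compute function replaces ComputeFromPreset entirely,
//so it must do its own SM scaling, and it has to be registered before the
//first VTK-m algorithm runs so SetupKernelSchedulingParameters sees it:
//
//  vtkm::cont::cuda::ScheduleParameters MyScheduleOverride(
//    const char* name, int major, int minor,
//    int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock)
//  {
//    return vtkm::cont::cuda::ScheduleParameters{
//      32 * numSMs, 128,           //1d: blocks, threads-per-block
//      8 * numSMs,  { 16, 16, 1 }, //2d: blocks, threads-per-block
//      32 * numSMs, { 8, 8, 4 } }; //3d: blocks, threads-per-block
//  }
//  vtkm::cont::cuda::InitScheduleParameters(&MyScheduleOverride);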
namespace internal
{
//These represent the best blocks/threads-per-block pairings for scheduling on each GPU
static std::vector<std::pair<int, int>> scheduling_1d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_2d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_3d_parameters;
struct VTKM_CONT_EXPORT ScheduleParameterBuilder
{
//This represents information that is used to compute the best
//ScheduleParameters for a given GPU
enum struct GPU_STRATA
{
ENV = 0,
OLDER = 5,
PASCAL = 6,
VOLTA = 7,
PASCAL_HPC = 6000,
VOLTA_HPC = 7000
};
std::map<GPU_STRATA, vtkm::cont::cuda::ScheduleParameters> Presets;
std::function<vtkm::cont::cuda::ScheduleParameters(const char*, int, int, int, int, int)> Compute;
// clang-format off
// The presets for [one,two,three]_d_blocks are per-SM counts; they are multiplied by the number of SMs on the hardware
ScheduleParameterBuilder()
: Presets{
{ GPU_STRATA::ENV, { 0, 0, 0, { 0, 0, 0 }, 0, { 0, 0, 0 } } }, //use env settings
{ GPU_STRATA::OLDER,
{ 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal
{ GPU_STRATA::PASCAL, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal
{ GPU_STRATA::VOLTA, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta
{ GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 64, { 8, 8, 4 } } }, //P100
{ GPU_STRATA::VOLTA_HPC, { 32, 256, 16, { 16, 16, 1 }, 64, { 8, 8, 4 } } }, //V100
}
, Compute(nullptr)
{
if (vtkm::cont::cuda::ComputeFromEnv != nullptr)
{
this->Compute = vtkm::cont::cuda::ComputeFromEnv;
}
else
{
this->Compute = [=] (const char* name, int major, int minor,
int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters {
return this->ComputeFromPreset(name, major, minor, numSMs, maxThreadsPerSM, maxThreadsPerBlock); };
}
}
// clang-format on
vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name,
int major,
int minor,
int numSMs,
int maxThreadsPerSM,
int maxThreadsPerBlock)
{
(void)minor;
(void)maxThreadsPerSM;
(void)maxThreadsPerBlock;
constexpr int GPU_STRATA_MAX_GEN = 7;
constexpr int GPU_STRATA_MIN_GEN = 5;
int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN);
strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN);
if (strataAsInt > GPU_STRATA_MIN_GEN)
{ //only Pascal and above have dedicated HPC presets
  //Currently the only HPC hardware we detect is the Tesla product line
  bool is_tesla = (0 == std::strncmp("Tesla", name, 5)); //see if the name starts with "Tesla"
if (is_tesla)
{
strataAsInt *= 1000; //tesla modifier
}
}
auto preset = this->Presets.find(static_cast<GPU_STRATA>(strataAsInt));
ScheduleParameters params = preset->second;
params.one_d_blocks = params.one_d_blocks * numSMs;
params.two_d_blocks = params.two_d_blocks * numSMs;
params.three_d_blocks = params.three_d_blocks * numSMs;
return params;
}
};
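//Worked example: a Tesla V100 reports name "Tesla V100-SXM2-16GB" and
//major == 7, so strataAsInt becomes 7 * 1000 = 7000 (VOLTA_HPC). With the
//V100's 80 SMs, the { 32, 256, 16, { 16, 16, 1 }, 64, { 8, 8, 4 } } preset
//expands to 32 * 80 = 2560 one_d blocks of 256 threads, 16 * 80 = 1280
//two_d blocks of 16x16x1 threads, and 64 * 80 = 5120 three_d blocks of 8x8x4.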
VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
{
//check flag
static std::once_flag lookupBuiltFlag;
std::call_once(lookupBuiltFlag, []() {
ScheduleParameterBuilder builder;
//iterate over all devices
int count = 0;
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
for (int deviceId = 0; deviceId < count; ++deviceId)
{
cudaDeviceProp deviceProp;
VTKM_CUDA_CALL(cudaGetDeviceProperties(&deviceProp, deviceId));
ScheduleParameters params = builder.Compute(deviceProp.name,
deviceProp.major,
deviceProp.minor,
deviceProp.multiProcessorCount,
deviceProp.maxThreadsPerMultiProcessor,
deviceProp.maxThreadsPerBlock);
scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block);
scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block);
scheduling_3d_parameters.emplace_back(params.three_d_blocks,
params.three_d_threads_per_block);
}
});
}
} // end namespace internal
} // end namespace cuda
// We use CUDA pinned memory to reduce the amount of synchronization
// and memory copies between the host and device.
auto DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray()
-> const PinnedErrorArray&
{
constexpr vtkm::Id ERROR_ARRAY_SIZE = 1024;
static thread_local PinnedErrorArray local;
if (!local.HostPtr)
{
VTKM_CUDA_CALL(cudaMallocHost((void**)&local.HostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
VTKM_CUDA_CALL(cudaHostGetDevicePointer(&local.DevicePtr, local.HostPtr, 0));
local.HostPtr[0] = '\0'; // clear
local.Size = ERROR_ARRAY_SIZE;
}
return local;
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
vtkm::exec::cuda::internal::TaskStrided& functor)
{
auto pinnedArray = GetPinnedErrorArray();
vtkm::exec::internal::ErrorMessageBuffer errorMessage(pinnedArray.DevicePtr, pinnedArray.Size);
functor.SetErrorMessageBuffer(errorMessage);
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
{
auto pinnedArray = GetPinnedErrorArray();
if (pinnedArray.HostPtr[0] != '\0')
{
VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
auto excep = vtkm::cont::ErrorExecution(pinnedArray.HostPtr);
pinnedArray.HostPtr[0] = '\0'; // clear
throw excep;
}
}
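//The round trip, in brief: device code writes its error string straight
//into the mapped pinned buffer, so the host can poll byte 0 with no
//explicit cudaMemcpy. A minimal sketch of the device side (assuming the
//worklet reports failures through ErrorMessageBuffer::RaiseError):
//
//  __device__ void DoWork(vtkm::exec::internal::ErrorMessageBuffer& errors)
//  {
//    errors.RaiseError("invalid cell index"); //lands in pinnedArray.HostPtr
//  }
//
//CheckForErrors() then sees HostPtr[0] != '\0', synchronizes the stream so
//the full message has arrived, clears the buffer, and throws ErrorExecution.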
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
vtkm::UInt32& blocks,
vtkm::UInt32& threadsPerBlock,
vtkm::Id size)
{
(void)size;
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
blocks = params.first;
threadsPerBlock = params.second;
}
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
vtkm::UInt32& blocks,
dim3& threadsPerBlock,
const dim3& size)
{
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
if (size.z <= 1)
{ //2d images
const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];
blocks = params.first;
threadsPerBlock = params.second;
}
else
{ //3d images
const auto& params = cuda::internal::scheduling_3d_parameters[static_cast<size_t>(deviceId)];
blocks = params.first;
threadsPerBlock = params.second;
}
}
} // end namespace cont
} // end namespace vtkm
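//For reference, the 1d parameters above assume grid-stride kernels (hence
//the TaskStrided name): a fixed grid of 32 * numSMs blocks covers any input
//size by striding. A minimal sketch of the pattern; MyStridedKernel and the
//launch shown are illustrative, not VTK-m's actual launch path:
//
//  __global__ void MyStridedKernel(double* data, vtkm::Id size)
//  {
//    const vtkm::Id inc = blockDim.x * gridDim.x;
//    for (vtkm::Id i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
//         i += inc)
//    {
//      data[i] *= 2.0;
//    }
//  }
//
//  vtkm::UInt32 blocks, threadsPerBlock;
//  GetBlocksAndThreads(blocks, threadsPerBlock, size);
//  MyStridedKernel<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
//    data, size);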