Merge topic 'support_cuda_scheduling_parameters_via_runtime'
047b64651 VTK-m now provides better scheduling parameters controls Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Kenneth Moreland <kmorel@sandia.gov> Merge-request: !1643
This commit is contained in:
commit
6c5c197a37
45
docs/changelog/improve-cuda-scheduling.md
Normal file
45
docs/changelog/improve-cuda-scheduling.md
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# VTK-m CUDA kernel scheduling including improved defaults, and user customization
|
||||||
|
|
||||||
|
VTK-m now offers a more GPU aware set of defaults for kernel scheduling.
|
||||||
|
When VTK-m first launches a kernel we do system introspection and determine
|
||||||
|
what GPUs are on the machine and then match this information to a preset
|
||||||
|
table of values. The implementation is designed in a way that allows for
|
||||||
|
VTK-m to offer both specific presets for a given GPU ( V100 ) or for
|
||||||
|
an entire generation of cards ( Pascal ).
|
||||||
|
|
||||||
|
Currently VTK-m offers preset tables for the following GPUs:
|
||||||
|
- Tesla V100
|
||||||
|
- Tesla P100
|
||||||
|
|
||||||
|
If the hardware doesn't match a specific GPU card we then try to find the
|
||||||
|
nearest known hardware generation and use those defaults. Currently we offer
|
||||||
|
defaults for
|
||||||
|
- Older than Pascal Hardware
|
||||||
|
- Pascal Hardware
|
||||||
|
- Volta+ Hardware
|
||||||
|
|
||||||
|
Some users have workloads that don't align with the defaults provided by
|
||||||
|
VTK-m. When that is the case, it is possible to override the defaults
|
||||||
|
by binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`.
|
||||||
|
As shown below:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
ScheduleParameters CustomScheduleValues(char const* name,
|
||||||
|
int major,
|
||||||
|
int minor,
|
||||||
|
int multiProcessorCount,
|
||||||
|
int maxThreadsPerMultiProcessor,
|
||||||
|
int maxThreadsPerBlock)
|
||||||
|
{
|
||||||
|
|
||||||
|
ScheduleParameters params {
|
||||||
|
64 * multiProcessorCount, //1d blocks
|
||||||
|
64, //1d threads per block
|
||||||
|
64 * multiProcessorCount, //2d blocks
|
||||||
|
{ 8, 8, 1 }, //2d threads per block
|
||||||
|
64 * multiProcessorCount, //3d blocks
|
||||||
|
{ 4, 4, 4 } }; //3d threads per block
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);
|
||||||
|
```
|
@ -29,7 +29,6 @@ set(headers
|
|||||||
DeviceAdapterTimerImplementationCuda.h
|
DeviceAdapterTimerImplementationCuda.h
|
||||||
ExecutionArrayInterfaceBasicCuda.h
|
ExecutionArrayInterfaceBasicCuda.h
|
||||||
MakeThrustIterator.h
|
MakeThrustIterator.h
|
||||||
TaskTuner.h
|
|
||||||
ThrustExceptionHandler.h
|
ThrustExceptionHandler.h
|
||||||
VirtualObjectTransferCuda.h
|
VirtualObjectTransferCuda.h
|
||||||
)
|
)
|
||||||
|
@ -21,43 +21,143 @@
|
|||||||
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
|
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
#include <cstring>
|
||||||
|
#include <functional>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
namespace vtkm
|
namespace vtkm
|
||||||
{
|
{
|
||||||
namespace cont
|
namespace cont
|
||||||
{
|
{
|
||||||
namespace cuda
|
namespace cuda
|
||||||
{
|
{
|
||||||
|
|
||||||
|
static vtkm::cont::cuda::ScheduleParameters (
|
||||||
|
*ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr;
|
||||||
|
|
||||||
|
//Use the provided function as the compute function for ScheduleParameterBuilder
|
||||||
|
VTKM_CONT_EXPORT void InitScheduleParameters(
|
||||||
|
vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int))
|
||||||
|
{
|
||||||
|
ComputeFromEnv = function;
|
||||||
|
}
|
||||||
|
|
||||||
namespace internal
|
namespace internal
|
||||||
{
|
{
|
||||||
|
|
||||||
VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
|
//These represent the best block/threads-per for scheduling on each GPU
|
||||||
{
|
static std::vector<std::pair<int, int>> scheduling_1d_parameters;
|
||||||
std::size_t index = 0;
|
static std::vector<std::pair<int, dim3>> scheduling_2d_parameters;
|
||||||
if (dId > 0)
|
static std::vector<std::pair<int, dim3>> scheduling_3d_parameters;
|
||||||
{
|
|
||||||
index = static_cast<size_t>(dId);
|
|
||||||
}
|
|
||||||
|
|
||||||
//check
|
struct VTKM_CONT_EXPORT ScheduleParameterBuilder
|
||||||
|
{
|
||||||
|
//This represents information that is used to compute the best
|
||||||
|
//ScheduleParameters for a given GPU
|
||||||
|
enum struct GPU_STRATA
|
||||||
|
{
|
||||||
|
ENV = 0,
|
||||||
|
OLDER = 5,
|
||||||
|
PASCAL = 6,
|
||||||
|
VOLTA = 7,
|
||||||
|
PASCAL_HPC = 6000,
|
||||||
|
VOLTA_HPC = 7000
|
||||||
|
};
|
||||||
|
|
||||||
|
std::map<GPU_STRATA, vtkm::cont::cuda::ScheduleParameters> Presets;
|
||||||
|
std::function<vtkm::cont::cuda::ScheduleParameters(const char*, int, int, int, int, int)> Compute;
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
// The presets for [one,two,three]_d_blocks are before we multiply by the number of SMs on the hardware
|
||||||
|
ScheduleParameterBuilder()
|
||||||
|
: Presets{
|
||||||
|
{ GPU_STRATA::ENV, { 0, 0, 0, { 0, 0, 0 }, 0, { 0, 0, 0 } } }, //use env settings
|
||||||
|
{ GPU_STRATA::OLDER,
|
||||||
|
{ 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal
|
||||||
|
{ GPU_STRATA::PASCAL, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal
|
||||||
|
{ GPU_STRATA::VOLTA, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta
|
||||||
|
{ GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //P100
|
||||||
|
{ GPU_STRATA::VOLTA_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //V100
|
||||||
|
}
|
||||||
|
, Compute(nullptr)
|
||||||
|
{
|
||||||
|
if (vtkm::cont::cuda::ComputeFromEnv != nullptr)
|
||||||
|
{
|
||||||
|
this->Compute = vtkm::cont::cuda::ComputeFromEnv;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
this->Compute = [=] (const char* name, int major, int minor,
|
||||||
|
int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters {
|
||||||
|
return this->ComputeFromPreset(name, major, minor, numSMs, maxThreadsPerSM, maxThreadsPerBlock); };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name,
|
||||||
|
int major,
|
||||||
|
int minor,
|
||||||
|
int numSMs,
|
||||||
|
int maxThreadsPerSM,
|
||||||
|
int maxThreadsPerBlock)
|
||||||
|
{
|
||||||
|
(void)minor;
|
||||||
|
(void)maxThreadsPerSM;
|
||||||
|
(void)maxThreadsPerBlock;
|
||||||
|
|
||||||
|
const constexpr int GPU_STRATA_MAX_GEN = 7;
|
||||||
|
const constexpr int GPU_STRATA_MIN_GEN = 5;
|
||||||
|
int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN);
|
||||||
|
strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN);
|
||||||
|
if (strataAsInt > GPU_STRATA_MIN_GEN)
|
||||||
|
{ //only pascal and above have fancy
|
||||||
|
|
||||||
|
//Currently the only
|
||||||
|
bool is_tesla = (0 == std::strncmp("Tesla", name, 4)); //see if the name starts with Tesla
|
||||||
|
if (is_tesla)
|
||||||
|
{
|
||||||
|
strataAsInt *= 1000; //tesla modifier
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto preset = this->Presets.find(static_cast<GPU_STRATA>(strataAsInt));
|
||||||
|
ScheduleParameters params{ preset->second };
|
||||||
|
params.one_d_blocks = params.one_d_blocks * numSMs;
|
||||||
|
params.two_d_blocks = params.two_d_blocks * numSMs;
|
||||||
|
params.three_d_blocks = params.three_d_blocks * numSMs;
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
|
||||||
|
{
|
||||||
|
//check flag
|
||||||
static std::once_flag lookupBuiltFlag;
|
static std::once_flag lookupBuiltFlag;
|
||||||
static std::vector<vtkm::UInt32> numSMs;
|
|
||||||
|
|
||||||
std::call_once(lookupBuiltFlag, []() {
|
std::call_once(lookupBuiltFlag, []() {
|
||||||
|
ScheduleParameterBuilder builder;
|
||||||
//iterate over all devices
|
//iterate over all devices
|
||||||
int numberOfSMs = 0;
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
|
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
|
||||||
numSMs.reserve(static_cast<std::size_t>(count));
|
|
||||||
for (int deviceId = 0; deviceId < count; ++deviceId)
|
for (int deviceId = 0; deviceId < count; ++deviceId)
|
||||||
{ //get the number of sm's per deviceId
|
{
|
||||||
VTKM_CUDA_CALL(
|
cudaDeviceProp deviceProp;
|
||||||
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
|
cudaGetDeviceProperties(&deviceProp, deviceId);
|
||||||
numSMs.push_back(static_cast<vtkm::UInt32>(numberOfSMs));
|
|
||||||
|
ScheduleParameters params = builder.Compute(deviceProp.name,
|
||||||
|
deviceProp.major,
|
||||||
|
deviceProp.minor,
|
||||||
|
deviceProp.multiProcessorCount,
|
||||||
|
deviceProp.maxThreadsPerMultiProcessor,
|
||||||
|
deviceProp.maxThreadsPerBlock);
|
||||||
|
scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block);
|
||||||
|
scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block);
|
||||||
|
scheduling_3d_parameters.emplace_back(params.three_d_blocks,
|
||||||
|
params.three_d_threads_per_block);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return numSMs[index];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // end namespace cuda::internal
|
} // end namespace cuda::internal
|
||||||
@ -101,44 +201,41 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
|
||||||
vtkm::UInt32& grids,
|
|
||||||
vtkm::UInt32& blocks,
|
vtkm::UInt32& blocks,
|
||||||
|
vtkm::UInt32& threadsPerBlock,
|
||||||
vtkm::Id size)
|
vtkm::Id size)
|
||||||
{
|
{
|
||||||
(void)size;
|
(void)size;
|
||||||
|
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
|
||||||
|
|
||||||
int deviceId;
|
int deviceId;
|
||||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||||
grids = 32 * cuda::internal::getNumSMs(deviceId);
|
const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
|
||||||
blocks = 128;
|
blocks = params.first;
|
||||||
|
threadsPerBlock = params.second;
|
||||||
}
|
}
|
||||||
|
|
||||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
|
||||||
vtkm::UInt32& grids,
|
vtkm::UInt32& blocks,
|
||||||
dim3& blocks,
|
dim3& threadsPerBlock,
|
||||||
const dim3& size)
|
const dim3& size)
|
||||||
{
|
{
|
||||||
|
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
|
||||||
|
|
||||||
int deviceId;
|
int deviceId;
|
||||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||||
grids = 32 * cuda::internal::getNumSMs(deviceId);
|
if (size.z <= 1)
|
||||||
|
{ //2d images
|
||||||
if (size.x == 0)
|
const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];
|
||||||
{ //grids that have no x dimension
|
blocks = params.first;
|
||||||
blocks.x = 1;
|
threadsPerBlock = params.second;
|
||||||
blocks.y = 8;
|
|
||||||
blocks.z = 8;
|
|
||||||
}
|
|
||||||
else if (size.x > 128)
|
|
||||||
{
|
|
||||||
blocks.x = 8;
|
|
||||||
blocks.y = 8;
|
|
||||||
blocks.z = 4;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{ //for really small grids
|
{ //3d images
|
||||||
blocks.x = 4;
|
const auto& params = cuda::internal::scheduling_3d_parameters[static_cast<size_t>(deviceId)];
|
||||||
blocks.y = 4;
|
blocks = params.first;
|
||||||
blocks.z = 4;
|
threadsPerBlock = params.second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -48,12 +48,6 @@
|
|||||||
#include <vtkm/exec/cuda/internal/TaskStrided.h>
|
#include <vtkm/exec/cuda/internal/TaskStrided.h>
|
||||||
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
|
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
|
||||||
|
|
||||||
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
|
||||||
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
|
||||||
#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D)
|
|
||||||
#include <vtkm/cont/cuda/internal/TaskTuner.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Disable warnings we check vtkm for but Thrust does not.
|
// Disable warnings we check vtkm for but Thrust does not.
|
||||||
VTKM_THIRDPARTY_PRE_INCLUDE
|
VTKM_THIRDPARTY_PRE_INCLUDE
|
||||||
#include <cooperative_groups.h>
|
#include <cooperative_groups.h>
|
||||||
@ -79,10 +73,73 @@ namespace vtkm
|
|||||||
{
|
{
|
||||||
namespace cont
|
namespace cont
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
namespace cuda
|
namespace cuda
|
||||||
{
|
{
|
||||||
|
/// \brief Represents how to schedule 1D, 2D, and 3D Cuda kernels
|
||||||
|
///
|
||||||
|
/// \c ScheduleParameters represents how VTK-m should schedule different
|
||||||
|
/// cuda kernel types. By default VTK-m uses a preset table based on the
|
||||||
|
/// GPUs found at runtime.
|
||||||
|
///
|
||||||
|
/// When these defaults are insufficient for certain projects it is possible
|
||||||
|
/// to override the defaults by using \c InitScheduleParameters.
|
||||||
|
///
|
||||||
|
///
|
||||||
|
struct VTKM_CONT_EXPORT ScheduleParameters
|
||||||
|
{
|
||||||
|
int one_d_blocks;
|
||||||
|
int one_d_threads_per_block;
|
||||||
|
|
||||||
|
int two_d_blocks;
|
||||||
|
dim3 two_d_threads_per_block;
|
||||||
|
|
||||||
|
int three_d_blocks;
|
||||||
|
dim3 three_d_threads_per_block;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// \brief Specify the custom scheduling to use for VTK-m CUDA kernel launches
|
||||||
|
///
|
||||||
|
/// By default VTK-m uses a preset table based on the GPUs found at runtime to
|
||||||
|
/// determine the best scheduling parameters for a worklet. When these defaults
|
||||||
|
/// are insufficient for certain projects it is possible to override the defaults
|
||||||
|
/// by binding a custom function to \c InitScheduleParameters.
|
||||||
|
///
|
||||||
|
/// Note: This function must be called before any invocation of any worklets
|
||||||
|
/// by VTK-m.
|
||||||
|
///
|
||||||
|
/// Note: This function will be called for each GPU on a machine.
|
||||||
|
///
|
||||||
|
/// \code{.cpp}
|
||||||
|
///
|
||||||
|
/// ScheduleParameters CustomScheduleValues(char const* name,
|
||||||
|
/// int major,
|
||||||
|
/// int minor,
|
||||||
|
/// int multiProcessorCount,
|
||||||
|
/// int maxThreadsPerMultiProcessor,
|
||||||
|
/// int maxThreadsPerBlock)
|
||||||
|
/// {
|
||||||
|
///
|
||||||
|
/// ScheduleParameters params {
|
||||||
|
/// 64 * multiProcessorCount, //1d blocks
|
||||||
|
/// 64, //1d threads per block
|
||||||
|
/// 64 * multiProcessorCount, //2d blocks
|
||||||
|
/// { 8, 8, 1 }, //2d threads per block
|
||||||
|
/// 64 * multiProcessorCount, //3d blocks
|
||||||
|
/// { 4, 4, 4 } }; //3d threads per block
|
||||||
|
/// return params;
|
||||||
|
/// }
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
///
|
||||||
|
VTKM_CONT_EXPORT void InitScheduleParameters(
|
||||||
|
vtkm::cont::cuda::ScheduleParameters (*)(char const* name,
|
||||||
|
int major,
|
||||||
|
int minor,
|
||||||
|
int multiProcessorCount,
|
||||||
|
int maxThreadsPerMultiProcessor,
|
||||||
|
int maxThreadsPerBlock));
|
||||||
|
|
||||||
|
|
||||||
namespace internal
|
namespace internal
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -1358,10 +1415,12 @@ public:
|
|||||||
static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
||||||
|
|
||||||
VTKM_CONT_EXPORT
|
VTKM_CONT_EXPORT
|
||||||
static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size);
|
static void GetBlocksAndThreads(vtkm::UInt32& blocks,
|
||||||
|
vtkm::UInt32& threadsPerBlock,
|
||||||
|
vtkm::Id size);
|
||||||
|
|
||||||
VTKM_CONT_EXPORT
|
VTKM_CONT_EXPORT
|
||||||
static void GetGridsAndBlocks(vtkm::UInt32& grid, dim3& blocks, const dim3& size);
|
static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
template <typename WType, typename IType>
|
template <typename WType, typename IType>
|
||||||
@ -1378,15 +1437,11 @@ public:
|
|||||||
CheckForErrors();
|
CheckForErrors();
|
||||||
SetupErrorBuffer(functor);
|
SetupErrorBuffer(functor);
|
||||||
|
|
||||||
vtkm::UInt32 grids, blocks;
|
vtkm::UInt32 blocks, threadsPerBlock;
|
||||||
GetGridsAndBlocks(grids, blocks, numInstances);
|
GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
|
||||||
|
|
||||||
cuda::internal::TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor,
|
cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
|
||||||
numInstances);
|
functor, numInstances);
|
||||||
|
|
||||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
|
||||||
parameter_sweep_1d_schedule(functor, numInstances);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename WType, typename IType>
|
template <typename WType, typename IType>
|
||||||
@ -1407,15 +1462,12 @@ public:
|
|||||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||||
|
|
||||||
vtkm::UInt32 grids;
|
vtkm::UInt32 blocks;
|
||||||
dim3 blocks;
|
dim3 threadsPerBlock;
|
||||||
GetGridsAndBlocks(grids, blocks, ranges);
|
GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
|
||||||
|
|
||||||
cuda::internal::TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
|
cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
|
||||||
|
functor, ranges);
|
||||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
|
||||||
parameter_sweep_3d_schedule(functor, rangeMax);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Functor>
|
template <class Functor>
|
||||||
|
@ -1,211 +0,0 @@
|
|||||||
//============================================================================
|
|
||||||
// Copyright (c) Kitware, Inc.
|
|
||||||
// All rights reserved.
|
|
||||||
// See LICENSE.txt for details.
|
|
||||||
// This software is distributed WITHOUT ANY WARRANTY; without even
|
|
||||||
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
||||||
// PURPOSE. See the above copyright notice for more information.
|
|
||||||
//
|
|
||||||
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
|
|
||||||
// Copyright 2014 UT-Battelle, LLC.
|
|
||||||
// Copyright 2014 Los Alamos National Security.
|
|
||||||
//
|
|
||||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
|
||||||
// the U.S. Government retains certain rights in this software.
|
|
||||||
//
|
|
||||||
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
|
||||||
// Laboratory (LANL), the U.S. Government retains certain rights in
|
|
||||||
// this software.
|
|
||||||
//============================================================================
|
|
||||||
|
|
||||||
#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
|
|
||||||
#define vtk_m_cont_cuda_internal_TaskTuner_h
|
|
||||||
|
|
||||||
#include <vtkm/Types.h>
|
|
||||||
#include <vtkm/cont/cuda/ErrorCuda.h>
|
|
||||||
|
|
||||||
#include <cuda.h>
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <iostream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace vtkm
|
|
||||||
{
|
|
||||||
namespace cont
|
|
||||||
{
|
|
||||||
namespace cuda
|
|
||||||
{
|
|
||||||
namespace internal
|
|
||||||
{
|
|
||||||
|
|
||||||
vtkm::UInt32 getNumSMs(int dId);
|
|
||||||
|
|
||||||
template <typename TaskType>
|
|
||||||
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
|
|
||||||
template <typename TaskType>
|
|
||||||
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
|
|
||||||
|
|
||||||
struct PerfRecord1d
|
|
||||||
{
|
|
||||||
PerfRecord1d(float elapsedT, int g, int b)
|
|
||||||
: elapsedTime(elapsedT)
|
|
||||||
, grid(g)
|
|
||||||
, block(b)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
|
|
||||||
|
|
||||||
float elapsedTime;
|
|
||||||
int grid;
|
|
||||||
int block;
|
|
||||||
};
|
|
||||||
|
|
||||||
inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
|
|
||||||
{
|
|
||||||
os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
|
|
||||||
<< ">>> required: " << record.elapsedTime << "\n";
|
|
||||||
return os;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
struct PerfRecord3d
|
|
||||||
{
|
|
||||||
PerfRecord3d(float elapsedT, int g, dim3 b)
|
|
||||||
: elapsedTime(elapsedT)
|
|
||||||
, grid(g)
|
|
||||||
, block(b)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
|
|
||||||
|
|
||||||
float elapsedTime;
|
|
||||||
int grid;
|
|
||||||
dim3 block;
|
|
||||||
};
|
|
||||||
|
|
||||||
inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
|
|
||||||
{
|
|
||||||
|
|
||||||
os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
|
|
||||||
<< "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
|
|
||||||
return os;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template <typename TaskT>
|
|
||||||
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
|
|
||||||
{
|
|
||||||
std::vector<PerfRecord1d> results;
|
|
||||||
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
|
||||||
constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128,
|
|
||||||
256, 512, 1024, 2048, 4096, 8192 };
|
|
||||||
|
|
||||||
int deviceId;
|
|
||||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
|
||||||
|
|
||||||
for (vtkm::UInt32 g = 0; g < 12; g++)
|
|
||||||
{
|
|
||||||
vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
|
|
||||||
for (vtkm::UInt32 b = 0; b < 12; b++)
|
|
||||||
{
|
|
||||||
vtkm::UInt32 blocks = blockIndexTable[b];
|
|
||||||
|
|
||||||
cudaEvent_t start, stop;
|
|
||||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
|
||||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
|
||||||
|
|
||||||
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
|
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
|
||||||
float elapsedTimeMilliseconds;
|
|
||||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
|
||||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
|
||||||
|
|
||||||
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::sort(results.begin(), results.end());
|
|
||||||
for (auto&& i : results)
|
|
||||||
{
|
|
||||||
std::cout << i << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename TaskT>
|
|
||||||
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
|
|
||||||
{
|
|
||||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
|
||||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
|
||||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
|
||||||
std::vector<PerfRecord3d> results;
|
|
||||||
|
|
||||||
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
|
||||||
constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24,
|
|
||||||
28, 30, 32, 64, 128, 256, 512, 1024 };
|
|
||||||
|
|
||||||
int deviceId;
|
|
||||||
for (vtkm::UInt32 g = 0; g < 12; g++)
|
|
||||||
{
|
|
||||||
vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
|
|
||||||
for (vtkm::UInt32 i = 0; i < 16; i++)
|
|
||||||
{
|
|
||||||
for (vtkm::UInt32 j = 0; j < 16; j++)
|
|
||||||
{
|
|
||||||
for (vtkm::UInt32 k = 0; k < 16; k++)
|
|
||||||
{
|
|
||||||
cudaEvent_t start, stop;
|
|
||||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
|
||||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
|
||||||
|
|
||||||
dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);
|
|
||||||
|
|
||||||
if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
|
|
||||||
blocks.z >= 64)
|
|
||||||
{
|
|
||||||
//cuda can't handle more than 1024 threads per block
|
|
||||||
//so don't try if we compute higher than that
|
|
||||||
|
|
||||||
//also don't try stupidly low numbers
|
|
||||||
|
|
||||||
//cuda can't handle more than 64 threads in the z direction
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
|
||||||
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
|
|
||||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
|
||||||
float elapsedTimeMilliseconds;
|
|
||||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
|
||||||
|
|
||||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
|
||||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
|
||||||
|
|
||||||
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::sort(results.begin(), results.end());
|
|
||||||
for (auto&& i : results)
|
|
||||||
{
|
|
||||||
std::cout << i << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
Loading…
Reference in New Issue
Block a user