Merge topic 'support_cuda_scheduling_parameters_via_runtime'
047b64651 VTK-m now provides better scheduling parameters controls Acked-by: Kitware Robot <kwrobot@kitware.com> Acked-by: Kenneth Moreland <kmorel@sandia.gov> Merge-request: !1643
This commit is contained in:
commit
6c5c197a37
45
docs/changelog/improve-cuda-scheduling.md
Normal file
45
docs/changelog/improve-cuda-scheduling.md
Normal file
@ -0,0 +1,45 @@
|
||||
# VTK-m CUDA kernel scheduling including improved defaults, and user customization
|
||||
|
||||
VTK-m now offers a more GPU aware set of defaults for kernel scheduling.
|
||||
When VTK-m first launches a kernel we do system introspection and determine
|
||||
what GPU's are on the machine and than match this information to a preset
|
||||
table of values. The implementation is designed in a way that allows for
|
||||
VTK-m to offer both specific presets for a given GPU ( V100 ) or for
|
||||
an entire generation of cards ( Pascal ).
|
||||
|
||||
Currently VTK-m offers preset tables for the following GPU's:
|
||||
- Tesla V100
|
||||
- Tesla P100
|
||||
|
||||
If the hardware doesn't match a specific GPU card we than try to find the
|
||||
nearest know hardware generation and use those defaults. Currently we offer
|
||||
defaults for
|
||||
- Older than Pascal Hardware
|
||||
- Pascal Hardware
|
||||
- Volta+ Hardware
|
||||
|
||||
Some users have workloads that don't align with the defaults provided by
|
||||
VTK-m. When that is the cause, it is possible to override the defaults
|
||||
by binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`.
|
||||
As shown below:
|
||||
|
||||
```cpp
|
||||
ScheduleParameters CustomScheduleValues(char const* name,
|
||||
int major,
|
||||
int minor,
|
||||
int multiProcessorCount,
|
||||
int maxThreadsPerMultiProcessor,
|
||||
int maxThreadsPerBlock)
|
||||
{
|
||||
|
||||
ScheduleParameters params {
|
||||
64 * multiProcessorCount, //1d blocks
|
||||
64, //1d threads per block
|
||||
64 * multiProcessorCount, //2d blocks
|
||||
{ 8, 8, 1 }, //2d threads per block
|
||||
64 * multiProcessorCount, //3d blocks
|
||||
{ 4, 4, 4 } }; //3d threads per block
|
||||
return params;
|
||||
}
|
||||
vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);
|
||||
```
|
@ -29,7 +29,6 @@ set(headers
|
||||
DeviceAdapterTimerImplementationCuda.h
|
||||
ExecutionArrayInterfaceBasicCuda.h
|
||||
MakeThrustIterator.h
|
||||
TaskTuner.h
|
||||
ThrustExceptionHandler.h
|
||||
VirtualObjectTransferCuda.h
|
||||
)
|
||||
|
@ -21,43 +21,143 @@
|
||||
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstring>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
static vtkm::cont::cuda::ScheduleParameters (
|
||||
*ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr;
|
||||
|
||||
//Use the provided function as the the compute function for ScheduleParameterBuilder
|
||||
VTKM_CONT_EXPORT void InitScheduleParameters(
|
||||
vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int))
|
||||
{
|
||||
ComputeFromEnv = function;
|
||||
}
|
||||
|
||||
namespace internal
|
||||
{
|
||||
|
||||
VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
|
||||
//These represent the best block/threads-per for scheduling on each GPU
|
||||
static std::vector<std::pair<int, int>> scheduling_1d_parameters;
|
||||
static std::vector<std::pair<int, dim3>> scheduling_2d_parameters;
|
||||
static std::vector<std::pair<int, dim3>> scheduling_3d_parameters;
|
||||
|
||||
struct VTKM_CONT_EXPORT ScheduleParameterBuilder
|
||||
{
|
||||
std::size_t index = 0;
|
||||
if (dId > 0)
|
||||
//This represents information that is used to compute the best
|
||||
//ScheduleParameters for a given GPU
|
||||
enum struct GPU_STRATA
|
||||
{
|
||||
index = static_cast<size_t>(dId);
|
||||
ENV = 0,
|
||||
OLDER = 5,
|
||||
PASCAL = 6,
|
||||
VOLTA = 7,
|
||||
PASCAL_HPC = 6000,
|
||||
VOLTA_HPC = 7000
|
||||
};
|
||||
|
||||
std::map<GPU_STRATA, vtkm::cont::cuda::ScheduleParameters> Presets;
|
||||
std::function<vtkm::cont::cuda::ScheduleParameters(const char*, int, int, int, int, int)> Compute;
|
||||
|
||||
// clang-format off
|
||||
// The presets for [one,two,three]_d_blocks are before we multiply by the number of SMs on the hardware
|
||||
ScheduleParameterBuilder()
|
||||
: Presets{
|
||||
{ GPU_STRATA::ENV, { 0, 0, 0, { 0, 0, 0 }, 0, { 0, 0, 0 } } }, //use env settings
|
||||
{ GPU_STRATA::OLDER,
|
||||
{ 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal
|
||||
{ GPU_STRATA::PASCAL, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal
|
||||
{ GPU_STRATA::VOLTA, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta
|
||||
{ GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //P100
|
||||
{ GPU_STRATA::VOLTA_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //V100
|
||||
}
|
||||
, Compute(nullptr)
|
||||
{
|
||||
if (vtkm::cont::cuda::ComputeFromEnv != nullptr)
|
||||
{
|
||||
this->Compute = vtkm::cont::cuda::ComputeFromEnv;
|
||||
}
|
||||
else
|
||||
{
|
||||
this->Compute = [=] (const char* name, int major, int minor,
|
||||
int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters {
|
||||
return this->ComputeFromPreset(name, major, minor, numSMs, maxThreadsPerSM, maxThreadsPerBlock); };
|
||||
}
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name,
|
||||
int major,
|
||||
int minor,
|
||||
int numSMs,
|
||||
int maxThreadsPerSM,
|
||||
int maxThreadsPerBlock)
|
||||
{
|
||||
(void)minor;
|
||||
(void)maxThreadsPerSM;
|
||||
(void)maxThreadsPerBlock;
|
||||
|
||||
const constexpr int GPU_STRATA_MAX_GEN = 7;
|
||||
const constexpr int GPU_STRATA_MIN_GEN = 5;
|
||||
int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN);
|
||||
strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN);
|
||||
if (strataAsInt > GPU_STRATA_MIN_GEN)
|
||||
{ //only pascal and above have fancy
|
||||
|
||||
//Currently the only
|
||||
bool is_tesla = (0 == std::strncmp("Tesla", name, 4)); //see if the name starts with Tesla
|
||||
if (is_tesla)
|
||||
{
|
||||
strataAsInt *= 1000; //tesla modifier
|
||||
}
|
||||
}
|
||||
|
||||
//check
|
||||
auto preset = this->Presets.find(static_cast<GPU_STRATA>(strataAsInt));
|
||||
ScheduleParameters params{ preset->second };
|
||||
params.one_d_blocks = params.one_d_blocks * numSMs;
|
||||
params.two_d_blocks = params.two_d_blocks * numSMs;
|
||||
params.three_d_blocks = params.three_d_blocks * numSMs;
|
||||
return params;
|
||||
}
|
||||
};
|
||||
|
||||
VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
|
||||
{
|
||||
//check flag
|
||||
static std::once_flag lookupBuiltFlag;
|
||||
static std::vector<vtkm::UInt32> numSMs;
|
||||
|
||||
std::call_once(lookupBuiltFlag, []() {
|
||||
ScheduleParameterBuilder builder;
|
||||
//iterate over all devices
|
||||
int numberOfSMs = 0;
|
||||
int count = 0;
|
||||
VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
|
||||
numSMs.reserve(static_cast<std::size_t>(count));
|
||||
for (int deviceId = 0; deviceId < count; ++deviceId)
|
||||
{ //get the number of sm's per deviceId
|
||||
VTKM_CUDA_CALL(
|
||||
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId));
|
||||
numSMs.push_back(static_cast<vtkm::UInt32>(numberOfSMs));
|
||||
{
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, deviceId);
|
||||
|
||||
ScheduleParameters params = builder.Compute(deviceProp.name,
|
||||
deviceProp.major,
|
||||
deviceProp.minor,
|
||||
deviceProp.multiProcessorCount,
|
||||
deviceProp.maxThreadsPerMultiProcessor,
|
||||
deviceProp.maxThreadsPerBlock);
|
||||
scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block);
|
||||
scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block);
|
||||
scheduling_3d_parameters.emplace_back(params.three_d_blocks,
|
||||
params.three_d_threads_per_block);
|
||||
}
|
||||
});
|
||||
return numSMs[index];
|
||||
}
|
||||
}
|
||||
} // end namespace cuda::internal
|
||||
@ -101,44 +201,41 @@ void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
|
||||
}
|
||||
}
|
||||
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
||||
vtkm::UInt32& grids,
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
|
||||
vtkm::UInt32& blocks,
|
||||
vtkm::UInt32& threadsPerBlock,
|
||||
vtkm::Id size)
|
||||
{
|
||||
(void)size;
|
||||
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
|
||||
|
||||
int deviceId;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||
grids = 32 * cuda::internal::getNumSMs(deviceId);
|
||||
blocks = 128;
|
||||
const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
|
||||
blocks = params.first;
|
||||
threadsPerBlock = params.second;
|
||||
}
|
||||
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
|
||||
vtkm::UInt32& grids,
|
||||
dim3& blocks,
|
||||
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
|
||||
vtkm::UInt32& blocks,
|
||||
dim3& threadsPerBlock,
|
||||
const dim3& size)
|
||||
{
|
||||
vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();
|
||||
|
||||
int deviceId;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||
grids = 32 * cuda::internal::getNumSMs(deviceId);
|
||||
|
||||
if (size.x == 0)
|
||||
{ //grids that have no x dimension
|
||||
blocks.x = 1;
|
||||
blocks.y = 8;
|
||||
blocks.z = 8;
|
||||
}
|
||||
else if (size.x > 128)
|
||||
{
|
||||
blocks.x = 8;
|
||||
blocks.y = 8;
|
||||
blocks.z = 4;
|
||||
if (size.z <= 1)
|
||||
{ //2d images
|
||||
const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];
|
||||
blocks = params.first;
|
||||
threadsPerBlock = params.second;
|
||||
}
|
||||
else
|
||||
{ //for really small grids
|
||||
blocks.x = 4;
|
||||
blocks.y = 4;
|
||||
blocks.z = 4;
|
||||
{ //3d images
|
||||
const auto& params = cuda::internal::scheduling_3d_parameters[static_cast<size_t>(deviceId)];
|
||||
blocks = params.first;
|
||||
threadsPerBlock = params.second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -48,12 +48,6 @@
|
||||
#include <vtkm/exec/cuda/internal/TaskStrided.h>
|
||||
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
|
||||
|
||||
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
||||
// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
||||
#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D)
|
||||
#include <vtkm/cont/cuda/internal/TaskTuner.h>
|
||||
#endif
|
||||
|
||||
// Disable warnings we check vtkm for but Thrust does not.
|
||||
VTKM_THIRDPARTY_PRE_INCLUDE
|
||||
#include <cooperative_groups.h>
|
||||
@ -79,10 +73,73 @@ namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
{
|
||||
|
||||
|
||||
namespace cuda
|
||||
{
|
||||
/// \brief Represents how to schedule 1D, 2D, and 3D Cuda kernels
|
||||
///
|
||||
/// \c ScheduleParameters represents how VTK-m should schedule different
|
||||
/// cuda kernel types. By default VTK-m uses a preset table based on the
|
||||
/// GPU's found at runtime.
|
||||
///
|
||||
/// When these defaults are insufficient for certain projects it is possible
|
||||
/// to override the defaults by using \c InitScheduleParameters.
|
||||
///
|
||||
///
|
||||
struct VTKM_CONT_EXPORT ScheduleParameters
|
||||
{
|
||||
int one_d_blocks;
|
||||
int one_d_threads_per_block;
|
||||
|
||||
int two_d_blocks;
|
||||
dim3 two_d_threads_per_block;
|
||||
|
||||
int three_d_blocks;
|
||||
dim3 three_d_threads_per_block;
|
||||
};
|
||||
|
||||
/// \brief Specify the custom scheduling to use for VTK-m CUDA kernel launches
|
||||
///
|
||||
/// By default VTK-m uses a preset table based on the GPU's found at runtime to
|
||||
/// determine the best scheduling parameters for a worklet. When these defaults
|
||||
/// are insufficient for certain projects it is possible to override the defaults
|
||||
/// by binding a custom function to \c InitScheduleParameters.
|
||||
///
|
||||
/// Note: The this function must be called before any invocation of any worklets
|
||||
/// by VTK-m.
|
||||
///
|
||||
/// Note: This function will be called for each GPU on a machine.
|
||||
///
|
||||
/// \code{.cpp}
|
||||
///
|
||||
/// ScheduleParameters CustomScheduleValues(char const* name,
|
||||
/// int major,
|
||||
/// int minor,
|
||||
/// int multiProcessorCount,
|
||||
/// int maxThreadsPerMultiProcessor,
|
||||
/// int maxThreadsPerBlock)
|
||||
/// {
|
||||
///
|
||||
/// ScheduleParameters params {
|
||||
/// 64 * multiProcessorCount, //1d blocks
|
||||
/// 64, //1d threads per block
|
||||
/// 64 * multiProcessorCount, //2d blocks
|
||||
/// { 8, 8, 1 }, //2d threads per block
|
||||
/// 64 * multiProcessorCount, //3d blocks
|
||||
/// { 4, 4, 4 } }; //3d threads per block
|
||||
/// return params;
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
///
|
||||
VTKM_CONT_EXPORT void InitScheduleParameters(
|
||||
vtkm::cont::cuda::ScheduleParameters (*)(char const* name,
|
||||
int major,
|
||||
int minor,
|
||||
int multiProcessorCount,
|
||||
int maxThreadsPerMultiProcessor,
|
||||
int maxThreadsPerBlock));
|
||||
|
||||
|
||||
namespace internal
|
||||
{
|
||||
|
||||
@ -1358,10 +1415,12 @@ public:
|
||||
static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor);
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size);
|
||||
static void GetBlocksAndThreads(vtkm::UInt32& blocks,
|
||||
vtkm::UInt32& threadsPerBlock,
|
||||
vtkm::Id size);
|
||||
|
||||
VTKM_CONT_EXPORT
|
||||
static void GetGridsAndBlocks(vtkm::UInt32& grid, dim3& blocks, const dim3& size);
|
||||
static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size);
|
||||
|
||||
public:
|
||||
template <typename WType, typename IType>
|
||||
@ -1378,15 +1437,11 @@ public:
|
||||
CheckForErrors();
|
||||
SetupErrorBuffer(functor);
|
||||
|
||||
vtkm::UInt32 grids, blocks;
|
||||
GetGridsAndBlocks(grids, blocks, numInstances);
|
||||
vtkm::UInt32 blocks, threadsPerBlock;
|
||||
GetBlocksAndThreads(blocks, threadsPerBlock, numInstances);
|
||||
|
||||
cuda::internal::TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor,
|
||||
numInstances);
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D
|
||||
parameter_sweep_1d_schedule(functor, numInstances);
|
||||
#endif
|
||||
cuda::internal::TaskStrided1DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
|
||||
functor, numInstances);
|
||||
}
|
||||
|
||||
template <typename WType, typename IType>
|
||||
@ -1407,15 +1462,12 @@ public:
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||
|
||||
vtkm::UInt32 grids;
|
||||
dim3 blocks;
|
||||
GetGridsAndBlocks(grids, blocks, ranges);
|
||||
vtkm::UInt32 blocks;
|
||||
dim3 threadsPerBlock;
|
||||
GetBlocksAndThreads(blocks, threadsPerBlock, ranges);
|
||||
|
||||
cuda::internal::TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(functor, ranges);
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
||||
parameter_sweep_3d_schedule(functor, rangeMax);
|
||||
#endif
|
||||
cuda::internal::TaskStrided3DLaunch<<<blocks, threadsPerBlock, 0, cudaStreamPerThread>>>(
|
||||
functor, ranges);
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
|
@ -1,211 +0,0 @@
|
||||
//============================================================================
|
||||
// Copyright (c) Kitware, Inc.
|
||||
// All rights reserved.
|
||||
// See LICENSE.txt for details.
|
||||
// This software is distributed WITHOUT ANY WARRANTY; without even
|
||||
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
// PURPOSE. See the above copyright notice for more information.
|
||||
//
|
||||
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
|
||||
// Copyright 2014 UT-Battelle, LLC.
|
||||
// Copyright 2014 Los Alamos National Security.
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
||||
// Laboratory (LANL), the U.S. Government retains certain rights in
|
||||
// this software.
|
||||
//============================================================================
|
||||
|
||||
#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
|
||||
#define vtk_m_cont_cuda_internal_TaskTuner_h
|
||||
|
||||
#include <vtkm/Types.h>
|
||||
#include <vtkm/cont/cuda/ErrorCuda.h>
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
namespace internal
|
||||
{
|
||||
|
||||
vtkm::UInt32 getNumSMs(int dId);
|
||||
|
||||
template <typename TaskType>
|
||||
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
|
||||
template <typename TaskType>
|
||||
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
|
||||
|
||||
struct PerfRecord1d
|
||||
{
|
||||
PerfRecord1d(float elapsedT, int g, int b)
|
||||
: elapsedTime(elapsedT)
|
||||
, grid(g)
|
||||
, block(b)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
|
||||
|
||||
float elapsedTime;
|
||||
int grid;
|
||||
int block;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
|
||||
{
|
||||
os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
|
||||
<< ">>> required: " << record.elapsedTime << "\n";
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
struct PerfRecord3d
|
||||
{
|
||||
PerfRecord3d(float elapsedT, int g, dim3 b)
|
||||
: elapsedTime(elapsedT)
|
||||
, grid(g)
|
||||
, block(b)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
|
||||
|
||||
float elapsedTime;
|
||||
int grid;
|
||||
dim3 block;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
|
||||
{
|
||||
|
||||
os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
|
||||
<< "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
template <typename TaskT>
|
||||
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
|
||||
{
|
||||
std::vector<PerfRecord1d> results;
|
||||
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
||||
constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128,
|
||||
256, 512, 1024, 2048, 4096, 8192 };
|
||||
|
||||
int deviceId;
|
||||
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
||||
|
||||
for (vtkm::UInt32 g = 0; g < 12; g++)
|
||||
{
|
||||
vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
|
||||
for (vtkm::UInt32 b = 0; b < 12; b++)
|
||||
{
|
||||
vtkm::UInt32 blocks = blockIndexTable[b];
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(results.begin(), results.end());
|
||||
for (auto&& i : results)
|
||||
{
|
||||
std::cout << i << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TaskT>
|
||||
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
|
||||
{
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||
std::vector<PerfRecord3d> results;
|
||||
|
||||
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
||||
constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24,
|
||||
28, 30, 32, 64, 128, 256, 512, 1024 };
|
||||
|
||||
int deviceId;
|
||||
for (vtkm::UInt32 g = 0; g < 12; g++)
|
||||
{
|
||||
vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId);
|
||||
for (vtkm::UInt32 i = 0; i < 16; i++)
|
||||
{
|
||||
for (vtkm::UInt32 j = 0; j < 16; j++)
|
||||
{
|
||||
for (vtkm::UInt32 k = 0; k < 16; k++)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);
|
||||
|
||||
if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
|
||||
blocks.z >= 64)
|
||||
{
|
||||
//cuda can't handle more than 1024 threads per block
|
||||
//so don't try if we compute higher than that
|
||||
|
||||
//also don't try stupidly low numbers
|
||||
|
||||
//cuda can't handle more than 64 threads in the z direction
|
||||
continue;
|
||||
}
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
||||
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(results.begin(), results.end());
|
||||
for (auto&& i : results)
|
||||
{
|
||||
std::cout << i << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user