vtk-m2/vtkm/cont/cuda/internal/TaskTuner.h

//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
#define vtk_m_cont_cuda_internal_TaskTuner_h

#include <vtkm/Types.h>
#include <vtkm/cont/cuda/ErrorCuda.h>

#include <cuda.h>

#include <algorithm>
#include <iostream>
#include <string>
#include <typeinfo>
#include <vector>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
int getNumSMs(int dId);
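
// getNumSMs() is declared here and implemented elsewhere in the CUDA backend.
// A minimal sketch of such a helper (an assumption for illustration, not the
// actual VTK-m implementation) would query the multiprocessor count of the
// given device:
//
//   int numSMs;
//   VTKM_CUDA_CALL(
//     cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dId));
//   return numSMs;
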
template <typename TaskType>
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
template <typename TaskType>
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
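
// The TaskStrided launch kernels are declared here and defined elsewhere in
// the CUDA backend. For reference, a generic 1D grid-stride kernel of this
// shape (a sketch under assumptions; the real task invocation API may differ)
// steps through the index space in increments of gridDim.x * blockDim.x:
//
//   template <typename TaskType>
//   __global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id size)
//   {
//     const vtkm::Id start = blockIdx.x * blockDim.x + threadIdx.x;
//     const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x) * gridDim.x;
//     for (vtkm::Id index = start; index < size; index += inc)
//     {
//       task(index); // hypothetical call signature
//     }
//   }
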
struct PerfRecord1d
{
  PerfRecord1d(float elapsedT, int g, int b)
    : elapsedTime(elapsedT)
    , grid(g)
    , block(b)
  {
  }

  bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }

  float elapsedTime;
  int grid;
  int block;
};

inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
{
  os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
     << ">>> required: " << record.elapsedTime << "\n";
  return os;
}

struct PerfRecord3d
{
  PerfRecord3d(float elapsedT, int g, dim3 b)
    : elapsedTime(elapsedT)
    , grid(g)
    , block(b)
  {
  }

  bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }

  float elapsedTime;
  int grid;
  dim3 block;
};

inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
{
  os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
     << "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
  return os;
}
template <typename TaskT>
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
{
  std::vector<PerfRecord1d> results;
  constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
  constexpr vtkm::UInt32 blockIndexTable[12] = { 4,   8,   16,   32,   64,   128,
                                                 256, 512, 1024, 2048, 4096, 8192 };

  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda

  for (vtkm::UInt32 g = 0; g < 12; g++)
  {
    int grids = gridIndexTable[g] * getNumSMs(deviceId);
    for (vtkm::UInt32 b = 0; b < 12; b++)
    {
      int blocks = blockIndexTable[b];

      cudaEvent_t start, stop;
      VTKM_CUDA_CALL(cudaEventCreate(&start));
      VTKM_CUDA_CALL(cudaEventCreate(&stop));

      VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
      TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
      VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));

      VTKM_CUDA_CALL(cudaEventSynchronize(stop));
      float elapsedTimeMilliseconds;
      VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));

      VTKM_CUDA_CALL(cudaEventDestroy(start));
      VTKM_CUDA_CALL(cudaEventDestroy(stop));

      results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
    }
  }

  std::sort(results.begin(), results.end());
  for (auto&& i : results)
  {
    std::cout << i << std::endl;
  }
}
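
// Example (a sketch, assuming `task` is an already-constructed TaskStrided
// task object for a 1D schedule):
//
//   vtkm::Id numInstances = 1 << 20;
//   parameter_sweep_1d_schedule(task, numInstances);
//
// The records are sorted by elapsed time and printed fastest-first, so the
// first lines show the best <<<grid, block>>> combinations for this task on
// the current device.
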
template <typename TaskT>
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
{
  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
                    static_cast<vtkm::UInt32>(rangeMax[1]),
                    static_cast<vtkm::UInt32>(rangeMax[2]));

  std::vector<PerfRecord3d> results;
  constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
  constexpr vtkm::UInt32 blockIndexTable[16] = { 1,  2,  4,  8,   12,  16,  20,  24,
                                                 28, 30, 32, 64, 128, 256, 512, 1024 };

  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda

  for (vtkm::UInt32 g = 0; g < 12; g++)
  {
    int grids = gridIndexTable[g] * getNumSMs(deviceId);
    for (vtkm::UInt32 i = 0; i < 16; i++)
    {
      for (vtkm::UInt32 j = 0; j < 16; j++)
      {
        for (vtkm::UInt32 k = 0; k < 16; k++)
        {
          dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);

          if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
              blocks.z >= 64)
          {
            //cuda can't handle more than 1024 threads per block
            //so don't try if we compute higher than that
            //also don't try stupidly low numbers
            //cuda can't handle more than 64 threads in the z direction
            continue;
          }

          cudaEvent_t start, stop;
          VTKM_CUDA_CALL(cudaEventCreate(&start));
          VTKM_CUDA_CALL(cudaEventCreate(&stop));

          VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
          TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
          VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));

          VTKM_CUDA_CALL(cudaEventSynchronize(stop));
          float elapsedTimeMilliseconds;
          VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));

          VTKM_CUDA_CALL(cudaEventDestroy(start));
          VTKM_CUDA_CALL(cudaEventDestroy(stop));

          results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
        }
      }
    }
  }

  std::sort(results.begin(), results.end());
  for (auto&& i : results)
  {
    std::cout << i << std::endl;
  }
}
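
// Example (a sketch, assuming `task` is an already-constructed TaskStrided
// task object for a 3D schedule over a 256^3 index space):
//
//   vtkm::Id3 rangeMax(256, 256, 256);
//   parameter_sweep_3d_schedule(task, rangeMax);
//
// Block shapes with more than 1023 or fewer than 5 total threads, or with a z
// extent of 64 or more, are skipped by the guard above, so only launchable,
// non-trivial configurations are timed.
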
}
}
}
} // namespace vtkm::cont::cuda::internal

#endif //vtk_m_cont_cuda_internal_TaskTuner_h