2017-06-23 17:27:33 +00:00
|
|
|
//============================================================================
|
|
|
|
// Copyright (c) Kitware, Inc.
|
|
|
|
// All rights reserved.
|
|
|
|
// See LICENSE.txt for details.
|
|
|
|
// This software is distributed WITHOUT ANY WARRANTY; without even
|
|
|
|
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
|
|
// PURPOSE. See the above copyright notice for more information.
|
|
|
|
//
|
2017-09-20 21:33:44 +00:00
|
|
|
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
|
2017-06-23 17:27:33 +00:00
|
|
|
// Copyright 2014 UT-Battelle, LLC.
|
|
|
|
// Copyright 2014 Los Alamos National Security.
|
|
|
|
//
|
2017-09-20 21:33:44 +00:00
|
|
|
// Under the terms of Contract DE-NA0003525 with NTESS,
|
2017-06-23 17:27:33 +00:00
|
|
|
// the U.S. Government retains certain rights in this software.
|
|
|
|
//
|
|
|
|
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
|
|
|
// Laboratory (LANL), the U.S. Government retains certain rights in
|
|
|
|
// this software.
|
|
|
|
//============================================================================
|
|
|
|
|
|
|
|
#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
|
|
|
|
#define vtk_m_cont_cuda_internal_TaskTuner_h
|
|
|
|
|
|
|
|
#include <vtkm/Types.h>
|
|
|
|
#include <vtkm/cont/cuda/ErrorCuda.h>
|
|
|
|
|
|
|
|
#include <cuda.h>
|
|
|
|
|
2017-07-07 12:40:03 +00:00
|
|
|
#include <algorithm>
|
2017-06-23 17:27:33 +00:00
|
|
|
#include <iostream>
|
|
|
|
#include <string>
|
|
|
|
#include <typeinfo>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
namespace vtkm
|
|
|
|
{
|
|
|
|
namespace cont
|
|
|
|
{
|
|
|
|
namespace cuda
|
|
|
|
{
|
|
|
|
namespace internal
|
|
|
|
{
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
int getNumSMs(int dId);
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
template <typename TaskType>
|
|
|
|
__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id);
|
|
|
|
template <typename TaskType>
|
|
|
|
__global__ void TaskStrided3DLaunch(TaskType task, dim3 size);
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
struct PerfRecord1d
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
PerfRecord1d(float elapsedT, int g, int b)
|
2017-06-23 17:27:33 +00:00
|
|
|
: elapsedTime(elapsedT)
|
2018-04-27 18:19:15 +00:00
|
|
|
, grid(g)
|
|
|
|
, block(b)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; }
|
2017-06-23 17:27:33 +00:00
|
|
|
|
|
|
|
float elapsedTime;
|
2018-04-27 18:19:15 +00:00
|
|
|
int grid;
|
|
|
|
int block;
|
2017-06-23 17:27:33 +00:00
|
|
|
};
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block
|
|
|
|
<< ">>> required: " << record.elapsedTime << "\n";
|
|
|
|
return os;
|
2017-06-23 17:27:33 +00:00
|
|
|
}
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
|
|
|
|
struct PerfRecord3d
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
PerfRecord3d(float elapsedT, int g, dim3 b)
|
|
|
|
: elapsedTime(elapsedT)
|
|
|
|
, grid(g)
|
|
|
|
, block(b)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; }
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
float elapsedTime;
|
|
|
|
int grid;
|
|
|
|
dim3 block;
|
|
|
|
};
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y
|
|
|
|
<< "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n";
|
|
|
|
return os;
|
|
|
|
}
|
2017-06-23 17:27:33 +00:00
|
|
|
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
template <typename TaskT>
|
|
|
|
static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances)
|
|
|
|
{
|
|
|
|
std::vector<PerfRecord1d> results;
|
|
|
|
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
|
|
|
constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128,
|
|
|
|
256, 512, 1024, 2048, 4096, 8192 };
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
int deviceId;
|
|
|
|
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
for (vtkm::UInt32 g = 0; g < 12; g++)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
int grids = gridIndexTable[g] * getNumSMs(deviceId);
|
|
|
|
for (vtkm::UInt32 b = 0; b < 12; b++)
|
|
|
|
{
|
|
|
|
int blocks = blockIndexTable[b];
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
cudaEvent_t start, stop;
|
|
|
|
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
|
|
|
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
TaskStrided1DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, numInstances);
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
|
|
|
float elapsedTimeMilliseconds;
|
|
|
|
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
|
|
|
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
|
|
|
}
|
|
|
|
}
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
std::sort(results.begin(), results.end());
|
|
|
|
for (auto&& i : results)
|
|
|
|
{
|
|
|
|
std::cout << i << std::endl;
|
2017-06-23 17:27:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
template <typename TaskT>
|
|
|
|
static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
|
|
|
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
|
|
|
static_cast<vtkm::UInt32>(rangeMax[1]),
|
|
|
|
static_cast<vtkm::UInt32>(rangeMax[2]));
|
2018-04-27 18:19:15 +00:00
|
|
|
std::vector<PerfRecord3d> results;
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
|
|
|
|
constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24,
|
|
|
|
28, 30, 32, 64, 128, 256, 512, 1024 };
|
|
|
|
|
|
|
|
int deviceId;
|
|
|
|
for (vtkm::UInt32 g = 0; g < 12; g++)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
int grids = gridIndexTable[g] * getNumSMs(deviceId);
|
|
|
|
for (vtkm::UInt32 i = 0; i < 16; i++)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
for (vtkm::UInt32 j = 0; j < 16; j++)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
for (vtkm::UInt32 k = 0; k < 16; k++)
|
|
|
|
{
|
|
|
|
cudaEvent_t start, stop;
|
|
|
|
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
|
|
|
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]);
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 ||
|
|
|
|
blocks.z >= 64)
|
|
|
|
{
|
|
|
|
//cuda can't handle more than 1024 threads per block
|
|
|
|
//so don't try if we compute higher than that
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
//also don't try stupidly low numbers
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
//cuda can't handle more than 64 threads in the z direction
|
|
|
|
continue;
|
|
|
|
}
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread));
|
|
|
|
TaskStrided3DLaunch<<<grids, blocks, 0, cudaStreamPerThread>>>(task, ranges);
|
|
|
|
VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
|
|
|
float elapsedTimeMilliseconds;
|
|
|
|
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
|
|
|
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
2017-06-23 17:27:33 +00:00
|
|
|
|
2018-04-27 18:19:15 +00:00
|
|
|
results.emplace_back(elapsedTimeMilliseconds, grids, blocks);
|
|
|
|
}
|
2017-06-23 17:27:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::sort(results.begin(), results.end());
|
2018-04-27 18:19:15 +00:00
|
|
|
for (auto&& i : results)
|
2017-06-23 17:27:33 +00:00
|
|
|
{
|
2018-04-27 18:19:15 +00:00
|
|
|
std::cout << i << std::endl;
|
2017-06-23 17:27:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|