mirror of
https://gitlab.kitware.com/vtk/vtk-m
synced 2024-09-16 17:22:55 +00:00
Move the parameter sweeping code to a separate header.
The parameter sweeping code is only enabled when tuning for new GPU's so we should move it to a separate header to make DeviceAdapterAlgorithmThrust easier to read.
This commit is contained in:
parent
d1c86f45cf
commit
c11f29c093
@ -25,6 +25,7 @@ set(headers
|
||||
DeviceAdapterAlgorithmThrust.h
|
||||
DeviceAdapterTagCuda.h
|
||||
MakeThrustIterator.h
|
||||
TaskTuner.h
|
||||
ThrustExceptionHandler.h
|
||||
VirtualObjectTransferCuda.h
|
||||
)
|
||||
|
@ -38,9 +38,14 @@
|
||||
#include <vtkm/exec/internal/ErrorMessageBuffer.h>
|
||||
#include <vtkm/exec/internal/TaskSingular.h>
|
||||
|
||||
//#include <vtkm/cont/DeviceAdapterAlgorithm.h>
|
||||
#include <vtkm/cont/internal/DeviceAdapterAlgorithmGeneral.h>
|
||||
|
||||
// #define ANALYZE_VTKM_SCHEDULER_1D
|
||||
// #define ANALYZE_VTKM_SCHEDULER_3D
|
||||
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
|
||||
#include <vtkm/cont/cuda/internal/TaskTuner.h>
|
||||
#endif
|
||||
|
||||
// Disable warnings we check vtkm for but Thrust does not.
|
||||
VTKM_THIRDPARTY_PRE_INCLUDE
|
||||
//our own custom thrust execution policy
|
||||
@ -118,6 +123,47 @@ __global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size)
|
||||
functor(index);
|
||||
}
|
||||
|
||||
#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
|
||||
|
||||
// Currently we are getting compile failures with vtkm::worklet::wavelets::InverseTransformOdd
|
||||
// for an unknown reason
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule1DIndexKernel2(FunctorType functor,
|
||||
vtkm::Id numberOfKernelsInvoked,
|
||||
vtkm::Id length)
|
||||
{
|
||||
vtkm::Id index = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
|
||||
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
|
||||
for (; index < length; index += inc)
|
||||
{
|
||||
functor(index);
|
||||
}
|
||||
}
|
||||
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
|
||||
{
|
||||
const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
|
||||
blockIdx.y * blockDim.y + threadIdx.y,
|
||||
blockIdx.z * blockDim.z + threadIdx.z);
|
||||
const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
|
||||
|
||||
for (uint k = start.z; k < size.z; k += inc.z)
|
||||
{
|
||||
for (uint j = start.y; j < size.y; j += inc.y)
|
||||
{
|
||||
vtkm::Id3 index(start.x, j, k);
|
||||
for (vtkm::Id i = start.x; i < size.x; i += inc.x)
|
||||
{
|
||||
index[0] = i;
|
||||
functor(index);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T, typename BinaryOperationType>
|
||||
__global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
|
||||
{
|
||||
@ -134,140 +180,6 @@ inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d
|
||||
: (rangeMax.z / blockSize3d.z);
|
||||
}
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER
|
||||
class PerfRecord
|
||||
{
|
||||
public:
|
||||
PerfRecord(float elapsedT, dim3 block)
|
||||
: elapsedTime(elapsedT)
|
||||
, blockSize(block)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
|
||||
|
||||
float elapsedTime;
|
||||
dim3 blockSize;
|
||||
};
|
||||
|
||||
template <class Functor>
|
||||
static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& rangeMax)
|
||||
{
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||
std::vector<PerfRecord> results;
|
||||
vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
|
||||
|
||||
for (vtkm::UInt32 i = 0; i < 16; i++)
|
||||
{
|
||||
for (vtkm::UInt32 j = 0; j < 16; j++)
|
||||
{
|
||||
for (vtkm::UInt32 k = 0; k < 16; k++)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
|
||||
dim3 gridSize3d;
|
||||
|
||||
if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
|
||||
(blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
|
||||
{
|
||||
//cuda can't handle more than 1024 threads per block
|
||||
//so don't try if we compute higher than that
|
||||
|
||||
//also don't try stupidly low numbers
|
||||
|
||||
//cuda can't handle more than 64 threads in the z direction
|
||||
continue;
|
||||
}
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
|
||||
results.push_back(record);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(results.begin(), results.end());
|
||||
const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
|
||||
for (vtkm::Int64 i = 1; i <= size; i++)
|
||||
{
|
||||
vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
|
||||
vtkm::UInt32 x = results[index].blockSize.x;
|
||||
vtkm::UInt32 y = results[index].blockSize.y;
|
||||
vtkm::UInt32 z = results[index].blockSize.z;
|
||||
float t = results[index].elapsedTime;
|
||||
|
||||
std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "flat array performance " << std::endl;
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
typedef vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
|
||||
vtkm::cont::DeviceAdapterTagCuda>
|
||||
Algorithm;
|
||||
Algorithm::Schedule(functor, numInstances);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "fixed 3d block size performance " << std::endl;
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
dim3 blockSize3d(64, 2, 1);
|
||||
dim3 gridSize3d;
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/// This class can be subclassed to implement the DeviceAdapterAlgorithm for a
|
||||
/// device that uses thrust as its implementation. The subclass should pass in
|
||||
/// the correct device adapter tag as the template parameter.
|
||||
@ -1365,6 +1277,10 @@ public:
|
||||
{
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER_1D
|
||||
compare_1d_dynamic_block_picker(functor, numInstances, totalBlocks, blockSize);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
@ -1390,10 +1306,6 @@ public:
|
||||
|
||||
functor.SetErrorMessageBuffer(errorMessage);
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER
|
||||
//requires the errormessage buffer be set
|
||||
compare_3d_schedule_patterns(functor, rangeMax);
|
||||
#endif
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||
@ -1402,6 +1314,9 @@ public:
|
||||
//memory in the X direction. Also should be good for thin in the Z axis
|
||||
//algorithms.
|
||||
dim3 blockSize3d(64, 2, 1);
|
||||
//In general we need more information as this doesn't work well when
|
||||
//executing on the points, and need to fetch all cells used, as the z
|
||||
//width is not fat enough
|
||||
|
||||
//handle the simple use case of 'bad' datasets which are thin in X
|
||||
//but larger in the other directions, allowing us decent performance with
|
||||
@ -1411,6 +1326,7 @@ public:
|
||||
blockSize3d = dim3(16, 4, 4);
|
||||
}
|
||||
|
||||
|
||||
dim3 gridSize3d;
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
|
||||
@ -1427,6 +1343,22 @@ public:
|
||||
{
|
||||
throw vtkm::cont::ErrorExecution(hostErrorPtr);
|
||||
}
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER_1D
|
||||
compare_1d_dynamic_block_picker(functor,
|
||||
rangeMax[0] * rangeMax[1] * rangeMax[2],
|
||||
gridSize3d.x * gridSize3d.y * gridSize3d.z,
|
||||
blockSize3d.x * blockSize3d.y * blockSize3d.z);
|
||||
#endif
|
||||
|
||||
#ifdef ANALYZE_VTKM_SCHEDULER_3D
|
||||
//requires the errormessage buffer be set
|
||||
compare_3d_dynamic_block_picker(functor, rangeMax, gridSize3d, blockSize3d);
|
||||
#endif
|
||||
|
||||
#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D
|
||||
parameter_sweep_3d_schedule(functor, rangeMax);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, class Storage>
|
||||
|
333
vtkm/cont/cuda/internal/TaskTuner.h
Normal file
333
vtkm/cont/cuda/internal/TaskTuner.h
Normal file
@ -0,0 +1,333 @@
|
||||
//============================================================================
|
||||
// Copyright (c) Kitware, Inc.
|
||||
// All rights reserved.
|
||||
// See LICENSE.txt for details.
|
||||
// This software is distributed WITHOUT ANY WARRANTY; without even
|
||||
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
// PURPOSE. See the above copyright notice for more information.
|
||||
//
|
||||
// Copyright 2014 Sandia Corporation.
|
||||
// Copyright 2014 UT-Battelle, LLC.
|
||||
// Copyright 2014 Los Alamos National Security.
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
|
||||
// Laboratory (LANL), the U.S. Government retains certain rights in
|
||||
// this software.
|
||||
//============================================================================
|
||||
|
||||
#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
|
||||
#define vtk_m_cont_cuda_internal_TaskTuner_h
|
||||
|
||||
#include <vtkm/Types.h>
|
||||
#include <vtkm/cont/cuda/ErrorCuda.h>
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
|
||||
namespace vtkm
|
||||
{
|
||||
namespace cont
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
namespace internal
|
||||
{
|
||||
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule1DIndexKernel(FunctorType functor, vtkm::Id, vtkm::Id);
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule1DIndexKernel2(FunctorType functor, vtkm::Id, vtkm::Id);
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size);
|
||||
template <class FunctorType>
|
||||
__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size);
|
||||
|
||||
inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d);
|
||||
|
||||
|
||||
template <typename Task>
|
||||
__global__ void TaskStrided1DLaunch(Task task, vtkm::Id size)
|
||||
{
|
||||
const vtkm::Id start = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
|
||||
const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
|
||||
for (vtkm::Id i = start; i < size; i += inc)
|
||||
{
|
||||
task(i);
|
||||
}
|
||||
}
|
||||
|
||||
class PerfRecord
|
||||
{
|
||||
public:
|
||||
PerfRecord(float elapsedT, dim3 block)
|
||||
: elapsedTime(elapsedT)
|
||||
, blockSize(block)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
|
||||
|
||||
float elapsedTime;
|
||||
dim3 blockSize;
|
||||
};
|
||||
|
||||
template <typename Task>
|
||||
static void BlockSizeGuesser(vtkm::Id size, int& grids, int& blocks, float& occupancy)
|
||||
{
|
||||
int blockSize; // The launch configurator returned block size
|
||||
int minGridSize; // The minimum grid size needed to achieve the
|
||||
// maximum occupancy for a full device launch
|
||||
int gridSize; // The actual grid size needed, based on number of SM's
|
||||
int device; // device to run on
|
||||
int numSMs; // number of SMs on the active device
|
||||
|
||||
cudaGetDevice(&device);
|
||||
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
|
||||
|
||||
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, Schedule1DIndexKernel2<Task>, 0, 0);
|
||||
|
||||
|
||||
blockSize /= (numSMs * 2);
|
||||
// Round up according to array size
|
||||
// gridSize = (size + blockSize - 1) / blockSize;
|
||||
gridSize = 32 * numSMs;
|
||||
// std::cout << "numSMs: " << numSMs << std::endl;
|
||||
|
||||
// calculate theoretical occupancy
|
||||
int maxActiveBlocks;
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&maxActiveBlocks, Schedule1DIndexKernel2<Task>, blockSize, 0);
|
||||
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, device);
|
||||
|
||||
grids = gridSize;
|
||||
blocks = blockSize;
|
||||
occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
|
||||
(float)(props.maxThreadsPerMultiProcessor / props.warpSize);
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
static void compare_1d_dynamic_block_picker(Functor functor,
|
||||
vtkm::Id size,
|
||||
const vtkm::Id& currentGridSize,
|
||||
const vtkm::Id& currentBlockSize)
|
||||
{
|
||||
const std::type_info& ti = typeid(functor);
|
||||
std::cout << "fixed 1d block size performance " << ti.name() << std::endl;
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
|
||||
functor, vtkm::Id(0), size);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "Schedule1DIndexKernel size: " << size << std::endl;
|
||||
std::cout << "GridSize of: " << currentGridSize << " BlockSize of: " << currentBlockSize
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "dynamic 1d block size performance " << ti.name() << std::endl;
|
||||
{
|
||||
|
||||
int grids, blocks;
|
||||
float occupancy;
|
||||
BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "Schedule1DIndexKernel2 size: " << size << std::endl;
|
||||
std::cout << "GridSize of: " << grids << " BlockSize of: " << blocks
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
static void compare_3d_dynamic_block_picker(Functor functor,
|
||||
vtkm::Id3 ranges,
|
||||
const dim3& gridSize3d,
|
||||
const dim3& blockSize3d)
|
||||
{
|
||||
const std::type_info& ti = typeid(functor);
|
||||
std::cout << "fixed 3d block size performance " << ti.name() << std::endl;
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
// std::cout << "Schedule3DIndexKernel size: " << size << std::endl;
|
||||
// std::cout << "GridSize of: " << currentGridSize
|
||||
// << " BlockSize of: " << currentBlockSize << " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "dynamic 3d block size performance " << ti.name() << std::endl;
|
||||
{
|
||||
|
||||
// int grids, blocks;
|
||||
// float occupancy;
|
||||
// BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
|
||||
|
||||
// cudaEvent_t start, stop;
|
||||
// VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
// VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
|
||||
// VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
// Schedule3DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
|
||||
// VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
// VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
// float elapsedTimeMilliseconds;
|
||||
// VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
// VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
// VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
// std::cout << "Schedule3DIndexKernel2 size: " << size << std::endl;
|
||||
// std::cout << "GridSize of: " << grids
|
||||
// << " BlockSize of: " << blocks << " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeMax)
|
||||
{
|
||||
const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[1]),
|
||||
static_cast<vtkm::UInt32>(rangeMax[2]));
|
||||
std::vector<PerfRecord> results;
|
||||
vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
|
||||
|
||||
for (vtkm::UInt32 i = 0; i < 16; i++)
|
||||
{
|
||||
for (vtkm::UInt32 j = 0; j < 16; j++)
|
||||
{
|
||||
for (vtkm::UInt32 k = 0; k < 16; k++)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
|
||||
dim3 gridSize3d;
|
||||
|
||||
if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
|
||||
(blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
|
||||
{
|
||||
//cuda can't handle more than 1024 threads per block
|
||||
//so don't try if we compute higher than that
|
||||
|
||||
//also don't try stupidly low numbers
|
||||
|
||||
//cuda can't handle more than 64 threads in the z direction
|
||||
continue;
|
||||
}
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
|
||||
results.push_back(record);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(results.begin(), results.end());
|
||||
const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
|
||||
for (vtkm::Int64 i = 1; i <= size; i++)
|
||||
{
|
||||
vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
|
||||
vtkm::UInt32 x = results[index].blockSize.x;
|
||||
vtkm::UInt32 y = results[index].blockSize.y;
|
||||
vtkm::UInt32 z = results[index].blockSize.z;
|
||||
float t = results[index].elapsedTime;
|
||||
|
||||
std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
|
||||
}
|
||||
|
||||
std::cout << "fixed 3d block size performance " << std::endl;
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&start));
|
||||
VTKM_CUDA_CALL(cudaEventCreate(&stop));
|
||||
|
||||
dim3 blockSize3d(64, 2, 1);
|
||||
dim3 gridSize3d;
|
||||
|
||||
compute_block_size(ranges, blockSize3d, gridSize3d);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(start, 0));
|
||||
Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
|
||||
VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventSynchronize(stop));
|
||||
float elapsedTimeMilliseconds;
|
||||
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
|
||||
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(start));
|
||||
VTKM_CUDA_CALL(cudaEventDestroy(stop));
|
||||
|
||||
std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
|
||||
<< " required: " << elapsedTimeMilliseconds << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user