From c11f29c093ec1edca980e6cfe8e8695013ccd660 Mon Sep 17 00:00:00 2001
From: Robert Maynard <robert.maynard@kitware.com>
Date: Fri, 23 Jun 2017 13:27:33 -0400
Subject: [PATCH] Move the parameter sweeping code to a separate header.

The parameter sweeping code is only enabled when tuning for new GPUs, so we
should move it to a separate header to make DeviceAdapterAlgorithmThrust
easier to read.
---
 vtkm/cont/cuda/internal/CMakeLists.txt        |   1 +
 .../internal/DeviceAdapterAlgorithmThrust.h   | 210 ++++------
 vtkm/cont/cuda/internal/TaskTuner.h           | 333 ++++++++++++++++++
 3 files changed, 405 insertions(+), 139 deletions(-)
 create mode 100644 vtkm/cont/cuda/internal/TaskTuner.h

diff --git a/vtkm/cont/cuda/internal/CMakeLists.txt b/vtkm/cont/cuda/internal/CMakeLists.txt
index 2bf1a27c3..944ca3d5e 100644
--- a/vtkm/cont/cuda/internal/CMakeLists.txt
+++ b/vtkm/cont/cuda/internal/CMakeLists.txt
@@ -25,6 +25,7 @@ set(headers
   DeviceAdapterAlgorithmThrust.h
   DeviceAdapterTagCuda.h
   MakeThrustIterator.h
+  TaskTuner.h
   ThrustExceptionHandler.h
   VirtualObjectTransferCuda.h
 )

diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
index df0d7f5ba..b6b59b7c4 100644
--- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
+++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h
@@ -38,9 +38,14 @@
 #include 
 #include 
-//#include 
 #include 
 
+// #define ANALYZE_VTKM_SCHEDULER_1D
+// #define ANALYZE_VTKM_SCHEDULER_3D
+#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
+#include <vtkm/cont/cuda/internal/TaskTuner.h>
+#endif
+
 // Disable warnings we check vtkm for but Thrust does not.
 VTKM_THIRDPARTY_PRE_INCLUDE
 //our own custom thrust execution policy
@@ -118,6 +123,47 @@ __global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size)
   functor(index);
 }
 
+#if defined(ANALYZE_VTKM_SCHEDULER_1D) || defined(ANALYZE_VTKM_SCHEDULER_3D)
+
+// Currently we are getting compile failures with vtkm::worklet::wavelets::InverseTransformOdd
+// for an unknown reason
+template <class FunctorType>
+__global__ void Schedule1DIndexKernel2(FunctorType functor,
+                                       vtkm::Id numberOfKernelsInvoked,
+                                       vtkm::Id length)
+{
+  vtkm::Id index = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
+  const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
+  for (; index < length; index += inc)
+  {
+    functor(index);
+  }
+}
+
+template <class FunctorType>
+__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size)
+{
+  const dim3 start(blockIdx.x * blockDim.x + threadIdx.x,
+                   blockIdx.y * blockDim.y + threadIdx.y,
+                   blockIdx.z * blockDim.z + threadIdx.z);
+  const dim3 inc(blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z);
+
+  for (uint k = start.z; k < size.z; k += inc.z)
+  {
+    for (uint j = start.y; j < size.y; j += inc.y)
+    {
+      vtkm::Id3 index(start.x, j, k);
+      for (vtkm::Id i = start.x; i < size.x; i += inc.x)
+      {
+        index[0] = i;
+        functor(index);
+      }
+    }
+  }
+}
+
+#endif
+
 template <typename T, typename BinaryOperationType>
 __global__ void SumExclusiveScan(T a, T b, T result, BinaryOperationType binary_op)
 {
@@ -134,140 +180,6 @@ inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3
                        : (rangeMax.z / blockSize3d.z);
 }
 
-#ifdef ANALYZE_VTKM_SCHEDULER
-class PerfRecord
-{
-public:
-  PerfRecord(float elapsedT, dim3 block)
-    : elapsedTime(elapsedT)
-    , blockSize(block)
-  {
-  }
-
-  bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
-
-  float elapsedTime;
-  dim3 blockSize;
-};
-
-template <class Functor>
-static void compare_3d_schedule_patterns(Functor functor, const vtkm::Id3& rangeMax)
-{
-  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
-                    static_cast<vtkm::UInt32>(rangeMax[1]),
-                    static_cast<vtkm::UInt32>(rangeMax[2]));
-  std::vector<PerfRecord> results;
-  vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
-
-  for (vtkm::UInt32 i = 0; i < 16; i++)
-  {
-    for (vtkm::UInt32 j = 0; j < 16; j++)
-    {
-      for (vtkm::UInt32 k = 0; k < 16; k++)
-      {
-        cudaEvent_t start, stop;
-        VTKM_CUDA_CALL(cudaEventCreate(&start));
-        VTKM_CUDA_CALL(cudaEventCreate(&stop));
-
-        dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
-        dim3 gridSize3d;
-
-        if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
-            (blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
-        {
-          //cuda can't handle more than 1024 threads per block
-          //so don't try if we compute higher than that
-
-          //also don't try stupidly low numbers
-
-          //cuda can't handle more than 64 threads in the z direction
-          continue;
-        }
-
-        compute_block_size(ranges, blockSize3d, gridSize3d);
-        VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-        Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-        VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
-
-        VTKM_CUDA_CALL(cudaEventSynchronize(stop));
-        float elapsedTimeMilliseconds;
-        VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
-
-        VTKM_CUDA_CALL(cudaEventDestroy(start));
-        VTKM_CUDA_CALL(cudaEventDestroy(stop));
-
-        PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
-        results.push_back(record);
-      }
-    }
-  }
-
-  std::sort(results.begin(), results.end());
-  const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
-  for (vtkm::Int64 i = 1; i <= size; i++)
-  {
-    vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
-    vtkm::UInt32 x = results[index].blockSize.x;
-    vtkm::UInt32 y = results[index].blockSize.y;
-    vtkm::UInt32 z = results[index].blockSize.z;
-    float t = results[index].elapsedTime;
-
-    std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
-  }
-
-  std::cout << "flat array performance " << std::endl;
-  {
-    cudaEvent_t start, stop;
-    VTKM_CUDA_CALL(cudaEventCreate(&start));
-    VTKM_CUDA_CALL(cudaEventCreate(&stop));
-
-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    typedef vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
-      vtkm::cont::DeviceAdapterTagCuda>
-      Algorithm;
-    Algorithm::Schedule(functor, numInstances);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
-
-    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
-    float elapsedTimeMilliseconds;
-    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
-
-    VTKM_CUDA_CALL(cudaEventDestroy(start));
-    VTKM_CUDA_CALL(cudaEventDestroy(stop));
-
-    std::cout << "Flat index required: " << elapsedTimeMilliseconds << std::endl;
-  }
-
-  std::cout << "fixed 3d block size performance " << std::endl;
-  {
-    cudaEvent_t start, stop;
-    VTKM_CUDA_CALL(cudaEventCreate(&start));
-    VTKM_CUDA_CALL(cudaEventCreate(&stop));
-
-    dim3 blockSize3d(64, 2, 1);
-    dim3 gridSize3d;
-
-    compute_block_size(ranges, blockSize3d, gridSize3d);
-    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
-    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
-    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
-
-    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
-    float elapsedTimeMilliseconds;
-    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
-
-    VTKM_CUDA_CALL(cudaEventDestroy(start));
-    VTKM_CUDA_CALL(cudaEventDestroy(stop));
-
-    std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
-              << " required: " << elapsedTimeMilliseconds << std::endl;
-    std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
"GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z - << " required: " << elapsedTimeMilliseconds << std::endl; - } -} - -#endif - /// This class can be subclassed to implement the DeviceAdapterAlgorithm for a /// device that uses thrust as its implementation. The subclass should pass in /// the correct device adapter tag as the template parameter. @@ -1365,6 +1277,10 @@ public: { throw vtkm::cont::ErrorExecution(hostErrorPtr); } + +#ifdef ANALYZE_VTKM_SCHEDULER_1D + compare_1d_dynamic_block_picker(functor, numInstances, totalBlocks, blockSize); +#endif } template @@ -1390,10 +1306,6 @@ public: functor.SetErrorMessageBuffer(errorMessage); -#ifdef ANALYZE_VTKM_SCHEDULER - //requires the errormessage buffer be set - compare_3d_schedule_patterns(functor, rangeMax); -#endif const dim3 ranges(static_cast(rangeMax[0]), static_cast(rangeMax[1]), static_cast(rangeMax[2])); @@ -1402,6 +1314,9 @@ public: //memory in the X direction. Also should be good for thin in the Z axis //algorithms. dim3 blockSize3d(64, 2, 1); + //In general we need more information as this doesn't work well when + //executing on the points, and need to fetch all cells used, as the z + //width is not fat enough //handle the simple use case of 'bad' datasets which are thin in X //but larger in the other directions, allowing us decent performance with @@ -1411,6 +1326,7 @@ public: blockSize3d = dim3(16, 4, 4); } + dim3 gridSize3d; compute_block_size(ranges, blockSize3d, gridSize3d); @@ -1427,6 +1343,22 @@ public: { throw vtkm::cont::ErrorExecution(hostErrorPtr); } + +#ifdef ANALYZE_VTKM_SCHEDULER_1D + compare_1d_dynamic_block_picker(functor, + rangeMax[0] * rangeMax[1] * rangeMax[2], + gridSize3d.x * gridSize3d.y * gridSize3d.z, + blockSize3d.x * blockSize3d.y * blockSize3d.z); +#endif + +#ifdef ANALYZE_VTKM_SCHEDULER_3D + //requires the errormessage buffer be set + compare_3d_dynamic_block_picker(functor, rangeMax, gridSize3d, blockSize3d); +#endif + +#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D + parameter_sweep_3d_schedule(functor, rangeMax); +#endif } template diff --git a/vtkm/cont/cuda/internal/TaskTuner.h b/vtkm/cont/cuda/internal/TaskTuner.h new file mode 100644 index 000000000..fe8b15fd0 --- /dev/null +++ b/vtkm/cont/cuda/internal/TaskTuner.h @@ -0,0 +1,333 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. 
+//============================================================================

+#ifndef vtk_m_cont_cuda_internal_TaskTuner_h
+#define vtk_m_cont_cuda_internal_TaskTuner_h
+
+#include <vtkm/Types.h>
+#include <vtkm/cont/cuda/ErrorCuda.h>
+
+#include <cuda.h>
+
+#include <algorithm>
+#include <iostream>
+#include <typeinfo>
+#include <vector>
+
+namespace vtkm
+{
+namespace cont
+{
+namespace cuda
+{
+namespace internal
+{
+
+template <class FunctorType>
+__global__ void Schedule1DIndexKernel(FunctorType functor, vtkm::Id, vtkm::Id);
+template <class FunctorType>
+__global__ void Schedule1DIndexKernel2(FunctorType functor, vtkm::Id, vtkm::Id);
+template <class FunctorType>
+__global__ void Schedule3DIndexKernel(FunctorType functor, dim3 size);
+template <class FunctorType>
+__global__ void Schedule3DIndexKernel2(FunctorType functor, dim3 size);
+
+inline void compute_block_size(dim3 rangeMax, dim3 blockSize3d, dim3& gridSize3d);
+
+
+template <typename Task>
+__global__ void TaskStrided1DLaunch(Task task, vtkm::Id size)
+{
+  const vtkm::Id start = static_cast<vtkm::Id>(blockIdx.x * blockDim.x + threadIdx.x);
+  const vtkm::Id inc = static_cast<vtkm::Id>(blockDim.x * gridDim.x);
+  for (vtkm::Id i = start; i < size; i += inc)
+  {
+    task(i);
+  }
+}
+
+class PerfRecord
+{
+public:
+  PerfRecord(float elapsedT, dim3 block)
+    : elapsedTime(elapsedT)
+    , blockSize(block)
+  {
+  }
+
+  bool operator<(const PerfRecord& other) const { return elapsedTime < other.elapsedTime; }
+
+  float elapsedTime;
+  dim3 blockSize;
+};
+
+template <class Functor>
+static void BlockSizeGuesser(vtkm::Id size, int& grids, int& blocks, float& occupancy)
+{
+  int blockSize;   // The launch configurator returned block size
+  int minGridSize; // The minimum grid size needed to achieve the
+                   // maximum occupancy for a full device launch
+  int gridSize;    // The actual grid size needed, based on number of SMs
+  int device;      // device to run on
+  int numSMs;      // number of SMs on the active device
+
+  cudaGetDevice(&device);
+  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
+
+  cudaOccupancyMaxPotentialBlockSize(
+    &minGridSize, &blockSize, Schedule1DIndexKernel2<Functor>, 0, 0);
+
+
+  blockSize /= (numSMs * 2);
+  // Round up according to array size
+  // gridSize = (size + blockSize - 1) / blockSize;
+  gridSize = 32 * numSMs;
+  // std::cout << "numSMs: " << numSMs << std::endl;
+
+  // calculate theoretical occupancy
+  int maxActiveBlocks;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &maxActiveBlocks, Schedule1DIndexKernel2<Functor>, blockSize, 0);
+
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, device);
+
+  grids = gridSize;
+  blocks = blockSize;
+  occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
+    (float)(props.maxThreadsPerMultiProcessor / props.warpSize);
+}
+
+template <class Functor>
+static void compare_1d_dynamic_block_picker(Functor functor,
+                                            vtkm::Id size,
+                                            const vtkm::Id& currentGridSize,
+                                            const vtkm::Id& currentBlockSize)
+{
+  const std::type_info& ti = typeid(functor);
+  std::cout << "fixed 1d block size performance " << ti.name() << std::endl;
+  {
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule1DIndexKernel<Functor><<<currentGridSize, currentBlockSize>>>(
+      functor, vtkm::Id(0), size);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    std::cout << "Schedule1DIndexKernel size: " << size << std::endl;
+    std::cout << "GridSize of: " << currentGridSize << " BlockSize of: " << currentBlockSize
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+
+  std::cout << "dynamic 1d block size performance " << ti.name() << std::endl;
+  {
+
+    int grids, blocks;
+    float occupancy;
+    BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
+
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule1DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    std::cout << "Schedule1DIndexKernel2 size: " << size << std::endl;
+    std::cout << "GridSize of: " << grids << " BlockSize of: " << blocks
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+template <class Functor>
+static void compare_3d_dynamic_block_picker(Functor functor,
+                                            vtkm::Id3 ranges,
+                                            const dim3& gridSize3d,
+                                            const dim3& blockSize3d)
+{
+  const std::type_info& ti = typeid(functor);
+  std::cout << "fixed 3d block size performance " << ti.name() << std::endl;
+  {
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    // std::cout << "Schedule3DIndexKernel size: " << size << std::endl;
+    // std::cout << "GridSize of: " << currentGridSize
+    //           << " BlockSize of: " << currentBlockSize << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+
+  std::cout << "dynamic 3d block size performance " << ti.name() << std::endl;
+  {
+
+    // int grids, blocks;
+    // float occupancy;
+    // BlockSizeGuesser<Functor>(size, grids, blocks, occupancy);
+
+    // cudaEvent_t start, stop;
+    // VTKM_CUDA_CALL(cudaEventCreate(&start));
+    // VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+
+    // VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    // Schedule3DIndexKernel2<Functor><<<grids, blocks>>>(functor, vtkm::Id(0), size);
+    // VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    // VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    // float elapsedTimeMilliseconds;
+    // VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    // VTKM_CUDA_CALL(cudaEventDestroy(start));
+    // VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    // std::cout << "Schedule3DIndexKernel2 size: " << size << std::endl;
+    // std::cout << "GridSize of: " << grids
+    //           << " BlockSize of: " << blocks << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+template <class Functor>
+static void parameter_sweep_3d_schedule(Functor functor, const vtkm::Id3& rangeMax)
+{
+  const dim3 ranges(static_cast<vtkm::UInt32>(rangeMax[0]),
+                    static_cast<vtkm::UInt32>(rangeMax[1]),
+                    static_cast<vtkm::UInt32>(rangeMax[2]));
+  std::vector<PerfRecord> results;
+  vtkm::UInt32 indexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32, 64, 128, 256, 512, 1024 };
+
+  for (vtkm::UInt32 i = 0; i < 16; i++)
+  {
+    for (vtkm::UInt32 j = 0; j < 16; j++)
+    {
+      for (vtkm::UInt32 k = 0; k < 16; k++)
+      {
+        cudaEvent_t start, stop;
+        VTKM_CUDA_CALL(cudaEventCreate(&start));
+        VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+        dim3 blockSize3d(indexTable[i], indexTable[j], indexTable[k]);
+        dim3 gridSize3d;
+
+        if ((blockSize3d.x * blockSize3d.y * blockSize3d.z) >= 1024 ||
+            (blockSize3d.x * blockSize3d.y * blockSize3d.z) <= 4 || blockSize3d.z >= 64)
+        {
+          //cuda can't handle more than 1024 threads per block
+          //so don't try if we compute higher than that
+
+          //also don't try stupidly low numbers
+
+          //cuda can't handle more than 64 threads in the z direction
+          continue;
+        }
+
+        compute_block_size(ranges, blockSize3d, gridSize3d);
+        VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+        Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+        VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+        VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+        float elapsedTimeMilliseconds;
+        VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+        VTKM_CUDA_CALL(cudaEventDestroy(start));
+        VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+        PerfRecord record(elapsedTimeMilliseconds, blockSize3d);
+        results.push_back(record);
+      }
+    }
+  }
+
+  std::sort(results.begin(), results.end());
+  const vtkm::Int64 size = static_cast<vtkm::Int64>(results.size());
+  for (vtkm::Int64 i = 1; i <= size; i++)
+  {
+    vtkm::UInt64 index = static_cast<vtkm::UInt64>(size - i);
+    vtkm::UInt32 x = results[index].blockSize.x;
+    vtkm::UInt32 y = results[index].blockSize.y;
+    vtkm::UInt32 z = results[index].blockSize.z;
+    float t = results[index].elapsedTime;
+
+    std::cout << "BlockSize of: " << x << "," << y << "," << z << " required: " << t << std::endl;
+  }
+
+  std::cout << "fixed 3d block size performance " << std::endl;
+  {
+    cudaEvent_t start, stop;
+    VTKM_CUDA_CALL(cudaEventCreate(&start));
+    VTKM_CUDA_CALL(cudaEventCreate(&stop));
+
+    dim3 blockSize3d(64, 2, 1);
+    dim3 gridSize3d;
+
+    compute_block_size(ranges, blockSize3d, gridSize3d);
+    VTKM_CUDA_CALL(cudaEventRecord(start, 0));
+    Schedule3DIndexKernel<Functor><<<gridSize3d, blockSize3d>>>(functor, ranges);
+    VTKM_CUDA_CALL(cudaEventRecord(stop, 0));
+
+    VTKM_CUDA_CALL(cudaEventSynchronize(stop));
+    float elapsedTimeMilliseconds;
+    VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop));
+
+    VTKM_CUDA_CALL(cudaEventDestroy(start));
+    VTKM_CUDA_CALL(cudaEventDestroy(stop));
+
+    std::cout << "BlockSize of: " << blockSize3d.x << "," << blockSize3d.y << "," << blockSize3d.z
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+    std::cout << "GridSize of: " << gridSize3d.x << "," << gridSize3d.y << "," << gridSize3d.z
+              << " required: " << elapsedTimeMilliseconds << std::endl;
+  }
+}
+}
+}
+}
+}
+
+#endif
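
Usage note: the tuner only compiles when one of the analysis macros is
defined before DeviceAdapterAlgorithmThrust.h is parsed, either by
uncommenting the two "#define" lines the patch adds or via the compiler
command line. A minimal sketch, assuming a CUDA translation unit and a
hypothetical exec-side functor named MyFunctor that provides
operator()(vtkm::Id) / operator()(vtkm::Id3) and SetErrorMessageBuffer():

    // Build with, e.g.:
    //   nvcc -DANALYZE_VTKM_SCHEDULER_1D -DANALYZE_VTKM_SCHEDULER_3D ...
    // Optionally add -DPARAMETER_SWEEP_VTKM_SCHEDULER_3D to also run the
    // 16x16x16 block-size sweep in parameter_sweep_3d_schedule().
    #include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>

    using Algorithm = vtkm::cont::cuda::internal::DeviceAdapterAlgorithmThrust<
      vtkm::cont::DeviceAdapterTagCuda>;

    // Schedule() runs the real kernel first; the #ifdef blocks added above
    // then re-run the same work through compare_1d_dynamic_block_picker()
    // (and the 3D variants for Id3 ranges), printing the fixed versus
    // occupancy-guided grid/block timings to stdout.
    Algorithm::Schedule(MyFunctor{}, vtkm::Id(1024 * 1024));    // 1D path
    Algorithm::Schedule(MyFunctor{}, vtkm::Id3(128, 128, 128)); // 3D path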