From 047b64651771fe0057269280b983f76f6d7ccb9e Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 15 Apr 2019 08:06:33 -0400
Subject: [PATCH] VTK-m now provides better scheduling parameter controls

VTK-m now offers a more GPU-aware set of defaults for kernel scheduling.
When VTK-m first launches a kernel, we perform system introspection to
determine which GPUs are on the machine and then match this information to
a preset table of values. The implementation is designed so that VTK-m can
offer presets either for a specific GPU (e.g., V100) or for an entire
generation of cards (e.g., Pascal).

Currently VTK-m offers preset tables for the following GPUs:
- Tesla V100
- Tesla P100

If the hardware doesn't match a specific GPU card, we then try to find the
nearest known hardware generation and use those defaults. Currently we offer
defaults for:
- Older than Pascal hardware
- Pascal hardware
- Volta+ hardware

Some users have workloads that don't align with the defaults provided by
VTK-m. When that is the case, it is possible to override the defaults by
binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`,
as shown below:

```cpp
  ScheduleParameters CustomScheduleValues(char const* name,
                                          int major,
                                          int minor,
                                          int multiProcessorCount,
                                          int maxThreadsPerMultiProcessor,
                                          int maxThreadsPerBlock)
  {
    ScheduleParameters params {
        64 * multiProcessorCount, //1d blocks
        64,                       //1d threads per block
        64 * multiProcessorCount, //2d blocks
        { 8, 8, 1 },              //2d threads per block
        64 * multiProcessorCount, //3d blocks
        { 4, 4, 4 } };            //3d threads per block
    return params;
  }
  vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);
```
---
 docs/changelog/improve-cuda-scheduling.md  |  45 ++++
 vtkm/cont/cuda/internal/CMakeLists.txt     |   1 -
 .../internal/DeviceAdapterAlgorithmCuda.cu | 177 +++++++++++----
 .../internal/DeviceAdapterAlgorithmCuda.h  | 104 ++++++---
 vtkm/cont/cuda/internal/TaskTuner.h        | 211 ------------------
 5 files changed, 260 insertions(+), 278 deletions(-)
 create mode 100644 docs/changelog/improve-cuda-scheduling.md
 delete mode 100644 vtkm/cont/cuda/internal/TaskTuner.h

diff --git a/docs/changelog/improve-cuda-scheduling.md b/docs/changelog/improve-cuda-scheduling.md
new file mode 100644
index 000000000..c8a193e37
--- /dev/null
+++ b/docs/changelog/improve-cuda-scheduling.md
@@ -0,0 +1,45 @@
+# VTK-m CUDA kernel scheduling: improved defaults and user customization
+
+VTK-m now offers a more GPU-aware set of defaults for kernel scheduling.
+When VTK-m first launches a kernel, we perform system introspection to
+determine which GPUs are on the machine and then match this information
+to a preset table of values. The implementation is designed so that VTK-m
+can offer presets either for a specific GPU (e.g., V100) or for an entire
+generation of cards (e.g., Pascal).
+
+Currently VTK-m offers preset tables for the following GPUs:
+- Tesla V100
+- Tesla P100
+
+If the hardware doesn't match a specific GPU card, we then try to find the
+nearest known hardware generation and use those defaults. Currently we
+offer defaults for:
+- Older than Pascal hardware
+- Pascal hardware
+- Volta+ hardware
+
+Some users have workloads that don't align with the defaults provided by
+VTK-m. When that is the case, it is possible to override the defaults
+by binding a custom function to `vtkm::cont::cuda::InitScheduleParameters`.
+As shown below: + +```cpp + ScheduleParameters CustomScheduleValues(char const* name, + int major, + int minor, + int multiProcessorCount, + int maxThreadsPerMultiProcessor, + int maxThreadsPerBlock) + { + + ScheduleParameters params { + 64 * multiProcessorCount, //1d blocks + 64, //1d threads per block + 64 * multiProcessorCount, //2d blocks + { 8, 8, 1 }, //2d threads per block + 64 * multiProcessorCount, //3d blocks + { 4, 4, 4 } }; //3d threads per block + return params; + } + vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues); +``` diff --git a/vtkm/cont/cuda/internal/CMakeLists.txt b/vtkm/cont/cuda/internal/CMakeLists.txt index 284389052..0a0aeb110 100644 --- a/vtkm/cont/cuda/internal/CMakeLists.txt +++ b/vtkm/cont/cuda/internal/CMakeLists.txt @@ -29,7 +29,6 @@ set(headers DeviceAdapterTimerImplementationCuda.h ExecutionArrayInterfaceBasicCuda.h MakeThrustIterator.h - TaskTuner.h ThrustExceptionHandler.h VirtualObjectTransferCuda.h ) diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu index afd3b06c7..aaec8b2c6 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.cu @@ -21,43 +21,143 @@ #include #include +#include +#include #include +#include + namespace vtkm { namespace cont { namespace cuda { + +static vtkm::cont::cuda::ScheduleParameters ( + *ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr; + +//Use the provided function as the the compute function for ScheduleParameterBuilder +VTKM_CONT_EXPORT void InitScheduleParameters( + vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int)) +{ + ComputeFromEnv = function; +} + namespace internal { -VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId) -{ - std::size_t index = 0; - if (dId > 0) - { - index = static_cast(dId); - } +//These represent the best block/threads-per for scheduling on each GPU +static std::vector> scheduling_1d_parameters; +static std::vector> scheduling_2d_parameters; +static std::vector> scheduling_3d_parameters; - //check +struct VTKM_CONT_EXPORT ScheduleParameterBuilder +{ + //This represents information that is used to compute the best + //ScheduleParameters for a given GPU + enum struct GPU_STRATA + { + ENV = 0, + OLDER = 5, + PASCAL = 6, + VOLTA = 7, + PASCAL_HPC = 6000, + VOLTA_HPC = 7000 + }; + + std::map Presets; + std::function Compute; + + // clang-format off + // The presets for [one,two,three]_d_blocks are before we multiply by the number of SMs on the hardware + ScheduleParameterBuilder() + : Presets{ + { GPU_STRATA::ENV, { 0, 0, 0, { 0, 0, 0 }, 0, { 0, 0, 0 } } }, //use env settings + { GPU_STRATA::OLDER, + { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal + { GPU_STRATA::PASCAL, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal + { GPU_STRATA::VOLTA, { 32, 128, 8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta + { GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //P100 + { GPU_STRATA::VOLTA_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //V100 + } + , Compute(nullptr) + { + if (vtkm::cont::cuda::ComputeFromEnv != nullptr) + { + this->Compute = vtkm::cont::cuda::ComputeFromEnv; + } + else + { + this->Compute = [=] (const char* name, int major, int minor, + int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters { + return this->ComputeFromPreset(name, major, minor, numSMs, 
maxThreadsPerSM, maxThreadsPerBlock); }; + } + } + // clang-format on + + vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name, + int major, + int minor, + int numSMs, + int maxThreadsPerSM, + int maxThreadsPerBlock) + { + (void)minor; + (void)maxThreadsPerSM; + (void)maxThreadsPerBlock; + + const constexpr int GPU_STRATA_MAX_GEN = 7; + const constexpr int GPU_STRATA_MIN_GEN = 5; + int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN); + strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN); + if (strataAsInt > GPU_STRATA_MIN_GEN) + { //only pascal and above have fancy + + //Currently the only + bool is_tesla = (0 == std::strncmp("Tesla", name, 4)); //see if the name starts with Tesla + if (is_tesla) + { + strataAsInt *= 1000; //tesla modifier + } + } + + auto preset = this->Presets.find(static_cast(strataAsInt)); + ScheduleParameters params{ preset->second }; + params.one_d_blocks = params.one_d_blocks * numSMs; + params.two_d_blocks = params.two_d_blocks * numSMs; + params.three_d_blocks = params.three_d_blocks * numSMs; + return params; + } +}; + +VTKM_CONT_EXPORT void SetupKernelSchedulingParameters() +{ + //check flag static std::once_flag lookupBuiltFlag; - static std::vector numSMs; std::call_once(lookupBuiltFlag, []() { + ScheduleParameterBuilder builder; //iterate over all devices - int numberOfSMs = 0; int count = 0; VTKM_CUDA_CALL(cudaGetDeviceCount(&count)); - numSMs.reserve(static_cast(count)); for (int deviceId = 0; deviceId < count; ++deviceId) - { //get the number of sm's per deviceId - VTKM_CUDA_CALL( - cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId)); - numSMs.push_back(static_cast(numberOfSMs)); + { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, deviceId); + + ScheduleParameters params = builder.Compute(deviceProp.name, + deviceProp.major, + deviceProp.minor, + deviceProp.multiProcessorCount, + deviceProp.maxThreadsPerMultiProcessor, + deviceProp.maxThreadsPerBlock); + scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block); + scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block); + scheduling_3d_parameters.emplace_back(params.three_d_blocks, + params.three_d_threads_per_block); } }); - return numSMs[index]; } } } // end namespace cuda::internal @@ -101,44 +201,41 @@ void DeviceAdapterAlgorithm::CheckForErrors() } } -void DeviceAdapterAlgorithm::GetGridsAndBlocks( - vtkm::UInt32& grids, +void DeviceAdapterAlgorithm::GetBlocksAndThreads( vtkm::UInt32& blocks, + vtkm::UInt32& threadsPerBlock, vtkm::Id size) { (void)size; + vtkm::cont::cuda::internal::SetupKernelSchedulingParameters(); + int deviceId; VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda - grids = 32 * cuda::internal::getNumSMs(deviceId); - blocks = 128; + const auto& params = cuda::internal::scheduling_1d_parameters[static_cast(deviceId)]; + blocks = params.first; + threadsPerBlock = params.second; } -void DeviceAdapterAlgorithm::GetGridsAndBlocks( - vtkm::UInt32& grids, - dim3& blocks, +void DeviceAdapterAlgorithm::GetBlocksAndThreads( + vtkm::UInt32& blocks, + dim3& threadsPerBlock, const dim3& size) { + vtkm::cont::cuda::internal::SetupKernelSchedulingParameters(); + int deviceId; VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda - grids = 32 * cuda::internal::getNumSMs(deviceId); - - if (size.x == 0) - { //grids that have no x dimension - blocks.x = 1; - blocks.y = 8; - blocks.z = 8; - } - else if (size.x > 128) - { - blocks.x = 
8; - blocks.y = 8; - blocks.z = 4; + if (size.z <= 1) + { //2d images + const auto& params = cuda::internal::scheduling_2d_parameters[static_cast(deviceId)]; + blocks = params.first; + threadsPerBlock = params.second; } else - { //for really small grids - blocks.x = 4; - blocks.y = 4; - blocks.z = 4; + { //3d images + const auto& params = cuda::internal::scheduling_3d_parameters[static_cast(deviceId)]; + blocks = params.first; + threadsPerBlock = params.second; } } } diff --git a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h index 2990a3bf5..81889c5f6 100644 --- a/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h +++ b/vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h @@ -48,12 +48,6 @@ #include #include -// #define PARAMETER_SWEEP_VTKM_SCHEDULER_1D -// #define PARAMETER_SWEEP_VTKM_SCHEDULER_3D -#if defined(PARAMETER_SWEEP_VTKM_SCHEDULER_1D) || defined(PARAMETER_SWEEP_VTKM_SCHEDULER_3D) -#include -#endif - // Disable warnings we check vtkm for but Thrust does not. VTKM_THIRDPARTY_PRE_INCLUDE #include @@ -79,10 +73,73 @@ namespace vtkm { namespace cont { - - namespace cuda { +/// \brief Represents how to schedule 1D, 2D, and 3D Cuda kernels +/// +/// \c ScheduleParameters represents how VTK-m should schedule different +/// cuda kernel types. By default VTK-m uses a preset table based on the +/// GPU's found at runtime. +/// +/// When these defaults are insufficient for certain projects it is possible +/// to override the defaults by using \c InitScheduleParameters. +/// +/// +struct VTKM_CONT_EXPORT ScheduleParameters +{ + int one_d_blocks; + int one_d_threads_per_block; + + int two_d_blocks; + dim3 two_d_threads_per_block; + + int three_d_blocks; + dim3 three_d_threads_per_block; +}; + +/// \brief Specify the custom scheduling to use for VTK-m CUDA kernel launches +/// +/// By default VTK-m uses a preset table based on the GPU's found at runtime to +/// determine the best scheduling parameters for a worklet. When these defaults +/// are insufficient for certain projects it is possible to override the defaults +/// by binding a custom function to \c InitScheduleParameters. +/// +/// Note: The this function must be called before any invocation of any worklets +/// by VTK-m. +/// +/// Note: This function will be called for each GPU on a machine. 
+/// +/// \code{.cpp} +/// +/// ScheduleParameters CustomScheduleValues(char const* name, +/// int major, +/// int minor, +/// int multiProcessorCount, +/// int maxThreadsPerMultiProcessor, +/// int maxThreadsPerBlock) +/// { +/// +/// ScheduleParameters params { +/// 64 * multiProcessorCount, //1d blocks +/// 64, //1d threads per block +/// 64 * multiProcessorCount, //2d blocks +/// { 8, 8, 1 }, //2d threads per block +/// 64 * multiProcessorCount, //3d blocks +/// { 4, 4, 4 } }; //3d threads per block +/// return params; +/// } +/// \endcode +/// +/// +VTKM_CONT_EXPORT void InitScheduleParameters( + vtkm::cont::cuda::ScheduleParameters (*)(char const* name, + int major, + int minor, + int multiProcessorCount, + int maxThreadsPerMultiProcessor, + int maxThreadsPerBlock)); + + namespace internal { @@ -1358,10 +1415,12 @@ public: static void SetupErrorBuffer(vtkm::exec::cuda::internal::TaskStrided& functor); VTKM_CONT_EXPORT - static void GetGridsAndBlocks(vtkm::UInt32& grid, vtkm::UInt32& blocks, vtkm::Id size); + static void GetBlocksAndThreads(vtkm::UInt32& blocks, + vtkm::UInt32& threadsPerBlock, + vtkm::Id size); VTKM_CONT_EXPORT - static void GetGridsAndBlocks(vtkm::UInt32& grid, dim3& blocks, const dim3& size); + static void GetBlocksAndThreads(vtkm::UInt32& blocks, dim3& threadsPerBlock, const dim3& size); public: template @@ -1378,15 +1437,11 @@ public: CheckForErrors(); SetupErrorBuffer(functor); - vtkm::UInt32 grids, blocks; - GetGridsAndBlocks(grids, blocks, numInstances); + vtkm::UInt32 blocks, threadsPerBlock; + GetBlocksAndThreads(blocks, threadsPerBlock, numInstances); - cuda::internal::TaskStrided1DLaunch<<>>(functor, - numInstances); - -#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_1D - parameter_sweep_1d_schedule(functor, numInstances); -#endif + cuda::internal::TaskStrided1DLaunch<<>>( + functor, numInstances); } template @@ -1407,15 +1462,12 @@ public: static_cast(rangeMax[1]), static_cast(rangeMax[2])); - vtkm::UInt32 grids; - dim3 blocks; - GetGridsAndBlocks(grids, blocks, ranges); + vtkm::UInt32 blocks; + dim3 threadsPerBlock; + GetBlocksAndThreads(blocks, threadsPerBlock, ranges); - cuda::internal::TaskStrided3DLaunch<<>>(functor, ranges); - -#ifdef PARAMETER_SWEEP_VTKM_SCHEDULER_3D - parameter_sweep_3d_schedule(functor, rangeMax); -#endif + cuda::internal::TaskStrided3DLaunch<<>>( + functor, ranges); } template diff --git a/vtkm/cont/cuda/internal/TaskTuner.h b/vtkm/cont/cuda/internal/TaskTuner.h deleted file mode 100644 index 608646bc6..000000000 --- a/vtkm/cont/cuda/internal/TaskTuner.h +++ /dev/null @@ -1,211 +0,0 @@ -//============================================================================ -// Copyright (c) Kitware, Inc. -// All rights reserved. -// See LICENSE.txt for details. -// This software is distributed WITHOUT ANY WARRANTY; without even -// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR -// PURPOSE. See the above copyright notice for more information. -// -// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS). -// Copyright 2014 UT-Battelle, LLC. -// Copyright 2014 Los Alamos National Security. -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National -// Laboratory (LANL), the U.S. Government retains certain rights in -// this software. 
-//============================================================================ - -#ifndef vtk_m_cont_cuda_internal_TaskTuner_h -#define vtk_m_cont_cuda_internal_TaskTuner_h - -#include -#include - -#include - -#include -#include -#include -#include - -namespace vtkm -{ -namespace cont -{ -namespace cuda -{ -namespace internal -{ - -vtkm::UInt32 getNumSMs(int dId); - -template -__global__ void TaskStrided1DLaunch(TaskType task, vtkm::Id); -template -__global__ void TaskStrided3DLaunch(TaskType task, dim3 size); - -struct PerfRecord1d -{ - PerfRecord1d(float elapsedT, int g, int b) - : elapsedTime(elapsedT) - , grid(g) - , block(b) - { - } - - bool operator<(const PerfRecord1d& other) const { return elapsedTime < other.elapsedTime; } - - float elapsedTime; - int grid; - int block; -}; - -inline std::ostream& operator<<(std::ostream& os, const PerfRecord1d& record) -{ - os << "TaskStrided1DLaunch<<<" << record.grid << "," << record.block - << ">>> required: " << record.elapsedTime << "\n"; - return os; -} - - -struct PerfRecord3d -{ - PerfRecord3d(float elapsedT, int g, dim3 b) - : elapsedTime(elapsedT) - , grid(g) - , block(b) - { - } - - bool operator<(const PerfRecord3d& other) const { return elapsedTime < other.elapsedTime; } - - float elapsedTime; - int grid; - dim3 block; -}; - -inline std::ostream& operator<<(std::ostream& os, const PerfRecord3d& record) -{ - - os << "TaskStrided3DLaunch<<<" << record.grid << ",(" << record.block.x << "," << record.block.y - << "," << record.block.z << ")>>> required: " << record.elapsedTime << "\n"; - return os; -} - - -template -static void parameter_sweep_1d_schedule(const TaskT& task, const vtkm::Id& numInstances) -{ - std::vector results; - constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 }; - constexpr vtkm::UInt32 blockIndexTable[12] = { 4, 8, 16, 32, 64, 128, - 256, 512, 1024, 2048, 4096, 8192 }; - - int deviceId; - VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda - - for (vtkm::UInt32 g = 0; g < 12; g++) - { - vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId); - for (vtkm::UInt32 b = 0; b < 12; b++) - { - vtkm::UInt32 blocks = blockIndexTable[b]; - - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - TaskStrided1DLaunch<<>>(task, numInstances); - - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - results.emplace_back(elapsedTimeMilliseconds, grids, blocks); - } - } - - std::sort(results.begin(), results.end()); - for (auto&& i : results) - { - std::cout << i << std::endl; - } -} - -template -static void parameter_sweep_3d_schedule(const TaskT& task, const vtkm::Id3& rangeMax) -{ - const dim3 ranges(static_cast(rangeMax[0]), - static_cast(rangeMax[1]), - static_cast(rangeMax[2])); - std::vector results; - - constexpr vtkm::UInt32 gridIndexTable[12] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 }; - constexpr vtkm::UInt32 blockIndexTable[16] = { 1, 2, 4, 8, 12, 16, 20, 24, - 28, 30, 32, 64, 128, 256, 512, 1024 }; - - int deviceId; - for (vtkm::UInt32 g = 0; g < 12; g++) - { - vtkm::UInt32 grids = gridIndexTable[g] * getNumSMs(deviceId); - for (vtkm::UInt32 i = 0; i < 16; i++) - { - for (vtkm::UInt32 j = 0; j < 16; j++) - { - for (vtkm::UInt32 
k = 0; k < 16; k++) - { - cudaEvent_t start, stop; - VTKM_CUDA_CALL(cudaEventCreate(&start)); - VTKM_CUDA_CALL(cudaEventCreate(&stop)); - - dim3 blocks(blockIndexTable[i], blockIndexTable[j], blockIndexTable[k]); - - if ((blocks.x * blocks.y * blocks.z) >= 1024 || (blocks.x * blocks.y * blocks.z) <= 4 || - blocks.z >= 64) - { - //cuda can't handle more than 1024 threads per block - //so don't try if we compute higher than that - - //also don't try stupidly low numbers - - //cuda can't handle more than 64 threads in the z direction - continue; - } - - VTKM_CUDA_CALL(cudaEventRecord(start, cudaStreamPerThread)); - TaskStrided3DLaunch<<>>(task, ranges); - VTKM_CUDA_CALL(cudaEventRecord(stop, cudaStreamPerThread)); - - VTKM_CUDA_CALL(cudaEventSynchronize(stop)); - float elapsedTimeMilliseconds; - VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, start, stop)); - - VTKM_CUDA_CALL(cudaEventDestroy(start)); - VTKM_CUDA_CALL(cudaEventDestroy(stop)); - - results.emplace_back(elapsedTimeMilliseconds, grids, blocks); - } - } - } - } - - std::sort(results.begin(), results.end()); - for (auto&& i : results) - { - std::cout << i << std::endl; - } -} -} -} -} -} - -#endif
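
For reviewers who want to see the new hook in context, below is a minimal usage
sketch of registering custom scheduling values with the API this patch adds. Only
`vtkm::cont::cuda::ScheduleParameters` and `vtkm::cont::cuda::InitScheduleParameters`
come from the patch; the include path and the `main()` scaffolding are illustrative
assumptions, and per the header documentation the registration must happen before
any worklet is invoked.

```cpp
// Usage sketch (not part of the patch): register a custom schedule table from
// application code. The include path and main() scaffolding are assumptions for
// illustration; only ScheduleParameters and InitScheduleParameters are from VTK-m.
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>

namespace
{
// VTK-m calls this once per GPU the first time it needs scheduling parameters.
vtkm::cont::cuda::ScheduleParameters CustomScheduleValues(char const* /*name*/,
                                                          int /*major*/,
                                                          int /*minor*/,
                                                          int multiProcessorCount,
                                                          int /*maxThreadsPerMultiProcessor*/,
                                                          int /*maxThreadsPerBlock*/)
{
  vtkm::cont::cuda::ScheduleParameters params{
    64 * multiProcessorCount, //1d blocks
    64,                       //1d threads per block
    64 * multiProcessorCount, //2d blocks
    { 8, 8, 1 },              //2d threads per block
    64 * multiProcessorCount, //3d blocks
    { 4, 4, 4 }               //3d threads per block
  };
  return params;
}
} // anonymous namespace

int main()
{
  // Must run before any VTK-m worklet is invoked so that the first kernel
  // launch picks up the custom table instead of the built-in presets.
  vtkm::cont::cuda::InitScheduleParameters(&CustomScheduleValues);

  // ... construct data sets and invoke worklets as usual ...
  return 0;
}
```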