//============================================================================ // Copyright (c) Kitware, Inc. // All rights reserved. // See LICENSE.txt for details. // // This software is distributed WITHOUT ANY WARRANTY; without even // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // PURPOSE. See the above copyright notice for more information. //============================================================================ #include "Benchmarker.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VTKM_ENABLE_OPENMP #include #endif namespace { // Parametrize the input size samples for most of the benchmarks // // Define at compile time: // // Being VTKm_BENCHS_RANGE_LOWER_BOUNDARY b0 and, // being VTKm_BENCHS_RANGE_UPPER_BOUNDARY b1 // // This will create the following sample sizes b0, b0*2^3, b0*2^6, ..., b1. // // Notice that setting up VTKm_BENCHS_RANGE_LOWER_BOUNDARY / VTKm_BENCHS_RANGE_UPPER_BOUNDARY // will affect both ShortRange and FullRange. // #ifndef VTKm_BENCHS_RANGE_LOWER_BOUNDARY #define FULL_RANGE_LOWER_BOUNDARY (1 << 12) // 4 KiB #define SHORT_RANGE_LOWER_BOUNDARY (1 << 15) // 32 KiB #else #define FULL_RANGE_LOWER_BOUNDARY (VTKm_BENCHS_RANGE_LOWER_BOUNDARY) #define SHORT_RANGE_LOWER_BOUNDARY (VTKm_BENCHS_RANGE_LOWER_BOUNDARY) #endif #ifndef VTKm_BENCHS_RANGE_UPPER_BOUNDARY #define FULL_RANGE_UPPER_BOUNDARY (1 << 27) // 128 MiB #define SHORT_RANGE_UPPER_BOUNDARY (1 << 27) // 128 MiB #define BITFIELD_TO_UNORDEREDSET_MAX_SAMPLING (1 << 26) // 64 MiB #else #define FULL_RANGE_UPPER_BOUNDARY (VTKm_BENCHS_RANGE_UPPER_BOUNDARY) #define SHORT_RANGE_UPPER_BOUNDARY (VTKm_BENCHS_RANGE_UPPER_BOUNDARY) #define BITFIELD_TO_UNORDEREDSET_MAX_SAMPLING (VTKm_BENCHS_RANGE_UPPER_BOUNDARY) #endif // Default sampling rate is x8 and always includes min/max, // so this will generate 7 samples at: // 1: 4 KiB // 2: 32 KiB // 3: 256 KiB // 4: 2 MiB // 5: 16 MiB // 6: 128 MiB static const std::pair FullRange{ FULL_RANGE_LOWER_BOUNDARY, FULL_RANGE_UPPER_BOUNDARY }; // Smaller range that can be used to reduce the number of benchmarks. Used // with `RangeMultiplier(SmallRangeMultiplier)`, this produces: // 1: 32 KiB // 2: 2 MiB // 3: 128 MiB static const std::pair SmallRange{ SHORT_RANGE_LOWER_BOUNDARY, SHORT_RANGE_UPPER_BOUNDARY }; static constexpr int SmallRangeMultiplier = 1 << 21; // Ensure a sample at 2MiB #ifndef VTKM_ENABLE_KOKKOS using TypeList = vtkm::List>; using SmallTypeList = vtkm::List; #else // Kokkos requires 0 == (sizeof(Kokkos::MinMaxScalar) % sizeof(int) // so removing vtkm::UInt8 using TypeList = vtkm::List>; using SmallTypeList = vtkm::List; #endif // Only 32-bit words are currently supported atomically across devices: using AtomicWordTypes = vtkm::List; // The Fill algorithm uses different word types: using FillWordTypes = vtkm::List; using IdArrayHandle = vtkm::cont::ArrayHandle; // Hold configuration state (e.g. active device) vtkm::cont::InitializeResult Config; // Helper function to convert numBytes to numWords: template vtkm::Id BytesToWords(vtkm::Id numBytes) { const vtkm::Id wordSize = static_cast(sizeof(T)); return numBytes / wordSize; } // Various kernels used by the different benchmarks to accelerate // initialization of data template struct TestValueFunctor { VTKM_EXEC_CONT T operator()(vtkm::Id i) const { return static_cast(i + 10); } }; template VTKM_EXEC_CONT T TestValue(vtkm::Id index) { return TestValueFunctor{}(index); } template struct TestValueFunctor> { VTKM_EXEC_CONT vtkm::Pair operator()(vtkm::Id i) const { return vtkm::make_Pair(TestValue(i), TestValue(i + 1)); } }; template struct TestValueFunctor> { template VTKM_EXEC_CONT vtkm::Vec FillVec(vtkm::Id i, vtkmstd::index_sequence) const { return vtkm::make_Vec(TestValue(i + static_cast(Ns))...); } VTKM_EXEC_CONT vtkm::Vec operator()(vtkm::Id i) const { return FillVec(i, vtkmstd::make_index_sequence(N)>{}); } }; template VTKM_CONT void FillTestValue(ArrayT& array, vtkm::Id numValues) { using T = typename ArrayT::ValueType; vtkm::cont::Algorithm::Copy( vtkm::cont::make_ArrayHandleImplicit(TestValueFunctor{}, numValues), array); } template struct ModuloTestValueFunctor { vtkm::Id Mod; VTKM_EXEC_CONT T operator()(vtkm::Id i) const { return TestValue(i % this->Mod); } }; template VTKM_CONT void FillModuloTestValue(ArrayT& array, vtkm::Id mod, vtkm::Id numValues) { using T = typename ArrayT::ValueType; vtkm::cont::Algorithm::Copy( vtkm::cont::make_ArrayHandleImplicit(ModuloTestValueFunctor{ mod }, numValues), array); } template struct BinaryTestValueFunctor { vtkm::Id Mod; VTKM_EXEC_CONT T operator()(vtkm::Id i) const { T zero = vtkm::TypeTraits::ZeroInitialization(); // Always return zero unless 1 == Mod if (i == this->Mod) { // Ensure that the result is not equal to zero T retVal; do { retVal = TestValue(i++); } while (retVal == zero); return retVal; } return std::move(zero); } }; template VTKM_CONT void FillBinaryTestValue(ArrayT& array, vtkm::Id mod, vtkm::Id numValues) { using T = typename ArrayT::ValueType; vtkm::cont::Algorithm::Copy( vtkm::cont::make_ArrayHandleImplicit(BinaryTestValueFunctor{ mod }, numValues), array); } template VTKM_CONT void FillRandomTestValue(ArrayT& array, vtkm::Id numValues) { using ValueType = typename ArrayT::ValueType; std::mt19937_64 rng; array.Allocate(numValues); auto portal = array.WritePortal(); for (vtkm::Id i = 0; i < portal.GetNumberOfValues(); ++i) { portal.Set(i, TestValue(static_cast(rng()))); } } template VTKM_CONT void FillRandomModTestValue(ArrayT& array, vtkm::Id mod, vtkm::Id numValues) { using ValueType = typename ArrayT::ValueType; std::mt19937_64 rng; array.Allocate(numValues); auto portal = array.WritePortal(); for (vtkm::Id i = 0; i < portal.GetNumberOfValues(); ++i) { portal.Set(i, TestValue(static_cast(rng()) % mod)); } } static inline std::string SizeAndValuesString(vtkm::Id numBytes, vtkm::Id numValues) { std::ostringstream str; str << vtkm::cont::GetHumanReadableSize(numBytes) << " | " << numValues << " values"; return str.str(); } template struct GenerateBitFieldWorklet : public vtkm::worklet::WorkletMapField { using ControlSignature = void(FieldIn dummy, BitFieldOut); using ExecutionSignature = void(InputIndex, _2); WordType Exemplar; vtkm::Id Stride; vtkm::Id MaxMaskedWord; VTKM_CONT GenerateBitFieldWorklet(WordType exemplar, vtkm::Id stride, vtkm::Id maxMaskedWord) : Exemplar(exemplar) , Stride(stride) , MaxMaskedWord(maxMaskedWord) { } template VTKM_EXEC void operator()(vtkm::Id wordIdx, BitPortal& portal) const { if (wordIdx <= this->MaxMaskedWord && (wordIdx % this->Stride) == 0) { portal.SetWordAtomic(wordIdx, this->Exemplar); } else { portal.SetWordAtomic(wordIdx, static_cast(0)); } } }; // Create a bit field for testing. The bit array will contain numWords words. // The exemplar word is used to set bits in the array. Stride indicates how // many words will be set to 0 between words initialized to the exemplar. // Words with indices higher than maxMaskedWord will be set to 0. // Stride and maxMaskedWord may be used to test different types of imbalanced // loads. template VTKM_CONT vtkm::cont::BitField GenerateBitField(WordType exemplar, vtkm::Id stride, vtkm::Id maxMaskedWord, vtkm::Id numWords) { if (stride == 0) { stride = 1; } vtkm::Id numBits = numWords * static_cast(sizeof(WordType) * CHAR_BIT); vtkm::cont::BitField bits; bits.Allocate(numBits); // This array is just to set the input domain appropriately: auto dummy = vtkm::cont::make_ArrayHandleConstant(0, numWords); vtkm::cont::Invoker invoker{ Config.Device }; invoker(GenerateBitFieldWorklet{ exemplar, stride, maxMaskedWord }, dummy, bits); return bits; }; //============================================================================== // Benchmarks begin: template void BenchBitFieldToUnorderedSetImpl(benchmark::State& state, vtkm::Id numBytes, WordType exemplar, vtkm::Id stride, vtkm::Float32 fillRatio, const std::string& name) { const vtkm::Id numWords = BytesToWords(numBytes); const vtkm::Id maxMaskedWord = static_cast(static_cast(numWords) * fillRatio); { // Set label: const vtkm::Id numFilledWords = maxMaskedWord / stride; const vtkm::Id numSetBits = numFilledWords * vtkm::CountSetBits(exemplar); std::stringstream desc; desc << vtkm::cont::GetHumanReadableSize(numBytes) << " | " << name << " | " << "SetBits:" << numSetBits; state.SetLabel(desc.str()); } vtkm::cont::BitField bits = GenerateBitField(exemplar, stride, maxMaskedWord, numWords); IdArrayHandle indices; vtkm::cont::Timer timer{ Config.Device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::BitFieldToUnorderedSet(Config.Device, bits, indices); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); }; void BenchBitFieldToUnorderedSet(benchmark::State& state) { using WordType = vtkm::WordTypeDefault; const vtkm::Id numBytes = static_cast(state.range(0)); const auto fillPattern = state.range(1); // Launch the implementation with the appropriate fill pattern: switch (fillPattern) { case 0: BenchBitFieldToUnorderedSetImpl(state, numBytes, 0x00000000, 1, 0.f, "Null"); break; case 1: BenchBitFieldToUnorderedSetImpl(state, numBytes, 0xffffffff, 1, 1.f, "Full"); break; case 2: BenchBitFieldToUnorderedSetImpl(state, numBytes, 0xffff0000, 1, 0.f, "HalfWord"); break; case 3: BenchBitFieldToUnorderedSetImpl(state, numBytes, 0xffffffff, 1, 0.5f, "HalfField"); break; case 4: BenchBitFieldToUnorderedSetImpl(state, numBytes, 0xffffffff, 2, 1.f, "AltWords"); break; case 5: BenchBitFieldToUnorderedSetImpl(state, numBytes, 0x55555555, 1, 1.f, "AltBits"); break; default: VTKM_UNREACHABLE("Internal error."); } } void BenchBitFieldToUnorderedSetGenerator(benchmark::internal::Benchmark* bm) { // Use a reduced NUM_BYTES_MAX value here -- these benchmarks allocate one // 8-byte id per bit, so this caps the index array out at 512 MB: static int64_t numBytesMax = std::min(1 << 29, BITFIELD_TO_UNORDEREDSET_MAX_SAMPLING); bm->UseManualTime(); bm->ArgNames({ "Size", "C" }); for (int64_t config = 0; config < 6; ++config) { bm->Ranges({ { FullRange.first, numBytesMax }, { config, config } }); } } VTKM_BENCHMARK_APPLY(BenchBitFieldToUnorderedSet, BenchBitFieldToUnorderedSetGenerator); template void BenchCopy(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle src; vtkm::cont::ArrayHandle dst; FillTestValue(src, numValues); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::Copy(device, src, dst); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchCopy, ->Ranges({ FullRange })->ArgName("Size"), TypeList); template void BenchCopyIf(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); const vtkm::Id percentValid = static_cast(state.range(1)); const vtkm::Id numValid = (numValues * percentValid) / 100; const vtkm::Id modulo = numValid != 0 ? numValues / numValid : numValues + 1; { std::ostringstream desc; desc << SizeAndValuesString(numBytes, numValues) << " | " << numValid << " valid (" << (numValid * 100 / numValues) << "%)"; state.SetLabel(desc.str()); } vtkm::cont::ArrayHandle src; vtkm::cont::ArrayHandle stencil; vtkm::cont::ArrayHandle dst; FillTestValue(src, numValues); FillBinaryTestValue(stencil, modulo, numValues); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::CopyIf(device, src, stencil, dst); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; void BenchCopyIfGenerator(benchmark::internal::Benchmark* bm) { bm->ArgNames({ "Size", "%Valid" }); bm->RangeMultiplier(SmallRangeMultiplier); for (int64_t pcntValid = 0; pcntValid <= 100; pcntValid += 25) { bm->Ranges({ SmallRange, { pcntValid, pcntValid } }); } } VTKM_BENCHMARK_TEMPLATES_APPLY(BenchCopyIf, BenchCopyIfGenerator, SmallTypeList); template void BenchCountSetBitsImpl(benchmark::State& state, vtkm::Id numBytes, WordType exemplar, vtkm::Id stride, vtkm::Float32 fillRatio, const std::string& name) { const vtkm::Id numWords = BytesToWords(numBytes); const vtkm::Id maxMaskedWord = static_cast(static_cast(numWords) * fillRatio); { // Set label: const vtkm::Id numFilledWords = maxMaskedWord / stride; const vtkm::Id numSetBits = numFilledWords * vtkm::CountSetBits(exemplar); std::stringstream desc; desc << vtkm::cont::GetHumanReadableSize(numBytes) << " | " << name << " | " << "SetBits:" << numSetBits; state.SetLabel(desc.str()); } vtkm::cont::BitField bits = GenerateBitField(exemplar, stride, maxMaskedWord, numWords); vtkm::cont::Timer timer{ Config.Device }; for (auto _ : state) { (void)_; timer.Start(); const vtkm::Id setBits = vtkm::cont::Algorithm::CountSetBits(Config.Device, bits); benchmark::DoNotOptimize(setBits); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); }; void BenchCountSetBits(benchmark::State& state) { using WordType = vtkm::WordTypeDefault; const vtkm::Id numBytes = static_cast(state.range(0)); const auto fillPattern = state.range(1); // Launch the implementation with the appropriate fill pattern: switch (fillPattern) { case 0: BenchCountSetBitsImpl(state, numBytes, 0x00000000, 1, 0.f, "Null"); break; case 1: BenchCountSetBitsImpl(state, numBytes, 0xffffffff, 1, 1.f, "Full"); break; case 2: BenchCountSetBitsImpl(state, numBytes, 0xffff0000, 1, 0.f, "HalfWord"); break; case 3: BenchCountSetBitsImpl(state, numBytes, 0xffffffff, 1, 0.5f, "HalfField"); break; case 4: BenchCountSetBitsImpl(state, numBytes, 0xffffffff, 2, 1.f, "AltWords"); break; case 5: BenchCountSetBitsImpl(state, numBytes, 0x55555555, 1, 1.f, "AltBits"); break; default: VTKM_UNREACHABLE("Internal error."); } } void BenchCountSetBitsGenerator(benchmark::internal::Benchmark* bm) { bm->UseManualTime(); bm->ArgNames({ "Size", "C" }); for (int64_t config = 0; config < 6; ++config) { bm->Ranges({ { FullRange.first, FullRange.second }, { config, config } }); } } VTKM_BENCHMARK_APPLY(BenchCountSetBits, BenchCountSetBitsGenerator); template void BenchFillArrayHandle(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle array; vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::Fill(device, array, TestValue(19), numValues); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFillArrayHandle, ->Range(FullRange.first, FullRange.second) ->ArgName("Size"), TypeList); void BenchFillBitFieldBool(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numBits = numBytes * CHAR_BIT; const bool value = state.range(1) != 0; state.SetLabel(vtkm::cont::GetHumanReadableSize(numBytes)); vtkm::cont::BitField bits; vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::Fill(device, bits, value, numBits); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); }; VTKM_BENCHMARK_OPTS(BenchFillBitFieldBool, ->Ranges({ { FullRange.first, FullRange.second }, { 0, 1 } }) ->ArgNames({ "Size", "Val" })); template void BenchFillBitFieldMask(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numBits = numBytes * CHAR_BIT; const WordType mask = static_cast(0x1); state.SetLabel(vtkm::cont::GetHumanReadableSize(numBytes)); vtkm::cont::BitField bits; vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::Fill(device, bits, mask, numBits); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchFillBitFieldMask, ->Range(FullRange.first, FullRange.second) ->ArgName("Size"), FillWordTypes); template void BenchLowerBounds(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numValuesBytes = static_cast(state.range(0)); const vtkm::Id numInputsBytes = static_cast(state.range(1)); const vtkm::Id numValues = BytesToWords(numValuesBytes); const vtkm::Id numInputs = BytesToWords(numInputsBytes); { std::ostringstream desc; desc << SizeAndValuesString(numValuesBytes, numValues) << " | " << numInputs << " lookups"; state.SetLabel(desc.str()); } vtkm::cont::ArrayHandle input; vtkm::cont::ArrayHandle output; vtkm::cont::ArrayHandle values; FillRandomTestValue(input, numInputs); FillRandomTestValue(values, numValues); vtkm::cont::Algorithm::Sort(device, values); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::LowerBounds(device, input, values, output); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchLowerBounds, ->RangeMultiplier(SmallRangeMultiplier) ->Ranges({ SmallRange, SmallRange }) ->ArgNames({ "Size", "InputSize" }), TypeList); template void BenchReduce(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle array; FillTestValue(array, numValues); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); auto result = vtkm::cont::Algorithm::Reduce( device, array, vtkm::TypeTraits::ZeroInitialization()); benchmark::DoNotOptimize(result); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchReduce, ->Range(FullRange.first, FullRange.second) ->ArgName("Size"), TypeList); template void BenchReduceByKey(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); const vtkm::Id percentKeys = static_cast(state.range(1)); const vtkm::Id numKeys = std::max((numValues * percentKeys) / 100, vtkm::Id{ 1 }); { std::ostringstream desc; desc << SizeAndValuesString(numBytes, numValues) << " | " << numKeys << " (" << ((numKeys * 100) / numValues) << "%) unique"; state.SetLabel(desc.str()); } vtkm::cont::ArrayHandle valuesIn; vtkm::cont::ArrayHandle valuesOut; vtkm::cont::ArrayHandle keysIn; vtkm::cont::ArrayHandle keysOut; FillTestValue(valuesIn, numValues); FillModuloTestValue(keysIn, numKeys, numValues); vtkm::cont::Algorithm::Sort(device, keysIn); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::ReduceByKey(device, keysIn, valuesIn, keysOut, valuesOut, vtkm::Add{}); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; void BenchReduceByKeyGenerator(benchmark::internal::Benchmark* bm) { bm->RangeMultiplier(SmallRangeMultiplier); bm->ArgNames({ "Size", "%Keys" }); for (int64_t pcntKeys = 0; pcntKeys <= 100; pcntKeys += 25) { bm->Ranges({ SmallRange, { pcntKeys, pcntKeys } }); } } VTKM_BENCHMARK_TEMPLATES_APPLY(BenchReduceByKey, BenchReduceByKeyGenerator, SmallTypeList); template void BenchScanExclusive(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle src; vtkm::cont::ArrayHandle dst; FillTestValue(src, numValues); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::ScanExclusive(device, src, dst); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchScanExclusive, ->Range(FullRange.first, FullRange.second) ->ArgName("Size"), TypeList); template void BenchScanExtended(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle src; vtkm::cont::ArrayHandle dst; FillTestValue(src, numValues); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::ScanExtended(device, src, dst); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchScanExtended, ->Range(FullRange.first, FullRange.second) ->ArgName("Size"), TypeList); template void BenchScanInclusive(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle src; vtkm::cont::ArrayHandle dst; FillTestValue(src, numValues); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::ScanInclusive(device, src, dst); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchScanInclusive, ->Range(FullRange.first, FullRange.second) ->ArgName("Size"), TypeList); template void BenchSort(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle unsorted; FillRandomTestValue(unsorted, numValues); vtkm::cont::ArrayHandle array; vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; // Reset the array to the unsorted state: vtkm::cont::Algorithm::Copy(device, unsorted, array); timer.Start(); vtkm::cont::Algorithm::Sort(array); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchSort, ->Range(FullRange.first, FullRange.second) ->ArgName("Size"), TypeList); template void BenchSortByKey(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); const vtkm::Id percentKeys = static_cast(state.range(1)); const vtkm::Id numKeys = std::max((numValues * percentKeys) / 100, vtkm::Id{ 1 }); { std::ostringstream desc; desc << SizeAndValuesString(numBytes, numValues) << " | " << numKeys << " (" << ((numKeys * 100) / numValues) << "%) keys"; state.SetLabel(desc.str()); } vtkm::cont::ArrayHandle valuesUnsorted; vtkm::cont::ArrayHandle values; vtkm::cont::ArrayHandle keysUnsorted; vtkm::cont::ArrayHandle keys; FillRandomTestValue(valuesUnsorted, numValues); FillModuloTestValue(keysUnsorted, numKeys, numValues); vtkm::cont::Algorithm::Sort(device, keysUnsorted); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; vtkm::cont::Algorithm::Copy(device, keysUnsorted, keys); vtkm::cont::Algorithm::Copy(device, valuesUnsorted, values); timer.Start(); vtkm::cont::Algorithm::SortByKey(device, keys, values); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; void BenchSortByKeyGenerator(benchmark::internal::Benchmark* bm) { bm->RangeMultiplier(SmallRangeMultiplier); bm->ArgNames({ "Size", "%Keys" }); for (int64_t pcntKeys = 0; pcntKeys <= 100; pcntKeys += 25) { bm->Ranges({ SmallRange, { pcntKeys, pcntKeys } }); } } VTKM_BENCHMARK_TEMPLATES_APPLY(BenchSortByKey, BenchSortByKeyGenerator, SmallTypeList); template void BenchStableSortIndices(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); state.SetLabel(SizeAndValuesString(numBytes, numValues)); vtkm::cont::ArrayHandle values; FillRandomTestValue(values, numValues); vtkm::cont::ArrayHandle indices; vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; // Reset the indices array: vtkm::cont::Algorithm::Copy(device, vtkm::cont::make_ArrayHandleIndex(numValues), indices); timer.Start(); vtkm::worklet::StableSortIndices::Sort(device, values, indices); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchStableSortIndices, ->Range(SmallRange.first, SmallRange.second) ->ArgName("Size"), TypeList); template void BenchStableSortIndicesUnique(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); const vtkm::Id percentUnique = static_cast(state.range(1)); const vtkm::Id numUnique = std::max((numValues * percentUnique) / 100, vtkm::Id{ 1 }); { std::ostringstream desc; desc << SizeAndValuesString(numBytes, numValues) << " | " << numUnique << " (" << ((numUnique * 100) / numValues) << "%) unique"; state.SetLabel(desc.str()); } vtkm::cont::ArrayHandle values; FillRandomModTestValue(values, numUnique, numValues); // Prepare IndicesOrig to contain the sorted, non-unique index map: const vtkm::cont::ArrayHandle indicesOrig = vtkm::worklet::StableSortIndices::Sort(device, values); // Working memory: vtkm::cont::ArrayHandle indices; vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; // Reset the indices array: vtkm::cont::Algorithm::Copy(device, indicesOrig, indices); timer.Start(); vtkm::worklet::StableSortIndices::Unique(device, values, indices); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; void BenchmarkStableSortIndicesUniqueGenerator(benchmark::internal::Benchmark* bm) { bm->RangeMultiplier(SmallRangeMultiplier); bm->ArgNames({ "Size", "%Uniq" }); for (int64_t pcntUnique = 0; pcntUnique <= 100; pcntUnique += 25) { // Cap the max size here at 2 MiB. This sort is too slow. const int64_t maxSize = 1 << 21; bm->Ranges( { { SmallRange.first, std::min(maxSize, SmallRange.second) }, { pcntUnique, pcntUnique } }); } } VTKM_BENCHMARK_TEMPLATES_APPLY(BenchStableSortIndicesUnique, BenchmarkStableSortIndicesUniqueGenerator, SmallTypeList); template void BenchUnique(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numBytes = static_cast(state.range(0)); const vtkm::Id numValues = BytesToWords(numBytes); const vtkm::Id percentUnique = static_cast(state.range(1)); const vtkm::Id numUnique = std::max((numValues * percentUnique) / 100, vtkm::Id{ 1 }); { std::ostringstream desc; desc << SizeAndValuesString(numBytes, numValues) << " | " << numUnique << " (" << ((numUnique * 100) / numValues) << "%) unique"; state.SetLabel(desc.str()); } vtkm::cont::ArrayHandle valuesOrig; FillRandomModTestValue(valuesOrig, numUnique, numValues); // Presort the input: vtkm::cont::Algorithm::Sort(device, valuesOrig); vtkm::cont::ArrayHandle values; vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; // Make a working copy of the input: vtkm::cont::Algorithm::Copy(device, valuesOrig, values); timer.Start(); vtkm::cont::Algorithm::Unique(device, values); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetBytesProcessed(static_cast(numBytes) * iterations); state.SetItemsProcessed(static_cast(numValues) * iterations); }; void BenchmarkUniqueGenerator(benchmark::internal::Benchmark* bm) { bm->RangeMultiplier(SmallRangeMultiplier); bm->ArgNames({ "Size", "%Uniq" }); for (int64_t pcntUnique = 0; pcntUnique <= 100; pcntUnique += 25) { bm->Ranges({ SmallRange, { pcntUnique, pcntUnique } }); } } VTKM_BENCHMARK_TEMPLATES_APPLY(BenchUnique, BenchmarkUniqueGenerator, SmallTypeList); template void BenchUpperBounds(benchmark::State& state) { const vtkm::cont::DeviceAdapterId device = Config.Device; const vtkm::Id numValuesBytes = static_cast(state.range(0)); const vtkm::Id numInputsBytes = static_cast(state.range(1)); const vtkm::Id numValues = BytesToWords(numValuesBytes); const vtkm::Id numInputs = BytesToWords(numInputsBytes); { std::ostringstream desc; desc << SizeAndValuesString(numValuesBytes, numValues) << " | " << numInputs << " lookups"; state.SetLabel(desc.str()); } vtkm::cont::ArrayHandle input; vtkm::cont::ArrayHandle output; vtkm::cont::ArrayHandle values; FillRandomTestValue(input, numInputs); FillRandomTestValue(values, numValues); vtkm::cont::Algorithm::Sort(device, values); vtkm::cont::Timer timer{ device }; for (auto _ : state) { (void)_; timer.Start(); vtkm::cont::Algorithm::UpperBounds(device, input, values, output); timer.Stop(); state.SetIterationTime(timer.GetElapsedTime()); } const int64_t iterations = static_cast(state.iterations()); state.SetItemsProcessed(static_cast(numInputs) * iterations); }; VTKM_BENCHMARK_TEMPLATES_OPTS(BenchUpperBounds, ->RangeMultiplier(SmallRangeMultiplier) ->Ranges({ SmallRange, SmallRange }) ->ArgNames({ "Size", "InputSize" }), SmallTypeList); } // end anon namespace int main(int argc, char* argv[]) { auto opts = vtkm::cont::InitializeOptions::RequireDevice; std::vector args(argv, argv + argc); vtkm::bench::detail::InitializeArgs(&argc, args, opts); // Parse VTK-m options: Config = vtkm::cont::Initialize(argc, args.data(), opts); // This occurs when it is help if (opts == vtkm::cont::InitializeOptions::None) { std::cout << Config.Usage << std::endl; } else { vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device); } // handle benchmarking related args and run benchmarks: VTKM_EXECUTE_BENCHMARKS(argc, args.data()); }