vtk-m/benchmarking/BenchmarkAtomicArray.cxx
Kenneth Moreland 28ecf3636d Change interface of atomic compare and swap
The old atomic compare and swap operations (`vtkm::AtomicCompareAndSwap`
and `vtkm::exec::AtomicArrayExecutionObject::CompareAndSwap`) had an
order of arguments that was confusing. The order of the arguments was
shared pointer (or index), desired value, expected value. Most people
probably assume expected value comes before desired value. And this
order conflicts with the order in the `std` methods, GCC atomics, and
Kokkos.

Change the interface of atomic operations to be patterned off the
`std::atomic_compare_exchange` and `std::atomic<T>::compare_exchange`
methods. First, these methods have a more intuitive order of parameters
(shared pointer, expected, desired). Second, rather than take a value
for the expected and return the actual old value, they take a pointer to
the expected value (or reference in `AtomicArrayExecutionObject`) and
modify this value in the case that it does not match the actual value.
This makes it harder to mix up the expected and desired parameters.
Also, because the methods return a bool indicating whether the value was
changed, there is an additional benefit that compare-exchange loops are
implemented easier.

For example, consider you want to apply the function `MyOp` on a
`sharedValue` atomically. With the old interface, you would have to do
something like this.

```cpp
T oldValue;
T newValue;
do
{
  oldValue = *sharedValue;
  newValue = MyOp(oldValue);
} while (vtkm::AtomicCompareAndSwap(sharedValue, newValue, oldValue) != oldValue);
```

With the new interface, this is simplified to this.

```cpp
T oldValue = *sharedValue;
while (!vtkm::AtomicCompareExchange(sharedValue, &oldValue, MyOp(oldValue)));
```
2020-10-20 08:39:22 -06:00

522 lines
18 KiB
C++

//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
//
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//============================================================================
#include "Benchmarker.h"
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/AtomicArray.h>
#include <vtkm/cont/DeviceAdapterTag.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/Invoker.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/worklet/WorkletMapField.h>
#include <vtkm/TypeTraits.h>
#include <sstream>
#include <string>
namespace
{
// Provide access to the requested device to the benchmark functions:
vtkm::cont::InitializeResult Config;
// Range for array sizes
static constexpr vtkm::Id ARRAY_SIZE_MIN = 1;
static constexpr vtkm::Id ARRAY_SIZE_MAX = 1 << 20;
// This is 32x larger than the largest array size.
static constexpr vtkm::Id NUM_WRITES = 33554432; // 2^25
static constexpr vtkm::Id STRIDE = 32;
// Worklet for the sequential atomic-add benchmark. Invocation `i` adds its
// input value to entry `i % size` of the atomic array, so neighbouring work
// indices hit neighbouring entries (wrapping around at the end of the array).
struct AddSeqWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, AtomicArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  template <typename ValueT, typename PortalT>
  VTKM_EXEC void operator()(const vtkm::Id i, const ValueT& val, PortalT& portal) const
  {
    const vtkm::Id slot = i % portal.GetNumberOfValues();
    portal.Add(slot, val);
  }
};
// Times AddSeqWorker on the device selected in Config.
//
// state.range(0): number of entries in the atomic array.
// state.range(1): number of worklet invocations (one atomic add each).
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchAddSeq(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));

  // Constant input of ones; its length determines how many adds are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> atomicArray;
  vtkm::cont::Algorithm::Fill(
    atomicArray, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(AddSeqWorker{}, ones, atomicArray);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchAddSeq with the type list vtkm::cont::AtomicArrayTypeList.
// Argument 0 ("AtomicsValues", read as the array size) ranges from
// ARRAY_SIZE_MIN to ARRAY_SIZE_MAX; argument 1 ("AtomicOps", read as the
// number of writes) is pinned to NUM_WRITES.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(BenchAddSeq,
->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX },
{ NUM_WRITES, NUM_WRITES } })
->ArgNames({ "AtomicsValues", "AtomicOps" }),
vtkm::cont::AtomicArrayTypeList);
// Baseline counterpart of AddSeqWorker: the same `i % size` addressing, but
// the update is a plain Get followed by a Set on a whole-array portal rather
// than an atomic add.
struct AddSeqBaselineWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, WholeArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  template <typename ValueT, typename PortalT>
  VTKM_EXEC void operator()(const vtkm::Id i, const ValueT& val, PortalT& portal) const
  {
    const vtkm::Id slot = i % portal.GetNumberOfValues();
    const auto updated = portal.Get(slot) + val;
    portal.Set(slot, updated);
  }
};
// Times AddSeqBaselineWorker (plain read-modify-write) on the device selected
// in Config, as the reference point for BenchAddSeq.
//
// state.range(0): number of entries in the target array.
// state.range(1): number of worklet invocations (one update each).
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchAddSeqBaseline(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));

  // Constant input of ones; its length determines how many updates are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> array;
  vtkm::cont::Algorithm::Fill(array, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(AddSeqBaselineWorker{}, ones, array);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchAddSeqBaseline with vtkm::cont::AtomicArrayTypeList,
// using the same argument ranges as BenchAddSeq: argument 0 ("Values") ranges
// from ARRAY_SIZE_MIN to ARRAY_SIZE_MAX, argument 1 ("Ops") is pinned to
// NUM_WRITES.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(BenchAddSeqBaseline,
->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX },
{ NUM_WRITES, NUM_WRITES } })
->ArgNames({ "Values", "Ops" }),
vtkm::cont::AtomicArrayTypeList);
// Worklet for the strided atomic-add benchmark. Invocation `i` adds its input
// value to entry
//   (floor(i / stride) + stride * (i % stride)) % size
// of the atomic array.
struct AddStrideWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, AtomicArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  vtkm::Id Stride;

  AddStrideWorker(vtkm::Id stride)
    : Stride(stride)
  {
  }

  template <typename ValueT, typename PortalT>
  VTKM_EXEC void operator()(const vtkm::Id i, const ValueT& val, PortalT& portal) const
  {
    const vtkm::Id arraySize = portal.GetNumberOfValues();
    const vtkm::Id quotient = i / this->Stride;
    const vtkm::Id remainder = i % this->Stride;
    const vtkm::Id slot = (quotient + this->Stride * remainder) % arraySize;
    portal.Add(slot, val);
  }
};
// Times AddStrideWorker on the device selected in Config.
//
// state.range(0): number of entries in the atomic array.
// state.range(1): number of worklet invocations (one atomic add each).
// state.range(2): stride passed to the worklet.
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchAddStride(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));
  const vtkm::Id stride = static_cast<vtkm::Id>(state.range(2));

  // Constant input of ones; its length determines how many adds are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> atomicArray;
  vtkm::cont::Algorithm::Fill(
    atomicArray, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(AddStrideWorker{ stride }, ones, atomicArray);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchAddStride with vtkm::cont::AtomicArrayTypeList.
// Argument 0 ("AtomicsValues") ranges from ARRAY_SIZE_MIN to ARRAY_SIZE_MAX;
// argument 1 ("AtomicOps") is pinned to NUM_WRITES and argument 2 ("Stride")
// to STRIDE.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(
BenchAddStride,
->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX }, { NUM_WRITES, NUM_WRITES }, { STRIDE, STRIDE } })
->ArgNames({ "AtomicsValues", "AtomicOps", "Stride" }),
vtkm::cont::AtomicArrayTypeList);
// Baseline counterpart of AddStrideWorker: identical strided addressing,
//   (floor(i / stride) + stride * (i % stride)) % size,
// but the update is a plain Get followed by a Set on a whole-array portal.
struct AddStrideBaselineWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, WholeArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  vtkm::Id Stride;

  AddStrideBaselineWorker(vtkm::Id stride)
    : Stride(stride)
  {
  }

  template <typename ValueT, typename PortalT>
  VTKM_EXEC void operator()(const vtkm::Id i, const ValueT& val, PortalT& portal) const
  {
    const vtkm::Id arraySize = portal.GetNumberOfValues();
    const vtkm::Id quotient = i / this->Stride;
    const vtkm::Id remainder = i % this->Stride;
    const vtkm::Id slot = (quotient + this->Stride * remainder) % arraySize;
    const auto updated = portal.Get(slot) + val;
    portal.Set(slot, updated);
  }
};
// Times AddStrideBaselineWorker (plain read-modify-write) on the device
// selected in Config, as the reference point for BenchAddStride.
//
// state.range(0): number of entries in the target array.
// state.range(1): number of worklet invocations (one update each).
// state.range(2): stride passed to the worklet.
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchAddStrideBaseline(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));
  const vtkm::Id stride = static_cast<vtkm::Id>(state.range(2));

  // Constant input of ones; its length determines how many updates are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> array;
  vtkm::cont::Algorithm::Fill(array, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(AddStrideBaselineWorker{ stride }, ones, array);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchAddStrideBaseline with vtkm::cont::AtomicArrayTypeList,
// using the same argument ranges as BenchAddStride: argument 0 ("Values")
// ranges from ARRAY_SIZE_MIN to ARRAY_SIZE_MAX, argument 1 ("Ops") is pinned
// to NUM_WRITES and argument 2 ("Stride") to STRIDE.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(
BenchAddStrideBaseline,
->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX }, { NUM_WRITES, NUM_WRITES }, { STRIDE, STRIDE } })
->ArgNames({ "Values", "Ops", "Stride" }),
vtkm::cont::AtomicArrayTypeList);
// Worklet for the sequential compare-exchange benchmark. Invocation `i`
// targets entry `i % size` and keeps retrying CompareExchange until it has
// replaced the entry with (current value + i + input value).
struct CASSeqWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, AtomicArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  template <typename ValueT, typename PortalT>
  VTKM_EXEC void operator()(const vtkm::Id i, const ValueT& in, PortalT& portal) const
  {
    const vtkm::Id slot = i % portal.GetNumberOfValues();
    const ValueT delta = static_cast<ValueT>(i) + in;
    ValueT expected = portal.Get(slot);

    // `expected` is passed by address so that each retry recomputes the
    // desired value from whatever CompareExchange left in it.
    bool stored = false;
    do
    {
      stored = portal.CompareExchange(slot, &expected, expected + delta);
    } while (!stored);
  }
};
// Times CASSeqWorker on the device selected in Config.
//
// state.range(0): number of entries in the atomic array.
// state.range(1): number of worklet invocations (one compare-exchange loop each).
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchCASSeq(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));

  // Constant input of ones; its length determines how many updates are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> atomicArray;
  vtkm::cont::Algorithm::Fill(
    atomicArray, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(CASSeqWorker{}, ones, atomicArray);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchCASSeq with vtkm::cont::AtomicArrayTypeList.
// Argument 0 ("AtomicsValues", read as the array size) ranges from
// ARRAY_SIZE_MIN to ARRAY_SIZE_MAX; argument 1 ("AtomicOps", read as the
// number of writes) is pinned to NUM_WRITES.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(BenchCASSeq,
->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX },
{ NUM_WRITES, NUM_WRITES } })
->ArgNames({ "AtomicsValues", "AtomicOps" }),
vtkm::cont::AtomicArrayTypeList);
// Baseline counterpart of CASSeqWorker: the same `i % size` addressing and the
// same increment (i + input value), applied with a single plain Get/Set pair
// on a whole-array portal instead of a compare-exchange loop.
struct CASSeqBaselineWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, WholeArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  template <typename ValueT, typename PortalT>
  VTKM_EXEC void operator()(const vtkm::Id i, const ValueT& in, PortalT& portal) const
  {
    const vtkm::Id slot = i % portal.GetNumberOfValues();
    const ValueT delta = static_cast<ValueT>(i) + in;
    const ValueT current = portal.Get(slot);
    portal.Set(slot, current + delta);
  }
};
// Times CASSeqBaselineWorker (plain read-modify-write) on the device selected
// in Config, as the reference point for BenchCASSeq.
//
// state.range(0): number of entries in the target array.
// state.range(1): number of worklet invocations (one update each).
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchCASSeqBaseline(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));

  // Constant input of ones; its length determines how many updates are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> array;
  vtkm::cont::Algorithm::Fill(array, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(CASSeqBaselineWorker{}, ones, array);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchCASSeqBaseline with vtkm::cont::AtomicArrayTypeList,
// using the same argument ranges as BenchCASSeq: argument 0 ("Values") ranges
// from ARRAY_SIZE_MIN to ARRAY_SIZE_MAX, argument 1 ("Ops") is pinned to
// NUM_WRITES.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(BenchCASSeqBaseline,
->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX },
{ NUM_WRITES, NUM_WRITES } })
->ArgNames({ "Values", "Ops" }),
vtkm::cont::AtomicArrayTypeList);
// Worklet for the strided compare-exchange benchmark. Invocation `i` targets
// entry
//   (floor(i / stride) + stride * (i % stride)) % size
// and keeps retrying CompareExchange until it has replaced the entry with
// (current value + i + input value).
struct CASStrideWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, AtomicArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  vtkm::Id Stride;

  CASStrideWorker(vtkm::Id stride)
    : Stride(stride)
  {
  }

  template <typename ValueT, typename PortalT>
  VTKM_EXEC void operator()(const vtkm::Id i, const ValueT& in, PortalT& portal) const
  {
    const vtkm::Id arraySize = portal.GetNumberOfValues();
    const vtkm::Id quotient = i / this->Stride;
    const vtkm::Id remainder = i % this->Stride;
    const vtkm::Id slot = (quotient + this->Stride * remainder) % arraySize;
    const ValueT delta = static_cast<ValueT>(i) + in;
    ValueT expected = portal.Get(slot);

    // `expected` is passed by address so that each retry recomputes the
    // desired value from whatever CompareExchange left in it.
    bool stored = false;
    do
    {
      stored = portal.CompareExchange(slot, &expected, expected + delta);
    } while (!stored);
  }
};
// Times CASStrideWorker on the device selected in Config.
//
// state.range(0): number of entries in the atomic array.
// state.range(1): number of worklet invocations (one compare-exchange loop each).
// state.range(2): stride passed to the worklet.
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchCASStride(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));
  const vtkm::Id stride = static_cast<vtkm::Id>(state.range(2));

  // Constant input of ones; its length determines how many updates are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> atomicArray;
  vtkm::cont::Algorithm::Fill(
    atomicArray, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(CASStrideWorker{ stride }, ones, atomicArray);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchCASStride with vtkm::cont::AtomicArrayTypeList.
// Argument 0 ("AtomicsValues") ranges from ARRAY_SIZE_MIN to ARRAY_SIZE_MAX;
// argument 1 ("AtomicOps") is pinned to NUM_WRITES and argument 2 ("Stride")
// to STRIDE.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(
BenchCASStride,
->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX }, { NUM_WRITES, NUM_WRITES }, { STRIDE, STRIDE } })
->ArgNames({ "AtomicsValues", "AtomicOps", "Stride" }),
vtkm::cont::AtomicArrayTypeList);
// Non-atomic baseline for CASStride: identical strided addressing,
//   (floor(i / stride) + stride * (i % stride)) % size,
// and the same increment (i + input value), applied with one plain Get/Set
// pair instead of a compare-exchange loop.
//
// The target array is bound as WholeArrayInOut, like the other baseline
// workers in this file, so Get/Set go through an ordinary array portal and
// not through the atomic array interface.
struct CASStrideBaselineWorker : public vtkm::worklet::WorkletMapField
{
  using ControlSignature = void(FieldIn, WholeArrayInOut);
  using ExecutionSignature = void(InputIndex, _1, _2);

  vtkm::Id Stride;

  CASStrideBaselineWorker(vtkm::Id stride)
    : Stride{ stride }
  {
  }

  template <typename T, typename Portal>
  VTKM_EXEC void operator()(const vtkm::Id i, const T& in, Portal& portal) const
  {
    const vtkm::Id numVals = portal.GetNumberOfValues();
    const vtkm::Id idx = (i / this->Stride + this->Stride * (i % this->Stride)) % numVals;
    const T val = static_cast<T>(i) + in;
    const T oldVal = portal.Get(idx);
    portal.Set(idx, oldVal + val);
  }
};
// Times CASStrideBaselineWorker on the device selected in Config, as the
// reference point for BenchCASStride.
//
// state.range(0): number of entries in the target array.
// state.range(1): number of worklet invocations (one update each).
// state.range(2): stride passed to the worklet.
//
// Only the Invoker call sits between timer.Start() and timer.Stop(); the
// elapsed time is handed to the benchmark through SetIterationTime. The array
// is filled once, before the loop, and is not reset between iterations.
template <typename ValueType>
void BenchCASStrideBaseline(benchmark::State& state)
{
  const vtkm::cont::DeviceAdapterId device = Config.Device;
  const vtkm::Id numValues = static_cast<vtkm::Id>(state.range(0));
  const vtkm::Id numWrites = static_cast<vtkm::Id>(state.range(1));
  const vtkm::Id stride = static_cast<vtkm::Id>(state.range(2));

  // Constant input of ones; its length determines how many updates are issued.
  auto ones = vtkm::cont::make_ArrayHandleConstant<ValueType>(static_cast<ValueType>(1), numWrites);

  vtkm::cont::ArrayHandle<ValueType> array;
  vtkm::cont::Algorithm::Fill(array, vtkm::TypeTraits<ValueType>::ZeroInitialization(), numValues);

  vtkm::cont::Invoker invoker{ device };
  vtkm::cont::Timer timer{ device };
  for (auto _ : state)
  {
    (void)_;
    timer.Start();
    invoker(CASStrideBaselineWorker{ stride }, ones, array);
    timer.Stop();
    state.SetIterationTime(timer.GetElapsedTime());
  }

  const int64_t iterations = static_cast<int64_t>(state.iterations());
  const int64_t valsWritten = static_cast<int64_t>(numWrites);
  const int64_t bytesWritten = static_cast<int64_t>(sizeof(ValueType)) * valsWritten;
  state.SetItemsProcessed(valsWritten * iterations);
  // Bytes go to the bytes counter. Calling SetItemsProcessed a second time
  // would replace the item count set on the previous line.
  state.SetBytesProcessed(bytesWritten * iterations);
}
// Registration of BenchCASStrideBaseline with vtkm::cont::AtomicArrayTypeList,
// using the same argument ranges as BenchCASStride: argument 0 ranges from
// ARRAY_SIZE_MIN to ARRAY_SIZE_MAX, argument 1 is pinned to NUM_WRITES and
// argument 2 to STRIDE. The arguments are named "Values"/"Ops"/"Stride" to
// match the other baseline registrations in this file.
// NOTE(review): VTKM_BENCHMARK_TEMPLATES_OPTS is defined outside this file
// (presumably Benchmarker.h); confirm its exact expansion there.
VTKM_BENCHMARK_TEMPLATES_OPTS(
  BenchCASStrideBaseline,
    ->Ranges({ { ARRAY_SIZE_MIN, ARRAY_SIZE_MAX }, { NUM_WRITES, NUM_WRITES }, { STRIDE, STRIDE } })
    ->ArgNames({ "Values", "Ops", "Stride" }),
  vtkm::cont::AtomicArrayTypeList);
} // end anon namespace
// Entry point: initializes VTK-m from the command line, pins the requested
// device (or prints usage), then hands the remaining arguments to the
// benchmark runner.
int main(int argc, char* argv[])
{
  // Work on a copy of argv; the same vector is passed to every step below.
  std::vector<char*> cmdArgs(argv, argv + argc);

  // Let the benchmark helper preprocess the arguments; it receives `opts` and
  // may change it.
  auto opts = vtkm::cont::InitializeOptions::RequireDevice;
  vtkm::bench::detail::InitializeArgs(&argc, cmdArgs, opts);

  // Parse VTK-m options and remember the result for the benchmark functions.
  Config = vtkm::cont::Initialize(argc, cmdArgs.data(), opts);

  if (opts != vtkm::cont::InitializeOptions::None)
  {
    vtkm::cont::GetRuntimeDeviceTracker().ForceDevice(Config.Device);
  }
  else
  {
    // `opts` ends up as None when help was requested, so show the VTK-m usage.
    std::cout << Config.Usage << std::endl;
  }

  // Handle benchmarking related args and run benchmarks.
  VTKM_EXECUTE_BENCHMARKS(argc, cmdArgs.data());
}