Unroll reduction loops for non-integral types on OpenMP.

This commit is contained in:
Allison Vacanti 2019-07-16 11:35:08 -04:00
parent e30cb08725
commit 41894a97b3
2 changed files with 81 additions and 13 deletions

@ -105,7 +105,7 @@ if sortOpt:
keys = sorted(keys, key=lambda k: benchmarks[k].mean)
print("# Summary: (%s)"%filename)
print("%-9s\t%-9s\t%-s"%("Mean", "Stdev", "Benchmark (type)"))
print("%-9s\t%-9s\t%-9s\t%-s"%("Mean", "Stdev", "Stdev%", "Benchmark (type)"))
for key in keys:
data = benchmarks[key]
print("%9.6f\t%9.6f\t%s (%s)"%(data.mean, data.stdDev, key.name, key.type))
print("%9.6f\t%9.6f\t%9.6f\t%s (%s)"%(data.mean, data.stdDev, data.stdDev / data.mean * 100., key.name, key.type))

@ -278,6 +278,23 @@ using OpenMPReductionSupported = std::false_type;
struct ReduceHelper
{
// std::is_integral, but adapted to see through vecs and pairs.
template <typename T>
struct IsIntegral : public std::is_integral<T>
{
};
template <typename T, vtkm::IdComponent Size>
struct IsIntegral<vtkm::Vec<T, Size>> : public std::is_integral<T>
{
};
template <typename T, typename U>
struct IsIntegral<vtkm::Pair<T, U>>
: public std::integral_constant<bool, std::is_integral<T>{} && std::is_integral<U>{}>
{
};
// Generic implementation:
template <typename PortalT, typename ReturnType, typename Functor>
static ReturnType Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
@ -309,18 +326,11 @@ struct ReduceHelper
if (doParallel)
{
// Use the first (numThreads*2) values for initializing:
ReturnType accum;
accum = f(data[2 * tid], data[2 * tid + 1]);
// Static dispatch to unroll non-integral types:
const ReturnType localResult = ReduceHelper::DoParallelReduction<ReturnType>(
data, numVals, tid, numThreads, f, IsIntegral<ReturnType>{});
// Assign each thread chunks of the remaining values for local reduction
VTKM_OPENMP_DIRECTIVE(for schedule(static))
for (vtkm::Id i = numThreads * 2; i < numVals; i++)
{
accum = f(accum, data[i]);
}
threadData[static_cast<std::size_t>(tid)] = accum;
threadData[static_cast<std::size_t>(tid)] = localResult;
}
} // end parallel
@ -344,6 +354,64 @@ struct ReduceHelper
return init;
}
// Non-integer reduction: unroll loop manually.
// This gives faster code for floats and non-trivial types.
//
// Computes one thread's partial result. The partial is seeded from
// data[2 * tid] and data[2 * tid + 1]; the values from index (numThreads * 2)
// onward are then combined four at a time inside an OpenMP worksharing loop,
// and whatever does not fill a complete group of four is folded in by
// thread 0 only.
//
// data       - indexable sequence holding numVals values.
// numVals    - total number of values in data. The seeding step reads
//              indices below (numThreads * 2), so the caller is presumably
//              expected to guarantee numVals >= numThreads * 2 -- TODO confirm.
// tid        - index of the calling thread (used to pick the seed values).
// numThreads - number of threads taking part in the reduction.
// f          - binary functor used to combine two values.
//
// Returns this thread's partial reduction result.
template <typename ReturnType, typename IterType, typename FunctorType>
static ReturnType DoParallelReduction(IterType data,
                                      vtkm::Id numVals,
                                      int tid,
                                      int numThreads,
                                      FunctorType f,
                                      std::false_type /* isIntegral */)
{
  // Use the first (numThreads*2) values for initializing:
  ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);

  // The unrolled loop starts at `offset`, so the group-of-four boundary must
  // be measured from `offset` rather than from index 0. Aligning to index 0
  // is only correct when numThreads is even; with an odd thread count the
  // final unrolled iteration would start two short of the boundary, reading
  // past it (potentially past the end of data) and overlapping the values
  // handled by the mop-up loop below.
  const vtkm::Id offset = static_cast<vtkm::Id>(numThreads) * 2;
  const vtkm::Id remaining = (numVals > offset) ? (numVals - offset) : 0;
  const vtkm::Id unrollEnd = offset + (remaining / 4) * 4;

  VTKM_OPENMP_DIRECTIVE(for schedule(static))
  for (vtkm::Id i = offset; i < unrollEnd; i += 4)
  {
    // Combine the four values pairwise before touching the accumulator.
    const auto t1 = f(data[i], data[i + 1]);
    const auto t2 = f(data[i + 2], data[i + 3]);
    accum = f(accum, t1);
    accum = f(accum, t2);
  }

  // Let thread 0 mop up any remaining values (at most three):
  if (tid == 0)
  {
    for (vtkm::Id i = unrollEnd; i < numVals; ++i)
    {
      accum = f(accum, data[i]);
    }
  }

  return accum;
}
// Integer reduction: no unrolling. Ints vectorize easily and unrolling can
// hurt performance.
template <typename ReturnType, typename IterType, typename FunctorType>
static ReturnType DoParallelReduction(IterType data,
vtkm::Id numVals,
int tid,
int numThreads,
FunctorType f,
std::true_type /* isIntegral */)
{
// Use the first (numThreads*2) values for initializing:
ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
// Assign each thread chunks of the remaining values for local reduction
VTKM_OPENMP_DIRECTIVE(for schedule(static))
for (vtkm::Id i = numThreads * 2; i < numVals; i++)
{
accum = f(accum, data[i]);
}
return accum;
}
#ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION
// Specialize for vtkm functors with OpenMP special cases: