Use std::copy in serial Copy implementation.

I had assumed that the compiler would be clever enough to turn the
iterative implementation of Copy into a memcpy, but inspecting the
disassembly on a release GCC build shows that this is not the case,
likely because it can't assume that the memory ranges do not overlap.

Replacing the loop with std::copy speeds things up (about 30-50%) for
most data types, though there is a slight (usually < 5%) slowdown for
Vec types. The uint8 copy improved by a factor of 8.

Comparison:
| Speedup | iteration            | std::copy            | Benchmark (Type) |
|---------|----------------------|----------------------|------------------|
|   1.363 | 0.001590 +- 0.000087 | 0.001166 +- 0.000049 | Copy 2097152 values (vtkm::Float32) |
|   1.487 | 0.003429 +- 0.000185 | 0.002305 +- 0.000146 | Copy 2097152 values (vtkm::Float64) |
|   1.379 | 0.001568 +- 0.000072 | 0.001137 +- 0.000093 | Copy 2097152 values (vtkm::Int32) |
|   1.420 | 0.003410 +- 0.000173 | 0.002402 +- 0.000101 | Copy 2097152 values (vtkm::Int64) |
|   1.303 | 0.001564 +- 0.000083 | 0.001201 +- 0.000078 | Copy 2097152 values (vtkm::UInt32) |
|   7.204 | 0.002441 +- 0.000104 | 0.000339 +- 0.000029 | Copy 2097152 values (vtkm::UInt8) |
|   0.987 | 0.006602 +- 0.000266 | 0.006688 +- 0.000291 | Copy 2097152 values (vtkm::Vec< vtkm::Float32, 4 >) |
|   0.965 | 0.010065 +- 0.000528 | 0.010427 +- 0.000617 | Copy 2097152 values (vtkm::Vec< vtkm::Float64, 3 >) |
|   0.979 | 0.003327 +- 0.000191 | 0.003398 +- 0.000142 | Copy 2097152 values (vtkm::Vec< vtkm::Int32, 2 >) |
|   0.851 | 0.001579 +- 0.000090 | 0.001856 +- 0.000098 | Copy 2097152 values (vtkm::Vec< vtkm::UInt8, 4 >) |
This commit is contained in:
Allison Vacanti 2017-10-10 12:42:13 -04:00
parent b396716f86
commit 825f351d04

@ -50,6 +50,19 @@ private:
using Device = vtkm::cont::DeviceAdapterTagSerial;
public:
template <typename T, typename U, class CIn, class COut>
VTKM_CONT static void Copy(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::cont::ArrayHandle<U, COut>& output)
{
const vtkm::Id inSize = input.GetNumberOfValues();
auto inputPortal = input.PrepareForInput(DeviceAdapterTagSerial());
auto outputPortal = output.PrepareForOutput(inSize, DeviceAdapterTagSerial());
std::copy(vtkm::cont::ArrayPortalToIteratorBegin(inputPortal),
vtkm::cont::ArrayPortalToIteratorEnd(inputPortal),
vtkm::cont::ArrayPortalToIteratorBegin(outputPortal));
}
template <typename T, typename U, class CIn, class CStencil, class COut>
VTKM_CONT static void CopyIf(const vtkm::cont::ArrayHandle<T, CIn>& input,
const vtkm::cont::ArrayHandle<U, CStencil>& stencil,
@ -87,6 +100,56 @@ public:
output.Shrink(writePos);
}
template <typename T, typename U, class CIn, class COut>
VTKM_CONT static bool CopySubRange(const vtkm::cont::ArrayHandle<T, CIn>& input,
vtkm::Id inputStartIndex,
vtkm::Id numberOfElementsToCopy,
vtkm::cont::ArrayHandle<U, COut>& output,
vtkm::Id outputIndex = 0)
{
const vtkm::Id inSize = input.GetNumberOfValues();
if (inputStartIndex < 0 || numberOfElementsToCopy < 0 || outputIndex < 0 ||
inputStartIndex >= inSize)
{ //invalid parameters
return false;
}
//determine if the numberOfElementsToCopy needs to be reduced
if (inSize < (inputStartIndex + numberOfElementsToCopy))
{ //adjust the size
numberOfElementsToCopy = (inSize - inputStartIndex);
}
const vtkm::Id outSize = output.GetNumberOfValues();
const vtkm::Id copyOutEnd = outputIndex + numberOfElementsToCopy;
if (outSize < copyOutEnd)
{ //output is not large enough
if (outSize == 0)
{ //since output has nothing, just need to allocate to correct length
output.Allocate(copyOutEnd);
}
else
{ //we currently have data in this array, so preserve it in the new
//resized array
vtkm::cont::ArrayHandle<U, COut> temp;
temp.Allocate(copyOutEnd);
CopySubRange(output, 0, outSize, temp);
output = temp;
}
}
auto inputPortal = input.PrepareForInput(DeviceAdapterTagSerial());
auto outputPortal = output.PrepareForInPlace(DeviceAdapterTagSerial());
auto inIter = vtkm::cont::ArrayPortalToIteratorBegin(inputPortal);
auto outIter = vtkm::cont::ArrayPortalToIteratorBegin(outputPortal);
std::copy(inIter + inputStartIndex,
inIter + inputStartIndex + numberOfElementsToCopy,
outIter + outputIndex);
return true;
}
template <typename T, typename U, class CIn>
VTKM_CONT static U Reduce(const vtkm::cont::ArrayHandle<T, CIn>& input, U initialValue)
{