dpdk: apply dual loop unrolling in DPDK TX
Too many prefetches within loop unrollings induce bottleneck and performance degradation on some CPUs which have less cache line fill buffers, e.g, Arm Cortex-A72. Apply dual loop unrolling and tune prefetches manually to remove hot-spot with prefetch instructions, to get throughput improvement. It brings about 1% throughput improvement and saves 8% clocks with the target node on Cortex-A72. Type: feature Change-Id: If3a64a04a77e90cd0240bc4d1186dbb09dac7df0 Signed-off-by: Lijian Zhang <Lijian.Zhang@arm.com>
This commit is contained in:

committed by
Damjan Marion

parent
8a1dea4ce6
commit
fe2523d1a4
@ -289,6 +289,7 @@ VNET_DEVICE_CLASS_TX_FN (dpdk_device_class) (vlib_main_t * vm,
|
||||
n_left = n_packets;
|
||||
mb = ptd->mbufs;
|
||||
|
||||
#if (CLIB_N_PREFETCHES >= 8)
|
||||
while (n_left >= 8)
|
||||
{
|
||||
u32 or_flags;
|
||||
@ -353,6 +354,62 @@ VNET_DEVICE_CLASS_TX_FN (dpdk_device_class) (vlib_main_t * vm,
|
||||
mb += 4;
|
||||
n_left -= 4;
|
||||
}
|
||||
#elif (CLIB_N_PREFETCHES >= 4)
|
||||
while (n_left >= 4)
|
||||
{
|
||||
vlib_buffer_t *b2, *b3;
|
||||
u32 or_flags;
|
||||
|
||||
CLIB_PREFETCH (mb[2], CLIB_CACHE_LINE_BYTES, STORE);
|
||||
CLIB_PREFETCH (mb[3], CLIB_CACHE_LINE_BYTES, STORE);
|
||||
b2 = vlib_buffer_from_rte_mbuf (mb[2]);
|
||||
CLIB_PREFETCH (b2, CLIB_CACHE_LINE_BYTES, LOAD);
|
||||
b3 = vlib_buffer_from_rte_mbuf (mb[3]);
|
||||
CLIB_PREFETCH (b3, CLIB_CACHE_LINE_BYTES, LOAD);
|
||||
|
||||
b[0] = vlib_buffer_from_rte_mbuf (mb[0]);
|
||||
b[1] = vlib_buffer_from_rte_mbuf (mb[1]);
|
||||
|
||||
or_flags = b[0]->flags | b[1]->flags;
|
||||
all_or_flags |= or_flags;
|
||||
|
||||
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
|
||||
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[1]);
|
||||
|
||||
if (or_flags & VLIB_BUFFER_NEXT_PRESENT)
|
||||
{
|
||||
dpdk_validate_rte_mbuf (vm, b[0], 1);
|
||||
dpdk_validate_rte_mbuf (vm, b[1], 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
dpdk_validate_rte_mbuf (vm, b[0], 0);
|
||||
dpdk_validate_rte_mbuf (vm, b[1], 0);
|
||||
}
|
||||
|
||||
if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) &&
|
||||
(or_flags &
|
||||
(VNET_BUFFER_F_OFFLOAD_TCP_CKSUM
|
||||
| VNET_BUFFER_F_OFFLOAD_IP_CKSUM
|
||||
| VNET_BUFFER_F_OFFLOAD_UDP_CKSUM))))
|
||||
{
|
||||
dpdk_buffer_tx_offload (xd, b[0], mb[0]);
|
||||
dpdk_buffer_tx_offload (xd, b[1], mb[1]);
|
||||
}
|
||||
|
||||
if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
|
||||
{
|
||||
if (b[0]->flags & VLIB_BUFFER_IS_TRACED)
|
||||
dpdk_tx_trace_buffer (dm, node, xd, queue_id, b[0]);
|
||||
if (b[1]->flags & VLIB_BUFFER_IS_TRACED)
|
||||
dpdk_tx_trace_buffer (dm, node, xd, queue_id, b[1]);
|
||||
}
|
||||
|
||||
mb += 2;
|
||||
n_left -= 2;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (n_left > 0)
|
||||
{
|
||||
b[0] = vlib_buffer_from_rte_mbuf (mb[0]);
|
||||
|
Reference in New Issue
Block a user