ip: apply dual loop unrolling in ip4_input

Too many prefetches within loop unrollings induce bottleneck and
performance degradation on some CPUs which have less cache line fill
buffers, e.g, Arm Cortex-A72.
Apply dual loop unrolling and tune prefetches manually to resolve
hot-spot with prefetch instructions.
It saves about 11.5% cycles with ip4_input node on Cortex-A72 CPUs.

Type: feature

Change-Id: I1ac9eb21061a804af2a414b420217fbcda3689c9
Signed-off-by: Lijian Zhang <Lijian.Zhang@arm.com>
This commit is contained in:
Lijian.Zhang
2019-07-08 10:33:34 +08:00
committed by Damjan Marion
parent 840f64b4b2
commit 86b1871ba2

@ -151,6 +151,7 @@ ip4_input_inline (vlib_main_t * vm,
vlib_get_buffers (vm, from, bufs, n_left_from);
b = bufs;
next = nexts;
#if (CLIB_N_PREFETCHES >= 8)
while (n_left_from >= 4)
{
u32 x = 0;
@ -233,6 +234,73 @@ ip4_input_inline (vlib_main_t * vm,
next += 4;
n_left_from -= 4;
}
#elif (CLIB_N_PREFETCHES >= 4)
while (n_left_from >= 2)
{
u32 x = 0;
u32 next0, next1;
/* Prefetch next iteration. */
if (n_left_from >= 6)
{
vlib_prefetch_buffer_header (b[4], LOAD);
vlib_prefetch_buffer_header (b[5], LOAD);
vlib_prefetch_buffer_data (b[2], LOAD);
vlib_prefetch_buffer_data (b[3], LOAD);
}
vnet_buffer (b[0])->ip.adj_index[VLIB_RX] = ~0;
vnet_buffer (b[1])->ip.adj_index[VLIB_RX] = ~0;
sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_RX];
sw_if_index[1] = vnet_buffer (b[1])->sw_if_index[VLIB_RX];
x |= sw_if_index[0] ^ last_sw_if_index;
x |= sw_if_index[1] ^ last_sw_if_index;
if (PREDICT_TRUE (x == 0))
{
/* we deal with 2 more packets sharing the same sw_if_index
with the previous one, so we can optimize */
cnt += 2;
if (arc_enabled)
{
next0 = ip4_input_set_next (sw_if_index[0], b[0], 1);
next1 = ip4_input_set_next (sw_if_index[1], b[1], 1);
}
else
{
next0 = ip4_input_set_next (sw_if_index[0], b[0], 0);
next1 = ip4_input_set_next (sw_if_index[1], b[1], 0);
}
}
else
{
ip4_input_check_sw_if_index (vm, cm, sw_if_index[0],
&last_sw_if_index, &cnt, &arc_enabled);
ip4_input_check_sw_if_index (vm, cm, sw_if_index[1],
&last_sw_if_index, &cnt, &arc_enabled);
next0 = ip4_input_set_next (sw_if_index[0], b[0], 1);
next1 = ip4_input_set_next (sw_if_index[1], b[1], 1);
}
ip[0] = vlib_buffer_get_current (b[0]);
ip[1] = vlib_buffer_get_current (b[1]);
ip4_input_check_x2 (vm, error_node, b[0], b[1], ip[0], ip[1],
&next0, &next1, verify_checksum);
next[0] = (u16) next0;
next[1] = (u16) next1;
/* next */
b += 2;
next += 2;
n_left_from -= 2;
}
#endif
while (n_left_from)
{
u32 next0;