From 0657c165d9ac00e9217e4eea68b407504274d1cc Mon Sep 17 00:00:00 2001 From: Steven Date: Thu, 8 Jun 2017 12:52:29 -0700 Subject: [PATCH] memif: jumbo frames support The current memif interface supports frame sizes of up to 2048 bytes. This patch enhances memif to support jumbo frames. On tx (writing buffers to the ring), keep reading the next vlib buffer while the VLIB_BUFFER_NEXT_PRESENT flag is set and merge it into the same ring entry. Use descriptor chaining if the buffer is not big enough. On rx (reading buffers from the ring), if the packet is larger than 2048 bytes, create multiple vlib buffers, chained with the VLIB_BUFFER_NEXT_PRESENT flag. Testing: Because the ping command provided by VPP does not support jumbo frames, I have to use Linux ping. Here is the setup that I use for testing. VM1 --- vhost ---- VPP1 --- memif --- VPP2 --- vhost --- VM2 Create vhost-user interfaces between VM1 and VPP1 and between VPP2 and VM2. VM configuration: Set the interface MTU on the VM, e.g. 9216, to support jumbo frames. Create a static route and a static ARP entry on VM1 to VM2 and vice versa. Use iperf3 or ping -s 8000 from VM1 to VM2 or vice versa. Sample run sluong@ubuntu:~$ ping 131.1.1.1 -c1 -s 8000 ping 131.1.1.1 -c1 -s 8000 PING 131.1.1.1 (131.1.1.1) 8000(8028) bytes of data. 
8008 bytes from 131.1.1.1: icmp_seq=1 ttl=62 time=0.835 ms --- 131.1.1.1 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 0.835/0.835/0.835/0.000 ms sluong@ubuntu:~$ DBGvpp# sh interface memif0 Name Idx State Counter Count memif0 1 up rx packets 1 rx bytes 8042 tx packets 1 tx bytes 8042 ip4 1 DBGvpp# Change-Id: I469bece3d45a790dceaee1d6a8e976bd018feee2 Signed-off-by: Steven --- src/plugins/memif/device.c | 110 ++++++++----- src/plugins/memif/node.c | 316 ++++++++++++++++++++++++------------- 2 files changed, 275 insertions(+), 151 deletions(-) diff --git a/src/plugins/memif/device.c b/src/plugins/memif/device.c index 870dd354828..07afc303e83 100644 --- a/src/plugins/memif/device.c +++ b/src/plugins/memif/device.c @@ -30,6 +30,7 @@ #define foreach_memif_tx_func_error \ _(NO_FREE_SLOTS, "no free tx slots") \ +_(TRUNC_PACKET, "packet > buffer size -- truncated in tx ring") \ _(PENDING_MSGS, "pending msgs in tx ring") typedef enum @@ -86,6 +87,70 @@ memif_prefetch_buffer_and_data (vlib_main_t * vm, u32 bi) CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, LOAD); } +/** + * @brief Copy buffer to tx ring + * + * @param * vm (in) + * @param * node (in) + * @param * mif (in) pointer to memif interface + * @param bi (in) vlib buffer index + * @param * ring (in) pointer to memif ring + * @param * head (in/out) ring head + * @param mask (in) ring size - 1 + */ +static_always_inline void +memif_copy_buffer_to_tx_ring (vlib_main_t * vm, vlib_node_runtime_t * node, + memif_if_t * mif, u32 bi, memif_ring_t * ring, + u16 * head, u16 mask) +{ + vlib_buffer_t *b0; + void *mb0; + u32 total = 0, len; + + mb0 = memif_get_buffer (mif, ring, *head); + ring->desc[*head].flags = 0; + do + { + b0 = vlib_get_buffer (vm, bi); + len = b0->current_length; + if (PREDICT_FALSE (ring->desc[*head].buffer_length < (total + len))) + { + if (PREDICT_TRUE (total)) + { + ring->desc[*head].length = total; + total = 0; + ring->desc[*head].flags |= 
MEMIF_DESC_FLAG_NEXT; + *head = (*head + 1) & mask; + mb0 = memif_get_buffer (mif, ring, *head); + ring->desc[*head].flags = 0; + } + } + if (PREDICT_TRUE (ring->desc[*head].buffer_length >= (total + len))) + { + clib_memcpy (mb0 + total, vlib_buffer_get_current (b0), + CLIB_CACHE_LINE_BYTES); + if (len > CLIB_CACHE_LINE_BYTES) + clib_memcpy (mb0 + CLIB_CACHE_LINE_BYTES + total, + vlib_buffer_get_current (b0) + CLIB_CACHE_LINE_BYTES, + len - CLIB_CACHE_LINE_BYTES); + total += len; + } + else + { + vlib_error_count (vm, node->node_index, MEMIF_TX_ERROR_TRUNC_PACKET, + 1); + break; + } + } + while ((bi = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) ? b0->next_buffer : 0)); + + if (PREDICT_TRUE (total)) + { + ring->desc[*head].length = total; + *head = (*head + 1) & mask; + } +} + static_always_inline uword memif_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, memif_if_t * mif, @@ -152,32 +217,10 @@ memif_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node, memif_prefetch_buffer_and_data (vm, buffers[2]); memif_prefetch_buffer_and_data (vm, buffers[3]); - vlib_buffer_t *b0 = vlib_get_buffer (vm, buffers[0]); - vlib_buffer_t *b1 = vlib_get_buffer (vm, buffers[1]); - - void *mb0 = memif_get_buffer (mif, ring, head); - clib_memcpy (mb0, vlib_buffer_get_current (b0), CLIB_CACHE_LINE_BYTES); - ring->desc[head].length = b0->current_length; - head = (head + 1) & mask; - - void *mb1 = memif_get_buffer (mif, ring, head); - clib_memcpy (mb1, vlib_buffer_get_current (b1), CLIB_CACHE_LINE_BYTES); - ring->desc[head].length = b1->current_length; - head = (head + 1) & mask; - - if (b0->current_length > CLIB_CACHE_LINE_BYTES) - { - clib_memcpy (mb0 + CLIB_CACHE_LINE_BYTES, - vlib_buffer_get_current (b0) + CLIB_CACHE_LINE_BYTES, - b0->current_length - CLIB_CACHE_LINE_BYTES); - } - if (b1->current_length > CLIB_CACHE_LINE_BYTES) - { - clib_memcpy (mb1 + CLIB_CACHE_LINE_BYTES, - vlib_buffer_get_current (b1) + CLIB_CACHE_LINE_BYTES, - 
b1->current_length - CLIB_CACHE_LINE_BYTES); - } - + memif_copy_buffer_to_tx_ring (vm, node, mif, buffers[0], ring, &head, + mask); + memif_copy_buffer_to_tx_ring (vm, node, mif, buffers[1], ring, &head, + mask); buffers += 2; n_left -= 2; @@ -186,19 +229,8 @@ memif_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node, while (n_left && free_slots) { - vlib_buffer_t *b0 = vlib_get_buffer (vm, buffers[0]); - void *mb0 = memif_get_buffer (mif, ring, head); - clib_memcpy (mb0, vlib_buffer_get_current (b0), CLIB_CACHE_LINE_BYTES); - - if (b0->current_length > CLIB_CACHE_LINE_BYTES) - { - clib_memcpy (mb0 + CLIB_CACHE_LINE_BYTES, - vlib_buffer_get_current (b0) + CLIB_CACHE_LINE_BYTES, - b0->current_length - CLIB_CACHE_LINE_BYTES); - } - ring->desc[head].length = b0->current_length; - head = (head + 1) & mask; - + memif_copy_buffer_to_tx_ring (vm, node, mif, buffers[0], ring, &head, + mask); buffers++; n_left--; free_slots--; diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c index e2c7631c0e2..c6403fef7f3 100644 --- a/src/plugins/memif/node.c +++ b/src/plugins/memif/node.c @@ -76,6 +76,130 @@ memif_prefetch (vlib_main_t * vm, u32 bi) CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, STORE); } +static_always_inline void +memif_buffer_add_to_chain (vlib_main_t * vm, u32 bi, u32 first_bi, + u32 prev_bi) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_buffer_t *first_b = vlib_get_buffer (vm, first_bi); + vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_bi); + + /* update first buffer */ + first_b->total_length_not_including_first_buffer += b->current_length; + + /* update previous buffer */ + prev_b->next_buffer = bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + b->next_buffer = 0; +} + +/** + * @brief Copy buffer from rx ring + * + * @param * vm (in) + * @param * mif (in) pointer to memif interface + * @param * ring (in) pointer to memif ring + * @param * rd (in) pointer to ring data + * @param ring_size 
(in) ring size + * @param * n_free_bufs (in/out) the number of free vlib buffers available + * @param ** first_b (out) the first vlib buffer pointer + * @param * first_bi (out) the first vlib buffer index + * @param * bi (in/out) the current buffer index + * #param * num_slots (in/out) the number of descriptors available to read + * + * @return total bytes read from rx ring also written to vlib buffers + */ +static_always_inline uword +memif_copy_buffer_from_rx_ring (vlib_main_t * vm, memif_if_t * mif, + memif_ring_t * ring, memif_queue_t * mq, + u16 ring_size, u32 n_buffer_bytes, + u32 * n_free_bufs, vlib_buffer_t ** first_b, + u32 * first_bi, u32 * bi, u16 * num_slots) +{ + memif_main_t *nm = &memif_main; + u32 thread_index = vlib_get_thread_index (); + u32 total_bytes = 0, offset = 0; + u32 data_len; + u32 bytes_to_copy; + void *mb; + vlib_buffer_t *b; + u16 mask = ring_size - 1; + u32 prev_bi; + u16 last_head; + + while (*num_slots) + { + data_len = ring->desc[mq->last_head].length; + while (data_len && (*n_free_bufs)) + { + /* get empty buffer */ + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + prev_bi = *bi; + *bi = nm->rx_buffers[thread_index][last_buf]; + b = vlib_get_buffer (vm, *bi); + _vec_len (nm->rx_buffers[thread_index]) = last_buf; + (*n_free_bufs)--; + if (PREDICT_FALSE (*n_free_bufs == 0)) + { + *n_free_bufs += + vlib_buffer_alloc (vm, + &nm->rx_buffers[thread_index] + [*n_free_bufs], ring_size); + _vec_len (nm->rx_buffers[thread_index]) = *n_free_bufs; + } + + if (last_buf > 4) + { + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]); + } + + /* copy buffer */ + bytes_to_copy = + data_len > n_buffer_bytes ? 
n_buffer_bytes : data_len; + b->current_data = 0; + mb = memif_get_buffer (mif, ring, mq->last_head); + clib_memcpy (vlib_buffer_get_current (b), mb + offset, + CLIB_CACHE_LINE_BYTES); + if (bytes_to_copy > CLIB_CACHE_LINE_BYTES) + clib_memcpy (vlib_buffer_get_current (b) + CLIB_CACHE_LINE_BYTES, + mb + CLIB_CACHE_LINE_BYTES + offset, + bytes_to_copy - CLIB_CACHE_LINE_BYTES); + + /* fill buffer header */ + b->current_length = bytes_to_copy; + + if (total_bytes == 0) + { + /* fill buffer metadata */ + b->total_length_not_including_first_buffer = 0; + b->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID; + vnet_buffer (b)->sw_if_index[VLIB_RX] = mif->sw_if_index; + vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0; + *first_bi = *bi; + *first_b = vlib_get_buffer (vm, *first_bi); + } + else + memif_buffer_add_to_chain (vm, *bi, *first_bi, prev_bi); + + offset += bytes_to_copy; + total_bytes += bytes_to_copy; + data_len -= bytes_to_copy; + } + last_head = mq->last_head; + /* Advance to next descriptor */ + mq->last_head = (mq->last_head + 1) & mask; + offset = 0; + (*num_slots)--; + if ((ring->desc[last_head].flags & MEMIF_DESC_FLAG_NEXT) == 0) + break; + } + + return (total_bytes); +} + static_always_inline uword memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, memif_if_t * mif, @@ -92,11 +216,11 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_rx_bytes = 0; u32 *to_next = 0; u32 n_free_bufs; + u32 b0_total, b1_total; u32 thread_index = vlib_get_thread_index (); - u32 bi0, bi1; - vlib_buffer_t *b0, *b1; u16 ring_size, mask, num_slots; - void *mb0, *mb1; + u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); mq = vec_elt_at_index (mif->rx_queues, qid); ring = mq->ring; @@ -133,7 +257,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 next1 = next_index; vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - 
while (num_slots > 5 && n_left_to_next > 2) + while (num_slots > 11 && n_left_to_next > 2) { if (PREDICT_TRUE (mq->last_head + 5 < ring_size)) { @@ -159,157 +283,125 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, CLIB_PREFETCH (&ring->desc[(mq->last_head + 5) % mask], CLIB_CACHE_LINE_BYTES, LOAD); } - /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; - bi0 = nm->rx_buffers[thread_index][last_buf]; - bi1 = nm->rx_buffers[thread_index][last_buf - 1]; - _vec_len (nm->rx_buffers[thread_index]) -= 2; - if (last_buf > 4) - { - memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]); - memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]); - } + vlib_buffer_t *first_b0 = 0; + u32 bi0 = 0, first_bi0 = 0; + b0_total = memif_copy_buffer_from_rx_ring (vm, mif, ring, mq, + ring_size, + n_buffer_bytes, + &n_free_bufs, &first_b0, + &first_bi0, &bi0, + &num_slots); + + vlib_buffer_t *first_b1 = 0; + u32 bi1 = 0, first_bi1 = 0; + b1_total = memif_copy_buffer_from_rx_ring (vm, mif, ring, mq, + ring_size, + n_buffer_bytes, + &n_free_bufs, &first_b1, + &first_bi1, &bi1, + &num_slots); /* enqueue buffer */ - to_next[0] = bi0; - to_next[1] = bi1; + to_next[0] = first_bi0; + to_next[1] = first_bi1; to_next += 2; n_left_to_next -= 2; - /* fill buffer metadata */ - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - vnet_buffer (b0)->sw_if_index[VLIB_RX] = mif->sw_if_index; - vnet_buffer (b1)->sw_if_index[VLIB_RX] = mif->sw_if_index; - - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; - - /* copy buffer */ - mb0 = memif_get_buffer (mif, ring, mq->last_head); - clib_memcpy (vlib_buffer_get_current (b0), mb0, - CLIB_CACHE_LINE_BYTES); - b0->current_length = ring->desc[mq->last_head].length; - mq->last_head = (mq->last_head + 1) & mask; - - mb1 = memif_get_buffer (mif, ring, mq->last_head); - clib_memcpy (vlib_buffer_get_current (b1), 
mb1, - CLIB_CACHE_LINE_BYTES); - b1->current_length = ring->desc[mq->last_head].length; - mq->last_head = (mq->last_head + 1) & mask; - - if (b0->current_length > CLIB_CACHE_LINE_BYTES) - clib_memcpy (vlib_buffer_get_current (b0) + CLIB_CACHE_LINE_BYTES, - mb0 + CLIB_CACHE_LINE_BYTES, - b0->current_length - CLIB_CACHE_LINE_BYTES); - - if (b1->current_length > CLIB_CACHE_LINE_BYTES) - clib_memcpy (vlib_buffer_get_current (b1) + CLIB_CACHE_LINE_BYTES, - mb1 + CLIB_CACHE_LINE_BYTES, - b1->current_length - CLIB_CACHE_LINE_BYTES); - /* trace */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b1); if (PREDICT_FALSE (n_trace > 0)) { /* b0 */ - memif_input_trace_t *tr; - vlib_trace_buffer (vm, node, next0, b0, - /* follow_chain */ 0); - vlib_set_trace_count (vm, node, --n_trace); - tr = vlib_add_trace (vm, node, b0, sizeof (*tr)); - tr->next_index = next0; - tr->hw_if_index = mif->hw_if_index; - tr->ring = qid; - + if (PREDICT_TRUE (first_b0 != 0)) + { + memif_input_trace_t *tr; + vlib_trace_buffer (vm, node, next0, first_b0, + /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr)); + tr->next_index = next0; + tr->hw_if_index = mif->hw_if_index; + tr->ring = qid; + } if (n_trace) { /* b1 */ - memif_input_trace_t *tr; - vlib_trace_buffer (vm, node, next1, b1, - /* follow_chain */ 0); - vlib_set_trace_count (vm, node, --n_trace); - tr = vlib_add_trace (vm, node, b1, sizeof (*tr)); - tr->next_index = next1; - tr->hw_if_index = mif->hw_if_index; - tr->ring = qid; + if (PREDICT_TRUE (first_b1 != 0)) + { + memif_input_trace_t *tr; + vlib_trace_buffer (vm, node, next1, first_b1, + /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b1, sizeof (*tr)); + tr->next_index = next1; + tr->hw_if_index = mif->hw_if_index; + tr->ring = qid; 
+ } } } /* redirect if feature path enabled */ vnet_feature_start_device_input_x2 (mif->sw_if_index, - &next0, &next1, b0, b1); + &next0, &next1, first_b0, + first_b1); /* enqueue */ vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, - n_left_to_next, - bi0, bi1, next0, next1); + n_left_to_next, first_bi0, + first_bi1, next0, next1); /* next packet */ - num_slots -= 2; n_rx_packets += 2; - n_rx_bytes += b0->current_length; - n_rx_bytes += b1->current_length; + n_rx_bytes += b0_total + b1_total; } while (num_slots && n_left_to_next) { - /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; - bi0 = nm->rx_buffers[thread_index][last_buf]; - _vec_len (nm->rx_buffers[thread_index]) = last_buf; - - /* enqueue buffer */ - to_next[0] = bi0; - to_next += 1; - n_left_to_next--; - - /* fill buffer metadata */ - b0 = vlib_get_buffer (vm, bi0); - b0->current_length = ring->desc[mq->last_head].length; - vnet_buffer (b0)->sw_if_index[VLIB_RX] = mif->sw_if_index; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - - /* copy buffer */ - mb0 = memif_get_buffer (mif, ring, mq->last_head); - clib_memcpy (vlib_buffer_get_current (b0), mb0, - CLIB_CACHE_LINE_BYTES); - if (b0->current_length > CLIB_CACHE_LINE_BYTES) - clib_memcpy (vlib_buffer_get_current (b0) + CLIB_CACHE_LINE_BYTES, - mb0 + CLIB_CACHE_LINE_BYTES, - b0->current_length - CLIB_CACHE_LINE_BYTES); + vlib_buffer_t *first_b0 = 0; + u32 bi0 = 0, first_bi0 = 0; + b0_total = memif_copy_buffer_from_rx_ring (vm, mif, ring, mq, + ring_size, + n_buffer_bytes, + &n_free_bufs, &first_b0, + &first_bi0, &bi0, + &num_slots); /* trace */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b0); if (PREDICT_FALSE (n_trace > 0)) { - memif_input_trace_t *tr; - vlib_trace_buffer (vm, node, next0, b0, - /* follow_chain */ 0); - vlib_set_trace_count (vm, node, --n_trace); - tr = vlib_add_trace (vm, node, b0, sizeof (*tr)); - tr->next_index = next0; - tr->hw_if_index = 
mif->hw_if_index; - tr->ring = qid; + if (PREDICT_TRUE (first_b0 != 0)) + { + memif_input_trace_t *tr; + vlib_trace_buffer (vm, node, next0, first_b0, + /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr)); + tr->next_index = next0; + tr->hw_if_index = mif->hw_if_index; + tr->ring = qid; + } } + /* enqueue buffer */ + to_next[0] = first_bi0; + to_next += 1; + n_left_to_next--; /* redirect if feature path enabled */ - vnet_feature_start_device_input_x1 (mif->sw_if_index, &next0, b0); + vnet_feature_start_device_input_x1 (mif->sw_if_index, &next0, + first_b0); /* enqueue */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, - n_left_to_next, bi0, next0); + n_left_to_next, first_bi0, next0); /* next packet */ - mq->last_head = (mq->last_head + 1) & mask; - num_slots--; n_rx_packets++; - n_rx_bytes += b0->current_length; + n_rx_bytes += b0_total; } vlib_put_next_frame (vm, node, next_index, n_left_to_next); }