rdma: use rings for buffer management

Refactor the rdma driver for improved performance and to prepare for raw
datapath access.

Type: refactor

Change-Id: Iae31872055a6947708ea9f430bd1dc083ea63b5a
Signed-off-by: Benoît Ganne <bganne@cisco.com>
Author: Benoît Ganne <bganne@cisco.com>
Authored: 2019-08-21 15:11:43 +02:00
Committed by: Damjan Marion
Parent: 264dce73a5
Commit: 4fffc536f4
5 changed files with 248 additions and 208 deletions
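The heart of the change: rx and tx queues now carry a ring of buffer indices plus free-running head/tail counters, instead of per-work-request buffer bookkeeping. A minimal sketch of the ring discipline the new code relies on, assuming a power-of-two size (types and names here are illustrative, not the driver's):

    #include <stdint.h>
    typedef uint32_t u32;

    typedef struct
    {
      u32 *bufs; /* ring of buffer indices, 'size' entries */
      u32 size;  /* must be a power of two */
      u32 head;  /* consumer counter, only ever increases */
      u32 tail;  /* producer counter, only ever increases */
    } ring_t;

    /* occupancy and free space are plain u32 subtractions */
    static inline u32 ring_used (const ring_t *r) { return r->tail - r->head; }
    static inline u32 ring_free (const ring_t *r) { return r->size - ring_used (r); }

    /* a counter maps to a slot with a mask instead of a modulo */
    static inline u32 ring_slot (const ring_t *r, u32 counter)
    { return counter & (r->size - 1); }

This is also why the create path below insists on power-of-two queue sizes.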


@@ -266,8 +266,7 @@ rdma_async_event_error_ready (clib_file_t * f)
{
rdma_main_t *rm = &rdma_main;
rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
return clib_error_return (0, "RDMA async event error for device %U",
format_vlib_pci_addr, &rd->pci_addr);
return clib_error_return (0, "RDMA: %s: async event error", rd->name);
}
static clib_error_t *
@@ -293,8 +292,7 @@ rdma_async_event_read_ready (clib_file_t * f)
case IBV_EVENT_DEVICE_FATAL:
rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U",
format_vlib_pci_addr, &rd->pci_addr);
vlib_log_emerg (rm->log_class, "%s: fatal error", rd->name);
break;
default:
rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unhandled RDMA async event %i",
@@ -326,8 +324,7 @@ rdma_async_event_init (rdma_device_t * rd)
t.file_descriptor = rd->ctx->async_fd;
t.error_function = rdma_async_event_error_ready;
t.private_data = rd->dev_instance;
t.description =
format (0, "RMDA %U async event", format_vlib_pci_addr, &rd->pci_addr);
t.description = format (0, "%s async event", rd->name);
rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
return 0;
@@ -393,6 +390,7 @@ rdma_dev_cleanup (rdma_device_t * rd)
vec_free (rd->rxqs);
vec_free (rd->txqs);
vec_free (rd->name);
vlib_pci_free_device_info (rd->pci);
pool_put (rm->devices, rd);
}
@@ -406,6 +404,7 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
rxq = vec_elt_at_index (rd->rxqs, qid);
rxq->size = n_desc;
vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
return clib_error_return_unix (0, "Create CQ Failed");
@@ -482,6 +481,7 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
txq = vec_elt_at_index (rd->txqs, qid);
txq->size = n_desc;
vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
return clib_error_return_unix (0, "Create CQ Failed");
@@ -492,7 +492,6 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
qpia.cap.max_send_wr = n_desc;
qpia.cap.max_send_sge = 1;
qpia.qp_type = IBV_QPT_RAW_PACKET;
qpia.sq_sig_all = 1;
if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
return clib_error_return_unix (0, "Queue Pair create failed");
@@ -549,6 +548,7 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
bm->buffer_mem_size,
IBV_ACCESS_LOCAL_WRITE)) == 0)
return clib_error_return_unix (0, "Register MR Failed");
rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
return 0;
}
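Caching mr->lkey in the device structure keeps the per-packet path from chasing the struct ibv_mr pointer: the lkey becomes a plain u32 read from the device's hot cache line. A sketch under assumed names (register_arena is illustrative, not a driver function):

    #include <infiniband/verbs.h>

    static int
    register_arena (struct ibv_pd *pd, void *base, size_t len,
                    uint32_t *lkey_out)
    {
      struct ibv_mr *mr = ibv_reg_mr (pd, base, len, IBV_ACCESS_LOCAL_WRITE);
      if (!mr)
        return -1;
      *lkey_out = mr->lkey; /* flat copy: no mr-> dereference per packet */
      return 0;
    }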
@@ -573,11 +573,13 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
{
vnet_main_t *vnm = vnet_get_main ();
rdma_main_t *rm = &rdma_main;
rdma_device_t *rd = 0;
struct ibv_device **dev_list = 0;
rdma_device_t *rd;
vlib_pci_addr_t pci_addr;
struct ibv_device **dev_list;
int n_devs;
u8 *s = 0, *s2 = 0;
u8 *s;
u16 qid;
int i;
args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE;
args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE;
@@ -588,40 +590,16 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
args->rv = VNET_API_ERROR_INVALID_VALUE;
args->error =
clib_error_return (0, "rx queue number must be a power of two");
return;
}
if (!is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
{
args->rv = VNET_API_ERROR_INVALID_VALUE;
args->error =
clib_error_return (0, "queue size must be a power of two");
return;
}
pool_get_zero (rm->devices, rd);
rd->dev_instance = rd - rm->devices;
rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
rd->name = vec_dup (args->name);
/* check if device exist and if it is bound to mlx5_core */
s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0);
s2 = clib_sysfs_link_to_name ((char *) s);
if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0)
{
args->error =
clib_error_return (0,
"invalid interface (only mlx5 supported for now)");
goto err0;
}
/* extract PCI address */
vec_reset_length (s);
s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0);
if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0)
if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE ||
!is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
{
args->error = clib_error_return (0, "cannot find PCI address");
args->rv = VNET_API_ERROR_INVALID_VALUE;
args->error =
clib_error_return (0, "queue size must be a power of two >= %i",
VLIB_FRAME_SIZE);
goto err0;
}
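The merged check also documents why sizes matter: ring slots are computed with counter & (size - 1), which is only a valid modulo when size is a power of two. The is_pow2 test is the usual bit trick, sketched here in plain C (VPP's clib provides its own):

    #include <stdint.h>

    /* exactly one bit set => power of two; 1024 -> true, 1000 -> false */
    static inline int
    is_pow2_u32 (uint32_t x)
    {
      return x != 0 && (x & (x - 1)) == 0;
    }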
@@ -630,12 +608,39 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
{
args->error =
clib_error_return_unix (0,
"no RDMA devices available, errno = %d. "
"Is the ib_uverbs module loaded?", errno);
"no RDMA devices available. Is the ib_uverbs module loaded?");
goto err0;
}
for (int i = 0; i < n_devs; i++)
/* get PCI address */
s = format (0, "/sys/class/net/%s/device%c", args->ifname, 0);
if (sysfs_path_to_pci_addr ((char *) s, &pci_addr) == 0)
{
args->error =
clib_error_return (0, "cannot find PCI address for device ");
goto err1;
}
pool_get_zero (rm->devices, rd);
rd->dev_instance = rd - rm->devices;
rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
rd->name = format (0, "%s", args->name);
rd->linux_ifname = format (0, "%s", args->ifname);
rd->pci = vlib_pci_get_device_info (vm, &pci_addr, &args->error);
if (!rd->pci)
goto err2;
rd->pool = vlib_buffer_pool_get_default_for_numa (vm, rd->pci->numa_node);
if (strncmp ((char *) rd->pci->driver_name, "mlx5_core", 9))
{
args->error =
clib_error_return (0,
"invalid interface (only mlx5 supported for now)");
goto err2;
}
for (i = 0; i < n_devs; i++)
{
vlib_pci_addr_t addr;
@@ -645,7 +650,7 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
continue;
if (addr.as_u32 != rd->pci_addr.as_u32)
if (addr.as_u32 != rd->pci->addr.as_u32)
continue;
if ((rd->ctx = ibv_open_device (dev_list[i])))
@@ -654,7 +659,7 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
if ((args->error =
rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num)))
goto err1;
goto err2;
if ((args->error = rdma_register_interface (vnm, rd)))
goto err2;
@@ -675,6 +680,8 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
rdma_input_node.index);
vec_foreach_index (qid, rd->rxqs)
vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, qid, ~0);
vec_free (s);
return;
err3:
@@ -683,10 +690,9 @@ err2:
rdma_dev_cleanup (rd);
err1:
ibv_free_device_list (dev_list);
err0:
vec_free (s2);
vec_free (s);
args->rv = VNET_API_ERROR_INVALID_INTERFACE;
err0:
vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
}


@@ -59,7 +59,9 @@ format_rdma_device (u8 * s, va_list * args)
rdma_device_t *rd = vec_elt_at_index (rm->devices, i);
u32 indent = format_get_indent (s);
s = format (s, "flags: %U", format_rdma_device_flags, rd);
s = format (s, "netdev: %s\n", rd->linux_ifname);
s = format (s, "%Uflags: %U", format_white_space, indent,
format_rdma_device_flags, rd);
if (rd->error)
s = format (s, "\n%Uerror %U", format_white_space, indent,
format_clib_error, rd->error);

File diff suppressed because it is too large


@@ -28,46 +28,45 @@ static_always_inline void
rdma_device_output_free (vlib_main_t * vm, rdma_txq_t * txq)
{
struct ibv_wc wc[VLIB_FRAME_SIZE];
u32 to_free[VLIB_FRAME_SIZE];
int n_free;
int i;
u32 tail, slot;
int n;
n_free = ibv_poll_cq (txq->cq, VLIB_FRAME_SIZE, wc);
if (n_free <= 0)
n = ibv_poll_cq (txq->cq, VLIB_FRAME_SIZE, wc);
if (n <= 0)
return;
for (i = 0; i < n_free; i++)
to_free[i] = wc[i].wr_id;
vlib_buffer_free (vm, to_free, n_free);
tail = wc[n - 1].wr_id;
slot = txq->head & (txq->size - 1);
vlib_buffer_free_from_ring (vm, txq->bufs, slot, txq->size,
tail - txq->head);
txq->head = tail;
}
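Since only the last WR of each burst is signaled, one polled CQE stands for every packet posted up to it: its wr_id was set to the producer counter at post time, so everything in [head, wr_id) can be freed in one ring operation. vlib_buffer_free_from_ring is VPP's helper for that; an illustrative hand-rolled equivalent, assuming a power-of-two ring:

    #include <stdint.h>
    typedef uint32_t u32;

    /* free the [head, tail) counter range of a power-of-two ring in at
     * most two contiguous chunks (free_fn stands in for the buffer free) */
    static void
    free_ring_range (u32 *ring, u32 size, u32 head, u32 tail,
                     void (*free_fn) (u32 *indices, u32 n))
    {
      u32 n = tail - head;          /* wrap-safe u32 distance */
      u32 slot = head & (size - 1);
      u32 chunk = n < size - slot ? n : size - slot;
      if (chunk)
        free_fn (ring + slot, chunk); /* up to the physical end of the ring */
      if (n - chunk)
        free_fn (ring, n - chunk);    /* wrapped remainder from slot 0 */
    }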
VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
static_always_inline u32
rmda_device_output_tx (vlib_main_t * vm, const rdma_device_t * rd,
rdma_txq_t * txq, u32 n_left_from, u32 * bi)
{
rdma_main_t *rm = &rdma_main;
vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
rdma_device_t *rd = pool_elt_at_index (rm->devices, ord->dev_instance);
u32 thread_index = vm->thread_index;
rdma_txq_t *txq =
vec_elt_at_index (rd->txqs, thread_index % vec_len (rd->txqs));
u32 *from, *f, n_left_from;
u32 n_tx_packets, n_tx_failed;
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
struct ibv_send_wr wr[VLIB_FRAME_SIZE], *w = wr;
struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge;
int i, ret;
u32 n, slot = txq->tail & (txq->size - 1);
u32 *tx = &txq->bufs[slot];
f = from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
vlib_get_buffers (vm, from, bufs, n_left_from);
/* do not enqueue more packets than ring space */
n_left_from = clib_min (n_left_from, txq->size - (txq->tail - txq->head));
/* avoid wrap-around logic in core loop */
n = n_left_from = clib_min (n_left_from, txq->size - slot);
/* if ring is full, do nothing */
if (PREDICT_FALSE (0 == n_left_from))
return 0;
vlib_get_buffers (vm, bi, bufs, n_left_from);
memset (w, 0, n_left_from * sizeof (w[0]));
while (n_left_from >= 4)
while (n >= 4)
{
if (PREDICT_TRUE (n_left_from >= 8))
if (PREDICT_TRUE (n >= 8))
{
vlib_prefetch_buffer_header (b[4 + 0], LOAD);
vlib_prefetch_buffer_header (b[4 + 1], LOAD);
@@ -82,96 +81,126 @@ VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm,
CLIB_PREFETCH (&w[4 + 3], CLIB_CACHE_LINE_BYTES, STORE);
}
vlib_buffer_copy_indices (tx, bi, 4);
s[0].addr = vlib_buffer_get_current_va (b[0]);
s[0].length = b[0]->current_length;
s[0].lkey = rd->mr->lkey;
s[0].lkey = rd->lkey;
s[1].addr = vlib_buffer_get_current_va (b[1]);
s[1].length = b[1]->current_length;
s[1].lkey = rd->mr->lkey;
s[1].lkey = rd->lkey;
s[2].addr = vlib_buffer_get_current_va (b[2]);
s[2].length = b[2]->current_length;
s[2].lkey = rd->mr->lkey;
s[2].lkey = rd->lkey;
s[3].addr = vlib_buffer_get_current_va (b[3]);
s[3].length = b[3]->current_length;
s[3].lkey = rd->mr->lkey;
s[3].lkey = rd->lkey;
w[0].wr_id = f[0];
w[0].next = &w[0] + 1;
w[0].sg_list = &s[0];
w[0].num_sge = 1;
w[0].opcode = IBV_WR_SEND;
w[1].wr_id = f[1];
w[1].next = &w[1] + 1;
w[1].sg_list = &s[1];
w[1].num_sge = 1;
w[1].opcode = IBV_WR_SEND;
w[2].wr_id = f[2];
w[2].next = &w[2] + 1;
w[2].sg_list = &s[2];
w[2].num_sge = 1;
w[2].opcode = IBV_WR_SEND;
w[3].wr_id = f[3];
w[3].next = &w[3] + 1;
w[3].sg_list = &s[3];
w[3].num_sge = 1;
w[3].opcode = IBV_WR_SEND;
s += 4;
f += 4;
w += 4;
b += 4;
n_left_from -= 4;
bi += 4;
tx += 4;
n -= 4;
}
while (n_left_from >= 1)
while (n >= 1)
{
vlib_buffer_copy_indices (tx, bi, 1);
s[0].addr = vlib_buffer_get_current_va (b[0]);
s[0].length = b[0]->current_length;
s[0].lkey = rd->mr->lkey;
s[0].lkey = rd->lkey;
w[0].wr_id = f[0];
w[0].next = &w[0] + 1;
w[0].sg_list = &s[0];
w[0].num_sge = 1;
w[0].opcode = IBV_WR_SEND;
s += 1;
f += 1;
w += 1;
b += 1;
n_left_from -= 1;
bi += 1;
tx += 1;
n -= 1;
}
w[-1].next = 0; /* fix next pointer in WR linked-list last item */
w[-1].wr_id = txq->tail + n_left_from; /* register item to free */
w[-1].next = 0; /* fix next pointer in WR linked-list */
w[-1].send_flags = IBV_SEND_SIGNALED; /* generate a CQE so we can free buffers */
w = wr;
if (PREDICT_FALSE (0 != ibv_post_send (txq->qp, w, &w)))
n_left_from = w - wr;
txq->tail += n_left_from;
return n_left_from;
}
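The final ibv_post_send call exploits the verbs API contract: on failure, the third argument is set to the first WR that could not be posted, so the number of WRs actually accepted is that pointer minus the start of the array; passing &w reuses the cursor for this. The same idea sketched separately:

    #include <infiniband/verbs.h>

    /* returns how many of the n chained WRs the QP accepted */
    static uint32_t
    post_burst (struct ibv_qp *qp, struct ibv_send_wr *wr, uint32_t n)
    {
      struct ibv_send_wr *bad = 0;
      if (ibv_post_send (qp, wr, &bad) == 0)
        return n;                   /* all queued */
      return (uint32_t) (bad - wr); /* WRs before 'bad' were queued */
    }

Either way the tail only advances by what was actually posted, keeping the ring counters honest.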
VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
rdma_main_t *rm = &rdma_main;
vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
rdma_device_t *rd = pool_elt_at_index (rm->devices, ord->dev_instance);
u32 thread_index = vm->thread_index;
rdma_txq_t *txq =
vec_elt_at_index (rd->txqs, thread_index % vec_len (rd->txqs));
u32 *from;
u32 n_left_from;
int i;
ASSERT (txq->size >= VLIB_FRAME_SIZE && is_pow2 (txq->size));
ASSERT (txq->tail - txq->head <= txq->size);
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
clib_spinlock_lock_if_init (&txq->lock);
for (i = 0; i < 5; i++)
for (i = 0; i < 5 && n_left_from > 0; i++)
{
u32 n_enq;
rdma_device_output_free (vm, txq);
ret = ibv_post_send (txq->qp, w, &w);
if (0 == ret)
break;
n_enq = rmda_device_output_tx (vm, rd, txq, n_left_from, from);
n_left_from -= n_enq;
from += n_enq;
}
clib_spinlock_unlock_if_init (&txq->lock);
n_tx_packets = 0 == ret ? frame->n_vectors : w - wr;
n_tx_failed = frame->n_vectors - n_tx_packets;
if (PREDICT_FALSE (n_tx_failed))
if (PREDICT_FALSE (n_left_from))
{
vlib_buffer_free (vm, &from[n_tx_packets], n_tx_failed);
vlib_buffer_free (vm, from, n_left_from);
vlib_error_count (vm, node->node_index,
RDMA_TX_ERROR_NO_FREE_SLOTS, n_tx_failed);
RDMA_TX_ERROR_NO_FREE_SLOTS, n_left_from);
}
return n_tx_packets;
return frame->n_vectors - n_left_from;
}
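A txq can be shared by several workers when threads outnumber queues (thread_index % vec_len (rd->txqs) above), hence the spinlock. VPP's lock_if_init variants are no-ops on an uninitialized lock, so the common one-queue-per-worker case stays lock-free. A sketch of that setup with a hypothetical condition (the driver's exact test is not shown in this diff):

    #include <vppinfra/lock.h>

    static void
    txq_lock_setup (clib_spinlock_t * lock, u32 n_workers, u32 n_txqs)
    {
      if (n_workers > n_txqs)
        clib_spinlock_init (lock); /* shared queue: make *lock non-zero */
      /* otherwise *lock stays 0 and lock_if_init does nothing */
    }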
/*


@@ -37,40 +37,47 @@ enum
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
u32 size;
u32 n_enq;
struct ibv_cq *cq;
struct ibv_wq *wq;
u32 *bufs;
u32 size;
u32 head;
u32 tail;
} rdma_rxq_t;
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
u32 size;
u32 n_enq;
clib_spinlock_t lock;
struct ibv_cq *cq;
struct ibv_qp *qp;
clib_spinlock_t lock;
u32 *bufs;
u32 size;
u32 head;
u32 tail;
} rdma_txq_t;
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
u32 flags;
u32 per_interface_next_index;
u32 dev_instance;
u32 sw_if_index;
u32 hw_if_index;
u32 async_event_clib_file_index;
/* following fields are accessed in datapath */
rdma_rxq_t *rxqs;
rdma_txq_t *txqs;
u32 flags;
u32 per_interface_next_index;
u32 sw_if_index;
u32 hw_if_index;
u32 lkey; /* cache of mr->lkey */
u8 pool; /* buffer pool index */
/* fields below are not accessed in datapath */
vlib_pci_device_info_t *pci;
u8 *name;
u8 *linux_ifname;
mac_address_t hwaddr;
vlib_pci_addr_t pci_addr;
u32 async_event_clib_file_index;
u32 dev_instance;
struct ibv_context *ctx;
struct ibv_pd *pd;
@@ -80,7 +87,6 @@ typedef struct
struct ibv_flow *flow_ucast;
struct ibv_flow *flow_mcast;
/* error */
clib_error_t *error;
} rdma_device_t;
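head and tail in the queue structs above are free-running u32 counters: they are reduced modulo size only when indexing bufs, and plain unsigned subtraction keeps occupancy exact even across the 2^32 wrap. A self-contained check of that property:

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      uint32_t head = 0xfffffff0u;     /* counter about to wrap */
      uint32_t tail = head + 24;       /* wraps past 2^32 to 0x00000008 */
      assert (tail - head == 24);      /* occupancy still exact */
      assert ((tail & (16 - 1)) == 8); /* slot index for a size-16 ring */
      return 0;
    }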