rdma: introduce direct verb for Cx4/5 tx

Direct Verb allows for direct access to NIC HW rx/tx rings. This patch
introduces TX direct verb support for Mellanox ConnectX-4/5 adapters.
'dv' mode must be explicitly selected at interface creation to benefit
from this.

Type: feature

Change-Id: If830ba9f33db73299acdbddc68b5c09eaf6add98
Signed-off-by: Benoît Ganne <bganne@cisco.com>
This commit is contained in:
Benoît Ganne
2019-12-16 10:42:25 +01:00
committed by Damjan Marion
parent dd648aac06
commit dc812d9a71
5 changed files with 557 additions and 73 deletions
+66 -19
View File
@@ -535,7 +535,8 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
txq = vec_elt_at_index (rd->txqs, qid);
txq->size = n_desc;
ASSERT (is_pow2 (n_desc));
txq->bufs_log2sz = min_log2 (n_desc);
vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
@@ -569,6 +570,57 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
qpa.qp_state = IBV_QPS_RTS;
if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
return clib_error_return_unix (0, "Modify QP (send) Failed");
txq->ibv_cq = txq->cq;
txq->ibv_qp = txq->qp;
if (rd->flags & RDMA_DEVICE_F_MLX5DV)
{
rdma_mlx5_wqe_t *tmpl = (void *) txq->dv_wqe_tmpl;
struct mlx5dv_cq dv_cq;
struct mlx5dv_qp dv_qp;
struct mlx5dv_obj obj = { };
obj.cq.in = txq->cq;
obj.cq.out = &dv_cq;
obj.qp.in = txq->qp;
obj.qp.out = &dv_qp;
if (mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP))
return clib_error_return_unix (0, "DV init obj failed");
if (RDMA_TXQ_BUF_SZ (txq) > dv_qp.sq.wqe_cnt
|| !is_pow2 (dv_qp.sq.wqe_cnt)
|| sizeof (rdma_mlx5_wqe_t) != dv_qp.sq.stride
|| (uword) dv_qp.sq.buf % sizeof (rdma_mlx5_wqe_t))
return clib_error_return (0, "Unsupported DV SQ parameters");
if (RDMA_TXQ_BUF_SZ (txq) > dv_cq.cqe_cnt
|| !is_pow2 (dv_cq.cqe_cnt)
|| sizeof (struct mlx5_cqe64) != dv_cq.cqe_size
|| (uword) dv_cq.buf % sizeof (struct mlx5_cqe64))
return clib_error_return (0, "Unsupported DV CQ parameters");
/* get SQ and doorbell addresses */
txq->dv_sq_wqes = dv_qp.sq.buf;
txq->dv_sq_dbrec = dv_qp.dbrec;
txq->dv_sq_db = dv_qp.bf.reg;
txq->dv_sq_log2sz = min_log2 (dv_qp.sq.wqe_cnt);
/* get CQ and doorbell addresses */
txq->dv_cq_cqes = dv_cq.buf;
txq->dv_cq_dbrec = dv_cq.dbrec;
txq->dv_cq_log2sz = min_log2 (dv_cq.cqe_cnt);
/* init tx desc template */
STATIC_ASSERT_SIZEOF (txq->dv_wqe_tmpl, sizeof (*tmpl));
mlx5dv_set_ctrl_seg (&tmpl->ctrl, 0, MLX5_OPCODE_SEND, 0,
txq->qp->qp_num, 0, RDMA_MLX5_WQE_DS, 0,
RDMA_TXQ_DV_INVALID_ID);
/* FIXME: mlx5dv_set_eth_seg(&tmpl->eseg, MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM, 0, 0, 0); */
mlx5dv_set_data_seg (&tmpl->dseg, 0, rd->lkey, 0);
}
return 0;
}
@@ -587,6 +639,13 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
return clib_error_return_unix (0, "PD Alloc Failed");
if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
bm->buffer_mem_size,
IBV_ACCESS_LOCAL_WRITE)) == 0)
return clib_error_return_unix (0, "Register MR Failed");
rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
ethernet_mac_address_generate (rd->hwaddr.bytes);
if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
@@ -657,28 +716,16 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
}
if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE ||
args->rxq_size > 65535 || args->txq_size > 65535 ||
!is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
{
args->rv = VNET_API_ERROR_INVALID_VALUE;
args->error =
clib_error_return (0, "queue size must be a power of two >= %i",
VLIB_FRAME_SIZE);
args->error = clib_error_return (0, "queue size must be a power of two "
"between %i and 65535",
VLIB_FRAME_SIZE);
goto err0;
}
switch (args->mode)
{
case RDMA_MODE_AUTO:
break;
case RDMA_MODE_IBV:
break;
case RDMA_MODE_DV:
args->rv = VNET_API_ERROR_INVALID_VALUE;
args->error = clib_error_return (0, "unsupported mode");
goto err0;
break;
}
dev_list = ibv_get_device_list (&n_devs);
if (n_devs == 0)
{
@@ -762,8 +809,8 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
}
}
if ((args->error =
rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num)))
if ((args->error = rdma_dev_init (vm, rd, args->rxq_size, args->txq_size,
args->rxq_num)))
goto err2;
if ((args->error = rdma_register_interface (vnm, rd)))
+407 -45
View File
File diff suppressed because it is too large Load Diff
+69 -7
View File
@@ -39,6 +39,19 @@ enum
#undef _
};
/* One TX send WQE for the mlx5 direct-verbs datapath: a control segment,
 * an Ethernet segment and a single data segment, aligned to the basic
 * send-WQE building block (MLX5_SEND_WQE_BB).  The STATIC_ASSERT below
 * pins the layout the TX node relies on. */
typedef struct
{
CLIB_ALIGN_MARK (align0, MLX5_SEND_WQE_BB);
struct mlx5_wqe_ctrl_seg ctrl;
struct mlx5_wqe_eth_seg eseg;
struct mlx5_wqe_data_seg dseg;
} rdma_mlx5_wqe_t;
/* WQE size in bytes, and in data-segment units (the DS count programmed
 * into the control segment, see mlx5dv_set_ctrl_seg () in rdma_txq_init). */
#define RDMA_MLX5_WQE_SZ sizeof(rdma_mlx5_wqe_t)
#define RDMA_MLX5_WQE_DS (RDMA_MLX5_WQE_SZ/sizeof(struct mlx5_wqe_data_seg))
/* WQE must be exactly one basic block and a whole number of data segments */
STATIC_ASSERT (RDMA_MLX5_WQE_SZ == MLX5_SEND_WQE_BB &&
RDMA_MLX5_WQE_SZ % sizeof (struct mlx5_wqe_data_seg) == 0,
"bad size");
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
@@ -65,14 +78,60 @@ typedef struct
/* Per-TX-queue state.  The first two cachelines hold everything touched in
 * the datapath; the anonymous union selects between the plain ibverbs view
 * and the direct-verbs (mlx5dv) view of the hardware queues. */
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
/* following fields are accessed in datapath */
clib_spinlock_t lock;
union
{
struct
{
/* ibverb datapath. Cache of cq, sq below */
struct ibv_cq *ibv_cq;
struct ibv_qp *ibv_qp;
};
struct
{
/* direct verbs datapath */
rdma_mlx5_wqe_t *dv_sq_wqes;
volatile u32 *dv_sq_dbrec;
volatile u64 *dv_sq_db;
struct mlx5_cqe64 *dv_cq_cqes;
volatile u32 *dv_cq_dbrec;
};
};
u32 *bufs; /* vlib_buffer ring buffer */
u16 head;
u16 tail;
u16 dv_cq_idx; /* monotonic CQE index (valid only for direct verbs) */
u8 bufs_log2sz; /* log2 vlib_buffer entries */
u8 dv_sq_log2sz:4; /* log2 SQ WQE entries (valid only for direct verbs) */
u8 dv_cq_log2sz:4; /* log2 CQ CQE entries (valid only for direct verbs) */
STRUCT_MARK (cacheline1);
/* WQE template (valid only for direct verbs) */
u8 dv_wqe_tmpl[64];
/* end of 2nd 64-bytes cacheline (or 1st 128-bytes cacheline) */
STRUCT_MARK (cacheline2);
/* fields below are not accessed in datapath */
struct ibv_cq *cq;
struct ibv_qp *qp;
/* NOTE(review): the four members below duplicate bufs/head/tail above —
 * this looks like diff residue (old fields removed by the patch, markers
 * stripped during extraction); only one set can exist in the real file.
 * Confirm against the committed source. */
u32 *bufs;
u32 size;
u32 head;
u32 tail;
} rdma_txq_t;
/* layout guards: datapath fields must stay within the first 2 cachelines */
STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline1, 64);
STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline2, 128);
/* sentinel completion id used in the WQE template (see rdma_txq_init) */
#define RDMA_TXQ_DV_INVALID_ID 0xffffffff
/* ring sizes derived from the log2 fields above */
#define RDMA_TXQ_BUF_SZ(txq) (1U << (txq)->bufs_log2sz)
#define RDMA_TXQ_DV_SQ_SZ(txq) (1U << (txq)->dv_sq_log2sz)
#define RDMA_TXQ_DV_CQ_SZ(txq) (1U << (txq)->dv_cq_log2sz)
/* in-flight entry count; u16 arithmetic handles head/tail wraparound */
#define RDMA_TXQ_USED_SZ(head, tail) ((u16)((u16)(tail) - (u16)(head)))
/* free slots remaining in the buffer ring */
#define RDMA_TXQ_AVAIL_SZ(txq, head, tail) ((u16)(RDMA_TXQ_BUF_SZ (txq) - RDMA_TXQ_USED_SZ (head, tail)))
typedef struct
{
@@ -170,8 +229,11 @@ typedef struct
u16 cqe_flags;
} rdma_input_trace_t;
/* NOTE(review): two definitions of this macro appear here — apparent diff
 * residue (old list then new list, +/- markers stripped); presumably only
 * the second exists in the committed file.  Confirm against the source. */
#define foreach_rdma_tx_func_error \
_(NO_FREE_SLOTS, "no free tx slots")
/* per-interface TX error counters reported by the TX node */
#define foreach_rdma_tx_func_error \
_(SEGMENT_SIZE_EXCEEDED, "segment size exceeded") \
_(NO_FREE_SLOTS, "no free tx slots") \
_(SUBMISSION, "tx submission errors") \
_(COMPLETION, "tx completion errors")
typedef enum
{
@@ -181,7 +243,7 @@ typedef enum
RDMA_TX_N_ERROR,
} rdma_tx_func_error_t;
#endif /* AVF_H */
#endif /* _RDMA_H_ */
/*
* fd.io coding-style-patch-verification: ON
+7 -2
View File
@@ -44,13 +44,13 @@ vpp# set int st rdma-0 up
vpp# ping 1.1.1.100`
```
### Containers support
## Containers support
It should work in containers as long as:
- the `ib_uverbs` module is loaded
- the device nodes `/dev/infiniband/uverbs[0-9]+` are usable from the
container (but see [security considerations](#Security considerations))
### SR-IOV VFs support
## SR-IOV VFs support
It should work on SR-IOV VFs the same way it does with PFs. Because of VFs
security containment features, make sure the MAC address of the rdma VPP
interface matches the MAC address assigned to the underlying VF.
@@ -68,3 +68,8 @@ aware of the [security considerations](#Security considerations)):
```
host# ip l set dev enp94s0f0 vf 0 spoof off trust on
```
## Direct Verb mode
Direct Verb allows the driver to access the NIC HW RX/TX rings directly
instead of having to go through libibverbs and suffering the associated
overhead. It will be automatically selected if the adapter supports it.
+8
View File
@@ -111,6 +111,14 @@
#define PREDICT_FALSE(x) __builtin_expect((x),0)
#define PREDICT_TRUE(x) __builtin_expect((x),1)
/*
 * Compiler barrier
 * Prevents the compiler from reordering memory accesses across this
 * boundary and from caching values in registers (forces a reload).
 * Not to be confused with the CPU memory barrier below.
 */
#define CLIB_COMPILER_BARRIER() asm volatile ("":::"memory")
/* Full memory barrier (read and write). */
#define CLIB_MEMORY_BARRIER() __sync_synchronize ()