rdma: introduce direct verb for Cx4/5 tx
Direct Verb allows for direct access to NIC HW rx/tx rings. This patch introduces TX direct verb support for Mellanox ConnectX-4/5 adapters. 'dv' mode must be explicitly selected at interface creation to benefit from this. Type: feature Change-Id: If830ba9f33db73299acdbddc68b5c09eaf6add98 Signed-off-by: Benoît Ganne <bganne@cisco.com>
This commit is contained in:
committed by
Damjan Marion
parent
dd648aac06
commit
dc812d9a71
+66
-19
@@ -535,7 +535,8 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
|
||||
|
||||
vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
|
||||
txq = vec_elt_at_index (rd->txqs, qid);
|
||||
txq->size = n_desc;
|
||||
ASSERT (is_pow2 (n_desc));
|
||||
txq->bufs_log2sz = min_log2 (n_desc);
|
||||
vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
|
||||
|
||||
if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
|
||||
@@ -569,6 +570,57 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
|
||||
qpa.qp_state = IBV_QPS_RTS;
|
||||
if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
|
||||
return clib_error_return_unix (0, "Modify QP (send) Failed");
|
||||
|
||||
txq->ibv_cq = txq->cq;
|
||||
txq->ibv_qp = txq->qp;
|
||||
|
||||
if (rd->flags & RDMA_DEVICE_F_MLX5DV)
|
||||
{
|
||||
rdma_mlx5_wqe_t *tmpl = (void *) txq->dv_wqe_tmpl;
|
||||
struct mlx5dv_cq dv_cq;
|
||||
struct mlx5dv_qp dv_qp;
|
||||
struct mlx5dv_obj obj = { };
|
||||
|
||||
obj.cq.in = txq->cq;
|
||||
obj.cq.out = &dv_cq;
|
||||
obj.qp.in = txq->qp;
|
||||
obj.qp.out = &dv_qp;
|
||||
|
||||
if (mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP))
|
||||
return clib_error_return_unix (0, "DV init obj failed");
|
||||
|
||||
if (RDMA_TXQ_BUF_SZ (txq) > dv_qp.sq.wqe_cnt
|
||||
|| !is_pow2 (dv_qp.sq.wqe_cnt)
|
||||
|| sizeof (rdma_mlx5_wqe_t) != dv_qp.sq.stride
|
||||
|| (uword) dv_qp.sq.buf % sizeof (rdma_mlx5_wqe_t))
|
||||
return clib_error_return (0, "Unsupported DV SQ parameters");
|
||||
|
||||
if (RDMA_TXQ_BUF_SZ (txq) > dv_cq.cqe_cnt
|
||||
|| !is_pow2 (dv_cq.cqe_cnt)
|
||||
|| sizeof (struct mlx5_cqe64) != dv_cq.cqe_size
|
||||
|| (uword) dv_cq.buf % sizeof (struct mlx5_cqe64))
|
||||
return clib_error_return (0, "Unsupported DV CQ parameters");
|
||||
|
||||
/* get SQ and doorbell addresses */
|
||||
txq->dv_sq_wqes = dv_qp.sq.buf;
|
||||
txq->dv_sq_dbrec = dv_qp.dbrec;
|
||||
txq->dv_sq_db = dv_qp.bf.reg;
|
||||
txq->dv_sq_log2sz = min_log2 (dv_qp.sq.wqe_cnt);
|
||||
|
||||
/* get CQ and doorbell addresses */
|
||||
txq->dv_cq_cqes = dv_cq.buf;
|
||||
txq->dv_cq_dbrec = dv_cq.dbrec;
|
||||
txq->dv_cq_log2sz = min_log2 (dv_cq.cqe_cnt);
|
||||
|
||||
/* init tx desc template */
|
||||
STATIC_ASSERT_SIZEOF (txq->dv_wqe_tmpl, sizeof (*tmpl));
|
||||
mlx5dv_set_ctrl_seg (&tmpl->ctrl, 0, MLX5_OPCODE_SEND, 0,
|
||||
txq->qp->qp_num, 0, RDMA_MLX5_WQE_DS, 0,
|
||||
RDMA_TXQ_DV_INVALID_ID);
|
||||
/* FIXME: mlx5dv_set_eth_seg(&tmpl->eseg, MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM, 0, 0, 0); */
|
||||
mlx5dv_set_data_seg (&tmpl->dseg, 0, rd->lkey, 0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -587,6 +639,13 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
|
||||
if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
|
||||
return clib_error_return_unix (0, "PD Alloc Failed");
|
||||
|
||||
if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
|
||||
bm->buffer_mem_size,
|
||||
IBV_ACCESS_LOCAL_WRITE)) == 0)
|
||||
return clib_error_return_unix (0, "Register MR Failed");
|
||||
|
||||
rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
|
||||
|
||||
ethernet_mac_address_generate (rd->hwaddr.bytes);
|
||||
|
||||
if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
|
||||
@@ -657,28 +716,16 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
|
||||
}
|
||||
|
||||
if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE ||
|
||||
args->rxq_size > 65535 || args->txq_size > 65535 ||
|
||||
!is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
|
||||
{
|
||||
args->rv = VNET_API_ERROR_INVALID_VALUE;
|
||||
args->error =
|
||||
clib_error_return (0, "queue size must be a power of two >= %i",
|
||||
VLIB_FRAME_SIZE);
|
||||
args->error = clib_error_return (0, "queue size must be a power of two "
|
||||
"between %i and 65535",
|
||||
VLIB_FRAME_SIZE);
|
||||
goto err0;
|
||||
}
|
||||
|
||||
switch (args->mode)
|
||||
{
|
||||
case RDMA_MODE_AUTO:
|
||||
break;
|
||||
case RDMA_MODE_IBV:
|
||||
break;
|
||||
case RDMA_MODE_DV:
|
||||
args->rv = VNET_API_ERROR_INVALID_VALUE;
|
||||
args->error = clib_error_return (0, "unsupported mode");
|
||||
goto err0;
|
||||
break;
|
||||
}
|
||||
|
||||
dev_list = ibv_get_device_list (&n_devs);
|
||||
if (n_devs == 0)
|
||||
{
|
||||
@@ -762,8 +809,8 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
|
||||
}
|
||||
}
|
||||
|
||||
if ((args->error =
|
||||
rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num)))
|
||||
if ((args->error = rdma_dev_init (vm, rd, args->rxq_size, args->txq_size,
|
||||
args->rxq_num)))
|
||||
goto err2;
|
||||
|
||||
if ((args->error = rdma_register_interface (vnm, rd)))
|
||||
|
||||
+407
-45
File diff suppressed because it is too large
Load Diff
+69
-7
@@ -39,6 +39,19 @@ enum
|
||||
#undef _
|
||||
};
|
||||
|
||||
typedef struct
|
||||
{
|
||||
CLIB_ALIGN_MARK (align0, MLX5_SEND_WQE_BB);
|
||||
struct mlx5_wqe_ctrl_seg ctrl;
|
||||
struct mlx5_wqe_eth_seg eseg;
|
||||
struct mlx5_wqe_data_seg dseg;
|
||||
} rdma_mlx5_wqe_t;
|
||||
#define RDMA_MLX5_WQE_SZ sizeof(rdma_mlx5_wqe_t)
|
||||
#define RDMA_MLX5_WQE_DS (RDMA_MLX5_WQE_SZ/sizeof(struct mlx5_wqe_data_seg))
|
||||
STATIC_ASSERT (RDMA_MLX5_WQE_SZ == MLX5_SEND_WQE_BB &&
|
||||
RDMA_MLX5_WQE_SZ % sizeof (struct mlx5_wqe_data_seg) == 0,
|
||||
"bad size");
|
||||
|
||||
typedef struct
|
||||
{
|
||||
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
|
||||
@@ -65,14 +78,60 @@ typedef struct
|
||||
typedef struct
|
||||
{
|
||||
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
|
||||
|
||||
/* following fields are accessed in datapath */
|
||||
clib_spinlock_t lock;
|
||||
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
/* ibverb datapath. Cache of cq, sq below */
|
||||
struct ibv_cq *ibv_cq;
|
||||
struct ibv_qp *ibv_qp;
|
||||
};
|
||||
struct
|
||||
{
|
||||
/* direct verbs datapath */
|
||||
rdma_mlx5_wqe_t *dv_sq_wqes;
|
||||
volatile u32 *dv_sq_dbrec;
|
||||
volatile u64 *dv_sq_db;
|
||||
struct mlx5_cqe64 *dv_cq_cqes;
|
||||
volatile u32 *dv_cq_dbrec;
|
||||
};
|
||||
};
|
||||
|
||||
u32 *bufs; /* vlib_buffer ring buffer */
|
||||
u16 head;
|
||||
u16 tail;
|
||||
u16 dv_cq_idx; /* monotonic CQE index (valid only for direct verbs) */
|
||||
u8 bufs_log2sz; /* log2 vlib_buffer entries */
|
||||
u8 dv_sq_log2sz:4; /* log2 SQ WQE entries (valid only for direct verbs) */
|
||||
u8 dv_cq_log2sz:4; /* log2 CQ CQE entries (valid only for direct verbs) */
|
||||
STRUCT_MARK (cacheline1);
|
||||
|
||||
/* WQE template (valid only for direct verbs) */
|
||||
u8 dv_wqe_tmpl[64];
|
||||
|
||||
/* end of 2nd 64-bytes cacheline (or 1st 128-bytes cacheline) */
|
||||
STRUCT_MARK (cacheline2);
|
||||
|
||||
/* fields below are not accessed in datapath */
|
||||
struct ibv_cq *cq;
|
||||
struct ibv_qp *qp;
|
||||
u32 *bufs;
|
||||
u32 size;
|
||||
u32 head;
|
||||
u32 tail;
|
||||
|
||||
} rdma_txq_t;
|
||||
STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline1, 64);
|
||||
STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline2, 128);
|
||||
|
||||
#define RDMA_TXQ_DV_INVALID_ID 0xffffffff
|
||||
|
||||
#define RDMA_TXQ_BUF_SZ(txq) (1U << (txq)->bufs_log2sz)
|
||||
#define RDMA_TXQ_DV_SQ_SZ(txq) (1U << (txq)->dv_sq_log2sz)
|
||||
#define RDMA_TXQ_DV_CQ_SZ(txq) (1U << (txq)->dv_cq_log2sz)
|
||||
|
||||
#define RDMA_TXQ_USED_SZ(head, tail) ((u16)((u16)(tail) - (u16)(head)))
|
||||
#define RDMA_TXQ_AVAIL_SZ(txq, head, tail) ((u16)(RDMA_TXQ_BUF_SZ (txq) - RDMA_TXQ_USED_SZ (head, tail)))
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -170,8 +229,11 @@ typedef struct
|
||||
u16 cqe_flags;
|
||||
} rdma_input_trace_t;
|
||||
|
||||
#define foreach_rdma_tx_func_error \
|
||||
_(NO_FREE_SLOTS, "no free tx slots")
|
||||
#define foreach_rdma_tx_func_error \
|
||||
_(SEGMENT_SIZE_EXCEEDED, "segment size exceeded") \
|
||||
_(NO_FREE_SLOTS, "no free tx slots") \
|
||||
_(SUBMISSION, "tx submission errors") \
|
||||
_(COMPLETION, "tx completion errors")
|
||||
|
||||
typedef enum
|
||||
{
|
||||
@@ -181,7 +243,7 @@ typedef enum
|
||||
RDMA_TX_N_ERROR,
|
||||
} rdma_tx_func_error_t;
|
||||
|
||||
#endif /* AVF_H */
|
||||
#endif /* _RDMA_H_ */
|
||||
|
||||
/*
|
||||
* fd.io coding-style-patch-verification: ON
|
||||
|
||||
@@ -44,13 +44,13 @@ vpp# set int st rdma-0 up
|
||||
vpp# ping 1.1.1.100`
|
||||
```
|
||||
|
||||
### Containers support
|
||||
## Containers support
|
||||
It should work in containers as long as:
|
||||
- the `ib_uverbs` module is loaded
|
||||
- the device nodes `/dev/infiniband/uverbs[0-9]+` are usable from the
|
||||
container (but see [security considerations](#Security considerations))
|
||||
|
||||
### SR-IOV VFs support
|
||||
## SR-IOV VFs support
|
||||
It should work on SR-IOV VFs the same way it does with PFs. Because of VFs
|
||||
security containment features, make sure the MAC address of the rdma VPP
|
||||
interface matches the MAC address assigned to the underlying VF.
|
||||
@@ -68,3 +68,8 @@ aware of the [security considerations](#Security considerations)):
|
||||
```
|
||||
host# ip l set dev enp94s0f0 vf 0 spoof off trust on
|
||||
```
|
||||
|
||||
## Direct Verb mode
|
||||
Direct Verb allows the driver to access the NIC HW RX/TX rings directly
|
||||
instead of having to go through libibverbs and suffering the associated overhead.
|
||||
It will be automatically selected if the adapter supports it.
|
||||
|
||||
@@ -111,6 +111,14 @@
|
||||
#define PREDICT_FALSE(x) __builtin_expect((x),0)
|
||||
#define PREDICT_TRUE(x) __builtin_expect((x),1)
|
||||
|
||||
/*
|
||||
* Compiler barrier
|
||||
* prevent the compiler from reordering memory accesses across this boundary
|
||||
* prevent the compiler from caching values in registers (force reload)
|
||||
* Not to be confused with CPU memory barrier below
|
||||
*/
|
||||
#define CLIB_COMPILER_BARRIER() asm volatile ("":::"memory")
|
||||
|
||||
/* Full memory barrier (read and write). */
|
||||
#define CLIB_MEMORY_BARRIER() __sync_synchronize ()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user