rdma: implement striding rq for multiseg rx
This change leverages the striding RQ feature of ConnectX-5 adapters to support chained buffers on the RX path. In Striding RQ mode, WQEs are SG lists of data segments, each mapped to a vlib_buffer. When a packet is received, it can consume one or more data segments belonging to the WQE, without wasting the whole WQE. Change-Id: I74eba5b2c2c66538e75e046335058ba011cb27fd Type: improvement Signed-off-by: Mohammed Hawari <mohammed@hawari.fr>
This commit is contained in:

committed by
Damjan Marion

parent
91603958d1
commit
4df9f737a2
@@ -426,10 +426,14 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
|
||||
struct ibv_cq_init_attr_ex cqa = { };
|
||||
struct ibv_wq_attr wqa;
|
||||
struct ibv_cq_ex *cqex;
|
||||
struct mlx5dv_wq_init_attr dv_wqia = { };
|
||||
|
||||
vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
|
||||
rxq = vec_elt_at_index (rd->rxqs, qid);
|
||||
rxq->size = n_desc;
|
||||
rxq->log_wqe_sz = 0;
|
||||
rxq->log_stride_per_wqe = 0;
|
||||
rxq->buf_sz = vlib_buffer_get_default_data_size (vm);
|
||||
vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
|
||||
|
||||
cqa.cqe = n_desc;
|
||||
@@ -456,7 +460,54 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
|
||||
wqia.max_sge = 1;
|
||||
wqia.pd = rd->pd;
|
||||
wqia.cq = rxq->cq;
|
||||
if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0)
|
||||
if (rd->flags & RDMA_DEVICE_F_MLX5DV)
|
||||
{
|
||||
if (rd->flags & RDMA_DEVICE_F_STRIDING_RQ)
|
||||
{
|
||||
/* In STRIDING_RQ mode, map a descriptor to a stride, not a full WQE buffer */
|
||||
uword data_seg_log2_sz =
|
||||
min_log2 (vlib_buffer_get_default_data_size (vm));
|
||||
|
||||
/* The trick is also to map a descriptor to a data segment in the WQE SG list
|
||||
The number of strides per WQE and the size of a WQE (in 16-bytes words) both
|
||||
must be powers of two.
|
||||
Moreover, in striding RQ mode, WQEs must include the SRQ header, which occupies
|
||||
one 16-bytes word. That is why WQEs have 2*RDMA_RXQ_MAX_CHAIN_SZ 16-bytes words:
|
||||
- One for the SRQ Header
|
||||
- RDMA_RXQ_MAX_CHAIN_SZ for the different data segments (each mapped to
|
||||
a stride, and a vlib_buffer)
|
||||
- RDMA_RXQ_MAX_CHAIN_SZ-1 null data segments
|
||||
*/
|
||||
|
||||
wqia.max_sge = RDMA_RXQ_MAX_CHAIN_SZ;
|
||||
dv_wqia.comp_mask = MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
|
||||
dv_wqia.striding_rq_attrs.two_byte_shift_en = 0;
|
||||
dv_wqia.striding_rq_attrs.single_wqe_log_num_of_strides =
|
||||
RDMA_RXQ_MAX_CHAIN_LOG_SZ;
|
||||
dv_wqia.striding_rq_attrs.single_stride_log_num_of_bytes =
|
||||
data_seg_log2_sz;
|
||||
wqia.max_wr >>= RDMA_RXQ_MAX_CHAIN_LOG_SZ;
|
||||
rxq->log_wqe_sz = RDMA_RXQ_MAX_CHAIN_LOG_SZ + 1;
|
||||
rxq->log_stride_per_wqe = RDMA_RXQ_MAX_CHAIN_LOG_SZ;
|
||||
rxq->buf_sz = 1 << data_seg_log2_sz;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* For now, in non STRIDING_RQ mode, SG operations/chained buffers
|
||||
are not supported */
|
||||
wqia.max_sge = 1;
|
||||
}
|
||||
|
||||
if ((rxq->wq = mlx5dv_create_wq (rd->ctx, &wqia, &dv_wqia)))
|
||||
{
|
||||
rxq->wq->events_completed = 0;
|
||||
pthread_mutex_init (&rxq->wq->mutex, NULL);
|
||||
pthread_cond_init (&rxq->wq->cond, NULL);
|
||||
}
|
||||
else
|
||||
return clib_error_return_unix (0, "Create WQ Failed");
|
||||
}
|
||||
else if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0)
|
||||
return clib_error_return_unix (0, "Create WQ Failed");
|
||||
|
||||
memset (&wqa, 0, sizeof (wqa));
|
||||
@@ -471,6 +522,7 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
|
||||
struct mlx5dv_cq dv_cq;
|
||||
struct mlx5dv_rwq dv_rwq;
|
||||
u64 qw0;
|
||||
u64 qw0_nullseg;
|
||||
|
||||
obj.cq.in = rxq->cq;
|
||||
obj.cq.out = &dv_cq;
|
||||
@@ -488,16 +540,26 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
|
||||
rxq->cq_db = (volatile u32 *) dv_cq.dbrec;
|
||||
rxq->cqn = dv_cq.cqn;
|
||||
|
||||
rxq->wqes = (mlx5dv_rwq_t *) dv_rwq.buf;
|
||||
rxq->wqes = (mlx5dv_wqe_ds_t *) dv_rwq.buf;
|
||||
rxq->wq_db = (volatile u32 *) dv_rwq.dbrec;
|
||||
rxq->wq_stride = dv_rwq.stride;
|
||||
rxq->wqe_cnt = dv_rwq.wqe_cnt;
|
||||
|
||||
qw0 = clib_host_to_net_u32 (vlib_buffer_get_default_data_size (vm));
|
||||
qw0 = clib_host_to_net_u32 (rxq->buf_sz);
|
||||
qw0_nullseg = 0;
|
||||
qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;
|
||||
qw0_nullseg |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;
|
||||
|
||||
for (int i = 0; i < rxq->size; i++)
|
||||
rxq->wqes[i].dsz_and_lkey = qw0;
|
||||
/* Prefill the different 16 bytes words of the WQ. If not in striding RQ mode,
|
||||
init with qw0 only with segments of rxq->buf_sz. Otherwise, for each WQE, the
|
||||
RDMA_RXQ_MAX_CHAIN_SZ + 1 first 16-bytes words are initialised with qw0, the rest
|
||||
are null segments */
|
||||
for (int i = 0; i < rxq->wqe_cnt << rxq->log_wqe_sz; i++)
|
||||
if (!(rd->flags & RDMA_DEVICE_F_STRIDING_RQ)
|
||||
|| (i == 0) || !(((i - 1) >> rxq->log_stride_per_wqe) & 0x1))
|
||||
rxq->wqes[i].dsz_and_lkey = qw0;
|
||||
else
|
||||
rxq->wqes[i].dsz_and_lkey = qw0_nullseg;
|
||||
|
||||
for (int i = 0; i < (1 << rxq->log2_cq_size); i++)
|
||||
rxq->cqes[i].opcode_cqefmt_se_owner = 0xff;
|
||||
@@ -824,11 +886,25 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
|
||||
if (args->mode != RDMA_MODE_IBV)
|
||||
{
|
||||
struct mlx5dv_context mlx5dv_attrs = { };
|
||||
mlx5dv_attrs.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
|
||||
|
||||
if (mlx5dv_query_device (rd->ctx, &mlx5dv_attrs) == 0)
|
||||
{
|
||||
uword data_seg_log2_sz =
|
||||
min_log2 (vlib_buffer_get_default_data_size (vm));
|
||||
|
||||
if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1))
|
||||
rd->flags |= RDMA_DEVICE_F_MLX5DV;
|
||||
|
||||
if (data_seg_log2_sz <=
|
||||
mlx5dv_attrs.striding_rq_caps.max_single_stride_log_num_of_bytes
|
||||
&& data_seg_log2_sz >=
|
||||
mlx5dv_attrs.striding_rq_caps.min_single_stride_log_num_of_bytes
|
||||
&& RDMA_RXQ_MAX_CHAIN_LOG_SZ >=
|
||||
mlx5dv_attrs.striding_rq_caps.min_single_wqe_log_num_of_strides
|
||||
&& RDMA_RXQ_MAX_CHAIN_LOG_SZ <=
|
||||
mlx5dv_attrs.striding_rq_caps.max_single_wqe_log_num_of_strides)
|
||||
rd->flags |= RDMA_DEVICE_F_STRIDING_RQ;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -30,7 +30,8 @@
|
||||
_(1, ADMIN_UP, "admin-up") \
|
||||
_(2, LINK_UP, "link-up") \
|
||||
_(3, PROMISC, "promiscuous") \
|
||||
_(4, MLX5DV, "mlx5dv")
|
||||
_(4, MLX5DV, "mlx5dv") \
|
||||
_(5, STRIDING_RQ, "striding-rq")
|
||||
|
||||
enum
|
||||
{
|
||||
@@ -81,12 +82,17 @@ typedef struct
|
||||
u16 n_mini_cqes_left;
|
||||
u16 last_cqe_flags;
|
||||
mlx5dv_cqe_t *cqes;
|
||||
mlx5dv_rwq_t *wqes;
|
||||
mlx5dv_wqe_ds_t *wqes;
|
||||
CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
|
||||
volatile u32 *wq_db;
|
||||
volatile u32 *cq_db;
|
||||
u32 cqn;
|
||||
u32 wqe_cnt;
|
||||
u32 wq_stride;
|
||||
u32 buf_sz;
|
||||
u32 striding_wqe_tail;
|
||||
u8 log_wqe_sz; /* log-size of a single WQE (in data segments) */
|
||||
u8 log_stride_per_wqe; /* Striding RQ: log2 of the number of strides in a single WQE (0 when striding RQ is not used) */
|
||||
} rdma_rxq_t;
|
||||
|
||||
typedef struct
|
||||
@@ -146,7 +152,9 @@ STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline2, 128);
|
||||
|
||||
#define RDMA_TXQ_USED_SZ(head, tail) ((u16)((u16)(tail) - (u16)(head)))
|
||||
#define RDMA_TXQ_AVAIL_SZ(txq, head, tail) ((u16)(RDMA_TXQ_BUF_SZ (txq) - RDMA_TXQ_USED_SZ (head, tail)))
|
||||
|
||||
#define RDMA_RXQ_MAX_CHAIN_LOG_SZ 3 /* This should NOT be lower than 3! */
|
||||
#define RDMA_RXQ_MAX_CHAIN_SZ (1U << RDMA_RXQ_MAX_CHAIN_LOG_SZ)
|
||||
#define RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ 5
|
||||
typedef struct
|
||||
{
|
||||
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
|
||||
@@ -193,6 +201,8 @@ typedef struct
|
||||
u16x16 cqe_flags16[VLIB_FRAME_SIZE / 16];
|
||||
};
|
||||
vlib_buffer_t buffer_template;
|
||||
u32 current_segs[VLIB_FRAME_SIZE];
|
||||
u32 to_free_buffers[VLIB_FRAME_SIZE];
|
||||
} rdma_per_thread_data_t;
|
||||
|
||||
typedef struct
|
||||
|
@@ -21,7 +21,8 @@
|
||||
#undef always_inline
|
||||
#include <infiniband/mlx5dv.h>
|
||||
#define always_inline static_always_inline
|
||||
|
||||
#include <vppinfra/types.h>
|
||||
#include <vppinfra/error.h>
|
||||
/* CQE flags - bits 16-31 of qword at offset 0x1c */
|
||||
#define CQE_FLAG_L4_OK 10
|
||||
#define CQE_FLAG_L3_OK 9
|
||||
@@ -35,6 +36,11 @@
|
||||
#define CQE_FLAG_L3_HDR_TYPE_IP6 2
|
||||
#define CQE_FLAG_IP_EXT_OPTS 1
|
||||
|
||||
/* CQE byte count (Striding RQ) */
|
||||
/* Top bit (bit 31) of the 32-bit mask. The literal is unsigned because
   1 << 31 shifts into the sign bit of a 32-bit int, which is undefined
   behavior and sign-extends when the result is widened to 64 bits. */
#define CQE_BC_FILLER_MASK (1U << 31)
|
||||
#define CQE_BC_CONSUMED_STRIDES_SHIFT (16)
|
||||
#define CQE_BC_CONSUMED_STRIDES_MASK (0x3fff << CQE_BC_CONSUMED_STRIDES_SHIFT)
|
||||
#define CQE_BC_BYTE_COUNT_MASK (0xffff)
|
||||
typedef struct
|
||||
{
|
||||
struct
|
||||
@@ -47,7 +53,9 @@ typedef struct
|
||||
u32 byte_cnt;
|
||||
u32 mini_cqe_num;
|
||||
};
|
||||
u8 pad3[15];
|
||||
u8 pad3[12];
|
||||
u16 wqe_counter;
|
||||
u8 signature;
|
||||
u8 opcode_cqefmt_se_owner;
|
||||
};
|
||||
} mlx5dv_cqe_t;
|
||||
@@ -68,7 +76,15 @@ typedef struct
|
||||
{
|
||||
u64 dsz_and_lkey;
|
||||
u64 addr;
|
||||
} mlx5dv_rwq_t;
|
||||
} mlx5dv_wqe_ds_t; /* a WQE data segment */
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u8 rsvd0[2];
|
||||
u16 next_wqe_index;
|
||||
u8 signature;
|
||||
u8 rsvd1[11];
|
||||
} mlx5dv_wqe_srq_next_t;
|
||||
|
||||
#define foreach_cqe_rx_field \
|
||||
_(0x1c, 26, 26, l4_ok) \
|
||||
|
Reference in New Issue
Block a user