rdma: add Mellanox mlx5 Direct Verbs receive support

Type: feature
Change-Id: I3f287ab536a482c366ad7df47e1c04e640992ebc
Signed-off-by: Damjan Marion <damarion@cisco.com>
Author:    Damjan Marion
Date:      2020-03-12 11:56:00 +01:00
Committed: Damjan Marion
Parent:    d35887297d
Commit:    dd648aac06

8 changed files with 859 additions and 37 deletions

@@ -106,6 +106,68 @@ VLIB_CLI_COMMAND (rdma_delete_command, static) = {
};
/* *INDENT-ON* */
static clib_error_t *
test_rdma_dump_command_fn (vlib_main_t * vm, unformat_input_t * input,
vlib_cli_command_t * cmd)
{
unformat_input_t _line_input, *line_input = &_line_input;
u32 sw_if_index = ~0;
vnet_hw_interface_t *hw;
rdma_main_t *rm = &rdma_main;
rdma_device_t *rd;
vnet_main_t *vnm = vnet_get_main ();
int i;
/* Get a line of input. */
if (!unformat_user (input, unformat_line_input, line_input))
return 0;
while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
{
if (unformat (line_input, "sw_if_index %d", &sw_if_index))
;
else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
vnm, &sw_if_index))
;
else
return clib_error_return (0, "unknown input `%U'",
format_unformat_error, input);
}
unformat_free (line_input);
if (sw_if_index == ~0)
return clib_error_return (0,
"please specify interface name or sw_if_index");
hw = vnet_get_sup_hw_interface_api_visible_or_null (vnm, sw_if_index);
if (hw == NULL || rdma_device_class.index != hw->dev_class_index)
return clib_error_return (0, "not an RDMA interface");
rd = pool_elt_at_index (rm->devices, hw->dev_instance);
if ((rd->flags & RDMA_DEVICE_F_MLX5DV) == 0)
return clib_error_return (0, "not an mlx5 interface");
vlib_cli_output (vm, "netdev %v pci-addr %U lkey 0x%x",
rd->linux_ifname, format_vlib_pci_addr, &rd->pci->addr,
rd->lkey);
vec_foreach_index (i, rd->rxqs)
{
vlib_cli_output (vm, "RX queue %u\n %U\n", i, format_rdma_rxq, rd, i);
}
return 0;
}
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (test_rdma_mlx5dv_dump_command, static) = {
.path = "test rdma dump",
.short_help = "test rdma dump {<interface> | sw_if_index <sw_idx>}",
.function = test_rdma_dump_command_fn,
};
/* *INDENT-ON* */
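For reference, a hypothetical vppctl session exercising the new command; the interface name and printed values are illustrative, and the queue detail follows format_rdma_rxq (shown further down):

vpp# test rdma dump enp94s0f0
netdev enp94s0f0 pci-addr 0000:5e:00.0 lkey 0x11d4
RX queue 0
 size 1024 head 512 tail 512
  wq: stride 16 wqe-cnt 1024
  cq: cqn 1042 cqe-cnt 1024 ci 512
  next-cqe(512): ...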
clib_error_t *
rdma_cli_init (vlib_main_t * vm)
{

@@ -399,15 +399,32 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
rdma_rxq_t *rxq;
struct ibv_wq_init_attr wqia;
struct ibv_cq_init_attr_ex cqa = { };
struct ibv_wq_attr wqa;
struct ibv_cq_ex *cqex;
vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
rxq = vec_elt_at_index (rd->rxqs, qid);
rxq->size = n_desc;
vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
- if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
- return clib_error_return_unix (0, "Create CQ Failed");
cqa.cqe = n_desc;
if (rd->flags & RDMA_DEVICE_F_MLX5DV)
{
struct mlx5dv_cq_init_attr dvcq = { };
dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0)
return clib_error_return_unix (0, "Create mlx5dv rx CQ Failed");
}
else
{
if ((cqex = ibv_create_cq_ex (rd->ctx, &cqa)) == 0)
return clib_error_return_unix (0, "Create CQ Failed");
}
rxq->cq = ibv_cq_ex_to_cq (cqex);
memset (&wqia, 0, sizeof (wqia));
wqia.wq_type = IBV_WQT_RQ;
@@ -424,6 +441,44 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
if (ibv_modify_wq (rxq->wq, &wqa) != 0)
return clib_error_return_unix (0, "Modify WQ (RDY) Failed");
if (rd->flags & RDMA_DEVICE_F_MLX5DV)
{
struct mlx5dv_obj obj = { };
struct mlx5dv_cq dv_cq;
struct mlx5dv_rwq dv_rwq;
u64 qw0;
obj.cq.in = rxq->cq;
obj.cq.out = &dv_cq;
obj.rwq.in = rxq->wq;
obj.rwq.out = &dv_rwq;
if ((mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ)))
return clib_error_return_unix (0, "mlx5dv: failed to init rx obj");
if (dv_cq.cqe_size != sizeof (mlx5dv_cqe_t))
return clib_error_return_unix (0, "mlx5dv: incompatible rx CQE size");
rxq->log2_cq_size = max_log2 (dv_cq.cqe_cnt);
rxq->cqes = (mlx5dv_cqe_t *) dv_cq.buf;
rxq->cq_db = (volatile u32 *) dv_cq.dbrec;
rxq->cqn = dv_cq.cqn;
rxq->wqes = (mlx5dv_rwq_t *) dv_rwq.buf;
rxq->wq_db = (volatile u32 *) dv_rwq.dbrec;
rxq->wq_stride = dv_rwq.stride;
rxq->wqe_cnt = dv_rwq.wqe_cnt;
qw0 = clib_host_to_net_u32 (vlib_buffer_get_default_data_size (vm));
qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;
for (int i = 0; i < rxq->size; i++)
rxq->wqes[i].dsz_and_lkey = qw0;
for (int i = 0; i < (1 << rxq->log2_cq_size); i++)
rxq->cqes[i].opcode_cqefmt_se_owner = 0xff;
}
return 0;
}
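The 0xff fill of opcode_cqefmt_se_owner a few lines up follows the usual mlx5 owner-bit convention: opcode 0xf marks a CQE as never written, and hardware flips the owner bit each time it wraps the ring, so software accepts an entry only when the bit matches the phase of its own consumer index. A minimal standalone sketch of that scheme (ring size, opcode value and helper names are illustrative; the real poll loop is in input.c, whose diff is suppressed below):

#include <stdint.h>
#include <stdio.h>

#define LOG2_CQ_SIZE 3
#define CQ_SIZE (1 << LOG2_CQ_SIZE)

static uint8_t op_own[CQ_SIZE];

/* what rdma_rxq_init does: opcode = 0xf (invalid), owner = 1 */
static void
cq_init (void)
{
  for (int i = 0; i < CQ_SIZE; i++)
    op_own[i] = 0xff;
}

/* pretend hw completes the entry at producer index pi: the owner bit is
   the parity of the number of times the ring has wrapped */
static void
hw_complete (uint32_t pi)
{
  op_own[pi & (CQ_SIZE - 1)] = (0x2 << 4) | ((pi >> LOG2_CQ_SIZE) & 1);
}

/* sw poll: an entry is valid if its opcode is not 0xf and its owner bit
   matches the phase derived from the consumer index */
static int
cqe_valid (uint32_t ci)
{
  uint8_t b = op_own[ci & (CQ_SIZE - 1)];
  return (b >> 4) != 0xf && (b & 1) == ((ci >> LOG2_CQ_SIZE) & 1);
}

int
main (void)
{
  uint32_t ci = 0;
  cq_init ();
  for (uint32_t pi = 0; pi < 3; pi++)
    hw_complete (pi);
  while (cqe_valid (ci))
    ci++;
  printf ("consumed %u cqes\n", ci);	/* prints 3 */
  return 0;
}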
@@ -534,6 +589,12 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
ethernet_mac_address_generate (rd->hwaddr.bytes);
if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
bm->buffer_mem_size,
IBV_ACCESS_LOCAL_WRITE)) == 0)
return clib_error_return_unix (0, "Register MR Failed");
rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
/*
* /!\ WARNING /!\ creation order is important
* We *must* create TX queues *before* RX queues, otherwise we will receive
@@ -549,12 +610,6 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
if ((err = rdma_rxq_finalize (vm, rd)))
return err;
- if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
- bm->buffer_mem_size,
- IBV_ACCESS_LOCAL_WRITE)) == 0)
- return clib_error_return_unix (0, "Register MR Failed");
- rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
return 0;
}
@@ -687,6 +742,26 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
break;
}
if (args->mode != RDMA_MODE_IBV)
{
struct mlx5dv_context mlx5dv_attrs = { };
if (mlx5dv_query_device (rd->ctx, &mlx5dv_attrs) == 0)
{
if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1))
rd->flags |= RDMA_DEVICE_F_MLX5DV;
}
else
{
if (args->mode == RDMA_MODE_DV)
{
args->error = clib_error_return (0, "Direct Verbs mode not "
"supported on this interface");
goto err2;
}
}
}
if ((args->error =
rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num)))
goto err2;

@@ -48,6 +48,30 @@ t = format (t, "%s%s", t ? " ":"", c);
return s;
}
u8 *
format_rdma_bit_flag (u8 * s, va_list * args)
{
u64 flags = va_arg (*args, u64);
char **strs = va_arg (*args, char **);
u32 n_strs = va_arg (*args, u32);
int i = 0;
while (flags)
{
if ((flags & ((u64) 1 << i)))
{
if (i < n_strs && strs[i] != 0)
s = format (s, " %s", strs[i]);
else
s = format (s, " unknown(%u)", i);
flags ^= (u64) 1 << i;
}
i++;
}
return s;
}
u8 *
format_rdma_device (u8 * s, va_list * args)
{
@@ -56,13 +80,33 @@ format_rdma_device (u8 * s, va_list * args)
rdma_device_t *rd = vec_elt_at_index (rm->devices, i);
u32 indent = format_get_indent (s);
s = format (s, "netdev: %v\n", rd->linux_ifname);
s = format (s, "netdev %v pci-addr %U\n", rd->linux_ifname,
format_vlib_pci_addr, &rd->pci->addr);
s = format (s, "%Uflags: %U", format_white_space, indent,
format_rdma_device_flags, rd);
if (rd->error)
s = format (s, "\n%Uerror %U", format_white_space, indent,
format_clib_error, rd->error);
if (rd->flags & RDMA_DEVICE_F_MLX5DV)
{
struct mlx5dv_context c = { };
const char *str_flags[7] = { "cqe-v1", "obsolete", "mpw-allowed",
"enhanced-mpw", "cqe-128b-comp", "cqe-128b-pad",
"packet-based-credit-mode"
};
if (mlx5dv_query_device (rd->ctx, &c) != 0)
return s;
s = format (s, "\n%Umlx5: version %u", format_white_space, indent,
c.version);
s = format (s, "\n%Udevice flags: %U",
format_white_space, indent + 2,
format_rdma_bit_flag, c.flags, str_flags,
ARRAY_LEN (str_flags));
}
return s;
}
@@ -74,11 +118,133 @@ format_rdma_input_trace (u8 * s, va_list * args)
rdma_input_trace_t *t = va_arg (*args, rdma_input_trace_t *);
vnet_main_t *vnm = vnet_get_main ();
vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, t->hw_if_index);
char *l4_hdr_types[8] =
{ 0, "tcp", "udp", "tcp-empty-ack", "tcp-with-ack" };
char *l3_hdr_types[4] = { 0, "ip6", "ip4" };
u8 l3_hdr_type = CQE_FLAG_L3_HDR_TYPE (t->cqe_flags);
u8 l4_hdr_type = CQE_FLAG_L4_HDR_TYPE (t->cqe_flags);
s = format (s, "rdma: %v (%d) next-node %U",
hi->name, t->hw_if_index, format_vlib_next_node_name, vm,
node->index, t->next_index);
if (t->cqe_flags & CQE_FLAG_L2_OK)
s = format (s, " l2-ok");
if (t->cqe_flags & CQE_FLAG_L3_OK)
s = format (s, " l3-ok");
if (t->cqe_flags & CQE_FLAG_L4_OK)
s = format (s, " l4-ok");
if (t->cqe_flags & CQE_FLAG_IP_FRAG)
s = format (s, " ip-frag");
if (l3_hdr_type)
s = format (s, " %s", l3_hdr_types[l3_hdr_type]);
if (l4_hdr_type)
s = format (s, " %s", l4_hdr_types[l4_hdr_type]);
if ((t->cqe_flags & CQE_FLAG_IP_EXT_OPTS))
{
if (l3_hdr_type == CQE_FLAG_L3_HDR_TYPE_IP6)
s = format (s, " ip6-ext-hdr");
if (l3_hdr_type == CQE_FLAG_L3_HDR_TYPE_IP4)
s = format (s, " ip4-opt");
}
return s;
}
static u8 *
format_mlx5_bits (u8 * s, va_list * args)
{
void *ptr = va_arg (*args, void *);
u32 offset = va_arg (*args, u32);
u32 sb = va_arg (*args, u32);
u32 eb = va_arg (*args, u32);
if (sb == 63 && eb == 0)
{
u64 x = mlx5_get_u64 (ptr, offset);
return format (s, "0x%lx", x);
}
u32 x = mlx5_get_bits (ptr, offset, sb, eb);
s = format (s, "%d", x);
if (x > 9)
s = format (s, " (0x%x)", x);
return s;
}
static u8 *
format_mlx5_field (u8 * s, va_list * args)
{
void *ptr = va_arg (*args, void *);
u32 offset = va_arg (*args, u32);
u32 sb = va_arg (*args, u32);
u32 eb = va_arg (*args, u32);
char *name = va_arg (*args, char *);
u8 *tmp = 0;
tmp = format (0, "0x%02x %s ", offset, name);
if (sb == eb)
tmp = format (tmp, "[%u]", sb);
else
tmp = format (tmp, "[%u:%u]", sb, eb);
s = format (s, "%-45v = %U", tmp, format_mlx5_bits, ptr, offset, sb, eb);
vec_free (tmp);
return s;
}
u8 *
format_mlx5_cqe_rx (u8 * s, va_list * args)
{
void *cqe = va_arg (*args, void *);
uword indent = format_get_indent (s);
int line = 0;
#define _(a, b, c, d) \
if (mlx5_get_bits (cqe, a, b, c)) \
s = format (s, "%U%U\n", \
format_white_space, line++ ? indent : 0, \
format_mlx5_field, cqe, a, b, c, #d);
foreach_cqe_rx_field;
#undef _
return s;
}
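format_mlx5_cqe_rx leans on an x-macro: each _() entry in foreach_cqe_rx_field (defined in the new rdma_mlx5dv.h, shown further down) expands into one guarded format call, so the field list lives in exactly one place. A minimal standalone rendering of the same pattern, with a shortened, illustrative field list:

#include <stdio.h>

#define foreach_demo_field \
_(0x2c, 31, 0, byte_cnt)   \
_(0x3c, 7, 4, opcode)      \
_(0x3c, 0, 0, owner)

int
main (void)
{
/* expand _() once per (offset, msb, lsb, name) tuple */
#define _(offset, msb, lsb, name) \
  printf ("0x%02x %-10s [%u:%u]\n", offset, #name, msb, lsb);
  foreach_demo_field;
#undef _
  return 0;
}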
u8 *
format_rdma_rxq (u8 * s, va_list * args)
{
rdma_device_t *rd = va_arg (*args, rdma_device_t *);
u32 queue_index = va_arg (*args, u32);
rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, queue_index);
u32 indent = format_get_indent (s);
s = format (s, "size %u head %u tail %u", rxq->size, rxq->head, rxq->tail);
if (rd->flags & RDMA_DEVICE_F_MLX5DV)
{
u32 next_cqe_index = rxq->cq_ci & (rxq->size - 1);
s = format (s, "\n%Uwq: stride %u wqe-cnt %u",
format_white_space, indent + 2, rxq->wq_stride,
rxq->wqe_cnt);
s = format (s, "\n%Ucq: cqn %u cqe-cnt %u ci %u",
format_white_space, indent + 2, rxq->cqn,
1 << rxq->log2_cq_size, rxq->cq_ci);
s = format (s, "\n%Unext-cqe(%u):", format_white_space, indent + 4,
next_cqe_index);
s = format (s, "\n%U%U", format_white_space, indent + 6,
format_mlx5_cqe_rx, rxq->cqes + next_cqe_index);
s = format (s, "\n%U%U", format_white_space, indent + 6,
format_hexdump, rxq->cqes + next_cqe_index,
sizeof (mlx5dv_cqe_t));
}
return s;
}

File diff suppressed because it is too large

@@ -23,12 +23,14 @@
#include <vlib/pci/pci.h>
#include <vnet/interface.h>
#include <vnet/ethernet/mac_address.h>
#include <rdma/rdma_mlx5dv.h>
#define foreach_rdma_device_flags \
_(0, ERROR, "error") \
_(1, ADMIN_UP, "admin-up") \
_(2, LINK_UP, "link-up") \
- _(3, PROMISC, "promiscuous")
_(3, PROMISC, "promiscuous") \
_(4, MLX5DV, "mlx5dv")
enum
{
@@ -46,6 +48,18 @@ typedef struct
u32 size;
u32 head;
u32 tail;
u32 cq_ci;
u16 log2_cq_size;
u16 n_mini_cqes;
u16 n_mini_cqes_left;
u16 last_cqe_flags;
mlx5dv_cqe_t *cqes;
mlx5dv_rwq_t *wqes;
volatile u32 *wq_db;
volatile u32 *cq_db;
u32 cqn;
u32 wqe_cnt;
u32 wq_stride;
} rdma_rxq_t;
typedef struct
@@ -96,6 +110,12 @@ typedef struct
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
union
{
u16 cqe_flags[VLIB_FRAME_SIZE];
u16x8 cqe_flags8[VLIB_FRAME_SIZE / 8];
u16x16 cqe_flags16[VLIB_FRAME_SIZE / 16];
};
vlib_buffer_t buffer_template;
} rdma_per_thread_data_t;
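The union above gives the receive path three views of the same per-frame flag array: scalar for tracing and per-packet work, 8- or 16-wide vectors for whole-frame checks. A hedged sketch of the kind of OR-reduction this enables, using plain GCC vector extensions instead of the vppinfra types (the real consumers are presumably in the suppressed input.c diff):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t u16;
typedef u16 u16x8 __attribute__ ((vector_size (16)));

#define FRAME_SIZE 32

static union
{
  u16 cqe_flags[FRAME_SIZE];
  u16x8 cqe_flags8[FRAME_SIZE / 8];
} f;

int
main (void)
{
  /* pretend the device marked one packet as an ip fragment (bit 7) */
  f.cqe_flags[13] = 1 << 7;

  /* OR-reduce 8 lanes at a time; one short vector loop replaces 32
     scalar tests for "does any packet in this frame need attention" */
  u16x8 acc = f.cqe_flags8[0];
  for (int i = 1; i < FRAME_SIZE / 8; i++)
    acc |= f.cqe_flags8[i];

  u16 any = 0;
  for (int i = 0; i < 8; i++)
    any |= acc[i];

  printf ("frame needs per-packet handling: %s\n", any ? "yes" : "no");
  return 0;
}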
@@ -140,12 +160,14 @@ extern vnet_device_class_t rdma_device_class;
format_function_t format_rdma_device;
format_function_t format_rdma_device_name;
format_function_t format_rdma_input_trace;
format_function_t format_rdma_rxq;
unformat_function_t unformat_rdma_create_if_args;
typedef struct
{
u32 next_index;
u32 hw_if_index;
u16 cqe_flags;
} rdma_input_trace_t;
#define foreach_rdma_tx_func_error \

@@ -0,0 +1,156 @@
/*
*------------------------------------------------------------------
* Copyright (c) 2020 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*------------------------------------------------------------------
*/
#ifndef _RDMA_MLX5DV_H_
#define _RDMA_MLX5DV_H_
#undef always_inline
#include <infiniband/mlx5dv.h>
#define always_inline static_always_inline
/* CQE flags - bits 16-31 of qword at offset 0x1c */
#define CQE_FLAG_L4_OK 10
#define CQE_FLAG_L3_OK 9
#define CQE_FLAG_L2_OK 8
#define CQE_FLAG_IP_FRAG 7
#define CQE_FLAG_L4_HDR_TYPE(f) (((f) >> 4) & 7)
#define CQE_FLAG_L3_HDR_TYPE_SHIFT (2)
#define CQE_FLAG_L3_HDR_TYPE_MASK (3 << CQE_FLAG_L3_HDR_TYPE_SHIFT)
#define CQE_FLAG_L3_HDR_TYPE(f) (((f) & CQE_FLAG_L3_HDR_TYPE_MASK) >> CQE_FLAG_L3_HDR_TYPE_SHIFT)
#define CQE_FLAG_L3_HDR_TYPE_IP4 1
#define CQE_FLAG_L3_HDR_TYPE_IP6 2
#define CQE_FLAG_IP_EXT_OPTS 1
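A worked decode under these definitions: a clean TCP/IPv4 completion carries flags 0x714, i.e. l4_ok, l3_ok and l2_ok set, l4_hdr_type 1 (tcp) and l3_hdr_type 1 (ip4). A tiny self-check, with the two extractor macros restated so the snippet compiles standalone:

#include <stdio.h>

#define L4_HDR_TYPE(f) (((f) >> 4) & 7)	/* same as CQE_FLAG_L4_HDR_TYPE */
#define L3_HDR_TYPE(f) (((f) >> 2) & 3)	/* same as CQE_FLAG_L3_HDR_TYPE */

int
main (void)
{
  unsigned f = 0x714;
  printf ("l2_ok %u l3_ok %u l4_ok %u l4_type %u l3_type %u\n",
	  (f >> 8) & 1, (f >> 9) & 1, (f >> 10) & 1,
	  L4_HDR_TYPE (f), L3_HDR_TYPE (f));
  /* prints: l2_ok 1 l3_ok 1 l4_ok 1 l4_type 1 l3_type 1 */
  return 0;
}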
typedef struct
{
struct
{
u8 pad1[28];
u16 flags;
u8 pad2[14];
union
{
u32 byte_cnt;
u32 mini_cqe_num;
};
u8 pad3[15];
u8 opcode_cqefmt_se_owner;
};
} mlx5dv_cqe_t;
STATIC_ASSERT_SIZEOF (mlx5dv_cqe_t, 64);
typedef struct
{
union
{
u32 checksum;
u32 rx_hash_result;
};
u32 byte_count;
} mlx5dv_mini_cqe_t;
typedef struct
{
u64 dsz_and_lkey;
u64 addr;
} mlx5dv_rwq_t;
#define foreach_cqe_rx_field \
_(0x1c, 26, 26, l4_ok) \
_(0x1c, 25, 25, l3_ok) \
_(0x1c, 24, 24, l2_ok) \
_(0x1c, 23, 23, ip_frag) \
_(0x1c, 22, 20, l4_hdr_type) \
_(0x1c, 19, 18, l3_hdr_type) \
_(0x1c, 17, 17, ip_ext_opts) \
_(0x1c, 16, 16, cv) \
_(0x2c, 31, 0, byte_cnt) \
_(0x30, 63, 0, timestamp) \
_(0x38, 31, 24, rx_drop_counter) \
_(0x38, 23, 0, flow_tag) \
_(0x3c, 31, 16, wqe_counter) \
_(0x3c, 15, 8, signature) \
_(0x3c, 7, 4, opcode) \
_(0x3c, 3, 2, cqe_format) \
_(0x3c, 1, 1, sc) \
_(0x3c, 0, 0, owner)
/* inline functions */
static inline u32
mlx5_get_u32 (void *start, int offset)
{
return clib_net_to_host_u32 (*(u32 *) (((u8 *) start) + offset));
}
static inline u64
mlx5_get_u64 (void *start, int offset)
{
return clib_net_to_host_u64 (*(u64 *) (((u8 *) start) + offset));
}
static inline void
mlx5_set_u32 (void *start, int offset, u32 value)
{
(*(u32 *) (((u8 *) start) + offset)) = clib_host_to_net_u32 (value);
}
static inline void
mlx5_set_u64 (void *start, int offset, u64 value)
{
(*(u64 *) (((u8 *) start) + offset)) = clib_host_to_net_u64 (value);
}
static inline void
mlx5_set_bits (void *start, int offset, int first, int last, u32 value)
{
u32 mask, old;
if ((last == 0) && (first == 31))
{
mlx5_set_u32 (start, offset, value);
return;
}
mask = (1 << (first - last + 1)) - 1;
old = mlx5_get_u32 (start, offset);
ASSERT (value == (value & mask));
value &= mask;
old &= ~(mask << last);
mlx5_set_u32 (start, offset, old | value << last);
}
static inline u32
mlx5_get_bits (void *start, int offset, int first, int last)
{
u32 value = mlx5_get_u32 (start, offset);
if ((last == 0) && (first == 31))
return value;
value >>= last;
value &= (1 << (first - last + 1)) - 1;
return value;
}
#endif /* _RDMA_MLX5DV_H_ */
/*
* fd.io coding-style-patch-verification: ON
*
* Local Variables:
* eval: (c-set-style "gnu")
* End:
*/
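A standalone round-trip check of the big-endian mlx5_get_bits/mlx5_set_bits scheme above, with the clib byte-swap calls replaced by compiler builtins (assumes a little-endian host and gcc/clang; the helper names are local to this sketch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
get_u32 (void *start, int offset)
{
  uint32_t x;
  memcpy (&x, (uint8_t *) start + offset, 4);
  return __builtin_bswap32 (x);	/* device structures are big-endian */
}

static void
set_u32 (void *start, int offset, uint32_t v)
{
  v = __builtin_bswap32 (v);
  memcpy ((uint8_t *) start + offset, &v, 4);
}

static void
set_bits (void *start, int offset, int first, int last, uint32_t value)
{
  if (last == 0 && first == 31)
    {
      set_u32 (start, offset, value);
      return;
    }
  uint32_t mask = (1u << (first - last + 1)) - 1;
  uint32_t old = get_u32 (start, offset) & ~(mask << last);
  set_u32 (start, offset, old | ((value & mask) << last));
}

static uint32_t
get_bits (void *start, int offset, int first, int last)
{
  uint32_t v = get_u32 (start, offset);
  if (last == 0 && first == 31)
    return v;
  return (v >> last) & ((1u << (first - last + 1)) - 1);
}

int
main (void)
{
  uint8_t cqe[64] = { 0 };
  set_bits (cqe, 0x3c, 7, 4, 0x2);	/* opcode */
  set_bits (cqe, 0x3c, 0, 0, 1);	/* owner */
  assert (get_bits (cqe, 0x3c, 7, 4) == 0x2);
  assert (get_bits (cqe, 0x3c, 0, 0) == 1);
  printf ("op_own byte = 0x%02x\n", cqe[0x3f]);	/* 0x21 */
  return 0;
}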

@@ -132,6 +132,16 @@ _(i8x16, i64x4, epi8_epi64)
#undef _
/* *INDENT-ON* */
static_always_inline u64x4
u64x4_byte_swap (u64x4 v)
{
u8x32 swap = {
7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
};
return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}
static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{

@@ -103,6 +103,12 @@ u16x8_byte_swap (u16x8 v)
return (u16x8) vrev16q_u8 ((u8x16) v);
}
static_always_inline u32x4
u32x4_byte_swap (u32x4 v)
{
return (u32x4) vrev32q_u8 ((u8x16) v);
}
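Both byte-swap helpers are expected to reverse bytes independently within each 32-bit lane, i.e. match a per-lane scalar bswap32. A minimal cross-check of that contract against the compiler builtin (assumes gcc/clang; purely illustrative test values):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint32_t lanes[4] = { 0x11223344, 0xaabbccdd, 0x01020304, 0xdeadbeef };
  for (int i = 0; i < 4; i++)
    {
      /* each 32-bit lane is byte-reversed on its own */
      uint32_t expect = (lanes[i] >> 24) | ((lanes[i] >> 8) & 0xff00) |
	((lanes[i] << 8) & 0xff0000) | (lanes[i] << 24);
      assert (__builtin_bswap32 (lanes[i]) == expect);
      printf ("0x%08x -> 0x%08x\n", lanes[i], expect);
    }
  return 0;
}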
static_always_inline u8x16
u8x16_shuffle (u8x16 v, u8x16 m)
{