vlib: introduce DMA infrastructure

This patch introduces DMA infrastructure into vlib. It is well known
that a large amount of memory movement will drain core resources.
Nowadays more and more hardware accelerators are designed to free cores
from this burden. Meanwhile, some restrictions remain when utilizing
hardware accelerators; e.g., cross-NUMA throughput drops significantly
compared to same-node transfers. Normally the number of hardware
accelerator instances is smaller than the number of cores, and the
number of applications may even exceed the number of cores. Some
hardware may support sharing virtual addresses with cores, while other
hardware does not.

Here we introduce new DMA infrastructure which can fulfill the
requirements of vpp applications like session and memif and in the
meantime dealing with hardware limitations.

Here is some design background:

  Backend is the abstraction of a resource allocated from a DMA device
  which can perform some basic operations like configuration, DMA copy
  and result query.

  Config is the abstraction of an application's DMA requirements. An
  application needs to request a unique config index from the DMA
  infrastructure. This unique config index is associated with backend
  resources. Two options, cpu fallback and barrier before last, can be
  specified in a config. If the cpu fallback option is enabled, a DMA
  transfer will be performed by the CPU when the backend is busy. If the
  barrier before last option is enabled, DMA transfer callbacks will be
  delivered in order.

  We gather everything that a DMA transfer request needs into a
  DMA batch. It contains the pattern of DMA descriptors and function
  pointers for submission and callback. One DMA transfer request needs
  multiple batch updates and a single batch submission.

  DMA backends will be assigned to a config's worker threads equally. A
  lock will be used for thread safety if the same backend is assigned to
  multiple threads. The backend node will check all the pending requests
  in the worker thread and invoke the callback with the pointer to the
  DMA batch when the transfer completes. Applications can use the cookie
  in the DMA batch for their own purposes.

DMA architecture:

   +----------+   +----------+           +----------+   +----------+
   | Config1  |   | Config2  |           | Config1  |   | Config2  |
   +----------+   +----------+           +----------+   +----------+
        ||             ||                     ||             ||
   +-------------------------+           +-------------------------+
   |  DMA polling thread A   |           |  DMA polling thread B   |
   +-------------------------+           +-------------------------+
               ||                                     ||
           +----------+                          +----------+
           | Backend1 |                          | Backend2 |
           +----------+                          +----------+

Type: feature

Signed-off-by: Marvin Liu <yong.liu@intel.com>
Change-Id: I1725e0c26687985aac29618c9abe4f5e0de08ebf
This commit is contained in:
Marvin Liu
2022-08-17 09:38:40 +08:00
committed by Damjan Marion
parent 9a6ad01c0d
commit abd5669422
14 changed files with 1349 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright(c) 2022 Cisco Systems, Inc.
# Build the Intel DSA DMA backend plugin from its three translation units:
# dsa.c (backend/data path), format.c (format helpers), main.c (config/init).
add_vpp_plugin(dma_intel
  SOURCES
  dsa.c
  format.c
  main.c
)

424
src/plugins/dma_intel/dsa.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,160 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2022 Intel and/or its affiliates.
*/
#ifndef __dma_intel_dsa_intel_h__
#define __dma_intel_dsa_intel_h__
#include <vlib/vlib.h>
#include <vlib/dma/dma.h>
#include <vlib/pci/pci.h>
#include <vppinfra/format.h>
/* In-memory layout of one 64-byte Intel DSA hardware descriptor.
 * The exact 64-byte size is enforced by the static assert below;
 * field semantics follow the Intel DSA architecture specification. */
typedef struct
{
  u32 pasid;	  /* process address space id */
  u32 op_flags;	  /* opcode plus INTEL_DSA_FLAG_* bits */
  u64 completion; /* address the completion record is written to */
  union
  {
    void *src;	     /* source buffer (memmove/fill operations) */
    void *desc_addr; /* descriptor array address (batch operation) */
  };
  void *dst;	   /* destination buffer */
  u32 size;	   /* transfer length in bytes */
  u16 intr_handle; /* interrupt handle; presumably unused when polling —
		      TODO confirm against dsa.c */
  /* remaining 26 bytes are reserved */
  u16 __reserved[13];
} intel_dsa_desc_t;

STATIC_ASSERT_SIZEOF (intel_dsa_desc_t, 64);
/* Character-device directory used to open work queue portals, and the
 * sysfs directory used to read work queue attributes. */
#define DSA_DEV_PATH "/dev/dsa"
#define SYS_DSA_PATH "/sys/bus/dsa/devices"

/* How a work queue is exposed to software; parsed from the sysfs "type"
 * attribute in intel_dsa_get_info (). */
typedef enum
{
  INTEL_DSA_DEVICE_TYPE_UNKNOWN,
  INTEL_DSA_DEVICE_TYPE_KERNEL,
  INTEL_DSA_DEVICE_TYPE_USER,
  INTEL_DSA_DEVICE_TYPE_MDEV,
} intel_dsa_wq_type_t;

/* DSA descriptor opcodes; presumably shifted into intel_dsa_desc_t
 * op_flags by INTEL_DSA_OP_SHIFT — the consumer lives in dsa.c. */
enum dsa_ops
{
  INTEL_DSA_OP_NOP = 0,
  INTEL_DSA_OP_BATCH,
  INTEL_DSA_OP_DRAIN,
  INTEL_DSA_OP_MEMMOVE,
  INTEL_DSA_OP_FILL
};

#define INTEL_DSA_OP_SHIFT 24

/* Descriptor flag bits carried in the low bits of op_flags. */
#define INTEL_DSA_FLAG_FENCE		     (1 << 0)
#define INTEL_DSA_FLAG_BLOCK_ON_FAULT	     (1 << 1)
#define INTEL_DSA_FLAG_COMPLETION_ADDR_VALID (1 << 2)
#define INTEL_DSA_FLAG_REQUEST_COMPLETION    (1 << 3)
#define INTEL_DSA_FLAG_CACHE_CONTROL	     (1 << 8)
/* State for one DSA work queue ("channel").  A channel may be shared by
 * several threads (see intel_dsa_assign_channels in main.c); in that
 * case 'lock' serializes access. */
typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  volatile void *portal; /* portal exposed by dedicated work queue */
  u64 submitted;	 /* counters — maintained by dsa.c, not visible here */
  u64 completed;
  u64 sw_fallback;
  u32 max_transfer_size; /* maximum size of each transfer */
  u16 max_transfers;	 /* maximum number referenced in a batch */
  u16 n_threads;	 /* number of threads using this channel */
  u16 n_enq;		 /* number of batches currently enqueued */
  union
  {
    u16 wq_control;	 /* all sysfs-derived attribute bits as one word */
    struct
    {
      u16 type : 2;	    /* intel_dsa_wq_type_t from sysfs "type" */
      u16 state : 1;	    /* 1 if sysfs "state" is "enabled" */
      u16 ats_disable : 1;
      u16 block_on_fault : 1;
      u16 mode : 1;	    /* 1 if sysfs "mode" is "dedicated" */
    };
  };
  u8 lock; /* spinlock, only used if n_threads > 1 */
  u8 numa; /* numa node */
  u8 size; /* size of work queue */
  u8 did;  /* dsa device id */
  u8 qid;  /* work queue id */
} intel_dsa_channel_t;
/* One DMA transfer request.  Embeds the generic vlib_dma_batch_t as the
 * first member (offset 0 enforced below, so the vlib DMA infra and this
 * backend can cast between the two) followed by a variable-length array
 * of hardware descriptors. */
typedef struct intel_dsa_batch
{
  CLIB_CACHE_LINE_ALIGN_MARK (start);
  vlib_dma_batch_t batch; /* must be first */
  intel_dsa_channel_t *ch;
  u32 config_heap_index;
  u32 max_transfers;
  u32 config_index;
  union
  {
    struct
    {
      u32 barrier_before_last : 1; /* deliver callbacks in order */
      u32 sw_fallback : 1;	   /* CPU copy when backend is busy */
    };
    u32 features;
  };
  CLIB_CACHE_LINE_ALIGN_MARK (completion_cl);
/* values observed in 'status' below */
#define INTEL_DSA_STATUS_IDLE	     0x0
#define INTEL_DSA_STATUS_SUCCESS     0x1
#define INTEL_DSA_STATUS_BUSY	     0xa
#define INTEL_DSA_STATUS_CPU_SUCCESS 0xb
  u8 status;
  /* to avoid read-modify-write completion is written as 64-byte
   * DMA FILL operation */
  CLIB_CACHE_LINE_ALIGN_MARK (descriptors);
  intel_dsa_desc_t descs[0];
} intel_dsa_batch_t;

STATIC_ASSERT_OFFSET_OF (intel_dsa_batch_t, batch, 0);
/* Per-config state: a template batch that is cloned for each request
 * plus a free list of recycled batches. */
typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  intel_dsa_batch_t batch_template;
  u32 alloc_size;
  u32 max_transfers;
  intel_dsa_batch_t **freelist;
} intel_dsa_config_t;

/* Per-thread state. */
typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  intel_dsa_channel_t *ch; /* channel used by this thread */
  intel_dsa_batch_t **pending_batches;
} intel_dsa_thread_t;

/* Global plugin state; single instance defined in main.c. */
typedef struct
{
  /* vector of per-numa channel vectors, indexed by numa node */
  intel_dsa_channel_t ***channels;
  /* per-thread state, indexed by thread index */
  intel_dsa_thread_t *dsa_threads;
  intel_dsa_config_t *dsa_config_heap;
  uword *dsa_config_heap_handle_by_config_index;
  /* spin lock protect pmem */
  clib_spinlock_t lock;
} intel_dsa_main_t;

extern intel_dsa_main_t intel_dsa_main;
extern vlib_dma_backend_t intel_dsa_backend;
format_function_t format_intel_dsa_addr;

/* Logging helpers; the intel_dsa_log class is registered in main.c. */
#define dsa_log_debug(f, ...)                                                 \
  vlib_log (VLIB_LOG_LEVEL_DEBUG, intel_dsa_log.class, "%s: " f, __func__,    \
	    ##__VA_ARGS__)
#define dsa_log_info(f, ...)                                                  \
  vlib_log (VLIB_LOG_LEVEL_INFO, intel_dsa_log.class, "%s: " f, __func__,     \
	    ##__VA_ARGS__)
#define dsa_log_error(f, ...)                                                 \
  vlib_log (VLIB_LOG_LEVEL_ERR, intel_dsa_log.class, "%s: " f, __func__,      \
	    ##__VA_ARGS__)

#endif

View File

@@ -0,0 +1,15 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2022 Intel and/or its affiliates.
*/
#include <vlib/vlib.h>
#include <vlib/pci/pci.h>
#include <vlib/dma/dma.h>
#include <vnet/plugin/plugin.h>
#include <dma_intel/dsa_intel.h>
/* Format function rendering a DSA channel address as "wq<did>.<qid>",
 * matching the sysfs / /dev/dsa naming of the work queue. */
u8 *
format_intel_dsa_addr (u8 *s, va_list *va)
{
  intel_dsa_channel_t *channel;

  channel = va_arg (*va, intel_dsa_channel_t *);
  s = format (s, "wq%d.%d", channel->did, channel->qid);
  return s;
}

View File

@@ -0,0 +1,272 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2022 Cisco Systems, Inc.
* Copyright (c) 2022 Intel and/or its affiliates.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <vlib/vlib.h>
#include <vlib/pci/pci.h>
#include <vlib/dma/dma.h>
#include <vnet/plugin/plugin.h>
#include <vpp/app/version.h>
#include <vppinfra/linux/sysfs.h>
#include <dma_intel/dsa_intel.h>
/* Log class used by the dsa_log_* macros declared in dsa_intel.h. */
VLIB_REGISTER_LOG_CLASS (intel_dsa_log, static) = {
  .class_name = "intel_dsa",
};

/* Global plugin state (declared extern in dsa_intel.h). */
intel_dsa_main_t intel_dsa_main;
/* Distribute all discovered channels across vlib threads.  When there
 * are at least as many channels as threads, each thread gets its own
 * channel; otherwise ceil (n_threads / n_channels) threads share each
 * channel.  NOTE(review): assignment ignores numa locality — the numa
 * node is only logged, not matched; confirm this is intentional. */
void
intel_dsa_assign_channels (vlib_main_t *vm)
{
  intel_dsa_main_t *idm = &intel_dsa_main;
  intel_dsa_channel_t *ch, **chv = 0;
  u16 n_threads;
  int n;

  /* flatten the per-numa channel vectors into a single vector */
  vec_foreach_index (n, idm->channels)
    vec_append (chv, idm->channels[n]);

  vec_validate (idm->dsa_threads, vlib_get_n_threads () - 1);

  if (vec_len (chv) == 0)
    {
      dsa_log_debug ("No DSA channels found");
      goto done;
    }

  if (vec_len (chv) >= vlib_get_n_threads ())
    n_threads = 1;
  else
    /* ceil (n_threads / n_channels): threads per shared channel */
    n_threads = vlib_get_n_threads () % vec_len (chv) ?
		  vlib_get_n_threads () / vec_len (chv) + 1 :
		  vlib_get_n_threads () / vec_len (chv);

  for (int i = 0; i < vlib_get_n_threads (); i++)
    {
      vlib_main_t *tvm = vlib_get_main_by_index (i);
      /* consecutive threads map onto the same channel */
      ch = *vec_elt_at_index (chv, i / n_threads);
      idm->dsa_threads[i].ch = ch;
      ch->n_threads = n_threads;
      dsa_log_debug ("Assigning channel %u/%u to thread %u (numa %u)", ch->did,
		     ch->qid, i, tvm->numa_node);
    }

done:
  /* free */
  vec_free (chv);
}
/* Map the work queue's portal page from its /dev/dsa character device
 * into our address space.  Returns NULL on success or a clib error
 * (caller owns the error). */
static clib_error_t *
intel_dsa_map_region (intel_dsa_channel_t *ch)
{
  /* fix: 'error' was declared 'static', making the function needlessly
   * non-reentrant; it is assigned before every use, so plain automatic
   * storage is correct */
  clib_error_t *error;
  /* map one page */
  uword size = 0x1000;
  uword offset = 0;
  char path[256] = { 0 };

  snprintf (path, sizeof (path), "%s/wq%d.%d", DSA_DEV_PATH, ch->did, ch->qid);
  int fd = open (path, O_RDWR);
  if (fd < 0)
    return clib_error_return (0, "failed to open dsa device %s", path);

  ch->portal =
    clib_mem_vm_map_shared (0, size, fd, offset, "%s", (char *) path);
  if (ch->portal == CLIB_MEM_VM_MAP_FAILED)
    {
      error = clib_error_return (0, "mmap portal %s failed", path);
      close (fd);
      return error;
    }

  /* NOTE(review): fd is intentionally left open on success — presumably
   * clib_mem_vm_map_shared retains it for the mapping; confirm. */
  return NULL;
}
static clib_error_t *
intel_dsa_get_info (intel_dsa_channel_t *ch, clib_error_t **error)
{
clib_error_t *err;
u8 *tmpstr;
u8 *dev_dir_name = 0, *wq_dir_name = 0;
u8 *f = 0;
dev_dir_name = format (0, "%s/dsa%d", SYS_DSA_PATH, ch->did);
vec_reset_length (f);
f = format (f, "%v/numa_node%c", dev_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
ch->numa = atoi ((char *) tmpstr);
wq_dir_name = format (0, "%s/%U", SYS_DSA_PATH, format_intel_dsa_addr, ch);
vec_reset_length (f);
f = format (f, "%v/max_transfer_size%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
ch->max_transfer_size = atoi ((char *) tmpstr);
vec_reset_length (f);
f = format (f, "%v/max_batch_size%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
ch->max_transfers = atoi ((char *) tmpstr);
vec_reset_length (f);
f = format (f, "%v/size%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
ch->size = atoi ((char *) tmpstr);
vec_reset_length (f);
f = format (f, "%v/type%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
if (tmpstr)
{
if (!clib_strcmp ((char *) tmpstr, "enabled"))
ch->type = INTEL_DSA_DEVICE_TYPE_UNKNOWN;
else if (!clib_strcmp ((char *) tmpstr, "user"))
ch->type = INTEL_DSA_DEVICE_TYPE_USER;
else if (!clib_strcmp ((char *) tmpstr, "mdev"))
ch->type = INTEL_DSA_DEVICE_TYPE_KERNEL;
else
ch->type = INTEL_DSA_DEVICE_TYPE_UNKNOWN;
vec_free (tmpstr);
}
vec_reset_length (f);
f = format (f, "%v/state%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
if (tmpstr)
{
if (!clib_strcmp ((char *) tmpstr, "enabled"))
ch->state = 1;
else
ch->state = 0;
vec_free (tmpstr);
}
vec_reset_length (f);
f = format (f, "%v/ats_disable%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
ch->ats_disable = atoi ((char *) tmpstr);
vec_reset_length (f);
f = format (f, "%v/block_on_fault%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
ch->block_on_fault = atoi ((char *) tmpstr);
vec_reset_length (f);
f = format (f, "%v/mode%c", wq_dir_name, 0);
err = clib_sysfs_read ((char *) f, "%s", &tmpstr);
if (err)
goto error;
if (tmpstr)
{
if (!clib_strcmp ((char *) tmpstr, "dedicated"))
ch->mode = 1;
else
ch->mode = 0;
vec_free (tmpstr);
}
vec_free (f);
vec_free (dev_dir_name);
vec_free (wq_dir_name);
return NULL;
error:
vec_free (f);
vec_free (dev_dir_name);
vec_free (wq_dir_name);
return err;
}
/* Map a configured channel's portal, read its sysfs attributes, and
 * register it in the global per-numa channel vector.  On failure the
 * caller owns 'ch' and must free it (see dsa_config).  Returns NULL on
 * success or a clib error owned by the caller. */
clib_error_t *
intel_dsa_add_channel (vlib_main_t *vm, intel_dsa_channel_t *ch)
{
  intel_dsa_main_t *dm = &intel_dsa_main;
  clib_error_t *err;

  /* fix: the original discarded (leaked) the detailed error from
   * intel_dsa_map_region and returned a generic one; propagate it */
  if ((err = intel_dsa_map_region (ch)))
    return err;
  if ((err = intel_dsa_get_info (ch, &err)))
    return clib_error_return (err, "dsa info not scanned");
  vec_validate (dm->channels, ch->numa);
  vec_add1 (dm->channels[ch->numa], ch);
  return 0;
}
/* Handler for the startup.conf "dsa" section, e.g.:
 *   dsa { dev wq0.0 dev wq1.0 }
 * Registers the DMA backend and allocates one channel per "dev" stanza. */
static clib_error_t *
dsa_config (vlib_main_t *vm, unformat_input_t *input)
{
  clib_error_t *error = 0;
  intel_dsa_channel_t *ch;
  /* fix: unformat's %d conversion stores a full u32; the original
   * 'u8 did, qid' locals were overrun on the stack */
  u32 did, qid;

  if (intel_dsa_main.lock == 0)
    clib_spinlock_init (&(intel_dsa_main.lock));

  if ((error = vlib_dma_register_backend (vm, &intel_dsa_backend)))
    goto done;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "dev wq%d.%d", &did, &qid))
	{
	  ch = clib_mem_alloc_aligned (sizeof (*ch), CLIB_CACHE_LINE_BYTES);
	  clib_memset (ch, 0, sizeof (*ch));
	  ch->did = did;
	  ch->qid = qid;
	  /* on failure the channel was not stored anywhere; release it */
	  if (intel_dsa_add_channel (vm, ch))
	    clib_mem_free (ch);
	}
      else if (unformat_skip_white_space (input))
	;
      else
	{
	  error = clib_error_return (0, "unknown input `%U'",
				     format_unformat_error, input);
	  goto done;
	}
    }

done:
  return error;
}

VLIB_CONFIG_FUNCTION (dsa_config, "dsa");
/* Callback invoked by vlib when the worker thread count changes;
 * re-balances the channel-to-thread assignment. */
clib_error_t *
intel_dsa_num_workers_change (vlib_main_t *vm)
{
  intel_dsa_assign_channels (vm);
  return NULL;
}

VLIB_NUM_WORKERS_CHANGE_FN (intel_dsa_num_workers_change);
/* Register this shared object with the vpp plugin framework. */
VLIB_PLUGIN_REGISTER () = {
  .version = VPP_BUILD_VER,
  .description = "Intel DSA Backend",
};