[PATCH v1 1/1] vhost: integrate dmadev in asynchronous datapath
    Xia, Chenbo 
    chenbo.xia at intel.com
       
    Wed Jan 19 03:18:45 CET 2022
    
    
  
> -----Original Message-----
> From: Hu, Jiayu <jiayu.hu at intel.com>
> Sent: Monday, January 17, 2022 1:40 PM
> To: Xia, Chenbo <chenbo.xia at intel.com>; dev at dpdk.org
> Cc: maxime.coquelin at redhat.com; i.maximets at ovn.org; Richardson, Bruce
> <bruce.richardson at intel.com>; Van Haaren, Harry <harry.van.haaren at intel.com>;
> Pai G, Sunil <sunil.pai.g at intel.com>; Mcnamara, John <john.mcnamara at intel.com>;
> Ding, Xuan <xuan.ding at intel.com>; Jiang, Cheng1 <cheng1.jiang at intel.com>;
> liangma at liangbit.com
> Subject: RE: [PATCH v1 1/1] vhost: integrate dmadev in asynchronous datapath
> 
> Hi Chenbo,
> 
> Please see replies inline.
> 
> Thanks,
> Jiayu
> 
> > -----Original Message-----
> > From: Xia, Chenbo <chenbo.xia at intel.com>
> > > diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> > > index 33d023aa39..44073499bc 100644
> > > --- a/examples/vhost/main.c
> > > +++ b/examples/vhost/main.c
> > > @@ -24,8 +24,9 @@
> > >  #include <rte_ip.h>
> > >  #include <rte_tcp.h>
> > >  #include <rte_pause.h>
> > > +#include <rte_dmadev.h>
> > > +#include <rte_vhost_async.h>
> > >
> > > -#include "ioat.h"
> > >  #include "main.h"
> > >
> > >  #ifndef MAX_QUEUES
> > > @@ -56,6 +57,14 @@
> > >  #define RTE_TEST_TX_DESC_DEFAULT 512
> > >
> > >  #define INVALID_PORT_ID 0xFF
> > > +#define INVALID_DMA_ID -1
> > > +
> > > +#define MAX_VHOST_DEVICE 1024
> > > +#define DMA_RING_SIZE 4096
> > > +
> > > +struct dma_for_vhost dma_bind[MAX_VHOST_DEVICE];
> > > +struct rte_vhost_async_dma_info
> > dma_config[RTE_DMADEV_DEFAULT_MAX];
> > > +static int dma_count;
> > >
> > >  /* mask of enabled ports */
> > >  static uint32_t enabled_port_mask = 0;
> > > @@ -96,8 +105,6 @@ static int builtin_net_driver;
> > >
> > >  static int async_vhost_driver;
> > >
> > > -static char *dma_type;
> > > -
> > >  /* Specify timeout (in useconds) between retries on RX. */
> > >  static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
> > >  /* Specify the number of retries on RX. */
> > > @@ -196,13 +203,134 @@ struct vhost_bufftable
> > *vhost_txbuff[RTE_MAX_LCORE *
> > > MAX_VHOST_DEVICE];
> > >  #define MBUF_TABLE_DRAIN_TSC((rte_get_tsc_hz() + US_PER_S - 1) \
> > >   / US_PER_S * BURST_TX_DRAIN_US)
> > >
> > > +static inline bool
> > > +is_dma_configured(int16_t dev_id)
> > > +{
> > > +int i;
> > > +
> > > +for (i = 0; i < dma_count; i++) {
> > > +if (dma_config[i].dev_id == dev_id) {
> > > +return true;
> > > +}
> > > +}
> > > +return false;
> > > +}
> > > +
> > >  static inline int
> > >  open_dma(const char *value)
> > >  {
> > > -if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
> > > -return open_ioat(value);
> > > +struct dma_for_vhost *dma_info = dma_bind;
> > > +char *input = strndup(value, strlen(value) + 1);
> > > +char *addrs = input;
> > > +char *ptrs[2];
> > > +char *start, *end, *substr;
> > > +int64_t vid, vring_id;
> > > +
> > > +struct rte_dma_info info;
> > > +struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > > +struct rte_dma_vchan_conf qconf = {
> > > +.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > > +.nb_desc = DMA_RING_SIZE
> > > +};
> > > +
> > > +int dev_id;
> > > +int ret = 0;
> > > +uint16_t i = 0;
> > > +char *dma_arg[MAX_VHOST_DEVICE];
> > > +int args_nr;
> > > +
> > > +while (isblank(*addrs))
> > > +addrs++;
> > > +if (*addrs == '\0') {
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +/* process DMA devices within bracket. */
> > > +addrs++;
> > > +substr = strtok(addrs, ";]");
> > > +if (!substr) {
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +args_nr = rte_strsplit(substr, strlen(substr),
> > > +dma_arg, MAX_VHOST_DEVICE, ',');
> > > +if (args_nr <= 0) {
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +while (i < args_nr) {
> > > +char *arg_temp = dma_arg[i];
> > > +uint8_t sub_nr;
> > > +
> > > +sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
> > > +if (sub_nr != 2) {
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +start = strstr(ptrs[0], "txd");
> > > +if (start == NULL) {
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +start += 3;
> > > +vid = strtol(start, &end, 0);
> > > +if (end == start) {
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +vring_id = 0 + VIRTIO_RXQ;
> >
> > No need to introduce vring_id, it's always VIRTIO_RXQ
> 
> I will remove it later.
> 
> >
> > > +
> > > +dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
> > > +if (dev_id < 0) {
> > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to find
> > DMA %s.\n",
> > > ptrs[1]);
> > > +ret = -1;
> > > +goto out;
> > > +} else if (is_dma_configured(dev_id)) {
> > > +goto done;
> > > +}
> > > +
> >
> > Please call rte_dma_info_get before configure to make sure
> > info.max_vchans >=1
> 
> Do you suggest to use "rte_dma_info_get() and info.max_vchans=0" to indicate
> the device is not configured, rather than using is_dma_configure()?
No, I mean when you configure the dmadev with one vchan, make sure it does have
at least one vchanl, even the 'vchan == 0' case can hardly happen. 
Just like the function call sequence in test_dmadev_instance, test_dmadev.c.
> 
> >
> > > +if (rte_dma_configure(dev_id, &dev_config) != 0) {
> > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure
> > DMA %d.\n",
> > > dev_id);
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
> > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up
> > DMA %d.\n",
> > > dev_id);
> > > +ret = -1;
> > > +goto out;
> > > +}
> > >
> > > -return -1;
> > > +rte_dma_info_get(dev_id, &info);
> > > +if (info.nb_vchans != 1) {
> > > +RTE_LOG(ERR, VHOST_CONFIG, "DMA %d has no
> > queues.\n",
> > > dev_id);
> >
> > Then the above means the number of vchan is not configured.
> >
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +if (rte_dma_start(dev_id) != 0) {
> > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to start
> > DMA %u.\n",
> > > dev_id);
> > > +ret = -1;
> > > +goto out;
> > > +}
> > > +
> > > +dma_config[dma_count].dev_id = dev_id;
> > > +dma_config[dma_count].max_vchans = 1;
> > > +dma_config[dma_count++].max_desc = DMA_RING_SIZE;
> > > +
> > > +done:
> > > +(dma_info + vid)->dmas[vring_id].dev_id = dev_id;
> > > +i++;
> > > +}
> > > +out:
> > > +free(input);
> > > +return ret;
> > >  }
> > >
> > >  /*
> > > @@ -500,8 +628,6 @@ enum {
> > >  OPT_CLIENT_NUM,
> > >  #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
> > >  OPT_BUILTIN_NET_DRIVER_NUM,
> > > -#define OPT_DMA_TYPE            "dma-type"
> > > -OPT_DMA_TYPE_NUM,
> > >  #define OPT_DMAS                "dmas"
> > >  OPT_DMAS_NUM,
> > >  };
> > > @@ -539,8 +665,6 @@ us_vhost_parse_args(int argc, char **argv)
> > >  NULL, OPT_CLIENT_NUM},
> > >  {OPT_BUILTIN_NET_DRIVER, no_argument,
> > >  NULL, OPT_BUILTIN_NET_DRIVER_NUM},
> > > -{OPT_DMA_TYPE, required_argument,
> > > -NULL, OPT_DMA_TYPE_NUM},
> > >  {OPT_DMAS, required_argument,
> > >  NULL, OPT_DMAS_NUM},
> > >  {NULL, 0, 0, 0},
> > > @@ -661,10 +785,6 @@ us_vhost_parse_args(int argc, char **argv)
> > >  }
> > >  break;
> > >
> > > -case OPT_DMA_TYPE_NUM:
> > > -dma_type = optarg;
> > > -break;
> > > -
> > >  case OPT_DMAS_NUM:
> > >  if (open_dma(optarg) == -1) {
> > >  RTE_LOG(INFO, VHOST_CONFIG,
> > > @@ -841,9 +961,10 @@ complete_async_pkts(struct vhost_dev *vdev)
> > >  {
> > >  struct rte_mbuf *p_cpl[MAX_PKT_BURST];
> > >  uint16_t complete_count;
> > > +int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
> > >
> > >  complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
> > > -VIRTIO_RXQ, p_cpl,
> > MAX_PKT_BURST);
> > > +VIRTIO_RXQ, p_cpl, MAX_PKT_BURST,
> > dma_id, 0);
> > >  if (complete_count) {
> > >  free_pkts(p_cpl, complete_count);
> > >  __atomic_sub_fetch(&vdev->pkts_inflight, complete_count,
> > > __ATOMIC_SEQ_CST);
> > > @@ -883,11 +1004,12 @@ drain_vhost(struct vhost_dev *vdev)
> > >
> > >  if (builtin_net_driver) {
> > >  ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
> > > -} else if (async_vhost_driver) {
> > > +} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
> > >  uint16_t enqueue_fail = 0;
> > > +int16_t dma_id = dma_bind[vdev-
> > >vid].dmas[VIRTIO_RXQ].dev_id;
> > >
> > >  complete_async_pkts(vdev);
> > > -ret = rte_vhost_submit_enqueue_burst(vdev->vid,
> > VIRTIO_RXQ, m,
> > > nr_xmit);
> > > +ret = rte_vhost_submit_enqueue_burst(vdev->vid,
> > VIRTIO_RXQ, m,
> > > nr_xmit, dma_id, 0);
> > >  __atomic_add_fetch(&vdev->pkts_inflight, ret,
> > __ATOMIC_SEQ_CST);
> > >
> > >  enqueue_fail = nr_xmit - ret;
> > > @@ -905,7 +1027,7 @@ drain_vhost(struct vhost_dev *vdev)
> > >  __ATOMIC_SEQ_CST);
> > >  }
> > >
> > > -if (!async_vhost_driver)
> > > +if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
> > >  free_pkts(m, nr_xmit);
> > >  }
> > >
> > > @@ -1211,12 +1333,13 @@ drain_eth_rx(struct vhost_dev *vdev)
> > >  if (builtin_net_driver) {
> > >  enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
> > >  pkts, rx_count);
> > > -} else if (async_vhost_driver) {
> > > +} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
> > >  uint16_t enqueue_fail = 0;
> > > +int16_t dma_id = dma_bind[vdev-
> > >vid].dmas[VIRTIO_RXQ].dev_id;
> > >
> > >  complete_async_pkts(vdev);
> > >  enqueue_count = rte_vhost_submit_enqueue_burst(vdev-
> > >vid,
> > > -VIRTIO_RXQ, pkts, rx_count);
> > > +VIRTIO_RXQ, pkts, rx_count, dma_id,
> > 0);
> > >  __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count,
> > > __ATOMIC_SEQ_CST);
> > >
> > >  enqueue_fail = rx_count - enqueue_count;
> > > @@ -1235,7 +1358,7 @@ drain_eth_rx(struct vhost_dev *vdev)
> > >  __ATOMIC_SEQ_CST);
> > >  }
> > >
> > > -if (!async_vhost_driver)
> > > +if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
> > >  free_pkts(pkts, rx_count);
> > >  }
> > >
> > > @@ -1387,18 +1510,20 @@ destroy_device(int vid)
> > >  "(%d) device has been removed from data core\n",
> > >  vdev->vid);
> > >
> > > -if (async_vhost_driver) {
> > > +if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
> > >  uint16_t n_pkt = 0;
> > > +int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
> > >  struct rte_mbuf *m_cpl[vdev->pkts_inflight];
> > >
> > >  while (vdev->pkts_inflight) {
> > >  n_pkt = rte_vhost_clear_queue_thread_unsafe(vid,
> > VIRTIO_RXQ,
> > > -m_cpl, vdev->pkts_inflight);
> > > +m_cpl, vdev->pkts_inflight,
> > dma_id, 0);
> > >  free_pkts(m_cpl, n_pkt);
> > >  __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt,
> > > __ATOMIC_SEQ_CST);
> > >  }
> > >
> > >  rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
> > > +dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
> > >  }
> > >
> > >  rte_free(vdev);
> > > @@ -1468,20 +1593,14 @@ new_device(int vid)
> > >  "(%d) device has been added to data core %d\n",
> > >  vid, vdev->coreid);
> > >
> > > -if (async_vhost_driver) {
> > > -struct rte_vhost_async_config config = {0};
> > > -struct rte_vhost_async_channel_ops channel_ops;
> > > -
> > > -if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
> > > -channel_ops.transfer_data = ioat_transfer_data_cb;
> > > -channel_ops.check_completed_copies =
> > > -ioat_check_completed_copies_cb;
> > > -
> > > -config.features = RTE_VHOST_ASYNC_INORDER;
> > > +if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
> > > +int ret;
> > >
> > > -return rte_vhost_async_channel_register(vid,
> > VIRTIO_RXQ,
> > > -config, &channel_ops);
> > > +ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
> > > +if (ret == 0) {
> > > +dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled =
> > true;
> > >  }
> > > +return ret;
> > >  }
> > >
> > >  return 0;
> > > @@ -1502,14 +1621,15 @@ vring_state_changed(int vid, uint16_t
> > queue_id, int
> > > enable)
> > >  if (queue_id != VIRTIO_RXQ)
> > >  return 0;
> > >
> > > -if (async_vhost_driver) {
> > > +if (dma_bind[vid].dmas[queue_id].async_enabled) {
> > >  if (!enable) {
> > >  uint16_t n_pkt = 0;
> > > +int16_t dma_id =
> > dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
> > >  struct rte_mbuf *m_cpl[vdev->pkts_inflight];
> > >
> > >  while (vdev->pkts_inflight) {
> > >  n_pkt =
> > rte_vhost_clear_queue_thread_unsafe(vid,
> > > queue_id,
> > > -m_cpl, vdev-
> > >pkts_inflight);
> > > +m_cpl, vdev-
> > >pkts_inflight, dma_id,
> > > 0);
> > >  free_pkts(m_cpl, n_pkt);
> > >  __atomic_sub_fetch(&vdev->pkts_inflight,
> > n_pkt,
> > > __ATOMIC_SEQ_CST);
> > >  }
> > > @@ -1657,6 +1777,25 @@ create_mbuf_pool(uint16_t nr_port, uint32_t
> > > nr_switch_core, uint32_t mbuf_size,
> > >  rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
> > >  }
> > >
> > > +static void
> > > +init_dma(void)
> > > +{
> > > +int i;
> > > +
> > > +for (i = 0; i < MAX_VHOST_DEVICE; i++) {
> > > +int j;
> > > +
> > > +for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
> > > +dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
> > > +dma_bind[i].dmas[j].async_enabled = false;
> > > +}
> > > +}
> > > +
> > > +for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++) {
> > > +dma_config[i].dev_id = INVALID_DMA_ID;
> > > +}
> > > +}
> > > +
> > >  /*
> > >   * Main function, does initialisation and calls the per-lcore functions.
> > >   */
> > > @@ -1679,6 +1818,9 @@ main(int argc, char *argv[])
> > >  argc -= ret;
> > >  argv += ret;
> > >
> > > +/* initialize dma structures */
> > > +init_dma();
> > > +
> > >  /* parse app arguments */
> > >  ret = us_vhost_parse_args(argc, argv);
> > >  if (ret < 0)
> > > @@ -1754,6 +1896,20 @@ main(int argc, char *argv[])
> > >  if (client_mode)
> > >  flags |= RTE_VHOST_USER_CLIENT;
> > >
> > > +if (async_vhost_driver) {
> > > +if (rte_vhost_async_dma_configure(dma_config, dma_count)
> > < 0) {
> > > +RTE_LOG(ERR, VHOST_PORT, "Failed to configure
> > DMA in
> > > vhost.\n");
> > > +for (i = 0; i < dma_count; i++) {
> > > +if (dma_config[i].dev_id != INVALID_DMA_ID)
> > {
> > > +rte_dma_stop(dma_config[i].dev_id);
> > > +dma_config[i].dev_id =
> > INVALID_DMA_ID;
> > > +}
> > > +}
> > > +dma_count = 0;
> > > +async_vhost_driver = false;
> > > +}
> > > +}
> > > +
> > >  /* Register vhost user driver to handle vhost messages. */
> > >  for (i = 0; i < nb_sockets; i++) {
> > >  char *file = socket_files + i * PATH_MAX;
> > > diff --git a/examples/vhost/main.h b/examples/vhost/main.h
> > > index e7b1ac60a6..b4a453e77e 100644
> > > --- a/examples/vhost/main.h
> > > +++ b/examples/vhost/main.h
> > > @@ -8,6 +8,7 @@
> > >  #include <sys/queue.h>
> > >
> > >  #include <rte_ether.h>
> > > +#include <rte_pci.h>
> > >
> > >  /* Macros for printing using RTE_LOG */
> > >  #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
> > > @@ -79,6 +80,16 @@ struct lcore_info {
> > >  struct vhost_dev_tailq_list vdev_list;
> > >  };
> > >
> > > +struct dma_info {
> > > +struct rte_pci_addr addr;
> > > +int16_t dev_id;
> > > +bool async_enabled;
> > > +};
> > > +
> > > +struct dma_for_vhost {
> > > +struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
> > > +};
> > > +
> > >  /* we implement non-extra virtio net features */
> > >  #define VIRTIO_NET_FEATURES0
> > >
> > > diff --git a/examples/vhost/meson.build b/examples/vhost/meson.build
> > > index 3efd5e6540..87a637f83f 100644
> > > --- a/examples/vhost/meson.build
> > > +++ b/examples/vhost/meson.build
> > > @@ -12,13 +12,9 @@ if not is_linux
> > >  endif
> > >
> > >  deps += 'vhost'
> > > +deps += 'dmadev'
> > >  allow_experimental_apis = true
> > >  sources = files(
> > >          'main.c',
> > >          'virtio_net.c',
> > >  )
> > > -
> > > -if dpdk_conf.has('RTE_RAW_IOAT')
> > > -    deps += 'raw_ioat'
> > > -    sources += files('ioat.c')
> > > -endif
> > > diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
> > > index cdb37a4814..8107329400 100644
> > > --- a/lib/vhost/meson.build
> > > +++ b/lib/vhost/meson.build
> > > @@ -33,7 +33,8 @@ headers = files(
> > >          'rte_vhost_async.h',
> > >          'rte_vhost_crypto.h',
> > >  )
> > > +
> > >  driver_sdk_headers = files(
> > >          'vdpa_driver.h',
> > >  )
> > > -deps += ['ethdev', 'cryptodev', 'hash', 'pci']
> > > +deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
> > > diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h
> > > index a87ea6ba37..23a7a2d8b3 100644
> > > --- a/lib/vhost/rte_vhost_async.h
> > > +++ b/lib/vhost/rte_vhost_async.h
> > > @@ -27,70 +27,12 @@ struct rte_vhost_iov_iter {
> > >  };
> > >
> > >  /**
> > > - * dma transfer status
> > > + * DMA device information
> > >   */
> > > -struct rte_vhost_async_status {
> > > -/** An array of application specific data for source memory */
> > > -uintptr_t *src_opaque_data;
> > > -/** An array of application specific data for destination memory */
> > > -uintptr_t *dst_opaque_data;
> > > -};
> > > -
> > > -/**
> > > - * dma operation callbacks to be implemented by applications
> > > - */
> > > -struct rte_vhost_async_channel_ops {
> > > -/**
> > > - * instruct async engines to perform copies for a batch of packets
> > > - *
> > > - * @param vid
> > > - *  id of vhost device to perform data copies
> > > - * @param queue_id
> > > - *  queue id to perform data copies
> > > - * @param iov_iter
> > > - *  an array of IOV iterators
> > > - * @param opaque_data
> > > - *  opaque data pair sending to DMA engine
> > > - * @param count
> > > - *  number of elements in the "descs" array
> > > - * @return
> > > - *  number of IOV iterators processed, negative value means error
> > > - */
> > > -int32_t (*transfer_data)(int vid, uint16_t queue_id,
> > > -struct rte_vhost_iov_iter *iov_iter,
> > > -struct rte_vhost_async_status *opaque_data,
> > > -uint16_t count);
> > > -/**
> > > - * check copy-completed packets from the async engine
> > > - * @param vid
> > > - *  id of vhost device to check copy completion
> > > - * @param queue_id
> > > - *  queue id to check copy completion
> > > - * @param opaque_data
> > > - *  buffer to receive the opaque data pair from DMA engine
> > > - * @param max_packets
> > > - *  max number of packets could be completed
> > > - * @return
> > > - *  number of async descs completed, negative value means error
> > > - */
> > > -int32_t (*check_completed_copies)(int vid, uint16_t queue_id,
> > > -struct rte_vhost_async_status *opaque_data,
> > > -uint16_t max_packets);
> > > -};
> > > -
> > > -/**
> > > - *  async channel features
> > > - */
> > > -enum {
> > > -RTE_VHOST_ASYNC_INORDER = 1U << 0,
> > > -};
> > > -
> > > -/**
> > > - *  async channel configuration
> > > - */
> > > -struct rte_vhost_async_config {
> > > -uint32_t features;
> > > -uint32_t rsvd[2];
> > > +struct rte_vhost_async_dma_info {
> > > +int16_t dev_id;/* DMA device ID */
> > > +uint16_t max_vchans;/* max number of vchan */
> > > +uint16_t max_desc;/* max desc number of vchan */
> > >  };
> > >
> > >  /**
> > > @@ -100,17 +42,11 @@ struct rte_vhost_async_config {
> > >   *  vhost device id async channel to be attached to
> > >   * @param queue_id
> > >   *  vhost queue id async channel to be attached to
> > > - * @param config
> > > - *  Async channel configuration structure
> > > - * @param ops
> > > - *  Async channel operation callbacks
> > >   * @return
> > >   *  0 on success, -1 on failures
> > >   */
> > >  __rte_experimental
> > > -int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> > > -struct rte_vhost_async_config config,
> > > -struct rte_vhost_async_channel_ops *ops);
> > > +int rte_vhost_async_channel_register(int vid, uint16_t queue_id);
> > >
> > >  /**
> > >   * Unregister an async channel for a vhost queue
> > > @@ -136,17 +72,11 @@ int rte_vhost_async_channel_unregister(int vid,
> > uint16_t
> > > queue_id);
> > >   *  vhost device id async channel to be attached to
> > >   * @param queue_id
> > >   *  vhost queue id async channel to be attached to
> > > - * @param config
> > > - *  Async channel configuration
> > > - * @param ops
> > > - *  Async channel operation callbacks
> > >   * @return
> > >   *  0 on success, -1 on failures
> > >   */
> > >  __rte_experimental
> > > -int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t
> > queue_id,
> > > -struct rte_vhost_async_config config,
> > > -struct rte_vhost_async_channel_ops *ops);
> > > +int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t
> > > queue_id);
> > >
> > >  /**
> > >   * Unregister an async channel for a vhost queue without performing any
> > > @@ -179,12 +109,17 @@ int
> > rte_vhost_async_channel_unregister_thread_unsafe(int
> > > vid,
> > >   *  array of packets to be enqueued
> > >   * @param count
> > >   *  packets num to be enqueued
> > > + * @param dma_id
> > > + *  the identifier of the DMA device
> > > + * @param vchan
> > > + *  the identifier of virtual DMA channel
> > >   * @return
> > >   *  num of packets enqueued
> > >   */
> > >  __rte_experimental
> > >  uint16_t rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
> > > -struct rte_mbuf **pkts, uint16_t count);
> > > +struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
> > > +uint16_t vchan);
> >
> > All dma_id in the API should be uint16_t. Otherwise you need to check if
> valid.
> 
> Yes, you are right. Although dma_id is defined as int16_t and DMA library
> checks
> if it is valid, vhost doesn't handle DMA failure and we need to make sure
> dma_id
> is valid before using it. And even if vhost handles DMA error, a better place
> to check
> invalid dma_id is before passing it to DMA library too. I will add the check
> later.
> 
> >
> > >
> > >  /**
> > >   * This function checks async completion status for a specific vhost
> > > @@ -199,12 +134,17 @@ uint16_t rte_vhost_submit_enqueue_burst(int
> > vid,
> > > uint16_t queue_id,
> > >   *  blank array to get return packet pointer
> > >   * @param count
> > >   *  size of the packet array
> > > + * @param dma_id
> > > + *  the identifier of the DMA device
> > > + * @param vchan
> > > + *  the identifier of virtual DMA channel
> > >   * @return
> > >   *  num of packets returned
> > >   */
> > >  __rte_experimental
> > >  uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
> > > -struct rte_mbuf **pkts, uint16_t count);
> > > +struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
> > > +uint16_t vchan);
> > >
> > >  /**
> > >   * This function returns the amount of in-flight packets for the vhost
> > > @@ -235,11 +175,32 @@ int rte_vhost_async_get_inflight(int vid, uint16_t
> > > queue_id);
> > >   *  Blank array to get return packet pointer
> > >   * @param count
> > >   *  Size of the packet array
> > > + * @param dma_id
> > > + *  the identifier of the DMA device
> > > + * @param vchan
> > > + *  the identifier of virtual DMA channel
> > >   * @return
> > >   *  Number of packets returned
> > >   */
> > >  __rte_experimental
> > >  uint16_t rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
> > > -struct rte_mbuf **pkts, uint16_t count);
> > > +struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
> > > +uint16_t vchan);
> > > +/**
> > > + * The DMA vChannels used in asynchronous data path must be
> > configured
> > > + * first. So this function needs to be called before enabling DMA
> > > + * acceleration for vring. If this function fails, asynchronous data path
> > > + * cannot be enabled for any vring further.
> > > + *
> > > + * @param dmas
> > > + *  DMA information
> > > + * @param count
> > > + *  Element number of 'dmas'
> > > + * @return
> > > + *  0 on success, and -1 on failure
> > > + */
> > > +__rte_experimental
> > > +int rte_vhost_async_dma_configure(struct rte_vhost_async_dma_info
> > *dmas,
> > > +uint16_t count);
> >
> > I think based on current design, vhost can use every vchan if user app let
> it.
> > So the max_desc and max_vchans can just be got from dmadev APIs? Then
> > there's
> > no need to introduce the new ABI struct rte_vhost_async_dma_info.
> 
> Yes, no need to introduce struct rte_vhost_async_dma_info. We can either use
> struct rte_dma_info which is suggested by Maxime, or query from dma library
> via device id. Since dma device configuration is left to applications, I
> prefer to
> use rte_dma_info directly. How do you think?
If you only use rte_dma_info as input param, you will also need to call dmadev
API to get dmadev ID in rte_vhost_async_dma_configure (Or you add both rte_dma_info
and dmadev ID). So I suggest to only use dmadev ID as input.
> 
> >
> > And about max_desc, I see the dmadev lib, you can get vchan's max_desc
> > but you
> > may use a nb_desc (<= max_desc) to configure vchanl. And IIUC, vhost wants
> > to
> > know the nb_desc instead of max_desc?
> 
> True, nb_desc is better than max_desc. But dma library doesn’t provide
> function
> to query nb_desc for every vchannel. And rte_dma_info cannot be used in
> rte_vhost_async_dma_configure(), if vhost uses nb_desc. So the only way is
> to require users to provide nb_desc for every vchannel, and it will introduce
> a new struct. Is it really needed?
> 
Since now dmadev lib does not provide a way to query real nb_desc for a vchanl,
so I think we can just use max_desc.
But ideally, if dmadev lib provides such a way, the configured nb_desc and nb_vchanl
should be used to configure vhost lib.
@Bruce, should you add such a way in dmadev lib? As users now do not know the real
configured nb_desc of vchanl.
> >
> > >
> > >  #endif /* _RTE_VHOST_ASYNC_H_ */
> > > diff --git a/lib/vhost/version.map b/lib/vhost/version.map
> > > index a7ef7f1976..1202ba9c1a 100644
> > > --- a/lib/vhost/version.map
> > > +++ b/lib/vhost/version.map
> > > @@ -84,6 +84,9 @@ EXPERIMENTAL {
> > >
> > >  # added in 21.11
> > >  rte_vhost_get_monitor_addr;
> > > +
> > > +# added in 22.03
> > > +rte_vhost_async_dma_configure;
> > >  };
> > >
> > >  INTERNAL {
> > > diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
> > > index 13a9bb9dd1..32f37f4851 100644
> > > --- a/lib/vhost/vhost.c
> > > +++ b/lib/vhost/vhost.c
> > > @@ -344,6 +344,7 @@ vhost_free_async_mem(struct vhost_virtqueue *vq)
> > >  return;
> > >
> > >  rte_free(vq->async->pkts_info);
> > > +rte_free(vq->async->pkts_cmpl_flag);
> > >
> > >  rte_free(vq->async->buffers_packed);
> > >  vq->async->buffers_packed = NULL;
> > > @@ -1626,8 +1627,7 @@ rte_vhost_extern_callback_register(int vid,
> > >  }
> > >
> > >  static __rte_always_inline int
> > > -async_channel_register(int vid, uint16_t queue_id,
> > > -struct rte_vhost_async_channel_ops *ops)
> > > +async_channel_register(int vid, uint16_t queue_id)
> > >  {
> > >  struct virtio_net *dev = get_device(vid);
> > >  struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
> > > @@ -1656,6 +1656,14 @@ async_channel_register(int vid, uint16_t
> > queue_id,
> > >  goto out_free_async;
> > >  }
> > >
> > > +async->pkts_cmpl_flag = rte_zmalloc_socket(NULL, vq->size *
> > sizeof(bool),
> > > +RTE_CACHE_LINE_SIZE, node);
> > > +if (!async->pkts_cmpl_flag) {
> > > +VHOST_LOG_CONFIG(ERR, "failed to allocate async
> > pkts_cmpl_flag
> > > (vid %d, qid: %d)\n",
> > > +vid, queue_id);
> >
> > qid: %u
> >
> > > +goto out_free_async;
> > > +}
> > > +
> > >  if (vq_is_packed(dev)) {
> > >  async->buffers_packed = rte_malloc_socket(NULL,
> > >  vq->size * sizeof(struct
> > vring_used_elem_packed),
> > > @@ -1676,9 +1684,6 @@ async_channel_register(int vid, uint16_t
> > queue_id,
> > >  }
> > >  }
> > >
> > > -async->ops.check_completed_copies = ops-
> > >check_completed_copies;
> > > -async->ops.transfer_data = ops->transfer_data;
> > > -
> > >  vq->async = async;
> > >
> > >  return 0;
> > > @@ -1691,15 +1696,13 @@ async_channel_register(int vid, uint16_t
> > queue_id,
> > >  }
> > >
> > >  int
> > > -rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> > > -struct rte_vhost_async_config config,
> > > -struct rte_vhost_async_channel_ops *ops)
> > > +rte_vhost_async_channel_register(int vid, uint16_t queue_id)
> > >  {
> > >  struct vhost_virtqueue *vq;
> > >  struct virtio_net *dev = get_device(vid);
> > >  int ret;
> > >
> > > -if (dev == NULL || ops == NULL)
> > > +if (dev == NULL)
> > >  return -1;
> > >
> > >  if (queue_id >= VHOST_MAX_VRING)
> > > @@ -1710,33 +1713,20 @@ rte_vhost_async_channel_register(int vid,
> > uint16_t
> > > queue_id,
> > >  if (unlikely(vq == NULL || !dev->async_copy))
> > >  return -1;
> > >
> > > -if (unlikely(!(config.features & RTE_VHOST_ASYNC_INORDER))) {
> > > -VHOST_LOG_CONFIG(ERR,
> > > -"async copy is not supported on non-inorder mode "
> > > -"(vid %d, qid: %d)\n", vid, queue_id);
> > > -return -1;
> > > -}
> > > -
> > > -if (unlikely(ops->check_completed_copies == NULL ||
> > > -ops->transfer_data == NULL))
> > > -return -1;
> > > -
> > >  rte_spinlock_lock(&vq->access_lock);
> > > -ret = async_channel_register(vid, queue_id, ops);
> > > +ret = async_channel_register(vid, queue_id);
> > >  rte_spinlock_unlock(&vq->access_lock);
> > >
> > >  return ret;
> > >  }
> > >
> > >  int
> > > -rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t
> > queue_id,
> > > -struct rte_vhost_async_config config,
> > > -struct rte_vhost_async_channel_ops *ops)
> > > +rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t
> > queue_id)
> > >  {
> > >  struct vhost_virtqueue *vq;
> > >  struct virtio_net *dev = get_device(vid);
> > >
> > > -if (dev == NULL || ops == NULL)
> > > +if (dev == NULL)
> > >  return -1;
> > >
> > >  if (queue_id >= VHOST_MAX_VRING)
> > > @@ -1747,18 +1737,7 @@
> > rte_vhost_async_channel_register_thread_unsafe(int vid,
> > > uint16_t queue_id,
> > >  if (unlikely(vq == NULL || !dev->async_copy))
> > >  return -1;
> > >
> > > -if (unlikely(!(config.features & RTE_VHOST_ASYNC_INORDER))) {
> > > -VHOST_LOG_CONFIG(ERR,
> > > -"async copy is not supported on non-inorder mode "
> > > -"(vid %d, qid: %d)\n", vid, queue_id);
> > > -return -1;
> > > -}
> > > -
> > > -if (unlikely(ops->check_completed_copies == NULL ||
> > > -ops->transfer_data == NULL))
> > > -return -1;
> > > -
> > > -return async_channel_register(vid, queue_id, ops);
> > > +return async_channel_register(vid, queue_id);
> > >  }
> > >
> > >  int
> > > @@ -1835,6 +1814,83 @@
> > rte_vhost_async_channel_unregister_thread_unsafe(int
> > > vid, uint16_t queue_id)
> > >  return 0;
> > >  }
> > >
> > > +static __rte_always_inline void
> > > +vhost_free_async_dma_mem(void)
> > > +{
> > > +uint16_t i;
> > > +
> > > +for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++) {
> > > +struct async_dma_info *dma = &dma_copy_track[i];
> > > +int16_t j;
> > > +
> > > +if (dma->max_vchans == 0) {
> > > +continue;
> > > +}
> > > +
> > > +for (j = 0; j < dma->max_vchans; j++) {
> > > +rte_free(dma->vchans[j].metadata);
> > > +}
> > > +rte_free(dma->vchans);
> > > +dma->vchans = NULL;
> > > +dma->max_vchans = 0;
> > > +}
> > > +}
> > > +
> > > +int
> > > +rte_vhost_async_dma_configure(struct rte_vhost_async_dma_info *dmas,
> > uint16_t
> > > count)
> > > +{
> > > +uint16_t i;
> > > +
> > > +if (!dmas) {
> > > +VHOST_LOG_CONFIG(ERR, "Invalid DMA configuration
> > parameter.\n");
> > > +return -1;
> > > +}
> > > +
> > > +for (i = 0; i < count; i++) {
> > > +struct async_dma_vchan_info *vchans;
> > > +int16_t dev_id;
> > > +uint16_t max_vchans;
> > > +uint16_t max_desc;
> > > +uint16_t j;
> > > +
> > > +dev_id = dmas[i].dev_id;
> > > +max_vchans = dmas[i].max_vchans;
> > > +max_desc = dmas[i].max_desc;
> > > +
> > > +if (!rte_is_power_of_2(max_desc)) {
> > > +max_desc = rte_align32pow2(max_desc);
> > > +}
> >
> > I think when aligning to power of 2, it should exceed not max_desc?
> 
> Aligned max_desc is used to allocate context tracking array. We only need
> to guarantee the size of the array for every vchannel is >= max_desc. So it's
> OK to have greater array size than max_desc.
> 
> > And based on above comment, if this max_desc is nb_desc configured for
> > vchanl, you should just make sure the nb_desc be power-of-2.
> >
> > > +
> > > +vchans = rte_zmalloc(NULL, sizeof(struct
> > async_dma_vchan_info) *
> > > max_vchans,
> > > +RTE_CACHE_LINE_SIZE);
> > > +if (vchans == NULL) {
> > > +VHOST_LOG_CONFIG(ERR, "Failed to allocate vchans
> > for dma-
> > > %d."
> > > +" Cannot enable async data-path.\n",
> > dev_id);
> > > +vhost_free_async_dma_mem();
> > > +return -1;
> > > +}
> > > +
> > > +for (j = 0; j < max_vchans; j++) {
> > > +vchans[j].metadata = rte_zmalloc(NULL, sizeof(bool *)
> > *
> > > max_desc,
> > > +RTE_CACHE_LINE_SIZE);
> > > +if (!vchans[j].metadata) {
> > > +VHOST_LOG_CONFIG(ERR, "Failed to allocate
> > metadata for
> > > "
> > > +"dma-%d vchan-%u\n",
> > dev_id, j);
> > > +vhost_free_async_dma_mem();
> > > +return -1;
> > > +}
> > > +
> > > +vchans[j].ring_size = max_desc;
> > > +vchans[j].ring_mask = max_desc - 1;
> > > +}
> > > +
> > > +dma_copy_track[dev_id].vchans = vchans;
> > > +dma_copy_track[dev_id].max_vchans = max_vchans;
> > > +}
> > > +
> > > +return 0;
> > > +}
> > > +
> > >  int
> > >  rte_vhost_async_get_inflight(int vid, uint16_t queue_id)
> > >  {
> > > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> > > index 7085e0885c..d9bda34e11 100644
> > > --- a/lib/vhost/vhost.h
> > > +++ b/lib/vhost/vhost.h
> > > @@ -19,6 +19,7 @@
> > >  #include <rte_ether.h>
> > >  #include <rte_rwlock.h>
> > >  #include <rte_malloc.h>
> > > +#include <rte_dmadev.h>
> > >
> > >  #include "rte_vhost.h"
> > >  #include "rte_vdpa.h"
> > > @@ -50,6 +51,7 @@
> > >
> > >  #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST)
> > >  #define VHOST_MAX_ASYNC_VEC 2048
> > > +#define VHOST_ASYNC_DMA_BATCHING_SIZE 32
> > >
> > >  #define PACKED_DESC_ENQUEUE_USED_FLAG(w)\
> > >  ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED |
> > VRING_DESC_F_WRITE) : \
> > > @@ -119,6 +121,41 @@ struct vring_used_elem_packed {
> > >  uint32_t count;
> > >  };
> > >
> > > +struct async_dma_vchan_info {
> > > +/* circular array to track copy metadata */
> > > +bool **metadata;
> >
> > If the metadata will only be flags, maybe just use some
> > name called XXX_flag
> 
> Sure, I will rename it.
> 
> >
> > > +
> > > +/* max elements in 'metadata' */
> > > +uint16_t ring_size;
> > > +/* ring index mask for 'metadata' */
> > > +uint16_t ring_mask;
> > > +
> > > +/* batching copies before a DMA doorbell */
> > > +uint16_t nr_batching;
> > > +
> > > +/**
> > > + * DMA virtual channel lock. Although it is able to bind DMA
> > > + * virtual channels to data plane threads, vhost control plane
> > > + * thread could call data plane functions too, thus causing
> > > + * DMA device contention.
> > > + *
> > > + * For example, in VM exit case, vhost control plane thread needs
> > > + * to clear in-flight packets before disable vring, but there could
> > > + * be anotther data plane thread is enqueuing packets to the same
> > > + * vring with the same DMA virtual channel. But dmadev PMD
> > functions
> > > + * are lock-free, so the control plane and data plane threads
> > > + * could operate the same DMA virtual channel at the same time.
> > > + */
> > > +rte_spinlock_t dma_lock;
> > > +};
> > > +
> > > +struct async_dma_info {
> > > +uint16_t max_vchans;
> > > +struct async_dma_vchan_info *vchans;
> > > +};
> > > +
> > > +extern struct async_dma_info
> > dma_copy_track[RTE_DMADEV_DEFAULT_MAX];
> > > +
> > >  /**
> > >   * inflight async packet information
> > >   */
> > > @@ -129,9 +166,6 @@ struct async_inflight_info {
> > >  };
> > >
> > >  struct vhost_async {
> > > -/* operation callbacks for DMA */
> > > -struct rte_vhost_async_channel_ops ops;
> > > -
> > >  struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
> > >  struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
> > >  uint16_t iter_idx;
> > > @@ -139,6 +173,19 @@ struct vhost_async {
> > >
> > >  /* data transfer status */
> > >  struct async_inflight_info *pkts_info;
> > > +/**
> > > + * packet reorder array. "true" indicates that DMA
> > > + * device completes all copies for the packet.
> > > + *
> > > + * Note that this array could be written by multiple
> > > + * threads at the same time. For example, two threads
> > > + * enqueue packets to the same virtqueue with their
> > > + * own DMA devices. However, since offloading is
> > > + * per-packet basis, each packet flag will only be
> > > + * written by one thread. And single byte write is
> > > + * atomic, so no lock is needed.
> > > + */
> > > +bool *pkts_cmpl_flag;
> > >  uint16_t pkts_idx;
> > >  uint16_t pkts_inflight_n;
> > >  union {
> > > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> > > index b3d954aab4..9f81fc9733 100644
> > > --- a/lib/vhost/virtio_net.c
> > > +++ b/lib/vhost/virtio_net.c
> > > @@ -11,6 +11,7 @@
> > >  #include <rte_net.h>
> > >  #include <rte_ether.h>
> > >  #include <rte_ip.h>
> > > +#include <rte_dmadev.h>
> > >  #include <rte_vhost.h>
> > >  #include <rte_tcp.h>
> > >  #include <rte_udp.h>
> > > @@ -25,6 +26,9 @@
> > >
> > >  #define MAX_BATCH_LEN 256
> > >
> > > +/* DMA device copy operation tracking array. */
> > > +struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX];
> > > +
> > >  static  __rte_always_inline bool
> > >  rxvq_is_mergeable(struct virtio_net *dev)
> > >  {
> > > @@ -43,6 +47,108 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx,
> > uint32_t
> > > nr_vring)
> > >  return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
> > >  }
> > >
> > > +static __rte_always_inline uint16_t
> > > +vhost_async_dma_transfer(struct vhost_virtqueue *vq, int16_t dma_id,
> > > +uint16_t vchan, uint16_t head_idx,
> > > +struct rte_vhost_iov_iter *pkts, uint16_t nr_pkts)
> > > +{
> > > +struct async_dma_vchan_info *dma_info =
> > > &dma_copy_track[dma_id].vchans[vchan];
> > > +uint16_t ring_mask = dma_info->ring_mask;
> > > +uint16_t pkt_idx;
> > > +
> > > +rte_spinlock_lock(&dma_info->dma_lock);
> > > +
> > > +for (pkt_idx = 0; pkt_idx < nr_pkts; pkt_idx++) {
> > > +struct rte_vhost_iovec *iov = pkts[pkt_idx].iov;
> > > +int copy_idx = 0;
> > > +uint16_t nr_segs = pkts[pkt_idx].nr_segs;
> > > +uint16_t i;
> > > +
> > > +if (rte_dma_burst_capacity(dma_id, vchan) < nr_segs) {
> > > +goto out;
> > > +}
> > > +
> > > +for (i = 0; i < nr_segs; i++) {
> > > +/**
> > > + * We have checked the available space before
> > submit copies
> > > to DMA
> > > + * vChannel, so we don't handle error here.
> > > + */
> > > +copy_idx = rte_dma_copy(dma_id, vchan,
> > > (rte_iova_t)iov[i].src_addr,
> > > +(rte_iova_t)iov[i].dst_addr, iov[i].len,
> > > +RTE_DMA_OP_FLAG_LLC);
> >
> > This assumes rte_dma_copy will always succeed if there's available space.
> >
> > But the API doxygen says:
> >
> > * @return
> >  *   - 0..UINT16_MAX: index of enqueued job.
> >  *   - -ENOSPC: if no space left to enqueue.
> >  *   - other values < 0 on failure.
> >
> > So it should consider other vendor-specific errors.
> 
> Error handling is not free here. Specifically, SW fallback is a way to handle
> failed
> copy operations. But it requires vhost to track VA for every source and
> destination
> buffer for every copy. DMA library uses IOVA, so vhost only prepares IOVA for
> copies of
> every packet in async data-path. In the case of IOVA as PA, the prepared IOVAs
> cannot
> be used as SW fallback, which means vhost needs to store VA for every copy of
> every
> packet too, even if there no errors will happen or IOVA is VA.
> 
> I am thinking that the only usable DMA engines in vhost are CBDMA and DSA, is
> it worth
> the cost for "future HW"? If there will be other vendor's HW in future, is it
> OK to add the
> support later? Or is there any way to get VA from IOVA?
Let's investigate how much performance drop the error handling will bring and see...
Thanks,
Chenbo
> 
> Thanks,
> Jiayu
> >
> > Thanks,
> > Chenbo
> >
> >
> 
    
    
More information about the dev
mailing list