[dpdk-dev] [PATCH v3] vhost: support inflight share memory protocol feature

lin li chuckylinchuckylin at gmail.com
Mon Apr 29 06:07:05 CEST 2019


Tiwei Bie <tiwei.bie at intel.com> 于2019年4月28日周日 下午7:18写道:
>
> On Fri, Apr 26, 2019 at 05:40:21AM -0400, Li Lin wrote:
> > From: lilin24 <lilin24 at baidu.com>
> >
> > This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> > and VHOST_USER_SET_INFLIGHT_FD to support transferring a shared
> > buffer between qemu and backend.
> >
> > Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the
> > shared buffer from backend. Then qemu should send it back
> > through VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
> >
> > This shared buffer is used to process inflight I/O when backend
> > reconnect.
> >
> > Signed-off-by: lilin24 <lilin24 at baidu.com>
>
> You need to use your real name here.
>
> > Signed-off-by: Ni Xun <nixun at baidu.com>
> > Signed-off-by: Zhang Yu <zhangyu31 at baidu.com>
> >
> > v2:
> > - add set&clr inflight entry function
> > v3:
> > - simplified some function implementations
> >
> > ---
>
> You can put change logs here (i.e. after ---).
>
> >  lib/librte_vhost/rte_vhost.h  |  53 ++++++++++
> >  lib/librte_vhost/vhost.c      |  42 ++++++++
> >  lib/librte_vhost/vhost.h      |  11 ++
> >  lib/librte_vhost/vhost_user.c | 231 ++++++++++++++++++++++++++++++++++++++++++
> >  lib/librte_vhost/vhost_user.h |  16 ++-
> >  5 files changed, 351 insertions(+), 2 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> > index d2c0c93f4..bc25591a8 100644
> > --- a/lib/librte_vhost/rte_vhost.h
> > +++ b/lib/librte_vhost/rte_vhost.h
> > @@ -71,6 +71,10 @@ extern "C" {
> >  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> >  #endif
> >
> > +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> > +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
> > +#endif
> > +
> >  /** Indicate whether protocol features negotiation is supported. */
> >  #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> >  #define VHOST_USER_F_PROTOCOL_FEATURES       30
> > @@ -98,12 +102,26 @@ struct rte_vhost_memory {
> >       struct rte_vhost_mem_region regions[];
> >  };
> >
> > +typedef struct VhostUserInflightEntry {
> > +     uint8_t inflight;
> > +} VhostUserInflightEntry;
> > +
> > +typedef struct VhostInflightInfo {
> > +     uint16_t version;
> > +     uint16_t last_inflight_io;
> > +     uint16_t used_idx;
> > +     VhostUserInflightEntry desc[0];
> > +} VhostInflightInfo;
>
> Is there any details on above structure? Why does it not match
> QueueRegionSplit or QueueRegionPacked structures described in
> qemu/docs/interop/vhost-user.txt?

QEMU has its own vhost-user backend, and QEMU submits the I/O in it.
The DPDK implementation is more general: it only marks inflight entries.
The submission of inflight entries is handed over to the different backends,
such as SPDK, each of which has its own way of handling it.
So there are some differences in the data structures.

>
> > +
> >  struct rte_vhost_vring {
> >       struct vring_desc       *desc;
> >       struct vring_avail      *avail;
> >       struct vring_used       *used;
> >       uint64_t                log_guest_addr;
> >
> > +     VhostInflightInfo       *inflight;
> > +     int                inflight_flag;
> > +
> >       /** Deprecated, use rte_vhost_vring_call() instead. */
> >       int                     callfd;
> >
> > @@ -632,6 +650,41 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> >  int rte_vhost_vring_call(int vid, uint16_t vring_idx);
> >
> >  /**
> > + * set inflight flag for a entry.
> > + *
> > + * @param vring
> > + *  the structure to hold the requested vring info
> > + * @param idx
> > + *  inflight entry index
> > + */
> > +void rte_vhost_set_inflight(struct rte_vhost_vring *vring,
> > +             uint16_t idx);
> > +
> > +/**
> > + * clear inflight flag for a entry.
> > + *
> > + * @param vring
> > + *  the structure to hold the requested vring info
> > + * @param last_used_idx
> > + *  next free used_idx
> > + * @param idx
> > + *  inflight entry index
> > + */
> > +void rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> > +             uint16_t last_used_idx, uint16_t idx);
> > +
> > +/**
> > + * set last inflight io index.
> > + *
> > + * @param vring
> > + *  the structure to hold the requested vring info
> > + * @param idx
> > + *  inflight entry index
> > + */
> > +void rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring,
> > +             uint16_t idx);
>
> New APIs should be experimental and also need to be
> added in rte_vhost_version.map file.
>
> > +
> > +/**
> >   * Get vhost RX queue avail count.
> >   *
> >   * @param vid
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > index 163f4595e..9ba692935 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -76,6 +76,8 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> >               close(vq->callfd);
> >       if (vq->kickfd >= 0)
> >               close(vq->kickfd);
> > +     if (vq->inflight)
> > +             vq->inflight = NULL;
> >  }
> >
> >  /*
> > @@ -589,6 +591,11 @@ rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
> >       vring->kickfd  = vq->kickfd;
> >       vring->size    = vq->size;
> >
> > +     vring->inflight = vq->inflight;
> > +
> > +     vring->inflight_flag = vq->inflight_flag;
> > +     vq->inflight_flag = 0;
>
> This will break the ABI. Better to introduce a new API to do this.
>
> > +
> >       return 0;
> >  }
> >
> > @@ -617,6 +624,41 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> >       return 0;
> >  }
> >
> > +void
> > +rte_vhost_set_inflight(struct rte_vhost_vring *vring, uint16_t idx)
> > +{
> > +     VhostInflightInfo *inflight = vring->inflight;
> > +     if (unlikely(!inflight))
> > +             return;
> > +     inflight->desc[idx].inflight = 1;
> > +}
> > +
> > +void
> > +rte_vhost_clr_inflight(struct rte_vhost_vring *vring,
> > +     uint16_t last_used_idx, uint16_t idx)
> > +{
> > +     VhostInflightInfo *inflight = vring->inflight;
> > +
> > +     if (unlikely(!inflight))
> > +             return;
> > +
> > +     rte_compiler_barrier();
> > +     inflight->desc[idx].inflight = 0;
> > +     rte_compiler_barrier();
> > +     inflight->used_idx = last_used_idx;
> > +}
> > +
> > +void
> > +rte_vhost_set_last_inflight_io(struct rte_vhost_vring *vring, uint16_t idx)
> > +{
> > +     VhostInflightInfo *inflight = vring->inflight;
> > +
> > +     if (unlikely(!inflight))
> > +             return;
> > +
> > +     inflight->last_inflight_io = idx;
> > +}
>
> The rte_vhost_vring is used to return information to apps.
>
> IIUC, you want to update vq->inflight. So the rte_vhost_vring
> param should be replaced by vid + vring_idx.
>
> > +
> >  uint16_t
> >  rte_vhost_avail_entries(int vid, uint16_t queue_id)
> >  {
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index e9138dfab..537d74c71 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -128,6 +128,10 @@ struct vhost_virtqueue {
> >       /* Physical address of used ring, for logging */
> >       uint64_t                log_guest_addr;
> >
> > +     /* Inflight share memory info */
> > +     VhostInflightInfo       *inflight;
> > +     bool                inflight_flag;
> > +
> >       uint16_t                nr_zmbuf;
> >       uint16_t                zmbuf_size;
> >       uint16_t                last_zmbuf_idx;
> > @@ -286,6 +290,12 @@ struct guest_page {
> >       uint64_t size;
> >  };
> >
> > +typedef struct VuDevInflightInfo {
> > +     int fd;
> > +     void *addr;
> > +     uint64_t size;
> > +} VuDevInflightInfo;
>
> Please follow DPDK's coding style when defining internal structure.
>
> > +
> >  /**
> >   * Device structure contains all configuration information relating
> >   * to the device.
> > @@ -303,6 +313,7 @@ struct virtio_net {
> >       uint32_t                nr_vring;
> >       int                     dequeue_zero_copy;
> >       struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > +     VuDevInflightInfo inflight_info;
> >  #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> >       char                    ifname[IF_NAME_SZ];
> >       uint64_t                log_size;
> > diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> [...]
> > +static uint32_t get_pervq_shm_size(uint16_t queue_size)
> > +{
> > +     return ALIGN_UP(sizeof(VhostUserInflightEntry) * queue_size +
> > +             sizeof(uint16_t) * 3, INFLIGHT_ALIGNMENT);
> > +}
> > +
> > +static int
> > +vhost_user_get_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > +             int main_fd __rte_unused)
> > +{
> > +     int fd;
> > +     uint64_t mmap_size;
> > +     void *addr;
> > +     uint16_t num_queues, queue_size;
> > +     struct virtio_net *dev = *pdev;
> > +
> > +     if (msg->size != sizeof(msg->payload.inflight)) {
> > +             RTE_LOG(ERR, VHOST_CONFIG,
> > +                     "Invalid get_inflight_fd message size is %d",
> > +                     msg->size);
> > +             msg->payload.inflight.mmap_size = 0;
>
> In this case, you can't touch the payload.
>
> > +             return 0;
>
> And an error should be returned.
>
> > +     }
> > +
> > +     num_queues = msg->payload.inflight.num_queues;
> > +     queue_size = msg->payload.inflight.queue_size;
> > +
> > +     RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd num_queues: %u\n",
> > +                     msg->payload.inflight.num_queues);
> > +     RTE_LOG(INFO, VHOST_CONFIG, "get_inflight_fd queue_size: %u\n",
> > +                     msg->payload.inflight.queue_size);
> > +     mmap_size = num_queues * get_pervq_shm_size(queue_size);
> > +
> > +     addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
> > +     if (!addr) {
> > +             RTE_LOG(ERR, VHOST_CONFIG, "Failed to alloc vhost inflight area");
> > +                     msg->payload.inflight.mmap_size = 0;
> > +             return 0;
>
> Should always return RTE_VHOST_MSG_RESULT_* in
> message handler.
>
> > +     }
> > +     memset(addr, 0, mmap_size);
> > +
> > +     dev->inflight_info.addr = addr;
> > +     dev->inflight_info.size = msg->payload.inflight.mmap_size = mmap_size;
> > +     dev->inflight_info.fd = msg->fds[0] = fd;
> > +     msg->payload.inflight.mmap_offset = 0;
> > +     msg->fd_num = 1;
> > +
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +                     "send inflight mmap_size: %lu\n",
> > +                     msg->payload.inflight.mmap_size);
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +                     "send inflight mmap_offset: %lu\n",
> > +                     msg->payload.inflight.mmap_offset);
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +                     "send inflight fd: %d\n", msg->fds[0]);
> > +
> > +     return RTE_VHOST_MSG_RESULT_REPLY;
> > +}
> > +
> > +static int
> > +vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
> > +             int main_fd __rte_unused)
> > +{
> > +     int fd, i;
> > +     uint64_t mmap_size, mmap_offset;
> > +     uint16_t num_queues, queue_size;
> > +     uint32_t pervq_inflight_size;
> > +     void *rc;
> > +     struct vhost_virtqueue *vq;
> > +     struct virtio_net *dev = *pdev;
> > +
> > +     fd = msg->fds[0];
> > +     if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
> > +             RTE_LOG(ERR, VHOST_CONFIG, "Invalid set_inflight_fd message size is %d,fd is %d\n",
> > +                     msg->size, fd);
> > +             return -1;
>
> Ditto.
>
> > +     }
> > +
> > +     mmap_size = msg->payload.inflight.mmap_size;
> > +     mmap_offset = msg->payload.inflight.mmap_offset;
> > +     num_queues = msg->payload.inflight.num_queues;
> > +     queue_size = msg->payload.inflight.queue_size;
> > +     pervq_inflight_size = get_pervq_shm_size(queue_size);
> > +
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +             "set_inflight_fd mmap_size: %lu\n", mmap_size);
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +             "set_inflight_fd mmap_offset: %lu\n", mmap_offset);
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +             "set_inflight_fd num_queues: %u\n", num_queues);
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +             "set_inflight_fd queue_size: %u\n", queue_size);
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +             "set_inflight_fd fd: %d\n", fd);
> > +     RTE_LOG(INFO, VHOST_CONFIG,
> > +             "set_inflight_fd pervq_inflight_size: %d\n",
> > +             pervq_inflight_size);
> > +
> > +     if (dev->inflight_info.addr)
> > +             munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > +
> > +     rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > +                     fd, mmap_offset);
>
> Why call it rc? Maybe addr is a better name?

In some scenarios the shared memory is reallocated or resized by QEMU, so
it needs to be mmap'ed again.

>
> > +     if (rc == MAP_FAILED) {
> > +             RTE_LOG(ERR, VHOST_CONFIG, "failed to mmap share memory.\n");
> > +             return -1;
>
> Should always return RTE_VHOST_MSG_RESULT_* in
> message handler.
>
> > +     }
> > +
> > +     if (dev->inflight_info.fd)
> > +             close(dev->inflight_info.fd);
> > +
> > +     dev->inflight_info.fd = fd;
> > +     dev->inflight_info.addr = rc;
> > +     dev->inflight_info.size = mmap_size;
> > +
> > +     for (i = 0; i < num_queues; i++) {
> > +             vq = dev->virtqueue[i];
> > +             vq->inflight = (VhostInflightInfo *)rc;
> > +             rc = (void *)((char *)rc + pervq_inflight_size);
> > +     }
> > +
> > +     return RTE_VHOST_MSG_RESULT_OK;
> > +}
> > +
> >  static int
> >  vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
> >                       int main_fd __rte_unused)
> > @@ -1202,6 +1402,29 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
> >  }
> >
> >  static int
> > +vhost_check_queue_inflights(struct vhost_virtqueue *vq)
> > +{
> > +     uint16_t i = 0;
> > +
> > +     if ((!vq->inflight))
> > +             return RTE_VHOST_MSG_RESULT_ERR;
>
> Should check whether the feature is negotiated first.
>
> > +
> > +     if (!vq->inflight->version) {
> > +             vq->inflight->version = INFLIGHT_VERSION;
> > +             return RTE_VHOST_MSG_RESULT_OK;
> > +     }
> > +
> > +     for (i = 0; i < vq->size; i++) {
> > +             if (vq->inflight->desc[i].inflight == 1) {
> > +                     vq->inflight_flag = 1;
> > +                     break;
> > +             }
> > +     }
> > +
> > +     return RTE_VHOST_MSG_RESULT_OK;
> > +}
> > +
> > +static int
> >  vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> >                       int main_fd __rte_unused)
> >  {
> > @@ -1242,6 +1465,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
> >               close(vq->kickfd);
> >       vq->kickfd = file.fd;
> >
> > +     if (vhost_check_queue_inflights(vq)) {
> > +             RTE_LOG(ERR, VHOST_CONFIG,
> > +                     "Failed to inflights for vq: %d\n", file.index);
> > +             return RTE_VHOST_MSG_RESULT_ERR;
> > +     }
> > +
> >       return RTE_VHOST_MSG_RESULT_OK;
> >  }
> >
> > @@ -1762,6 +1991,8 @@ static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
> >       [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
> >       [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
> >       [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
> > +     [VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
> > +     [VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
> >  };
> >
> >
> > diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
> > index 2a650fe4b..35f969b1b 100644
> > --- a/lib/librte_vhost/vhost_user.h
> > +++ b/lib/librte_vhost/vhost_user.h
> > @@ -23,7 +23,8 @@
> >                                        (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
> >                                        (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
> >                                        (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
> > -                                      (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
> > +                                     (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT) | \
> > +                                     (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
>
> It will advertise this feature for builtin net and crypto
> backends. It's probably not what you intended.
>

Indeed, this feature is mainly used by SPDK-like backends. Do you mean
that this feature should be disabled by default?

> >
> >  typedef enum VhostUserRequest {
> >       VHOST_USER_NONE = 0,
> > @@ -54,7 +55,9 @@ typedef enum VhostUserRequest {
> >       VHOST_USER_POSTCOPY_ADVISE = 28,
> >       VHOST_USER_POSTCOPY_LISTEN = 29,
> >       VHOST_USER_POSTCOPY_END = 30,
> > -     VHOST_USER_MAX = 31
> > +     VHOST_USER_GET_INFLIGHT_FD = 31,
> > +     VHOST_USER_SET_INFLIGHT_FD = 32,
> > +     VHOST_USER_MAX = 33
> >  } VhostUserRequest;
> >
> >  typedef enum VhostUserSlaveRequest {
> > @@ -112,6 +115,13 @@ typedef struct VhostUserVringArea {
> >       uint64_t offset;
> >  } VhostUserVringArea;
> >
> > +typedef struct VhostUserInflight {
> > +     uint64_t mmap_size;
> > +     uint64_t mmap_offset;
> > +     uint16_t num_queues;
> > +     uint16_t queue_size;
> > +} VhostUserInflight;
> > +
> >  typedef struct VhostUserMsg {
> >       union {
> >               uint32_t master; /* a VhostUserRequest value */
> > @@ -131,6 +141,7 @@ typedef struct VhostUserMsg {
> >               struct vhost_vring_addr addr;
> >               VhostUserMemory memory;
> >               VhostUserLog    log;
> > +             VhostUserInflight inflight;
> >               struct vhost_iotlb_msg iotlb;
> >               VhostUserCryptoSessionParam crypto_session;
> >               VhostUserVringArea area;
> > @@ -148,6 +159,7 @@ typedef struct VhostUserMsg {
> >  /* vhost_user.c */
> >  int vhost_user_msg_handler(int vid, int fd);
> >  int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
> > +void *inflight_mem_alloc(const char *name, size_t size, int *fd);
> >
> >  /* socket.c */
> >  int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
> > --
> > 2.11.0
> >

Thank you for your comments. I will address them in the next version.


More information about the dev mailing list