[dpdk-dev] 答复: [PATCH v4] vhost: support inflight share memory protocol feature

Li,Lin(ACG Cloud) lilin24 at baidu.com
Wed May 22 13:04:43 CEST 2019



> -----邮件原件-----
> 发件人: Maxime Coquelin [mailto:maxime.coquelin at redhat.com]
> 发送时间: 2019年5月17日 23:47
> 收件人: Li Lin <chuckylinchuckylin at gmail.com>; tiwei.bie at intel.com;
> zhihong.wang at intel.com
> 抄送: dev at dpdk.org; dariusz.stojaczyk at intel.com; changpeng.liu at intel.com;
> james.r.harris at intel.com; Xie,Yongji <xieyongji at baidu.com>; Li,Lin(ACG Cloud)
> <lilin24 at baidu.com>; Ni,Xun <nixun at baidu.com>; Zhang,Yu(ACG Cloud)
> <zhangyu31 at baidu.com>
> 主题: Re: [PATCH v4] vhost: support inflight share memory protocol feature
> 
> 
> 
> On 5/5/19 11:02 AM, Li Lin wrote:
> > From: Li Lin <lilin24 at baidu.com>
> >
> > This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD
> and
> > VHOST_USER_SET_INFLIGHT_FD to support transferring a shared buffer
> > between qemu and backend.
> >
> > Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the shared buffer
> > from backend. Then qemu should send it back through
> > VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user.
> >
> > This shared buffer is used to process inflight I/O when backend
> > reconnect.
> >
> > Signed-off-by: Li Lin <lilin24 at baidu.com>
> > Signed-off-by: Ni Xun <nixun at baidu.com>
> > Signed-off-by: Zhang Yu <zhangyu31 at baidu.com>
> > ---
> > v2 - add set&clr inflight desc functions
> > v3 - simplified some functions implementation
> > v4 - rework some data structures according to qemu doc
> >
> >   lib/librte_vhost/rte_vhost.h           |  83 ++++++++++
> >   lib/librte_vhost/rte_vhost_version.map |   3 +
> >   lib/librte_vhost/vhost.c               | 105 ++++++++++++
> >   lib/librte_vhost/vhost.h               |  12 ++
> >   lib/librte_vhost/vhost_user.c          | 292
> +++++++++++++++++++++++++++++++++
> >   lib/librte_vhost/vhost_user.h          |  13 +-
> >   6 files changed, 507 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost.h
> > b/lib/librte_vhost/rte_vhost.h index d2c0c93f4..d4d709717 100644
> > --- a/lib/librte_vhost/rte_vhost.h
> > +++ b/lib/librte_vhost/rte_vhost.h
> > @@ -71,6 +71,10 @@ extern "C" {
> >   #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
> >   #endif
> >
> > +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
> > +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 #endif
> > +
> >   /** Indicate whether protocol features negotiation is supported. */
> >   #ifndef VHOST_USER_F_PROTOCOL_FEATURES
> >   #define VHOST_USER_F_PROTOCOL_FEATURES	30
> > @@ -98,12 +102,41 @@ struct rte_vhost_memory {
> >   	struct rte_vhost_mem_region regions[];
> >   };
> >
> > +struct inflight_desc {
> > +	uint8_t inflight;
> > +	uint8_t padding[5];
> > +	uint16_t next;
> > +	uint64_t counter;
> > +};
> > +
> > +struct inflight_info {
> > +	uint64_t features;
> > +	uint16_t version;
> > +	uint16_t desc_num;
> > +	uint16_t last_inflight_io;
> > +	uint16_t used_idx;
> > +	struct inflight_desc desc[0];
> > +};
> > +
> > +struct resubmit_desc {
> > +	uint16_t index;
> > +	uint64_t counter;
> > +};
> > +
> > +struct resubmit_info {
> > +	struct resubmit_desc *resubmit_list;
> > +	uint16_t resubmit_num;
> > +};
> > +
> >   struct rte_vhost_vring {
> >   	struct vring_desc	*desc;
> >   	struct vring_avail	*avail;
> >   	struct vring_used	*used;
> >   	uint64_t		log_guest_addr;
> >
> > +	struct inflight_info	*inflight;
> > +	struct resubmit_info	*resubmit;
> > +
> >   	/** Deprecated, use rte_vhost_vring_call() instead. */
> >   	int			callfd;
> >
> > @@ -632,6 +665,56 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t
> vring_idx,
> >   int rte_vhost_vring_call(int vid, uint16_t vring_idx);
> >
> >   /**
> > + * set inflight flag for a desc.
> > + *
> > + * @param vid
> > + *  vhost device ID
> > + * @param vring_idx
> > + *  vring index
> > + * @param idx
> > + *  inflight entry index
> > + * @return
> > + *  0 on success, -1 on failure
> > + */
> > +int __rte_experimental
> > +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx,
> > +		uint16_t idx);
> > +
> > +/**
> > + * clear inflight flag for a desc.
> > + *
> > + * @param vid
> > + *  vhost device ID
> > + * @param vring_idx
> > + *  vring index
> > + * @param last_used_idx
> > + *  next free used_idx
> > + * @param idx
> > + *  inflight entry index
> > + * @return
> > + *  0 on success, -1 on failure
> > + */
> > +int __rte_experimental
> > +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
> > +		uint16_t last_used_idx, uint16_t idx);
> > +
> > +/**
> > + * set last inflight io index.
> > + *
> > + * @param vid
> > + *  vhost device ID
> > + * @param vring_idx
> > + *  vring index
> > + * @param idx
> > + *  inflight entry index
> > + * @return
> > + *  0 on success, -1 on failure
> > + */
> > +int __rte_experimental
> > +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx,
> > +		uint16_t idx);
> > +
> > +/**
> >    * Get vhost RX queue avail count.
> >    *
> >    * @param vid
> > diff --git a/lib/librte_vhost/rte_vhost_version.map
> > b/lib/librte_vhost/rte_vhost_version.map
> > index 5f1d4a75c..a992b3935 100644
> > --- a/lib/librte_vhost/rte_vhost_version.map
> > +++ b/lib/librte_vhost/rte_vhost_version.map
> > @@ -87,4 +87,7 @@ EXPERIMENTAL {
> >   	rte_vdpa_relay_vring_used;
> >   	rte_vhost_extern_callback_register;
> >   	rte_vhost_driver_set_protocol_features;
> > +	rte_vhost_set_inflight_desc;
> > +	rte_vhost_clr_inflight_desc;
> > +	rte_vhost_set_last_inflight_io;
> >   };
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > 163f4595e..03eab3551 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -76,6 +76,16 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy)
> >   		close(vq->callfd);
> >   	if (vq->kickfd >= 0)
> >   		close(vq->kickfd);
> > +	if (vq->inflight)
> > +		vq->inflight = NULL;
> > +	if (vq->resubmit->resubmit_list) {
> > +		free(vq->resubmit->resubmit_list);
> > +		vq->resubmit->resubmit_list = NULL;
> > +	}
> > +	if (vq->resubmit) {
> > +		free(vq->resubmit);
> > +		vq->resubmit = NULL;
> > +	}
> >   }
> >
> >   /*
> > @@ -589,6 +599,9 @@ rte_vhost_get_vhost_vring(int vid, uint16_t
> vring_idx,
> >   	vring->kickfd  = vq->kickfd;
> >   	vring->size    = vq->size;
> >
> > +	vring->inflight = vq->inflight;
> > +	vring->resubmit = vq->resubmit;
> > +
> >   	return 0;
> >   }
> >
> > @@ -617,6 +630,98 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx)
> >   	return 0;
> >   }
> >
> > +int
> > +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, uint16_t
> > +idx) {
> > +	struct virtio_net *dev;
> > +	struct vhost_virtqueue *vq;
> > +
> > +	dev = get_device(vid);
> > +	if (!dev)
> > +		return -1;
> > +
> > +	if (!(dev->features &
> > +		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> > +		return 0;
> 
> Shouldn't -1 be returned here also?
> I think this function should be called only if the feature negotiation succeeded.
> 
> Returning an error here would help the calling application to know something
> went wrong.
> 
> Same comment applies for the clear function.

If the feature is not set,set & clr function returns directly.
They are similar to empty functions.
The return value is - 1, which represents an error in function execution
and unexpected behavior.

> > +
> > +	if (vring_idx >= VHOST_MAX_VRING)
> > +		return -1;
> > +
> > +	vq = dev->virtqueue[vring_idx];
> > +	if (!vq)
> > +		return -1;
> > +
> > +	if (unlikely(!vq->inflight))
> > +		return -1;
> > +
> > +	vq->inflight->desc[idx].counter = vq->counter++;
> > +	vq->inflight->desc[idx].inflight = 1;
> > +	return 0;
> > +}
> > +
> > +int
> > +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx,
> > +	uint16_t last_used_idx, uint16_t idx) {
> > +	struct virtio_net *dev;
> > +	struct vhost_virtqueue *vq;
> > +
> > +	dev = get_device(vid);
> > +	if (!dev)
> > +		return -1;
> > +
> > +	if (!(dev->features &
> > +		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> > +		return 0;
> > +
> > +	if (vring_idx >= VHOST_MAX_VRING)
> > +		return -1;
> > +
> > +	vq = dev->virtqueue[vring_idx];
> > +	if (!vq)
> > +		return -1;
> > +
> > +	if (unlikely(!vq->inflight))
> > +		return -1;
> > +
> > +	rte_compiler_barrier();
> > +
> > +	vq->inflight->desc[idx].inflight = 0;
> > +
> > +	rte_compiler_barrier();
> > +
> > +	vq->inflight->used_idx = last_used_idx;
> > +	return 0;
> > +}
> > +
> > +int
> > +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, uint16_t
> > +idx) {
> > +	struct virtio_net *dev;
> > +	struct vhost_virtqueue *vq;
> > +
> > +	dev = get_device(vid);
> > +	if (!dev)
> > +		return -1;
> > +
> > +	if (!(dev->features &
> > +		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
> > +		return 0;
> > +
> > +	if (vring_idx >= VHOST_MAX_VRING)
> > +		return -1;
> > +
> > +	vq = dev->virtqueue[vring_idx];
> > +	if (!vq)
> > +		return -1;
> > +
> > +	if (unlikely(!vq->inflight))
> > +		return -1;
> > +
> > +	vq->inflight->last_inflight_io = idx;
> > +	return 0;
> > +}
> 
> Are above functions supposed to be called in the hot path?
> If so, you might want to use unlikely for the error checks everywhere.

You're right. I'll modify it in the next version.

> 
> > +
> >   uint16_t
> >   rte_vhost_avail_entries(int vid, uint16_t queue_id)
> >   {
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> > e9138dfab..3dace2ec3 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -128,6 +128,11 @@ struct vhost_virtqueue {
> >   	/* Physical address of used ring, for logging */
> >   	uint64_t		log_guest_addr;
> >
> > +	/* Inflight share memory info */
> > +	struct inflight_info	*inflight;
> > +	struct resubmit_info	*resubmit;
> > +	uint64_t counter;
> 
> counter and resubmit are too generic names.

I'll modify the names in the next version.

> 
> > +
> >   	uint16_t		nr_zmbuf;
> >   	uint16_t		zmbuf_size;
> >   	uint16_t		last_zmbuf_idx;
> > @@ -286,6 +291,12 @@ struct guest_page {
> >   	uint64_t size;
> >   };
> >
> > +struct inflight_mem_info {
> > +	int fd;
> > +	void *addr;
> > +	uint64_t size;
> > +};
> > +
> >   /**
> >    * Device structure contains all configuration information relating
> >    * to the device.
> > @@ -303,6 +314,7 @@ struct virtio_net {
> >   	uint32_t		nr_vring;
> >   	int			dequeue_zero_copy;
> >   	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > +	struct inflight_mem_info inflight_info;
> >   #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> >   	char			ifname[IF_NAME_SZ];
> >   	uint64_t		log_size;
> 
> Do you have some code example using these new APIs?
> It would help for reviewing the patch.

This patch is only valid for vhost blk. 
The use of new two messages and the implementation of packed queue are
described in docs/vhost-user.txt.

I have submitted SPDK-related patches to use new API.

> 
> Thanks,
> Maxime
> 



More information about the dev mailing list