[dpdk-dev] [PATCH] vhost: add support to large linear mbufs
Shahaf Shuler
shahafs at mellanox.com
Wed Oct 2 06:45:45 CEST 2019
Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:
> Subject: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
>
> The rte_vhost_dequeue_burst supports two ways of dequeuing data. If the
> data fits into a buffer, then all data is copied and a single linear buffer is
> returned. Otherwise it allocates additional mbufs and chains them together
> to return a multi-segment mbuf.
>
> While that covers most use cases, it forces applications that need to work
> with larger data sizes to support multi-segment mbufs. The non-linear
> characteristic brings complexity and performance implications to the
> application.
>
> To resolve the issue, change the API so that the application can optionally
> provide a second mempool containing larger mbufs. If that is not provided
> (NULL), the behavior remains as before the change.
> Otherwise, the data size is checked and the corresponding mempool is used
> to return linear mbufs.
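To make the proposed usage concrete, here is a minimal sketch of how an application might create the optional second pool and call the extended API. The pool names, counts and the large data-room size are illustrative assumptions; the six-argument rte_vhost_dequeue_burst() is the signature proposed by this patch, not the current API.

#include <stdint.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_vhost.h>

static struct rte_mempool *pool_small; /* default-sized mbufs */
static struct rte_mempool *pool_large; /* large linear mbufs (assumed worst case) */

static int
setup_pools(void)
{
	pool_small = rte_pktmbuf_pool_create("mbuf_small", 16384, 256, 0,
			RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
	pool_large = rte_pktmbuf_pool_create("mbuf_large", 4096, 64, 0,
			UINT16_MAX, rte_socket_id());
	return (pool_small != NULL && pool_large != NULL) ? 0 : -1;
}

static uint16_t
dequeue(int vid, uint16_t qid, struct rte_mbuf **pkts, uint16_t n)
{
	/* Passing NULL as the large pool keeps today's behavior; with
	 * pool_large, descriptors that do not fit the small data room are
	 * returned as single linear mbufs taken from the large pool. */
	return rte_vhost_dequeue_burst(vid, qid, pool_small, pool_large,
			pkts, n);
}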
I understand the motivation.
However, providing a static pool with large buffers is not very efficient in terms of memory footprint. You will need to provision for the worst case (all packets are large) with a max size of 64KB.
Also, the two mempools are quite restrictive, as the memory fill of the mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K, mempool2 mbuf.size = 64K, packet size 4KB.
Instead, how about using the mbuf external buffer feature?
The flow would be:
1. The vhost PMD always receives a single mempool (like today).
2. On dequeue, the PMD looks at the virtio packet size. If it is smaller than the mbuf size, it uses the mbuf as is (like today).
3. Otherwise, it allocates a new buffer (inside the PMD) and links it to the mbuf as an external buffer (rte_pktmbuf_attach_extbuf); see the sketch below.
The pro of this approach is that you have full flexibility on the memory allocation, and therefore a lower footprint.
The con is that OVS will need to know how to handle mbufs with external buffers (not too complex IMO).
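As a rough sketch of step 3 above, assuming rte_malloc() for the external data area and illustrative helper names (ext_buf_free() and vhost_alloc_large_mbuf() are not existing vhost symbols); only rte_pktmbuf_attach_extbuf() and rte_pktmbuf_ext_shinfo_init_helper() are the real DPDK external-buffer APIs:

#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_memory.h>

static void
ext_buf_free(void *addr, void *opaque __rte_unused)
{
	rte_free(addr); /* runs when the last mbuf reference is freed */
}

static struct rte_mbuf *
vhost_alloc_large_mbuf(struct rte_mempool *mp, uint32_t pkt_len)
{
	struct rte_mbuf_ext_shared_info *shinfo;
	struct rte_mbuf *m;
	uint32_t total;
	uint16_t buf_len;
	void *buf;

	m = rte_pktmbuf_alloc(mp);
	if (m == NULL)
		return NULL;

	/* Room for the packet, headroom and the shared info (refcnt). */
	total = pkt_len + RTE_PKTMBUF_HEADROOM + sizeof(*shinfo);
	if (total > UINT16_MAX) {
		rte_pktmbuf_free(m);
		return NULL;
	}
	buf_len = total;

	buf = rte_malloc("vhost-extbuf", buf_len, RTE_CACHE_LINE_SIZE);
	if (buf == NULL) {
		rte_pktmbuf_free(m);
		return NULL;
	}

	/* The shared info is carved from the tail of the buffer. */
	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
			ext_buf_free, NULL);
	if (shinfo == NULL) {
		rte_free(buf);
		rte_pktmbuf_free(m);
		return NULL;
	}

	rte_pktmbuf_attach_extbuf(m, buf, rte_mem_virt2iova(buf),
			buf_len, shinfo);
	rte_pktmbuf_reset_headroom(m);
	return m;
}

The mbuf can then be freed by the application as usual; the external area is released through the callback once the last reference to it is gone.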
>
> Signed-off-by: Flavio Leitner <fbl at sysclose.org>
> ---
> drivers/net/vhost/rte_eth_vhost.c | 4 +--
> examples/tep_termination/main.c | 2 +-
> examples/vhost/main.c | 2 +-
> lib/librte_vhost/rte_vhost.h | 5 ++-
> lib/librte_vhost/virtio_net.c | 57 +++++++++++++++++++++++--------
> 5 files changed, 50 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
> index 46f01a7f4..ce7f68a5b 100644
> --- a/drivers/net/vhost/rte_eth_vhost.c
> +++ b/drivers/net/vhost/rte_eth_vhost.c
> @@ -393,8 +393,8 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
> VHOST_MAX_PKT_BURST);
>
> nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
> - r->mb_pool, &bufs[nb_rx],
> - num);
> + r->mb_pool, NULL,
> + &bufs[nb_rx], num);
>
> nb_rx += nb_pkts;
> nb_receive -= nb_pkts;
> diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
> index ab956ad7c..3ebf0fa6e 100644
> --- a/examples/tep_termination/main.c
> +++ b/examples/tep_termination/main.c
> @@ -697,7 +697,7 @@ switch_worker(__rte_unused void *arg)
> if (likely(!vdev->remove)) {
> /* Handle guest TX*/
> tx_count = rte_vhost_dequeue_burst(vdev->vid,
> - VIRTIO_TXQ, mbuf_pool,
> + VIRTIO_TXQ, mbuf_pool, NULL,
> pkts_burst, MAX_PKT_BURST);
> /* If this is the first received packet we need to learn the MAC */
> if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index ab649bf14..e9b306af3 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -1092,7 +1092,7 @@ drain_virtio_tx(struct vhost_dev *vdev)
> pkts, MAX_PKT_BURST);
> } else {
> count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
> - mbuf_pool, pkts, MAX_PKT_BURST);
> + mbuf_pool, NULL, pkts, MAX_PKT_BURST);
> }
>
> /* setup VMDq for the first packet */
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index 19474bca0..b05fd8e2a 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -593,6 +593,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> * virtio queue index in mq case
> * @param mbuf_pool
> * mbuf_pool where host mbuf is allocated.
> + * @param mbuf_pool_large
> + * mbuf_pool where larger host mbuf is allocated.
> * @param pkts
> * array to contain packets to be dequeued
> * @param count
> @@ -601,7 +603,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> * num of packets dequeued
> */
> uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
> + struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> + struct rte_mbuf **pkts, uint16_t count);
>
> /**
> * Get guest mem table: a list of memory regions.
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5b85b832d..da9d77732 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1291,10 +1291,12 @@ get_zmbuf(struct vhost_virtqueue *vq)
>
> static __rte_noinline uint16_t
> virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> + struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> + struct rte_mbuf **pkts, uint16_t count)
> {
> uint16_t i;
> uint16_t free_entries;
> + uint16_t mbuf_avail;
>
> if (unlikely(dev->dequeue_zero_copy)) {
> struct zcopy_mbuf *zmbuf, *next;
> @@ -1340,32 +1342,42 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
> dev->vid, count);
>
> + /* If the large mpool is provided, find the threshold */
> + mbuf_avail = 0;
> + if (mbuf_pool_large)
> + mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) -
> + RTE_PKTMBUF_HEADROOM;
> +
> for (i = 0; i < count; i++) {
> struct buf_vector buf_vec[BUF_VECTOR_MAX];
> uint16_t head_idx;
> - uint32_t dummy_len;
> + uint32_t buf_len;
> uint16_t nr_vec = 0;
> + struct rte_mempool *mpool;
> int err;
>
> if (unlikely(fill_vec_buf_split(dev, vq,
> vq->last_avail_idx + i,
> &nr_vec, buf_vec,
> - &head_idx, &dummy_len,
> + &head_idx, &buf_len,
> VHOST_ACCESS_RO) < 0))
> break;
>
> if (likely(dev->dequeue_zero_copy == 0))
> update_shadow_used_ring_split(vq, head_idx, 0);
>
> - pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> + if (mbuf_pool_large && buf_len > mbuf_avail)
> + mpool = mbuf_pool_large;
> + else
> + mpool = mbuf_pool;
> +
> + pkts[i] = rte_pktmbuf_alloc(mpool);
> if (unlikely(pkts[i] == NULL)) {
> RTE_LOG(ERR, VHOST_DATA,
> "Failed to allocate memory for mbuf.\n");
> break;
> }
>
> - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
> - mbuf_pool);
> + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
> if (unlikely(err)) {
> rte_pktmbuf_free(pkts[i]);
> break;
> @@ -1411,9 +1423,11 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>
> static __rte_noinline uint16_t
> virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> + struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> + struct rte_mbuf **pkts, uint16_t count)
> {
> uint16_t i;
> + uint16_t mbuf_avail;
>
> if (unlikely(dev->dequeue_zero_copy)) {
> struct zcopy_mbuf *zmbuf, *next;
> @@ -1448,17 +1462,23 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
> dev->vid, count);
>
> + /* If the large mpool is provided, find the threshold */
> + mbuf_avail = 0;
> + if (mbuf_pool_large)
> + mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) -
> + RTE_PKTMBUF_HEADROOM;
> +
> for (i = 0; i < count; i++) {
> struct buf_vector buf_vec[BUF_VECTOR_MAX];
> uint16_t buf_id;
> - uint32_t dummy_len;
> + uint32_t buf_len;
> uint16_t desc_count, nr_vec = 0;
> + struct rte_mempool *mpool;
> int err;
>
> if (unlikely(fill_vec_buf_packed(dev, vq,
> vq->last_avail_idx,
> &desc_count,
> buf_vec, &nr_vec,
> - &buf_id, &dummy_len,
> + &buf_id, &buf_len,
> VHOST_ACCESS_RO) < 0))
> break;
>
> @@ -1466,15 +1486,19 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> update_shadow_used_ring_packed(vq, buf_id, 0,
> desc_count);
>
> - pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> + if (mbuf_pool_large && buf_len > mbuf_avail)
> + mpool = mbuf_pool_large;
> + else
> + mpool = mbuf_pool;
> +
> + pkts[i] = rte_pktmbuf_alloc(mpool);
> if (unlikely(pkts[i] == NULL)) {
> RTE_LOG(ERR, VHOST_DATA,
> "Failed to allocate memory for mbuf.\n");
> break;
> }
>
> - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
> - mbuf_pool);
> + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
> if (unlikely(err)) {
> rte_pktmbuf_free(pkts[i]);
> break;
> @@ -1526,7 +1550,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>
> uint16_t
> rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> + struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> + struct rte_mbuf **pkts, uint16_t count)
> {
> struct virtio_net *dev;
> struct rte_mbuf *rarp_mbuf = NULL;
> @@ -1598,9 +1623,11 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> }
>
> if (vq_is_packed(dev))
> - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
> + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
> + count);
> else
> - count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
> + count = virtio_dev_tx_split(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
> + count);
>
> out:
> if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> --
> 2.20.1