[dpdk-dev] [PATCH] examples/vhost: Support jumbo frame in user space vhost

Ouyang, Changchun changchun.ouyang at intel.com
Thu Aug 21 03:24:33 CEST 2014


Hi all,

Any comments on this patch?
And what is the status of merging it into mainline?

Thanks in advance
Changchun

> -----Original Message-----
> From: Ouyang, Changchun
> Sent: Friday, August 15, 2014 12:58 PM
> To: dev at dpdk.org
> Cc: Cao, Waterman; Ouyang, Changchun
> Subject: [PATCH] examples/vhost: Support jumbo frame in user space vhost
> 
> This patch supports the mergeable RX feature and thus enables jumbo frame RX
> and TX in user space vhost (as the virtio backend).
> 
> On RX, it reserves enough room in the vring to accommodate one complete
> scattered packet received by the PMD from the physical port, and then copies
> the data from the mbuf chain into the vring buffers, possibly spanning several
> vring entries and descriptors.
> 
> On TX, it takes a jumbo frame, possibly described by several vring descriptors
> chained together with the NEXT flag, copies them into one scattered packet,
> and transmits it to the physical port through the PMD.
> 
> Signed-off-by: Changchun Ouyang <changchun.ouyang at intel.com>
> Acked-by: Huawei Xie <huawei.xie at intel.com>
> ---
>  examples/vhost/main.c       | 726 ++++++++++++++++++++++++++++++++++++++++----
>  examples/vhost/virtio-net.h |  14 +
>  2 files changed, 687 insertions(+), 53 deletions(-)
> 
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index 193aa25..7d9e6a2 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -106,6 +106,8 @@
>  #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
>  #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
> 
> +#define JUMBO_FRAME_MAX_SIZE    0x2600
> +
>  /* State of virtio device. */
>  #define DEVICE_MAC_LEARNING 0
>  #define DEVICE_RX			1
> @@ -676,8 +678,12 @@ us_vhost_parse_args(int argc, char **argv)
>  					us_vhost_usage(prgname);
>  					return -1;
>  				} else {
> -					if (ret)
> +					if (ret) {
> +						vmdq_conf_default.rxmode.jumbo_frame = 1;
> +						vmdq_conf_default.rxmode.max_rx_pkt_len
> +							= JUMBO_FRAME_MAX_SIZE;
>  						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
> +					}
>  				}
>  			}
> 
> @@ -797,6 +803,14 @@ us_vhost_parse_args(int argc, char **argv)
>  		return -1;
>  	}
> 
> +	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
> +		RTE_LOG(INFO, VHOST_PORT,
> +			"Vhost zero copy doesn't support jumbo frame, "
> +			"please specify '--mergeable 0' to disable the "
> +			"mergeable feature.\n");
> +		return -1;
> +	}
> +
>  	return 0;
>  }
> 
> @@ -916,7 +930,7 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
>   * This function adds buffers to the virtio devices RX virtqueue. Buffers can
>   * be received from the physical port or from another virtio device. A packet
>   * count is returned to indicate the number of packets that were succesfully
> - * added to the RX queue.
> + * added to the RX queue. This function works when mergeable is disabled.
>   */
>  static inline uint32_t __attribute__((always_inline))
>  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
> @@ -930,7 +944,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>  	uint64_t buff_hdr_addr = 0;
>  	uint32_t head[MAX_PKT_BURST], packet_len = 0;
>  	uint32_t head_idx, packet_success = 0;
> -	uint32_t mergeable, mrg_count = 0;
>  	uint32_t retry = 0;
>  	uint16_t avail_idx, res_cur_idx;
>  	uint16_t res_base_idx, res_end_idx;
> @@ -940,6 +953,7 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>  	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
>  	vq = dev->virtqueue[VIRTIO_RXQ];
>  	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
> +
>  	/* As many data cores may want access to available buffers, they need to be reserved. */
>  	do {
>  		res_base_idx = vq->last_used_idx_res;
> @@ -976,9 +990,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>  	/* Prefetch available ring to retrieve indexes. */
>  	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
> 
> -	/* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
> -	mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
> -
>  	/* Retrieve all of the head indexes first to avoid caching issues. */
>  	for (head_idx = 0; head_idx < count; head_idx++)
>  		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
> @@ -997,56 +1008,44 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>  		/* Prefetch buffer address. */
>  		rte_prefetch0((void*)(uintptr_t)buff_addr);
> 
> -		if (mergeable && (mrg_count != 0)) {
> -			desc->len = packet_len = rte_pktmbuf_data_len(buff);
> -		} else {
> -			/* Copy virtio_hdr to packet and increment buffer address */
> -			buff_hdr_addr = buff_addr;
> -			packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
> +		/* Copy virtio_hdr to packet and increment buffer address */
> +		buff_hdr_addr = buff_addr;
> +		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
> 
> -			/*
> -			 * If the descriptors are chained the header and data are placed in
> -			 * separate buffers.
> -			 */
> -			if (desc->flags & VRING_DESC_F_NEXT) {
> -				desc->len = vq->vhost_hlen;
> -				desc = &vq->desc[desc->next];
> -				/* Buffer address translation. */
> -				buff_addr = gpa_to_vva(dev, desc->addr);
> -				desc->len = rte_pktmbuf_data_len(buff);
> -			} else {
> -				buff_addr += vq->vhost_hlen;
> -				desc->len = packet_len;
> -			}
> +		/*
> +		 * If the descriptors are chained the header and data are
> +		 * placed in separate buffers.
> +		 */
> +		if (desc->flags & VRING_DESC_F_NEXT) {
> +			desc->len = vq->vhost_hlen;
> +			desc = &vq->desc[desc->next];
> +			/* Buffer address translation. */
> +			buff_addr = gpa_to_vva(dev, desc->addr);
> +			desc->len = rte_pktmbuf_data_len(buff);
> +		} else {
> +			buff_addr += vq->vhost_hlen;
> +			desc->len = packet_len;
>  		}
> 
> -		PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
> -
>  		/* Update used ring with desc information */
>  		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
>  		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
> 
>  		/* Copy mbuf data to buffer */
> -		rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
> +		rte_memcpy((void *)(uintptr_t)buff_addr,
> +			(const void *)buff->pkt.data,
> +			rte_pktmbuf_data_len(buff));
> +		PRINT_PACKET(dev, (uintptr_t)buff_addr,
> +			rte_pktmbuf_data_len(buff), 0);
> 
>  		res_cur_idx++;
>  		packet_success++;
> 
> -		/* If mergeable is disabled then a header is required per buffer. */
> -		if (!mergeable) {
> -			rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
> -			PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> -		} else {
> -			mrg_count++;
> -			/* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
> -			if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
> -				virtio_hdr.num_buffers = mrg_count;
> -				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
> -				rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
> -				PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> -				mrg_count = 0;
> -			}
> -		}
> +		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
> +			(const void *)&virtio_hdr, vq->vhost_hlen);
> +
> +		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> +
>  		if (res_cur_idx < res_end_idx) {
>  			/* Prefetch descriptor index. */
>  			rte_prefetch0(&vq->desc[head[packet_success]]);
> @@ -1068,6 +1067,356 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>  	return count;
>  }
> 
> +static inline uint32_t __attribute__((always_inline))
> +copy_from_mbuf_to_vring(struct virtio_net *dev,
> +	uint16_t res_base_idx, uint16_t res_end_idx,
> +	struct rte_mbuf *pkt)
> +{
> +	uint32_t vec_idx = 0;
> +	uint32_t entry_success = 0;
> +	struct vhost_virtqueue *vq;
> +	/* The virtio_hdr is initialised to 0. */
> +	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
> +		{0, 0, 0, 0, 0, 0}, 0};
> +	uint16_t cur_idx = res_base_idx;
> +	uint64_t vb_addr = 0;
> +	uint64_t vb_hdr_addr = 0;
> +	uint32_t seg_offset = 0;
> +	uint32_t vb_offset = 0;
> +	uint32_t seg_avail;
> +	uint32_t vb_avail;
> +	uint32_t cpy_len, entry_len;
> +
> +	if (pkt == NULL)
> +		return 0;
> +
> +	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
> +		"End Index %d\n",
> +		dev->device_fh, cur_idx, res_end_idx);
> +
> +	/*
> +	 * Convert from gpa to vva
> +	 * (guest physical addr -> vhost virtual addr)
> +	 */
> +	vq = dev->virtqueue[VIRTIO_RXQ];
> +	vb_addr =
> +		gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
> +	vb_hdr_addr = vb_addr;
> +
> +	/* Prefetch buffer address. */
> +	rte_prefetch0((void *)(uintptr_t)vb_addr);
> +
> +	virtio_hdr.num_buffers = res_end_idx - res_base_idx;
> +
> +	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge
> buffers %d\n",
> +		dev->device_fh, virtio_hdr.num_buffers);
> +
> +	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
> +		(const void *)&virtio_hdr, vq->vhost_hlen);
> +
> +	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
> +
> +	seg_avail = rte_pktmbuf_data_len(pkt);
> +	vb_offset = vq->vhost_hlen;
> +	vb_avail =
> +		vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
> +
> +	entry_len = vq->vhost_hlen;
> +
> +	if (vb_avail == 0) {
> +		uint32_t desc_idx =
> +			vq->buf_vec[vec_idx].desc_idx;
> +		vq->desc[desc_idx].len = vq->vhost_hlen;
> +
> +		if ((vq->desc[desc_idx].flags
> +			& VRING_DESC_F_NEXT) == 0) {
> +			/* Update used ring with desc information */
> +			vq->used->ring[cur_idx & (vq->size - 1)].id
> +				= vq->buf_vec[vec_idx].desc_idx;
> +			vq->used->ring[cur_idx & (vq->size - 1)].len
> +				= entry_len;
> +
> +			entry_len = 0;
> +			cur_idx++;
> +			entry_success++;
> +		}
> +
> +		vec_idx++;
> +		vb_addr =
> +			gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
> +
> +		/* Prefetch buffer address. */
> +		rte_prefetch0((void *)(uintptr_t)vb_addr);
> +		vb_offset = 0;
> +		vb_avail = vq->buf_vec[vec_idx].buf_len;
> +	}
> +
> +	cpy_len = RTE_MIN(vb_avail, seg_avail);
> +
> +	while (cpy_len > 0) {
> +		/* Copy mbuf data to vring buffer */
> +		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
> +			(const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
> +			cpy_len);
> +
> +		PRINT_PACKET(dev,
> +			(uintptr_t)(vb_addr + vb_offset),
> +			cpy_len, 0);
> +
> +		seg_offset += cpy_len;
> +		vb_offset += cpy_len;
> +		seg_avail -= cpy_len;
> +		vb_avail -= cpy_len;
> +		entry_len += cpy_len;
> +
> +		if (seg_avail != 0) {
> +			/*
> +			 * The virtio buffer in this vring entry has
> +			 * reached its end, but the mbuf segment has
> +			 * not been fully copied yet.
> +			 */
> +			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
> +				VRING_DESC_F_NEXT) == 0) {
> +				/* Update used ring with desc information */
> +				vq->used->ring[cur_idx & (vq->size - 1)].id
> +					= vq->buf_vec[vec_idx].desc_idx;
> +				vq->used->ring[cur_idx & (vq->size - 1)].len
> +					= entry_len;
> +				entry_len = 0;
> +				cur_idx++;
> +				entry_success++;
> +			}
> +
> +			vec_idx++;
> +			vb_addr = gpa_to_vva(dev,
> +				vq->buf_vec[vec_idx].buf_addr);
> +			vb_offset = 0;
> +			vb_avail = vq->buf_vec[vec_idx].buf_len;
> +			cpy_len = RTE_MIN(vb_avail, seg_avail);
> +		} else {
> +			/*
> +			 * The current segment is complete; check whether
> +			 * the whole packet is complete as well.
> +			 */
> +			pkt = pkt->pkt.next;
> +			if (pkt != NULL) {
> +				/*
> +				 * There are more segments.
> +				 */
> +				if (vb_avail == 0) {
> +					/*
> +					 * The current vring buffer is used
> +					 * up; fetch the next buffer from
> +					 * buf_vec.
> +					 */
> +					uint32_t desc_idx =
> +						vq->buf_vec[vec_idx].desc_idx;
> +					vq->desc[desc_idx].len = vb_offset;
> +
> +					if ((vq->desc[desc_idx].flags &
> +						VRING_DESC_F_NEXT) == 0) {
> +						uint16_t wrapped_idx =
> +							cur_idx & (vq->size - 1);
> +						/*
> +						 * Update used ring with the
> +						 * descriptor information
> +						 */
> +						vq->used->ring[wrapped_idx].id
> +							= desc_idx;
> +						vq->used->ring[wrapped_idx].len
> +							= entry_len;
> +						entry_success++;
> +						entry_len = 0;
> +						cur_idx++;
> +					}
> +
> +					/* Get next buffer from buf_vec. */
> +					vec_idx++;
> +					vb_addr = gpa_to_vva(dev,
> +						vq->buf_vec[vec_idx].buf_addr);
> +					vb_avail =
> +						vq->buf_vec[vec_idx].buf_len;
> +					vb_offset = 0;
> +				}
> +
> +				seg_offset = 0;
> +				seg_avail = rte_pktmbuf_data_len(pkt);
> +				cpy_len = RTE_MIN(vb_avail, seg_avail);
> +			} else {
> +				/*
> +				 * The whole packet is complete.
> +				 */
> +				uint32_t desc_idx =
> +					vq->buf_vec[vec_idx].desc_idx;
> +				vq->desc[desc_idx].len = vb_offset;
> +
> +				while (vq->desc[desc_idx].flags &
> +					VRING_DESC_F_NEXT) {
> +					desc_idx = vq->desc[desc_idx].next;
> +					vq->desc[desc_idx].len = 0;
> +				}
> +
> +				/* Update used ring with desc information */
> +				vq->used->ring[cur_idx & (vq->size - 1)].id
> +					= vq->buf_vec[vec_idx].desc_idx;
> +				vq->used->ring[cur_idx & (vq->size - 1)].len
> +					= entry_len;
> +				entry_len = 0;
> +				cur_idx++;
> +				entry_success++;
> +				seg_avail = 0;
> +				cpy_len = RTE_MIN(vb_avail, seg_avail);
> +			}
> +		}
> +	}
> +
> +	return entry_success;
> +}
> +
> +/*
> + * This function adds buffers to the virtio devices RX virtqueue. Buffers can
> + * be received from the physical port or from another virtio device. A packet
> + * count is returned to indicate the number of packets that were successfully
> + * added to the RX queue. This function works for mergeable RX.
> + */
> +static inline uint32_t __attribute__((always_inline))
> +virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
> +	uint32_t count)
> +{
> +	struct vhost_virtqueue *vq;
> +	uint32_t pkt_idx = 0, entry_success = 0;
> +	uint32_t retry = 0;
> +	uint16_t avail_idx, res_cur_idx;
> +	uint16_t res_base_idx, res_end_idx;
> +	uint8_t success = 0;
> +
> +	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
> +		dev->device_fh);
> +	vq = dev->virtqueue[VIRTIO_RXQ];
> +	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
> +
> +	if (count == 0)
> +		return 0;
> +
> +	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> +		uint32_t secure_len = 0;
> +		uint16_t need_cnt;
> +		uint32_t vec_idx = 0;
> +		uint32_t pkt_len = pkts[pkt_idx]->pkt.pkt_len + vq->vhost_hlen;
> +		uint16_t i, id;
> +
> +		do {
> +			/*
> +			 * As many data cores may want access to available
> +			 * buffers, they need to be reserved.
> +			 */
> +			res_base_idx = vq->last_used_idx_res;
> +			res_cur_idx = res_base_idx;
> +
> +			do {
> +				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> +				if (unlikely(res_cur_idx == avail_idx)) {
> +					/*
> +					 * If retry is enabled and the queue is
> +					 * full then we wait and retry to avoid
> +					 * packet loss.
> +					 */
> +					if (enable_retry) {
> +						uint8_t cont = 0;
> +						for (retry = 0; retry < burst_rx_retry_num; retry++) {
> +							rte_delay_us(burst_rx_delay_time);
> +							avail_idx =
> +								*((volatile uint16_t *)&vq->avail->idx);
> +							if (likely(res_cur_idx != avail_idx)) {
> +								cont = 1;
> +								break;
> +							}
> +						}
> +						if (cont == 1)
> +							continue;
> +					}
> +
> +					LOG_DEBUG(VHOST_DATA,
> +						"(%"PRIu64") Failed "
> +						"to get enough desc from "
> +						"vring\n",
> +						dev->device_fh);
> +					return pkt_idx;
> +				} else {
> +					uint16_t wrapped_idx =
> +						(res_cur_idx) & (vq->size - 1);
> +					uint32_t idx =
> +						vq->avail->ring[wrapped_idx];
> +					uint8_t next_desc;
> +
> +					do {
> +						next_desc = 0;
> +						secure_len += vq->desc[idx].len;
> +						if (vq->desc[idx].flags &
> +							VRING_DESC_F_NEXT) {
> +							idx = vq->desc[idx].next;
> +							next_desc = 1;
> +						}
> +					} while (next_desc);
> +
> +					res_cur_idx++;
> +				}
> +			} while (pkt_len > secure_len);
> +
> +			/* vq->last_used_idx_res is atomically updated. */
> +			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
> +							res_base_idx,
> +							res_cur_idx);
> +		} while (success == 0);
> +
> +		id = res_base_idx;
> +		need_cnt = res_cur_idx - res_base_idx;
> +
> +		for (i = 0; i < need_cnt; i++, id++) {
> +			uint16_t wrapped_idx = id & (vq->size - 1);
> +			uint32_t idx = vq->avail->ring[wrapped_idx];
> +			uint8_t next_desc;
> +			do {
> +				next_desc = 0;
> +				vq->buf_vec[vec_idx].buf_addr =
> +					vq->desc[idx].addr;
> +				vq->buf_vec[vec_idx].buf_len =
> +					vq->desc[idx].len;
> +				vq->buf_vec[vec_idx].desc_idx = idx;
> +				vec_idx++;
> +
> +				if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
> +					idx = vq->desc[idx].next;
> +					next_desc = 1;
> +				}
> +			} while (next_desc);
> +		}
> +
> +		res_end_idx = res_cur_idx;
> +
> +		entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
> +			res_end_idx, pkts[pkt_idx]);
> +
> +		rte_compiler_barrier();
> +
> +		/*
> +		 * Wait until it's our turn to add our buffer
> +		 * to the used ring.
> +		 */
> +		while (unlikely(vq->last_used_idx != res_base_idx))
> +			rte_pause();
> +
> +		*(volatile uint16_t *)&vq->used->idx += entry_success;
> +		vq->last_used_idx = res_end_idx;
> +
> +		/* Kick the guest if necessary. */
> +		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> +			eventfd_write((int)vq->kickfd, 1);
> +	}
> +
> +	return count;
> +}
> +
>  /*
>   * Compares a packet destination MAC address to a device MAC address.
>   */
> @@ -1199,8 +1548,17 @@ virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
>  				/*drop the packet if the device is marked for removal*/
>  				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
>  			} else {
> +				uint32_t mergeable =
> +					dev_ll->dev->features &
> +					(1 << VIRTIO_NET_F_MRG_RXBUF);
> +
>  				/*send the packet to the local virtio device*/
> -				ret = virtio_dev_rx(dev_ll->dev, &m, 1);
> +				if (likely(mergeable == 0))
> +					ret = virtio_dev_rx(dev_ll->dev, &m, 1);
> +				else
> +					ret = virtio_dev_merge_rx(dev_ll->dev,
> +						&m, 1);
> +
>  				if (enable_stats) {
>  					rte_atomic64_add(
>  					&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
> @@ -1231,7 +1589,7 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
>  	struct mbuf_table *tx_q;
>  	struct vlan_ethhdr *vlan_hdr;
>  	struct rte_mbuf **m_table;
> -	struct rte_mbuf *mbuf;
> +	struct rte_mbuf *mbuf, *prev;
>  	unsigned len, ret, offset = 0;
>  	const uint16_t lcore_id = rte_lcore_id();
>  	struct virtio_net_data_ll *dev_ll = ll_root_used;
> @@ -1284,12 +1642,14 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
>  	/* Allocate an mbuf and populate the structure. */
>  	mbuf = rte_pktmbuf_alloc(mbuf_pool);
>  	if (unlikely(mbuf == NULL)) {
> -		RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
> +		RTE_LOG(ERR, VHOST_DATA,
> +			"Failed to allocate memory for mbuf.\n");
>  		return;
>  	}
> 
>  	mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
> -	mbuf->pkt.pkt_len = mbuf->pkt.data_len;
> +	mbuf->pkt.pkt_len = m->pkt.pkt_len + VLAN_HLEN + offset;
> +	mbuf->pkt.nb_segs = m->pkt.nb_segs;
> 
>  	/* Copy ethernet header to mbuf. */
>  	rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
> @@ -1304,6 +1664,29 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
>  	/* Copy the remaining packet contents to the mbuf. */
>  	rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
>  		(const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
> +
> +	/* Copy the remaining segments for the whole packet. */
> +	prev = mbuf;
> +	while (m->pkt.next) {
> +		/* Allocate an mbuf and populate the structure. */
> +		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
> +		if (unlikely(next_mbuf == NULL)) {
> +			rte_pktmbuf_free(mbuf);
> +			RTE_LOG(ERR, VHOST_DATA,
> +				"Failed to allocate memory for mbuf.\n");
> +			return;
> +		}
> +
> +		m = m->pkt.next;
> +		prev->pkt.next = next_mbuf;
> +		prev = next_mbuf;
> +		next_mbuf->pkt.data_len = m->pkt.data_len;
> +
> +		/* Copy data to next mbuf. */
> +		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
> +			rte_pktmbuf_mtod(m, const void *), m->pkt.data_len);
> +	}
> +
>  	tx_q->m_table[len] = mbuf;
>  	len++;
>  	if (enable_stats) {
> @@ -1394,6 +1777,7 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
> 
>  		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
>  		m.pkt.data_len = desc->len;
> +		m.pkt.pkt_len = desc->len;
>  		m.pkt.data = (void*)(uintptr_t)buff_addr;
> 
>  		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
> @@ -1420,6 +1804,227 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
>  		eventfd_write((int)vq->kickfd, 1);
>  }
> 
> +/* This function works for TX packets with mergeable feature enabled. */
> +static inline void __attribute__((always_inline))
> +virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
> +{
> +	struct rte_mbuf *m, *prev;
> +	struct vhost_virtqueue *vq;
> +	struct vring_desc *desc;
> +	uint64_t vb_addr = 0;
> +	uint32_t head[MAX_PKT_BURST];
> +	uint32_t used_idx;
> +	uint32_t i;
> +	uint16_t free_entries, entry_success = 0;
> +	uint16_t avail_idx;
> +	uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
> +			+ RTE_PKTMBUF_HEADROOM);
> +
> +	vq = dev->virtqueue[VIRTIO_TXQ];
> +	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
> +
> +	/* If there are no available buffers then return. */
> +	if (vq->last_used_idx == avail_idx)
> +		return;
> +
> +	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
> +		dev->device_fh);
> +
> +	/* Prefetch available ring to retrieve head indexes. */
> +	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
> +
> +	/*get the number of free entries in the ring*/
> +	free_entries = (avail_idx - vq->last_used_idx);
> +
> +	/* Limit to MAX_PKT_BURST. */
> +	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
> +
> +	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
> +		dev->device_fh, free_entries);
> +	/* Retrieve all of the head indexes first to avoid caching issues. */
> +	for (i = 0; i < free_entries; i++)
> +		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
> +
> +	/* Prefetch descriptor index. */
> +	rte_prefetch0(&vq->desc[head[entry_success]]);
> +	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
> +
> +	while (entry_success < free_entries) {
> +		uint32_t vb_avail, vb_offset;
> +		uint32_t seg_avail, seg_offset;
> +		uint32_t cpy_len;
> +		uint32_t seg_num = 0;
> +		struct rte_mbuf *cur;
> +		uint8_t alloc_err = 0;
> +
> +		desc = &vq->desc[head[entry_success]];
> +
> +		/* Discard first buffer as it is the virtio header */
> +		desc = &vq->desc[desc->next];
> +
> +		/* Buffer address translation. */
> +		vb_addr = gpa_to_vva(dev, desc->addr);
> +		/* Prefetch buffer address. */
> +		rte_prefetch0((void *)(uintptr_t)vb_addr);
> +
> +		used_idx = vq->last_used_idx & (vq->size - 1);
> +
> +		if (entry_success < (free_entries - 1)) {
> +			/* Prefetch descriptor index. */
> +			rte_prefetch0(&vq->desc[head[entry_success+1]]);
> +			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> +		}
> +
> +		/* Update used index buffer information. */
> +		vq->used->ring[used_idx].id = head[entry_success];
> +		vq->used->ring[used_idx].len = 0;
> +
> +		vb_offset = 0;
> +		vb_avail = desc->len;
> +		seg_offset = 0;
> +		seg_avail = buf_size;
> +		cpy_len = RTE_MIN(vb_avail, seg_avail);
> +
> +		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
> +
> +		/* Allocate an mbuf and populate the structure. */
> +		m = rte_pktmbuf_alloc(mbuf_pool);
> +		if (unlikely(m == NULL)) {
> +			RTE_LOG(ERR, VHOST_DATA,
> +				"Failed to allocate memory for mbuf.\n");
> +			return;
> +		}
> +
> +		seg_num++;
> +		cur = m;
> +		prev = m;
> +		while (cpy_len != 0) {
> +			rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
> +				(void *)((uintptr_t)(vb_addr + vb_offset)),
> +				cpy_len);
> +
> +			seg_offset += cpy_len;
> +			vb_offset += cpy_len;
> +			vb_avail -= cpy_len;
> +			seg_avail -= cpy_len;
> +
> +			if (vb_avail != 0) {
> +				/*
> +				 * The mbuf segment has reached its end,
> +				 * while the virtio buffer in the TX vring
> +				 * still has data to be copied.
> +				 */
> +				cur->pkt.data_len = seg_offset;
> +				m->pkt.pkt_len += seg_offset;
> +				/* Allocate mbuf and populate the structure. */
> +				cur = rte_pktmbuf_alloc(mbuf_pool);
> +				if (unlikely(cur == NULL)) {
> +					RTE_LOG(ERR, VHOST_DATA, "Failed to "
> +						"allocate memory for mbuf.\n");
> +					rte_pktmbuf_free(m);
> +					alloc_err = 1;
> +					break;
> +				}
> +
> +				seg_num++;
> +				prev->pkt.next = cur;
> +				prev = cur;
> +				seg_offset = 0;
> +				seg_avail = buf_size;
> +			} else {
> +				if (desc->flags & VRING_DESC_F_NEXT) {
> +					/*
> +					 * There are more virtio buffers in
> +					 * the same vring entry that need to
> +					 * be copied.
> +					 */
> +					if (seg_avail == 0) {
> +						/*
> +						 * The current segment has no
> +						 * room to accommodate more
> +						 * data.
> +						 */
> +						cur->pkt.data_len = seg_offset;
> +						m->pkt.pkt_len += seg_offset;
> +						/*
> +						 * Allocate an mbuf and
> +						 * populate the structure.
> +						 */
> +						cur = rte_pktmbuf_alloc(mbuf_pool);
> +						if (unlikely(cur == NULL)) {
> +							RTE_LOG(ERR,
> +								VHOST_DATA,
> +								"Failed to "
> +								"allocate memory "
> +								"for mbuf\n");
> +							rte_pktmbuf_free(m);
> +							alloc_err = 1;
> +							break;
> +						}
> +						seg_num++;
> +						prev->pkt.next = cur;
> +						prev = cur;
> +						seg_offset = 0;
> +						seg_avail = buf_size;
> +					}
> +
> +					desc = &vq->desc[desc->next];
> +
> +					/* Buffer address translation. */
> +					vb_addr = gpa_to_vva(dev, desc->addr);
> +					/* Prefetch buffer address. */
> +					rte_prefetch0((void *)(uintptr_t)vb_addr);
> +					vb_offset = 0;
> +					vb_avail = desc->len;
> +
> +					PRINT_PACKET(dev, (uintptr_t)vb_addr,
> +						desc->len, 0);
> +				} else {
> +					/* The whole packet is complete. */
> +					cur->pkt.data_len = seg_offset;
> +					m->pkt.pkt_len += seg_offset;
> +					vb_avail = 0;
> +				}
> +			}
> +
> +			cpy_len = RTE_MIN(vb_avail, seg_avail);
> +		}
> +
> +		if (unlikely(alloc_err == 1))
> +			break;
> +
> +		m->pkt.nb_segs = seg_num;
> +
> +		/*
> +		 * If this is the first received packet we need to learn
> +		 * the MAC and setup VMDQ
> +		 */
> +		if (dev->ready == DEVICE_MAC_LEARNING) {
> +			if (dev->remove || (link_vmdq(dev, m) == -1)) {
> +				/*
> +				 * Discard frame if device is scheduled for
> +				 * removal or a duplicate MAC address is found.
> +				 */
> +				entry_success = free_entries;
> +				vq->last_used_idx += entry_success;
> +				rte_pktmbuf_free(m);
> +				break;
> +			}
> +		}
> +
> +		virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
> +		vq->last_used_idx++;
> +		entry_success++;
> +		rte_pktmbuf_free(m);
> +	}
> +
> +	rte_compiler_barrier();
> +	vq->used->idx += entry_success;
> +	/* Kick guest if required. */
> +	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> +		eventfd_write((int)vq->kickfd, 1);
> +
> +}
> +
>  /*
>   * This function is called by each data core. It handles all RX/TX registered with the
>   * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
> @@ -1440,8 +2045,9 @@ switch_worker(__attribute__((unused)) void *arg)
>  	const uint16_t lcore_id = rte_lcore_id();
>  	const uint16_t num_cores = (uint16_t)rte_lcore_count();
>  	uint16_t rx_count = 0;
> +	uint32_t mergeable = 0;
> 
> -	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", lcore_id);
> +	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
>  	lcore_ll = lcore_info[lcore_id].lcore_ll;
>  	prev_tsc = 0;
> 
> @@ -1497,6 +2103,8 @@ switch_worker(__attribute__((unused)) void *arg)
>  		while (dev_ll != NULL) {
>  			/*get virtio device ID*/
>  			dev = dev_ll->dev;
> +			mergeable =
> +				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
> 
>  			if (dev->remove) {
>  				dev_ll = dev_ll->next;
> @@ -1510,7 +2118,15 @@ switch_worker(__attribute__((unused)) void *arg)
>  					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
> 
>  				if (rx_count) {
> -					ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
> +					if (likely(mergeable == 0))
> +						ret_count =
> +							virtio_dev_rx(dev,
> +							pkts_burst, rx_count);
> +					else
> +						ret_count =
> +							virtio_dev_merge_rx(dev,
> +							pkts_burst, rx_count);
> +
>  					if (enable_stats) {
>  						rte_atomic64_add(
>  						&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
> @@ -1520,15 +2136,19 @@ switch_worker(__attribute__((unused)) void
> *arg)
>  					}
>  					while (likely(rx_count)) {
>  						rx_count--;
> -						rte_pktmbuf_free_seg(pkts_burst[rx_count]);
> +						rte_pktmbuf_free(pkts_burst[rx_count]);
>  					}
> 
>  				}
>  			}
> 
> -			if (!dev->remove)
> +			if (!dev->remove) {
>  				/*Handle guest TX*/
> -				virtio_dev_tx(dev, mbuf_pool);
> +				if (likely(mergeable == 0))
> +					virtio_dev_tx(dev, mbuf_pool);
> +				else
> +					virtio_dev_merge_tx(dev, mbuf_pool);
> +			}
> 
>  			/*move to the next device in the list*/
>  			dev_ll = dev_ll->next;
> diff --git a/examples/vhost/virtio-net.h b/examples/vhost/virtio-net.h
> index 3d1f255..1a2f0dc 100644
> --- a/examples/vhost/virtio-net.h
> +++ b/examples/vhost/virtio-net.h
> @@ -45,6 +45,18 @@
>  /* Enum for virtqueue management. */
>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
> 
> +#define BUF_VECTOR_MAX 256
> +
> +/*
> + * Structure contains buffer address, length and descriptor index
> + * from vring to do scatter RX.
> + */
> +struct buf_vector {
> +	uint64_t buf_addr;
> +	uint32_t buf_len;
> +	uint32_t desc_idx;
> +};
> +
>  /*
>   * Structure contains variables relevant to TX/RX virtqueues.
>   */
> @@ -60,6 +72,8 @@ struct vhost_virtqueue
>  	volatile uint16_t	last_used_idx_res;	/* Used for multiple devices reserving buffers. */
>  	eventfd_t			callfd;				/* Currently unused as polling mode is enabled. */
>  	eventfd_t			kickfd;				/* Used to notify the guest (trigger interrupt). */
> +	/* Used for scatter RX. */
> +	struct buf_vector	buf_vec[BUF_VECTOR_MAX];
>  } __rte_cache_aligned;
> 
>  /*
> --
> 1.8.4.2


