[dpdk-dev] [PATCH 2/2] net/virtio: support GUEST ANNOUNCE

Wang, Xiao W xiao.w.wang at intel.com
Thu Nov 30 03:37:21 CET 2017



> -----Original Message-----
> From: Bie, Tiwei
> Sent: Friday, November 24, 2017 2:05 PM
> To: Wang, Xiao W <xiao.w.wang at intel.com>
> Cc: dev at dpdk.org; yliu at fridaylinux.org
> Subject: Re: [dpdk-dev] [PATCH 2/2] net/virtio: support GUEST ANNOUNCE
> 
> Hi,
> 
> Some quick comments. Will go through the whole patch later.
> 
> On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> > When live migration is done, for the backup VM, either the virtio
> > frontend or the vhost backend needs to send out a gratuitous RARP
> > packet to announce its new network location.
> >
> > This patch enables the VIRTIO_NET_F_GUEST_ANNOUNCE feature to support
> > a live migration scenario where the vhost backend doesn't have the
> > ability to generate RARP packets.
> >
> > Brief introduction of the workflow:
> > 1. QEMU finishes live migration and pokes the backup VM with an interrupt.
> > 2. The virtio interrupt handler reads out the interrupt status value and
> >    realizes it needs to send out a RARP packet to announce its location.
> > 3. Pause the device to stop the worker thread from touching the queues.
> > 4. Inject a RARP packet into a Tx queue.
> > 5. Ack the interrupt via the control queue.
> > 6. Resume the device to continue packet processing.
> >
> > Signed-off-by: Xiao Wang <xiao.w.wang at intel.com>
> > ---
> >  drivers/net/virtio/virtio_ethdev.c | 131 ++++++++++++++++++++++++++++++++++++-
> >  drivers/net/virtio/virtio_ethdev.h |   4 ++
> >  drivers/net/virtio/virtio_pci.h    |   1 +
> >  drivers/net/virtio/virtio_rxtx.c   |  81 +++++++++++++++++++++++
> >  drivers/net/virtio/virtqueue.h     |  11 ++++
> >  5 files changed, 226 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
> > index 1959b11..6eaea0e 100644
> > --- a/drivers/net/virtio/virtio_ethdev.c
> > +++ b/drivers/net/virtio/virtio_ethdev.c
> > @@ -48,6 +48,8 @@
> >  #include <rte_pci.h>
> >  #include <rte_bus_pci.h>
> >  #include <rte_ether.h>
> > +#include <rte_ip.h>
> > +#include <rte_arp.h>
> >  #include <rte_common.h>
> >  #include <rte_errno.h>
> >  #include <rte_cpuflags.h>
> > @@ -55,6 +57,7 @@
> >  #include <rte_memory.h>
> >  #include <rte_eal.h>
> >  #include <rte_dev.h>
> > +#include <rte_cycles.h>
> >
> >  #include "virtio_ethdev.h"
> >  #include "virtio_pci.h"
> > @@ -106,6 +109,13 @@ static int virtio_dev_queue_stats_mapping_set(
> >  	uint8_t stat_idx,
> >  	uint8_t is_rx);
> >
> > +static int make_rarp_packet(struct rte_mbuf *rarp_mbuf,
> > +		const struct ether_addr *mac);
> > +static int virtio_dev_pause(struct rte_eth_dev *dev);
> > +static void virtio_dev_resume(struct rte_eth_dev *dev);
> > +static void generate_rarp(struct rte_eth_dev *dev);
> > +static void virtnet_ack_link_announce(struct rte_eth_dev *dev);
> > +
> >  /*
> >   * The set of PCI devices this driver supports
> >   */
> > @@ -1249,9 +1259,116 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
> >  	return 0;
> >  }
> >
> > +#define RARP_PKT_SIZE	64
> > +
> > +static int
> > +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> > +{
> > +	struct ether_hdr *eth_hdr;
> > +	struct arp_hdr  *rarp;
> > +
> > +	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
> > +		PMD_DRV_LOG(ERR, "mbuf size too small %u (< %d)",
> > +				rarp_mbuf->buf_len, RARP_PKT_SIZE);
> > +		return -1;
> > +	}
> > +
> > +	/* Ethernet header. */
> > +	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
> 
> You can use rte_pktmbuf_mtod() directly.

Looks better. Will change it in v2.
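
That is, the line will simply become:

	eth_hdr = rte_pktmbuf_mtod(rarp_mbuf, struct ether_hdr *);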

> 
> > +	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> > +	ether_addr_copy(mac, &eth_hdr->s_addr);
> > +	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> > +
> > +	/* RARP header. */
> > +	rarp = (struct arp_hdr *)(eth_hdr + 1);
> > +	rarp->arp_hrd = htons(ARP_HRD_ETHER);
> > +	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> > +	rarp->arp_hln = ETHER_ADDR_LEN;
> > +	rarp->arp_pln = 4;
> > +	rarp->arp_op  = htons(ARP_OP_REVREQUEST);
> > +
> > +	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> > +	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> > +	memset(&rarp->arp_data.arp_sip, 0x00, 4);
> > +	memset(&rarp->arp_data.arp_tip, 0x00, 4);
> > +
> > +	rarp_mbuf->data_len = RARP_PKT_SIZE;
> > +	rarp_mbuf->pkt_len = RARP_PKT_SIZE;
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +virtio_dev_pause(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	if (hw->started == 0)
> > +		return -1;
> > +	hw->started = 0;
> > +	/*
> > +	 * Prevent the worker thread from touching queues to avoid contention,
> > +	 * 1 ms should be enough for the ongoing Tx function to finish.
> > +	 */
> > +	rte_delay_ms(1);
> > +	return 0;
> > +}
> > +
> > +static void
> > +virtio_dev_resume(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	hw->started = 1;
> > +}
> > +
> > +static void
> > +generate_rarp(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +	struct rte_mbuf *rarp_mbuf = NULL;
> > +	struct virtnet_tx *txvq = dev->data->tx_queues[0];
> > +	struct virtnet_rx *rxvq = dev->data->rx_queues[0];
> > +
> > +	rarp_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
> > +	if (rarp_mbuf == NULL) {
> > +		PMD_DRV_LOG(ERR, "mbuf allocate failed");
> > +		return;
> > +	}
> > +
> > +	if (make_rarp_packet(rarp_mbuf, (struct ether_addr *)hw->mac_addr)) {
> > +		rte_pktmbuf_free(rarp_mbuf);
> > +		rarp_mbuf = NULL;
> > +		return;
> > +	}
> > +
> > +	/* If virtio port just stopped, no need to send RARP */
> > +	if (virtio_dev_pause(dev) < -1)
> 
> You mean < 0?

Yes, will fix it in v2.
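
That is, the check becomes:

	if (virtio_dev_pause(dev) < 0)
		return;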

> 
> > +		return;
> > +
> > +	virtio_inject_pkts(txvq, &rarp_mbuf, 1);
> > +	/* Recover the stored hw status to let worker thread continue */
> > +	virtio_dev_resume(dev);
> > +}
> > +
> > +static void
> > +virtnet_ack_link_announce(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +	struct virtio_pmd_ctrl ctrl;
> > +	int len;
> > +
> > +	ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
> > +	ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
> > +	len = 0;
> > +
> > +	virtio_send_command(hw->cvq, &ctrl, &len, 0);
> > +}
> > +
> >  /*
> > - * Process Virtio Config changed interrupt and call the callback
> > - * if link state changed.
> > + * Process virtio config changed interrupt. Call the callback
> > + * if link state changed; generate gratuitous RARP packet if
> > + * the status indicates an ANNOUNCE.
> >   */
> >  void
> >  virtio_interrupt_handler(void *param)
> > @@ -1274,6 +1391,12 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
> >  						      NULL, NULL);
> >  	}
> >
> > +	if (isr & VIRTIO_NET_S_ANNOUNCE) {
> > +		rte_spinlock_lock(&hw->sl);
> > +		generate_rarp(dev);
> > +		virtnet_ack_link_announce(dev);
> > +		rte_spinlock_unlock(&hw->sl);
> > +	}
> >  }
> >
> >  /* set rx and tx handlers according to what is supported */
> > @@ -1786,6 +1909,8 @@ static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev)
> >  			return -EBUSY;
> >  		}
> >
> > +	rte_spinlock_init(&hw->sl);
> > +
> >  	hw->use_simple_rx = 1;
> >  	hw->use_simple_tx = 1;
> >
> > @@ -1952,12 +2077,14 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
> >
> >  	PMD_INIT_LOG(DEBUG, "stop");
> >
> > +	rte_spinlock_lock(&hw->sl);
> >  	if (intr_conf->lsc || intr_conf->rxq)
> >  		virtio_intr_disable(dev);
> >
> >  	hw->started = 0;
> >  	memset(&link, 0, sizeof(link));
> >  	virtio_dev_atomic_write_link_status(dev, &link);
> > +	rte_spinlock_unlock(&hw->sl);
> >  }
> >
> >  static int
> > diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> > index 2039bc5..24271cb 100644
> > --- a/drivers/net/virtio/virtio_ethdev.h
> > +++ b/drivers/net/virtio/virtio_ethdev.h
> > @@ -67,6 +67,7 @@
> >  	 1u << VIRTIO_NET_F_HOST_TSO6	  |	\
> >  	 1u << VIRTIO_NET_F_MRG_RXBUF	  |	\
> >  	 1u << VIRTIO_NET_F_MTU	| \
> > +	 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE  | \
> >  	 1u << VIRTIO_RING_F_INDIRECT_DESC |    \
> >  	 1ULL << VIRTIO_F_VERSION_1       |	\
> >  	 1ULL << VIRTIO_F_IOMMU_PLATFORM)
> > @@ -111,6 +112,9 @@ uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> >  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> >  		uint16_t nb_pkts);
> >
> > +uint16_t virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> > +		uint16_t nb_pkts);
> > +
> >  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> >  		uint16_t nb_pkts);
> >
> > diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> > index 3c5ce66..3cd367e 100644
> > --- a/drivers/net/virtio/virtio_pci.h
> > +++ b/drivers/net/virtio/virtio_pci.h
> > @@ -270,6 +270,7 @@ struct virtio_hw {
> >  	struct virtio_pci_common_cfg *common_cfg;
> >  	struct virtio_net_config *dev_cfg;
> >  	void	    *virtio_user_dev;
> > +	rte_spinlock_t sl;
> 
> Need to add some detailed comments to describe what's
> protected by this lock.

With this feature, the hw->started flag can be changed by two threads:
the app management thread and the interrupt handler thread.
This lock prevents a case like the following:

1. Live migration finishes and the config change interrupt is triggered; the handler pauses the device (hw->started = 0).
2. The app stops the virtio port (hw->started = 0).
3. The handler injects the RARP packet into a Tx queue, sends the ack on the control queue, and resumes the device (hw->started = 1), although the port is stopped already.
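
Will add a comment on the field in v2; a rough draft of the wording:

	/*
	 * App management thread and virtio interrupt handler thread
	 * both can change device state. This lock protects the state
	 * (e.g. hw->started) from racing, for example virtio_dev_stop()
	 * vs. the ANNOUNCE handler pausing/resuming the device.
	 */
	rte_spinlock_t sl;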

> 
> >
> >  	struct virtqueue **vqs;
> >  };
> > diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> > index 6a24fde..7313bdd 100644
> > --- a/drivers/net/virtio/virtio_rxtx.c
> > +++ b/drivers/net/virtio/virtio_rxtx.c
> > @@ -1100,3 +1100,84 @@
> >
> >  	return nb_tx;
> >  }
> > +
> > +uint16_t
> > +virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> > +{
> > +	struct virtnet_tx *txvq = tx_queue;
> > +	struct virtqueue *vq = txvq->vq;
> > +	struct virtio_hw *hw = vq->hw;
> > +	uint16_t hdr_size = hw->vtnet_hdr_size;
> > +	uint16_t nb_used, nb_tx = 0;
> > +
> > +	if (unlikely(nb_pkts < 1))
> > +		return nb_pkts;
> > +
> > +	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
> > +	nb_used = VIRTQUEUE_NUSED(vq);
> > +
> > +	virtio_rmb();
> > +	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
> > +		virtio_xmit_cleanup(vq, nb_used);
> > +
> > +	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
> > +		struct rte_mbuf *txm = tx_pkts[nb_tx];
> > +		int can_push = 0, use_indirect = 0, slots, need;
> > +
> > +		/* optimize ring usage */
> > +		if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
> > +					vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
> > +			rte_mbuf_refcnt_read(txm) == 1 &&
> > +			RTE_MBUF_DIRECT(txm) &&
> > +			txm->nb_segs == 1 &&
> > +			rte_pktmbuf_headroom(txm) >= hdr_size &&
> > +			rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > +				__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > +			can_push = 1;
> > +		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > +			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > +			use_indirect = 1;
> > +
> > +		/* How many main ring entries are needed for this Tx?
> > +		 * any_layout => number of segments
> > +		 * indirect   => 1
> > +		 * default    => number of segments + 1
> > +		 */
> > +		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
> > +		need = slots - vq->vq_free_cnt;
> > +
> > +		/* Positive value indicates it needs free vring descriptors */
> > +		if (unlikely(need > 0)) {
> > +			nb_used = VIRTQUEUE_NUSED(vq);
> > +			virtio_rmb();
> > +			need = RTE_MIN(need, (int)nb_used);
> > +
> > +			virtio_xmit_cleanup(vq, need);
> > +			need = slots - vq->vq_free_cnt;
> > +			if (unlikely(need > 0)) {
> > +				PMD_TX_LOG(ERR,
> > +						"No free tx descriptors to transmit");
> > +				break;
> > +			}
> > +		}
> > +
> > +		/* Enqueue Packet buffers */
> > +		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
> > +
> > +		txvq->stats.bytes += txm->pkt_len;
> > +		virtio_update_packet_stats(&txvq->stats, txm);
> > +	}
> > +
> > +	txvq->stats.packets += nb_tx;
> > +
> > +	if (likely(nb_tx)) {
> > +		vq_update_avail_idx(vq);
> > +
> > +		if (unlikely(virtqueue_kick_prepare(vq))) {
> > +			virtqueue_notify(vq);
> > +			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
> > +		}
> > +	}
> > +
> > +	return nb_tx;
> > +}
> 
> What's the difference between virtio_inject_pkts() and
> virtio_xmit_pkts() except the latter will check hw->started?

virtio_inject_pkts() does no VLAN tag insertion. Apart from that they are
the same; both use virtqueue_enqueue_xmit() to enqueue packets.
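
For comparison, this is the extra per-packet step in virtio_xmit_pkts()
that virtio_inject_pkts() skips (quoting roughly from virtio_rxtx.c):

	/* Do VLAN tag insertion */
	if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) {
		error = rte_vlan_insert(&txm);
		if (unlikely(error)) {
			rte_pktmbuf_free(txm);
			continue;
		}
	}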

Thanks for your comments.
Xiao

