[dpdk-dev] [PATCH v3 3/4] net/netvsc: split send buffers from transmit descriptors

Stephen Hemminger stephen at networkplumber.org
Wed Mar 18 21:51:03 CET 2020


The VMBus has reserved transmit area (per device) and transmit descriptors
(per queue). The previous code was always having a 1:1 mapping between
send buffers and descriptors. This can lead to one queue starving another
and also buffer bloat.

Change to working more like FreeBSD where there is a pool of transmit
descriptors per queue. If send buffer is not available then no aggregation
happens but the queue can still drain.

Fixes: 4e9c73e96e83 ("net/netvsc: add Hyper-V network device")
Cc: stable at dpdk.org
Signed-off-by: Stephen Hemminger <stephen at networkplumber.org>
---
 drivers/net/netvsc/hn_ethdev.c |   9 +-
 drivers/net/netvsc/hn_rxtx.c   | 261 ++++++++++++++++++++-------------
 drivers/net/netvsc/hn_var.h    |  10 +-
 3 files changed, 174 insertions(+), 106 deletions(-)

diff --git a/drivers/net/netvsc/hn_ethdev.c b/drivers/net/netvsc/hn_ethdev.c
index 564620748daf..ac6610838008 100644
--- a/drivers/net/netvsc/hn_ethdev.c
+++ b/drivers/net/netvsc/hn_ethdev.c
@@ -257,6 +257,9 @@ static int hn_dev_info_get(struct rte_eth_dev *dev,
 	dev_info->max_rx_queues = hv->max_queues;
 	dev_info->max_tx_queues = hv->max_queues;
 
+	dev_info->tx_desc_lim.nb_min = 1;
+	dev_info->tx_desc_lim.nb_max = 4096;
+
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
 		return 0;
 
@@ -982,7 +985,7 @@ eth_hn_dev_init(struct rte_eth_dev *eth_dev)
 	if  (err)
 		goto failed;
 
-	err = hn_tx_pool_init(eth_dev);
+	err = hn_chim_init(eth_dev);
 	if (err)
 		goto failed;
 
@@ -1018,7 +1021,7 @@ eth_hn_dev_init(struct rte_eth_dev *eth_dev)
 failed:
 	PMD_INIT_LOG(NOTICE, "device init failed");
 
-	hn_tx_pool_uninit(eth_dev);
+	hn_chim_uninit(eth_dev);
 	hn_detach(hv);
 	return err;
 }
@@ -1042,7 +1045,7 @@ eth_hn_dev_uninit(struct rte_eth_dev *eth_dev)
 	eth_dev->rx_pkt_burst = NULL;
 
 	hn_detach(hv);
-	hn_tx_pool_uninit(eth_dev);
+	hn_chim_uninit(eth_dev);
 	rte_vmbus_chan_close(hv->primary->chan);
 	rte_free(hv->primary);
 	ret = rte_eth_dev_owner_delete(hv->owner.id);
diff --git a/drivers/net/netvsc/hn_rxtx.c b/drivers/net/netvsc/hn_rxtx.c
index 7212780c156e..eea09a431ec6 100644
--- a/drivers/net/netvsc/hn_rxtx.c
+++ b/drivers/net/netvsc/hn_rxtx.c
@@ -18,6 +18,7 @@
 #include <rte_memzone.h>
 #include <rte_malloc.h>
 #include <rte_atomic.h>
+#include <rte_bitmap.h>
 #include <rte_branch_prediction.h>
 #include <rte_ether.h>
 #include <rte_common.h>
@@ -83,7 +84,7 @@ struct hn_txdesc {
 	struct rte_mbuf *m;
 
 	uint16_t	queue_id;
-	uint16_t	chim_index;
+	uint32_t	chim_index;
 	uint32_t	chim_size;
 	uint32_t	data_size;
 	uint32_t	packets;
@@ -98,6 +99,8 @@ struct hn_txdesc {
 	 RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +	\
 	 RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
 
+#define HN_RNDIS_PKT_ALIGNED	RTE_ALIGN(HN_RNDIS_PKT_LEN, RTE_CACHE_LINE_SIZE)
+
 /* Minimum space required for a packet */
 #define HN_PKTSIZE_MIN(align) \
 	RTE_ALIGN(RTE_ETHER_MIN_LEN + HN_RNDIS_PKT_LEN, align)
@@ -150,63 +153,80 @@ hn_rndis_pktmsg_offset(uint32_t ofs)
 static void hn_txd_init(struct rte_mempool *mp __rte_unused,
 			void *opaque, void *obj, unsigned int idx)
 {
+	struct hn_tx_queue *txq = opaque;
 	struct hn_txdesc *txd = obj;
-	struct rte_eth_dev *dev = opaque;
-	struct rndis_packet_msg *pkt;
 
 	memset(txd, 0, sizeof(*txd));
-	txd->chim_index = idx;
-
-	pkt = rte_malloc_socket("RNDIS_TX", HN_RNDIS_PKT_LEN,
-				rte_align32pow2(HN_RNDIS_PKT_LEN),
-				dev->device->numa_node);
-	if (!pkt)
-		rte_exit(EXIT_FAILURE, "can not allocate RNDIS header");
 
-	txd->rndis_pkt = pkt;
+	txd->queue_id = txq->queue_id;
+	txd->chim_index = NVS_CHIM_IDX_INVALID;
+	txd->rndis_pkt = (struct rndis_packet_msg *)(char *)txq->tx_rndis
+		+ idx * HN_RNDIS_PKT_ALIGNED;
 }
 
-/*
- * Unlike Linux and FreeBSD, this driver uses a mempool
- * to limit outstanding transmits and reserve buffers
- */
 int
-hn_tx_pool_init(struct rte_eth_dev *dev)
+hn_chim_init(struct rte_eth_dev *dev)
 {
 	struct hn_data *hv = dev->data->dev_private;
-	char name[RTE_MEMPOOL_NAMESIZE];
-	struct rte_mempool *mp;
+	uint32_t i, chim_bmp_size;
+
+	rte_spinlock_init(&hv->chim_lock);
+	chim_bmp_size = rte_bitmap_get_memory_footprint(hv->chim_cnt);
+	hv->chim_bmem = rte_zmalloc("hn_chim_bitmap", chim_bmp_size,
+				    RTE_CACHE_LINE_SIZE);
+	if (hv->chim_bmem == NULL) {
+		PMD_INIT_LOG(ERR, "failed to allocate bitmap size %u",
+			     chim_bmp_size);
+		return -1;
+	}
 
-	snprintf(name, sizeof(name),
-		 "hn_txd_%u", dev->data->port_id);
-
-	PMD_INIT_LOG(DEBUG, "create a TX send pool %s n=%u size=%zu socket=%d",
-		     name, hv->chim_cnt, sizeof(struct hn_txdesc),
-		     dev->device->numa_node);
-
-	mp = rte_mempool_create(name, hv->chim_cnt, sizeof(struct hn_txdesc),
-				HN_TXD_CACHE_SIZE, 0,
-				NULL, NULL,
-				hn_txd_init, dev,
-				dev->device->numa_node, 0);
-	if (!mp) {
-		PMD_DRV_LOG(ERR,
-			    "mempool %s create failed: %d", name, rte_errno);
-		return -rte_errno;
+	PMD_INIT_LOG(DEBUG, "create a TX bitmap n=%u size=%u",
+		     hv->chim_cnt, chim_bmp_size);
+
+	hv->chim_bmap = rte_bitmap_init(hv->chim_cnt, hv->chim_bmem,
+					chim_bmp_size);
+	if (hv->chim_bmap == NULL) {
+		PMD_INIT_LOG(ERR, "failed to init chim bitmap");
+		return -1;
 	}
 
-	hv->tx_pool = mp;
+	for (i = 0; i < hv->chim_cnt; i++)
+		rte_bitmap_set(hv->chim_bmap, i);
+
 	return 0;
 }
 
 void
-hn_tx_pool_uninit(struct rte_eth_dev *dev)
+hn_chim_uninit(struct rte_eth_dev *dev)
 {
 	struct hn_data *hv = dev->data->dev_private;
 
-	if (hv->tx_pool) {
-		rte_mempool_free(hv->tx_pool);
-		hv->tx_pool = NULL;
+	rte_bitmap_free(hv->chim_bmap);
+	rte_free(hv->chim_bmem);
+	hv->chim_bmem = NULL;
+}
+
+static uint32_t hn_chim_alloc(struct hn_data *hv)
+{
+	uint32_t index = NVS_CHIM_IDX_INVALID;
+	uint64_t slab;
+
+	rte_spinlock_lock(&hv->chim_lock);
+	if (rte_bitmap_scan(hv->chim_bmap, &index, &slab))
+		rte_bitmap_clear(hv->chim_bmap, index);
+	rte_spinlock_unlock(&hv->chim_lock);
+
+	return index;
+}
+
+static void hn_chim_free(struct hn_data *hv, uint32_t chim_idx)
+{
+	if (chim_idx >= hv->chim_cnt) {
+		PMD_DRV_LOG(ERR, "Invalid chimney index %u", chim_idx);
+	} else {
+		rte_spinlock_lock(&hv->chim_lock);
+		rte_bitmap_set(hv->chim_bmap, chim_idx);
+		rte_spinlock_unlock(&hv->chim_lock);
 	}
 }
 
@@ -220,15 +240,16 @@ static void hn_reset_txagg(struct hn_tx_queue *txq)
 
 int
 hn_dev_tx_queue_setup(struct rte_eth_dev *dev,
-		      uint16_t queue_idx, uint16_t nb_desc __rte_unused,
+		      uint16_t queue_idx, uint16_t nb_desc,
 		      unsigned int socket_id,
 		      const struct rte_eth_txconf *tx_conf)
 
 {
 	struct hn_data *hv = dev->data->dev_private;
 	struct hn_tx_queue *txq;
+	char name[RTE_MEMPOOL_NAMESIZE];
 	uint32_t tx_free_thresh;
-	int err;
+	int err = -ENOMEM;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -252,6 +273,29 @@ hn_dev_tx_queue_setup(struct rte_eth_dev *dev,
 
 	txq->free_thresh = tx_free_thresh;
 
+	snprintf(name, sizeof(name),
+		 "hn_txd_%u_%u", dev->data->port_id, queue_idx);
+
+	PMD_INIT_LOG(DEBUG, "TX descriptor pool %s n=%u size=%zu",
+		     name, nb_desc, sizeof(struct hn_txdesc));
+
+	txq->tx_rndis = rte_calloc("hn_txq_rndis", nb_desc,
+				   HN_RNDIS_PKT_ALIGNED, RTE_CACHE_LINE_SIZE);
+	if (txq->tx_rndis == NULL)
+		goto error;
+
+	txq->txdesc_pool = rte_mempool_create(name, nb_desc,
+					      sizeof(struct hn_txdesc),
+					      0, 0,
+					      NULL, NULL,
+					      hn_txd_init, txq,
+					      dev->device->numa_node, 0);
+	if (txq->txdesc_pool == NULL) {
+		PMD_DRV_LOG(ERR,
+			    "mempool %s create failed: %d", name, rte_errno);
+		goto error;
+	}
+
 	txq->agg_szmax  = RTE_MIN(hv->chim_szmax, hv->rndis_agg_size);
 	txq->agg_pktmax = hv->rndis_agg_pkts;
 	txq->agg_align  = hv->rndis_agg_align;
@@ -260,31 +304,57 @@ hn_dev_tx_queue_setup(struct rte_eth_dev *dev,
 
 	err = hn_vf_tx_queue_setup(dev, queue_idx, nb_desc,
 				     socket_id, tx_conf);
-	if (err) {
-		rte_free(txq);
-		return err;
+	if (err == 0) {
+		dev->data->tx_queues[queue_idx] = txq;
+		return 0;
 	}
 
-	dev->data->tx_queues[queue_idx] = txq;
-	return 0;
+error:
+	if (txq->txdesc_pool)
+		rte_mempool_free(txq->txdesc_pool);
+	rte_free(txq->tx_rndis);
+	rte_free(txq);
+	return err;
+}
+
+
+static struct hn_txdesc *hn_txd_get(struct hn_tx_queue *txq)
+{
+	struct hn_txdesc *txd;
+
+	if (rte_mempool_get(txq->txdesc_pool, (void **)&txd)) {
+		++txq->stats.ring_full;
+		PMD_TX_LOG(DEBUG, "tx pool exhausted!");
+		return NULL;
+	}
+
+	txd->m = NULL;
+	txd->packets = 0;
+	txd->data_size = 0;
+	txd->chim_size = 0;
+
+	return txd;
+}
+
+static void hn_txd_put(struct hn_tx_queue *txq, struct hn_txdesc *txd)
+{
+	rte_mempool_put(txq->txdesc_pool, txd);
 }
 
 void
 hn_dev_tx_queue_release(void *arg)
 {
 	struct hn_tx_queue *txq = arg;
-	struct hn_txdesc *txd;
 
 	PMD_INIT_FUNC_TRACE();
 
 	if (!txq)
 		return;
 
-	/* If any pending data is still present just drop it */
-	txd = txq->agg_txd;
-	if (txd)
-		rte_mempool_put(txq->hv->tx_pool, txd);
+	if (txq->txdesc_pool)
+		rte_mempool_free(txq->txdesc_pool);
 
+	rte_free(txq->tx_rndis);
 	rte_free(txq);
 }
 
@@ -292,6 +362,7 @@ static void
 hn_nvs_send_completed(struct rte_eth_dev *dev, uint16_t queue_id,
 		      unsigned long xactid, const struct hn_nvs_rndis_ack *ack)
 {
+	struct hn_data *hv = dev->data->dev_private;
 	struct hn_txdesc *txd = (struct hn_txdesc *)xactid;
 	struct hn_tx_queue *txq;
 
@@ -312,9 +383,11 @@ hn_nvs_send_completed(struct rte_eth_dev *dev, uint16_t queue_id,
 		++txq->stats.errors;
 	}
 
-	rte_pktmbuf_free(txd->m);
+	if (txd->chim_index != NVS_CHIM_IDX_INVALID)
+		hn_chim_free(hv, txd->chim_index);
 
-	rte_mempool_put(txq->hv->tx_pool, txd);
+	rte_pktmbuf_free(txd->m);
+	hn_txd_put(txq, txd);
 }
 
 /* Handle transmit completion events */
@@ -1036,28 +1109,15 @@ static int hn_flush_txagg(struct hn_tx_queue *txq, bool *need_sig)
 	return ret;
 }
 
-static struct hn_txdesc *hn_new_txd(struct hn_data *hv,
-				    struct hn_tx_queue *txq)
-{
-	struct hn_txdesc *txd;
-
-	if (rte_mempool_get(hv->tx_pool, (void **)&txd)) {
-		++txq->stats.ring_full;
-		PMD_TX_LOG(DEBUG, "tx pool exhausted!");
-		return NULL;
-	}
-
-	txd->m = NULL;
-	txd->queue_id = txq->queue_id;
-	txd->packets = 0;
-	txd->data_size = 0;
-	txd->chim_size = 0;
-
-	return txd;
-}
-
+/*
+ * Try and find a place in a send chimney buffer to put
+ * the small packet. If space is available, this routine
+ * returns a pointer of where to place the data.
+ * If no space, caller should try direct transmit.
+ */
 static void *
-hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq, uint32_t pktsize)
+hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq,
+	     struct hn_txdesc *txd, uint32_t pktsize)
 {
 	struct hn_txdesc *agg_txd = txq->agg_txd;
 	struct rndis_packet_msg *pkt;
@@ -1085,7 +1145,7 @@ hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq, uint32_t pktsize)
 		}
 
 		chim = (uint8_t *)pkt + pkt->len;
-
+		txq->agg_prevpkt = chim;
 		txq->agg_pktleft--;
 		txq->agg_szleft -= pktsize;
 		if (txq->agg_szleft < HN_PKTSIZE_MIN(txq->agg_align)) {
@@ -1095,18 +1155,21 @@ hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq, uint32_t pktsize)
 			 */
 			txq->agg_pktleft = 0;
 		}
-	} else {
-		agg_txd = hn_new_txd(hv, txq);
-		if (!agg_txd)
-			return NULL;
-
-		chim = (uint8_t *)hv->chim_res->addr
-			+ agg_txd->chim_index * hv->chim_szmax;
 
-		txq->agg_txd = agg_txd;
-		txq->agg_pktleft = txq->agg_pktmax - 1;
-		txq->agg_szleft = txq->agg_szmax - pktsize;
+		hn_txd_put(txq, txd);
+		return chim;
 	}
+
+	txd->chim_index = hn_chim_alloc(hv);
+	if (txd->chim_index == NVS_CHIM_IDX_INVALID)
+		return NULL;
+
+	chim = (uint8_t *)hv->chim_res->addr
+			+ txd->chim_index * hv->chim_szmax;
+
+	txq->agg_txd = txd;
+	txq->agg_pktleft = txq->agg_pktmax - 1;
+	txq->agg_szleft = txq->agg_szmax - pktsize;
 	txq->agg_prevpkt = chim;
 
 	return chim;
@@ -1329,13 +1392,18 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		return (*vf_dev->tx_pkt_burst)(sub_q, tx_pkts, nb_pkts);
 	}
 
-	if (rte_mempool_avail_count(hv->tx_pool) <= txq->free_thresh)
+	if (rte_mempool_avail_count(txq->txdesc_pool) <= txq->free_thresh)
 		hn_process_events(hv, txq->queue_id, 0);
 
 	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
 		struct rte_mbuf *m = tx_pkts[nb_tx];
 		uint32_t pkt_size = m->pkt_len + HN_RNDIS_PKT_LEN;
 		struct rndis_packet_msg *pkt;
+		struct hn_txdesc *txd;
+
+		txd = hn_txd_get(txq);
+		if (txd == NULL)
+			break;
 
 		/* For small packets aggregate them in chimney buffer */
 		if (m->pkt_len < HN_TXCOPY_THRESHOLD && pkt_size <= txq->agg_szmax) {
@@ -1346,7 +1414,8 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 					goto fail;
 			}
 
-			pkt = hn_try_txagg(hv, txq, pkt_size);
+
+			pkt = hn_try_txagg(hv, txq, txd, pkt_size);
 			if (unlikely(!pkt))
 				break;
 
@@ -1360,21 +1429,13 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 			    hn_flush_txagg(txq, &need_sig))
 				goto fail;
 		} else {
-			struct hn_txdesc *txd;
-
-			/* can send chimney data and large packet at once */
-			txd = txq->agg_txd;
-			if (txd) {
-				hn_reset_txagg(txq);
-			} else {
-				txd = hn_new_txd(hv, txq);
-				if (unlikely(!txd))
-					break;
-			}
+			/* Send any outstanding packets in buffer */
+			if (txq->agg_txd && hn_flush_txagg(txq, &need_sig))
+				goto fail;
 
 			pkt = txd->rndis_pkt;
 			txd->m = m;
-			txd->data_size += m->pkt_len;
+			txd->data_size = m->pkt_len;
 			++txd->packets;
 
 			hn_encap(pkt, queue_id, m);
@@ -1383,7 +1444,7 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 			if (unlikely(ret != 0)) {
 				PMD_TX_LOG(NOTICE, "sg send failed: %d", ret);
 				++txq->stats.errors;
-				rte_mempool_put(hv->tx_pool, txd);
+				hn_txd_put(txq, txd);
 				goto fail;
 			}
 		}
diff --git a/drivers/net/netvsc/hn_var.h b/drivers/net/netvsc/hn_var.h
index 05bc492511ec..822d737bd3cc 100644
--- a/drivers/net/netvsc/hn_var.h
+++ b/drivers/net/netvsc/hn_var.h
@@ -52,6 +52,8 @@ struct hn_tx_queue {
 	uint16_t	port_id;
 	uint16_t	queue_id;
 	uint32_t	free_thresh;
+	struct rte_mempool *txdesc_pool;
+	void		*tx_rndis;
 
 	/* Applied packet transmission aggregation limits. */
 	uint32_t	agg_szmax;
@@ -115,8 +117,10 @@ struct hn_data {
 	uint16_t	num_queues;
 	uint64_t	rss_offloads;
 
+	rte_spinlock_t	chim_lock;
 	struct rte_mem_resource *chim_res;	/* UIO resource for Tx */
-	struct rte_mempool *tx_pool;		/* Tx descriptors */
+	struct rte_bitmap *chim_bmap;		/* Send buffer map */
+	void		*chim_bmem;
 	uint32_t	chim_szmax;		/* Max size per buffer */
 	uint32_t	chim_cnt;		/* Max packets per buffer */
 
@@ -157,8 +161,8 @@ uint16_t hn_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 uint16_t hn_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		      uint16_t nb_pkts);
 
-int	hn_tx_pool_init(struct rte_eth_dev *dev);
-void	hn_tx_pool_uninit(struct rte_eth_dev *dev);
+int	hn_chim_init(struct rte_eth_dev *dev);
+void	hn_chim_uninit(struct rte_eth_dev *dev);
 int	hn_dev_link_update(struct rte_eth_dev *dev, int wait);
 int	hn_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 			      uint16_t nb_desc, unsigned int socket_id,
-- 
2.20.1



More information about the dev mailing list