[dpdk-dev] [PATCH 1/2] i40e simple tx: Larger list size (33 to 128) throughput optimization

Polehn, Mike A mike.a.polehn at intel.com
Tue Oct 27 21:56:44 CET 2015


Reduce the focus on the 32-packet list size in favor of handling the full range of packet list sizes well.

Change the maximum number of new buffers processed per loop to the NIC queue's free buffer count.

Remove the redundant single-call check, leaving just one call with a focused loop.
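
To illustrate the shape of the change (a minimal sketch, not the driver's code; sketch_txq, sketch_burst_size, and nb_tx_free stand in for the real symbols):

#include <stdint.h>

struct sketch_txq {
	uint16_t nb_tx_free;   /* free descriptors left in the NIC queue */
};

/* Per-call batch size is bounded by the queue's free count rather than
 * a fixed 32-entry chunk, so one focused loop covers all burst sizes. */
static inline uint16_t
sketch_burst_size(const struct sketch_txq *txq, uint16_t nb_pkts)
{
	return nb_pkts < txq->nb_tx_free ? nb_pkts : txq->nb_tx_free;
}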

Move the NIC register update write from once per loop to once per driver call, to minimize CPU
stalls waiting on multiple SMP synchronization points and on earlier NIC register writes, which
often take large cycle counts to complete. For example, with an output list size of 64 and the
default loop size of 32, when 33 packets are queued on the descriptor table the second NIC
register write occurs just after TX processing of one packet, resulting in a large CPU stall.
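
The idea, as a hedged sketch (sketch_fill() and the field names are illustrative; the actual driver writes the tail through the I40E_PCI_REG_WRITE() macro):

#include <stdint.h>

struct sketch_txq {
	volatile uint32_t *qtx_tail;  /* mapped NIC tail (doorbell) register */
	uint32_t tx_tail;             /* next free descriptor index */
};

/* Hypothetical helper standing in for the descriptor-fill loop body. */
static uint32_t
sketch_fill(struct sketch_txq *txq, uint32_t n)
{
	txq->tx_tail += n;            /* pretend n descriptors were posted */
	return n;
}

static void
sketch_xmit(struct sketch_txq *txq, uint32_t nb_pkts)
{
	while (nb_pkts > 0)
		nb_pkts -= sketch_fill(txq, nb_pkts);

	/* One MMIO doorbell per driver call, kept out of the loop because
	 * each such write can stall the CPU for many cycles. */
	*txq->qtx_tail = txq->tx_tail;
}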

Use standard native-width variables in places to reduce the overhead of non-native variable sizes.
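
For instance (illustrative only, not driver code), a 16-bit loop index on a 64-bit target may force the compiler to emit extra zero-extension or masking on every iteration, which a native unsigned avoids:

#include <stdint.h>

uint64_t
sum_u16_index(const uint64_t *a, uint16_t n)
{
	uint64_t s = 0;
	uint16_t i;        /* non-native width: may cost masking per step */

	for (i = 0; i < n; i++)
		s += a[i];
	return s;
}

uint64_t
sum_native_index(const uint64_t *a, unsigned n)
{
	uint64_t s = 0;
	unsigned i;        /* native width: no extra masking */

	for (i = 0; i < n; i++)
		s += a[i];
	return s;
}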

Reorder the queue structure to put the most active variables in the first cache line, better
utilize the bytes within each cache line, and reduce the number of active cache lines touched
during a call.
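
A sketch of the layout idea (field names are illustrative, not the actual struct i40e_rx_queue):

#include <stdint.h>

#define CACHE_LINE 64   /* assumed; DPDK uses RTE_CACHE_LINE_SIZE */

struct sketch_rxq {
	/* Hot: touched on every receive burst; fits in the first line. */
	volatile void *rx_ring;       /* descriptor ring base */
	void *sw_ring;                /* software mbuf ring */
	uint16_t rx_tail;
	uint16_t rx_free_thresh;
	uint16_t rx_last_pos;
	uint16_t nb_rx_desc;

	/* Cold: used only at setup; pushed past the first cache line. */
	uint8_t pad[CACHE_LINE - 2 * sizeof(void *) - 4 * sizeof(uint16_t)];
	unsigned int socket_id;
	uint16_t port_id;
	uint16_t queue_id;
} __attribute__((aligned(CACHE_LINE)));  /* DPDK: __rte_cache_aligned */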

Signed-off-by: Mike A. Polehn <mike.a.polehn at intel.com>

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index ec62f75..2032e06 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -64,6 +64,7 @@
 #define DEFAULT_TX_FREE_THRESH 32
 #define I40E_MAX_PKT_TYPE      256
 #define I40E_RX_INPUT_BUF_MAX  256
+#define I40E_RX_FREE_THRESH_MIN  2
 
 #define I40E_TX_MAX_BURST  32
 
@@ -942,6 +943,12 @@ check_rx_burst_bulk_alloc_preconditions(__rte_unused struct i40e_rx_queue *rxq)
 			     "rxq->rx_free_thresh=%d",
 			     rxq->nb_rx_desc, rxq->rx_free_thresh);
 		ret = -EINVAL;
+	} else if (rxq->rx_free_thresh < I40E_RX_FREE_THRESH_MIN) {
+		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
+				"rxq->rx_free_thresh=%d, "
+				"I40E_RX_FREE_THRESH_MIN=%d",
+				rxq->rx_free_thresh, I40E_RX_FREE_THRESH_MIN);
+		ret = -EINVAL;
 	} else if (!(rxq->nb_rx_desc < (I40E_MAX_RING_DESC -
 				RTE_PMD_I40E_RX_MAX_BURST))) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
@@ -1058,9 +1065,8 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
 {
 	volatile union i40e_rx_desc *rxdp;
 	struct i40e_rx_entry *rxep;
-	struct rte_mbuf *mb;
-	unsigned alloc_idx, i;
-	uint64_t dma_addr;
+	struct rte_mbuf *pk, *npk;
+	unsigned alloc_idx, i, l;
 	int diag;
 
 	/* Allocate buffers in bulk */
@@ -1076,22 +1082,36 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
 		return -ENOMEM;
 	}
 
+	pk = rxep->mbuf;
+	rte_prefetch0(pk);
+	rxep++;
+	npk = rxep->mbuf;
+	rte_prefetch0(npk);
+	rxep++;
+	l = rxq->rx_free_thresh - 2;
+
 	rxdp = &rxq->rx_ring[alloc_idx];
 	for (i = 0; i < rxq->rx_free_thresh; i++) {
-		if (likely(i < (rxq->rx_free_thresh - 1)))
+		struct rte_mbuf *mb = pk;
+		pk = npk;
+		if (likely(i < l)) {
 			/* Prefetch next mbuf */
-			rte_prefetch0(rxep[i + 1].mbuf);
-
-		mb = rxep[i].mbuf;
-		rte_mbuf_refcnt_set(mb, 1);
-		mb->next = NULL;
+			npk = rxep->mbuf;
+			rte_prefetch0(npk);
+			rxep++;
+		}
 		mb->data_off = RTE_PKTMBUF_HEADROOM;
+		rte_mbuf_refcnt_set(mb, 1);
 		mb->nb_segs = 1;
 		mb->port = rxq->port_id;
-		dma_addr = rte_cpu_to_le_64(\
-			RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
-		rxdp[i].read.hdr_addr = 0;
-		rxdp[i].read.pkt_addr = dma_addr;
+		mb->next = NULL;
+		{
+			uint64_t dma_addr = rte_cpu_to_le_64(
+				RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
+			rxdp->read.hdr_addr = dma_addr;
+			rxdp->read.pkt_addr = dma_addr;
+		}
+		rxdp++;
 	}
 
 	rxq->rx_last_pos = alloc_idx + rxq->rx_free_thresh - 1;
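
For context, the refill loop above keeps two mbuf prefetches in flight so each header is likely in cache by the time it is written; this is also why the new I40E_RX_FREE_THRESH_MIN precondition of 2 is needed. The same pattern in isolation (simplified, with hypothetical names):

#include <rte_prefetch.h>   /* rte_prefetch0() */

/* Stand-in for the per-mbuf header writes in i40e_rx_alloc_bufs(). */
static void
obj_init(void *obj)
{
	(void)obj;
}

/* Caller must guarantee n >= 2, as the precondition check enforces. */
static void
init_all(void * const *ring, unsigned n)
{
	void *cur, *nxt;
	unsigned i, last = n - 2;

	cur = ring[0];
	rte_prefetch0(cur);
	nxt = ring[1];
	rte_prefetch0(nxt);

	for (i = 0; i < n; i++) {
		void *obj = cur;

		cur = nxt;
		if (i < last) {
			nxt = ring[i + 2];  /* stay two entries ahead */
			rte_prefetch0(nxt);
		}
		obj_init(obj);  /* header cache lines already warming */
	}
}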

