[RFC 4/4] net/af_packet: add VPP-style prefetching to receive path

Stephen Hemminger stephen at networkplumber.org
Wed Jan 28 18:30:20 CET 2026


Implement the single/dual/quad loop design pattern from FD.IO VPP to
improve cache efficiency in the af_packet PMD receive path.

The original implementation processes packets one at a time in a simple
loop, which can result in cache misses when accessing frame headers and
packet data. The new implementation:

- Processes packets in batches of 4 (quad), 2 (dual), and 1 (single)
- Prefetches the next batch of frame headers while processing the current batch
- Prefetches packet data before memcpy to hide memory latency
- Reduces loop overhead through partial unrolling

Two helper functions are introduced:
- af_packet_get_frame(): Returns frame pointer at index with wraparound
- af_packet_rx_one(): Common per-packet processing (mbuf alloc, memcpy,
  VLAN handling, timestamp offload)

The quad loop checks availability of all 4 frames before processing,
falling through to dual/single loops when fewer frames are ready. Early
exit paths (out_advance1/2/3) ensure correct frame index tracking when
mbuf allocation fails mid-batch.

Prefetch strategy:
- Frame headers: the quad loop prefetches N+4..N+7 while processing N..N+3;
  the dual loop prefetches N+2..N+3 and the single loop prefetches N+1
- Packet data: prefetch at tp_mac offset before memcpy

This pattern is well-established in high-performance packet processing
and should improve throughput by making better use of the CPU cache
hierarchy, particularly when processing bursts of packets.

Signed-off-by: Stephen Hemminger <stephen at networkplumber.org>
---
 drivers/net/af_packet/rte_eth_af_packet.c | 208 +++++++++++++++++-----
 1 file changed, 164 insertions(+), 44 deletions(-)

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 5847e14d80..946c21d878 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -14,6 +14,7 @@
 #include <rte_malloc.h>
 #include <rte_kvargs.h>
 #include <bus_vdev_driver.h>
+#include <rte_prefetch.h>
 
 #include <errno.h>
 #include <linux/if_ether.h>
@@ -120,75 +121,194 @@ RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);
 	RTE_LOG_LINE(level, AFPACKET, "%s(): " fmt ":%s", __func__, \
 		## __VA_ARGS__, strerror(errno))
 
-static uint16_t
-eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+/*
+ * Frame at ring index idx; modulo wrap, as the +7 lookahead can reach 2 * framecount
+ */
+static inline struct tpacket2_hdr *
+af_packet_get_frame(struct pkt_rx_queue *pkt_q, unsigned int idx)
+{
+	if (idx >= pkt_q->framecount)
+		idx %= pkt_q->framecount;
+	return (struct tpacket2_hdr *)pkt_q->rd[idx].iov_base;
+}
+
+/*
+ * Receive one frame into a new mbuf (shared by all loops); 0 on success, -1 if alloc fails
+ */
+static inline int
+af_packet_rx_one(struct pkt_rx_queue *pkt_q,
+		 struct tpacket2_hdr *ppd,
+		 struct rte_mbuf **mbuf_out,
+		 unsigned long *rx_bytes)
 {
-	unsigned i;
-	struct tpacket2_hdr *ppd;
 	struct rte_mbuf *mbuf;
 	uint8_t *pbuf;
+
+	mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
+	if (unlikely(mbuf == NULL)) {
+		pkt_q->rx_nombuf++;
+		return -1;	/* tp_status is left unchanged */
+	}
+
+	rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen;
+	pbuf = (uint8_t *)ppd + ppd->tp_mac;
+	memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf));
+
+	if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
+		mbuf->vlan_tci = ppd->tp_vlan_tci;
+		mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
+		if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf))
+			PMD_LOG(ERR, "Failed to reinsert VLAN tag");
+	}
+
+	if (pkt_q->timestamp_offloading) {
+		*RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset,
+			rte_mbuf_timestamp_t *) =
+				(uint64_t)ppd->tp_sec * 1000000000 + ppd->tp_nsec;
+		mbuf->ol_flags |= timestamp_dynflag;
+	}
+
+	mbuf->port = pkt_q->in_port;
+	*mbuf_out = mbuf;
+	*rx_bytes += mbuf->pkt_len;
+	ppd->tp_status = TP_STATUS_KERNEL;	/* release the frame */
+
+	return 0;
+}
+
+/*
+ * Receive packets using VPP-style single/dual/quad loop pattern with prefetching.
+ */
+static uint16_t
+eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
 	struct pkt_rx_queue *pkt_q = queue;
+	struct tpacket2_hdr *ppd0, *ppd1, *ppd2, *ppd3;
 	uint16_t num_rx = 0;
 	unsigned long num_rx_bytes = 0;
 	unsigned int framecount, framenum;
+	uint16_t n_left;
 
 	if (unlikely(nb_pkts == 0))
 		return 0;
 
-	/*
-	 * Reads the given number of packets from the AF_PACKET socket one by
-	 * one and copies the packet data into a newly allocated mbuf.
-	 */
 	framecount = pkt_q->framecount;
 	framenum = pkt_q->framenum;
-	for (i = 0; i < nb_pkts; i++) {
-		/* point at the next incoming frame */
-		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
-		if ((ppd->tp_status & TP_STATUS_USER) == 0)
+	n_left = nb_pkts;
+
+	/* Quad loop: Process 4 packets at a time with prefetching */
+	while (n_left >= 4) {
+		ppd0 = af_packet_get_frame(pkt_q, framenum);
+		ppd1 = af_packet_get_frame(pkt_q, framenum + 1);
+		ppd2 = af_packet_get_frame(pkt_q, framenum + 2);
+		ppd3 = af_packet_get_frame(pkt_q, framenum + 3);
+
+		if ((ppd0->tp_status & TP_STATUS_USER) == 0)
 			break;
+		if ((ppd1->tp_status & TP_STATUS_USER) == 0)
+			goto dual_loop;
+		if ((ppd2->tp_status & TP_STATUS_USER) == 0)
+			goto dual_loop;
+		if ((ppd3->tp_status & TP_STATUS_USER) == 0)
+			goto dual_loop;
+
+		/* Prefetch next 4 frame headers */
+		rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 4));
+		rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 5));
+		rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 6));
+		rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 7));
+
+		/* Prefetch packet data */
+		rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac);
+		rte_prefetch0((uint8_t *)ppd1 + ppd1->tp_mac);
+		rte_prefetch0((uint8_t *)ppd2 + ppd2->tp_mac);
+		rte_prefetch0((uint8_t *)ppd3 + ppd3->tp_mac);
+
+		if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], &num_rx_bytes) < 0))
+			goto out;
+		num_rx++;
+		if (unlikely(af_packet_rx_one(pkt_q, ppd1, &bufs[num_rx], &num_rx_bytes) < 0))
+			goto out_advance1;
+		num_rx++;
+		if (unlikely(af_packet_rx_one(pkt_q, ppd2, &bufs[num_rx], &num_rx_bytes) < 0))
+			goto out_advance2;
+		num_rx++;
+		if (unlikely(af_packet_rx_one(pkt_q, ppd3, &bufs[num_rx], &num_rx_bytes) < 0))
+			goto out_advance3;
+		num_rx++;
 
-		/* allocate the next mbuf */
-		mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
-		if (unlikely(mbuf == NULL)) {
-			pkt_q->rx_nombuf++;
+		framenum += 4;
+		if (framenum >= framecount)
+			framenum -= framecount;
+		n_left -= 4;
+	}
+
+dual_loop:
+	/* Dual loop: Process 2 packets at a time */
+	while (n_left >= 2) {
+		ppd0 = af_packet_get_frame(pkt_q, framenum);
+		ppd1 = af_packet_get_frame(pkt_q, framenum + 1);
+
+		if ((ppd0->tp_status & TP_STATUS_USER) == 0)
 			break;
-		}
+		if ((ppd1->tp_status & TP_STATUS_USER) == 0)
+			goto single_loop;
 
-		/* packet will fit in the mbuf, go ahead and receive it */
-		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen;
-		pbuf = (uint8_t *) ppd + ppd->tp_mac;
-		memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf));
+		rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 2));
+		rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 3));
+		rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac);
+		rte_prefetch0((uint8_t *)ppd1 + ppd1->tp_mac);
 
-		/* check for vlan info */
-		if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
-			mbuf->vlan_tci = ppd->tp_vlan_tci;
-			mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
+		if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], &num_rx_bytes) < 0))
+			goto out;
+		num_rx++;
+		if (unlikely(af_packet_rx_one(pkt_q, ppd1, &bufs[num_rx], &num_rx_bytes) < 0))
+			goto out_advance1;
+		num_rx++;
 
-			if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf))
-				PMD_LOG(ERR, "Failed to reinsert VLAN tag");
-		}
+		framenum += 2;
+		if (framenum >= framecount)
+			framenum -= framecount;
+		n_left -= 2;
+	}
 
-		/* add kernel provided timestamp when offloading is enabled */
-		if (pkt_q->timestamp_offloading) {
-			/* since TPACKET_V2 timestamps are provided in nanoseconds resolution */
-			*RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset,
-				rte_mbuf_timestamp_t *) =
-					(uint64_t)ppd->tp_sec * 1000000000 + ppd->tp_nsec;
+single_loop:
+	/* Single loop: Process remaining packets */
+	while (n_left >= 1) {
+		ppd0 = af_packet_get_frame(pkt_q, framenum);
 
-			mbuf->ol_flags |= timestamp_dynflag;
-		}
+		if ((ppd0->tp_status & TP_STATUS_USER) == 0)
+			break;
 
-		/* release incoming frame and advance ring buffer */
-		ppd->tp_status = TP_STATUS_KERNEL;
-		if (++framenum >= framecount)
-			framenum = 0;
-		mbuf->port = pkt_q->in_port;
+		rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 1));
+		rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac);
 
-		/* account for the receive frame */
-		bufs[i] = mbuf;
+		if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], &num_rx_bytes) < 0))
+			goto out;
 		num_rx++;
-		num_rx_bytes += mbuf->pkt_len;
+
+		if (++framenum >= framecount)
+			framenum = 0;
+		n_left--;
 	}
+
+	goto out;
+
+out_advance3:
+	framenum += 3;
+	if (framenum >= framecount)
+		framenum -= framecount;
+	goto out;
+out_advance2:
+	framenum += 2;
+	if (framenum >= framecount)
+		framenum -= framecount;
+	goto out;
+out_advance1:
+	framenum += 1;
+	if (framenum >= framecount)
+		framenum -= framecount;
+out:
 	pkt_q->framenum = framenum;
 	pkt_q->rx_pkts += num_rx;
 	pkt_q->rx_bytes += num_rx_bytes;
-- 
2.51.0



More information about the dev mailing list