[PATCH v1 2/3] net/iavf: add NEON-optimised Tx burst function

Jay Wang jay.wang2 at arm.com
Fri Apr 17 15:08:30 CEST 2026


Add a NEON-optimised Tx burst function for the Intel IAVF driver
on AArch64.

Signed-off-by: Jay Wang <jay.wang2 at arm.com>
---
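Note for reviewers: the hot path added below builds each 16-byte data
descriptor as two 64-bit lanes (buffer DMA address in the low quadword,
DTYPE/command/buffer-size in the high quadword) and commits it with a
single 128-bit NEON store. A minimal standalone sketch of that
technique follows; the constant and type names are hypothetical
stand-ins, not the driver's actual definitions:

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical stand-ins for the driver's CI_TX_* constants. */
#define DTYPE_DATA	0x0ULL	/* descriptor type: data */
#define QW1_CMD_S	4	/* command field shift */
#define QW1_BUF_SZ_S	34	/* buffer size field shift */

struct tx_desc {		/* 16-byte hardware Tx descriptor */
	uint64_t buf_addr;	/* DMA address of the packet data */
	uint64_t cmd_type_offset_bsz;
};

/* Build both quadwords in general registers, then commit the
 * whole descriptor to the ring with one 128-bit NEON store. */
static inline void
write_desc(volatile struct tx_desc *txd, uint64_t dma_addr,
	   uint16_t data_len, uint64_t cmd_flags)
{
	uint64_t high_qw = DTYPE_DATA |
		(cmd_flags << QW1_CMD_S) |
		((uint64_t)data_len << QW1_BUF_SZ_S);
	uint64x2_t desc = {dma_addr, high_qw};

	vst1q_u64((uint64_t *)(uintptr_t)txd, desc);
}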
 drivers/net/intel/iavf/iavf.h               |   1 +
 drivers/net/intel/iavf/iavf_rxtx.c          |  15 ++-
 drivers/net/intel/iavf/iavf_rxtx.h          |   2 -
 drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 120 ++++++++++++++++++++
 4 files changed, 133 insertions(+), 5 deletions(-)
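The new iavf_xmit_pkts_vec() entry point likewise splits a large burst
so that no single fixed-burst call crosses tx_rs_thresh. A generic
sketch of that chunking pattern (all names hypothetical):

#include <stdint.h>

struct pkt;	/* opaque packet handle for this sketch */

/* Chunk a burst so no single inner call handles more than
 * 'thresh' packets; stop early once the ring runs out of free
 * descriptors (the inner call returns a short count). */
static uint16_t
xmit_chunked(void *txq, struct pkt **pkts, uint16_t nb_pkts,
	     uint16_t thresh,
	     uint16_t (*xmit_fixed)(void *, struct pkt **, uint16_t))
{
	uint16_t nb_tx = 0;

	while (nb_pkts) {
		uint16_t num = nb_pkts < thresh ? nb_pkts : thresh;
		uint16_t ret = xmit_fixed(txq, &pkts[nb_tx], num);

		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}
	return nb_tx;
}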

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index e4936f3566..3e71d345a9 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -356,6 +356,7 @@ enum iavf_rx_func_type {
 enum iavf_tx_func_type {
 	IAVF_TX_DISABLED,
 	IAVF_TX_DEFAULT,
+	IAVF_TX_NEON,
 	IAVF_TX_AVX2,
 	IAVF_TX_AVX2_OFFLOAD,
 	IAVF_TX_AVX512,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 15566a0e18..645bc5ccf6 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3662,6 +3662,15 @@ static const struct ci_tx_path_info iavf_tx_path_infos[] = {
 		}
 	},
 #endif
+#elif defined(RTE_ARCH_ARM64)
+	[IAVF_TX_NEON] = {
+		.pkt_burst = iavf_xmit_pkts_vec,
+		.info = "Vector Neon",
+		.features = {
+			.tx_offloads = IAVF_TX_VECTOR_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128
+		}
+	},
 #endif
 };
 
@@ -3878,7 +3887,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	int mbuf_check = adapter->devargs.mbuf_check;
 	int no_poll_on_link_down = adapter->devargs.no_poll_on_link_down;
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	struct ci_tx_queue *txq;
 	int i;
 	const struct ci_tx_path_features *selected_features;
@@ -3892,7 +3901,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 	if (dev->data->dev_started)
 		goto out;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	if (iavf_tx_vec_dev_check(dev) != -1)
 		req_features.simd_width = iavf_get_max_simd_bitwidth();
 
@@ -3915,7 +3924,7 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 						IAVF_TX_DEFAULT);
 
 out:
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	selected_features = &iavf_tx_path_infos[adapter->tx_func_type].features;
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
diff --git a/drivers/net/intel/iavf/iavf_rxtx.h b/drivers/net/intel/iavf/iavf_rxtx.h
index 80b06518b0..8b8e55e66f 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.h
+++ b/drivers/net/intel/iavf/iavf_rxtx.h
@@ -558,8 +558,6 @@ uint16_t iavf_recv_scattered_pkts_vec(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
 					       struct rte_mbuf **rx_pkts,
 					       uint16_t nb_pkts);
-uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-				  uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 				 uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 45e377d728..9c91b6bac1 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -445,6 +445,120 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			rx_pkts + retval, nb_pkts);
 }
 
+static __rte_always_inline void
+iavf_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
+	 uint64_t flags)
+{
+	uint64_t high_qw = (CI_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags << CI_TXD_QW1_CMD_S) |
+		((uint64_t)pkt->data_len << CI_TXD_QW1_TX_BUF_SZ_S));
+
+	uint64x2_t descriptor = {rte_pktmbuf_iova(pkt), high_qw};
+	vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor);
+}
+
+static __rte_always_inline void
+iavf_vtx(volatile struct ci_tx_desc *txdp, struct rte_mbuf **pkt,
+	uint16_t nb_pkts, uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		iavf_vtx1(txdp, *pkt, flags);
+}
+
+static __rte_always_inline uint16_t
+iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+	volatile struct ci_tx_desc *txdp;
+	struct ci_tx_entry_vec *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
+	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
+	int i;
+
+	/* crossing the tx_rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
+
+	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	nb_commit = nb_pkts;
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->ci_tx_ring[tx_id];
+	txep = &txq->sw_ring_vec[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			iavf_vtx1(txdp, *tx_pkts, flags);
+
+		/* write with RS for the last descriptor in the segment */
+		iavf_vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	/* avoid reaching the end of the ring */
+		txdp = &txq->ci_tx_ring[tx_id];
+		txep = &txq->sw_ring_vec[tx_id];
+	}
+
+	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
+					 CI_TXD_QW1_CMD_S);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		   uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		/* crossing the tx_rs_thresh boundary is not allowed */
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = iavf_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx],
+				num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
+
 void __rte_cold
 iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
 {
@@ -465,6 +579,12 @@ iavf_rx_vec_dev_check(struct rte_eth_dev *dev)
 	return iavf_rx_vec_dev_check_default(dev);
 }
 
+int __rte_cold
+iavf_tx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return iavf_tx_vec_dev_check_default(dev);
+}
+
 enum rte_vect_max_simd
 iavf_get_max_simd_bitwidth(void)
 {
-- 
2.43.0


