[PATCH v1 1/3] net/iavf: add Rx scattered function for 32B desc
Jay Wang
jay.wang2 at arm.com
Fri Apr 17 15:08:29 CEST 2026
Added the scattered burst function on AArch64 so that we can leverage
the NEON-optimised Rx raw burst function to handle scattered packets for
the legacy 32B descriptor.
Signed-off-by: Jay Wang <jay.wang2 at arm.com>
---
drivers/net/intel/iavf/iavf.h | 1 +
drivers/net/intel/iavf/iavf_rxtx.c | 16 ++-
drivers/net/intel/iavf/iavf_rxtx_vec_neon.c | 110 +++++++++++++++++++-
drivers/net/intel/iavf/meson.build | 2 +-
4 files changed, 122 insertions(+), 7 deletions(-)
diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 403c61e2e8..e4936f3566 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -334,6 +334,7 @@ enum iavf_rx_func_type {
IAVF_RX_BULK_ALLOC,
IAVF_RX_BULK_ALLOC_FLEX_RXD,
IAVF_RX_NEON,
+ IAVF_RX_NEON_SCATTERED,
IAVF_RX_AVX2,
IAVF_RX_AVX2_SCATTERED,
IAVF_RX_AVX2_OFFLOAD,
diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
index 4ff6c18dc4..15566a0e18 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.c
+++ b/drivers/net/intel/iavf/iavf_rxtx.c
@@ -3551,16 +3551,26 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
}
},
#endif
-#elif defined RTE_ARCH_ARM
+#elif defined(RTE_ARCH_ARM64)
[IAVF_RX_NEON] = {
.pkt_burst = iavf_recv_pkts_vec,
.info = "Vector Neon",
.features = {
- .rx_offloads = IAVF_RX_SCALAR_OFFLOADS,
+ .rx_offloads = IAVF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_128,
.bulk_alloc = true
}
},
+ [IAVF_RX_NEON_SCATTERED] = {
+ .pkt_burst = iavf_recv_scattered_pkts_vec,
+ .info = "Vector Scattered Neon",
+ .features = {
+ .rx_offloads = IAVF_RX_VECTOR_OFFLOADS | RTE_ETH_RX_OFFLOAD_SCATTER,
+ .simd_width = RTE_VECT_SIMD_128,
+ .scattered = true,
+ .bulk_alloc = true
+ }
+ },
#endif
};
@@ -3839,7 +3849,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
if (adapter->rx_bulk_alloc_allowed) {
req_features.bulk_alloc = true;
default_path = IAVF_RX_BULK_ALLOC;
-#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
if (iavf_rx_vec_dev_check(dev) != -1)
req_features.simd_width = iavf_get_max_simd_bitwidth();
#endif
diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
index 28c90b2a72..45e377d728 100644
--- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
+++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2022 Intel Corporation
- * Copyright(c) 2022 Arm Limited
+ * Copyright(c) 2022-2026 Arm Limited
*/
#include <stdint.h>
@@ -145,8 +145,6 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
struct rte_mbuf **__rte_restrict rx_pkts,
uint16_t nb_pkts, uint8_t *split_packet)
{
- RTE_SET_USED(split_packet);
-
volatile union ci_rx_desc *rxdp;
struct ci_rx_entry *sw_ring;
uint16_t nb_pkts_recd;
@@ -164,6 +162,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
4, 5, 6, 7 /* octet 4~7, 32bits rss */
};
+ uint8x16_t eop_check = {
+ 0x02, 0x00, 0x02, 0x00,
+ 0x02, 0x00, 0x02, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00
+ };
+
uint16x8_t crc_adjust = {
0, 0, /* ignore pkt_type field */
rxq->crc_len, /* sub crc on pkt_len */
@@ -238,6 +243,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+ if (split_packet) {
+ rte_mbuf_prefetch_part2(rx_pkts[pos]);
+ rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+ rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+ rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+ }
+
/* pkts shift the pktlen field to be 16-bit aligned*/
uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
len_shl);
@@ -306,6 +318,32 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
staterr = vzipq_u16(sterr_tmp1.val[1],
sterr_tmp2.val[1]).val[0];
+ /* C* extract and record EOP bit */
+ if (split_packet) {
+ uint8x16_t eop_shuf_mask = {
+ 0x00, 0x02, 0x04, 0x06,
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF
+ };
+ uint8x16_t eop_bits;
+
+ /* and with mask to extract bits, flipping 1-0 */
+ eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+ eop_bits = vandq_u8(eop_bits, eop_check);
+ /* the staterr values are not in order, as the count
+ * of dd bits doesn't care. However, for end of
+ * packet tracking, we do care, so shuffle. This also
+ * compresses the 32-bit values to 8-bit
+ */
+ eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
+
+ /* store the resulting 32-bit value */
+ vst1q_lane_u32((uint32_t *)split_packet,
+ vreinterpretq_u32_u8(eop_bits), 0);
+ split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+ }
+
staterr = vshlq_n_u16(staterr, IAVF_UINT16_BIT - 1);
staterr = vreinterpretq_u16_s16(
vshrq_n_s16(vreinterpretq_s16_u16(staterr),
@@ -341,6 +379,72 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
+/*
+ * vPMD receive routine that reassembles a single burst of 32 scattered
+ * packets.
+ * packets.
+ *
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
+ */
+static __rte_always_inline uint16_t
+iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ci_rx_queue *rxq = rx_queue;
+ uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
+
+ /* get some new buffers */
+ uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+ split_flags);
+
+ if (nb_bufs == 0)
+ return 0;
+
+ /* happy day case, full burst + no packets to be assembled */
+ const uint64_t *split_fl64 = (uint64_t *)split_flags;
+ if (!rxq->pkt_first_seg &&
+ split_fl64[0] == 0 && split_fl64[1] == 0 &&
+ split_fl64[2] == 0 && split_fl64[3] == 0)
+ return nb_bufs;
+
+ /* reassemble any packets that need reassembly */
+ unsigned int i = 0;
+ if (!rxq->pkt_first_seg) {
+ /* find the first split flag, and only reassemble from there on */
+ while (i < nb_bufs && !split_flags[i])
+ i++;
+ if (i == nb_bufs)
+ return nb_bufs;
+ rxq->pkt_first_seg = rx_pkts[i];
+ }
+ return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i,
+ &split_flags[i], &rxq->pkt_first_seg, &rxq->pkt_last_seg,
+ rxq->crc_len);
+}
+
+/*
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > IAVF_VPMD_RX_BURST) {
+ uint16_t burst;
+ burst = iavf_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval, IAVF_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < IAVF_VPMD_RX_BURST)
+ return retval;
+ }
+ /* Handle the last burst, where nb_pkts <= IAVF_VPMD_RX_BURST */
+ return retval + iavf_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval, nb_pkts);
+}
+
void __rte_cold
iavf_rx_queue_release_mbufs_neon(struct ci_rx_queue *rxq)
{
diff --git a/drivers/net/intel/iavf/meson.build b/drivers/net/intel/iavf/meson.build
index f9576586f6..50630a88c8 100644
--- a/drivers/net/intel/iavf/meson.build
+++ b/drivers/net/intel/iavf/meson.build
@@ -29,7 +29,7 @@ sources = files(
if arch_subdir == 'x86'
sources_avx2 += files('iavf_rxtx_vec_avx2.c')
sources_avx512 += files('iavf_rxtx_vec_avx512.c')
-elif arch_subdir == 'arm'
+elif arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64')
sources += files('iavf_rxtx_vec_neon.c')
endif
--
2.43.0
More information about the dev
mailing list