|SUCCESS| [v1,3/3] net/iavf: add NEON support for Rx flex desc

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Fri Apr 17 16:26:30 CEST 2026


Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/163173

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-17

I will review this patch series for the Intel IAVF driver adding NEON optimizations for AArch64.

commit: 20260417130833.2503592-2-jay.wang2 at arm.com
Author: Jay Wang <jay.wang2 at arm.com>
Subject: [PATCH v1 1/3] net/iavf: add Rx scattered function for 32B desc

This patch adds NEON-optimized scattered receive functionality for the
legacy 32B descriptor format on AArch64.

> diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c

[ ... ]

> @@ -145,8 +145,6 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
>  		   struct rte_mbuf **__rte_restrict rx_pkts,
>  		   uint16_t nb_pkts, uint8_t *split_packet)
>  {
> -	RTE_SET_USED(split_packet);
> -

Does this code safely handle the split_packet parameter when it's NULL?
Removing RTE_SET_USED means every later use of split_packet must be
guarded. The hunks quoted below do wrap the vst1q_lane_u32 store in an
"if (split_packet)" check, but are ALL uses of split_packet in this
function similarly guarded? In the non-scattered path
(iavf_recv_pkts_vec), NULL is passed.

> @@ -238,6 +243,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
>  		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
>  		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
>  
> +		if (split_packet) {
> +			rte_mbuf_prefetch_part2(rx_pkts[pos]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
> +		}
> +

[ ... ]

> @@ -306,6 +318,32 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
>  		staterr = vzipq_u16(sterr_tmp1.val[1],
>  				    sterr_tmp2.val[1]).val[0];
>  
> +		/* C* extract and record EOP bit */
> +		if (split_packet) {
> +			uint8x16_t eop_shuf_mask = {
> +				0x00, 0x02, 0x04, 0x06,
> +				0xFF, 0xFF, 0xFF, 0xFF,
> +				0xFF, 0xFF, 0xFF, 0xFF,
> +				0xFF, 0xFF, 0xFF, 0xFF
> +			};
> +			uint8x16_t eop_bits;
> +
> +			/* and with mask to extract bits, flipping 1-0 */
> +			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
> +			eop_bits = vandq_u8(eop_bits, eop_check);
> +			/* the staterr values are not in order, as the count
> +			 * of dd bits doesn't care. However, for end of
> +			 * packet tracking, we do care, so shuffle. This also
> +			 * compresses the 32-bit values to 8-bit
> +			 */
> +			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
> +
> +			/* store the resulting 32-bit value */
> +			vst1q_lane_u32((uint32_t *)split_packet,
> +				vreinterpretq_u32_u8(eop_bits), 0);

Does this write to split_packet have proper bounds checking? The
split_packet array should have IAVF_VPMD_RX_BURST elements based on
the calling code, but is this loop guaranteed not to overflow it?

> +			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
> +		}
> +

[ ... ]

> @@ -341,6 +379,72 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
>  	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
>  }
>  
> +/*
> + * vPMD receive routine that reassembles single burst of 32 scattered
> + * packets.
> + *
> + * Notice:
> + * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
> + */
> +static __rte_always_inline uint16_t
> +iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +		uint16_t nb_pkts)
> +{
> +	struct ci_rx_queue *rxq = rx_queue;
> +	uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
> +
> +	/* get some new buffers */
> +	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
> +						split_flags);

Does this code handle alignment correctly? The _recv_raw_pkts_vec
function floor-aligns nb_pkts to IAVF_VPMD_DESCS_PER_LOOP. If nb_pkts
is less than IAVF_VPMD_DESCS_PER_LOOP, does nb_bufs become 0?

> +
> +	if (nb_bufs == 0)
> +		return 0;
> +
> +	/* happy day case, full burst + no packets to be assembled */
> +	const uint64_t *split_fl64 = (uint64_t *)split_flags;
> +	if (!rxq->pkt_first_seg &&
> +			split_fl64[0] == 0 && split_fl64[1] == 0 &&
> +			split_fl64[2] == 0 && split_fl64[3] == 0)
> +		return nb_bufs;

Does this pointer cast from uint8_t[] to uint64_t* have proper
alignment guarantees? On some ARM systems, unaligned accesses may cause
issues.

> +
> +	/* reassmble any packets that need reassembly */
> +	unsigned int i = 0;
> +	if (!rxq->pkt_first_seg) {
> +		/* find the first split flag, and only reassmeble then */
> +		while (i < nb_bufs && !split_flags[i])
> +			i++;
> +		if (i == nb_bufs)
> +			return nb_bufs;
> +		rxq->pkt_first_seg = rx_pkts[i];
> +	}
> +	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i,
> +			&split_flags[i], &rxq->pkt_first_seg, &rxq->pkt_last_seg,
> +			rxq->crc_len);
> +}

[ ... ]

> diff --git a/drivers/net/intel/iavf/meson.build b/drivers/net/intel/iavf/meson.build
> index f9576586f6..50630a88c8 100644
> --- a/drivers/net/intel/iavf/meson.build
> +++ b/drivers/net/intel/iavf/meson.build
> @@ -29,7 +29,7 @@ sources = files(
>  if arch_subdir == 'x86'
>      sources_avx2 += files('iavf_rxtx_vec_avx2.c')
>      sources_avx512 += files('iavf_rxtx_vec_avx512.c')
> -elif arch_subdir == 'arm'
> +elif arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64')

Does this build configuration change exclude 32-bit ARM correctly?
Should there be any compatibility considerations for existing 32-bit
ARM builds?

commit: 20260417130833.2503592-3-jay.wang2 at arm.com
Author: Jay Wang <jay.wang2 at arm.com>
Subject: [PATCH v1 2/3] net/iavf: add NEON-optimised Tx burst function

This patch adds NEON-optimized transmit burst functionality.

> diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c

[ ... ]

> +static __rte_always_inline uint16_t
> +iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		uint16_t nb_pkts)
> +{
> +	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
> +	volatile struct ci_tx_desc *txdp;
> +	struct ci_tx_entry_vec *txep;
> +	uint16_t n, nb_commit, tx_id;
> +	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
> +	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
> +	int i;
> +
> +	/* cross rx_thresh boundary is not allowed */
> +	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
> +
> +	if (txq->nb_tx_free < txq->tx_free_thresh)
> +		ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
> +
> +	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
> +	nb_commit = nb_pkts;
> +	if (unlikely(nb_pkts == 0))
> +		return 0;
> +
> +	tx_id = txq->tx_tail;
> +	txdp = &txq->ci_tx_ring[tx_id];
> +	txep = &txq->sw_ring_vec[tx_id];
> +
> +	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
> +
> +	n = (uint16_t)(txq->nb_tx_desc - tx_id);
> +	if (nb_commit >= n) {
> +		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
> +
> +		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
> +			iavf_vtx1(txdp, *tx_pkts, flags);
> +
> +		/* write with RS for the last descriptor in the segment */
> +		iavf_vtx1(txdp, *tx_pkts++, rs);

Does this code handle the case where n == 0 correctly? If tx_id could
equal txq->nb_tx_desc, n becomes 0. Note the "i < n - 1" loop itself is
safe — with i declared int, "n - 1" promotes to int (-1), so the loop
body never runs — but the unconditional iavf_vtx1(txdp, *tx_pkts++, rs)
call after the loop would still write one descriptor. Is tx_tail
guaranteed to always be strictly less than nb_tx_desc here?

> +
> +		nb_commit = (uint16_t)(nb_commit - n);
> +
> +		tx_id = 0;
> +		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +
> +		/* avoid reach the end of ring */
> +		txdp = &txq->ci_tx_ring[tx_id];
> +		txep = &txq->sw_ring_vec[tx_id];
> +	}

commit: 20260417130833.2503592-4-jay.wang2 at arm.com
Author: Jay Wang <jay.wang2 at arm.com>
Subject: [PATCH v1 3/3] net/iavf: add NEON support for Rx flex desc

This patch adds NEON-optimized paths for flex descriptor processing.

> diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c

[ ... ]

> +static __rte_always_inline void
> +iavf_flex_rxd_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts,
> +	uint32_t *ptype_tbl)
> +{
> +	const uint16x8_t ptype_mask = {
> +		0, IAVF_RX_FLEX_DESC_PTYPE_M,
> +		0, IAVF_RX_FLEX_DESC_PTYPE_M,
> +		0, IAVF_RX_FLEX_DESC_PTYPE_M,
> +		0, IAVF_RX_FLEX_DESC_PTYPE_M
> +	};
> +
> +	uint32x4_t ptype_01 = vzip1q_u32(vreinterpretq_u32_u64(descs[0]),
> +				vreinterpretq_u32_u64(descs[1]));
> +	uint32x4_t ptype_23 = vzip1q_u32(vreinterpretq_u32_u64(descs[2]),
> +				vreinterpretq_u32_u64(descs[3]));
> +	uint32x4_t ptype_all_u32 =
> +				vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(ptype_01),
> +					vreinterpretq_u64_u32(ptype_23)));
> +	uint16x8_t ptype_all = vreinterpretq_u16_u32(ptype_all_u32);
> +
> +	ptype_all = vandq_u16(ptype_all, ptype_mask);
> +
> +	rx_pkts[0]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 1)];
> +	rx_pkts[1]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 3)];
> +	rx_pkts[2]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 5)];
> +	rx_pkts[3]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 7)];

Does this code perform bounds checking on the ptype_tbl array access?
The ptype values extracted from hardware could potentially index out of
bounds if the table size is not validated.

> +}

[ ... ]

> +static __rte_always_inline uint16_t
> +iavf_recv_scattered_burst_vec_flex_rxd(void *rx_queue


More information about the test-report mailing list