|SUCCESS| [v3,3/3] net/iavf: add NEON support for Rx flex desc
dpdklab at iol.unh.edu
dpdklab at iol.unh.edu
Wed May 6 00:49:03 CEST 2026
Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/163581
_AI Code Review_
=== Patch Review: series.patch (via Claude) ===
Review date: 2026-05-05
git sha: 20260505110745.206312-2-jay.wang2 at arm.com (patch 1/3)
Author: Jay Wang <jay.wang2 at arm.com>
Subject: [PATCH v3 1/3] net/iavf: add Rx scattered function for 32B desc
This patch adds the NEON-optimised scattered burst function on AArch64
to leverage the NEON-optimised Rx raw burst function to handle scattered
packets for the legacy 32B descriptor.
> diff --git a/drivers/net/intel/iavf/iavf_rxtx.c b/drivers/net/intel/iavf/iavf_rxtx.c
> index 4ff6c18dc4..15566a0e18 100644
> --- a/drivers/net/intel/iavf/iavf_rxtx.c
> +++ b/drivers/net/intel/iavf/iavf_rxtx.c
[ ... ]
> @@ -3551,16 +3551,26 @@ static const struct ci_rx_path_info iavf_rx_path_infos[] = {
> }
> },
> #endif
> -#elif defined RTE_ARCH_ARM
> +#elif defined(RTE_ARCH_ARM64)
> [IAVF_RX_NEON] = {
> .pkt_burst = iavf_recv_pkts_vec,
> .info = "Vector Neon",
> .features = {
> - .rx_offloads = IAVF_RX_SCALAR_OFFLOADS,
> + .rx_offloads = IAVF_RX_VECTOR_OFFLOADS,
> .simd_width = RTE_VECT_SIMD_128,
> .bulk_alloc = true
> }
> },
Does this change to RTE_ARCH_ARM64 and IAVF_RX_VECTOR_OFFLOADS break
existing 32-bit ARM configurations? The original code supported
RTE_ARCH_ARM generally.
> diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
> index 28c90b2a72..45e377d728 100644
> --- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
> +++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
[ ... ]
> @@ -145,8 +145,6 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
> struct rte_mbuf **__rte_restrict rx_pkts,
> uint16_t nb_pkts, uint8_t *split_packet)
> {
> - RTE_SET_USED(split_packet);
> -
The RTE_SET_USED macro was removed but split_packet is now conditionally
used. Does this trigger unused parameter warnings when split_packet is
NULL in non-scattered receive paths?
> @@ -238,6 +243,13 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
> vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
> vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
>
> + if (split_packet) {
> + rte_mbuf_prefetch_part2(rx_pkts[pos]);
> + rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> + rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> + rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
> + }
> +
[ ... ]
> @@ -306,6 +318,32 @@ _recv_raw_pkts_vec(struct ci_rx_queue *__rte_restrict rxq,
> staterr = vzipq_u16(sterr_tmp1.val[1],
> sterr_tmp2.val[1]).val[0];
>
> + /* C* extract and record EOP bit */
> + if (split_packet) {
> + uint8x16_t eop_shuf_mask = {
> + 0x00, 0x02, 0x04, 0x06,
> + 0xFF, 0xFF, 0xFF, 0xFF,
> + 0xFF, 0xFF, 0xFF, 0xFF,
> + 0xFF, 0xFF, 0xFF, 0xFF
> + };
> + uint8x16_t eop_bits;
> +
> + /* and with mask to extract bits, flipping 1-0 */
> + eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
> + eop_bits = vandq_u8(eop_bits, eop_check);
> + /* the staterr values are not in order, as the count
> + * of dd bits doesn't care. However, for end of
> + * packet tracking, we do care, so shuffle. This also
> + * compresses the 32-bit values to 8-bit
> + */
This multi-line comment does not follow the required style. Should be:
/* The staterr values are not in order, ...
* of dd bits doesn't care ...
*/
> + eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
> +
> + /* store the resulting 32-bit value */
> + vst1q_lane_u32((uint32_t *)split_packet,
> + vreinterpretq_u32_u8(eop_bits), 0);
Does this pointer cast require alignment guarantees? Can split_packet
be misaligned?
> + split_packet += IAVF_VPMD_DESCS_PER_LOOP;
> + }
> +
[ ... ]
> @@ -341,6 +379,72 @@ iavf_recv_pkts_vec(void *__rte_restrict rx_queue,
> return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
> }
>
> +/*
> + * vPMD receive routine that reassembles single burst of 32 scattered
> + * packets.
> + *
> + * Notice:
> + * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
> + */
> +static __rte_always_inline uint16_t
> +iavf_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts)
> +{
> + struct ci_rx_queue *rxq = rx_queue;
> + uint8_t split_flags[IAVF_VPMD_RX_BURST] = {0};
> +
> + /* get some new buffers */
> + uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
> + split_flags);
> +
> + if (nb_bufs == 0)
> + return 0;
> +
> + /* happy day case, full burst + no packets to be assembled */
> + const uint64_t *split_fl64 = (uint64_t *)split_flags;
Does this cast require split_flags to be aligned to 8 bytes? Stack
allocated arrays may not have this guarantee.
> + if (!rxq->pkt_first_seg &&
> + split_fl64[0] == 0 && split_fl64[1] == 0 &&
> + split_fl64[2] == 0 && split_fl64[3] == 0)
Does the line break placement follow the continuation rules? The logical
operator should be at the end of the previous line.
> + return nb_bufs;
> +
> + /* reassmble any packets that need reassembly */
Typo: "reassmble" should be "reassemble"
> + unsigned int i = 0;
Variable declarations should be at the start of the block.
> + if (!rxq->pkt_first_seg) {
> + /* find the first split flag, and only reassmeble then */
Typo: "reassmeble" should be "reassemble"
> + while (i < nb_bufs && !split_flags[i])
> + i++;
> + if (i == nb_bufs)
> + return nb_bufs;
> + rxq->pkt_first_seg = rx_pkts[i];
> + }
> + return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i,
> + &split_flags[i], &rxq->pkt_first_seg, &rxq->pkt_last_seg,
> + rxq->crc_len);
Does this code handle the case where reassembly fails? Does
ci_rx_reassemble_packets() return an error code?
---
git sha: 20260505110745.206312-3-jay.wang2 at arm.com (patch 2/3)
Author: Jay Wang <jay.wang2 at arm.com>
Subject: [PATCH v3 2/3] net/iavf: add NEON-optimised Tx burst function
This patch adds the NEON-optimised Tx burst function for Intel IAVF
driver on AArch64.
> diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
> index 45e377d728..9c91b6bac1 100644
> --- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
> +++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
[ ... ]
> +static __rte_always_inline uint16_t
> +iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> + uint16_t nb_pkts)
> +{
> + struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
> + volatile struct ci_tx_desc *txdp;
> + struct ci_tx_entry_vec *txep;
> + uint16_t n, nb_commit, tx_id;
> + uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
> + uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
> + int i;
> +
> + /* cross rx_thresh boundary is not allowed */
This comment mentions rx_thresh but should mention tx_rs_thresh for a
transmit function.
> + nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
> +
> + if (txq->nb_tx_free < txq->tx_free_thresh)
> + ci_tx_free_bufs_vec(txq, iavf_tx_desc_done, false);
> +
> + nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
> + nb_commit = nb_pkts;
> + if (unlikely(nb_pkts == 0))
> + return 0;
> +
> + tx_id = txq->tx_tail;
> + txdp = &txq->ci_tx_ring[tx_id];
> + txep = &txq->sw_ring_vec[tx_id];
> +
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
> +
> + n = (uint16_t)(txq->nb_tx_desc - tx_id);
> + if (nb_commit >= n) {
> + ci_tx_backlog_entry_vec(txep, tx_pkts, n);
> +
> + for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
> + iavf_vtx1(txdp, *tx_pkts, flags);
> +
> + /* write with RS for the last descriptor in the segment */
> + iavf_vtx1(txdp, *tx_pkts++, rs);
> +
> + nb_commit = (uint16_t)(nb_commit - n);
> +
> + tx_id = 0;
> + txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +
> + /* avoid reach the end of ring */
> + txdp = &txq->ci_tx_ring[tx_id];
> + txep = &txq->sw_ring_vec[tx_id];
> + }
> +
> + ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
> +
> + iavf_vtx(txdp, tx_pkts, nb_commit, flags);
> +
> + tx_id = (uint16_t)(tx_id + nb_commit);
> + if (tx_id > txq->tx_next_rs) {
> + txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
> + rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
> + CI_TXD_QW1_CMD_S);
> + txq->tx_next_rs =
> + (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
> + }
> +
> + txq->tx_tail = tx_id;
> +
> + IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
> +
> + return nb_pkts;
> +}
---
git sha: 20260505110745.206312-4-jay.wang2 at arm.com (patch 3/3)
Author: Jay Wang <jay.wang2 at arm.com>
Subject: [PATCH v3 3/3] net/iavf: add NEON support for Rx flex desc
This patch adds the NEON-optimised Rx paths to process receive flex
descriptor.
> diff --git a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
> index 9c91b6bac1..e4c3f073cb 100644
> --- a/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
> +++ b/drivers/net/intel/iavf/iavf_rxtx_vec_neon.c
[ ... ]
> @@ -16,12 +16,445 @@
>
> #include "../common/rx_vec_arm.h
More information about the test-report
mailing list