|SUCCESS| [v4,1/1] net/ice: add NEON-optimised Rx/Tx vector paths
dpdklab at iol.unh.edu
Fri Apr 10 17:35:25 CEST 2026
Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/162968
_AI Code Review_
=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-10
# Review of DPDK ice NEON vector paths patch
commit: (patch from patchwork, ID 162968)
Author: Jay Wang <jay.wang2 at arm.com>
Subject: [PATCH v4 1/1] net/ice: add NEON-optimised Rx/Tx vector paths
This patch adds NEON-optimized receive and transmit paths for the Intel
ice driver on ARM64 platforms: a vector Rx path with scattered-packet
support and a matching vector Tx path.
> diff --git a/drivers/net/intel/ice/ice_rxtx_vec_neon.c b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
> new file mode 100644
> index 0000000000..6123e1ff8e
> --- /dev/null
> +++ b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
[ ... ]
> +static __rte_always_inline uint32x4_t
> +ice_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
> +{
> +#define FDID_MIS_MAGIC 0xFFFFFFFFu
Could this define be moved to file scope? A #define inside a function
body is not actually scoped to the function (it remains visible for the
rest of the translation unit unless #undef'd), and the coding style guide
discourages function-scope #define for constants that could live at file
scope.
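A minimal sketch of the suggested change (placement and the added comment
are illustrative, not part of the patch):

    /* file scope, alongside the driver's other constants */
    #define FDID_MIS_MAGIC 0xFFFFFFFFu /* assumed: marker for a missing FDIR ID */

    static __rte_always_inline uint32x4_t
    ice_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
    {
            /* ... body unchanged ... */
    }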
> + RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1u << 2));
> + RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1u << 13));
[ ... ]
> +static __rte_always_inline void
> +ice_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
> + struct rte_mbuf **rx_pkts)
Does this function properly handle NULL rx_pkts entries? If
_ice_recv_raw_pkts_vec() is called with partially allocated mbufs, would
this code access invalid memory when writing to rx_pkts[0-3]?
[ ... ]
> + /* check the additional fdir_flags if fdir is enabled */
> + if (rxq->fdir_enabled) {
> + const uint32x4_t fdir_id0_1 =
> + vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
> + vreinterpretq_u32_u64(descs[1]));
> + const uint32x4_t fdir_id2_3 =
> + vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
> + vreinterpretq_u32_u64(descs[3]));
> + const uint32x4_t fdir_id0_3 =
> + vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
> + vreinterpretq_u64_u32(fdir_id2_3)));
> + const uint32x4_t fdir_flags =
> + ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
> +
> + /* merge with fdir_flags */
> + flags = vorrq_u32(flags, fdir_flags);
> +
> + /* write fdir_id to mbuf */
> + rx_pkts[0]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 0);
> + rx_pkts[1]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 1);
> + rx_pkts[2]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 2);
> + rx_pkts[3]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 3);
Does this code unconditionally overwrite hash.fdir.hi even when RSS
offload is enabled? Could this create a conflict between RSS hash and
FDIR metadata since they share the same union?
> + }
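For context, the union in question (abridged from rte_mbuf_core.h, with
comments condensed) lays out as:

    union {
            uint32_t rss;     /* RSS hash result if RSS enabled */
            struct {
                    union {
                            struct {
                                    uint16_t hash;
                                    uint16_t id;
                            };
                            uint32_t lo;  /* overlaps hash.rss */
                    };
                    uint32_t hi;  /* flexible bytes or FD ID */
            } fdir;
            /* ... */
    } hash;

Whether the fdir.hi writes above can clash with an RSS result stored in
hash.rss follows directly from which words overlap in this layout.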
[ ... ]
> +static __rte_always_inline uint16_t
> +_ice_recv_raw_pkts_vec(struct ci_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts, uint8_t *split_packet)
[ ... ]
> + /* nb_pkts has to be floor-aligned to ICE_VPMD_DESCS_PER_LOOP */
> + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_VPMD_DESCS_PER_LOOP);
> +
> + rxdp = rxq->rx_flex_ring + rxq->rx_tail;
> + rte_prefetch0(rxdp);
> +
> + /* see if we need to rearm the Rx queue */
> + if (rxq->rxrearm_nb > ICE_VPMD_RXQ_REARM_THRESH)
> + ice_rxq_rearm(rxq);
> +
> + /* check to see if there is actually a packet available */
> + if (!(rxdp->wb.status_error0 &
> + rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
> + return 0;
Does this code correctly handle a request smaller than
ICE_VPMD_DESCS_PER_LOOP? RTE_ALIGN_FLOOR rounds such an nb_pkts down to
0, so the function returns 0 regardless of whether descriptors are ready.
Is a zero-packet return for sub-vector bursts the intended contract?
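Concretely (assuming ICE_VPMD_DESCS_PER_LOOP is 4, as the descs[4] usage
elsewhere in the patch suggests):

    uint16_t nb_pkts = 3; /* fewer than one vector's worth */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_VPMD_DESCS_PER_LOOP); /* -> 0 */
    /* the main loop then runs zero iterations and the function returns 0 */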
[ ... ]
> + /* A.1 load descs[3-0] */
> + descs[3] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3));
> + descs[2] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2));
> + descs[1] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1));
> + descs[0] = vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 0));
> +
> + /* use acquire fence to order loads of descriptor qwords */
> + rte_atomic_thread_fence(rte_memory_order_acquire);
> + /* A.2 reload qword0 to make it ordered after qword1 load */
> + descs[3] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3),
> + descs[3], 0);
> + descs[2] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2),
> + descs[2], 0);
> + descs[1] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1),
> + descs[1], 0);
> + descs[0] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp),
> + descs[0], 0);
Does this double-load pattern with the acquire fence guarantee the
intended ordering on all ARM64 implementations? Note that the DD/status
bits live in qword 1 (status_error0), not in lane 0: the initial 128-bit
load is not single-copy atomic, so the lane-0 reload appears intended to
re-read qword 0 after qword 1 has been observed. A comment stating which
qword carries DD would make the intent easier to verify.
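In sketch form, the idiom being questioned is (simplified, not the driver
code itself):

    /* qword 1 of the writeback carries status_error0, including the DD
     * bit; a 128-bit vld1q_u64 is not single-copy atomic, so its two
     * 64-bit lanes may be observed from different points in time.
     */
    uint64x2_t d = vld1q_u64(desc);                   /* both qwords */
    rte_atomic_thread_fence(rte_memory_order_acquire);
    d = vld1q_lane_u64(desc, d, 0);                   /* re-read qword 0 */
    /* qword 0 is now ordered after the qword-1 (status) load above */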
[ ... ]
> + /* C* extract and record EOP bits */
> + if (split_packet) {
> + uint8x16_t eop_bits;
> +
> + /* and with mask to extract bits, flipping 1-0 */
> + eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
> + eop_bits = vandq_u8(eop_bits, eop_check);
> +
> + /* store the resulting 32-bit value */
> + vst1q_lane_u32((uint32_t *)split_packet,
> + vreinterpretq_u32_u8(eop_bits), 0);
> + split_packet += ICE_VPMD_DESCS_PER_LOOP;
Does this code properly initialize split_packet array elements beyond
the first 4 bytes written by vst1q_lane_u32()? The function processes
ICE_VPMD_DESCS_PER_LOOP descriptors but only writes 32 bits of EOP
information. Could this leave uninitialized memory if
ICE_VPMD_DESCS_PER_LOOP > 4?
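If the one-EOP-byte-per-descriptor assumption is load-bearing, a
compile-time guard (a sketch; the driver may already carry an equivalent
check elsewhere) would document it:

    /* a 32-bit lane store yields one EOP byte for exactly 4 descriptors */
    RTE_BUILD_BUG_ON(ICE_VPMD_DESCS_PER_LOOP != 4);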
[ ... ]
> +uint16_t
> +ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> + uint16_t nb_pkts)
> +{
> + uint16_t retval = 0;
> +
> + while (nb_pkts > ICE_VPMD_RX_BURST) {
> + uint16_t burst;
> +
> + burst = ice_recv_scattered_burst_vec(rx_queue,
> + rx_pkts + retval,
> + ICE_VPMD_RX_BURST);
> + retval += burst;
> + nb_pkts -= burst;
> + if (burst < ICE_VPMD_RX_BURST)
> + return retval;
> + }
> +
> + return retval + ice_recv_scattered_burst_vec(rx_queue,
> + rx_pkts + retval,
> + nb_pkts);
Can retval ever exceed the caller's original nb_pkts? Since nb_pkts is
decremented by each burst and the loop exits early on a short burst,
retval stays bounded as long as ice_recv_scattered_burst_vec() never
returns more packets than it was asked for; if that contract holds,
rx_pkts + retval cannot run past the caller's array.
[ ... ]
> +static __rte_always_inline uint16_t
> +ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> + uint16_t nb_pkts)
> +{
> + struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
> + volatile struct ci_tx_desc *txdp;
> + struct ci_tx_entry_vec *txep;
> + uint16_t n, nb_commit, tx_id;
> + uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
> + uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
> + int i;
> +
> + /* cross rx_thresh boundary is not allowed */
> + nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
> +
> + if (txq->nb_tx_free < txq->tx_free_thresh)
> + ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);
Does this call to ci_tx_free_bufs_vec() guarantee that enough
descriptors are freed? If it frees fewer than needed, the RTE_MIN() just
below silently shrinks the burst (possibly to zero packets); is a short
return, rather than explicit error handling, the intended contract here?
> + nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
> + nb_commit = nb_pkts;
> + if (unlikely(nb_pkts == 0))
> + return 0;
[ ... ]
> + n = (uint16_t)(txq->nb_tx_desc - tx_id);
> + if (nb_commit >= n) {
> + ci_tx_backlog_entry_vec(txep, tx_pkts, n);
> +
> + for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
> + ice_vtx1(txdp, *tx_pkts, flags);
> +
> + /* write with RS for the last descriptor in the segment */
> + ice_vtx1(txdp, *tx_pkts++, rs);
> +
> + nb_commit = (uint16_t)(nb_commit - n);
> +
> + tx_id = 0;
> + txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
Does this reset of tx_next_rs correctly handle the wraparound case? If
the ring wraps and nb_commit >= n, tx_next_rs is set to tx_rs_thresh - 1,
but shouldn't it account for how many descriptors were already written in
this burst?
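A worked example may help evaluate this (all values assumed for
illustration):

    /* nb_tx_desc = 512, tx_rs_thresh = 64, tx_id = 500, nb_commit = 20:
     * n = 512 - 500 = 12, so 12 descriptors fill slots 500..511 and the
     * last one (511) is written with RS. tx_id then restarts at 0, i.e.
     * nothing has been written since that RS, so setting
     * tx_next_rs = tx_rs_thresh - 1 = 63 arms the next RS one full
     * threshold ahead of slot 0. The remaining 8 packets land in 0..7.
     */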
[ ... ]
> diff --git a/drivers/net/intel/ice/ice_rxtx.c b/drivers/net/intel/ice/ice_rxtx.c
> index 31b74be9ba..12eac6e41e 100644
[ ... ]
> +#elif defined(RTE_ARCH_ARM64)
> + [ICE_RX_NEON] = {
> + .pkt_burst = ice_recv_pkts_vec,
> + .info = "Vector Neon",
> + .features = {
> + .rx_offloads = ICE_RX_VECTOR_OFFLOAD_OFFLOADS,
> + .simd_width = RTE_VECT_SIMD_128,
> + .bulk_alloc = true
> + }
> + },
> + [ICE_RX_NEON_SCATTERED] = {
> + .pkt_burst = ice_recv_scattered_pkts_vec,
> + .info = "Vector Neon Scattered",
> + .features = {
> + .rx_offloads = ICE_RX_VECTOR_OFFLOAD_OFFLOADS,
> + .simd_width = RTE_VECT_SIMD_128,
> + .scattered = true,
> + .bulk_alloc = true
> + }
> + },
Where is ICE_RX_VECTOR_OFFLOAD_OFFLOADS defined for ARM64? This appears
to reference an x86-specific macro. Does the ice driver have a common
definition, or does this need an ARM64-specific offload mask?
[ ... ]
> +#elif defined(RTE_ARCH_ARM64)
> + if (ad->ptp_ena || !ad->rx_bulk_alloc_allowed) {
> + rx_simd_width = RTE_VECT_SIMD_DISABLED;
> + } else {
> + rx_simd_width = ice_get_max_simd_bitwidth();
> + if (rx_simd_width >= RTE_VECT_SIMD_128)
> + if (ice_rx_vec_dev_check(dev) == -1)
> + rx_simd_width = RTE_VECT_SIMD_DISABLED;
> + }
Style issue: the nested if statements should use braces according to the
coding style guide. The inner pair would become:

    if (rx_simd_width >= RTE_VECT_SIMD_128) {
            if (ice_rx_vec_dev_check(dev) == -1)
                    rx_simd_width = RTE_VECT_SIMD_DISABLED;
    }