|SUCCESS| [v4,1/1] net/ice: add NEON-optimised Rx/Tx vector paths

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Fri Apr 10 16:16:07 CEST 2026


Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/162968

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-10

I'll review this patch, which adds NEON-optimized Rx/Tx vector paths to the Intel ICE driver for the ARM64 architecture.

commit: [Not provided in patch file]
Author: Jay Wang <jay.wang2 at arm.com>

[PATCH v4 1/1] net/ice: add NEON-optimised Rx/Tx vector paths

This patch adds the NEON-optimised Rx and Tx paths to the ice driver.

> diff --git a/drivers/net/intel/ice/ice_rxtx_vec_neon.c b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
> new file mode 100644
> index 0000000000..6123e1ff8e
> --- /dev/null
> +++ b/drivers/net/intel/ice/ice_rxtx_vec_neon.c

[ ... ]

> +static __rte_always_inline void
> +ice_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
> +	struct rte_mbuf **rx_pkts)
> +{

[ ... ]

> +	/* check the additional fdir_flags if fdir is enabled */
> +	if (rxq->fdir_enabled) {
> +		const uint32x4_t fdir_id0_1 =
> +			vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
> +				vreinterpretq_u32_u64(descs[1]));
> +		const uint32x4_t fdir_id2_3 =
> +			vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
> +				vreinterpretq_u32_u64(descs[3]));
> +		const uint32x4_t fdir_id0_3 =
> +			vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
> +				vreinterpretq_u64_u32(fdir_id2_3)));
> +		const uint32x4_t fdir_flags =
> +				ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
> +
> +		/* merge with fdir_flags */
> +		flags = vorrq_u32(flags, fdir_flags);
> +
> +		/* write fdir_id to mbuf */
> +		rx_pkts[0]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 0);
> +		rx_pkts[1]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 1);
> +		rx_pkts[2]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 2);
> +		rx_pkts[3]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 3);
> +	}

Does this code corrupt the RSS hash values that may have been written to
the mbuf hash union earlier in the function? The mbuf hash field is a
union, and writing to hash.fdir.hi could overwrite hash.rss values that
were previously set.

[ ... ]

> +static __rte_always_inline uint16_t
> +_ice_recv_raw_pkts_vec(struct ci_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> +	uint16_t nb_pkts, uint8_t *split_packet)
> +{

[ ... ]

> +#ifndef	RTE_NET_INTEL_USE_16BYTE_DESC
> +
> +		/**
> +		 * needs to load 2nd 16B of each desc for RSS hash parsing,
> +		 * will cause performance drop to get into this context.
> +		 */
> +		if (rxq->ice_vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
> +				RTE_ETH_RX_OFFLOAD_RSS_HASH) {

Does this check need to ensure rxq->ice_vsi and rxq->ice_vsi->adapter
are not NULL before dereferencing? Can this code path be reached with
these pointers uninitialized?

[ ... ]

> +static uint16_t
> +ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +	uint16_t nb_pkts)
> +{
> +	struct ci_rx_queue *rxq = rx_queue;
> +	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> +
> +	/* get some new buffers */
> +	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
> +						split_flags);
> +	if (nb_bufs == 0)
> +		return 0;
> +
> +	/* happy day case, full burst + no packets to be joined */
> +	const uint64_t *split_fl64 = (uint64_t *)split_flags;

Does this cast introduce alignment issues? The split_flags array is
declared as uint8_t but is accessed through a uint64_t pointer. Could
this cause unaligned memory accesses on ARM64?

[ ... ]

> +static __rte_always_inline void
> +ice_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
> +	 uint64_t flags)
> +{
> +	uint64_t high_qw = (CI_TX_DESC_DTYPE_DATA |
> +		((uint64_t)flags << CI_TXD_QW1_CMD_S) |
> +		((uint64_t)pkt->data_len << CI_TXD_QW1_TX_BUF_SZ_S));
> +
> +	uint64x2_t descriptor = {rte_pktmbuf_iova(pkt), high_qw};
> +	vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor);
> +}

Does this function need to handle the case where pkt is NULL? Can
rte_pktmbuf_iova() be called safely on a NULL pointer?

[ ... ]

> +static __rte_always_inline uint16_t
> +ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> +	uint16_t nb_pkts)
> +{
> +	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
> +	volatile struct ci_tx_desc *txdp;
> +	struct ci_tx_entry_vec *txep;
> +	uint16_t n, nb_commit, tx_id;
> +	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
> +	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
> +	int i;
> +
> +	/* cross rx_thresh boundary is not allowed */
> +	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
> +
> +	if (txq->nb_tx_free < txq->tx_free_thresh)
> +		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);
> +
> +	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
> +	nb_commit = nb_pkts;
> +	if (unlikely(nb_pkts == 0))
> +		return 0;

Does this code handle the case where nb_tx_free could become zero after
calling ci_tx_free_bufs_vec()? Would this cause an infinite loop if
packets continuously fail to be freed?

[ ... ]

> diff --git a/drivers/net/intel/ice/ice_rxtx.c b/drivers/net/intel/ice/ice_rxtx.c
> index 31b74be9ba..12eac6e41e 100644
> --- a/drivers/net/intel/ice/ice_rxtx.c
> +++ b/drivers/net/intel/ice/ice_rxtx.c

[ ... ]

> @@ -3404,6 +3435,14 @@ ice_set_rx_function(struct rte_eth_dev *dev)
>  		for (i = 0; i < dev->data->nb_rx_queues; i++)
>  			if (dev->data->rx_queues[i])
>  				ice_rxq_vec_setup(dev->data->rx_queues[i]);
> +#elif defined(RTE_ARCH_ARM64)
> +	int i;
> +
> +	if (ice_rx_path_infos[ad->rx_func_type].features.simd_width >= RTE_VECT_SIMD_128)
> +		/* Vector function selected. Prepare the rxq accordingly. */
> +		for (i = 0; i < dev->data->nb_rx_queues; i++)
> +			if (dev->data->rx_queues[i])
> +				ice_rxq_vec_setup(dev->data->rx_queues[i]);
>  #endif

Does this code check whether dev->data->rx_queues is NULL before
accessing it? Can this cause a NULL pointer dereference when
dev->data->nb_rx_queues is non-zero but rx_queues is not allocated?

[ ... ]

> @@ -3729,8 +3778,13 @@ ice_set_tx_function(struct rte_eth_dev *dev)
>  						ICE_TX_DEFAULT);
>  
>  out:
> +#if defined(RTE_ARCH_X86)
>  	ad->tx_vec_allowed =
>  		(ice_tx_path_infos[ad->tx_func_type].features.simd_width >= RTE_VECT_SIMD_256);
> +#elif defined(RTE_ARCH_ARM64)
> +	ad->tx_vec_allowed =
> +		(ice_tx_path_infos[ad->tx_func_type].features.simd_width >= RTE_VECT_SIMD_128);
> +#endif

Should there be a fallback case when neither RTE_ARCH_X86 nor
RTE_ARCH_ARM64 is defined? Does tx_vec_allowed remain uninitialized on
other architectures?


More information about the test-report mailing list