|SUCCESS| [v5] net/ice: add NEON-optimised Rx/Tx vector paths

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Thu Apr 16 01:21:22 CEST 2026


Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/163099

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-15

This patch adds NEON vector optimizations for the ice driver on ARM64.

commit [hash from patch]
Author: Jay Wang <jay.wang2 at arm.com>

net/ice: add NEON-optimised Rx/Tx vector paths

This patch adds the NEON-optimised Rx and Tx paths to the ice driver on
AArch64.

The single-core testpmd benchmarking was conducted by Andrew Bailey on
an Arm Grace system with an Intel E810 100G NIC. The test was run with
txq/rxq = 1 and 2048 descriptors. The throughput performance uplift is
shown in the following table.

> diff --git a/drivers/net/intel/ice/ice_rxtx_vec_neon.c b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
> new file mode 100644
> index 0000000000..1d17916852
> --- /dev/null
> +++ b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
> @@ -0,0 +1,745 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2026 Intel Corporation
> + * Copyright(c) 2026 Arm Limited
> + */
> +
> +#include "ice_rxtx_vec_common.h"
> +
> +#include "../common/rx_vec_arm.h"
> +
> +#include <rte_vect.h>
> +
> +#define ICE_UINT16_BIT	(CHAR_BIT * sizeof(uint16_t))

The coding style guide specifies that tabs must be used for indentation
and alignment.  Does the assignment operator need alignment here?

[ ... ]

> +static __rte_always_inline uint32x4_t
> +ice_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
> +{
> +#define FDID_MIS_MAGIC 0xFFFFFFFFu

Is this macro used elsewhere, or can it be a local const variable?  The
coding style prefers avoiding unnecessary macros.

> +	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1u << 2));
> +	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1u << 13));

[ ... ]

> +static __rte_always_inline void
> +ice_rxq_rearm(struct ci_rx_queue *rxq)
> +{
> +	ci_rxq_rearm(rxq);
> +}

Is this wrapper function necessary? If it simply calls ci_rxq_rearm(),
could the call sites use ci_rxq_rearm() directly?

[ ... ]

> +static __rte_always_inline void
> +ice_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
> +	struct rte_mbuf **rx_pkts)
> +{

[ ... ]

> +	/**
> +	 * mask everything except checksum, RSS and VLAN flags
> +	 * bit fields defined in enum ice_rx_flex_desc_status_error_0_bits
> +	 * bit7:4 for checksum.
> +	 * bit12 for RSS indication.
> +	 * bit13 for VLAN indication.
> +	 */

The coding style guide states that multi-line comments should have the
opening /* on a separate line. Does this apply here?

> +	const uint32x4_t desc_mask = {0x30f0, 0x30f0, 0x30f0, 0x30f0};

[ ... ]

> +	/**
> +	 * map the checksum, rss and vlan fields to the checksum, rss
> +	 * and vlan flag.
> +	 */
> +	const uint8x16_t cksum_flags = {
> +		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
> +		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,

These expressions are quite long.  Should intermediate macro definitions
or const variables be used for readability?

[ ... ]

> +	/**
> +	 * extract status_error0 field from 4 descriptors,
> +	 * and mask out everything else not in desc_mask
> +	 */
> +	flags = vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
> +			vreinterpretq_u32_u64(descs[1]));

[ ... ]

> +	/* check the additional fdir_flags if fdir is enabled */
> +	if (rxq->fdir_enabled) {
> +		const uint32x4_t fdir_id0_1 =
> +			vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
> +				vreinterpretq_u32_u64(descs[1]));
> +		const uint32x4_t fdir_id2_3 =
> +			vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
> +				vreinterpretq_u32_u64(descs[3]));
> +		const uint32x4_t fdir_id0_3 =
> +			vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
> +				vreinterpretq_u64_u32(fdir_id2_3)));
> +		const uint32x4_t fdir_flags =
> +				ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
> +
> +		/* merge with fdir_flags */
> +		flags = vorrq_u32(flags, fdir_flags);
> +
> +		/* write fdir_id to mbuf */
> +		rx_pkts[0]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 0);
> +		rx_pkts[1]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 1);
> +		rx_pkts[2]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 2);
> +		rx_pkts[3]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 3);
> +	}

[ ... ]

> +static __rte_always_inline uint16_t
> +_ice_recv_raw_pkts_vec(struct ci_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> +	uint16_t nb_pkts, uint8_t *split_packet)
> +{

[ ... ]

> +#ifndef	RTE_NET_INTEL_USE_16BYTE_DESC

Does this tab+spaces mix follow the coding style for preprocessor
directives?

> +
> +		/**
> +		 * needs to load 2nd 16B of each desc for RSS hash parsing,
> +		 * will cause performance drop to get into this context.
> +		 */
> +		if (rxq->ice_vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
> +				RTE_ETH_RX_OFFLOAD_RSS_HASH) {

[ ... ]

> +			rss_hash3 =
> +				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash3),
> +					rss_hash_msk));
> +			rss_hash2 =
> +				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash2),
> +					rss_hash_msk));
> +			rss_hash1 =
> +				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash1),
> +					rss_hash_msk));
> +			rss_hash0 =
> +				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash0),
> +					rss_hash_msk));

These lines are quite long. Should they be broken up or simplified for
readability?

[ ... ]

> +		/**
> +		 * reinterpret staterr as a signed 16-bit and
> +		 * arithmetic-shift-right by 15
> +		 * each lane becomes 0xFFFF if original DD bit was 1, otherwise 0.
> +		 * then interpret back to unsigned u16 vector
> +		 */
> +		staterr = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(staterr),
> +				ICE_UINT16_BIT - 1));

[ ... ]

> +		/**
> +		 * reinterpret u16x8 vector as u64x2, and fetch the low u64
> +		 * which contains the first four 16-bit lanes, and invert
> +		 * all bits.
> +		 */
> +		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
> +
> +		if (likely(stat == 0)) {
> +			nb_pkts_recd += ICE_VPMD_DESCS_PER_LOOP;
> +		} else {
> +			nb_pkts_recd += rte_ctz64(stat) / ICE_UINT16_BIT;
> +			break;
> +		}

Does this counting logic correctly handle the DD bit positions?  The
v4 changelog mentions a fix for an "incorrect Rx vector DD bit
extraction issue."  Has that fix been verified in this implementation?

[ ... ]

> +uint16_t
> +ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +	uint16_t nb_pkts)
> +{
> +	uint16_t retval = 0;
> +
> +	while (nb_pkts > ICE_VPMD_RX_BURST) {
> +		uint16_t burst;
> +
> +		burst = ice_recv_scattered_burst_vec(rx_queue,
> +				rx_pkts + retval,
> +				ICE_VPMD_RX_BURST);
> +		retval += burst;
> +		nb_pkts -= burst;
> +		if (burst < ICE_VPMD_RX_BURST)
> +			return retval;
> +	}
> +
> +	return retval + ice_recv_scattered_burst_vec(rx_queue,
> +			rx_pkts + retval,
> +			nb_pkts);
> +}

[ ... ]

> +static __rte_always_inline uint16_t
> +ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> +	uint16_t nb_pkts)
> +{
> +	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;

[ ... ]

> +	if (txq->nb_tx_free < txq->tx_free_thresh)
> +		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);
> +
> +	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
> +	nb_commit = nb_pkts;
> +	if (unlikely(nb_pkts == 0))
> +		return 0;

[ ... ]

> diff --git a/drivers/net/intel/ice/meson.build b/drivers/net/intel/ice/meson.build
> index 293577676f..a205304c89 100644
> --- a/drivers/net/intel/ice/meson.build
> +++ b/drivers/net/intel/ice/meson.build
> @@ -33,6 +33,8 @@ endif
>  if arch_subdir == 'x86'
>      sources_avx2 += files('ice_rxtx_vec_avx2.c')
>      sources_avx512 += files('ice_rxtx_vec_avx512.c')
> +elif arch_subdir == 'arm' and dpdk_conf.get('RTE_ARCH_64')
> +    sources += files('ice_rxtx_vec_neon.c')
>  endif


More information about the test-report mailing list