|SUCCESS| [v5] net/ice: add NEON-optimised Rx/Tx vector paths

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Wed Apr 15 23:54:04 CEST 2026


Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/163099

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-15

Commit: [git sha would be extracted from patch]
Author: Jay Wang <jay.wang2 at arm.com>

Subject: [PATCH v5] net/ice: add NEON-optimised Rx/Tx vector paths

This patch adds the NEON-optimised Rx and Tx paths to the ice driver on
AArch64. The implementation provides vector operations for improved packet
processing performance on Arm platforms.

> diff --git a/.mailmap b/.mailmap
> index 0e0d83e1c6..873bc62d0d 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -695,6 +695,7 @@ Javen Xu <javen_xu at realsil.com.cn>
>  Jay Ding <jay.ding at broadcom.com>
>  Jay Jayatheerthan <jay.jayatheerthan at intel.com>
>  Jay Rolette <rolette at infiniteio.com>
> +Jay Wang <jay.wang2 at arm.com>
>  Jay Zhou <jianjay.zhou at huawei.com>
>  Jayaprakash Shanmugam <jayaprakash.shanmugam at intel.com>
>  Jean Dao <jean.dao at 6wind.com>

[ ... ]

> diff --git a/drivers/net/intel/ice/ice_rxtx_vec_neon.c b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
> new file mode 100644
> index 0000000000..1d17916852
> --- /dev/null
> +++ b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
> @@ -0,0 +1,745 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2026 Intel Corporation
> + * Copyright(c) 2026 Arm Limited
> + */
> +
> +#include "ice_rxtx_vec_common.h"
> +
> +#include "../common/rx_vec_arm.h"

Does this code follow the DPDK coding style guide regarding header
include ordering? Per coding_style.rst, system headers should come
first, then DPDK headers, then driver headers. Here we have driver
headers first, then what appears to be a common header that may not
exist based on the file path.

> +
> +#include <rte_vect.h>
> +
> +#define ICE_UINT16_BIT	(CHAR_BIT * sizeof(uint16_t))

Can this macro be replaced with a standard definition? The coding style
guide discourages defining constants that are derivable from standard
values.

> +
> +static __rte_always_inline uint32x4_t
> +ice_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
> +{
> +#define FDID_MIS_MAGIC 0xFFFFFFFFu

Does this code leak the macro definition? According to coding_style.rst,
local macros should be undefined when no longer needed, particularly
those defined within functions.

> +	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1u << 2));
> +	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1u << 13));
> +
> +	const uint32x4_t pkt_fdir_bit = vdupq_n_u32((uint32_t)(RTE_MBUF_F_RX_FDIR |
> +		RTE_MBUF_F_RX_FDIR_ID));

Does the indentation here follow the DPDK style? Continuation lines
should align with the opening parenthesis or be indented by 4 spaces.

[ ... ]

> +static __rte_always_inline void
> +ice_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
> +	struct rte_mbuf **rx_pkts)

Does the function parameter indentation follow the style guide? Function
parameters on subsequent lines should align with the opening parenthesis
or be indented consistently.

> +{
> +	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};

[ ... ]

> +	/* check the additional fdir_flags if fdir is enabled */
> +	if (rxq->fdir_enabled) {

Is this conditional check thread-safe? Can rxq->fdir_enabled change
during execution without proper synchronization?

> +		const uint32x4_t fdir_id0_1 =
> +			vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
> +				vreinterpretq_u32_u64(descs[1]));
> +		const uint32x4_t fdir_id2_3 =
> +			vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
> +				vreinterpretq_u32_u64(descs[3]));
> +		const uint32x4_t fdir_id0_3 =
> +			vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
> +				vreinterpretq_u64_u32(fdir_id2_3)));

[ ... ]

> +static __rte_always_inline uint16_t
> +_ice_recv_raw_pkts_vec(struct ci_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> +	uint16_t nb_pkts, uint8_t *split_packet)
> +{
> +	volatile union ci_rx_flex_desc *rxdp;
> +	struct ci_rx_entry *sw_ring;
> +	uint16_t nb_pkts_recd;
> +	int pos;
> +	uint32_t *ptype_tbl = rxq->ice_vsi->adapter->ptype_tbl;

Can this pointer chain be null? Does this code verify that
rxq->ice_vsi->adapter is valid before dereferencing?

> +
> +	uint16x8_t crc_adjust = {
> +		0, 0,          /* ignore pkt_type field */
> +		rxq->crc_len,  /* sub crc on pkt_len */
> +		0,             /* ignore high-16bits of pkt_len */
> +		rxq->crc_len,  /* sub crc on data_len */
> +		0, 0, 0        /* ignore non-length field */
> +	};

[ ... ]

> +	/* check to see if there is actually a packet available */
> +	if (!(rxdp->wb.status_error0 &
> +	      rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
> +		return 0;

[ ... ]

> +		/* A.1 load descs[3-0] */
> +		descs[3] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3));
> +		descs[2] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2));
> +		descs[1] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1));
> +		descs[0] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 0));

Does this code have whitespace alignment issues? There appear to be
extra spaces before the equals sign which may violate style guidelines.

> +
> +		/* use acquire fence to order loads of descriptor qwords */
> +		rte_atomic_thread_fence(rte_memory_order_acquire);
> +		/* A.2 reload qword0 to make it ordered after qword1 load */
> +		descs[3] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3),
> +				descs[3], 0);
> +		descs[2] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2),
> +				descs[2], 0);
> +		descs[1] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1),
> +				descs[1], 0);
> +		descs[0] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp),
> +				descs[0], 0);

[ ... ]

> +#ifndef	RTE_NET_INTEL_USE_16BYTE_DESC

Does this code have whitespace issues? There appears to be a tab
character after ifndef which should be a space per style guidelines.

> +
> +		/**
> +		 * needs to load 2nd 16B of each desc for RSS hash parsing,
> +		 * will cause performance drop to get into this context.
> +		 */
> +		if (rxq->ice_vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
> +				RTE_ETH_RX_OFFLOAD_RSS_HASH) {

[ ... ]

> +uint16_t
> +ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +	uint16_t nb_pkts)
> +{
> +	uint16_t retval = 0;
> +
> +	while (nb_pkts > ICE_VPMD_RX_BURST) {
> +		uint16_t burst;
> +
> +		burst = ice_recv_scattered_burst_vec(rx_queue,
> +				rx_pkts + retval,
> +				ICE_VPMD_RX_BURST);
> +		retval += burst;
> +		nb_pkts -= burst;
> +		if (burst < ICE_VPMD_RX_BURST)
> +			return retval;
> +	}
> +
> +	return retval + ice_recv_scattered_burst_vec(rx_queue,
> +			rx_pkts + retval,
> +			nb_pkts);
> +}

[ ... ]

> +static __rte_always_inline uint16_t
> +ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> +	uint16_t nb_pkts)
> +{
> +	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
> +	volatile struct ci_tx_desc *txdp;
> +	struct ci_tx_entry_vec *txep;
> +	uint16_t n, nb_commit, tx_id;
> +	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
> +	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
> +	int i;
> +
> +	/* cross rx_thresh boundary is not allowed */
> +	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

[ ... ]

> +	if (nb_commit >= n) {
> +		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
> +
> +		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
> +			ice_vtx1(txdp, *tx_pkts, flags);
> +
> +		/* write with RS for the last descriptor in the segment */
> +		ice_vtx1(txdp, *tx_pkts++, rs);
> +
> +		nb_commit = (uint16_t)(nb_commit - n);
> +
> +		tx_id = 0;
> +		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +
> +		/* avoid reach the end of ring */
> +		txdp = &txq->ci_tx_ring[tx_id];
> +		txep = &txq->sw_ring_vec[tx_id];
> +	}
> +
> +	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
> +
> +	ice_vtx(txdp, tx_pkts, nb_commit, flags);
> +
> +	tx_id = (uint16_t)(tx_id + nb_commit);
> +	if (tx_id > txq->tx_next_rs) {
> +		txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
> +			rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
> +					 CI_TXD_QW1_CMD_S);
> +		txq->tx_next_rs =
> +			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
> +	}
> +
> +	txq->tx_tail = tx_id;
> +
> +	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
> +
> +	return nb_pkts;
> +}

[ ... ]

> +int __rte_cold
> +ice_rxq_vec_setup(struct ci_rx_queue *rxq)
> +{
> +	rxq->vector_rx = 1;

Does this code use an implicit boolean comparison? Per coding_style.rst,
explicit comparisons should be used for integer values being assigned to
boolean-like fields.

> +	rxq->mbuf_initializer = ci_rxq_mbuf_initializer(rxq->port_id);
> +	return 0;
> +}


More information about the test-report mailing list