[PATCH v1 1/1] net/ice: add Neon-optimised Rx/Tx vector paths

Jay Wang jay.wang2 at arm.com
Mon Mar 23 19:32:31 CET 2026


This patch adds the Neon-optimised Rx and Tx paths to the ice driver.

Tested on Ampere One platform with Intel E810-C NIC and 100G connection.
Tested with a single core and testpmd io forwarding mode. Observed
~30% performance boost in the above test compared to the default scalar
path.

Signed-off-by: Jay Wang <jay.wang2 at arm.com>
---
 .mailmap                                  |   1 +
 drivers/net/intel/ice/ice_ethdev.h        |   3 +
 drivers/net/intel/ice/ice_rxtx.c          |  53 +-
 drivers/net/intel/ice/ice_rxtx.h          |   6 +
 drivers/net/intel/ice/ice_rxtx_vec_neon.c | 747 ++++++++++++++++++++++
 drivers/net/intel/ice/meson.build         |   2 +
 6 files changed, 810 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/intel/ice/ice_rxtx_vec_neon.c

diff --git a/.mailmap b/.mailmap
index beccc84425..dfe92b0399 100644
--- a/.mailmap
+++ b/.mailmap
@@ -695,6 +695,7 @@ Javen Xu <javen_xu at realsil.com.cn>
 Jay Ding <jay.ding at broadcom.com>
 Jay Jayatheerthan <jay.jayatheerthan at intel.com>
 Jay Rolette <rolette at infiniteio.com>
+Jay Wang <jay.wang2 at arm.com>
 Jay Zhou <jianjay.zhou at huawei.com>
 Jayaprakash Shanmugam <jayaprakash.shanmugam at intel.com>
 Jean Dao <jean.dao at 6wind.com>
diff --git a/drivers/net/intel/ice/ice_ethdev.h b/drivers/net/intel/ice/ice_ethdev.h
index 4b3718f715..f6fd3bf106 100644
--- a/drivers/net/intel/ice/ice_ethdev.h
+++ b/drivers/net/intel/ice/ice_ethdev.h
@@ -204,6 +204,8 @@ enum ice_rx_func_type {
 	ICE_RX_AVX512_SCATTERED,
 	ICE_RX_AVX512_OFFLOAD,
 	ICE_RX_AVX512_SCATTERED_OFFLOAD,
+	ICE_RX_NEON,
+	ICE_RX_NEON_SCATTERED,
 };
 
 enum ice_tx_func_type {
@@ -213,6 +215,7 @@ enum ice_tx_func_type {
 	ICE_TX_AVX2_OFFLOAD,
 	ICE_TX_AVX512,
 	ICE_TX_AVX512_OFFLOAD,
+	ICE_TX_NEON,
 };
 
 struct ice_adapter;
diff --git a/drivers/net/intel/ice/ice_rxtx.c b/drivers/net/intel/ice/ice_rxtx.c
index 31b74be9ba..b34231c212 100644
--- a/drivers/net/intel/ice/ice_rxtx.c
+++ b/drivers/net/intel/ice/ice_rxtx.c
@@ -2515,7 +2515,9 @@ ice_dev_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
 	    ad->rx_func_type == ICE_RX_AVX512 ||
 	    ad->rx_func_type == ICE_RX_AVX512_SCATTERED ||
 	    ad->rx_func_type == ICE_RX_AVX512_OFFLOAD ||
-	    ad->rx_func_type == ICE_RX_AVX512_SCATTERED_OFFLOAD)
+	    ad->rx_func_type == ICE_RX_AVX512_SCATTERED_OFFLOAD ||
+	    ad->rx_func_type == ICE_RX_NEON ||
+	    ad->rx_func_type == ICE_RX_NEON_SCATTERED)
 		return ptypes;
 
 	return NULL;
@@ -3356,6 +3358,26 @@ static const struct ci_rx_path_info ice_rx_path_infos[] = {
 		}
 	},
 #endif
+#elif defined(RTE_ARCH_ARM64)
+	[ICE_RX_NEON] = {
+		.pkt_burst = ice_recv_pkts_vec,
+		.info = "Vector Neon",
+		.features = {
+			.rx_offloads = ICE_RX_VECTOR_OFFLOAD_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128,
+			.bulk_alloc = true
+		}
+	},
+	[ICE_RX_NEON_SCATTERED] = {
+		.pkt_burst = ice_recv_scattered_pkts_vec,
+		.info = "Vector Neon Scattered",
+		.features = {
+			.rx_offloads = ICE_RX_VECTOR_OFFLOAD_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128,
+			.scattered = true,
+			.bulk_alloc = true
+		}
+	},
 #endif
 };
 
@@ -3384,6 +3406,15 @@ ice_set_rx_function(struct rte_eth_dev *dev)
 			if (ice_rx_vec_dev_check(dev) == -1)
 				rx_simd_width = RTE_VECT_SIMD_DISABLED;
 	}
+#elif defined(RTE_ARCH_ARM64)
+	if (ad->ptp_ena || !ad->rx_bulk_alloc_allowed) {
+		rx_simd_width = RTE_VECT_SIMD_DISABLED;
+	} else {
+		rx_simd_width = ice_get_max_simd_bitwidth();
+		if (rx_simd_width >= RTE_VECT_SIMD_128)
+			if (ice_rx_vec_dev_check(dev) == -1)
+				rx_simd_width = RTE_VECT_SIMD_DISABLED;
+	}
 #endif
 
 	req_features.simd_width = rx_simd_width;
@@ -3404,6 +3435,14 @@ ice_set_rx_function(struct rte_eth_dev *dev)
 		for (i = 0; i < dev->data->nb_rx_queues; i++)
 			if (dev->data->rx_queues[i])
 				ice_rxq_vec_setup(dev->data->rx_queues[i]);
+#elif defined(RTE_ARCH_ARM64)
+	int i;
+
+	if (ice_rx_path_infos[ad->rx_func_type].features.simd_width >= RTE_VECT_SIMD_128)
+		/* Vector function selected. Prepare the rxq accordingly. */
+		for (i = 0; i < dev->data->nb_rx_queues; i++)
+			if (dev->data->rx_queues[i])
+				ice_rxq_vec_setup(dev->data->rx_queues[i]);
 #endif
 
 out:
@@ -3535,6 +3574,16 @@ static const struct ci_tx_path_info ice_tx_path_infos[] = {
 		.pkt_prep = ice_prep_pkts
 	},
 #endif
+#elif defined(RTE_ARCH_ARM64)
+	[ICE_TX_NEON] = {
+		.pkt_burst = ice_xmit_pkts_vec,
+		.info = "Vector Neon",
+		.features = {
+			.tx_offloads = ICE_TX_VECTOR_OFFLOADS,
+			.simd_width = RTE_VECT_SIMD_128
+		},
+		.pkt_prep = rte_eth_tx_pkt_prepare_dummy
+	},
 #endif
 };
 
@@ -3718,7 +3767,7 @@ ice_set_tx_function(struct rte_eth_dev *dev)
 
 	req_features.simple_tx = ad->tx_simple_allowed;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	if (ice_tx_vec_dev_check(dev) != -1)
 		req_features.simd_width = ice_get_max_simd_bitwidth();
 #endif
diff --git a/drivers/net/intel/ice/ice_rxtx.h b/drivers/net/intel/ice/ice_rxtx.h
index 77ed41f9fd..999b6b30d6 100644
--- a/drivers/net/intel/ice/ice_rxtx.h
+++ b/drivers/net/intel/ice/ice_rxtx.h
@@ -261,6 +261,12 @@ const uint32_t *ice_dev_supported_ptypes_get(struct rte_eth_dev *dev,
 void ice_select_rxd_to_pkt_fields_handler(struct ci_rx_queue *rxq,
 					  uint32_t rxdid);
 
+uint16_t ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts);
+uint16_t ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts);
+uint16_t ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+				uint16_t nb_pkts);
 int ice_rx_vec_dev_check(struct rte_eth_dev *dev);
 int ice_tx_vec_dev_check(struct rte_eth_dev *dev);
 int ice_rxq_vec_setup(struct ci_rx_queue *rxq);
diff --git a/drivers/net/intel/ice/ice_rxtx_vec_neon.c b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
new file mode 100644
index 0000000000..afd038efb5
--- /dev/null
+++ b/drivers/net/intel/ice/ice_rxtx_vec_neon.c
@@ -0,0 +1,747 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Intel Corporation
+ * Copyright(c) 2026 Arm Limited
+ */
+
+#include "ice_rxtx_vec_common.h"
+
+#include "../common/rx_vec_arm.h"
+
+#include <rte_vect.h>
+
+#define ICE_UINT16_BIT	(CHAR_BIT * sizeof(uint16_t))
+
+static __rte_always_inline uint32x4_t
+ice_flex_rxd_to_fdir_flags_vec(const uint32x4_t fdir_id0_3)
+{
+#define FDID_MIS_MAGIC 0xFFFFFFFFu
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1u << 2));
+	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1u << 13));
+
+	const uint32x4_t pkt_fdir_bit = vdupq_n_u32((uint32_t)(RTE_MBUF_F_RX_FDIR |
+		RTE_MBUF_F_RX_FDIR_ID));
+
+	const uint32x4_t fdir_mis_mask = vdupq_n_u32(FDID_MIS_MAGIC);
+
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+	uint32x4_t fdir_mask = vceqq_u32(fdir_id0_3, fdir_mis_mask);
+
+	/* XOR with 0xFFFFFFFF bit-reverses the mask */
+	fdir_mask = veorq_u32(fdir_mask, fdir_mis_mask);
+	const uint32x4_t fdir_flags = vandq_u32(fdir_mask, pkt_fdir_bit);
+
+	return fdir_flags;
+}
+
+static __rte_always_inline void
+ice_rxq_rearm(struct ci_rx_queue *rxq)
+{
+	ci_rxq_rearm(rxq);
+}
+
+static __rte_always_inline void
+ice_flex_rxd_to_olflags_v(struct ci_rx_queue *rxq, uint64x2_t descs[4],
+	struct rte_mbuf **rx_pkts)
+{
+	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
+	uint64x2_t rearm0, rearm1, rearm2, rearm3;
+
+	uint32x4_t tmp_desc, flags, rss_vlan;
+
+	/* mask everything except checksum, RSS and VLAN flags
+	 * bit fields defined in enum ice_rx_flex_desc_status_error_0_bits
+	 * bit7:4 for checksum.
+	 * bit12 for RSS indication.
+	 * bit13 for VLAN indication.
+	 */
+	const uint32x4_t desc_mask = {0x30f0, 0x30f0, 0x30f0, 0x30f0};
+	const uint32x4_t cksum_mask = {
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+		RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK |
+		RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD
+	};
+
+	/* map the checksum, rss and vlan fields to the checksum, rss
+	 * and vlan flag.
+	 */
+	const uint8x16_t cksum_flags = {
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
+		 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+		(RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+		 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1
+	};
+
+	const uint8x16_t rss_vlan_flags = {
+		0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0, 0
+	};
+
+	/* extract status_error0 field from 4 descriptors,
+	 * and mask out everything else not in desc_mask
+	 */
+	flags = vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+			vreinterpretq_u32_u64(descs[1]));
+	tmp_desc = vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+			vreinterpretq_u32_u64(descs[3]));
+	tmp_desc = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(flags),
+			vreinterpretq_u64_u32(tmp_desc)));
+
+	tmp_desc = vandq_u32(tmp_desc, desc_mask);
+
+	/* shift each 32-bit lane right by 4 so that we can use
+	 * the checksum bit as an index into cksum_flags
+	 */
+	tmp_desc = vshrq_n_u32(tmp_desc, 4);
+	flags = vreinterpretq_u32_u8(vqtbl1q_u8(cksum_flags,
+			vreinterpretq_u8_u32(tmp_desc)));
+
+	/* shift left by 1 bit because we shift right by 1 bit
+	 * in cksum_flags
+	 */
+	flags = vshlq_n_u32(flags, 1);
+
+	/* first check the outer L4 checksum */
+	uint32x4_t l4_outer_mask = {0x6, 0x6, 0x6, 0x6};
+	uint32x4_t l4_outer_flags = vandq_u32(flags, l4_outer_mask);
+	l4_outer_flags = vshlq_n_u32(l4_outer_flags, 20);
+
+	/* then check the rest of cksum bits */
+	uint32x4_t l3_l4_mask = {~0x6, ~0x6, ~0x6, ~0x6};
+	uint32x4_t l3_l4_flags = vandq_u32(flags, l3_l4_mask);
+	flags = vorrq_u32(l3_l4_flags, l4_outer_flags);
+
+	/* we need to mask out the redundant bits introduced by RSS or
+	 * VLAN fields.
+	 */
+	flags = vandq_u32(flags, cksum_mask);
+
+	/* map RSS, VLAN flags in HW desc to RTE_MBUF */
+	tmp_desc = vshrq_n_u32(tmp_desc, 8);
+	rss_vlan = vreinterpretq_u32_u8(vqtbl1q_u8(rss_vlan_flags,
+			vreinterpretq_u8_u32(tmp_desc)));
+
+	/* merge the flags */
+	flags = vorrq_u32(flags, rss_vlan);
+
+	/* check the additional fdir_flags if fdir is enabled */
+	if (rxq->fdir_enabled) {
+		const uint32x4_t fdir_id0_1 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[0]),
+				vreinterpretq_u32_u64(descs[1]));
+		const uint32x4_t fdir_id2_3 =
+			vzip2q_u32(vreinterpretq_u32_u64(descs[2]),
+				vreinterpretq_u32_u64(descs[3]));
+		const uint32x4_t fdir_id0_3 =
+			vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(fdir_id0_1),
+				vreinterpretq_u64_u32(fdir_id2_3)));
+		const uint32x4_t fdir_flags =
+				ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
+
+		/* merge with fdir_flags */
+		flags = vorrq_u32(flags, fdir_flags);
+
+		/* write fdir_id to mbuf */
+		rx_pkts[0]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 0);
+		rx_pkts[1]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 1);
+		rx_pkts[2]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 2);
+		rx_pkts[3]->hash.fdir.hi = vgetq_lane_u32(fdir_id0_3, 3);
+	}
+
+	/**
+	 * At this point, we have the 4 sets of flags in the low 16-bits
+	 * of each 32-bit value in flags.
+	 * We want to extract these, and merge them with the mbuf init data
+	 * so we can do a single 16-byte write to the mbuf to set the flags
+	 * and all the other initialization fields. Extracting the
+	 * appropriate flags means that we have to do a shift and blend for
+	 * each mbuf before we do the write.
+	 */
+	rearm0 = vsetq_lane_u64(vgetq_lane_u32(flags, 0), mbuf_init, 1);
+	rearm1 = vsetq_lane_u64(vgetq_lane_u32(flags, 1), mbuf_init, 1);
+	rearm2 = vsetq_lane_u64(vgetq_lane_u32(flags, 2), mbuf_init, 1);
+	rearm3 = vsetq_lane_u64(vgetq_lane_u32(flags, 3), mbuf_init, 1);
+
+	/* compile time check */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+			offsetof(struct rte_mbuf, rearm_data) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+			RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+	/* write the rearm data and the olflags in one write */
+	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
+	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
+	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
+	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
+}
+
+static __rte_always_inline void
+ice_flex_rxd_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts,
+	uint32_t *ptype_tbl)
+{
+	const uint16x8_t ptype_mask = {
+		0, ICE_RX_FLEX_DESC_PTYPE_M,
+		0, ICE_RX_FLEX_DESC_PTYPE_M,
+		0, ICE_RX_FLEX_DESC_PTYPE_M,
+		0, ICE_RX_FLEX_DESC_PTYPE_M
+	};
+
+	uint32x4_t ptype_01 = vzip1q_u32(vreinterpretq_u32_u64(descs[0]),
+			vreinterpretq_u32_u64(descs[1]));
+	uint32x4_t ptype_23 = vzip1q_u32(vreinterpretq_u32_u64(descs[2]),
+			vreinterpretq_u32_u64(descs[3]));
+	uint32x4_t ptype_all_u32 =
+			vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(ptype_01),
+				vreinterpretq_u64_u32(ptype_23)));
+	uint16x8_t ptype_all = vreinterpretq_u16_u32(ptype_all_u32);
+
+	ptype_all = vandq_u16(ptype_all, ptype_mask);
+
+	rx_pkts[0]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 1)];
+	rx_pkts[1]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 3)];
+	rx_pkts[2]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 5)];
+	rx_pkts[3]->packet_type = ptype_tbl[vgetq_lane_u16(ptype_all, 7)];
+}
+
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_VPMD_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < ICE_VPMD_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a ICE_VPMD_DESCS_PER_LOOP power-of-two
+ */
+static __rte_always_inline uint16_t
+_ice_recv_raw_pkts_vec(struct ci_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union ci_rx_flex_desc *rxdp;
+	struct ci_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint32_t *ptype_tbl = rxq->ice_vsi->adapter->ptype_tbl;
+
+	uint16x8_t crc_adjust = {
+		0, 0,	/* ignore pkt_type field */
+		rxq->crc_len,	/* sub crc on pkt_len */
+		0,	/* ignore high-16bits of pkt_len */
+		rxq->crc_len,	/* sub crc on data_len */
+		0, 0, 0	/* ignore non-length fields */
+	};
+
+	/* mask to shuffle from flex descriptor to mbuf */
+	const uint8x16_t shuf_msk = {
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		0xFF, 0xFF,		/* pkt_type set as unknown */
+		4, 5,			/* octet 4~5, low bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits pkt_len, zero out */
+		4, 5,			/* octet 4~5, 16 bits data_len */
+		10, 11,			/* octet 10~11, 16 bits vlan_macip */
+		0xFF, 0xFF,		/* rss hash parsed separately */
+		0xFF, 0xFF,
+	};
+
+	const uint8x16_t eop_shuf_mask = {
+		0x06, 0x02, 0x04, 0x00,
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* compile-time check the above crc_adjust layout is correct */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	/* 4 packets DD mask */
+	const uint16x8_t dd_check = {
+		0x0001, 0x0001, 0x0001, 0x0001,
+		0, 0, 0, 0
+	};
+
+	/* 4 packets EOP mask */
+	const uint8x16_t eop_check = {
+		0x2, 0, 0x2, 0, 0x2, 0, 0x2, 0,
+		0, 0, 0, 0, 0, 0, 0, 0
+	};
+
+	/* nb_pkts has to be floor-aligned to ICE_VPMD_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_VPMD_DESCS_PER_LOOP);
+
+	rxdp = rxq->rx_flex_ring + rxq->rx_tail;
+	rte_prefetch0(rxdp);
+
+	/* see if we need to rearm the Rx queue */
+	if (rxq->rxrearm_nb > ICE_VPMD_RXQ_REARM_THRESH)
+		ice_rxq_rearm(rxq);
+
+	/* check to see if there is actually a packet available */
+	if (!(rxdp->wb.status_error0 &
+	      rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/* compile-time verification of the shuffle mask again */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* move the next 'n' mbufs into the cache */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packets in one loop
+	 * [A*. mask out 4 unused dirty fields in desc]
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. count the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info. from desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+		 pos += ICE_VPMD_DESCS_PER_LOOP,
+		 rxdp += ICE_VPMD_DESCS_PER_LOOP) {
+		uint64x2_t descs[ICE_VPMD_DESCS_PER_LOOP];
+		uint8x16_t pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+		uint16x8_t sterr_tmp1, sterr_tmp2;
+		uint64x2_t mbp1, mbp2;
+		uint16x8_t staterr;
+		uint16x8_t tmp;
+		uint64_t stat;
+
+		/* A.1 load descs[3-0] */
+		descs[3] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3));
+		descs[2] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2));
+		descs[1] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1));
+		descs[0] =  vld1q_u64(RTE_CAST_PTR(uint64_t *, rxdp + 0));
+
+		/* use acquire fence to order loads of descriptor qwords */
+		rte_atomic_thread_fence(rte_memory_order_acquire);
+		/* A.2 reload qword0 to make it ordered after qword1 load */
+		descs[3] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 3),
+				descs[3], 0);
+		descs[2] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 2),
+				descs[2], 0);
+		descs[1] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp + 1),
+				descs[1], 0);
+		descs[0] = vld1q_lane_u64(RTE_CAST_PTR(uint64_t *, rxdp),
+				descs[0], 0);
+
+		/* B.1 load 4 mbuf pointers */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		/* B.2 copy 4 mbuf pointers into rx_pkts  */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		/* prefetch mbufs if it is a chained buffer */
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
+		ice_flex_rxd_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+
+		/* D.1 pkts convert format from desc to pktmbuf */
+		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+		pkt_mb0 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+
+		/* D.2 pkts set in in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb0), crc_adjust);
+		pkt_mb0 = vreinterpretq_u8_u16(tmp);
+
+#ifndef	RTE_NET_INTEL_USE_16BYTE_DESC
+
+		/**
+		 * needs to load 2nd 16B of each desc for RSS hash parsing,
+		 * will cause performance drop to get into this context.
+		 */
+		if (rxq->ice_vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
+				RTE_ETH_RX_OFFLOAD_RSS_HASH) {
+			/* load bottom half of every 32B desc */
+			const uint64x2_t raw_desc_bh3 =
+					vld1q_u64(RTE_CAST_PTR(const uint64_t *,
+						&rxdp[3].wb.status_error1));
+			const uint64x2_t raw_desc_bh2 =
+					vld1q_u64(RTE_CAST_PTR(const uint64_t *,
+						&rxdp[2].wb.status_error1));
+			const uint64x2_t raw_desc_bh1 =
+					vld1q_u64(RTE_CAST_PTR(const uint64_t *,
+						&rxdp[1].wb.status_error1));
+			const uint64x2_t raw_desc_bh0 =
+					vld1q_u64(RTE_CAST_PTR(const uint64_t *,
+						&rxdp[0].wb.status_error1));
+
+			/**
+			 * to shift the 32b RSS hash value to the
+			 * highest 32b of each 128b before mask
+			 */
+			uint64x2_t rss_hash3 = vshlq_n_u64(raw_desc_bh3, 32);
+			uint64x2_t rss_hash2 = vshlq_n_u64(raw_desc_bh2, 32);
+			uint64x2_t rss_hash1 = vshlq_n_u64(raw_desc_bh1, 32);
+			uint64x2_t rss_hash0 = vshlq_n_u64(raw_desc_bh0, 32);
+
+			const uint32x4_t rss_hash_msk = {0, 0, 0, 0xFFFFFFFFu};
+
+			rss_hash3 =
+				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash3),
+					rss_hash_msk));
+			rss_hash2 =
+				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash2),
+					rss_hash_msk));
+			rss_hash1 =
+				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash1),
+					rss_hash_msk));
+			rss_hash0 =
+				vreinterpretq_u64_u32(vandq_u32(vreinterpretq_u32_u64(rss_hash0),
+					rss_hash_msk));
+
+			pkt_mb3 = vorrq_u8(pkt_mb3, vreinterpretq_u8_u64(rss_hash3));
+			pkt_mb2 = vorrq_u8(pkt_mb2, vreinterpretq_u8_u64(rss_hash2));
+			pkt_mb1 = vorrq_u8(pkt_mb1, vreinterpretq_u8_u64(rss_hash1));
+			pkt_mb0 = vorrq_u8(pkt_mb0, vreinterpretq_u8_u64(rss_hash0));
+		}
+#endif
+
+		/* D.3 copy final data to rx_pkts */
+		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, pkt_mb3);
+		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, pkt_mb2);
+		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, pkt_mb1);
+		vst1q_u8((void *)&rx_pkts[pos + 0]->rx_descriptor_fields1, pkt_mb0);
+
+		ice_flex_rxd_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+
+		/* C.1 filter staterr info only */
+		sterr_tmp2 = vzip2q_u16(vreinterpretq_u16_u64(descs[3]),
+				vreinterpretq_u16_u64(descs[2]));
+		sterr_tmp1 = vzip2q_u16(vreinterpretq_u16_u64(descs[1]),
+				vreinterpretq_u16_u64(descs[0]));
+
+		/* C.2 get 4 pkts status_error0 value */
+		staterr = vzip1q_u16(sterr_tmp2, sterr_tmp1);
+
+		/* C* extract and record EOP bits */
+		if (split_packet) {
+			uint8x16_t eop_bits;
+			/* and with mask to extract bits, flipping 1-0 */
+			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
+			eop_bits = vandq_u8(eop_bits, eop_check);
+			/* the staterr values are not in order, even though
+			 * the count of DD bits doesn't care. However, for
+			 * end of packet tracking, we do care, so shuffle.
+			 * Previously: descs[3] descs[1] descs[2] descs[0]
+			 * Shuffled: descs[0] descs[1] descs[2] descs[3]
+			 */
+			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
+
+			/* store the resulting 32-bit value */
+			vst1q_lane_u32((uint32_t *)split_packet,
+					vreinterpretq_u32_u8(eop_bits), 0);
+			split_packet += ICE_VPMD_DESCS_PER_LOOP;
+		}
+
+		/* C.3 count available number of descriptors */
+		/* mask everything except DD bit */
+		staterr = vandq_u16(staterr, dd_check);
+		/* move the statue bit (bit0) into the sign bit (bit15)
+		 * of each 16-bit lane
+		 */
+		staterr = vshlq_n_u16(staterr, ICE_UINT16_BIT - 1);
+		/* reinterpret staterr as a signed 16-bit and
+		 * arithmetic-shift-right by 15
+		 * each lane becomes 0xFFFF if original DD bit was 1, otherwise 0.
+		 * then interpret back to unsigned u16 vector
+		 */
+		staterr = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(staterr),
+				ICE_UINT16_BIT - 1));
+
+		/* reinterpret u16x8 vector as u64x2, and fetch the low u64
+		 * which contains the first four 16-bit lanes, and invert
+		 * all bits.
+		 */
+		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
+
+		if (unlikely(stat == 0)) {
+			nb_pkts_recd += ICE_VPMD_DESCS_PER_LOOP;
+		} else {
+			nb_pkts_recd += rte_ctz64(stat) / ICE_UINT16_BIT;
+			break;
+		}
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
+/**
+ * Notice:
+ * - nb_pkts < ICE_VPMD_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
+ *   numbers of DD bits
+ */
+uint16_t
+ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ *
+ * Notice:
+ * - nb_pkts < ICE_VPMD_DESCS_PER_LOOP, just return no packet
+ */
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	struct ci_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+						split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	/* check no split flags in both previous and current bursts */
+	if (!rxq->pkt_first_seg &&
+		split_fl64[0] == 0 && split_fl64[1] == 0 &&
+		split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	unsigned int i = 0;
+
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + ci_rx_reassemble_packets(&rx_pkts[i], nb_bufs - i, &split_flags[i],
+			&rxq->pkt_first_seg, &rxq->pkt_last_seg, rxq->crc_len);
+}
+
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > ICE_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = ice_recv_scattered_burst_vec(rx_queue,
+				rx_pkts + retval,
+				ICE_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < ICE_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + ice_recv_scattered_burst_vec(rx_queue,
+			rx_pkts + retval,
+			nb_pkts);
+}
+
+static __rte_always_inline void
+ice_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt,
+	 uint64_t flags)
+{
+	uint64_t high_qw = (CI_TX_DESC_DTYPE_DATA |
+		((uint64_t)flags << CI_TXD_QW1_CMD_S) |
+		((uint64_t)pkt->data_len << CI_TXD_QW1_TX_BUF_SZ_S));
+
+	uint64x2_t descriptor = {rte_pktmbuf_iova(pkt), high_qw};
+	vst1q_u64(RTE_CAST_PTR(uint64_t *, txdp), descriptor);
+}
+
+static __rte_always_inline void
+ice_vtx(volatile struct ci_tx_desc *txdp, struct rte_mbuf **pkt,
+	uint16_t nb_pkts, uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		ice_vtx1(txdp, *pkt, flags);
+}
+
+static __rte_always_inline uint16_t
+ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+	uint16_t nb_pkts)
+{
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+	volatile struct ci_tx_desc *txdp;
+	struct ci_tx_entry_vec *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = CI_TX_DESC_CMD_DEFAULT;
+	uint64_t rs = CI_TX_DESC_CMD_RS | CI_TX_DESC_CMD_DEFAULT;
+	int i;
+
+	/* cross rx_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ci_tx_free_bufs_vec(txq, ice_tx_desc_done, false);
+
+	nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	nb_commit = nb_pkts;
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->ci_tx_ring[tx_id];
+	txep = &txq->sw_ring_vec[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+		ci_tx_backlog_entry_vec(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			ice_vtx1(txdp, *tx_pkts, flags);
+
+		/* write with RS for the last descriptor in the segment */
+		ice_vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reach the end of ring */
+		txdp = &txq->ci_tx_ring[tx_id];
+		txep = &txq->sw_ring_vec[tx_id];
+	}
+
+	ci_tx_backlog_entry_vec(txep, tx_pkts, nb_commit);
+
+	ice_vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->ci_tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+			rte_cpu_to_le_64(((uint64_t)CI_TX_DESC_CMD_RS) <<
+					 CI_TXD_QW1_CMD_S);
+		txq->tx_next_rs =
+			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+uint16_t
+ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+	uint16_t nb_pkts)
+{
+	uint16_t nb_tx = 0;
+	struct ci_tx_queue *txq = (struct ci_tx_queue *)tx_queue;
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+		ret = ice_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);
+		nb_tx += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_tx;
+}
+
+
+int __rte_cold
+ice_rxq_vec_setup(struct ci_rx_queue *rxq)
+{
+	rxq->vector_rx = 1;
+	rxq->mbuf_initializer = ci_rxq_mbuf_initializer(rxq->port_id);
+	return 0;
+}
+
+int __rte_cold
+ice_rx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return ice_rx_vec_dev_check_default(dev);
+}
+
+int __rte_cold
+ice_tx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return ice_tx_vec_dev_check_default(dev);
+}
+
+enum rte_vect_max_simd
+ice_get_max_simd_bitwidth(void)
+{
+	return RTE_MIN(128, rte_vect_get_max_simd_bitwidth());
+}
diff --git a/drivers/net/intel/ice/meson.build b/drivers/net/intel/ice/meson.build
index 293577676f..1dc7c0109a 100644
--- a/drivers/net/intel/ice/meson.build
+++ b/drivers/net/intel/ice/meson.build
@@ -33,6 +33,8 @@ endif
 if arch_subdir == 'x86'
     sources_avx2 += files('ice_rxtx_vec_avx2.c')
     sources_avx512 += files('ice_rxtx_vec_avx512.c')
+elif arch_subdir == 'arm'
+    sources += files('ice_rxtx_vec_neon.c')
 endif
 
 sources += files(
-- 
2.43.0



More information about the dev mailing list