[dpdk-dev] [PATCH v2 6/8] net/hns3: add vector Rx burst with NEON instructions

Wei Hu (Xavier) huwei013 at chinasoftinc.com
Wed Sep 9 11:23:37 CEST 2020


From: "Wei Hu (Xavier)" <xavier.huwei at huawei.com>

This patch adds a vector Rx burst function that uses NEON instructions to optimize the Rx burst process.

Signed-off-by: Chengwen Feng <fengchengwen at huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei at huawei.com>
Signed-off-by: Huisong Li <lihuisong at huawei.com>
---
 drivers/net/hns3/hns3_ethdev.c        |   1 +
 drivers/net/hns3/hns3_ethdev.h        |   1 +
 drivers/net/hns3/hns3_ethdev_vf.c     |   1 +
 drivers/net/hns3/hns3_rxtx.c          |  94 +++++++++++++++-
 drivers/net/hns3/hns3_rxtx.h          |  35 +++++-
 drivers/net/hns3/hns3_rxtx_vec.c      | 167 ++++++++++++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec.h      |  20 ++++
 drivers/net/hns3/hns3_rxtx_vec_neon.h | 203 ++++++++++++++++++++++++++++++++++
 8 files changed, 514 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 9df5fc8..61be870 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2352,6 +2352,7 @@ hns3_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->rx_vec_allowed = true;
 	hns->tx_simple_allowed = true;
 	hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 098b6ce..fd6a9f9 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -643,6 +643,7 @@ struct hns3_adapter {
 	};
 
 	bool rx_simple_allowed;
+	bool rx_vec_allowed;
 	bool tx_simple_allowed;
 	bool tx_vec_allowed;
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index f3e6aea..93f2c93 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -822,6 +822,7 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
 		goto cfg_err;
 
 	hns->rx_simple_allowed = true;
+	hns->rx_vec_allowed = true;
 	hns->tx_simple_allowed = true;
 	hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 3e708b5..ada02de 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -41,9 +41,19 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
 	if (rxq->sw_ring == NULL)
 		return;
 
-	for (i = 0; i < rxq->nb_rx_desc; i++)
-		if (rxq->sw_ring[i].mbuf)
-			rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+	if (rxq->rx_rearm_nb == 0) {
+		for (i = 0; i < rxq->nb_rx_desc; i++) {
+			if (rxq->sw_ring[i].mbuf != NULL)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	} else {
+		for (i = rxq->next_to_use;
+		     i != rxq->rx_rearm_start;
+		     i = (i + 1) % rxq->nb_rx_desc) {
+			if (rxq->sw_ring[i].mbuf != NULL)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	}
 
 	for (i = 0; i < rxq->bulk_mbuf_num; i++)
 		rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
@@ -661,10 +671,13 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	}
 
 	rxq->next_to_use = 0;
+	rxq->rx_rearm_start = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_nb = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	hns3_init_rx_queue_hw(rxq);
+	hns3_rxq_vec_setup(rxq);
 
 	return 0;
 }
@@ -678,6 +691,8 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
 	rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_start = 0;
+	rxq->rx_rearm_nb = 0;
 	hns3_init_rx_queue_hw(rxq);
 }
 
@@ -860,6 +875,40 @@ hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue)
 	return 0;
 }
 
+/*
+ * Iterate over all Rx queues and call the callback() function for each Rx
+ * queue.
+ *
+ * @param[in] dev
+ *   The target eth dev.
+ * @param[in] callback
+ *   The function to call for each queue.
+ *   If the callback returns nonzero, iteration stops and that value is returned.
+ * @param[in] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   0 on success, -EINVAL if rx_queues is NULL, else the callback's value.
+ */
+int
+hns3_rxq_iterate(struct rte_eth_dev *dev,
+		 int (*callback)(struct hns3_rx_queue *, void *), void *arg)
+{
+	uint32_t i;
+	int ret;
+
+	if (dev->data->rx_queues == NULL)
+		return -EINVAL;
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		ret = callback(dev->data->rx_queues[i], arg);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 static void*
 hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
 			    struct hns3_queue_info *q_info)
@@ -880,7 +929,13 @@ hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
 	/* Allocate rx ring hardware descriptors. */
 	rxq->queue_id = q_info->idx;
 	rxq->nb_rx_desc = q_info->nb_desc;
-	rx_desc = rxq->nb_rx_desc * sizeof(struct hns3_desc);
+
+	/*
+	 * Allocate a little more memory because rx vector functions
+	 * don't check boundaries each time.
+	 */
+	rx_desc = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+			sizeof(struct hns3_desc);
 	rx_mz = rte_eth_dma_zone_reserve(dev, q_info->ring_name, q_info->idx,
 					 rx_desc, HNS3_RING_BASE_ALIGN,
 					 q_info->socket_id);
@@ -1329,7 +1384,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 		conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
 	rxq->rx_deferred_start = conf->rx_deferred_start;
 
-	rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
+	rx_entry_len = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+			sizeof(struct hns3_entry);
 	rxq->sw_ring = rte_zmalloc_socket("hns3 RX sw ring", rx_entry_len,
 					  RTE_CACHE_LINE_SIZE, socket_id);
 	if (rxq->sw_ring == NULL) {
@@ -1340,6 +1396,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 
 	rxq->next_to_use = 0;
 	rxq->rx_free_hold = 0;
+	rxq->rx_rearm_start = 0;
+	rxq->rx_rearm_nb = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
 	rxq->port_id = dev->data->port_id;
@@ -1431,7 +1489,8 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 	};
 
 	if (dev->rx_pkt_burst == hns3_recv_pkts ||
-	    dev->rx_pkt_burst == hns3_recv_scattered_pkts)
+	    dev->rx_pkt_burst == hns3_recv_scattered_pkts ||
+	    dev->rx_pkt_burst == hns3_recv_pkts_vec)
 		return ptypes;
 
 	return NULL;
@@ -1915,6 +1974,25 @@ hns3_recv_scattered_pkts(void *rx_queue,
 	return nb_rx;
 }
 
+void __rte_weak
+hns3_rxq_vec_setup(__rte_unused struct hns3_rx_queue *rxq)
+{
+}
+
+int __rte_weak
+hns3_rx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+	return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_recv_pkts_vec(__rte_unused void *tx_queue,
+		   __rte_unused struct rte_mbuf **tx_pkts,
+		   __rte_unused uint16_t nb_pkts)
+{
+	return 0;
+}
+
 int
 hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 		       struct rte_eth_burst_mode *mode)
@@ -1925,6 +2003,7 @@ hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
 	} burst_infos[] = {
 		{ hns3_recv_pkts,		"Scalar" },
 		{ hns3_recv_scattered_pkts,	"Scalar Scattered" },
+		{ hns3_recv_pkts_vec,		"Vector Neon" },
 	};
 
 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
@@ -1949,6 +2028,9 @@ hns3_get_rx_function(struct rte_eth_dev *dev)
 	struct hns3_adapter *hns = dev->data->dev_private;
 	uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
 
+	if (hns->rx_vec_allowed && hns3_rx_check_vec_support(dev) == 0)
+		return hns3_recv_pkts_vec;
+
 	if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
 	    (offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
 		return hns3_recv_pkts;
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index b471bf5..27041ab 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -17,6 +17,18 @@
 #define HNS3_DEFAULT_TX_RS_THRESH	32
 #define HNS3_TX_FAST_FREE_AHEAD		64
 
+#define HNS3_DEFAULT_RX_BURST		32
+#if (HNS3_DEFAULT_RX_BURST > 64)
+#error "PMD HNS3: HNS3_DEFAULT_RX_BURST must <= 64\n"
+#endif
+#define HNS3_DEFAULT_DESCS_PER_LOOP	4
+#define HNS3_SVE_DEFAULT_DESCS_PER_LOOP	8
+#if (HNS3_DEFAULT_DESCS_PER_LOOP > HNS3_SVE_DEFAULT_DESCS_PER_LOOP)
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN	HNS3_DEFAULT_DESCS_PER_LOOP
+#else
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN	HNS3_SVE_DEFAULT_DESCS_PER_LOOP
+#endif
+#define HNS3_DEFAULT_RXQ_REARM_THRESH	64
 #define HNS3_UINT8_BIT			8
 #define HNS3_UINT16_BIT			16
 #define HNS3_UINT32_BIT			32
@@ -236,7 +248,13 @@ struct hns3_desc {
 					uint16_t ot_vlan_tag;
 				};
 			};
-			uint32_t bd_base_info;
+			union {
+				uint32_t bd_base_info;
+				struct {
+					uint16_t bdtype_vld_udp0;
+					uint16_t fe_lum_crcp_l3l4p;
+				};
+			};
 		} rx;
 	};
 } __rte_packed;
@@ -270,7 +288,8 @@ struct hns3_rx_queue {
 	uint16_t rx_free_thresh;
 	uint16_t next_to_use;    /* index of next BD to be polled */
 	uint16_t rx_free_hold;   /* num of BDs waited to passed to hardware */
-
+	uint16_t rx_rearm_start; /* index of BD that driver re-arming from */
+	uint16_t rx_rearm_nb;    /* number of remaining BDs to be re-armed */
 	/*
 	 * port based vlan configuration state.
 	 * value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
@@ -292,6 +311,11 @@ struct hns3_rx_queue {
 
 	struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
 	uint16_t bulk_mbuf_num;
+
+	/* offset_table: used for vector, to solve execute re-order problem */
+	uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
+	uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
+	struct rte_mbuf fake_mbuf; /* fake mbuf used with vector rx */
 };
 
 struct hns3_tx_queue {
@@ -554,6 +578,8 @@ int hns3_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id);
 void hns3_enable_all_queues(struct hns3_hw *hw, bool en);
 int hns3_start_queues(struct hns3_adapter *hns, bool reset_queue);
 int hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue);
+int hns3_rxq_iterate(struct rte_eth_dev *dev,
+		 int (*callback)(struct hns3_rx_queue *, void *), void *arg);
 void hns3_dev_release_mbufs(struct hns3_adapter *hns);
 int hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 			unsigned int socket, const struct rte_eth_rxconf *conf,
@@ -564,9 +590,12 @@ uint16_t hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 				  uint16_t nb_pkts);
+uint16_t hns3_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts);
 int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
 			   __rte_unused uint16_t queue_id,
 			   struct rte_eth_burst_mode *mode);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -594,7 +623,9 @@ int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
 void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
 void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
+void hns3_rxq_vec_setup(struct hns3_rx_queue *rxq);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
diff --git a/drivers/net/hns3/hns3_rxtx_vec.c b/drivers/net/hns3/hns3_rxtx_vec.c
index 1154b6f..a26c83d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.c
+++ b/drivers/net/hns3/hns3_rxtx_vec.c
@@ -45,3 +45,170 @@ hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
 	return nb_tx;
 }
+
+static inline void
+hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
+{
+#define REARM_LOOP_STEP_NUM	4
+	struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
+	struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
+	uint64_t dma_addr;
+	int i;
+
+	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
+					  HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
+		return;
+	}
+
+	for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
+		rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
+		if (likely(i <
+			HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
+			rte_prefetch_non_temporal(rxep[4].mbuf);
+			rte_prefetch_non_temporal(rxep[5].mbuf);
+			rte_prefetch_non_temporal(rxep[6].mbuf);
+			rte_prefetch_non_temporal(rxep[7].mbuf);
+		}
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
+		rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[0].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
+		rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[1].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
+		rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[2].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
+		rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[3].rx.bd_base_info = 0;
+	}
+
+	rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
+	if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
+		rxq->rx_rearm_start = 0;
+
+	rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
+
+	hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
+}
+
+uint16_t
+hns3_recv_pkts_vec(void *__restrict rx_queue,
+		   struct rte_mbuf **__restrict rx_pkts,
+		   uint16_t nb_pkts)
+{
+	struct hns3_rx_queue *rxq = rx_queue;
+	struct hns3_desc *rxdp = &rxq->rx_ring[rxq->next_to_use];
+	uint64_t bd_err_mask;  /* bit mask indicating which pkts have errors */
+	uint16_t nb_rx;
+
+	nb_pkts = RTE_MIN(nb_pkts, HNS3_DEFAULT_RX_BURST);
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, HNS3_DEFAULT_DESCS_PER_LOOP);
+
+	rte_prefetch_non_temporal(rxdp);
+
+	if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
+		hns3_rxq_rearm_mbuf(rxq);
+
+	if (unlikely(!(rxdp->rx.bd_base_info &
+			rte_cpu_to_le_32(1u << HNS3_RXD_VLD_B))))
+		return 0;
+
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 0].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 1].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 2].mbuf);
+	rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 3].mbuf);
+
+	bd_err_mask = 0;
+	nb_rx = hns3_recv_burst_vec(rxq, rx_pkts, nb_pkts, &bd_err_mask);
+	if (unlikely(bd_err_mask))
+		nb_rx = hns3_rx_reassemble_pkts(rx_pkts, nb_rx, bd_err_mask);
+
+	return nb_rx;
+}
+
+static void
+hns3_rxq_vec_setup_rearm_data(struct hns3_rx_queue *rxq)
+{
+	uintptr_t p;
+	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+	mb_def.nb_segs = 1;
+	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+	mb_def.port = rxq->port_id;
+	rte_mbuf_refcnt_set(&mb_def, 1);
+
+	/* prevent compiler reordering: rearm_data covers previous fields */
+	rte_compiler_barrier();
+	p = (uintptr_t)&mb_def.rearm_data;
+	rxq->mbuf_initializer = *(uint64_t *)p;
+}
+
+void
+hns3_rxq_vec_setup(struct hns3_rx_queue *rxq)
+{
+	struct hns3_entry *sw_ring = &rxq->sw_ring[rxq->nb_rx_desc];
+	unsigned int i;
+
+	memset(&rxq->rx_ring[rxq->nb_rx_desc], 0,
+		sizeof(struct hns3_desc) * HNS3_DEFAULT_RX_BURST);
+
+	memset(&rxq->fake_mbuf, 0, sizeof(rxq->fake_mbuf));
+	for (i = 0; i < HNS3_DEFAULT_RX_BURST; i++)
+		sw_ring[i].mbuf = &rxq->fake_mbuf;
+
+	hns3_rxq_vec_setup_rearm_data(rxq);
+
+	memset(rxq->offset_table, 0, sizeof(rxq->offset_table));
+}
+
+#ifndef RTE_LIBRTE_IEEE1588
+static int
+hns3_rxq_vec_check(struct hns3_rx_queue *rxq, void *arg)
+{
+	uint32_t min_vec_bds = HNS3_DEFAULT_RXQ_REARM_THRESH +
+				HNS3_DEFAULT_RX_BURST;
+
+	if (rxq->nb_rx_desc < min_vec_bds)
+		return -ENOTSUP;
+
+	if (rxq->nb_rx_desc % HNS3_DEFAULT_RXQ_REARM_THRESH)
+		return -ENOTSUP;
+
+	RTE_SET_USED(arg);
+	return 0;
+}
+#endif
+
+int
+hns3_rx_check_vec_support(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+	uint64_t offloads_mask = DEV_RX_OFFLOAD_TCP_LRO |
+				 DEV_RX_OFFLOAD_VLAN;
+
+	if (dev->data->scattered_rx)
+		return -ENOTSUP;
+
+	if (fconf->mode != RTE_FDIR_MODE_NONE)
+		return -ENOTSUP;
+
+	if (rxmode->offloads & offloads_mask)
+		return -ENOTSUP;
+
+	if (hns3_rxq_iterate(dev, hns3_rxq_vec_check, NULL) != 0)
+		return -ENOTSUP;
+
+	return 0;
+#else
+	RTE_SET_USED(dev);
+	return -ENOTSUP;
+#endif
+}
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
index 90679bf..c6df36d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.h
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -54,4 +54,24 @@ hns3_tx_free_buffers(struct hns3_tx_queue *txq)
 	if (txq->next_to_clean >= txq->nb_tx_desc)
 		txq->next_to_clean = 0;
 }
+
+static inline uint16_t
+hns3_rx_reassemble_pkts(struct rte_mbuf **rx_pkts,
+			uint16_t nb_pkts,
+			uint64_t pkt_err_mask)
+{
+	uint16_t count, i;
+	uint64_t mask;
+
+	count = 0;
+	for (i = 0; i < nb_pkts; i++) {
+		mask = ((uint64_t)1u) << i;
+		if (pkt_err_mask & mask)
+			rte_pktmbuf_free_seg(rx_pkts[i]);
+		else
+			rx_pkts[count++] = rx_pkts[i];
+	}
+
+	return count;
+}
 #endif /* _HNS3_RXTX_VEC_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
index e878ee1..8d7721b 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -82,4 +82,207 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 
 	return nb_tx;
 }
+
+static inline uint32_t
+hns3_desc_parse_field(struct hns3_rx_queue *rxq,
+		      struct hns3_entry *sw_ring,
+		      struct hns3_desc *rxdp,
+		      uint32_t   bd_vld_num)
+{
+	uint32_t l234_info, ol_info, bd_base_info;
+	struct rte_mbuf *pkt;
+	uint32_t retcode = 0;
+	uint32_t cksum_err;
+	int ret, i;
+
+	for (i = 0; i < (int)bd_vld_num; i++) {
+		pkt = sw_ring[i].mbuf;
+
+		/* init rte_mbuf.rearm_data last 64-bit */
+		pkt->ol_flags = PKT_RX_RSS_HASH;
+
+		l234_info = rxdp[i].rx.l234_info;
+		ol_info = rxdp[i].rx.ol_info;
+		bd_base_info = rxdp[i].rx.bd_base_info;
+		ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
+					 l234_info, &cksum_err);
+		if (unlikely(ret)) {
+			retcode |= 1u << i;
+			continue;
+		}
+
+		pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+			hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
+					       cksum_err);
+	}
+
+	return retcode;
+}
+
+static inline uint16_t
+hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
+		    struct rte_mbuf **__restrict rx_pkts,
+		    uint16_t nb_pkts,
+		    uint64_t *bd_err_mask)
+{
+	uint16_t rx_id = rxq->next_to_use;
+	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
+	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
+	uint32_t bd_valid_num, parse_retcode;
+	uint16_t nb_rx = 0;
+	int pos, offset;
+
+	/* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
+	uint8x16_t shuf_desc_fields_msk = {
+		0xff, 0xff, 0xff, 0xff,  /* packet type init zero */
+		22, 23, 0xff, 0xff,      /* rx.pkt_len to rte_mbuf.pkt_len */
+		20, 21,	                 /* size to rte_mbuf.data_len */
+		0xff, 0xff,	         /* rte_mbuf.vlan_tci init zero */
+		8, 9, 10, 11,	         /* rx.rss_hash to rte_mbuf.hash.rss */
+	};
+
+	uint16x8_t crc_adjust = {
+		0, 0,         /* ignore pkt_type field */
+		rxq->crc_len, /* sub crc on pkt_len */
+		0,            /* ignore high-16bits of pkt_len */
+		rxq->crc_len, /* sub crc on data_len */
+		0, 0, 0,      /* ignore non-length fields */
+	};
+
+	for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
+				     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
+		uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
+		uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
+		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		uint64x2_t mbp1, mbp2;
+		uint16x4_t bd_vld = {0};
+		uint16x8_t tmp;
+		uint64_t stat;
+
+		/* calculate how many BDs are valid */
+		bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
+		bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
+		bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
+		bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);
+
+		/* load 2 mbuf pointers */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+		bd_vld = vshl_n_u16(bd_vld,
+				    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
+		bd_vld = vreinterpret_u16_s16(
+				vshr_n_s16(vreinterpret_s16_u16(bd_vld),
+					   HNS3_UINT16_BIT - 1));
+		stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);
+
+		/* load 2 more mbuf pointers */
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		if (likely(stat == 0))
+			bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
+		else
+			bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
+		if (bd_valid_num == 0)
+			break;
+
+		/* use offset to control below data load oper ordering */
+		offset = rxq->offset_table[bd_valid_num];
+
+		/* store 2 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+		/* read first two descs */
+		descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
+		descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));
+
+		/* store 2 more mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		/* read the remaining two descs */
+		descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
+		descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));
+
+		pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
+		pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
+		pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
+		pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);
+
+		/* pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
+		pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);
+
+		/* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
+		*(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+		*(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+
+		/* pkt 1,2 remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+
+		pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
+		pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
+		pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
+		pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);
+
+		/* pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
+		pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);
+
+		/* pkt 1,2 save to rx_pkts mbuf */
+		vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
+			 pkt_mb1);
+		vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
+			 pkt_mb2);
+
+		/* pkt 3,4 remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+		pkt_mb4 = vreinterpretq_u8_u16(tmp);
+
+		/* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
+		*(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+		*(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
+			rxq->mbuf_initializer;
+
+		/* pkt 3,4 save to rx_pkts mbuf */
+		vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
+			 pkt_mb3);
+		vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
+			 pkt_mb4);
+
+		rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);
+
+		parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
+			&rxdp[offset], bd_valid_num);
+		if (unlikely(parse_retcode))
+			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
+
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
+		rte_prefetch0(sw_ring[pos +
+				      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);
+
+		nb_rx += bd_valid_num;
+		if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
+			break;
+	}
+
+	rxq->rx_rearm_nb += nb_rx;
+	rxq->next_to_use += nb_rx;
+	if (rxq->next_to_use >= rxq->nb_rx_desc)
+		rxq->next_to_use = 0;
+
+	return nb_rx;
+}
 #endif /* _HNS3_RXTX_VEC_NEON_H_ */
-- 
2.9.5



More information about the dev mailing list