[dpdk-dev] [PATCH v4 3/5] net/i40e: fix vector rx burst for i40e
Jeff Guo
jia.guo at intel.com
Thu Sep 17 09:58:32 CEST 2020
The limitation of burst size in vector rx was removed, since it should
retrieve as much received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing. And do some code cleaning for vector rx path.
Bugzilla ID: 516
Fixes: 5b463eda8d26 ("net/i40e: make vector driver filenames consistent")
Fixes: ae0eb310f253 ("net/i40e: implement vector PMD for ARM")
Fixes: dafadd73762e ("net/i40e: add AVX2 Rx function")
Fixes: c3def6a8724c ("net/i40e: implement vector PMD for altivec")
Signed-off-by: Jeff Guo <jia.guo at intel.com>
---
drivers/net/i40e/i40e_rxtx.h | 1 +
drivers/net/i40e/i40e_rxtx_vec_altivec.c | 64 ++++++++++++++++--------
drivers/net/i40e/i40e_rxtx_vec_avx2.c | 29 ++++++-----
drivers/net/i40e/i40e_rxtx_vec_neon.c | 58 +++++++++++++--------
drivers/net/i40e/i40e_rxtx_vec_sse.c | 58 +++++++++++++--------
5 files changed, 133 insertions(+), 77 deletions(-)
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 57d7b4160..01d4609f9 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -14,6 +14,7 @@
#define RTE_I40E_MAX_RX_BURST RTE_I40E_RXQ_REARM_THRESH
#define RTE_I40E_TX_MAX_FREE_BUF_SZ 64
#define RTE_I40E_DESCS_PER_LOOP 4
+#define RTE_I40E_DESCS_PER_LOOP_AVX 8
#define I40E_RXBUF_SZ_1024 1024
#define I40E_RXBUF_SZ_2048 2048
diff --git a/drivers/net/i40e/i40e_rxtx_vec_altivec.c b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
index 6862a017e..345c63aa7 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_altivec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_altivec.c
@@ -188,11 +188,13 @@ desc_to_ptype_v(vector unsigned long descs[4], struct rte_mbuf **rx_pkts,
ptype_tbl[(*(vector unsigned char *)&ptype1)[8]];
}
- /* Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
+ */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts, uint8_t *split_packet)
@@ -214,9 +216,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
};
vector unsigned long dd_check, eop_check;
- /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
@@ -447,11 +446,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return nb_pkts_recd;
}
- /* Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
uint16_t
i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -459,19 +453,19 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
+ */
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct i40e_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+ /* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+ nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
/* get some new buffers */
uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
split_flags);
@@ -500,6 +494,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+ uint16_t burst;
+
+ burst = i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_I40E_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_I40E_VPMD_RX_BURST)
+ return retval;
+ }
+
+ return retval + i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef1363..b5e6867d0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -36,7 +36,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
rxq->nb_rx_desc) {
__m128i dma_addr0;
dma_addr0 = _mm_setzero_si128();
- for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+ for (i = 0; i < RTE_I40E_DESCS_PER_LOOP_AVX; i++) {
rxep[i].mbuf = &rxq->fake_mbuf;
_mm_store_si128((__m128i *)&rxdp[i].read,
dma_addr0);
@@ -219,13 +219,18 @@ desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
#define PKTLEN_SHIFT 10
-/* Force inline as some compilers will not inline by default. */
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP_AVX power-of-two
+ * - force inline as some compilers will not inline by default
+ */
static __rte_always_inline uint16_t
_recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts, uint8_t *split_packet)
{
-#define RTE_I40E_DESCS_PER_LOOP_AVX 8
-
const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
0, rxq->mbuf_initializer);
@@ -729,10 +734,6 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return received;
}
-/*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- */
uint16_t
i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -740,10 +741,8 @@ i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
}
-/*
+/**
* vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
*/
static uint16_t
i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -752,6 +751,9 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
struct i40e_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+ /* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+ nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
/* get some new buffers */
uint16_t nb_bufs = _recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
split_flags);
@@ -781,11 +783,8 @@ i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
-/*
+/**
* vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
*/
uint16_t
i40e_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 6f874e45b..143cdf4a5 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -187,11 +187,12 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
}
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
* Notice:
* - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
*/
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
@@ -230,9 +231,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
0, 0, 0 /* ignore non-length fields */
};
- /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
@@ -426,12 +424,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
return nb_pkts_recd;
}
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
uint16_t
i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
@@ -439,20 +431,20 @@ i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
*/
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct i40e_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+ /* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+ nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
/* get some new buffers */
uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
split_flags);
@@ -482,6 +474,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+ uint16_t burst;
+
+ burst = i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_I40E_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_I40E_VPMD_RX_BURST)
+ return retval;
+ }
+
+ return retval + i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 698518349..605912246 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -342,11 +342,12 @@ desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
}
- /*
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_I40E_DESCS_PER_LOOP)
+ *
* Notice:
* - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+ * - floor align nb_pkts to a RTE_I40E_DESCS_PER_LOOP power-of-two
*/
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -378,9 +379,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
__m128i dd_check, eop_check;
- /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
- nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
-
/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
@@ -592,12 +590,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
return nb_pkts_recd;
}
- /*
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
- */
uint16_t
i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts)
@@ -605,20 +597,20 @@ i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
- /* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
- * numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
*/
-uint16_t
-i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
+static uint16_t
+i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
{
struct i40e_rx_queue *rxq = rx_queue;
uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+ /* split_flags only can support max of RTE_I40E_VPMD_RX_BURST */
+ nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_VPMD_RX_BURST);
+
/* get some new buffers */
uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
split_flags);
@@ -648,6 +640,32 @@ i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
&split_flags[i]);
}
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
+ uint16_t burst;
+
+ burst = i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ RTE_I40E_VPMD_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < RTE_I40E_VPMD_RX_BURST)
+ return retval;
+ }
+
+ return retval + i40e_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval,
+ nb_pkts);
+}
+
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
struct rte_mbuf *pkt, uint64_t flags)
--
2.20.1
More information about the dev
mailing list