[dpdk-dev] [PATCH v3 1/2] net/mlx5: add enhanced multi-packet send for ConnectX-5

Yongseok Koh yskoh at mellanox.com
Thu Mar 16 00:55:44 CET 2017


ConnectX-5 supports an enhanced version of multi-packet send (MPS). An MPS
Tx descriptor can carry multiple packets either by including pointers to
packets or by inlining packet data. Inlining packet data helps to better
utilize PCIe bandwidth. In addition, Enhanced MPS supports a hybrid mode,
mixing inlined packets and pointers within a single descriptor. This
feature is enabled by default if supported by HW.

Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
---
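As a usage illustration (the PCI address and the values below are
placeholders, not recommendations), the new devargs are passed alongside the
existing txq_mpw_en parameter in the EAL PCI whitelist option, e.g.:

  testpmd -w 0000:83:00.0,txq_mpw_en=1,txq_mpw_hdr_dseg_en=1,txq_max_inline_len=256 -- -i

txq_mpw_hdr_dseg_en allows data segments in the title WQEBB and
txq_max_inline_len caps the packet size eligible for inlining; both take
effect only when Enhanced MPW is selected.
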
 drivers/net/mlx5/mlx5.c        |  37 +++-
 drivers/net/mlx5/mlx5.h        |   4 +-
 drivers/net/mlx5/mlx5_defs.h   |   7 +
 drivers/net/mlx5/mlx5_ethdev.c |   6 +-
 drivers/net/mlx5/mlx5_prm.h    |  20 ++
 drivers/net/mlx5/mlx5_rxtx.c   | 410 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h   |   7 +-
 drivers/net/mlx5/mlx5_txq.c    |  29 ++-
 8 files changed, 507 insertions(+), 13 deletions(-)
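
For reference, below is a minimal standalone sketch of how
MLX5_TX_COMP_THRESH_INLINE_DIV interacts with the CQ size and the completion
interval of mlx5_tx_burst_empw(); the sample values (desc = 512, wqe_n = 9)
are chosen only for illustration:

#include <stdio.h>

#define MLX5_TX_COMP_THRESH 32
#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3)

int main(void)
{
	unsigned int desc = 512; /* Number of Tx descriptors requested. */
	unsigned int wqe_n = 9;  /* log2 of the number of WQEBBs in the WQ. */
	unsigned int cqe_n;

	/* Same expression as in txq_ctrl_setup(): one CQE per completion
	 * threshold, plus extra room for Enhanced MPW completions.
	 */
	cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ?
		((desc / MLX5_TX_COMP_THRESH) - 1) : 1;
	cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
	/* mlx5_tx_burst_empw() also requests a completion once the WQEBBs
	 * consumed since the last request exceed 1/8 of the WQ.
	 */
	printf("cqe_n = %u, completion every %u WQEBBs of inlined data\n",
	       cqe_n, (1u << wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV);
	return 0;
}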

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index ebc7984c8..bc6a34fcb 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -84,6 +84,12 @@
 /* Device parameter to enable multi-packet send WQEs. */
 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
 
+/* Device parameter to include 2 dsegs in the title WQEBB. */
+#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
+
+/* Device parameter to limit the maximum packet size for inlining. */
+#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
+
 /* Device parameter to enable hardware TSO offload. */
 #define MLX5_TSO "tso"
 
@@ -294,7 +300,11 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		priv->txqs_inline = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
-		priv->mps &= !!tmp; /* Enable MPW only if HW supports */
+		priv->mps = !!tmp ? priv->mps : MLX5_MPW_DISABLED;
+	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
+		priv->mpw_hdr_dseg = !!tmp;
+	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
+		priv->inline_max_packet_sz = tmp;
 	} else if (strcmp(MLX5_TSO, key) == 0) {
 		priv->tso = !!tmp;
 	} else {
@@ -323,6 +333,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
 		MLX5_TXQ_INLINE,
 		MLX5_TXQS_MIN_INLINE,
 		MLX5_TXQ_MPW_EN,
+		MLX5_TXQ_MPW_HDR_DSEG_EN,
+		MLX5_TXQ_MAX_INLINE_LEN,
 		MLX5_TSO,
 		NULL,
 	};
@@ -434,24 +446,27 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		switch (pci_dev->id.device_id) {
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
 			tunnel_en = 1;
-			mps = 0;
+			mps = MLX5_MPW_DISABLED;
 			break;
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
+			mps = MLX5_MPW;
+			break;
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
-			mps = 1;
 			tunnel_en = 1;
+			mps = MLX5_MPW_ENHANCED;
 			break;
 		default:
-			mps = 0;
+			mps = MLX5_MPW_DISABLED;
 		}
 		INFO("PCI information matches, using device \"%s\""
-		     " (SR-IOV: %s, MPS: %s)",
+		     " (SR-IOV: %s, %sMPS: %s)",
 		     list[i]->name,
 		     sriov ? "true" : "false",
-		     mps ? "true" : "false");
+		     mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
+		     mps != MLX5_MPW_DISABLED ? "true" : "false");
 		attr_ctx = ibv_open_device(list[i]);
 		err = errno;
 		break;
@@ -546,6 +561,13 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
 		priv->mps = mps; /* Enable MPW by default if supported. */
+		/* Set default values for Enhanced MPW, a.k.a MPWv2. */
+		if (mps == MLX5_MPW_ENHANCED) {
+			priv->mpw_hdr_dseg = 0;
+			priv->txqs_inline = MLX5_EMPW_MIN_TXQS;
+			priv->inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN;
+			priv->txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE;
+		}
 		priv->cqe_comp = 1; /* Enable compression by default. */
 		priv->tunnel_en = tunnel_en;
 		err = mlx5_args(priv, pci_dev->device.devargs);
@@ -613,6 +635,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			      "with TSO. MPS disabled");
 			priv->mps = 0;
 		}
+		INFO("%sMPS is %s",
+		     priv->mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
+		     priv->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
 		/* Allocate and register default RSS hash keys. */
 		priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
 					    sizeof((*priv->rss_conf)[0]), 0);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 870e01ff5..d26d46542 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -123,7 +123,8 @@ struct priv {
 	unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */
 	unsigned int hw_padding:1; /* End alignment padding is supported. */
 	unsigned int sriov:1; /* This is a VF or PF with VF devices. */
-	unsigned int mps:1; /* Whether multi-packet send is supported. */
+	unsigned int mps:2; /* Multi-packet send mode (0: disabled). */
+	unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
 	unsigned int pending_alarm:1; /* An alarm is pending. */
 	unsigned int tso:1; /* Whether TSO is supported. */
@@ -132,6 +133,7 @@ struct priv {
 	unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */
 	unsigned int txq_inline; /* Maximum packet size for inlining. */
 	unsigned int txqs_inline; /* Queue number threshold for inlining. */
+	unsigned int inline_max_packet_sz; /* Max packet size for inlining. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index eecb908ec..201bb3362 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -55,6 +55,13 @@
 #define MLX5_TX_COMP_THRESH 32
 
 /*
+ * Request TX completion every time the total number of WQEBBs used for inlining
+ * packets exceeds the size of the WQ divided by this divisor. It should be a
+ * power of two for performance.
+ */
+#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3)
+
+/*
  * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
  * from which buffers are to be transmitted will have to be mapped by this
  * driver to their own Memory Region (MR). This is a slow operation.
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 5deb6e841..dd5fe5c1f 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1590,7 +1590,11 @@ priv_select_tx_function(struct priv *priv)
 {
 	priv->dev->tx_pkt_burst = mlx5_tx_burst;
 	/* Select appropriate TX function. */
-	if (priv->mps && priv->txq_inline) {
+	if (priv->mps == MLX5_MPW_ENHANCED) {
+		priv->dev->tx_pkt_burst =
+			mlx5_tx_burst_empw;
+		DEBUG("selected Enhanced MPW TX function");
+	} else if (priv->mps && priv->txq_inline) {
 		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
 		DEBUG("selected MPW inline TX function");
 	} else if (priv->mps) {
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 0a77f5be8..155bdbad4 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -73,6 +73,9 @@
 /* WQE size */
 #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
 
+/* Max size of a WQE session. */
+#define MLX5_WQE_SIZE_MAX 960U
+
 /* Compute the number of DS. */
 #define MLX5_WQE_DS(n) \
 	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
@@ -80,10 +83,19 @@
 /* Room for inline data in multi-packet WQE. */
 #define MLX5_MWQE64_INL_DATA 28
 
+/* Default minimum number of Tx queues for inlining packets. */
+#define MLX5_EMPW_MIN_TXQS 8
+
+/* Default max packet length to be inlined. */
+#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
+
 #ifndef HAVE_VERBS_MLX5_OPCODE_TSO
 #define MLX5_OPCODE_TSO MLX5_OPCODE_LSO_MPW /* Compat with OFED 3.3. */
 #endif
 
+#define MLX5_OPC_MOD_ENHANCED_MPSW 0
+#define MLX5_OPCODE_ENHANCED_MPSW 0x29
+
 /* CQE value to inform that VLAN is stripped. */
 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
 
@@ -176,10 +188,18 @@ struct mlx5_wqe64 {
 	uint8_t raw[32];
 } __rte_aligned(MLX5_WQE_SIZE);
 
+/* MPW mode. */
+enum mlx5_mpw_mode {
+	MLX5_MPW_DISABLED,
+	MLX5_MPW,
+	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */
+};
+
 /* MPW session status. */
 enum mlx5_mpw_state {
 	MLX5_MPW_STATE_OPENED,
 	MLX5_MPW_INL_STATE_OPENED,
+	MLX5_MPW_ENHANCED_STATE_OPENED,
 	MLX5_MPW_STATE_CLOSED,
 };
 
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index cf3abd3cc..823aac8f2 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -195,6 +195,62 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci)
 }
 
 /**
+ * Return the size of the WQ tailroom.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param addr
+ *   Pointer to tail of WQ.
+ *
+ * @return
+ *   Size of tailroom.
+ */
+static inline size_t
+tx_mlx5_wq_tailroom(struct txq *txq, void *addr)
+{
+	size_t tailroom;
+	tailroom = (uintptr_t)(txq->wqes) +
+		   (1 << txq->wqe_n) * MLX5_WQE_SIZE -
+		   (uintptr_t)addr;
+	return tailroom;
+}
+
+/**
+ * Copy data to the tailroom of a circular queue.
+ *
+ * @param dst
+ *   Pointer to destination.
+ * @param src
+ *   Pointer to source.
+ * @param n
+ *   Number of bytes to copy.
+ * @param base
+ *   Pointer to head of queue.
+ * @param tailroom
+ *   Size of tailroom from dst.
+ *
+ * @return
+ *   Pointer after copied data.
+ */
+static inline void *
+mlx5_copy_to_wq(void *dst, const void *src, size_t n,
+		void *base, size_t tailroom)
+{
+	void *ret;
+
+	if (n > tailroom) {
+		rte_memcpy(dst, src, tailroom);
+		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
+			   n - tailroom);
+		ret = (uint8_t *)base + n - tailroom;
+	} else {
+		rte_memcpy(dst, src, n);
+		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
+	}
+	return ret;
+}
+
+/**
  * Manage TX completions.
  *
  * When sending a burst, mlx5_tx_burst() posts several WRs.
@@ -1267,6 +1323,360 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 }
 
 /**
+ * Open an Enhanced MPW session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ * @param padding
+ *   Non-zero to pad the title WQEBB with a zero-length inline header.
+ */
+static inline void
+mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
+{
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
+
+	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
+	mpw->pkts_n = 0;
+	mpw->total_len = sizeof(struct mlx5_wqe);
+	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
+				  (txq->wqe_ci << 8) |
+				  MLX5_OPCODE_ENHANCED_MPSW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
+	if (unlikely(padding)) {
+		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
+
+		/* Pad the first 2 DWORDs with zero-length inline header. */
+		*(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
+		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
+			htonl(MLX5_INLINE_SEG);
+		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
+		/* Start from the next WQEBB. */
+		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
+	} else {
+		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
+	}
+}
+
+/**
+ * Close an Enhanced MPW session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ *
+ * @return
+ *   Number of consumed WQEs.
+ */
+static inline uint16_t
+mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+	uint16_t ret;
+
+	/* Store size in multiple of 16 bytes. Control and Ethernet segments
+	 * count as 2.
+	 */
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
+	mpw->state = MLX5_MPW_STATE_CLOSED;
+	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
+	txq->wqe_ci += ret;
+	return ret;
+}
+
+/**
+ * DPDK callback for TX with Enhanced MPW support.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	uint16_t elts_head = txq->elts_head;
+	const unsigned int elts_n = 1 << txq->elts_n;
+	unsigned int i = 0;
+	unsigned int j = 0;
+	unsigned int max_elts;
+	uint16_t max_wqe;
+	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
+	unsigned int mpw_room = 0;
+	unsigned int inl_pad = 0;
+	uint32_t inl_hdr;
+	struct mlx5_mpw mpw = {
+		.state = MLX5_MPW_STATE_CLOSED,
+	};
+
+	if (unlikely(!pkts_n))
+		return 0;
+	/* Start processing. */
+	txq_complete(txq);
+	max_elts = (elts_n - (elts_head - txq->elts_tail));
+	if (max_elts > elts_n)
+		max_elts -= elts_n;
+	/* A CQE slot must always be available. */
+	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
+	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+	if (unlikely(!max_wqe))
+		return 0;
+	do {
+		struct rte_mbuf *buf = *(pkts++);
+		unsigned int elts_head_next;
+		uintptr_t addr;
+		uint64_t naddr;
+		unsigned int n;
+		unsigned int do_inline = 0; /* Whether inline is possible. */
+		uint32_t length;
+		unsigned int segs_n = buf->nb_segs;
+		uint32_t cs_flags = 0;
+
+		/*
+		 * Make sure there is enough room to store this packet and
+		 * that one ring entry remains unused.
+		 */
+		assert(segs_n);
+		if (max_elts - j < segs_n + 1)
+			break;
+		/* Do not bother with large packets MPW cannot handle. */
+		if (segs_n > MLX5_MPW_DSEG_MAX)
+			break;
+		/* Should we enable HW CKSUM offload. */
+		if (buf->ol_flags &
+		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+		/* Retrieve packet information. */
+		length = PKT_LEN(buf);
+		/* Start new session if:
+		 * - multi-segment packet
+		 * - no space left even for a dseg
+		 * - the next packet is inline-eligible but does not fit
+		 * - cs_flags differ
+		 * The state can't be MLX5_MPW_STATE_OPENED here because a
+		 * legacy MPW session holds a single multi-segmented packet.
+		 */
+		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
+			if ((segs_n != 1) ||
+			    (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
+			      mpw_room) ||
+			    (length <= txq->inline_max_packet_sz &&
+			     inl_pad + sizeof(inl_hdr) + length >
+			      mpw_room) ||
+			    (mpw.wqe->eseg.cs_flags != cs_flags))
+				max_wqe -= mlx5_empw_close(txq, &mpw);
+		}
+		if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
+			if (unlikely(segs_n != 1)) {
+				/* Fall back to legacy MPW.
+				 * An MPW session consumes at most 2 WQEs to
+				 * include MLX5_MPW_DSEG_MAX pointers.
+				 */
+				if (unlikely(max_wqe < 2))
+					break;
+				mlx5_mpw_new(txq, &mpw, length);
+			} else {
+				/* In Enhanced MPW, inline as much as the budget
+				/* In Enhanced MPW, inline as much as the budget
+				 * allows. The remaining space is filled with
+				 * dsegs. If the title WQEBB isn't padded, it
+				 * will hold the first 2 dsegs.
+				 */
+				mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
+					    (max_inline ? max_inline :
+					     pkts_n * MLX5_WQE_DWORD_SIZE) +
+					    MLX5_WQE_SIZE);
+				if (unlikely(max_wqe * MLX5_WQE_SIZE <
+					      mpw_room))
+					break;
+				/* Don't pad the title WQEBB to not waste WQ. */
+				mlx5_empw_new(txq, &mpw, 0);
+				mpw_room -= mpw.total_len;
+				inl_pad = 0;
+				do_inline =
+					length <= txq->inline_max_packet_sz &&
+					sizeof(inl_hdr) + length <= mpw_room &&
+					!txq->mpw_hdr_dseg;
+			}
+			mpw.wqe->eseg.cs_flags = cs_flags;
+		} else {
+			/* Evaluate whether the next packet can be inlined.
+			 * Inlininig is possible when:
+			 * Inlining is possible when:
+			 * - length is less than the configured value
+			 * - length fits in the remaining space
+			 * - not required to fill the title WQEBB with dsegs
+			 */
+			do_inline =
+				length <= txq->inline_max_packet_sz &&
+				inl_pad + sizeof(inl_hdr) + length <=
+				 mpw_room &&
+				(!txq->mpw_hdr_dseg ||
+				 mpw.total_len >= MLX5_WQE_SIZE);
+		}
+		/* Multi-segment packets must be alone in their MPW. */
+		assert((segs_n == 1) || (mpw.pkts_n == 0));
+		if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) {
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+			length = 0;
+#endif
+			do {
+				volatile struct mlx5_wqe_data_seg *dseg;
+
+				elts_head_next =
+					(elts_head + 1) & (elts_n - 1);
+				assert(buf);
+				(*txq->elts)[elts_head] = buf;
+				dseg = mpw.data.dseg[mpw.pkts_n];
+				addr = rte_pktmbuf_mtod(buf, uintptr_t);
+				*dseg = (struct mlx5_wqe_data_seg){
+					.byte_count = htonl(DATA_LEN(buf)),
+					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+					.addr = htonll(addr),
+				};
+				elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+				length += DATA_LEN(buf);
+#endif
+				buf = buf->next;
+				++j;
+				++mpw.pkts_n;
+			} while (--segs_n);
+			/* A multi-segmented packet takes one MPW session.
+			 * TODO: Pack more multi-segmented packets if possible.
+			 */
+			mlx5_mpw_close(txq, &mpw);
+			if (mpw.pkts_n < 3)
+				max_wqe--;
+			else
+				max_wqe -= 2;
+		} else if (do_inline) {
+			/* Inline packet into WQE. */
+			unsigned int max;
+
+			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
+			assert(length == DATA_LEN(buf));
+			inl_hdr = htonl(length | MLX5_INLINE_SEG);
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			mpw.data.raw = (volatile void *)
+				((uintptr_t)mpw.data.raw + inl_pad);
+			max = tx_mlx5_wq_tailroom(txq,
+					(void *)(uintptr_t)mpw.data.raw);
+			/* Copy inline header. */
+			mpw.data.raw = (volatile void *)
+				mlx5_copy_to_wq(
+					  (void *)(uintptr_t)mpw.data.raw,
+					  &inl_hdr,
+					  sizeof(inl_hdr),
+					  (void *)(uintptr_t)txq->wqes,
+					  max);
+			max = tx_mlx5_wq_tailroom(txq,
+					(void *)(uintptr_t)mpw.data.raw);
+			/* Copy packet data. */
+			mpw.data.raw = (volatile void *)
+				mlx5_copy_to_wq(
+					  (void *)(uintptr_t)mpw.data.raw,
+					  (void *)addr,
+					  length,
+					  (void *)(uintptr_t)txq->wqes,
+					  max);
+			++mpw.pkts_n;
+			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
+			/* No need to get completion as the entire packet is
+			 * copied to WQ. Free the buf right away.
+			 */
+			elts_head_next = elts_head;
+			rte_pktmbuf_free_seg(buf);
+			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
+			/* Add pad in the next packet if any. */
+			inl_pad = (((uintptr_t)mpw.data.raw +
+					(MLX5_WQE_DWORD_SIZE - 1)) &
+					~(MLX5_WQE_DWORD_SIZE - 1)) -
+				  (uintptr_t)mpw.data.raw;
+		} else {
+			/* No inline. Load a dseg of packet pointer. */
+			volatile rte_v128u32_t *dseg;
+
+			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
+			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
+			assert(length == DATA_LEN(buf));
+			if (!tx_mlx5_wq_tailroom(txq,
+					(void *)((uintptr_t)mpw.data.raw
+						+ inl_pad)))
+				dseg = (volatile void *)txq->wqes;
+			else
+				dseg = (volatile void *)
+					((uintptr_t)mpw.data.raw +
+					 inl_pad);
+			elts_head_next = (elts_head + 1) & (elts_n - 1);
+			(*txq->elts)[elts_head] = buf;
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
+				rte_prefetch2((void *)(addr +
+						n * RTE_CACHE_LINE_SIZE));
+			naddr = htonll(addr);
+			*dseg = (rte_v128u32_t) {
+				htonl(length),
+				txq_mp2mr(txq, txq_mb2mp(buf)),
+				naddr,
+				naddr >> 32,
+			};
+			mpw.data.raw = (volatile void *)(dseg + 1);
+			mpw.total_len += (inl_pad + sizeof(*dseg));
+			++j;
+			++mpw.pkts_n;
+			mpw_room -= (inl_pad + sizeof(*dseg));
+			inl_pad = 0;
+		}
+		elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Increment sent bytes counter. */
+		txq->stats.obytes += length;
+#endif
+		++i;
+	} while (i < pkts_n);
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Check whether completion threshold has been reached. */
+	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
+			(uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
+			 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
+
+		/* Request completion on last WQE. */
+		wqe->ctrl[2] = htonl(8);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		wqe->ctrl[3] = elts_head;
+		txq->elts_comp = 0;
+		txq->mpw_comp = txq->wqe_ci;
+		txq->cq_pi++;
+	} else {
+		txq->elts_comp += j;
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += i;
+#endif
+	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
+		mlx5_empw_close(txq, &mpw);
+	else if (mpw.state == MLX5_MPW_STATE_OPENED)
+		mlx5_mpw_close(txq, &mpw);
+	/* Ring QP doorbell. */
+	mlx5_tx_dbrec(txq, mpw.wqe);
+	txq->elts_head = elts_head;
+	return i;
+}
+
+/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 0db810c39..4a4bd8402 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -248,17 +248,21 @@ struct txq {
 	uint16_t elts_head; /* Current index in (*elts)[]. */
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
+	uint16_t mpw_comp; /* WQ index since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
+	uint16_t cq_pi; /* Producer index for completion queue. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_pi; /* Producer index for work queue. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
 	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
-	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint16_t inline_en:1; /* When set inline is enabled. */
 	uint16_t tso_en:1; /* When set hardware TSO is enabled. */
 	uint16_t tunnel_en:1;
 	/* When set TX offload for tunneled packets are supported. */
+	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
+	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
+	uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
 	volatile void *wqes; /* Work queue (use volatile to write into). */
@@ -329,6 +333,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9d0c00f6d..bbfce756b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -266,6 +266,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		struct ibv_exp_cq_attr cq_attr;
 	} attr;
 	enum ibv_exp_query_intf_status status;
+	unsigned int cqe_n;
 	int ret = 0;
 
 	if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
@@ -276,6 +277,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 	(void)conf; /* Thresholds configuration (ignored). */
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl.txq.elts_n = log2above(desc);
+	if (priv->mps == MLX5_MPW_ENHANCED)
+		tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
 	/* MRs will be registered in mp2mr[] later. */
 	attr.rd = (struct ibv_exp_res_domain_init_attr){
 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -294,9 +297,12 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
 		.res_domain = tmpl.rd,
 	};
+	cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ?
+		((desc / MLX5_TX_COMP_THRESH) - 1) : 1;
+	if (priv->mps == MLX5_MPW_ENHANCED)
+		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
 	tmpl.cq = ibv_exp_create_cq(priv->ctx,
-				    (((desc / MLX5_TX_COMP_THRESH) - 1) ?
-				     ((desc / MLX5_TX_COMP_THRESH) - 1) : 1),
+				    cqe_n,
 				    NULL, NULL, 0, &attr.cq);
 	if (tmpl.cq == NULL) {
 		ret = ENOMEM;
@@ -340,9 +346,24 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		tmpl.txq.max_inline =
 			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
 			 RTE_CACHE_LINE_SIZE);
-		attr.init.cap.max_inline_data =
-			tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
 		tmpl.txq.inline_en = 1;
+		/* TSO and MPS can't be enabled concurrently. */
+		assert(!priv->tso || !priv->mps);
+		if (priv->mps == MLX5_MPW_ENHANCED) {
+			tmpl.txq.inline_max_packet_sz =
+				priv->inline_max_packet_sz;
+			/* To minimize the size of the data set, avoid
+			 * requesting too large a WQ.
+			 */
+			attr.init.cap.max_inline_data =
+				((RTE_MIN(priv->txq_inline,
+					  priv->inline_max_packet_sz) +
+				  (RTE_CACHE_LINE_SIZE - 1)) /
+				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
+		} else {
+			attr.init.cap.max_inline_data =
+				tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
+		}
 	}
 	if (priv->tso) {
 		uint16_t max_tso_inline = ((MLX5_MAX_TSO_HEADER +
-- 
2.11.0


