[dpdk-dev] [PATCH 2/5] net/mlx4: support multi-segments Tx

Moti Haimovsky motih at mellanox.com
Thu Aug 24 17:54:07 CEST 2017


The PRM-based Tx datapath now supports transmitting packets that span
an arbitrary number of buffers.

Signed-off-by: Moti Haimovsky <motih at mellanox.com>
---
 drivers/net/mlx4/mlx4_prm.h  |  16 +---
 drivers/net/mlx4/mlx4_rxtx.c | 213 +++++++++++++++++++++++++++++++------------
 drivers/net/mlx4/mlx4_rxtx.h |   3 +-
 drivers/net/mlx4/mlx4_txq.c  |  12 ++-
 4 files changed, 170 insertions(+), 74 deletions(-)
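
For context, here is a minimal stand-alone sketch of what this patch
enables: handing mlx4_tx_burst() a chained, multi-segment mbuf. The
mempool "mp" and port "port_id" are hypothetical names assumed to be
already initialized; this is an illustration, not part of the patch.

#include <rte_mbuf.h>
#include <rte_ethdev.h>

/* Build and send a packet split across two mbuf segments. Before this
 * patch mlx4_tx_burst() handled single-segment packets only; with it,
 * each segment becomes one data segment (SGE) of the Tx WQE. */
static int
send_two_segment_packet(struct rte_mempool *mp, uint16_t port_id)
{
	struct rte_mbuf *hdr = rte_pktmbuf_alloc(mp);
	struct rte_mbuf *payload = rte_pktmbuf_alloc(mp);

	if (hdr == NULL || payload == NULL)
		goto err;
	rte_pktmbuf_append(hdr, 64);      /* e.g. headers in segment 1 */
	rte_pktmbuf_append(payload, 512); /* payload in segment 2 */
	if (rte_pktmbuf_chain(hdr, payload) != 0)
		goto err;
	payload = NULL; /* owned by the chain from now on */
	if (rte_eth_tx_burst(port_id, 0, &hdr, 1) != 1)
		goto err;
	return 0;
err:
	rte_pktmbuf_free(hdr); /* frees the whole chain, NULL-safe */
	rte_pktmbuf_free(payload);
	return -1;
}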

diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index c5ce33b..8b0248a 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -61,7 +61,7 @@
 #define MLX4_OPCODE_SEND	0x0a
 #define MLX4_EN_BIT_WQE_OWN	0x80000000
 
-#define SIZE_TO_TXBBS(size)     (RTE_ALIGN((size), (TXBB_SIZE)) / (TXBB_SIZE))
+#define SIZE_TO_TXBBS(size)	(RTE_ALIGN((size), (TXBB_SIZE)) / (TXBB_SIZE))
 
 /**
 * Update the HW with the new CQ consumer value.
@@ -148,6 +148,7 @@
 
 /**
  * Fills the ctrl segment of a WQE with info needed for transmitting the packet.
+ * The owner field is filled in later by the caller.
  *
  * @param seg
  *   Pointer to the control structure in the WQE.
@@ -161,8 +162,8 @@
 *   Immediate data/Invalidation key.
  */
 static inline void
-mlx4_set_ctrl_seg(struct mlx4_wqe_ctrl_seg *seg, uint32_t owner,
-	     uint8_t fence_size, uint32_t srcrb_flags, uint32_t imm)
+mlx4_set_ctrl_seg(struct mlx4_wqe_ctrl_seg *seg, uint8_t fence_size,
+		  uint32_t srcrb_flags, uint32_t imm)
 {
 	seg->fence_size = fence_size;
 	seg->srcrb_flags = rte_cpu_to_be_32(srcrb_flags);
@@ -173,13 +174,6 @@
 	 * For the IBV_WR_SEND_WITH_INV, it should be htobe32(imm).
 	 */
 	seg->imm = imm;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	seg->owner_opcode = rte_cpu_to_be_32(owner);
 }
 
 /**
@@ -241,7 +235,7 @@
  *   The number of data-segments the WQE contains.
  *
  * @return
- *   WQE size in bytes.
+ *   The calculated WQE size in bytes.
  */
 static inline int
 mlx4_wqe_calc_real_size(unsigned int count)
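
The net effect of the mlx4_set_ctrl_seg() change above is that the
write barrier and the ownership-bit store move out of this helper and
into its caller: a WQE assembled in a bounce buffer must be copied into
the SQ before HW may see it. A minimal stand-alone sketch of that
pattern follows; ctrl_seg_sketch is a simplified stand-in for the real
mlx4_wqe_ctrl_seg, not the driver's actual layout.

#include <stdint.h>
#include <rte_atomic.h>     /* rte_wmb() */
#include <rte_byteorder.h>  /* rte_cpu_to_be_32() */

/* Simplified control segment; HW polls owner_opcode for ownership. */
struct ctrl_seg_sketch {
	uint32_t owner_opcode;
	uint32_t srcrb_flags;
	uint32_t imm;
};

/* Fill everything except the ownership word, then publish it last. */
static void
post_ctrl_sketch(struct ctrl_seg_sketch *ctrl, uint32_t owner_opcode)
{
	ctrl->srcrb_flags = rte_cpu_to_be_32(0);
	ctrl->imm = 0;
	/*
	 * Make sure the descriptor is fully written before setting the
	 * ownership bit, because HW can start executing as soon as we do.
	 */
	rte_wmb();
	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
}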
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 0720e34..e41ea9e 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -309,6 +309,101 @@
 }
 
 /**
+ * Copy a WQE written in the bounce buffer back to the SQ.
+ * This routine is used when a WQE wraps around the end of the SQ and
+ * therefore needs special attention. Note that the WQE is written back
+ * to the SQ in reverse order.
+ *
+ * @param txq
+ *   Pointer to mlx4 Tx queue structure.
+ * @param index
+ *   First SQ TXBB index for this WQE.
+ * @param desc_size
+ *   TXBB-aligned size of the WQE.
+ *
+ * @return
+ *   A pointer to the control segment of this WQE in the SQ.
+ */
+static struct mlx4_wqe_ctrl_seg *
+mlx4_bounce_to_desc(struct txq *txq,
+		    uint32_t index,
+		    unsigned int desc_size)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t copy = (sq->txbb_cnt - index) * TXBB_SIZE;
+	int i;
+
+	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			rte_wmb();
+		*((uint32_t *)(sq->buf + i)) =
+			*((uint32_t *)(txq->bounce_buf + copy + i));
+	}
+	for (i = copy - 4; i >= 4; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			rte_wmb();
+		*((uint32_t *)(sq->buf + index * TXBB_SIZE + i)) =
+		*((uint32_t *)(txq->bounce_buf + i));
+	}
+	/* Return real descriptor location */
+	return (struct mlx4_wqe_ctrl_seg *)(sq->buf + index * TXBB_SIZE);
+}
+
+/**
+ * Handle address translation of scattered buffers for mlx4_tx_burst().
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] buf
+ *   Buffer (packet) to process.
+ * @param[out] sges
+ *   Array filled with SGEs on success.
+ * @param segs
+ *   Number of segments in buf.
+ *
+ * @return
+ *   0 on success, -1 in case of failure.
+ */
+static inline int
+mlx4_tx_sg_virt_to_lkey(struct txq *txq, struct rte_mbuf *buf,
+			struct ibv_sge *sges, unsigned int segs)
+{
+	unsigned int j;
+
+	/* Register segments as SGEs. */
+	for (j = 0; (j != segs); ++j) {
+		struct ibv_sge *sge = &sges[j];
+		uint32_t lkey;
+
+		/* Retrieve Memory Region key for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+		if (unlikely(lkey == (uint32_t)-1)) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			goto stop;
+		}
+		/* Update SGE. */
+		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
+		if (txq->priv->vf)
+			rte_prefetch0((volatile void *)
+				      (uintptr_t)sge->addr);
+		sge->length = buf->data_len;
+		sge->lkey = lkey;
+		buf = buf->next;
+	}
+	return 0;
+stop:
+	return -1;
+}
+
+/**
 * Posts a single work request to a send queue.
  *
  * @param txq
@@ -323,36 +418,53 @@
  */
 static int
 mlx4_post_send(struct txq *txq,
+	       struct rte_mbuf *pkt,
 	       struct ibv_send_wr *wr,
 	       struct ibv_send_wr **bad_wr)
 {
 	struct mlx4_wqe_ctrl_seg *ctrl;
 	struct mlx4_wqe_data_seg *dseg;
 	struct mlx4_sq *sq = &txq->msq;
+	struct ibv_sge sge[wr->num_sge];
 	uint32_t srcrb_flags;
 	uint8_t fence_size;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 	uint32_t owner_opcode;
-	int wqe_real_size, nr_txbbs;
+	int wqe_real_size, wqe_size, nr_txbbs, i;
+	bool bounce = false;
 
-	/* for now we support pkts with one buf only */
-	if (wr->num_sge != 1)
+	if (unlikely(mlx4_tx_sg_virt_to_lkey(txq, pkt, sge, wr->num_sge)))
 		goto err;
+	wr->sg_list = sge;
 	/* Calculate the needed WQE size for this packet. */
 	wqe_real_size = mlx4_wqe_calc_real_size(wr->num_sge);
 	if (unlikely(!wqe_real_size))
 		goto err;
+	wqe_size = RTE_ALIGN(wqe_real_size, TXBB_SIZE);
 	nr_txbbs = SIZE_TO_TXBBS(wqe_real_size);
 	/* Are we too big to handle? */
 	if (unlikely(mlx4_wq_overflow(sq, nr_txbbs)))
 		goto err;
-	/* Get ctrl and single-data wqe entries */
-	ctrl = mlx4_get_send_wqe(sq, head_idx);
+	/* Get ctrl entry */
+	if (likely(head_idx + nr_txbbs <= sq->txbb_cnt)) {
+		ctrl = mlx4_get_send_wqe(sq, head_idx);
+	} else {
+		/* Handle the case in which the WQE wraps around the SQ by
+		 * writing it to a side buffer and copying it back to the
+		 * SQ when done.
+		 */
+		ctrl = (struct mlx4_wqe_ctrl_seg *)txq->bounce_buf;
+		bounce = true;
+	}
+	/* Get data-seg entry */
 	dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) +
 		sizeof(struct mlx4_wqe_ctrl_seg));
-	mlx4_set_data_seg(dseg, wr->sg_list);
-	/* For raw eth, the SOLICIT flag is used
-	 * to indicate that no icrc should be calculated
+	/* Fill in data segments from last to first. */
+	for (i = wr->num_sge - 1; i >= 0; --i)
+		mlx4_set_data_seg(dseg + i, wr->sg_list + i);
+	/* Handle control info.
+	 *
+	 * For raw Ethernet, the SOLICIT flag is used to indicate that
+	 * no iCRC should be calculated.
 	 */
 	srcrb_flags = MLX4_WQE_CTRL_SOLICIT |
 		      ((wr->send_flags & IBV_SEND_SIGNALED) ?
@@ -361,7 +473,19 @@
 		MLX4_WQE_CTRL_FENCE : 0) | ((wqe_real_size / 16) & 0x3f);
 	owner_opcode = MLX4_OPCODE_SEND |
 		       ((sq->head & sq->txbb_cnt) ? MLX4_EN_BIT_WQE_OWN : 0);
-	mlx4_set_ctrl_seg(ctrl, owner_opcode, fence_size, srcrb_flags, 0);
+	/* Fill in control info, except for the ownership bit. */
+	mlx4_set_ctrl_seg(ctrl, fence_size, srcrb_flags, 0);
+	/* If we used a bounce buffer then copy the WQE back into the SQ. */
+	if (unlikely(bounce))
+		ctrl = mlx4_bounce_to_desc(txq, head_idx, wqe_size);
+	/*
+	 * Make sure descriptor is fully written before
+	 * setting ownership bit (because HW can start
+	 * executing as soon as we do).
+	 */
+	rte_wmb();
+	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
+
 	sq->head += nr_txbbs;
 	rte_wmb();
 	return 0;
@@ -439,62 +563,31 @@
 		/* Request Tx completion. */
 		if (unlikely(--elts_comp_cd == 0)) {
 			elts_comp_cd = txq->elts_comp_cd_init;
-			++elts_comp;
 			send_flags |= IBV_SEND_SIGNALED;
 		}
-		if (likely(segs == 1)) {
-			struct ibv_sge *sge = &elt->sge;
-			uintptr_t addr;
-			uint32_t length;
-			uint32_t lkey;
-
-			/* Retrieve buffer information. */
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			length = buf->data_len;
-			/* Retrieve memory region key for this memory pool. */
-			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up Tx element. */
-				elt->buf = NULL;
-				goto stop;
-			}
-			if (buf->pkt_len <= txq->max_inline)
-				send_flags |= IBV_SEND_INLINE;
-			/* Update element. */
-			elt->buf = buf;
-			if (txq->priv->vf)
-				rte_prefetch0((volatile void *)
-					      (uintptr_t)addr);
+		if (buf->pkt_len <= txq->max_inline)
+			send_flags |= IBV_SEND_INLINE;
+		/* Update element. */
+		elt->buf = buf;
+		if (txq->priv->vf)
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-			sge->addr = addr;
-			sge->length = length;
-			sge->lkey = lkey;
-			sent_size += length;
-			/* Set up WR. */
-			wr->sg_list  = sge;
-			wr->num_sge  = segs;
-			wr->opcode   = IBV_WR_SEND;
-			wr->send_flags = send_flags;
-			wr->next     = NULL;
-			/* post the pkt for sending */
-			err = mlx4_post_send(txq, wr, &wr_bad);
-			if (unlikely(err)) {
-				if (unlikely(wr_bad->send_flags &
-					     IBV_SEND_SIGNALED)) {
-					elts_comp_cd = 1;
-					--elts_comp;
-				}
-				elt->buf = NULL;
-				goto stop;
-			}
-			sent_size += length;
-		} else {
-			err = -1;
+		/* Set up WR. */
+		wr->sg_list  = NULL; /* handled in mlx4_post_send() */
+		wr->num_sge  = segs;
+		wr->opcode   = IBV_WR_SEND;
+		wr->send_flags = send_flags;
+		wr->next     = NULL;
+		/* post the pkt for sending */
+		err = mlx4_post_send(txq, buf, wr, &wr_bad);
+		if (unlikely(err)) {
+			if (unlikely(wr_bad->send_flags &
+				     IBV_SEND_SIGNALED))
+				elts_comp_cd = 1;
+			elt->buf = NULL;
 			goto stop;
 		}
+		++elts_comp;
+		sent_size += buf->pkt_len;
 		elts_head = elts_head_next;
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += sent_size;
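
To make the backward copy in mlx4_bounce_to_desc() above concrete, here
is a small stand-alone sketch of its split arithmetic; all sizes are
hypothetical.

#include <stdio.h>

#define TXBB_SIZE 64 /* basic SQ block size, as in mlx4_prm.h */

/* "copy" bytes of the WQE fit between "index" and the end of the SQ;
 * the remainder wraps to the start of the ring. The actual copy runs
 * backward, one dword at a time, and skips the first dword
 * (owner_opcode), which the caller writes only after rte_wmb(). */
int main(void)
{
	unsigned int txbb_cnt = 256;            /* SQ size in TXBBs */
	unsigned int index = 254;               /* first TXBB of this WQE */
	unsigned int desc_size = 3 * TXBB_SIZE; /* TXBB-aligned WQE size */
	unsigned int copy = (txbb_cnt - index) * TXBB_SIZE;

	printf("WQE head: %u bytes at SQ offset %u\n",
	       copy, index * TXBB_SIZE);
	printf("WQE tail: %u bytes wrapped to SQ offset 0\n",
	       desc_size - copy);
	return 0;
}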
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index e442730..7cae7e2 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -139,13 +139,14 @@ struct txq {
 	struct txq_elt (*elts)[]; /**< Tx elements. */
 	unsigned int elts_head; /**< Current index in (*elts)[]. */
 	unsigned int elts_tail; /**< First element awaiting completion. */
-	unsigned int elts_comp; /**< Number of completion requests. */
+	unsigned int elts_comp; /**< Number of packets awaiting completion. */
 	unsigned int elts_comp_cd; /**< Countdown for next completion. */
 	unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
 	struct mlx4_txq_stats stats; /**< Tx queue counters. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 	struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */
 	struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
+	char *bounce_buf; /**< Side buffer used when a WQE wraps around the SQ. */
 };
 
 /* mlx4_rxq.c */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 1273738..6f6ea9c 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -83,8 +83,14 @@
 		rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
 	int ret = 0;
 
-	if (elts == NULL) {
-		ERROR("%p: can't allocate packets array", (void *)txq);
+	/* Allocate bounce buffer memory. */
+	txq->bounce_buf = rte_zmalloc_socket("TXQ",
+					     MAX_WQE_SIZE,
+					     RTE_CACHE_LINE_MIN_SIZE,
+					     txq->socket);
+
+	if ((elts == NULL) || (txq->bounce_buf == NULL)) {
+		ERROR("%p: can't allocate TXQ memory", (void *)txq);
 		ret = ENOMEM;
 		goto error;
 	}
@@ -110,6 +116,8 @@
 	assert(ret == 0);
 	return 0;
 error:
+	if (txq->bounce_buf != NULL)
+		rte_free(txq->bounce_buf);
 	if (elts != NULL)
 		rte_free(elts);
 	DEBUG("%p: failed, freed everything", (void *)txq);
-- 
1.8.3.1