[dpdk-dev] [PATCH v2 42/51] net/mlx4: separate Rx/Tx functions

Adrien Mazarguil adrien.mazarguil at 6wind.com
Fri Sep 1 10:06:57 CEST 2017


This commit moves all data plane functions (Rx/Tx) to a separate file,
mlx4_rxtx.c, and adjusts header files accordingly.

Private functions are now prefixed with "mlx4_" to prevent them from
conflicting with their mlx5 PMD counterparts at link time.
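
As a purely illustrative sketch (hypothetical file names, not part of
this patch), two objects defining the same non-static symbol cannot be
linked into a single binary, which is exactly what would happen if both
PMDs exported an unprefixed txq_mp2mr():

  /* mlx4_side.c -- hypothetical stand-in for this PMD */
  unsigned int txq_mp2mr(void) { return 4; }

  /* mlx5_side.c -- hypothetical stand-in for the mlx5 PMD */
  unsigned int txq_mp2mr(void) { return 5; }

  /*
   * Linking both objects into one binary fails with
   * "multiple definition of `txq_mp2mr'"; the prefixed names
   * (mlx4_txq_mp2mr() vs mlx5_txq_mp2mr()) keep the symbols distinct.
   */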

No impact on functionality.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
---
 drivers/net/mlx4/Makefile    |   1 +
 drivers/net/mlx4/mlx4.c      | 484 +----------------------------------
 drivers/net/mlx4/mlx4.h      |   2 +
 drivers/net/mlx4/mlx4_rxtx.c | 524 ++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_rxtx.h |  12 +
 5 files changed, 545 insertions(+), 478 deletions(-)
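
Note (kept below "---" so git-am ignores it): the burst functions
declared in the new mlx4_rxtx.h remain plugged into the device the
usual way; a sketch of that wiring, assuming the queue setup code
elsewhere in mlx4.c assigns them as before:

  dev->tx_pkt_burst = mlx4_tx_burst;
  dev->rx_pkt_burst = mlx4_rx_burst;

The mlx4_tx_burst_removed()/mlx4_rx_burst_removed() dummies substitute
for them during unsafe control operations, as the mlx4.c hunks below
show.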

diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index f6e3001..8def32a 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -38,6 +38,7 @@ LIB = librte_pmd_mlx4.a
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
 
 # Basic CFLAGS.
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index ba06075..a409ec2 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -56,13 +56,11 @@
 #include <rte_mbuf.h>
 #include <rte_errno.h>
 #include <rte_mempool.h>
-#include <rte_prefetch.h>
 #include <rte_malloc.h>
 #include <rte_memory.h>
 #include <rte_flow.h>
 #include <rte_kvargs.h>
 #include <rte_interrupts.h>
-#include <rte_branch_prediction.h>
 #include <rte_common.h>
 
 /* Generated configuration header. */
@@ -505,9 +503,6 @@ mlx4_dev_configure(struct rte_eth_dev *dev)
 	return 0;
 }
 
-static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
-static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
-
 /* TX queues handling. */
 
 /**
@@ -630,53 +625,6 @@ txq_cleanup(struct txq *txq)
 	memset(txq, 0, sizeof(*txq));
 }
 
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx4_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   0 on success, -1 on failure.
- */
-static int
-txq_complete(struct txq *txq)
-{
-	unsigned int elts_comp = txq->elts_comp;
-	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
-	struct ibv_wc wcs[elts_comp];
-	int wcs_n;
-
-	if (unlikely(elts_comp == 0))
-		return 0;
-	wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
-	if (unlikely(wcs_n == 0))
-		return 0;
-	if (unlikely(wcs_n < 0)) {
-		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
-		      (void *)txq, wcs_n);
-		return -1;
-	}
-	elts_comp -= wcs_n;
-	assert(elts_comp <= txq->elts_comp);
-	/*
-	 * Assume WC status is successful as nothing can be done about it
-	 * anyway.
-	 */
-	elts_tail += wcs_n * txq->elts_comp_cd_init;
-	if (elts_tail >= elts_n)
-		elts_tail -= elts_n;
-	txq->elts_tail = elts_tail;
-	txq->elts_comp = elts_comp;
-	return 0;
-}
-
 struct mlx4_check_mempool_data {
 	int ret;
 	char *start;
@@ -738,10 +686,6 @@ static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start,
 	return data.ret;
 }
 
-/* For best performance, this function should not be inlined. */
-static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
-	__rte_noinline;
-
 /**
  * Register mempool as a memory region.
  *
@@ -753,7 +697,7 @@ static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
  * @return
  *   Memory region pointer, NULL in case of error and rte_errno is set.
  */
-static struct ibv_mr *
+struct ibv_mr *
 mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
 {
 	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
@@ -794,81 +738,6 @@ mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
 	return mr;
 }
 
-/**
- * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- *   Pointer to mbuf.
- *
- * @return
- *   Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-txq_mb2mp(struct rte_mbuf *buf)
-{
-	if (unlikely(RTE_MBUF_INDIRECT(buf)))
-		return rte_mbuf_from_indirect(buf)->pool;
-	return buf->pool;
-}
-
-/**
- * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] mp
- *   Memory Pool for which a Memory Region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-static uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-	unsigned int i;
-	struct ibv_mr *mr;
-
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	/* Add a new entry, register MR first. */
-	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
-	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-		      (void *)txq);
-		return (uint32_t)-1;
-	}
-	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-		      (void *)txq);
-		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq->mp2mr[i].mp = mp;
-	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
-	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
-}
-
 struct txq_mp2mr_mbuf_check_data {
 	int ret;
 };
@@ -923,172 +792,7 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
 	if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
 			data.ret == -1)
 		return;
-	txq_mp2mr(txq, mp);
-}
-
-/**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct txq *txq = (struct txq *)dpdk_txq;
-	struct ibv_send_wr *wr_head = NULL;
-	struct ibv_send_wr **wr_next = &wr_head;
-	struct ibv_send_wr *wr_bad = NULL;
-	unsigned int elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp_cd = txq->elts_comp_cd;
-	unsigned int elts_comp = 0;
-	unsigned int i;
-	unsigned int max;
-	int err;
-
-	assert(elts_comp_cd != 0);
-	txq_complete(txq);
-	max = (elts_n - (elts_head - txq->elts_tail));
-	if (max > elts_n)
-		max -= elts_n;
-	assert(max >= 1);
-	assert(max <= elts_n);
-	/* Always leave one free entry in the ring. */
-	--max;
-	if (max == 0)
-		return 0;
-	if (max > pkts_n)
-		max = pkts_n;
-	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
-		unsigned int elts_head_next =
-			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
-		struct txq_elt *elt = &(*txq->elts)[elts_head];
-		struct ibv_send_wr *wr = &elt->wr;
-		unsigned int segs = buf->nb_segs;
-		unsigned int sent_size = 0;
-		uint32_t send_flags = 0;
-
-		/* Clean up old buffer. */
-		if (likely(elt->buf != NULL)) {
-			struct rte_mbuf *tmp = elt->buf;
-
-#ifndef NDEBUG
-			/* Poisoning. */
-			memset(elt, 0x66, sizeof(*elt));
-#endif
-			/* Faster than rte_pktmbuf_free(). */
-			do {
-				struct rte_mbuf *next = tmp->next;
-
-				rte_pktmbuf_free_seg(tmp);
-				tmp = next;
-			} while (tmp != NULL);
-		}
-		/* Request TX completion. */
-		if (unlikely(--elts_comp_cd == 0)) {
-			elts_comp_cd = txq->elts_comp_cd_init;
-			++elts_comp;
-			send_flags |= IBV_SEND_SIGNALED;
-		}
-		if (likely(segs == 1)) {
-			struct ibv_sge *sge = &elt->sge;
-			uintptr_t addr;
-			uint32_t length;
-			uint32_t lkey;
-
-			/* Retrieve buffer information. */
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			length = buf->data_len;
-			/* Retrieve Memory Region key for this memory pool. */
-			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up TX element. */
-				elt->buf = NULL;
-				goto stop;
-			}
-			/* Update element. */
-			elt->buf = buf;
-			if (txq->priv->vf)
-				rte_prefetch0((volatile void *)
-					      (uintptr_t)addr);
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-			sge->addr = addr;
-			sge->length = length;
-			sge->lkey = lkey;
-			sent_size += length;
-		} else {
-			err = -1;
-			goto stop;
-		}
-		if (sent_size <= txq->max_inline)
-			send_flags |= IBV_SEND_INLINE;
-		elts_head = elts_head_next;
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += sent_size;
-		/* Set up WR. */
-		wr->sg_list = &elt->sge;
-		wr->num_sge = segs;
-		wr->opcode = IBV_WR_SEND;
-		wr->send_flags = send_flags;
-		*wr_next = wr;
-		wr_next = &wr->next;
-	}
-stop:
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-	/* Ring QP doorbell. */
-	*wr_next = NULL;
-	assert(wr_head);
-	err = ibv_post_send(txq->qp, wr_head, &wr_bad);
-	if (unlikely(err)) {
-		uint64_t obytes = 0;
-		uint64_t opackets = 0;
-
-		/* Rewind bad WRs. */
-		while (wr_bad != NULL) {
-			int j;
-
-			/* Force completion request if one was lost. */
-			if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
-				elts_comp_cd = 1;
-				--elts_comp;
-			}
-			++opackets;
-			for (j = 0; j < wr_bad->num_sge; ++j)
-				obytes += wr_bad->sg_list[j].length;
-			elts_head = (elts_head ? elts_head : elts_n) - 1;
-			wr_bad = wr_bad->next;
-		}
-		txq->stats.opackets -= opackets;
-		txq->stats.obytes -= obytes;
-		i -= opackets;
-		DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
-		      " (%" PRIu64 " bytes) rejected: %s",
-		      (void *)txq,
-		      opackets,
-		      obytes,
-		      (err <= -1) ? "Internal error" : strerror(err));
-	}
-	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
-	txq->elts_comp_cd = elts_comp_cd;
-	return i;
+	mlx4_txq_mp2mr(txq, mp);
 }
 
 /**
@@ -1546,132 +1250,6 @@ rxq_cleanup(struct rxq *rxq)
 }
 
 /**
- * DPDK callback for RX.
- *
- * The following function doesn't manage scattered packets.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
-	struct ibv_recv_wr *wr_head = NULL;
-	struct ibv_recv_wr **wr_next = &wr_head;
-	struct ibv_recv_wr *wr_bad = NULL;
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
-
-	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(ret == 0))
-		return 0;
-	if (unlikely(ret < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, ret);
-		return 0;
-	}
-	assert(ret <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; i != (unsigned int)ret; ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint32_t len = wc->byte_len;
-		struct rte_mbuf *seg = elt->buf;
-		struct rte_mbuf *rep;
-
-		/* Sanity checks. */
-		assert(wr->sg_list == &elt->sge);
-		assert(wr->num_sge == 1);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		/* Link completed WRs together for repost. */
-		*wr_next = wr;
-		wr_next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p: bad work completion status (%d): %s",
-			      (void *)rxq, wc->status,
-			      ibv_wc_status_str(wc->status));
-			/* Increment dropped packets counter. */
-			++rxq->stats.idropped;
-			goto repost;
-		}
-		rep = rte_mbuf_raw_alloc(rxq->mp);
-		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR.
-			 */
-			DEBUG("rxq=%p: can't allocate a new mbuf",
-			      (void *)rxq);
-			/* Increase out of memory counters. */
-			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			goto repost;
-		}
-		/* Reconfigure sge to use rep instead of seg. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		assert(elt->sge.lkey == rxq->mr->lkey);
-		elt->buf = rep;
-		/* Update seg information. */
-		seg->data_off = RTE_PKTMBUF_HEADROOM;
-		seg->nb_segs = 1;
-		seg->port = rxq->port_id;
-		seg->next = NULL;
-		seg->pkt_len = len;
-		seg->data_len = len;
-		seg->packet_type = 0;
-		seg->ol_flags = 0;
-		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += len;
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
-	}
-	if (unlikely(i == 0))
-		return 0;
-	/* Repost WRs. */
-	*wr_next = NULL;
-	assert(wr_head);
-	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-	return pkts_ret;
-}
-
-/**
  * Allocate a Queue Pair.
  * Optionally setup inline receive if supported.
  *
@@ -2032,56 +1610,6 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
 }
 
 /**
- * Dummy DPDK callback for TX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	(void)dpdk_txq;
-	(void)pkts;
-	(void)pkts_n;
-	return 0;
-}
-
-/**
- * Dummy DPDK callback for RX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	(void)dpdk_rxq;
-	(void)pkts;
-	(void)pkts_n;
-	return 0;
-}
-
-/**
  * DPDK callback to close the device.
  *
  * Destroy all queues and objects, free memory.
@@ -2107,8 +1635,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
 	 * still required for DPDK 1.3 because some programs (such as testpmd)
 	 * never release them before closing the device.
 	 */
-	dev->rx_pkt_burst = removed_rx_burst;
-	dev->tx_pkt_burst = removed_tx_burst;
+	dev->rx_pkt_burst = mlx4_rx_burst_removed;
+	dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	if (priv->rxqs != NULL) {
 		/* XXX race condition if mlx4_rx_burst() is still running. */
 		usleep(1000);
@@ -2173,8 +1701,8 @@ priv_set_link(struct priv *priv, int up)
 		err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
 		if (err)
 			return err;
-		dev->rx_pkt_burst = removed_rx_burst;
-		dev->tx_pkt_burst = removed_tx_burst;
+		dev->rx_pkt_burst = mlx4_rx_burst_removed;
+		dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	}
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index edbece6..efccf1a 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -49,6 +49,7 @@
 #include <rte_ethdev.h>
 #include <rte_ether.h>
 #include <rte_interrupts.h>
+#include <rte_mempool.h>
 
 /* Request send completion once in every 64 sends, might be less. */
 #define MLX4_PMD_TX_PER_COMP_REQ 64
@@ -115,6 +116,7 @@ struct priv {
 
 /* mlx4.c */
 
+struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
 int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
 
 /* mlx4_intr.c */
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
new file mode 100644
index 0000000..b5e7777
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -0,0 +1,524 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Data plane functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Manage Tx completions.
+ *
+ * When sending a burst, mlx4_tx_burst() posts several WRs.
+ * To improve performance, a completion event is only required once every
+ * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
+ * for other WRs, but this information would not be used anyway.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+static int
+mlx4_txq_complete(struct txq *txq)
+{
+	unsigned int elts_comp = txq->elts_comp;
+	unsigned int elts_tail = txq->elts_tail;
+	const unsigned int elts_n = txq->elts_n;
+	struct ibv_wc wcs[elts_comp];
+	int wcs_n;
+
+	if (unlikely(elts_comp == 0))
+		return 0;
+	wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
+	if (unlikely(wcs_n == 0))
+		return 0;
+	if (unlikely(wcs_n < 0)) {
+		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
+		      (void *)txq, wcs_n);
+		return -1;
+	}
+	elts_comp -= wcs_n;
+	assert(elts_comp <= txq->elts_comp);
+	/*
+	 * Assume WC status is successful as nothing can be done about it
+	 * anyway.
+	 */
+	elts_tail += wcs_n * txq->elts_comp_cd_init;
+	if (elts_tail >= elts_n)
+		elts_tail -= elts_n;
+	txq->elts_tail = elts_tail;
+	txq->elts_comp = elts_comp;
+	return 0;
+}
+
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Memory pool where data is located for given mbuf.
+ */
+static struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+	if (unlikely(RTE_MBUF_INDIRECT(buf)))
+		return rte_mbuf_from_indirect(buf)->pool;
+	return buf->pool;
+}
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
+ * remove an entry first.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+	struct ibv_mr *mr;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	/* Add a new entry, register MR first. */
+	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+	      (void *)txq, mp->name, (void *)mp);
+	mr = mlx4_mp2mr(txq->priv->pd, mp);
+	if (unlikely(mr == NULL)) {
+		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+		      (void *)txq);
+		return (uint32_t)-1;
+	}
+	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+		/* Table is full, remove oldest entry. */
+		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+		      (void *)txq);
+		--i;
+		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+	}
+	/* Store the new entry. */
+	txq->mp2mr[i].mp = mp;
+	txq->mp2mr[i].mr = mr;
+	txq->mp2mr[i].lkey = mr->lkey;
+	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
+	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+	return txq->mp2mr[i].lkey;
+}
+
+/**
+ * DPDK callback for Tx.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	struct ibv_send_wr *wr_head = NULL;
+	struct ibv_send_wr **wr_next = &wr_head;
+	struct ibv_send_wr *wr_bad = NULL;
+	unsigned int elts_head = txq->elts_head;
+	const unsigned int elts_n = txq->elts_n;
+	unsigned int elts_comp_cd = txq->elts_comp_cd;
+	unsigned int elts_comp = 0;
+	unsigned int i;
+	unsigned int max;
+	int err;
+
+	assert(elts_comp_cd != 0);
+	mlx4_txq_complete(txq);
+	max = (elts_n - (elts_head - txq->elts_tail));
+	if (max > elts_n)
+		max -= elts_n;
+	assert(max >= 1);
+	assert(max <= elts_n);
+	/* Always leave one free entry in the ring. */
+	--max;
+	if (max == 0)
+		return 0;
+	if (max > pkts_n)
+		max = pkts_n;
+	for (i = 0; (i != max); ++i) {
+		struct rte_mbuf *buf = pkts[i];
+		unsigned int elts_head_next =
+			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
+		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
+		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		struct ibv_send_wr *wr = &elt->wr;
+		unsigned int segs = buf->nb_segs;
+		unsigned int sent_size = 0;
+		uint32_t send_flags = 0;
+
+		/* Clean up old buffer. */
+		if (likely(elt->buf != NULL)) {
+			struct rte_mbuf *tmp = elt->buf;
+
+#ifndef NDEBUG
+			/* Poisoning. */
+			memset(elt, 0x66, sizeof(*elt));
+#endif
+			/* Faster than rte_pktmbuf_free(). */
+			do {
+				struct rte_mbuf *next = tmp->next;
+
+				rte_pktmbuf_free_seg(tmp);
+				tmp = next;
+			} while (tmp != NULL);
+		}
+		/* Request Tx completion. */
+		if (unlikely(--elts_comp_cd == 0)) {
+			elts_comp_cd = txq->elts_comp_cd_init;
+			++elts_comp;
+			send_flags |= IBV_SEND_SIGNALED;
+		}
+		if (likely(segs == 1)) {
+			struct ibv_sge *sge = &elt->sge;
+			uintptr_t addr;
+			uint32_t length;
+			uint32_t lkey;
+
+			/* Retrieve buffer information. */
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			length = buf->data_len;
+			/* Retrieve memory region key for this memory pool. */
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+			if (unlikely(lkey == (uint32_t)-1)) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR"
+				      " association", (void *)txq);
+				/* Clean up Tx element. */
+				elt->buf = NULL;
+				goto stop;
+			}
+			/* Update element. */
+			elt->buf = buf;
+			if (txq->priv->vf)
+				rte_prefetch0((volatile void *)
+					      (uintptr_t)addr);
+			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+			sge->addr = addr;
+			sge->length = length;
+			sge->lkey = lkey;
+			sent_size += length;
+		} else {
+			err = -1;
+			goto stop;
+		}
+		if (sent_size <= txq->max_inline)
+			send_flags |= IBV_SEND_INLINE;
+		elts_head = elts_head_next;
+		/* Increment sent bytes counter. */
+		txq->stats.obytes += sent_size;
+		/* Set up WR. */
+		wr->sg_list = &elt->sge;
+		wr->num_sge = segs;
+		wr->opcode = IBV_WR_SEND;
+		wr->send_flags = send_flags;
+		*wr_next = wr;
+		wr_next = &wr->next;
+	}
+stop:
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Increment sent packets counter. */
+	txq->stats.opackets += i;
+	/* Ring QP doorbell. */
+	*wr_next = NULL;
+	assert(wr_head);
+	err = ibv_post_send(txq->qp, wr_head, &wr_bad);
+	if (unlikely(err)) {
+		uint64_t obytes = 0;
+		uint64_t opackets = 0;
+
+		/* Rewind bad WRs. */
+		while (wr_bad != NULL) {
+			int j;
+
+			/* Force completion request if one was lost. */
+			if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
+				elts_comp_cd = 1;
+				--elts_comp;
+			}
+			++opackets;
+			for (j = 0; j < wr_bad->num_sge; ++j)
+				obytes += wr_bad->sg_list[j].length;
+			elts_head = (elts_head ? elts_head : elts_n) - 1;
+			wr_bad = wr_bad->next;
+		}
+		txq->stats.opackets -= opackets;
+		txq->stats.obytes -= obytes;
+		i -= opackets;
+		DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
+		      " (%" PRIu64 " bytes) rejected: %s",
+		      (void *)txq,
+		      opackets,
+		      obytes,
+		      (err <= -1) ? "Internal error" : strerror(err));
+	}
+	txq->elts_head = elts_head;
+	txq->elts_comp += elts_comp;
+	txq->elts_comp_cd = elts_comp_cd;
+	return i;
+}
+
+/**
+ * DPDK callback for Rx.
+ *
+ * The following function doesn't manage scattered packets.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct rxq *rxq = (struct rxq *)dpdk_rxq;
+	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+	const unsigned int elts_n = rxq->elts_n;
+	unsigned int elts_head = rxq->elts_head;
+	struct ibv_wc wcs[pkts_n];
+	struct ibv_recv_wr *wr_head = NULL;
+	struct ibv_recv_wr **wr_next = &wr_head;
+	struct ibv_recv_wr *wr_bad = NULL;
+	unsigned int i;
+	unsigned int pkts_ret = 0;
+	int ret;
+
+	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
+	if (unlikely(ret == 0))
+		return 0;
+	if (unlikely(ret < 0)) {
+		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
+		      (void *)rxq, ret);
+		return 0;
+	}
+	assert(ret <= (int)pkts_n);
+	/* For each work completion. */
+	for (i = 0; i != (unsigned int)ret; ++i) {
+		struct ibv_wc *wc = &wcs[i];
+		struct rxq_elt *elt = &(*elts)[elts_head];
+		struct ibv_recv_wr *wr = &elt->wr;
+		uint32_t len = wc->byte_len;
+		struct rte_mbuf *seg = elt->buf;
+		struct rte_mbuf *rep;
+
+		/* Sanity checks. */
+		assert(wr->sg_list == &elt->sge);
+		assert(wr->num_sge == 1);
+		assert(elts_head < rxq->elts_n);
+		assert(rxq->elts_head < rxq->elts_n);
+		/*
+		 * Fetch initial bytes of packet descriptor into a
+		 * cacheline while allocating rep.
+		 */
+		rte_mbuf_prefetch_part1(seg);
+		rte_mbuf_prefetch_part2(seg);
+		/* Link completed WRs together for repost. */
+		*wr_next = wr;
+		wr_next = &wr->next;
+		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
+			/* Whatever, just repost the offending WR. */
+			DEBUG("rxq=%p: bad work completion status (%d): %s",
+			      (void *)rxq, wc->status,
+			      ibv_wc_status_str(wc->status));
+			/* Increment dropped packets counter. */
+			++rxq->stats.idropped;
+			goto repost;
+		}
+		rep = rte_mbuf_raw_alloc(rxq->mp);
+		if (unlikely(rep == NULL)) {
+			/*
+			 * Unable to allocate a replacement mbuf,
+			 * repost WR.
+			 */
+			DEBUG("rxq=%p: can't allocate a new mbuf",
+			      (void *)rxq);
+			/* Increase out of memory counters. */
+			++rxq->stats.rx_nombuf;
+			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
+			goto repost;
+		}
+		/* Reconfigure sge to use rep instead of seg. */
+		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
+		assert(elt->sge.lkey == rxq->mr->lkey);
+		elt->buf = rep;
+		/* Update seg information. */
+		seg->data_off = RTE_PKTMBUF_HEADROOM;
+		seg->nb_segs = 1;
+		seg->port = rxq->port_id;
+		seg->next = NULL;
+		seg->pkt_len = len;
+		seg->data_len = len;
+		seg->packet_type = 0;
+		seg->ol_flags = 0;
+		/* Return packet. */
+		*(pkts++) = seg;
+		++pkts_ret;
+		/* Increase bytes counter. */
+		rxq->stats.ibytes += len;
+repost:
+		if (++elts_head >= elts_n)
+			elts_head = 0;
+		continue;
+	}
+	if (unlikely(i == 0))
+		return 0;
+	/* Repost WRs. */
+	*wr_next = NULL;
+	assert(wr_head);
+	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
+	if (unlikely(ret)) {
+		/* Inability to repost WRs is fatal. */
+		DEBUG("%p: ibv_post_recv() failed (ret=%d)",
+		      (void *)rxq->priv,
+		      ret);
+		abort();
+	}
+	rxq->elts_head = elts_head;
+	/* Increase packets counter. */
+	rxq->stats.ipackets += pkts_ret;
+	return pkts_ret;
+}
+
+/**
+ * Dummy DPDK callback for Tx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	(void)dpdk_txq;
+	(void)pkts;
+	(void)pkts_n;
+	return 0;
+}
+
+/**
+ * Dummy DPDK callback for Rx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	(void)dpdk_rxq;
+	(void)pkts;
+	(void)pkts_n;
+	return 0;
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index ea55aed..669c8a4 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -119,4 +119,16 @@ struct txq {
 	unsigned int socket; /**< CPU socket ID for allocations. */
 };
 
+/* mlx4_rxtx.c */
+
+uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
+uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
 #endif /* MLX4_RXTX_H_ */
-- 
2.1.4


