[dpdk-dev] [PATCH v1 2/2] net/mlx5: add multiple process support

Xueming Li xuemingl at mellanox.com
Thu Aug 24 16:03:41 CEST 2017


The PMD uses Verbs objects which were not available in shared memory. In
addition, due to IO pages, it was not possible to use the primary
process Tx queues from a secondary process.

This patch modifies the location where Verbs objects are allocated (from
process memory address space to shared memory address space) and thus
allows a secondary process to use those objects by mapping this shared
memory space into its own memory space.
For Tx IO pages, it uses a Unix socket to retrieve the communication
channel with the kernel driver from the primary process; this is
necessary to remap those pages in the secondary process memory space and
thus use the same Tx queues.

This is only supported starting from Linux kernel v4.14 and rdma-core v14.

Cc: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
Signed-off-by: Xueming Li <xuemingl at mellanox.com>
---
 doc/guides/nics/mlx5.rst       |   3 +-
 drivers/net/mlx5/Makefile      |   1 +
 drivers/net/mlx5/mlx5.c        | 132 ++++++++++++------
 drivers/net/mlx5/mlx5.h        |  18 +--
 drivers/net/mlx5/mlx5_ethdev.c | 215 ++++++------------------------
 drivers/net/mlx5/mlx5_rxq.c    |  41 ------
 drivers/net/mlx5/mlx5_rxtx.h   |   5 +-
 drivers/net/mlx5/mlx5_socket.c | 294 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_txq.c    |  89 ++++++++-----
 9 files changed, 501 insertions(+), 297 deletions(-)
 create mode 100644 drivers/net/mlx5/mlx5_socket.c

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index a68b7ad..9eeada4 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -87,7 +87,7 @@ Features
 - Flow director (RTE_FDIR_MODE_PERFECT, RTE_FDIR_MODE_PERFECT_MAC_VLAN and
   RTE_ETH_FDIR_REJECT).
 - Flow API.
-- Secondary process TX is supported.
+- Secondary process.
 - KVM and VMware ESX SR-IOV modes are supported.
 - RSS hash result is supported.
 - Hardware TSO.
@@ -99,7 +99,6 @@ Limitations
 - Inner RSS for VXLAN frames is not supported yet.
 - Port statistics through software counters only.
 - Hardware checksum RX offloads for VXLAN inner header are not supported yet.
-- Secondary process RX is not supported.
 
 Configuration
 -------------
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 0feed4c..6c8f404 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -52,6 +52,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
 
 # Basic CFLAGS.
 CFLAGS += -O3
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 39a159c..3002e7e 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -126,6 +126,52 @@ struct mlx5_args {
 }
 
 /**
+ * Verbs callback to allocate memory. The buffer comes from rte_malloc_socket()
+ * on the device NUMA node and is aligned on the system page size.
+ *
+ * @param[in] size
+ *   The size in bytes of the memory to allocate.
+ * @param[in] data
+ *   A pointer to the callback data (the struct priv of the port).
+ *
+ * @return
+ *   A pointer to the allocated space, as returned by rte_malloc_socket().
+ */
+static void *
+mlx5_extern_alloc_buf(size_t size, void *data)
+{
+	struct priv *priv = data;
+	void *ret;
+	size_t alignment = sysconf(_SC_PAGESIZE);
+
+	assert(data != NULL);
+	assert(!mlx5_is_secondary());
+
+	ret = rte_malloc_socket(__func__, size, alignment,
+			priv->dev->device->numa_node);
+	DEBUG("Extern alloc size: %zu, align: %zu: %p", size, alignment, ret);
+	return ret;
+}
+
+/**
+ * Verbs callback to free memory; the buffer is handed to rte_free().
+ *
+ * @param[in] ptr
+ *   A pointer to the memory to free.
+ * @param[in] data
+ *   A pointer to the callback data (only checked by assert()).
+ */
+static void
+mlx5_extern_free_buf(void *ptr, void *data __rte_unused)
+{
+	assert(data != NULL);
+	assert(!mlx5_is_secondary());
+
+	DEBUG("Extern free request: %p", ptr);
+	rte_free(ptr);
+}
+
+/**
  * DPDK callback to close the device.
  *
  * Destroy all queues and objects, free memory.
@@ -203,6 +249,7 @@ struct mlx5_args {
 	}
 	if (priv->reta_idx != NULL)
 		rte_free(priv->reta_idx);
+	priv_socket_uninit(priv);
 	priv_unlock(priv);
 	memset(priv, 0, sizeof(*priv));
 }
@@ -526,6 +573,7 @@ struct mlx5_args {
 		assert(err > 0);
 		return -err;
 	}
+	err = 0; /* previous errors are handled if attr_ctx is NULL. */
 	ibv_dev = list[i];
 
 	DEBUG("device opened");
@@ -555,6 +603,40 @@ struct mlx5_args {
 			.tso = MLX5_ARG_UNSET,
 		};
 
+		mlx5_dev[idx].ports |= test;
+		if (mlx5_is_secondary()) {
+			/* from rte_ethdev.c */
+			char name[RTE_ETH_NAME_MAX_LEN];
+
+			snprintf(name, sizeof(name), "%s port %u",
+				 ibv_get_device_name(ibv_dev), port);
+			eth_dev = rte_eth_dev_attach_secondary(name);
+			if (eth_dev == NULL) {
+				ERROR("can not attach rte ethdev");
+				err = ENOMEM;
+				goto error;
+			}
+			eth_dev->dev_ops = &mlx5_dev_ops;
+			priv = eth_dev->data->dev_private;
+			/* TODO replace with mlx5dv_context */
+			priv->num_uars_per_page = 1;
+			/* Receive command fd from primary process */
+			err = priv_socket_connect(priv);
+			if (err < 0) {
+				err = -err;
+				goto error;
+			}
+			/* Remap UAR for Tx queues. */
+			err = mlx5_tx_uar_remap(priv, err);
+			if (err < 0) {
+				err = -err;
+				goto error;
+			}
+			mlx5_dev_select_rx_function(eth_dev);
+			mlx5_dev_select_tx_function(eth_dev);
+			continue;
+		}
+
 		device_attr_ex.comp_mask = 0;
 
 		DEBUG("using port %u (%08" PRIx32 ")", port, test);
@@ -753,37 +835,8 @@ struct mlx5_args {
 			err = ENOMEM;
 			goto port_error;
 		}
-
-		/* Secondary processes have to use local storage for their
-		 * private data as well as a copy of eth_dev->data, but this
-		 * pointer must not be modified before burst functions are
-		 * actually called. */
-		if (mlx5_is_secondary()) {
-			struct mlx5_secondary_data *sd =
-				&mlx5_secondary_data[eth_dev->data->port_id];
-			sd->primary_priv = eth_dev->data->dev_private;
-			if (sd->primary_priv == NULL) {
-				ERROR("no private data for port %u",
-						eth_dev->data->port_id);
-				err = EINVAL;
-				goto port_error;
-			}
-			sd->shared_dev_data = eth_dev->data;
-			rte_spinlock_init(&sd->lock);
-			memcpy(sd->data.name, sd->shared_dev_data->name,
-				   sizeof(sd->data.name));
-			sd->data.dev_private = priv;
-			sd->data.rx_mbuf_alloc_failed = 0;
-			sd->data.mtu = ETHER_MTU;
-			sd->data.port_id = sd->shared_dev_data->port_id;
-			sd->data.mac_addrs = priv->mac;
-			eth_dev->tx_pkt_burst = mlx5_tx_burst_secondary_setup;
-			eth_dev->rx_pkt_burst = mlx5_rx_burst_secondary_setup;
-		} else {
-			eth_dev->data->dev_private = priv;
-			eth_dev->data->mac_addrs = priv->mac;
-		}
-
+		eth_dev->data->dev_private = priv;
+		eth_dev->data->mac_addrs = priv->mac;
 		eth_dev->device = &pci_dev->device;
 		rte_eth_copy_pci_info(eth_dev, pci_dev);
 		eth_dev->device->driver = &mlx5_driver.driver;
@@ -791,6 +844,15 @@ struct mlx5_args {
 		eth_dev->dev_ops = &mlx5_dev_ops;
 		TAILQ_INIT(&priv->flows);
 
+		/* Hint libmlx5 to use PMD allocator for PRM resources */
+		struct mlx5dv_ctx_allocators alctr = {
+				.alloc = &mlx5_extern_alloc_buf,
+				.free = &mlx5_extern_free_buf,
+				.data = priv,
+		};
+		mlx5dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+				(void *)((uintptr_t)&alctr));
+
 		/* Bring Ethernet device up. */
 		DEBUG("forcing Ethernet interface up");
 		priv_set_flags(priv, ~IFF_UP, IFF_UP);
@@ -885,14 +947,6 @@ struct mlx5_args {
 static void
 rte_mlx5_pmd_init(void)
 {
-	/*
-	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
-	 * huge pages. Calling ibv_fork_init() during init allows
-	 * applications to use fork() safely for purposes other than
-	 * using this PMD, which is not supported in forked processes.
-	 */
-	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
-	ibv_fork_init();
 	rte_pci_register(&mlx5_driver);
 }
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 2dee07c..b5d2f67 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -157,16 +157,11 @@ struct priv {
 	uint32_t link_speed_capa; /* Link speed capabilities. */
 	struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
 	rte_spinlock_t lock; /* Lock for control functions. */
+	int socket; /* Socket to exchange data with secondaries. */
+	struct rte_intr_handle intr_handle_socket; /* Interrupt handler. */
+	int num_uars_per_page; /* number of UARs per system page */
 };
 
-/* Local storage for secondary process data. */
-struct mlx5_secondary_data {
-	struct rte_eth_dev_data data; /* Local device data. */
-	struct priv *primary_priv; /* Private structure from primary. */
-	struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */
-	rte_spinlock_t lock; /* Port configuration lock. */
-} mlx5_secondary_data[RTE_MAX_ETHPORTS];
-
 /**
  * Lock private structure to protect it from concurrent access in the
  * control path.
@@ -314,4 +309,11 @@ int mlx5_flow_destroy(struct rte_eth_dev *, struct rte_flow *,
 void priv_flow_stop(struct priv *);
 int priv_flow_rxq_in_use(struct priv *, struct rxq *);
 
+/* mlx5_socket.c */
+
+int priv_socket_init(struct priv *priv);
+int priv_socket_uninit(struct priv *priv);
+void priv_socket_handle(struct priv *priv);
+int priv_socket_connect(struct priv *priv);
+
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index fce7dd5..84efeda 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -31,6 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#define _GNU_SOURCE
+
 #include <stddef.h>
 #include <assert.h>
 #include <unistd.h>
@@ -49,6 +51,7 @@
 #include <linux/sockios.h>
 #include <linux/version.h>
 #include <fcntl.h>
+#include <sys/un.h>
 
 /* DPDK headers don't like -pedantic. */
 #ifdef PEDANTIC
@@ -132,12 +135,7 @@ struct ethtool_link_settings {
 struct priv *
 mlx5_get_priv(struct rte_eth_dev *dev)
 {
-	struct mlx5_secondary_data *sd;
-
-	if (!mlx5_is_secondary())
-		return dev->data->dev_private;
-	sd = &mlx5_secondary_data[dev->data->port_id];
-	return sd->data.dev_private;
+	return dev->data->dev_private;
 }
 
 /**
@@ -149,7 +147,7 @@ struct priv *
 inline int
 mlx5_is_secondary(void)
 {
-	return rte_eal_process_type() != RTE_PROC_PRIMARY;
+	return rte_eal_process_type() == RTE_PROC_SECONDARY;
 }
 
 /**
@@ -1292,6 +1290,23 @@ struct priv *
 }
 
 /**
+ * Handle socket interrupts by calling priv_socket_handle() with priv locked.
+ *
+ * @param cb_arg
+ *   Callback argument, used as the struct rte_eth_dev pointer of the port.
+ */
+static void
+mlx5_dev_handler_socket(void *cb_arg)
+{
+	struct rte_eth_dev *dev = cb_arg;
+	struct priv *priv = dev->data->dev_private;
+
+	priv_lock(priv);
+	priv_socket_handle(priv);
+	priv_unlock(priv);
+}
+
+/**
  * Uninstall interrupt handler.
  *
  * @param priv
@@ -1302,16 +1317,21 @@ struct priv *
 void
 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
 {
-	if (!dev->data->dev_conf.intr_conf.lsc)
-		return;
-	rte_intr_callback_unregister(&priv->intr_handle,
-				     mlx5_dev_interrupt_handler,
-				     dev);
+	if (dev->data->dev_conf.intr_conf.lsc)
+		rte_intr_callback_unregister(&priv->intr_handle,
+						 mlx5_dev_interrupt_handler,
+						 dev);
+	if (priv->socket)
+		rte_intr_callback_unregister(&priv->intr_handle_socket,
+					     mlx5_dev_handler_socket,
+					     dev);
 	if (priv->pending_alarm)
 		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
 	priv->pending_alarm = 0;
 	priv->intr_handle.fd = 0;
 	priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+	priv->intr_handle_socket.fd = 0;
+	priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
 }
 
 /**
@@ -1327,21 +1347,29 @@ struct priv *
 {
 	int rc, flags;
 
-	if (!dev->data->dev_conf.intr_conf.lsc)
-		return;
+	assert(!mlx5_is_secondary());
 	assert(priv->ctx->async_fd > 0);
 	flags = fcntl(priv->ctx->async_fd, F_GETFL);
 	rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
 	if (rc < 0) {
 		INFO("failed to change file descriptor async event queue");
 		dev->data->dev_conf.intr_conf.lsc = 0;
-	} else {
+	}
+	if (dev->data->dev_conf.intr_conf.lsc) {
 		priv->intr_handle.fd = priv->ctx->async_fd;
 		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
 		rte_intr_callback_register(&priv->intr_handle,
 					   mlx5_dev_interrupt_handler,
 					   dev);
 	}
+
+	rc = priv_socket_init(priv);
+	if (!rc && priv->socket) {
+		priv->intr_handle_socket.fd = priv->socket;
+		priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
+		rte_intr_callback_register(&priv->intr_handle_socket,
+				mlx5_dev_handler_socket, dev);
+	}
 }
 
 /**
@@ -1420,163 +1448,6 @@ struct priv *
 }
 
 /**
- * Configure secondary process queues from a private data pointer (primary
- * or secondary) and update burst callbacks. Can take place only once.
- *
- * All queues must have been previously created by the primary process to
- * avoid undefined behavior.
- *
- * @param priv
- *   Private data pointer from either primary or secondary process.
- *
- * @return
- *   Private data pointer from secondary process, NULL in case of error.
- */
-struct priv *
-mlx5_secondary_data_setup(struct priv *priv)
-{
-	unsigned int port_id = 0;
-	struct mlx5_secondary_data *sd;
-	void **tx_queues;
-	void **rx_queues;
-	unsigned int nb_tx_queues;
-	unsigned int nb_rx_queues;
-	unsigned int i;
-
-	/* priv must be valid at this point. */
-	assert(priv != NULL);
-	/* priv->dev must also be valid but may point to local memory from
-	 * another process, possibly with the same address and must not
-	 * be dereferenced yet. */
-	assert(priv->dev != NULL);
-	/* Determine port ID by finding out where priv comes from. */
-	while (1) {
-		sd = &mlx5_secondary_data[port_id];
-		rte_spinlock_lock(&sd->lock);
-		/* Primary process? */
-		if (sd->primary_priv == priv)
-			break;
-		/* Secondary process? */
-		if (sd->data.dev_private == priv)
-			break;
-		rte_spinlock_unlock(&sd->lock);
-		if (++port_id == RTE_DIM(mlx5_secondary_data))
-			port_id = 0;
-	}
-	/* Switch to secondary private structure. If private data has already
-	 * been updated by another thread, there is nothing else to do. */
-	priv = sd->data.dev_private;
-	if (priv->dev->data == &sd->data)
-		goto end;
-	/* Sanity checks. Secondary private structure is supposed to point
-	 * to local eth_dev, itself still pointing to the shared device data
-	 * structure allocated by the primary process. */
-	assert(sd->shared_dev_data != &sd->data);
-	assert(sd->data.nb_tx_queues == 0);
-	assert(sd->data.tx_queues == NULL);
-	assert(sd->data.nb_rx_queues == 0);
-	assert(sd->data.rx_queues == NULL);
-	assert(priv != sd->primary_priv);
-	assert(priv->dev->data == sd->shared_dev_data);
-	assert(priv->txqs_n == 0);
-	assert(priv->txqs == NULL);
-	assert(priv->rxqs_n == 0);
-	assert(priv->rxqs == NULL);
-	nb_tx_queues = sd->shared_dev_data->nb_tx_queues;
-	nb_rx_queues = sd->shared_dev_data->nb_rx_queues;
-	/* Allocate local storage for queues. */
-	tx_queues = rte_zmalloc("secondary ethdev->tx_queues",
-				sizeof(sd->data.tx_queues[0]) * nb_tx_queues,
-				RTE_CACHE_LINE_SIZE);
-	rx_queues = rte_zmalloc("secondary ethdev->rx_queues",
-				sizeof(sd->data.rx_queues[0]) * nb_rx_queues,
-				RTE_CACHE_LINE_SIZE);
-	if (tx_queues == NULL || rx_queues == NULL)
-		goto error;
-	/* Lock to prevent control operations during setup. */
-	priv_lock(priv);
-	/* TX queues. */
-	for (i = 0; i != nb_tx_queues; ++i) {
-		struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
-		struct txq_ctrl *primary_txq_ctrl;
-		struct txq_ctrl *txq_ctrl;
-
-		if (primary_txq == NULL)
-			continue;
-		primary_txq_ctrl = container_of(primary_txq,
-						struct txq_ctrl, txq);
-		txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl) +
-					     (1 << primary_txq->elts_n) *
-					     sizeof(struct rte_mbuf *), 0,
-					     primary_txq_ctrl->socket);
-		if (txq_ctrl != NULL) {
-			if (txq_ctrl_setup(priv->dev,
-					   txq_ctrl,
-					   1 << primary_txq->elts_n,
-					   primary_txq_ctrl->socket,
-					   NULL) == 0) {
-				txq_ctrl->txq.stats.idx =
-					primary_txq->stats.idx;
-				tx_queues[i] = &txq_ctrl->txq;
-				continue;
-			}
-			rte_free(txq_ctrl);
-		}
-		while (i) {
-			txq_ctrl = tx_queues[--i];
-			txq_cleanup(txq_ctrl);
-			rte_free(txq_ctrl);
-		}
-		goto error;
-	}
-	/* RX queues. */
-	for (i = 0; i != nb_rx_queues; ++i) {
-		struct rxq_ctrl *primary_rxq =
-			container_of((*sd->primary_priv->rxqs)[i],
-				     struct rxq_ctrl, rxq);
-
-		if (primary_rxq == NULL)
-			continue;
-		/* Not supported yet. */
-		rx_queues[i] = NULL;
-	}
-	/* Update everything. */
-	priv->txqs = (void *)tx_queues;
-	priv->txqs_n = nb_tx_queues;
-	priv->rxqs = (void *)rx_queues;
-	priv->rxqs_n = nb_rx_queues;
-	sd->data.rx_queues = rx_queues;
-	sd->data.tx_queues = tx_queues;
-	sd->data.nb_rx_queues = nb_rx_queues;
-	sd->data.nb_tx_queues = nb_tx_queues;
-	sd->data.dev_link = sd->shared_dev_data->dev_link;
-	sd->data.mtu = sd->shared_dev_data->mtu;
-	memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state,
-	       sizeof(sd->data.rx_queue_state));
-	memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state,
-	       sizeof(sd->data.tx_queue_state));
-	sd->data.dev_flags = sd->shared_dev_data->dev_flags;
-	/* Use local data from now on. */
-	rte_mb();
-	priv->dev->data = &sd->data;
-	rte_mb();
-	mlx5_dev_select_tx_function(priv->dev);
-	mlx5_dev_select_rx_function(priv->dev);
-	priv_unlock(priv);
-end:
-	/* More sanity checks. */
-	assert(priv->dev->data == &sd->data);
-	rte_spinlock_unlock(&sd->lock);
-	return priv;
-error:
-	priv_unlock(priv);
-	rte_free(tx_queues);
-	rte_free(rx_queues);
-	rte_spinlock_unlock(&sd->lock);
-	return NULL;
-}
-
-/**
  * Configure the TX function to use.
  *
  * @param dev
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 18e711e..60edf9d 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1318,47 +1318,6 @@
 }
 
 /**
- * DPDK callback for RX in secondary processes.
- *
- * This function configures all queues from primary process information
- * if necessary before reverting to the normal RX burst callback.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
-			      uint16_t pkts_n)
-{
-	struct rxq *rxq = dpdk_rxq;
-	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-	struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
-	struct priv *primary_priv;
-	unsigned int index;
-
-	if (priv == NULL)
-		return 0;
-	primary_priv =
-		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
-	/* Look for queue index in both private structures. */
-	for (index = 0; index != priv->rxqs_n; ++index)
-		if (((*primary_priv->rxqs)[index] == rxq) ||
-		    ((*priv->rxqs)[index] == rxq))
-			break;
-	if (index == priv->rxqs_n)
-		return 0;
-	rxq = (*priv->rxqs)[index];
-	return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
-}
-
-/**
  * Allocate queue vector and fill epoll fd list for Rx interrupts.
  *
  * @param priv
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 393c500..3940e00 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -290,6 +290,8 @@ struct txq_ctrl {
 	struct ibv_qp *qp; /* Queue Pair. */
 	unsigned int socket; /* CPU socket ID for allocations. */
 	struct txq txq; /* Data path structure. */
+
+	off_t uar_mmap_offset; /* UAR offset for secondary process mmap. */
 };
 
 /* mlx5_rxq.c */
@@ -314,7 +316,6 @@ int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t,
 int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
 			const struct rte_eth_rxconf *, struct rte_mempool *);
 void mlx5_rx_queue_release(void *);
-uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 int priv_rx_intr_vec_enable(struct priv *priv);
 void priv_rx_intr_vec_disable(struct priv *priv);
 int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
@@ -328,7 +329,7 @@ int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t,
 int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
 			const struct rte_eth_txconf *);
 void mlx5_tx_queue_release(void *);
-uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
+int mlx5_tx_uar_remap(struct priv *priv, int fd);
 
 /* mlx5_rxtx.c */
 
diff --git a/drivers/net/mlx5/mlx5_socket.c b/drivers/net/mlx5/mlx5_socket.c
new file mode 100644
index 0000000..e371ab6
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_socket.c
@@ -0,0 +1,294 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2016 6WIND S.A.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#define _GNU_SOURCE
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "mlx5.h"
+#include "mlx5_utils.h"
+
+/**
+ * Initialise the socket to communicate with the secondary process
+ *
+ * @param[in] priv
+ *   Pointer to private structure; priv->socket is left at 0 on failure.
+ *
+ * @return
+ *   0 on success, positive errno value on failure.
+ */
+int
+priv_socket_init(struct priv *priv)
+{
+	struct sockaddr_un sun = {
+		.sun_family = AF_UNIX,
+	};
+	int ret;
+	int flags;
+	struct stat file_stat;
+
+	/* Every failure path reports the errno saved right after the call. */
+	ret = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ret < 0) {
+		ret = errno;
+		WARN("secondary process not supported: %s", strerror(ret));
+		return ret;
+	}
+	priv->socket = ret;
+	flags = fcntl(priv->socket, F_GETFL, 0);
+	if (flags == -1 ||
+	    fcntl(priv->socket, F_SETFL, flags | O_NONBLOCK) < 0) {
+		ret = errno;
+		goto out;
+	}
+	snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d",
+			MLX5_DRIVER_NAME, priv->socket);
+	/* Remove a stale socket file left at the same path, if any. */
+	if (!stat(sun.sun_path, &file_stat))
+		claim_zero(remove(sun.sun_path));
+	ret = bind(priv->socket, (const struct sockaddr *)&sun, sizeof(sun));
+	if (ret < 0) {
+		ret = errno;
+		WARN("cannot bind socket, secondary process not supported: %s",
+		     strerror(ret));
+		goto close;
+	}
+	ret = listen(priv->socket, 0);
+	if (ret < 0) {
+		ret = errno;
+		WARN("Secondary process not supported: %s", strerror(ret));
+		goto close;
+	}
+	return 0;
+close:
+	remove(sun.sun_path);
+out:
+	claim_zero(close(priv->socket));
+	priv->socket = 0;
+	return ret;
+}
+
+/**
+ * Un-initialise the socket to communicate with the secondary process.
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @return
+ *   0 in all cases; nothing is done when priv->socket is 0 (no socket).
+ */
+int
+priv_socket_uninit(struct priv *priv)
+{
+	MKSTR(path, "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->socket);
+	if (!priv->socket) /* Init failed or never ran: fd 0 is not ours. */
+		return 0;
+	claim_zero(close(priv->socket));
+	priv->socket = 0;
+	claim_zero(remove(path));
+	return 0;
+}
+
+/**
+ * Handle socket interrupts: authenticate one peer and pass it the command fd.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+priv_socket_handle(struct priv *priv)
+{
+	int conn_sock;
+	int ret = 0;
+	struct cmsghdr *cmsg = NULL;
+	struct ucred *cred = NULL;
+	char buf[CMSG_SPACE(sizeof(struct ucred))] = { 0 };
+	char vbuf[1024] = { 0 };
+	struct iovec io = {
+		.iov_base = vbuf,
+		.iov_len = sizeof(*vbuf),
+	};
+	struct msghdr msg = {
+		.msg_iov = &io,
+		.msg_iovlen = 1,
+		.msg_control = buf,
+		.msg_controllen = sizeof(buf),
+	};
+	int *fd;
+
+	/* Accept the connection from the client. */
+	conn_sock = accept(priv->socket, NULL, NULL);
+	if (conn_sock < 0) {
+		WARN("connection failed: %s", strerror(errno));
+		return;
+	}
+	ret = setsockopt(conn_sock, SOL_SOCKET, SO_PASSCRED, &(int){1},
+			sizeof(int));
+	if (ret < 0) {
+		WARN("cannot change socket options");
+		goto out;
+	}
+	ret = recvmsg(conn_sock, &msg, MSG_WAITALL);
+	if (ret < 0) {
+		WARN("received an empty message: %s", strerror(errno));
+		goto out;
+	}
+	/* Expect exactly one control message carrying a full struct ucred. */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg == NULL) {
+		WARN("no message");
+		goto out;
+	}
+	if ((cmsg->cmsg_level == SOL_SOCKET) &&
+	    (cmsg->cmsg_type == SCM_CREDENTIALS) &&
+	    (cmsg->cmsg_len >= CMSG_LEN(sizeof(*cred)))) {
+		cred = (struct ucred *)CMSG_DATA(cmsg);
+	}
+	cmsg = CMSG_NXTHDR(&msg, cmsg);
+	if (cmsg != NULL) {
+		WARN("Message wrongly formated");
+		goto out;
+	}
+	/* Only serve a peer running with this process' uid and gid. */
+	if ((cred == NULL) ||
+	    (cred->uid != getuid()) ||
+	    (cred->gid != getgid())) {
+		WARN("wrong credentials");
+		goto out;
+	}
+	/* Reuse the control buffer to send cmd_fd back as SCM_RIGHTS. */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	assert(cmsg != NULL);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(priv->ctx->cmd_fd));
+	fd = (int *)CMSG_DATA(cmsg);
+	*fd = priv->ctx->cmd_fd;
+	ret = sendmsg(conn_sock, &msg, 0);
+	if (ret < 0)
+		WARN("cannot send response");
+out:
+	close(conn_sock);
+}
+
+/**
+ * Connect to the primary process and receive the fd it sends via SCM_RIGHTS.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   fd (> 0) on success, negative errno value on every failure path.
+ */
+int
+priv_socket_connect(struct priv *priv)
+{
+	struct sockaddr_un sun = {
+		.sun_family = AF_UNIX,
+	};
+	int socket_fd;
+	int *fd = NULL;
+	int ret;
+	struct ucred *cred;
+	char buf[CMSG_SPACE(sizeof(*cred))] = { 0 };
+	char vbuf[1024] = { 0 };
+	struct iovec io = {
+		.iov_base = vbuf,
+		.iov_len = sizeof(*vbuf),
+	};
+	struct msghdr msg = {
+		.msg_control = buf,
+		.msg_controllen = sizeof(buf),
+		.msg_iov = &io,
+		.msg_iovlen = 1,
+	};
+	struct cmsghdr *cmsg;
+
+	socket_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (socket_fd < 0) {
+		ret = -errno; /* Saved before WARN() can overwrite errno. */
+		WARN("cannot connect to primary");
+		return ret;
+	}
+	snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d",
+			MLX5_DRIVER_NAME, priv->socket);
+	ret = connect(socket_fd, (const struct sockaddr *)&sun, sizeof(sun));
+	if (ret < 0) {
+		ret = -errno;
+		WARN("cannot connect to primary");
+		goto out;
+	}
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg == NULL) {
+		DEBUG("cannot get first message");
+		ret = -EINVAL; /* ret is 0 here, which would read as fd 0. */
+		goto out;
+	}
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDENTIALS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(*cred));
+	cred = (struct ucred *)CMSG_DATA(cmsg);
+	cred->pid = getpid();
+	cred->uid = getuid();
+	cred->gid = getgid();
+	ret = sendmsg(socket_fd, &msg, MSG_DONTWAIT);
+	if (ret < 0) {
+		ret = -errno;
+		WARN("cannot send credentials to primary: %s", strerror(-ret));
+		goto out;
+	}
+	ret = recvmsg(socket_fd, &msg, MSG_WAITALL);
+	if (ret <= 0) {
+		ret = ret < 0 ? -errno : -ECONNRESET; /* 0: peer closed. */
+		WARN("no message from primary: %s", strerror(-ret));
+		goto out;
+	}
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS) {
+		WARN("No file descriptor received");
+		ret = -EINVAL; /* ret holds a byte count, not an fd. */
+		goto out;
+	}
+	fd = (int *)CMSG_DATA(cmsg);
+	if (*fd <= 0) {
+		WARN("no file descriptor received: %s", strerror(errno));
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = *fd;
+out:
+	close(socket_fd);
+	return ret;
+}
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 0ea6630..6f57319 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -36,6 +36,8 @@
 #include <errno.h>
 #include <string.h>
 #include <stdint.h>
+#include <unistd.h>
+#include <sys/mman.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -174,6 +176,8 @@
 	struct mlx5dv_cq cq_info;
 	struct mlx5dv_obj obj;
 
+	qp.comp_mask = MLX5DV_QP_MASK_UAR_INFO;
+
 	obj.cq.in = ibcq;
 	obj.cq.out = &cq_info;
 	obj.qp.in = tmpl->qp;
@@ -183,7 +187,7 @@
 	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
 		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
 		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
-		return EINVAL;
+		return -EINVAL;
 	}
 	tmpl->txq.cqe_n = log2above(cq_info.cqe_cnt);
 	tmpl->txq.qp_num_8s = tmpl->qp->qp_num << 8;
@@ -198,6 +202,14 @@
 	tmpl->txq.elts =
 		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])
 		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
+	/* Test the bit with '&': OR-ing a non-zero mask is always true. */
+	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_INFO) {
+		tmpl->uar_mmap_offset = qp.uar_info.mmap_offset;
+	} else {
+		ERROR("Failed to retrieve UAR info, invalid libmlx5.so version");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -539,42 +551,53 @@
 }
 
 /**
- * DPDK callback for TX in secondary processes.
+ * Map locally the UAR pages used by Tx queues for the BlueFlame doorbell.
  *
- * This function configures all queues from primary process information
- * if necessary before reverting to the normal TX burst callback.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param fd
+ *   Verbs file descriptor to map UAR pages.
  *
  * @return
- *   Number of packets successfully transmitted (<= pkts_n).
+ *   0 on success, -1 if any mmap() call fails.
  */
-uint16_t
-mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
-			      uint16_t pkts_n)
+int
+mlx5_tx_uar_remap(struct priv *priv, int fd)
 {
-	struct txq *txq = dpdk_txq;
-	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-	struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
-	struct priv *primary_priv;
-	unsigned int index;
+	unsigned int i, j;
+	/* Never declare a zero-length VLA (undefined behaviour). */
+	uintptr_t pages[priv->txqs_n ? priv->txqs_n : 1];
+	unsigned int pages_n = 0;
+	uintptr_t uar_va;
+	void *addr;
+	struct txq *txq;
+	struct txq_ctrl *txq_ctrl;
+	size_t page_size = sysconf(_SC_PAGESIZE);
 
-	if (priv == NULL)
-		return 0;
-	primary_priv =
-		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
-	/* Look for queue index in both private structures. */
-	for (index = 0; index != priv->txqs_n; ++index)
-		if (((*primary_priv->txqs)[index] == txq) ||
-		    ((*priv->txqs)[index] == txq))
-			break;
-	if (index == priv->txqs_n)
-		return 0;
-	txq = (*priv->txqs)[index];
-	return priv->dev->tx_pkt_burst(txq, pkts, pkts_n);
+	for (i = 0; i != priv->txqs_n; ++i) {
+		txq = (*priv->txqs)[i];
+		if (txq == NULL) /* Queue slot not configured. */
+			continue;
+		txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+		uar_va = (uintptr_t)txq_ctrl->txq.bf_reg;
+		uar_va = RTE_ALIGN_FLOOR(uar_va,
+				page_size / priv->num_uars_per_page);
+		/* Queues may resolve to the same page: map each page once. */
+		for (j = 0; j != pages_n; ++j)
+			if (pages[j] == uar_va)
+				break;
+		if (j != pages_n)
+			continue;
+
+		/* New page: record it, then map it at the same address. */
+		pages[pages_n++] = uar_va;
+		addr = mmap((void *)uar_va, page_size,
+				PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
+				txq_ctrl->uar_mmap_offset);
+		if (addr != (void *)uar_va) {
+			ERROR("call to mmap failed on UAR for txq %u", i);
+			return -1;
+		}
+	}
+	return 0;
 }
-- 
1.8.3.1



More information about the dev mailing list