[dpdk-dev] [PATCH 3/3] net/mlx4: add secondary process support

Yongseok Koh yskoh at mellanox.com
Thu Mar 7 08:39:05 CET 2019


In order to support secondary process, a few features are required.

a) rdma-core library should allocate device resources using DPDK's memory
   allocator.

b) UAR should be remapped for secondary processes. Currently, to avoid
   using a different data structure for secondary processes, the PMD tries
   to reserve an identical virtual address space for both primary and
   secondary processes.

c) IPC channel is necessary, which can be easily set with rte_mp APIs.
   Through the channel, Verbs command FD is delivered to the secondary
   process and the device stop/start event is also broadcast from primary
   process.

Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |  10 +
 drivers/net/mlx4/Makefile         |   6 +
 drivers/net/mlx4/meson.build      |   3 +
 drivers/net/mlx4/mlx4.c           | 378 ++++++++++++++++++++++++++++++++++++--
 drivers/net/mlx4/mlx4.h           |  57 ++++++
 drivers/net/mlx4/mlx4_mp.c        | 278 ++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_mr.c        |  32 +++-
 drivers/net/mlx4/mlx4_prm.h       |   4 +-
 drivers/net/mlx4/mlx4_rxtx.c      |   2 +
 drivers/net/mlx4/mlx4_rxtx.h      |   1 +
 drivers/net/mlx4/mlx4_txq.c       | 110 +++++++++++
 12 files changed, 860 insertions(+), 22 deletions(-)
 create mode 100644 drivers/net/mlx4/mlx4_mp.c

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index a211aef332..4502aa2a87 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -29,6 +29,7 @@ Packet type parsing  = Y
 Basic stats          = Y
 Stats per queue      = Y
 FW version           = Y
+Multiprocess aware   = Y
 Other kdrv           = Y
 Power8               = Y
 x86-32               = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 4ad361a2c2..cd34838f41 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -145,6 +145,16 @@ below.
 Limitations
 -----------
 
+- For secondary process:
+
+  - Forked secondary processes are not supported.
+  - All mempools must be initialized before rte_eth_dev_start().
+  - External memory unregistered in EAL memseg list cannot be used for DMA
+    unless such memory has been registered by ``mlx4_mr_update_ext_mp()`` in
+    primary process and remapped to the same virtual address in secondary
+    process. If the external memory is registered by primary process but has
+    different virtual address in secondary process, unexpected error may happen.
+
 - CRC stripping is supported by default and always reported as "true".
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index b527efd625..8126b0dfc6 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -18,6 +18,7 @@ ifneq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y)
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_glue.c
 endif
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mp.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
@@ -93,6 +94,11 @@ mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_UAR_MMAP_OFFSET \
+		infiniband/mlx4dv.h \
+		enum MLX4DV_QP_MASK_UAR_MMAP_OFFSET \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_IBV_MLX4_WQE_LSO_SEG \
 		infiniband/mlx4dv.h \
 		type 'struct mlx4_wqe_lso_seg' \
diff --git a/drivers/net/mlx4/meson.build b/drivers/net/mlx4/meson.build
index 650e2c8fbc..de020701d1 100644
--- a/drivers/net/mlx4/meson.build
+++ b/drivers/net/mlx4/meson.build
@@ -33,6 +33,7 @@ if build
 		'mlx4_ethdev.c',
 		'mlx4_flow.c',
 		'mlx4_intr.c',
+		'mlx4_mp.c',
 		'mlx4_mr.c',
 		'mlx4_rxq.c',
 		'mlx4_rxtx.c',
@@ -76,6 +77,8 @@ if build
 	has_sym_args = [
 		[ 'HAVE_IBV_MLX4_BUF_ALLOCATORS', 'infiniband/mlx4dv.h',
 		'MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS' ],
+		[ 'HAVE_IBV_MLX4_UAR_MMAP_OFFSET', 'infiniband/mlx4dv.h',
+		'MLX4DV_QP_MASK_UAR_MMAP_OFFSET' ],
 	]
 	config = configuration_data()
 	foreach arg:has_sym_args
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 0e0b035df0..a5cfcdbee3 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -17,6 +17,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/mman.h>
 #include <unistd.h>
 
 /* Verbs headers do not support -pedantic. */
@@ -48,10 +49,21 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
-struct mlx4_dev_list mlx4_mem_event_cb_list =
-	LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list);
+#if defined(HAVE_IBV_MLX4_UAR_MMAP_OFFSET) && \
+	defined(HAVE_IBV_MLX4_BUF_ALLOCATORS)
+#define HAVE_IBV_MLX4_SECONDARY_PROCESS
+#endif
+
+static const char *MZ_MLX4_PMD_SHARED_DATA = "mlx4_pmd_shared_data";
+
+/* Shared memory between primary and secondary processes. */
+struct mlx4_shared_data *mlx4_shared_data;
 
-rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
+/* Spinlock for mlx4_shared_data allocation. */
+static rte_spinlock_t mlx4_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Process local data for secondary processes. */
+static struct mlx4_local_data mlx4_local_data;
 
 /** Configuration structure for device arguments. */
 struct mlx4_conf {
@@ -69,6 +81,77 @@ const char *pmd_mlx4_init_params[] = {
 
 static void mlx4_dev_stop(struct rte_eth_dev *dev);
 
+/**
+ * Initialize shared data between primary and secondary process.
+ *
+ * A memzone is reserved by primary process and secondary processes attach to
+ * the memzone.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_init_shared_data(void)
+{
+	const struct rte_memzone *mz;
+	int ret = 0;
+
+	rte_spinlock_lock(&mlx4_shared_data_lock);
+	if (mlx4_shared_data == NULL) {
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			/* Allocate shared memory. */
+			mz = rte_memzone_reserve(MZ_MLX4_PMD_SHARED_DATA,
+						 sizeof(*mlx4_shared_data),
+						 SOCKET_ID_ANY, 0);
+			if (mz == NULL) {
+				ERROR("Cannot allocate mlx4 shared data\n");
+				ret = -rte_errno;
+				goto error;
+			}
+			mlx4_shared_data = mz->addr;
+			memset(mlx4_shared_data, 0, sizeof(*mlx4_shared_data));
+			rte_spinlock_init(&mlx4_shared_data->lock);
+		} else {
+			/* Lookup allocated shared memory. */
+			mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
+			if (mz == NULL) {
+				ERROR("Cannot attach mlx4 shared data\n");
+				ret = -rte_errno;
+				goto error;
+			}
+			mlx4_shared_data = mz->addr;
+			memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
+		}
+	}
+error:
+	rte_spinlock_unlock(&mlx4_shared_data_lock);
+	return ret;
+}
+
+/**
+ * Uninitialize shared data between primary and secondary process.
+ *
+ * The primary process frees the memzone while a secondary process clears its
+ * local data; in both cases the shared data pointer is reset to NULL.
+ */
+static void
+mlx4_uninit_shared_data(void)
+{
+	const struct rte_memzone *mz;
+
+	rte_spinlock_lock(&mlx4_shared_data_lock);
+	if (mlx4_shared_data) {
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
+			rte_memzone_free(mz);
+		} else {
+			memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
+		}
+		mlx4_shared_data = NULL;
+	}
+	rte_spinlock_unlock(&mlx4_shared_data_lock);
+}
+
 #ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS
 /**
  * Verbs callback to allocate a memory. This function should allocate the space
@@ -181,6 +264,11 @@ mlx4_dev_start(struct rte_eth_dev *dev)
 		return 0;
 	DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
 	priv->started = 1;
+	ret = mlx4_tx_uar_remap(dev, priv->ctx->cmd_fd);
+	if (ret) {
+		ERROR("%p: cannot remap UAR", (void *)dev);
+		goto err;
+	}
 	ret = mlx4_rss_init(priv);
 	if (ret) {
 		ERROR("%p: cannot initialize RSS resources: %s",
@@ -208,6 +296,8 @@ mlx4_dev_start(struct rte_eth_dev *dev)
 	rte_wmb();
 	dev->tx_pkt_burst = mlx4_tx_burst;
 	dev->rx_pkt_burst = mlx4_rx_burst;
+	/* Enable datapath on secondary process. */
+	mlx4_mp_req_start_rxtx(dev);
 	return 0;
 err:
 	mlx4_dev_stop(dev);
@@ -226,6 +316,8 @@ static void
 mlx4_dev_stop(struct rte_eth_dev *dev)
 {
 	struct mlx4_priv *priv = dev->data->dev_private;
+	const size_t page_size = sysconf(_SC_PAGESIZE);
+	int i;
 
 	if (!priv->started)
 		return;
@@ -234,9 +326,20 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
 	dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	dev->rx_pkt_burst = mlx4_rx_burst_removed;
 	rte_wmb();
+	/* Disable datapath on secondary process. */
+	mlx4_mp_req_stop_rxtx(dev);
 	mlx4_flow_sync(priv, NULL);
 	mlx4_rxq_intr_disable(priv);
 	mlx4_rss_deinit(priv);
+	for (i = 0; i != dev->data->nb_tx_queues; ++i) {
+		struct txq *txq;
+
+		txq = dev->data->tx_queues[i];
+		if (!txq)
+			continue;
+		munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->msq.db,
+					       page_size), page_size);
+	}
 }
 
 /**
@@ -259,6 +362,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
 	dev->rx_pkt_burst = mlx4_rx_burst_removed;
 	dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	rte_wmb();
+	/* Disable datapath on secondary process. */
+	mlx4_mp_req_stop_rxtx(dev);
 	mlx4_flow_clean(priv);
 	mlx4_rss_deinit(priv);
 	for (i = 0; i != dev->data->nb_rx_queues; ++i)
@@ -310,6 +415,16 @@ static const struct eth_dev_ops mlx4_dev_ops = {
 	.is_removed = mlx4_is_removed,
 };
 
+#ifdef HAVE_IBV_MLX4_SECONDARY_PROCESS
+/* Available operations from secondary process. */
+static const struct eth_dev_ops mlx4_dev_sec_ops = {
+	.stats_get = mlx4_stats_get,
+	.stats_reset = mlx4_stats_reset,
+	.fw_version_get = mlx4_fw_version_get,
+	.dev_infos_get = mlx4_dev_infos_get,
+};
+#endif
+
 /**
  * Get PCI information from struct ibv_device.
  *
@@ -549,6 +664,200 @@ mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
 
 static struct rte_pci_driver mlx4_driver;
 
+static int
+find_lower_va_bound(const struct rte_memseg_list *msl,
+		const struct rte_memseg *ms, void *arg)
+{
+	void **addr = arg;
+
+	if (msl->external)
+		return 0;
+	if (*addr == NULL)
+		*addr = ms->addr;
+	else
+		*addr = RTE_MIN(*addr, ms->addr);
+
+	return 0;
+}
+
+/**
+ * Reserve UAR address space for primary process.
+ *
+ * Process local resource is used by both primary and secondary to avoid
+ * duplicate reservation. The space has to be available on both primary and
+ * secondary process, TXQ UAR maps to this area using fixed mmap w/o double
+ * check.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_uar_init_primary(void)
+{
+	struct mlx4_shared_data *sd = mlx4_shared_data;
+	void *addr = (void *)0;
+
+	if (sd->uar_base)
+		return 0;
+	/* find out lower bound of hugepage segments */
+	rte_memseg_walk(find_lower_va_bound, &addr);
+	/* keep distance to hugepages to minimize potential conflicts. */
+	addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX4_UAR_OFFSET + MLX4_UAR_SIZE));
+	/* anonymous mmap, no real memory consumption. */
+	addr = mmap(addr, MLX4_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("failed to reserve UAR address space, please"
+		      " adjust MLX4_UAR_SIZE or try --base-virtaddr");
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	/* Accept either same addr or a new addr returned from mmap if target
+	 * range occupied.
+	 */
+	INFO("reserved UAR address space: %p", addr);
+	sd->uar_base = addr; /* for primary and secondary UAR re-mmap. */
+	return 0;
+}
+
+/**
+ * Unmap UAR address space reserved for primary process.
+ */
+static void
+mlx4_uar_uninit_primary(void)
+{
+	struct mlx4_shared_data *sd = mlx4_shared_data;
+
+	if (!sd->uar_base)
+		return;
+	munmap(sd->uar_base, MLX4_UAR_SIZE);
+	sd->uar_base = NULL;
+}
+
+/**
+ * Reserve UAR address space for secondary process, align with primary process.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_uar_init_secondary(void)
+{
+	struct mlx4_shared_data *sd = mlx4_shared_data;
+	struct mlx4_local_data *ld = &mlx4_local_data;
+	void *addr;
+
+	if (ld->uar_base) { /* Already reserved. */
+		assert(sd->uar_base == ld->uar_base);
+		return 0;
+	}
+	assert(sd->uar_base);
+	/* anonymous mmap, no real memory consumption. */
+	addr = mmap(sd->uar_base, MLX4_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("UAR mmap failed: %p size: %llu",
+		      sd->uar_base, MLX4_UAR_SIZE);
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	if (sd->uar_base != addr) {
+		ERROR("UAR address %p size %llu occupied, please"
+		      " adjust MLX4_UAR_OFFSET or try EAL parameter"
+		      " --base-virtaddr",
+		      sd->uar_base, MLX4_UAR_SIZE);
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	ld->uar_base = addr;
+	INFO("reserved UAR address space: %p", addr);
+	return 0;
+}
+
+/**
+ * Unmap UAR address space reserved for secondary process.
+ */
+static void
+mlx4_uar_uninit_secondary(void)
+{
+	struct mlx4_local_data *ld = &mlx4_local_data;
+
+	if (!ld->uar_base)
+		return;
+	munmap(ld->uar_base, MLX4_UAR_SIZE);
+	ld->uar_base = NULL;
+}
+
+/**
+ * PMD global initialization.
+ *
+ * Independent from individual device, this function initializes global
+ * per-PMD data structures distinguishing primary and secondary processes.
+ * Hence, each initialization is called once per process.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_init_once(void)
+{
+	struct mlx4_shared_data *sd;
+	struct mlx4_local_data *ld = &mlx4_local_data;
+	int ret;
+
+	if (mlx4_init_shared_data())
+		return -rte_errno;
+	sd = mlx4_shared_data;
+	assert(sd);
+	rte_spinlock_lock(&sd->lock);
+	switch (rte_eal_process_type()) {
+	case RTE_PROC_PRIMARY:
+		if (sd->init_done)
+			break;
+		LIST_INIT(&sd->mem_event_cb_list);
+		rte_rwlock_init(&sd->mem_event_rwlock);
+		rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
+						mlx4_mr_mem_event_cb, NULL);
+		mlx4_mp_init_primary();
+		ret = mlx4_uar_init_primary();
+		if (ret)
+			goto error;
+		sd->init_done = true;
+		break;
+	case RTE_PROC_SECONDARY:
+		if (ld->init_done)
+			break;
+		mlx4_mp_init_secondary();
+		ret = mlx4_uar_init_secondary();
+		if (ret)
+			goto error;
+		++sd->secondary_cnt;
+		ld->init_done = true;
+		break;
+	default:
+		break;
+	}
+	rte_spinlock_unlock(&sd->lock);
+	return 0;
+error:
+	switch (rte_eal_process_type()) {
+	case RTE_PROC_PRIMARY:
+		mlx4_uar_uninit_primary();
+		mlx4_mp_uninit_primary();
+		rte_mem_event_callback_unregister("MLX4_MEM_EVENT_CB", NULL);
+		break;
+	case RTE_PROC_SECONDARY:
+		mlx4_uar_uninit_secondary();
+		mlx4_mp_uninit_secondary();
+		break;
+	default:
+		break;
+	}
+	rte_spinlock_unlock(&sd->lock);
+	mlx4_uninit_shared_data();
+	return -rte_errno;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -579,6 +888,12 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 	int i;
 
 	(void)pci_drv;
+	err = mlx4_init_once();
+	if (err) {
+		ERROR("unable to init PMD global data: %s",
+		      strerror(rte_errno));
+		return -rte_errno;
+	}
 	assert(pci_drv == &mlx4_driver);
 	list = mlx4_glue->get_device_list(&i);
 	if (list == NULL) {
@@ -659,6 +974,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		struct mlx4_priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
+		char name[RTE_ETH_NAME_MAX_LEN];
 
 		/* If port is not enabled, skip. */
 		if (!(conf.ports.enabled & (1 << i)))
@@ -669,6 +985,44 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			err = ENODEV;
 			goto port_error;
 		}
+		snprintf(name, sizeof(name), "%s port %u",
+			 mlx4_glue->get_device_name(ibv_dev), port);
+#ifdef HAVE_IBV_MLX4_SECONDARY_PROCESS
+		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+			eth_dev = rte_eth_dev_attach_secondary(name);
+			if (eth_dev == NULL) {
+				ERROR("can not attach rte ethdev");
+				rte_errno = ENOMEM;
+				err = rte_errno;
+				goto error;
+			}
+			eth_dev->device = &pci_dev->device;
+			eth_dev->dev_ops = &mlx4_dev_sec_ops;
+			/* Receive command fd from primary process */
+			err = mlx4_mp_req_verbs_cmd_fd(eth_dev);
+			if (err < 0) {
+				err = rte_errno;
+				goto error;
+			}
+			/* Remap UAR for Tx queues. */
+			err = mlx4_tx_uar_remap(eth_dev, err);
+			if (err) {
+				err = rte_errno;
+				goto error;
+			}
+			/*
+			 * Ethdev pointer is still required as input since
+			 * the primary device is not accessible from the
+			 * secondary process.
+			 */
+			eth_dev->tx_pkt_burst = mlx4_tx_burst;
+			eth_dev->rx_pkt_burst = mlx4_rx_burst;
+			claim_zero(mlx4_glue->close_device(ctx));
+			rte_eth_copy_pci_info(eth_dev, pci_dev);
+			rte_eth_dev_probing_finish(eth_dev);
+			continue;
+		}
+#endif
 		/* Check port status. */
 		err = mlx4_glue->query_port(ctx, port, &port_attr);
 		if (err) {
@@ -774,14 +1128,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		/* Get actual MTU if possible. */
 		mlx4_mtu_get(priv, &priv->mtu);
 		DEBUG("port %u MTU is %u", priv->port, priv->mtu);
-		/* from rte_ethdev.c */
-		{
-			char name[RTE_ETH_NAME_MAX_LEN];
-
-			snprintf(name, sizeof(name), "%s port %u",
-				 mlx4_glue->get_device_name(ibv_dev), port);
-			eth_dev = rte_eth_dev_allocate(name);
-		}
+		eth_dev = rte_eth_dev_allocate(name);
 		if (eth_dev == NULL) {
 			err = ENOMEM;
 			ERROR("can not allocate rte ethdev");
@@ -842,9 +1189,10 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			goto port_error;
 		}
 		/* Add device to memory callback list. */
-		rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
-		LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb);
-		rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+		rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
+		LIST_INSERT_HEAD(&mlx4_shared_data->mem_event_cb_list,
+				 priv, mem_event_cb);
+		rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
 		continue;
 port_error:
@@ -1075,8 +1423,6 @@ RTE_INIT(rte_mlx4_pmd_init)
 	}
 	mlx4_glue->fork_init();
 	rte_pci_register(&mlx4_driver);
-	rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
-					mlx4_mr_mem_event_cb, NULL);
 }
 
 RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index d43e05ea74..bb75f99e03 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -53,6 +53,16 @@
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
+/* Reserved address space for UAR mapping. */
+#define MLX4_UAR_SIZE (1ULL << (sizeof(uintptr_t) * 4))
+
+/* Offset of reserved UAR address space to hugepage memory. Offset is used here
+ * to minimize the possibility of an address next to hugepages being used by
+ * other code in either primary or secondary process. Failing to map TX UAR
+ * would make TX packets invisible to HW.
+ */
+#define MLX4_UAR_OFFSET (2ULL << (sizeof(uintptr_t) * 4))
+
 enum {
 	PCI_VENDOR_ID_MELLANOX = 0x15b3,
 };
@@ -63,6 +73,23 @@ enum {
 	PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
 };
 
+/* Request types for IPC. */
+enum mlx4_mp_req_type {
+	MLX4_MP_REQ_VERBS_CMD_FD = 1,
+	MLX4_MP_REQ_START_RXTX,
+	MLX4_MP_REQ_STOP_RXTX,
+};
+
+/* Parameters for IPC. */
+struct mlx4_mp_param {
+	enum mlx4_mp_req_type type;
+	int port_id;
+	int result;
+};
+
+/** Key string for IPC. */
+#define MLX4_MP_NAME "net_mlx4_mp"
+
 /** Driver name reported to lower layers and used in log output. */
 #define MLX4_DRIVER_NAME "net_mlx4"
 
@@ -93,6 +120,27 @@ struct mlx4_verbs_alloc_ctx {
 LIST_HEAD(mlx4_dev_list, mlx4_priv);
 LIST_HEAD(mlx4_mr_list, mlx4_mr);
 
+/* Shared data between primary and secondary processes. */
+struct mlx4_shared_data {
+	rte_spinlock_t lock;
+	/* Global spinlock for primary and secondary processes. */
+	int init_done; /* Whether primary has done initialization. */
+	unsigned int secondary_cnt; /* Number of secondary processes init'd. */
+	void *uar_base;
+	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+	struct mlx4_dev_list mem_event_cb_list;
+	rte_rwlock_t mem_event_rwlock;
+};
+
+/* Per-process data structure, not visible to other processes. */
+struct mlx4_local_data {
+	int init_done; /* Whether a secondary has done initialization. */
+	void *uar_base;
+	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+};
+
+extern struct mlx4_shared_data *mlx4_shared_data;
+
 /** Private data structure. */
 struct mlx4_priv {
 	LIST_ENTRY(mlx4_priv) mem_event_cb;
@@ -175,4 +223,13 @@ void mlx4_rxq_intr_disable(struct mlx4_priv *priv);
 int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
 int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
 
+/* mlx4_mp.c */
+void mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev);
+void mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev);
+int mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+void mlx4_mp_init_primary(void);
+void mlx4_mp_uninit_primary(void);
+void mlx4_mp_init_secondary(void);
+void mlx4_mp_uninit_secondary(void);
+
 #endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_mp.c b/drivers/net/mlx4/mlx4_mp.c
new file mode 100644
index 0000000000..b0a91b44fd
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_mp.c
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 6WIND S.A.
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_eal.h>
+#include <rte_ethdev_driver.h>
+#include <rte_string_fns.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Initialize IPC message.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ * @param[out] msg
+ *   Pointer to message to fill in.
+ * @param[in] type
+ *   Message type.
+ */
+static inline void
+mp_init_msg(struct rte_eth_dev *dev, struct rte_mp_msg *msg,
+	    enum mlx4_mp_req_type type)
+{
+	struct mlx4_mp_param *param = (struct mlx4_mp_param *)msg->param;
+
+	memset(msg, 0, sizeof(*msg));
+	strlcpy(msg->name, MLX4_MP_NAME, sizeof(msg->name));
+	msg->len_param = sizeof(*param);
+	param->type = type;
+	param->port_id = dev->data->port_id;
+}
+
+/**
+ * IPC message handler of the primary process.
+ *
+ * @param[in] mp_msg
+ *   Pointer to the received request message.
+ * @param[in] peer
+ *   Pointer to the peer socket path.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+	struct rte_mp_msg mp_res;
+	struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
+	const struct mlx4_mp_param *param =
+		(const struct mlx4_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev = &rte_eth_devices[param->port_id];
+	struct mlx4_priv *priv = dev->data->dev_private;
+	int ret = 0;
+
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	switch (param->type) {
+	case MLX4_MP_REQ_VERBS_CMD_FD:
+		mp_init_msg(dev, &mp_res, param->type);
+		mp_res.num_fds = 1;
+		mp_res.fds[0] = priv->ctx->cmd_fd;
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	default:
+		rte_errno = EINVAL;
+		ERROR("port %u invalid mp request type", dev->data->port_id);
+		return -rte_errno;
+	}
+	return ret;
+}
+
+/**
+ * IPC message handler of a secondary process.
+ *
+ * @param[in] mp_msg
+ *   Pointer to the received request message.
+ * @param[in] peer
+ *   Pointer to the peer socket path.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+	struct rte_mp_msg mp_res;
+	struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
+	const struct mlx4_mp_param *param =
+		(const struct mlx4_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev = &rte_eth_devices[param->port_id];
+	int ret = 0;
+
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	switch (param->type) {
+	case MLX4_MP_REQ_START_RXTX:
+		INFO("port %u starting datapath", dev->data->port_id);
+		rte_mb();
+		dev->tx_pkt_burst = mlx4_tx_burst;
+		dev->rx_pkt_burst = mlx4_rx_burst;
+		mp_init_msg(dev, &mp_res, param->type);
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX4_MP_REQ_STOP_RXTX:
+		INFO("port %u stopping datapath", dev->data->port_id);
+		dev->tx_pkt_burst = mlx4_tx_burst_removed;
+		dev->rx_pkt_burst = mlx4_rx_burst_removed;
+		rte_mb();
+		mp_init_msg(dev, &mp_res, param->type);
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	default:
+		rte_errno = EINVAL;
+		ERROR("port %u invalid mp request type", dev->data->port_id);
+		return -rte_errno;
+	}
+	return ret;
+}
+
+/**
+ * Broadcast request of stopping/starting data-path to secondary processes.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ * @param[in] type
+ *   Request type.
+ */
+static void
+mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx4_mp_req_type type)
+{
+	struct rte_mp_msg mp_req;
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mlx4_mp_param *res __rte_unused;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	int ret;
+
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (!mlx4_shared_data->secondary_cnt)
+		return;
+	if (type != MLX4_MP_REQ_START_RXTX && type != MLX4_MP_REQ_STOP_RXTX) {
+		ERROR("port %u unknown request (req_type %d)",
+		      dev->data->port_id, type);
+		return;
+	}
+	mp_init_msg(dev, &mp_req, type);
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		ERROR("port %u failed to request stop/start Rx/Tx (%d)",
+		      dev->data->port_id, type);
+		goto exit;
+	}
+	if (mp_rep.nb_sent != mp_rep.nb_received) {
+		ERROR("port %u not all secondaries responded (req_type %d)",
+		      dev->data->port_id, type);
+		goto exit;
+	}
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mlx4_mp_param *)mp_res->param;
+	assert(!res->result);
+exit:
+	free(mp_rep.msgs);
+}
+
+/**
+ * Broadcast request of starting data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ */
+void
+mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev)
+{
+	mp_req_on_rxtx(dev, MLX4_MP_REQ_START_RXTX);
+}
+
+/**
+ * Broadcast request of stopping data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ */
+void
+mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev)
+{
+	mp_req_on_rxtx(dev, MLX4_MP_REQ_STOP_RXTX);
+}
+
+/**
+ * Request the Verbs command FD from the primary process.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ *
+ * @return
+ *   fd on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
+{
+	struct rte_mp_msg mp_req;
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mlx4_mp_param *res __rte_unused;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	int cmd_fd;
+	int ret;
+
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	mp_init_msg(dev, &mp_req, MLX4_MP_REQ_VERBS_CMD_FD);
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		ERROR("port %u failed to get command FD from primary process",
+		      dev->data->port_id);
+		return -rte_errno;
+	}
+	assert(mp_rep.nb_received == 1);
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mlx4_mp_param *)mp_res->param;
+	assert(!res->result);
+	assert(mp_res->num_fds == 1);
+	cmd_fd = mp_res->fds[0];
+	free(mp_rep.msgs);
+	DEBUG("port %u command FD from primary is %d",
+	      dev->data->port_id, cmd_fd);
+	return cmd_fd;
+}
+
+/**
+ * Initialize by primary process.
+ */
+void
+mlx4_mp_init_primary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	rte_mp_action_register(MLX4_MP_NAME, mp_primary_handle);
+}
+
+/**
+ * Un-initialize by primary process.
+ */
+void
+mlx4_mp_uninit_primary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	rte_mp_action_unregister(MLX4_MP_NAME);
+}
+
+/**
+ * Initialize by secondary process.
+ */
+void
+mlx4_mp_init_secondary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	rte_mp_action_register(MLX4_MP_NAME, mp_secondary_handle);
+}
+
+/**
+ * Un-initialize by secondary process.
+ */
+void
+mlx4_mp_uninit_secondary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	rte_mp_action_unregister(MLX4_MP_NAME);
+}
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index e4be46ab2a..01894faecf 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -489,6 +489,8 @@ mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
 	struct mlx4_mr *mr_next;
 	struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
+	/* Must be called from the primary process. */
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/*
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
@@ -561,6 +563,14 @@ mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 
 	DEBUG("port %u creating a MR using address (%p)",
 	      dev->data->port_id, (void *)addr);
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+		WARN("port %u using address (%p) of unregistered mempool"
+		     " in secondary process, please create mempool"
+		     " before rte_eth_dev_start()",
+		     dev->data->port_id, (void *)addr);
+		rte_errno = EPERM;
+		goto err_nolock;
+	}
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
 	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
@@ -890,14 +900,17 @@ mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 		     size_t len, void *arg __rte_unused)
 {
 	struct mlx4_priv *priv;
+	struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list;
 
+	/* Must be called from the primary process. */
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	switch (event_type) {
 	case RTE_MEM_EVENT_FREE:
-		rte_rwlock_read_lock(&mlx4_mem_event_rwlock);
+		rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock);
 		/* Iterate all the existing mlx4 devices. */
-		LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb)
+		LIST_FOREACH(priv, dev_list, mem_event_cb)
 			mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
-		rte_rwlock_read_unlock(&mlx4_mem_event_rwlock);
+		rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock);
 		break;
 	case RTE_MEM_EVENT_ALLOC:
 	default:
@@ -1130,6 +1143,7 @@ mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
 	struct mlx4_mr_cache entry;
 	uint32_t lkey;
 
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
 	rte_rwlock_read_lock(&priv->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
@@ -1225,6 +1239,14 @@ mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp)
 	struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
 	struct mlx4_priv *priv = txq->priv;
 
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+		WARN("port %u using address (%p) from unregistered mempool"
+		     " having externally allocated memory"
+		     " in secondary process, please create mempool"
+		     " prior to rte_eth_dev_start()",
+		     PORT_ID(priv), (void *)addr);
+		return UINT32_MAX;
+	}
 	mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
 	return mlx4_tx_addr2mr_bh(txq, addr);
 }
@@ -1336,9 +1358,9 @@ mlx4_mr_release(struct rte_eth_dev *dev)
 	struct mlx4_mr *mr_next = LIST_FIRST(&priv->mr.mr_list);
 
 	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+	rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
 	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+	rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
 #ifndef NDEBUG
 	mlx4_mr_dump_dev(dev);
 #endif
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index aef77ba06e..b3e11dde25 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -77,7 +77,9 @@ struct mlx4_sq {
 	uint32_t owner_opcode;
 	/**< Default owner opcode with HW valid owner bit. */
 	uint32_t stamp; /**< Stamp value with an invalid HW owner bit. */
-	volatile uint32_t *db; /**< Pointer to the doorbell. */
+	volatile uint32_t *qp_sdb; /**< Pointer to the doorbell from Verbs. */
+	volatile uint32_t *db; /**< Pointer to the remapped doorbell. */
+	off_t uar_mmap_offset; /**< UAR mmap offset for non-primary process. */
 	uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
 };
 
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 8c88effcd1..f22f1ba559 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -1365,6 +1365,7 @@ mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	(void)dpdk_txq;
 	(void)pkts;
 	(void)pkts_n;
+	rte_mb();
 	return 0;
 }
 
@@ -1390,5 +1391,6 @@ mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	(void)dpdk_rxq;
 	(void)pkts;
 	(void)pkts_n;
+	rte_mb();
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 9409602b32..7d7a8988ed 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -152,6 +152,7 @@ uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 
 /* mlx4_txq.c */
 
+int mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd);
 uint64_t mlx4_get_tx_port_offloads(struct mlx4_priv *priv);
 int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			uint16_t desc, unsigned int socket,
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 2dc198e77f..f3275fe024 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -13,7 +13,9 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include <sys/mman.h>
 #include <inttypes.h>
+#include <unistd.h>
 
 /* Verbs headers do not support -pedantic. */
 #ifdef PEDANTIC
@@ -38,6 +40,97 @@
 #include "mlx4_utils.h"
 
 /**
+ * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
+ * Both primary and secondary processes do the mmap to make the UAR
+ * address aligned.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param fd
+ *   Verbs file descriptor to map UAR pages.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+int
+mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd)
+{
+	unsigned int i, j;
+	const unsigned int txqs_n = dev->data->nb_tx_queues;
+	uintptr_t pages[txqs_n];
+	unsigned int pages_n = 0;
+	uintptr_t uar_va;
+	uintptr_t off;
+	void *addr;
+	void *ret;
+	struct txq *txq;
+	int already_mapped;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+
+	memset(pages, 0, txqs_n * sizeof(uintptr_t));
+	/*
+	 * As in rdma-core, UARs are mapped in units of the OS page size.
+	 * Use the page-aligned address to avoid duplicate mmap.
+	 * Refer to the libmlx4 function mlx4_init_context().
+	 */
+	for (i = 0; i != txqs_n; ++i) {
+		txq = dev->data->tx_queues[i];
+		if (!txq)
+			continue;
+		/* UAR addr from verbs used to find dup and offset in page. */
+		uar_va = (uintptr_t)txq->msq.qp_sdb;
+		off = uar_va & (page_size - 1); /* offset in page. */
+		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr. */
+		already_mapped = 0;
+		for (j = 0; j != pages_n; ++j) {
+			if (pages[j] == uar_va) {
+				already_mapped = 1;
+				break;
+			}
+		}
+		/* new address in reserved UAR address space. */
+		addr = RTE_PTR_ADD(mlx4_shared_data->uar_base,
+				   uar_va & (uintptr_t)(MLX4_UAR_SIZE - 1));
+		if (!already_mapped) {
+			pages[pages_n++] = uar_va;
+			/* fixed mmap to specified address in reserved
+			 * address space.
+			 */
+			ret = mmap(addr, page_size,
+				   PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
+				   txq->msq.uar_mmap_offset);
+			if (ret != addr) {
+				/* fixed mmap has to return same address. */
+				ERROR("call to mmap failed on UAR for txq %u",
+				      i);
+				rte_errno = ENXIO;
+				return -rte_errno;
+			}
+		}
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once. */
+			txq->msq.db = RTE_PTR_ADD((void *)addr, off);
+		else
+			assert(txq->msq.db ==
+			       RTE_PTR_ADD((void *)addr, off));
+	}
+	return 0;
+}
+#else
+int
+mlx4_tx_uar_remap(struct rte_eth_dev *dev __rte_unused, int fd __rte_unused)
+{
+	/*
+	 * If rdma-core doesn't support UAR remap, secondary process is not
+	 * supported, thus secondary cannot call this function but only primary
+	 * makes a call. Return success to not interrupt initialization.
+	 */
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	return 0;
+}
+#endif
+
+/**
  * Free Tx queue elements.
  *
  * @param txq
@@ -89,7 +182,12 @@ mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
 	sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
 	sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
 				     (0u << MLX4_SQ_OWNER_BIT));
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+	sq->uar_mmap_offset = dqp->uar_mmap_offset;
+	sq->qp_sdb = dqp->sdb;
+#else
 	sq->db = dqp->sdb;
+#endif
 	sq->doorbell_qpn = dqp->doorbell_qpn;
 	cq->buf = dcq->buf.buf;
 	cq->cqe_cnt = dcq->cqe_cnt;
@@ -307,6 +405,11 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		goto error;
 	}
 	/* Retrieve device queue information. */
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+	dv_qp = (struct mlx4dv_qp){
+		.comp_mask = MLX4DV_QP_MASK_UAR_MMAP_OFFSET,
+	};
+#endif
 	mlxdv.cq.in = txq->cq;
 	mlxdv.cq.out = &dv_cq;
 	mlxdv.qp.in = txq->qp;
@@ -318,6 +421,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		      " accessing the device queues", (void *)dev);
 		goto error;
 	}
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+	if (!(dv_qp.comp_mask & MLX4DV_QP_MASK_UAR_MMAP_OFFSET)) {
+		rte_errno = EINVAL;
+		ERROR("%p: failed to obtain UAR mmap offset", (void *)dev);
+		goto error;
+	}
+#endif
 	mlx4_txq_fill_dv_obj_info(txq, &mlxdv);
 	/* Save first wqe pointer in the first element. */
 	(&(*txq->elts)[0])->wqe =
-- 
2.11.0



More information about the dev mailing list