[dpdk-dev] [PATCH v1 2/2] net/mlx5: add multiple process support

Nélio Laranjeiro nelio.laranjeiro at 6wind.com
Fri Aug 25 09:27:22 CEST 2017


Hi Xueming,

Please see comments below,

On Thu, Aug 24, 2017 at 10:03:41PM +0800, Xueming Li wrote:
> PMD uses Verbs object which were not available in the shared memory, in
> addition, due to IO pages, it was not possible to use the primary
> process Tx queues from a secondary process.
> 
> This patch modify the location where Verbs objects are allocated (from
> process memory address space to shared memory address space) and thus
> allow a secondary process to use those object by mapping this shared
> memory space its own memory space.
> For Tx IO pages, it uses a unix socket to get back the communication
> channel with the Kernel driver from the primary process, this is
> necessary to remap those pages in the secondary process memory space and
> thus use the same Tx queues.
> 
> This is only supported from Linux kernel (v4.14) and rdma-core (v14).

Will it not be supported also with MLNX_OFED 4.2 on older kernels?

> Cc: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
> Signed-off-by: Xueming Li <xuemingl at mellanox.com>
> ---
>  doc/guides/nics/mlx5.rst       |   3 +-
>  drivers/net/mlx5/Makefile      |   1 +
>  drivers/net/mlx5/mlx5.c        | 132 ++++++++++++------
>  drivers/net/mlx5/mlx5.h        |  18 +--
>  drivers/net/mlx5/mlx5_ethdev.c | 215 ++++++------------------------
>  drivers/net/mlx5/mlx5_rxq.c    |  41 ------
>  drivers/net/mlx5/mlx5_rxtx.h   |   5 +-
>  drivers/net/mlx5/mlx5_socket.c | 294 +++++++++++++++++++++++++++++++++++++++++
>  drivers/net/mlx5/mlx5_txq.c    |  89 ++++++++-----
>  9 files changed, 501 insertions(+), 297 deletions(-)
>  create mode 100644 drivers/net/mlx5/mlx5_socket.c

You should also bring back the "Multiprocess aware" in
doc/guides/nics/features/mlx5.ini removed in [1].

> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 39a159c..3002e7e 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -126,6 +126,52 @@ struct mlx5_args {
>  }
>  
>  /**
> + * Verbs callback to allocate a memory. This function should allocate the space
> + * according to the size provided residing inside a huge page.
> + *
> + * @param[in] size
> + *   The size in bytes of the memory to allocate.
> + * @param[in] data
> + *   A pointer to the callback data.
> + *
> + * @return
> + *   a pointer to the allocate space.
> + */
> +static void *
> +mlx5_extern_alloc_buf(size_t size, void *data)
> +{
> +	struct priv *priv = data;
> +	void *ret;
> +	size_t alignment = sysconf(_SC_PAGESIZE);

Seems dangerous assuming in the PMD the page size expected by a library, there
is no guarantee it will not change in the future and this may cause
un-expected behaviors.
The library should provide such alignment information through the callback
parameter.

> +	assert(data != NULL);
> +	assert(!mlx5_is_secondary());
> +
> +	ret = rte_malloc_socket(__func__, size, alignment,
> +			priv->dev->device->numa_node);

Indentation is wrong.

> +	DEBUG("Extern alloc size: %lu, align: %lu: %p", size, alignment, ret);
> +	return ret;
> +}
>[...]
> @@ -526,6 +573,7 @@ struct mlx5_args {
>  		assert(err > 0);
>  		return -err;
>  	}
> +	err = 0; /* previous errors are handled if attr_ctx is NULL. */
>  	ibv_dev = list[i];
>  
>  	DEBUG("device opened");

This hunk seems to fix a bug not related to this feature, please put it in its
own commit and reference the commit introducing the issue.

> @@ -555,6 +603,40 @@ struct mlx5_args {
>  			.tso = MLX5_ARG_UNSET,
>  		};
>  
> +		mlx5_dev[idx].ports |= test;
> +		if (mlx5_is_secondary()) {
> +			/* from rte_ethdev.c */
> +			char name[RTE_ETH_NAME_MAX_LEN];
> +
> +			snprintf(name, sizeof(name), "%s port %u",
> +				 ibv_get_device_name(ibv_dev), port);
> +			eth_dev = rte_eth_dev_attach_secondary(name);
> +			if (eth_dev == NULL) {
> +				ERROR("can not attach rte ethdev");
> +				err = ENOMEM;
> +				goto error;
> +			}
> +			eth_dev->dev_ops = &mlx5_dev_ops;

There are several operation the secondary process should not be allowed to
make, maybe it is better to defines a new array of ops for the secondary
process and assign it here.

This can also be done on an extra commit adding this new devops array and
removing all the verification is_secondary() across the PMD files.

> +			priv = eth_dev->data->dev_private;
> +			/* TODO replace with mlx5dv_context */

According to a modification below "struct mlx5dv_ctx_allocators alctr = ", it
seems this is already done.
Please add the dependency on Schachar series as comment in the commit log
(after a '---' line), it will considerably help Ferruh.

> @@ -753,37 +835,8 @@ struct mlx5_args {
>  			err = ENOMEM;
>  			goto port_error;
>  		}
> -
> -		/* Secondary processes have to use local storage for their
> -		 * private data as well as a copy of eth_dev->data, but this
> -		 * pointer must not be modified before burst functions are
> -		 * actually called. */
> -		if (mlx5_is_secondary()) {
> -			struct mlx5_secondary_data *sd =
> -				&mlx5_secondary_data[eth_dev->data->port_id];
> -			sd->primary_priv = eth_dev->data->dev_private;
> -			if (sd->primary_priv == NULL) {
> -				ERROR("no private data for port %u",
> -						eth_dev->data->port_id);
> -				err = EINVAL;
> -				goto port_error;
> -			}
> -			sd->shared_dev_data = eth_dev->data;
> -			rte_spinlock_init(&sd->lock);
> -			memcpy(sd->data.name, sd->shared_dev_data->name,
> -				   sizeof(sd->data.name));
> -			sd->data.dev_private = priv;
> -			sd->data.rx_mbuf_alloc_failed = 0;
> -			sd->data.mtu = ETHER_MTU;
> -			sd->data.port_id = sd->shared_dev_data->port_id;
> -			sd->data.mac_addrs = priv->mac;
> -			eth_dev->tx_pkt_burst = mlx5_tx_burst_secondary_setup;
> -			eth_dev->rx_pkt_burst = mlx5_rx_burst_secondary_setup;
> -		} else {
> -			eth_dev->data->dev_private = priv;
> -			eth_dev->data->mac_addrs = priv->mac;
> -		}
> -
> +		eth_dev->data->dev_private = priv;
> +		eth_dev->data->mac_addrs = priv->mac;
>  		eth_dev->device = &pci_dev->device;
>  		rte_eth_copy_pci_info(eth_dev, pci_dev);
>  		eth_dev->device->driver = &mlx5_driver.driver;

This patch is not rebased on top of dpdk-next-net/master, this part as already
been removed on [1].

> @@ -791,6 +844,15 @@ struct mlx5_args {
>  		eth_dev->dev_ops = &mlx5_dev_ops;
>  		TAILQ_INIT(&priv->flows);
>  
> +		/* Hint libmlx5 to use PMD allocator for PRM resources */

PRM stand for Programmer Reference Manual?  If so, please remove this
acronyms from the comment.

> +		struct mlx5dv_ctx_allocators alctr = {
> +				.alloc = &mlx5_extern_alloc_buf,
> +				.free = &mlx5_extern_free_buf,
> +				.data = priv,
> +		};
> +		mlx5dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
> +				(void *)((uintptr_t)&alctr));

Indentation.

> @@ -885,14 +947,6 @@ struct mlx5_args {
>  static void
>  rte_mlx5_pmd_init(void)
>  {
> -	/*
> -	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
> -	 * huge pages. Calling ibv_fork_init() during init allows
> -	 * applications to use fork() safely for purposes other than
> -	 * using this PMD, which is not supported in forked processes.
> -	 */
> -	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
> -	ibv_fork_init();
>  	rte_pci_register(&mlx5_driver);
>  }
  
Seems this modification is not directly related to the multiple process, can
you move it to another commit?

> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 2dee07c..b5d2f67 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -157,16 +157,11 @@ struct priv {
>  	uint32_t link_speed_capa; /* Link speed capabilities. */
>  	struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
>  	rte_spinlock_t lock; /* Lock for control functions. */
> +	int socket; /* Socket to exchange data with secondaries. */
> +	struct rte_intr_handle intr_handle_socket; /* Interrupt handler. */
> +	int num_uars_per_page; /* number of UARs per system page */
>  };

"socket" seems to be a UNIX Socket, to avoid any confusion with "NUMA Socket"
please rename it.

> -/* Local storage for secondary process data. */
> -struct mlx5_secondary_data {
> -	struct rte_eth_dev_data data; /* Local device data. */
> -	struct priv *primary_priv; /* Private structure from primary. */
> -	struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */
> -	rte_spinlock_t lock; /* Port configuration lock. */
> -} mlx5_secondary_data[RTE_MAX_ETHPORTS];
> -

Should also make part of another commit fixing [1].

> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
> index fce7dd5..84efeda 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -132,12 +135,7 @@ struct ethtool_link_settings {
>  struct priv *
>  mlx5_get_priv(struct rte_eth_dev *dev)
>  {
> -	struct mlx5_secondary_data *sd;
> -
> -	if (!mlx5_is_secondary())
> -		return dev->data->dev_private;
> -	sd = &mlx5_secondary_data[dev->data->port_id];
> -	return sd->data.dev_private;
> +	return dev->data->dev_private;
>  }
 
This hunk should also be in another commit fixing [1].
 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 393c500..3940e00 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -290,6 +290,8 @@ struct txq_ctrl {
>  	struct ibv_qp *qp; /* Queue Pair. */
>  	unsigned int socket; /* CPU socket ID for allocations. */
>  	struct txq txq; /* Data path structure. */
> +

Extra empty line.

> +	off_t uar_mmap_offset; /* UAR offset for secondary process mmap. */
>  };
>  
>  /* mlx5_rxq.c */
> diff --git a/drivers/net/mlx5/mlx5_socket.c b/drivers/net/mlx5/mlx5_socket.c
> new file mode 100644
> index 0000000..e371ab6
> --- /dev/null
> +++ b/drivers/net/mlx5/mlx5_socket.c
> @@ -0,0 +1,294 @@
> +#define _GNU_SOURCE
> +
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <fcntl.h>
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +
> +#include "mlx5.h"
> +#include "mlx5_utils.h"
> +
> +/**
> + * Initialise the socket to communicate with the secondary process
> + *
> + * @param[in] priv
> + *   Pointer to private structure.
> + *
> + * @return
> + *   0 on success, errno value on failure.
> + */
> +int
> +priv_socket_init(struct priv *priv)
> +{
> +	struct sockaddr_un sun = {
> +		.sun_family = AF_UNIX,
> +	};
> +	int ret;
> +	int flags;
> +	struct stat file_stat;
> +
> +	/*
> +	 * Initialise the socket to communicate with the secondary
> +	 * process.
> +	 */
> +	ret = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (ret < 0) {
> +		WARN("secondary process not supported: %s", strerror(errno));
> +		return ret;
> +	}
> +	priv->socket = ret;
> +	flags = fcntl(priv->socket, F_GETFL, 0);
> +	if (flags == -1)
> +		goto out;
> +	ret = fcntl(priv->socket, F_SETFL, flags | O_NONBLOCK);
> +	if (ret < 0)
> +		goto out;
> +	snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d",
> +			MLX5_DRIVER_NAME, priv->socket);

To keep the same coding style of the PMD, the indentation should be fixed.

> +	ret = stat(sun.sun_path, &file_stat);
> +	if (!ret)
> +		claim_zero(remove(sun.sun_path));
> +	ret = bind(priv->socket, (const struct sockaddr *)&sun, sizeof(sun));
> +	if (ret < 0) {
> +		WARN("cannot bind socket, secondary process not supported: %s",
> +		     strerror(errno));
> +		goto close;
> +	}
> +	ret = listen(priv->socket, 0);
> +	if (ret < 0) {
> +		WARN("Secondary process not supported: %s", strerror(errno));
> +		goto close;
> +	}
> +	return ret;
> +close:
> +	remove(sun.sun_path);
> +out:
> +	claim_zero(close(priv->socket));
> +	priv->socket = 0;
> +	return -(ret);
> +}
>[...]
> +
> +/**
> + * Connect to the primary process.
> + *
> + * @param[in] priv
> + *   Pointer to private structure.
> + *
> + * @return
> + *   fd on success, negative errno value on failure.
> + */
> +int
> +priv_socket_connect(struct priv *priv)
> +{
> +	struct sockaddr_un sun = {
> +		.sun_family = AF_UNIX,
> +	};
> +	int socket_fd;
> +	int *fd = NULL;
> +	int ret;
> +	struct ucred *cred;
> +	char buf[CMSG_SPACE(sizeof(*cred))] = { 0 };
> +	char vbuf[1024] = { 0 };
> +	struct iovec io = {
> +		.iov_base = vbuf,
> +		.iov_len = sizeof(*vbuf),
> +	};
> +	struct msghdr msg = {
> +		.msg_control = buf,
> +		.msg_controllen = sizeof(buf),
> +		.msg_iov = &io,
> +		.msg_iovlen = 1,
> +	};
> +	struct cmsghdr *cmsg;
> +
> +	ret = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (ret < 0) {
> +		WARN("cannot connect to primary");
> +		return ret;
> +	}
> +	socket_fd = ret;
> +	snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d",
> +			MLX5_DRIVER_NAME, priv->socket);

Same here about indentation.

> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index 0ea6630..6f57319 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -36,6 +36,8 @@
>  #include <errno.h>
>  #include <string.h>
>  #include <stdint.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
>  
>  /* Verbs header. */
>  /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
> @@ -174,6 +176,8 @@
>  	struct mlx5dv_cq cq_info;
>  	struct mlx5dv_obj obj;
>  
> +	qp.comp_mask = MLX5DV_QP_MASK_UAR_INFO;
> +

Extra empty line.

>  	obj.cq.in = ibcq;
>  	obj.cq.out = &cq_info;
>  	obj.qp.in = tmpl->qp;
> @@ -183,7 +187,7 @@
>  	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
>  		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
>  		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
> -		return EINVAL;
> +		return -EINVAL;

This don't respect the return value documented by the function.

>  	}
>  	tmpl->txq.cqe_n = log2above(cq_info.cqe_cnt);
>  	tmpl->txq.qp_num_8s = tmpl->qp->qp_num << 8;
> @@ -198,6 +202,14 @@
>  	tmpl->txq.elts =
>  		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])
>  		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
> +
> +	if (qp.comp_mask | MLX5DV_QP_MASK_UAR_INFO) {
> +		tmpl->uar_mmap_offset = qp.uar_info.mmap_offset;
> +	} else {
> +		ERROR("Failed to retrieve UAR info, invalid libmlx5.so version");
> +		return -EINVAL;

Same here.

> +	}
> +
>  	return 0;
>  }
>  
> @@ -539,42 +551,53 @@
>  }
>  
>  /**
> - * DPDK callback for TX in secondary processes.
> + * Map locally UAR used in Tx queues for BlueFlame doorbell.
>   *
> - * This function configures all queues from primary process information
> - * if necessary before reverting to the normal TX burst callback.
> - *
> - * @param dpdk_txq
> - *   Generic pointer to TX queue structure.
> - * @param[in] pkts
> - *   Packets to transmit.
> - * @param pkts_n
> - *   Number of packets in array.
> + * @param[in] priv
> + *   Pointer to private structure.
> + * @param fd
> + *   Verbs file descriptor to map UAR pages.
>   *
>   * @return
> - *   Number of packets successfully transmitted (<= pkts_n).
> + *   0 on success, errno value on failure.
>   */
> -uint16_t
> -mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
> -			      uint16_t pkts_n)
> +int
> +mlx5_tx_uar_remap(struct priv *priv, int fd)
>  {
> -	struct txq *txq = dpdk_txq;
> -	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
> -	struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
> -	struct priv *primary_priv;
> -	unsigned int index;
> +	unsigned int i, j;
> +	uintptr_t pages[priv->txqs_n];
> +	unsigned int pages_n = 0;
> +	uintptr_t uar_va;
> +	void *addr;
> +	struct txq *txq;
> +	struct txq_ctrl *txq_ctrl;
> +	int already_mapped;
> +	size_t page_size = sysconf(_SC_PAGESIZE);

Same comment here about the page_size, there is no guarantee the library will
use this value in the future.  PMD should not make such assumption.

> -	if (priv == NULL)
> -		return 0;
> -	primary_priv =
> -		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
> -	/* Look for queue index in both private structures. */
> -	for (index = 0; index != priv->txqs_n; ++index)
> -		if (((*primary_priv->txqs)[index] == txq) ||
> -		    ((*priv->txqs)[index] == txq))
> -			break;
> -	if (index == priv->txqs_n)
> -		return 0;
> -	txq = (*priv->txqs)[index];
> -	return priv->dev->tx_pkt_burst(txq, pkts, pkts_n);
> +	for (i = 0; i != priv->txqs_n; ++i) {
> +		txq = (*priv->txqs)[i];
> +		txq_ctrl = container_of(txq, struct txq_ctrl, txq);
> +		uar_va = (uintptr_t)txq_ctrl->txq.bf_reg;
> +		uar_va = RTE_ALIGN_FLOOR(uar_va,
> +				page_size / priv->num_uars_per_page);

Indentation.

> +		already_mapped = 0;
> +		for (j = 0; j != pages_n; ++j) {
> +			if (pages[j] == uar_va) {
> +				already_mapped = 1;
> +				break;
> +			}
> +		}
> +		if (already_mapped)
> +			continue;
> +

Extra empty line.

> +		pages[pages_n++] = uar_va;
> +		addr = mmap((void *)uar_va, page_size,
> +				PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
> +				txq_ctrl->uar_mmap_offset);

Indentation.

> +		if (addr != (void *)uar_va) {
> +			ERROR("call to mmap failed on UAR for txq %d\n", i);
> +			return -1;
> +		}
> +	}
> +	return 0;
>  }
> -- 
> 1.8.3.1

Nice work.

Thanks,

[1] http://dpdk.org/ml/archives/dev/2017-August/073212.html

-- 
Nélio Laranjeiro
6WIND


More information about the dev mailing list