[dpdk-dev] [PATCH v4] net/mlx5: control transmit doorbell register mapping

Raslan Darawsheh rasland at mellanox.com
Fri Nov 8 16:22:32 CET 2019


Hi,
> -----Original Message-----
> From: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> Sent: Friday, November 8, 2019 5:08 PM
> To: dev at dpdk.org
> Cc: Matan Azrad <matan at mellanox.com>; Raslan Darawsheh
> <rasland at mellanox.com>; Ori Kam <orika at mellanox.com>;
> stable at dpdk.org
> Subject: [PATCH v4] net/mlx5: control transmit doorbell register mapping
> 
> The rdma core library can map doorbell register in two ways, depending on
> the environment variable "MLX5_SHUT_UP_BF":
> 
>   - as regular cached memory, the variable is either missing or
>     set to zero. This type of mapping may cause the significant
>     doorbell register writing latency and requires explicit
>     memory write barrier to mitigate this issue and prevent
>     write combining.
> 
>   - as non-cached memory, the variable is present and set to
>     not "0" value. This type of mapping may cause performance
>     impact under heavy loading conditions but the explicit write
>     memory barrier is not required and it may improve core
>     performance.
> 
> A new devarg "tx_db_nc" is introduced. If this parameter is set to zero, the
> doorbell register is forced to be mapped to cached memory and requires an
> explicit memory barrier after each write. If "tx_db_nc" is set to a non-zero
> value, the doorbell is mapped as non-cached memory and does not require the
> memory barrier. If "tx_db_nc" is missing, the behaviour is defined by the
> presence of "MLX5_SHUT_UP_BF" in the environment. If the variable is missing
> as well, the default value is zero for ARM64 hosts and one for others.
> 
> In run time the code checks the mapping type and provides the memory
> barrier after writing to tx doorbell register if it is needed. The mapping type is
> extracted directly from the uar_mmap_offset field in the queue properties.
> 
> Fixes: 18a1c20044c0 ("net/mlx5: implement Tx burst template")
> Cc: stable at dpdk.org
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> Acked-by: Matan Azrad <matan at mellanox.com>
> 
> ---
> It would be nice to have this fix in 19.08.1+
> 
> v4: rebase on top
> v3:
> https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch
> es.dpdk.org%2Fpatch%2F62773%2F&data=02%7C01%7Crasland%40mell
> anox.com%7Ce26268efc564403f944e08d7645d71aa%7Ca652971c7d2e4d9ba6
> a4d149256f461b%7C0%7C0%7C637088224824292194&sdata=qYY7BVPW
> MZHjRpoaxgtUg2vJzuEBG3bSXd042sLsCuw%3D&reserved=0
>     default tx_db_nc values are changed
> v2:
> https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch
> es.dpdk.org%2Fpatch%2F62739%2F&data=02%7C01%7Crasland%40mell
> anox.com%7Ce26268efc564403f944e08d7645d71aa%7Ca652971c7d2e4d9ba6
> a4d149256f461b%7C0%7C0%7C637088224824292194&sdata=ND76ZF3M
> kjPOh8qkDxYswzovfZ3dXY8u6UzY%2Fm9%2FH4c%3D&reserved=0
> 
>  doc/guides/nics/mlx5.rst     | 23 +++++++++++
>  drivers/net/mlx5/Makefile    |  5 +++
>  drivers/net/mlx5/meson.build |  2 +
>  drivers/net/mlx5/mlx5.c      | 90
> ++++++++++++++++++++++++++++++++++++++------
>  drivers/net/mlx5/mlx5.h      |  1 +
>  drivers/net/mlx5/mlx5_defs.h | 16 ++++++++  drivers/net/mlx5/mlx5_rxtx.c
> | 17 ++++++++-  drivers/net/mlx5/mlx5_rxtx.h |  1 +
> drivers/net/mlx5/mlx5_txq.c  | 27 ++++++++++++-
>  9 files changed, 169 insertions(+), 13 deletions(-)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index
> 3651e82..5fd313c 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -552,6 +552,29 @@ Run-time configuration
>    Also, if minimal data inlining is requested by non-zero ``txq_inline_min``
>    option or reported by the NIC, the eMPW feature is disengaged.
> 
> +- ``tx_db_nc`` parameter [int]
> +
> +  The rdma core library can map doorbell register in two ways,
> + depending on the  environment variable "MLX5_SHUT_UP_BF":
> +
> +  - As regular cached memory, if the variable is either missing or set to zero.
> +  - As non-cached memory, if the variable is present and set to not "0" value.
> +
> +  The type of mapping may slightly affect the Tx performance; the
> + optimal choice strongly depends on the host architecture and should be
> determined empirically.
> +
> +  If ``tx_db_nc`` is set to zero, the doorbell is
> + forced to be mapped to regular memory and the PMD performs an extra
> + write memory barrier after writing to the doorbell. This might increase the
> + CPU clocks needed per packet sent, but latency might be improved.
> +
> +  If ``tx_db_nc`` is set to not zero, the doorbell is forced to be
> + mapped to  non cached memory, the PMD will not perform the extra write
> + memory barrier  after writing to doorbell, on some architectures it
> + might improve the  performance.
> +
> +  The default ``tx_db_nc`` value is zero for ARM64 hosts and one for others.
> +
>  - ``tx_vec_en`` parameter [int]
> 
>    A nonzero value enables Tx vector on ConnectX-5, ConnectX-6, ConnectX-6
> DX diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index
> d01fa73..5b79631 100644
> --- a/drivers/net/mlx5/Makefile
> +++ b/drivers/net/mlx5/Makefile
> @@ -206,6 +206,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-
> config-h.sh
>  		func mlx5dv_dr_action_create_flow_meter \
>  		$(AUTOCONF_OUTPUT)
>  	$Q sh -- '$<' '$@' \
> +		HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD \
> +		infiniband/mlx5dv.h \
> +		enum MLX5_MMAP_GET_NC_PAGES_CMD \
> +		$(AUTOCONF_OUTPUT)
> +	$Q sh -- '$<' '$@' \
>  		HAVE_ETHTOOL_LINK_MODE_25G \
>  		/usr/include/linux/ethtool.h \
>  		enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \ diff --
> git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build index
> 511f5b7..05fadf6 100644
> --- a/drivers/net/mlx5/meson.build
> +++ b/drivers/net/mlx5/meson.build
> @@ -134,6 +134,8 @@ if build
>  		'mlx5dv_dr_action_create_dest_devx_tir' ],
>  		[ 'HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER',
> 'infiniband/mlx5dv.h',
>  		'mlx5dv_dr_action_create_flow_meter' ],
> +		[ 'HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD',
> 'infiniband/mlx5dv.h',
> +		'MLX5_MMAP_GET_NC_PAGES_CMD' ],
>  		[ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
>  		'MLX5DV_DR_DOMAIN_TYPE_NIC_RX' ],
>  		[ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h', diff --
> git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 9a2c711..276087d 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -96,6 +96,12 @@
>  #define MLX5_TXQ_MPW_EN "txq_mpw_en"
> 
>  /*
> + * Device parameter to force doorbell register mapping
> + * to non-cached region eliminating the extra write memory barrier.
> + */
> +#define MLX5_TX_DB_NC "tx_db_nc"
> +
> +/*
>   * Device parameter to include 2 dsegs in the title WQEBB.
>   * Deprecated, ignored.
>   */
> @@ -421,6 +427,37 @@ struct mlx5_flow_id_pool *  }  #endif /*
> HAVE_IBV_FLOW_DV_SUPPORT */
> 
> +static int
> +mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
> +{
> +	char *env;
> +	int value;
> +
> +	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +	/* Get environment variable to store. */
> +	env = getenv(MLX5_SHUT_UP_BF);
> +	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
> +	if (config->dbnc == MLX5_ARG_UNSET)
> +		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT,
> 1);
> +	else
> +		setenv(MLX5_SHUT_UP_BF, config->dbnc ? "1" : "0", 1);
> +	return value;
> +}
> +
> +static void
> +mlx5_restore_doorbell_mapping_env(const struct mlx5_dev_config
> *config,
> +				  int value)
> +{
> +	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +	if (config->dbnc == MLX5_ARG_UNSET)
> +		return;
> +	/* Restore the original environment variable state. */
> +	if (value == MLX5_ARG_UNSET)
> +		unsetenv(MLX5_SHUT_UP_BF);
> +	else
> +		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); }
> +
>  /**
>   * Allocate shared IB device context. If there is multiport device the
>   * master and representors will share this context, if there is single @@ -
> 434,22 +471,26 @@ struct mlx5_flow_id_pool *
>   *
>   * @param[in] spawn
>   *   Pointer to the IB device attributes (name, port, etc).
> + * @param[in] config
> + *   Pointer to device configuration structure.
>   *
>   * @return
>   *   Pointer to mlx5_ibv_shared object on success,
>   *   otherwise NULL and rte_errno is set.
>   */
>  static struct mlx5_ibv_shared *
> -mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
> +mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
> +			const struct mlx5_dev_config *config)
>  {
>  	struct mlx5_ibv_shared *sh;
> +	int dbmap_env;
>  	int err = 0;
>  	uint32_t i;
>  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
>  	struct mlx5_devx_tis_attr tis_attr = { 0 };  #endif
> 
> -assert(spawn);
> +	assert(spawn);
>  	/* Secondary process should not create the shared context. */
>  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
>  	pthread_mutex_lock(&mlx5_ibv_list_mutex);
> @@ -472,16 +513,31 @@ struct mlx5_flow_id_pool *
>  		rte_errno  = ENOMEM;
>  		goto exit;
>  	}
> +	/*
> +	 * Configure environment variable "MLX5_SHUT_UP_BF"
> +	 * before the device creation. The rdma_core library
> +	 * checks the variable at device creation and
> +	 * stores the result internally.
> +	 */
> +	dbmap_env = mlx5_config_doorbell_mapping_env(config);
>  	/* Try to open IB device with DV first, then usual Verbs. */
>  	errno = 0;
>  	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
>  	if (sh->ctx) {
>  		sh->devx = 1;
>  		DRV_LOG(DEBUG, "DevX is supported");
> +		/* The device is created, no need for environment. */
> +		mlx5_restore_doorbell_mapping_env(config, dbmap_env);
>  	} else {
> +		/* The environment variable is still configured. */
>  		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
> +		err = errno ? errno : ENODEV;
> +		/*
> +		 * The environment variable is not needed anymore,
> +		 * all device creation attempts are completed.
> +		 */
> +		mlx5_restore_doorbell_mapping_env(config, dbmap_env);
>  		if (!sh->ctx) {
> -			err = errno ? errno : ENODEV;
>  			goto error;
>  		}
>  		DRV_LOG(DEBUG, "DevX is NOT supported"); @@ -1300,6
> +1356,8 @@ struct mlx5_flow_id_pool *
>  		DRV_LOG(WARNING, "%s: deprecated parameter, ignored",
> key);
>  	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
>  		config->mps = !!tmp;
> +	} else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
> +		config->dbnc = !!tmp;
>  	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
>  		DRV_LOG(WARNING, "%s: deprecated parameter, ignored",
> key);
>  	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { @@ -
> 1373,6 +1431,7 @@ struct mlx5_flow_id_pool *
>  		MLX5_TXQ_MPW_EN,
>  		MLX5_TXQ_MPW_HDR_DSEG_EN,
>  		MLX5_TXQ_MAX_INLINE_LEN,
> +		MLX5_TX_DB_NC,
>  		MLX5_TX_VEC_EN,
>  		MLX5_RX_VEC_EN,
>  		MLX5_L3_VXLAN_EN,
> @@ -1938,7 +1997,20 @@ struct mlx5_flow_id_pool *
>  		eth_dev->tx_pkt_burst =
> mlx5_select_tx_function(eth_dev);
>  		return eth_dev;
>  	}
> -	sh = mlx5_alloc_shared_ibctx(spawn);
> +	/*
> +	 * Some parameters ("tx_db_nc" in particular) are needed in
> +	 * advance to create the dv/verbs device context. We process the
> +	 * devargs here to get them, and later process the devargs again
> +	 * to override some hardware settings.
> +	 */
> +	err = mlx5_args(&config, dpdk_dev->devargs);
> +	if (err) {
> +		err = rte_errno;
> +		DRV_LOG(ERR, "failed to process device arguments: %s",
> +			strerror(rte_errno));
> +		goto error;
> +	}
> +	sh = mlx5_alloc_shared_ibctx(spawn, &config);
>  	if (!sh)
>  		return NULL;
>  	config.devx = sh->devx;
> @@ -2180,13 +2252,8 @@ struct mlx5_flow_id_pool *
>  		}
>  		own_domain_id = 1;
>  	}
> -	err = mlx5_args(&config, dpdk_dev->devargs);
> -	if (err) {
> -		err = rte_errno;
> -		DRV_LOG(ERR, "failed to process device arguments: %s",
> -			strerror(rte_errno));
> -		goto error;
> -	}
> +	/* Override some values set by hardware configuration. */
> +	mlx5_args(&config, dpdk_dev->devargs);
>  	err = mlx5_dev_check_sibling_config(priv, &config);
>  	if (err)
>  		goto error;
> @@ -3031,6 +3098,7 @@ struct mlx5_flow_id_pool *
>  	dev_config = (struct mlx5_dev_config){
>  		.hw_padding = 0,
>  		.mps = MLX5_ARG_UNSET,
> +		.dbnc = MLX5_ARG_UNSET,
>  		.rx_vec_en = 1,
>  		.txq_inline_max = MLX5_ARG_UNSET,
>  		.txq_inline_min = MLX5_ARG_UNSET,
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> e8148ce..1a92c10 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -267,6 +267,7 @@ struct mlx5_dev_config {
>  		/* Rx queue count threshold to enable MPRQ. */
>  	} mprq; /* Configurations for Multi-Packet RQ. */
>  	int mps; /* Multi-packet send supported mode. */
> +	int dbnc; /* Skip doorbell register write barrier. */
>  	unsigned int flow_prio; /* Number of flow priorities. */
>  	enum modify_reg flow_mreg_c[MLX5_MREG_C_NUM];
>  	/* Availibility of mreg_c's. */
> diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
> index 0ef532f..03060aa 100644
> --- a/drivers/net/mlx5/mlx5_defs.h
> +++ b/drivers/net/mlx5/mlx5_defs.h
> @@ -123,6 +123,22 @@
>  #define MLX5_UAR_PAGE_NUM_MAX 64
>  #define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) -
> 1)
> 
> +/* Fields of memory mapping type in offset parameter of mmap() */
> +#define MLX5_UAR_MMAP_CMD_SHIFT 8 #define
> MLX5_UAR_MMAP_CMD_MASK 0xff
> +
> +/* Environment variable to control the doorbell register mapping. */
> +#define MLX5_SHUT_UP_BF "MLX5_SHUT_UP_BF"
> +#if defined(RTE_ARCH_ARM64)
> +#define MLX5_SHUT_UP_BF_DEFAULT "0"
> +#else
> +#define MLX5_SHUT_UP_BF_DEFAULT "1"
> +#endif
> +
> +#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD #define
> +MLX5_MMAP_GET_NC_PAGES_CMD 3 #endif
> +
>  /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */
> #define MLX5_MPRQ_STRIDE_NUM_N 6U
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 8bc0542..1ea5960 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -4754,8 +4754,23 @@ enum mlx5_txcmp_code {
>  	 * to improve latencies. The pure software related data treatment
>  	 * can be completed after doorbell. Tx CQEs for this SQ are
>  	 * processed in this thread only by the polling.
> +	 *
> +	 * The rdma core library can map doorbell register in two ways,
> +	 * depending on the environment variable "MLX5_SHUT_UP_BF":
> +	 *
> +	 * - as regular cached memory, the variable is either missing or
> +	 *   set to zero. This type of mapping may cause the significant
> +	 *   doorbell register writing latency and requires explicit
> +	 *   memory write barrier to mitigate this issue and prevent
> +	 *   write combining.
> +	 *
> +	 * - as non-cached memory, the variable is present and set to
> +	 *   not "0" value. This type of mapping may cause performance
> +	 *   impact under heavy loading conditions but the explicit write
> +	 *   memory barrier is not required and it may improve core
> +	 *   performance.
>  	 */
> -	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
> +	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc);
>  	/* Not all of the mbufs may be stored into elts yet. */
>  	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent -
> loc.pkts_copy;
>  	if (!MLX5_TXOFF_CONFIG(INLINE) && part) { diff --git
> a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index
> 559d225..88c50aa 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -287,6 +287,7 @@ struct mlx5_txq_data {
>  	/* When set TX offload for tunneled packets are supported. */
>  	uint16_t swp_en:1; /* Whether SW parser is enabled. */
>  	uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
> +	uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
>  	uint16_t inlen_send; /* Ordinary send data inline size. */
>  	uint16_t inlen_empw; /* eMPW max packet size to inline. */
>  	uint16_t inlen_mode; /* Minimal data length to inline. */ diff --git
> a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 97991f0..a0d6164 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -18,6 +18,7 @@
>  #pragma GCC diagnostic ignored "-Wpedantic"
>  #endif
>  #include <infiniband/verbs.h>
> +#include <infiniband/mlx5dv.h>
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> @@ -302,6 +303,28 @@
>  }
> 
>  /**
> + * Configure the doorbell register non-cached attribute.
> + *
> + * @param txq_ctrl
> + *   Pointer to Tx queue control structure.
> + * @param page_size
> + *   System page size
> + */
> +static void
> +txq_uar_ncattr_init(struct mlx5_txq_ctrl *txq_ctrl, size_t page_size) {
> +	unsigned int cmd;
> +
> +	txq_ctrl->txq.db_nc = 0;
> +	/* Check the doorbell register mapping type. */
> +	cmd = txq_ctrl->uar_mmap_offset / page_size;
> +	cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
> +	cmd &= MLX5_UAR_MMAP_CMD_MASK;
> +	if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
> +		txq_ctrl->txq.db_nc = 1;
> +}
> +
> +/**
>   * Initialize Tx UAR registers for primary process.
>   *
>   * @param txq_ctrl
> @@ -312,9 +335,9 @@
>  {
>  	struct mlx5_priv *priv = txq_ctrl->priv;
>  	struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
> +	const size_t page_size = sysconf(_SC_PAGESIZE);
>  #ifndef RTE_ARCH_64
>  	unsigned int lock_idx;
> -	const size_t page_size = sysconf(_SC_PAGESIZE);
>  #endif
> 
>  	if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) @@ -322,6 +345,7
> @@
>  	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
>  	assert(ppriv);
>  	ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg;
> +	txq_uar_ncattr_init(txq_ctrl, page_size);
>  #ifndef RTE_ARCH_64
>  	/* Assign an UAR lock according to UAR page number */
>  	lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & @@ -375,6
> +399,7 @@
>  	}
>  	addr = RTE_PTR_ADD(addr, offset);
>  	ppriv->uar_table[txq->idx] = addr;
> +	txq_uar_ncattr_init(txq_ctrl, page_size);
>  	return 0;
>  }
> 
> --
> 1.8.3.1


Patch applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh



More information about the dev mailing list