[dpdk-dev] [PATCH 3/3] net/mlx5: add Multi-Packet Rx support

Shahaf Shuler shahafs at mellanox.com
Mon May 7 12:49:10 CEST 2018


Hi Koh,

Nice work. Please see some comments. 

Thursday, May 3, 2018 2:21 AM, Yongseok Koh:
> Subject: [dpdk-dev] [PATCH 3/3] net/mlx5: add Multi-Packet Rx support
> 
> Multi-Packet Rx Queue (MPRQ a.k.a Striding RQ) can further save PCIe
> bandwidth by posting a single large buffer for multiple packets. Instead of
> posting a buffer per a packet, one large buffer is posted in order to receive
> multiple packets on the buffer. A MPRQ buffer consists of multiple fixed-size
> strides and each stride receives one packet.
> 
> Rx packet is mem-copied to a user-provided mbuf if the size of Rx packet is
> comparatively small, or PMD attaches the Rx packet to the mbuf by external
> buffer attachment - rte_pktmbuf_attach_extbuf(). A mempool for external
> buffers will be allocated and managed by PMD.
> 
> Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
> ---
>  doc/guides/nics/mlx5.rst         |  25 ++
>  drivers/net/mlx5/mlx5.c          |  66 ++++++
>  drivers/net/mlx5/mlx5.h          |   4 +
>  drivers/net/mlx5/mlx5_defs.h     |  23 ++
>  drivers/net/mlx5/mlx5_ethdev.c   |   3 +
>  drivers/net/mlx5/mlx5_prm.h      |  15 ++
>  drivers/net/mlx5/mlx5_rxq.c      | 494
> ++++++++++++++++++++++++++++++++++++---
>  drivers/net/mlx5/mlx5_rxtx.c     | 207 +++++++++++++++-
>  drivers/net/mlx5/mlx5_rxtx.h     |  37 ++-
>  drivers/net/mlx5/mlx5_rxtx_vec.c |   4 +
>  drivers/net/mlx5/mlx5_rxtx_vec.h |   3 +-
>  drivers/net/mlx5/mlx5_trigger.c  |   6 +-
>  12 files changed, 844 insertions(+), 43 deletions(-)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index
> 0fe6e1835..37c463362 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -229,6 +229,31 @@ Run-time configuration
>    - x86_64 with ConnectX-4, ConnectX-4 LX and ConnectX-5.
>    - POWER8 and ARMv8 with ConnectX-4 LX and ConnectX-5.
> 
> +- ``mprq_en`` parameter [int]
> +
> +  A nonzero value enables configuring Multi-Packet Rx queues. Rx queue
> + is  configured as Multi-Packet RQ if the total number of Rx queues is
> + ``rxqs_min_mprq`` or more and Rx scatter isn't configured. Disabled by
> + default.
> +
> +  Multi-Packet Rx Queue (MPRQ a.k.a Striding RQ) can further save PCIe
> + bandwidth  by posting a single large buffer for multiple packets.
> + Instead of posting a  buffers per a packet, one large buffer is posted
> + in order to receive multiple  packets on the buffer. A MPRQ buffer
> + consists of multiple fixed-size strides  and each stride receives one packet.
> +
> +- ``mprq_max_memcpy_len`` parameter [int]
> +  The maximum size of packet for memcpy in case of Multi-Packet Rx
> +queue. Rx
> +  packet is mem-copied to a user-provided mbuf if the size of Rx packet
> +is less
> +  than or equal to this parameter. Otherwise, PMD will attach the Rx
> +packet to
> +  the mbuf by external buffer attachment -
> ``rte_pktmbuf_attach_extbuf()``.
> +  A mempool for external buffers will be allocated and managed by PMD.
> +The
> +  default value is 128.

Need to add that this is valid only when mprq_en is set.

> +
> +- ``rxqs_min_mprq`` parameter [int]
> +  Configure Rx queues as Multi-Packet RQ if the total number of Rx
> +queues is greater or
> +  equal to this value. The default value is 12.

Need to add that this is valid only when mprq_en is set.
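
For reference, a quick way to exercise all three parameters together is via devargs, e.g. with testpmd (the PCI address below is only a placeholder):

    testpmd -w 0000:03:00.0,mprq_en=1,mprq_max_memcpy_len=128,rxqs_min_mprq=12 -- --rxq=16 --txq=16

Might be worth adding a similar example to the doc.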

> +
>  - ``txq_inline`` parameter [int]
> 

The documentation is missing the application restrictions that apply when using MPRQ, e.g. how ol_flags must be handled.

>    Amount of data to be inlined during TX operations. Improves latency.
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 2883f20af..80f6838f9 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -46,6 +46,18 @@
>  /* Device parameter to enable RX completion queue compression. */
> #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
> 
> +/* Device parameter to enable Multi-Packet Rx queue. */
> +#define MLX5_RX_MPRQ_EN "mprq_en"
> +
> +/* Device parameter to limit the size of memcpy'd packet. */
> +#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
> +
> +/*
> + * Device parameter to set the minimum number of Rx queues to configure
> + * Multi-Packet Rx queue.
> + */
> +#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
> +
>  /* Device parameter to configure inline send. */  #define MLX5_TXQ_INLINE
> "txq_inline"
> 
> @@ -241,6 +253,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  		priv->txqs = NULL;
>  	}
>  	mlx5_flow_delete_drop_queue(dev);
> +	mlx5_mprq_free_mp(dev);

The mempool is allocated on port start, but freed on port close? That seems inconsistent.
Why not free it on port stop?

>  	mlx5_mr_release(dev);
>  	if (priv->pd != NULL) {
>  		assert(priv->ctx != NULL);
> @@ -444,6 +457,12 @@ mlx5_args_check(const char *key, const char *val,
> void *opaque)
>  	}
>  	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
>  		config->cqe_comp = !!tmp;
> +	} else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
> +		config->mprq = !!tmp;
> +	} else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
> +		config->mprq_max_memcpy_len = tmp;
> +	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
> +		config->rxqs_mprq = tmp;
>  	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
>  		config->txq_inline = tmp;
>  	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { @@ -486,6
> +505,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs
> *devargs)  {
>  	const char **params = (const char *[]){
>  		MLX5_RXQ_CQE_COMP_EN,
> +		MLX5_RX_MPRQ_EN,
> +		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
> +		MLX5_RXQS_MIN_MPRQ,
>  		MLX5_TXQ_INLINE,
>  		MLX5_TXQS_MIN_INLINE,
>  		MLX5_TXQ_MPW_EN,
> @@ -667,6 +689,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  	unsigned int tunnel_en = 0;
>  	unsigned int swp = 0;
>  	unsigned int verb_priorities = 0;
> +	unsigned int mprq = 0;
>  	int idx;
>  	int i;
>  	struct mlx5dv_context attrs_out = {0}; @@ -754,6 +777,9 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,  #ifdef
> HAVE_IBV_DEVICE_TUNNEL_SUPPORT
>  	attrs_out.comp_mask |=
> MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
>  #endif
> +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> +	attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
> #endif
>  	mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
>  	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
>  		if (attrs_out.flags &
> MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { @@ -772,6 +798,37 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
>  	DRV_LOG(DEBUG, "SWP support: %u", swp);  #endif
> +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> +	if (attrs_out.comp_mask &
> MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
> +		struct mlx5dv_striding_rq_caps mprq_caps =
> +			attrs_out.striding_rq_caps;
> +
> +		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes:
> %d",
> +			mprq_caps.min_single_stride_log_num_of_bytes);
> +		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes:
> %d",
> +			mprq_caps.max_single_stride_log_num_of_bytes);
> +		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides:
> %d",
> +			mprq_caps.min_single_wqe_log_num_of_strides);
> +		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides:
> %d",
> +			mprq_caps.max_single_wqe_log_num_of_strides);
> +		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
> +			mprq_caps.supported_qpts);
> +		if (mprq_caps.min_single_stride_log_num_of_bytes <=
> +		    MLX5_MPRQ_MIN_STRIDE_SZ_N &&
> +		    mprq_caps.max_single_stride_log_num_of_bytes >=
> +		    MLX5_MPRQ_STRIDE_SZ_N &&
> +		    mprq_caps.min_single_wqe_log_num_of_strides <=
> +		    MLX5_MPRQ_MIN_STRIDE_NUM_N &&
> +		    mprq_caps.max_single_wqe_log_num_of_strides >=
> +		    MLX5_MPRQ_STRIDE_NUM_N) {
> +			DRV_LOG(DEBUG, "Multi-Packet RQ is supported");
> +			mprq = 1;
> +		} else {
> +			DRV_LOG(DEBUG, "Multi-Packet RQ isn't
> supported");
> +			mprq = 0;
> +		}
> +	}
> +#endif
>  	if (RTE_CACHE_LINE_SIZE == 128 &&
>  	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
>  		cqe_comp = 0;
> @@ -820,6 +877,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>  			.inline_max_packet_sz = MLX5_ARG_UNSET,
>  			.vf_nl_en = 1,
>  			.swp = !!swp,
> +			.mprq = 0, /* Disabled by default. */
> +			.mprq_max_memcpy_len =
> MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
> +			.rxqs_mprq = MLX5_MPRQ_MIN_RXQS,
>  		};
> 
>  		len = snprintf(name, sizeof(name), PCI_PRI_FMT, @@ -978,6
> +1038,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			DRV_LOG(WARNING, "Rx CQE compression isn't
> supported");
>  			config.cqe_comp = 0;
>  		}
> +		if (config.mprq && !mprq) {
> +			DRV_LOG(WARNING, "Multi-Packet RQ isn't
> supported");
> +			config.mprq = 0;
> +		}
> +		DRV_LOG(INFO, "Multi-Packet RQ is %s",
> +			config.mprq ? "enabled" : "disabled");
>  		eth_dev = rte_eth_dev_allocate(name);
>  		if (eth_dev == NULL) {
>  			DRV_LOG(ERR, "can not allocate rte ethdev"); diff --
> git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> d3fc74dc1..7883dfc8f 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -102,6 +102,9 @@ struct mlx5_dev_config {
>  	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
>  	unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */
>  	unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload.
> */
> +	unsigned int mprq:1; /* Whether Multi-Packet RQ is supported. */
> +	unsigned int mprq_max_memcpy_len; /* Maximum packet size to
> memcpy. */

For MPRQ, that is; the comment should mention Multi-Packet RQ.

> +	unsigned int rxqs_mprq; /* Queue count threshold for Multi-Packet
> RQ.
> +*/
>  	unsigned int max_verbs_prio; /* Number of Verb flow priorities. */
>  	unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO.
> */
>  	unsigned int ind_table_max_size; /* Maximum indirection table size.
> */ @@ -154,6 +157,7 @@ struct priv {
>  	unsigned int txqs_n; /* TX queues array size. */
>  	struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */
>  	struct mlx5_txq_data *(*txqs)[]; /* TX queues. */
> +	struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ.
> */
>  	struct rte_eth_rss_conf rss_conf; /* RSS configuration. */
>  	struct rte_intr_handle intr_handle; /* Interrupt handler. */
>  	unsigned int (*reta_idx)[]; /* RETA index table. */ diff --git
> a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index
> 72e80af26..a8b7ff42b 100644
> --- a/drivers/net/mlx5/mlx5_defs.h
> +++ b/drivers/net/mlx5/mlx5_defs.h
> @@ -95,4 +95,27 @@
>   */
>  #define MLX5_UAR_OFFSET (1ULL << 32)
> 
> +/* Log 2 of the size of a stride for Multi-Packet RQ. */
> +#define MLX5_MPRQ_STRIDE_SZ_N 11

Why is this a fixed value?
In your design a single stride is used per packet. If so, why not set this value according to the max_rx_pkt_len configured by the application?
That would let the memory footprint adjust to the application's needs.
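
A rough sketch of the idea, reusing log2above() and the cap names already quoted in this patch (illustration only, not tested):

	unsigned int strd_sz_n =
		log2above(dev->data->dev_conf.rxmode.max_rx_pkt_len +
			  RTE_PKTMBUF_HEADROOM);

	strd_sz_n = RTE_MAX(strd_sz_n, MLX5_MPRQ_MIN_STRIDE_SZ_N);
	strd_sz_n = RTE_MIN(strd_sz_n,
			    mprq_caps.max_single_stride_log_num_of_bytes);
	/* Program single_stride_log_num_of_bytes from strd_sz_n instead of
	 * the fixed MLX5_MPRQ_STRIDE_SZ_N. */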


> +#define MLX5_MPRQ_MIN_STRIDE_SZ_N 6
> +
> +/* Log 2 of the number of strides per WQE for Multi-Packet RQ. */
> +#define MLX5_MPRQ_STRIDE_NUM_N 4
> +#define MLX5_MPRQ_MIN_STRIDE_NUM_N 3
> +
> +/* Two-byte shift is disabled for Multi-Packet RQ. */
> +#define MLX5_MPRQ_TWO_BYTE_SHIFT 0
> +
> +/*
> + * Minimum size of packet to be memcpy'd instead of being attached as an
> + * external buffer.
> + */
> +#define MLX5_MPRQ_MEMCPY_DEFAULT_LEN 128
> +
> +/* Minimum number Rx queues to enable Multi-Packet RQ. */
> +#define MLX5_MPRQ_MIN_RXQS 12

This is not for this patch, but we will probably want different values for different architectures (POWER/ARM/x86).

> +
> +/* Cache size of mempool for Multi-Packet RQ. */
> +#define MLX5_MPRQ_MP_CACHE_SZ 32

Is this value independent of the Rx queue depth? Of the number of queues?
I would expect the cache size to depend on the total number of mbufs being worked on in parallel.
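
To make that concrete: the mempool cache is per lcore, so with e.g. 8 polling lcores up to about 8 * MLX5_MPRQ_MP_CACHE_SZ = 256 buffers can sit in caches at any time, independent of queue depth, while the sizing below only adds MLX5_MPRQ_MP_CACHE_SZ * rxqs_n objects on top of desc * 4.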

> +
>  #endif /* RTE_PMD_MLX5_DEFS_H_ */
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> b/drivers/net/mlx5/mlx5_ethdev.c index 6bb43cf4e..c081217ef 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -507,6 +507,7 @@ mlx5_dev_supported_ptypes_get(struct
> rte_eth_dev *dev)
>  	};
> 
>  	if (dev->rx_pkt_burst == mlx5_rx_burst ||
> +	    dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
>  	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
>  		return ptypes;
>  	return NULL;
> @@ -1162,6 +1163,8 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)
>  		rx_pkt_burst = mlx5_rx_burst_vec;
>  		DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
>  			dev->data->port_id);
> +	} else if (mlx5_mprq_enabled(dev)) {
> +		rx_pkt_burst = mlx5_rx_burst_mprq;
>  	}
>  	return rx_pkt_burst;
>  }
> diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> index 05a682898..d79ff2225 100644
> --- a/drivers/net/mlx5/mlx5_prm.h
> +++ b/drivers/net/mlx5/mlx5_prm.h
> @@ -219,6 +219,21 @@ struct mlx5_mpw {
>  	} data;
>  };
> 
> +/* WQE for Multi-Packet RQ. */
> +struct mlx5_wqe_mprq {
> +	struct mlx5_wqe_srq_next_seg next_seg;
> +	struct mlx5_wqe_data_seg dseg;
> +};
> +
> +#define MLX5_MPRQ_LEN_MASK 0x000ffff
> +#define MLX5_MPRQ_LEN_SHIFT 0
> +#define MLX5_MPRQ_STRIDE_NUM_MASK 0x7fff0000

The PRM says:
byte_cnt[29:16]: specifies the number of consumed strides.

Shouldn't it therefore be
#define MLX5_MPRQ_STRIDE_NUM_MASK 0x3fff0000
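Bits 29:16 are 14 bits wide, i.e. ((1u << 14) - 1) << 16 == 0x3fff0000; the 0x7fff0000 in the patch also covers bit 30.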


> +#define MLX5_MPRQ_STRIDE_NUM_SHIFT 16
> +#define MLX5_MPRQ_FILLER_MASK 0x80000000
> +#define MLX5_MPRQ_FILLER_SHIFT 31
> +
> +#define MLX5_MPRQ_STRIDE_SHIFT_BYTE 2
> +
>  /* CQ element structure - should be equal to the cache line size */  struct
> mlx5_cqe {  #if (RTE_CACHE_LINE_SIZE == 128) diff --git
> a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index
> 22e2f9673..3d8ade10b 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -55,7 +55,73 @@ uint8_t rss_hash_default_key[] = {  const size_t
> rss_hash_default_key_len = sizeof(rss_hash_default_key);
> 
>  /**
> - * Allocate RX queue elements.
> + * Check whether Multi-Packet RQ can be enabled for the device.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + *
> + * @return
> + *   1 if supported, negative errno value if not.
> + */
> +inline int
> +mlx5_check_mprq_support(struct rte_eth_dev *dev) {
> +	struct priv *priv = dev->data->dev_private;
> +
> +	if (priv->config.mprq && priv->rxqs_n >= priv->config.rxqs_mprq)
> +		return 1;
> +	return -ENOTSUP;
> +}
> +
> +/**
> + * Check whether Multi-Packet RQ is enabled for the Rx queue.
> + *
> + *  @param rxq
> + *     Pointer to receive queue structure.
> + *
> + * @return
> + *   0 if disabled, otherwise enabled.
> + */
> +inline int
> +mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq) {
> +	return rxq->strd_num_n > 0;
> +}
> +
> +/**
> + * Check whether Multi-Packet RQ is enabled for the device.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + *
> + * @return
> + *   0 if disabled, otherwise enabled.
> + */
> +inline int
> +mlx5_mprq_enabled(struct rte_eth_dev *dev) {
> +	struct priv *priv = dev->data->dev_private;
> +	uint16_t i;
> +	uint16_t n = 0;
> +
> +	if (mlx5_check_mprq_support(dev) < 0)
> +		return 0;
> +	/* All the configured queues should be enabled. */
> +	for (i = 0; i < priv->rxqs_n; ++i) {
> +		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
> +
> +		if (!rxq)
> +			continue;
> +		if (mlx5_rxq_mprq_enabled(rxq))
> +			++n;
> +	}
> +	/* Multi-Packet RQ can't be partially configured. */
> +	assert(n == 0 || n == priv->rxqs_n);
> +	return n == priv->rxqs_n;
> +}
> +
> +/**
> + * Allocate RX queue elements for Multi-Packet RQ.
>   *
>   * @param rxq_ctrl
>   *   Pointer to RX queue structure.
> @@ -63,8 +129,58 @@ const size_t rss_hash_default_key_len =
> sizeof(rss_hash_default_key);
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
> -int
> -rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
> +static int
> +rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) {
> +	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
> +	unsigned int wqe_n = 1 << rxq->elts_n;
> +	unsigned int i;
> +	int err;
> +
> +	/* Iterate on segments. */
> +	for (i = 0; i <= wqe_n; ++i) {
> +		struct mlx5_mprq_buf *buf;
> +
> +		if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) {
> +			DRV_LOG(ERR, "port %u empty mbuf pool", rxq-
> >port_id);
> +			rte_errno = ENOMEM;
> +			goto error;
> +		}
> +		if (i < wqe_n)
> +			(*rxq->mprq_bufs)[i] = buf;
> +		else
> +			rxq->mprq_repl = buf;
> +	}
> +	DRV_LOG(DEBUG,
> +		"port %u Rx queue %u allocated and configured %u
> segments",
> +		rxq->port_id, rxq_ctrl->idx, wqe_n);
> +	return 0;
> +error:
> +	err = rte_errno; /* Save rte_errno before cleanup. */
> +	wqe_n = i;
> +	for (i = 0; (i != wqe_n); ++i) {
> +		if ((*rxq->elts)[i] != NULL)
> +			rte_mempool_put(rxq->mprq_mp,
> +					(*rxq->mprq_bufs)[i]);
> +		(*rxq->mprq_bufs)[i] = NULL;
> +	}
> +	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
> +		rxq->port_id, rxq_ctrl->idx);
> +	rte_errno = err; /* Restore rte_errno. */
> +	return -rte_errno;
> +}
> +
> +/**
> + * Allocate RX queue elements for Single-Packet RQ.
> + *
> + * @param rxq_ctrl
> + *   Pointer to RX queue structure.
> + *
> + * @return
> + *   0 on success, errno value on failure.
> + */
> +static int
> +rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
>  {
>  	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
>  	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; @@ -140,13 +256,57
> @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)  }
> 
>  /**
> - * Free RX queue elements.
> + * Allocate RX queue elements.
> + *
> + * @param rxq_ctrl
> + *   Pointer to RX queue structure.
> + *
> + * @return
> + *   0 on success, errno value on failure.
> + */
> +int
> +rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) {
> +	return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
> +	       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
> +}
> +
> +/**
> + * Free RX queue elements for Multi-Packet RQ.
>   *
>   * @param rxq_ctrl
>   *   Pointer to RX queue structure.
>   */
>  static void
> -rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
> +rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) {
> +	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
> +	uint16_t i;
> +
> +	DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs",
> +		rxq->port_id, rxq_ctrl->idx);
> +	if (rxq->mprq_bufs == NULL)
> +		return;
> +	assert(mlx5_rxq_check_vec_support(rxq) < 0);
> +	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
> +		if ((*rxq->mprq_bufs)[i] != NULL)
> +			mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
> +		(*rxq->mprq_bufs)[i] = NULL;
> +	}
> +	if (rxq->mprq_repl != NULL) {
> +		mlx5_mprq_buf_free(rxq->mprq_repl);
> +		rxq->mprq_repl = NULL;
> +	}
> +}
> +
> +/**
> + * Free RX queue elements for Single-Packet RQ.
> + *
> + * @param rxq_ctrl
> + *   Pointer to RX queue structure.
> + */
> +static void
> +rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
>  {
>  	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
>  	const uint16_t q_n = (1 << rxq->elts_n); @@ -175,6 +335,21 @@
> rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)  }
> 
>  /**
> + * Free RX queue elements.
> + *
> + * @param rxq_ctrl
> + *   Pointer to RX queue structure.
> + */
> +static void
> +rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) {
> +	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
> +		rxq_free_elts_mprq(rxq_ctrl);
> +	else
> +		rxq_free_elts_sprq(rxq_ctrl);
> +}
> +
> +/**
>   * Clean up a RX queue.
>   *
>   * Destroy objects, free allocated memory and reset the structure for reuse.
> @@ -623,10 +798,16 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  			struct ibv_cq_init_attr_ex ibv;
>  			struct mlx5dv_cq_init_attr mlx5;
>  		} cq;
> -		struct ibv_wq_init_attr wq;
> +		struct {
> +			struct ibv_wq_init_attr ibv;
> +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> +			struct mlx5dv_wq_init_attr mlx5;
> +#endif
> +		} wq;
>  		struct ibv_cq_ex cq_attr;
>  	} attr;
> -	unsigned int cqe_n = (1 << rxq_data->elts_n) - 1;
> +	unsigned int cqe_n;
> +	unsigned int wqe_n = 1 << rxq_data->elts_n;
>  	struct mlx5_rxq_ibv *tmpl;
>  	struct mlx5dv_cq cq_info;
>  	struct mlx5dv_rwq rwq;
> @@ -634,6 +815,7 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  	int ret = 0;
>  	struct mlx5dv_obj obj;
>  	struct mlx5_dev_config *config = &priv->config;
> +	const int mprq_en = mlx5_rxq_mprq_enabled(rxq_data);
> 
>  	assert(rxq_data);
>  	assert(!rxq_ctrl->ibv);
> @@ -658,6 +840,10 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  			goto error;
>  		}
>  	}
> +	if (mprq_en)
> +		cqe_n = wqe_n * (1 << MLX5_MPRQ_STRIDE_NUM_N) - 1;
> +	else
> +		cqe_n = wqe_n  - 1;
>  	attr.cq.ibv = (struct ibv_cq_init_attr_ex){
>  		.cqe = cqe_n,
>  		.channel = tmpl->channel,
> @@ -695,11 +881,11 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  		dev->data->port_id, priv-
> >device_attr.orig_attr.max_qp_wr);
>  	DRV_LOG(DEBUG, "port %u priv->device_attr.max_sge is %d",
>  		dev->data->port_id, priv->device_attr.orig_attr.max_sge);
> -	attr.wq = (struct ibv_wq_init_attr){
> +	attr.wq.ibv = (struct ibv_wq_init_attr){
>  		.wq_context = NULL, /* Could be useful in the future. */
>  		.wq_type = IBV_WQT_RQ,
>  		/* Max number of outstanding WRs. */
> -		.max_wr = (1 << rxq_data->elts_n) >> rxq_data->sges_n,
> +		.max_wr = wqe_n >> rxq_data->sges_n,
>  		/* Max number of scatter/gather elements in a WR. */
>  		.max_sge = 1 << rxq_data->sges_n,
>  		.pd = priv->pd,
> @@ -713,8 +899,8 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  	};
>  	/* By default, FCS (CRC) is stripped by hardware. */
>  	if (rxq_data->crc_present) {
> -		attr.wq.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
> -		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
> +		attr.wq.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
> +		attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
>  	}
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
>  	if (config->hw_padding) {
> @@ -722,7 +908,26 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;

Should it be attr.wq.ibv.comp_mask? 

>  	}
>  #endif
> -	tmpl->wq = mlx5_glue->create_wq(priv->ctx, &attr.wq);
> +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
> +	attr.wq.mlx5 = (struct mlx5dv_wq_init_attr){
> +		.comp_mask = 0,
> +	};
> +	if (mprq_en) {
> +		struct mlx5dv_striding_rq_init_attr *mprq_attr =
> +			&attr.wq.mlx5.striding_rq_attrs;
> +
> +		attr.wq.mlx5.comp_mask |=
> MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
> +		*mprq_attr = (struct mlx5dv_striding_rq_init_attr){
> +			.single_stride_log_num_of_bytes =
> MLX5_MPRQ_STRIDE_SZ_N,
> +			.single_wqe_log_num_of_strides =
> MLX5_MPRQ_STRIDE_NUM_N,
> +			.two_byte_shift_en =
> MLX5_MPRQ_TWO_BYTE_SHIFT,
> +		};
> +	}
> +	tmpl->wq = mlx5_glue->dv_create_wq(priv->ctx, &attr.wq.ibv,
> +					   &attr.wq.mlx5);

Will creating the WQ through DV behave the same as creating it through ibv?
With ibv_create_wq there is handling of flags such as IBV_WQ_FLAGS_CVLAN_STRIPPING and possibly the IBV_WQ_INIT_ATTR_FLAGS comp_mask,
while here the mlx5 attribute comp_mask seems to stay 0 when mprq_en is not set.

> +#else
> +	tmpl->wq = mlx5_glue->create_wq(priv->ctx, &attr.wq.ibv); #endif
>  	if (tmpl->wq == NULL) {
>  		DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure",
>  			dev->data->port_id, idx);
> @@ -733,16 +938,14 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  	 * Make sure number of WRs*SGEs match expectations since a
> queue
>  	 * cannot allocate more than "desc" buffers.
>  	 */
> -	if (((int)attr.wq.max_wr !=
> -	     ((1 << rxq_data->elts_n) >> rxq_data->sges_n)) ||
> -	    ((int)attr.wq.max_sge != (1 << rxq_data->sges_n))) {
> +	if (attr.wq.ibv.max_wr != (wqe_n >> rxq_data->sges_n) ||
> +	    attr.wq.ibv.max_sge != (1u << rxq_data->sges_n)) {
>  		DRV_LOG(ERR,
>  			"port %u Rx queue %u requested %u*%u but got
> %u*%u"
>  			" WRs*SGEs",
>  			dev->data->port_id, idx,
> -			((1 << rxq_data->elts_n) >> rxq_data->sges_n),
> -			(1 << rxq_data->sges_n),
> -			attr.wq.max_wr, attr.wq.max_sge);
> +			wqe_n >> rxq_data->sges_n, (1 << rxq_data-
> >sges_n),
> +			attr.wq.ibv.max_wr, attr.wq.ibv.max_sge);
>  		rte_errno = EINVAL;
>  		goto error;
>  	}
> @@ -777,25 +980,40 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  		goto error;
>  	}
>  	/* Fill the rings. */
> -	rxq_data->wqes = (volatile struct mlx5_wqe_data_seg (*)[])
> -		(uintptr_t)rwq.buf;
> -	for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) {
> -		struct rte_mbuf *buf = (*rxq_data->elts)[i];
> -		volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data-
> >wqes)[i];
> -
> +	rxq_data->wqes = rwq.buf;
> +	for (i = 0; (i != wqe_n); ++i) {
> +		volatile struct mlx5_wqe_data_seg *scat;
> +		uintptr_t addr;
> +		uint32_t byte_count;
> +
> +		if (mprq_en) {
> +			struct mlx5_mprq_buf *buf = (*rxq_data-
> >mprq_bufs)[i];
> +
> +			scat = &((volatile struct mlx5_wqe_mprq *)
> +				 rxq_data->wqes)[i].dseg;
> +			addr = (uintptr_t)mlx5_mprq_buf_addr(buf);
> +			byte_count = (1 << MLX5_MPRQ_STRIDE_SZ_N) *
> +				     (1 << MLX5_MPRQ_STRIDE_NUM_N);
> +		} else {
> +			struct rte_mbuf *buf = (*rxq_data->elts)[i];
> +
> +			scat = &((volatile struct mlx5_wqe_data_seg *)
> +				 rxq_data->wqes)[i];
> +			addr = rte_pktmbuf_mtod(buf, uintptr_t);
> +			byte_count = DATA_LEN(buf);
> +		}
>  		/* scat->addr must be able to store a pointer. */
>  		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
>  		*scat = (struct mlx5_wqe_data_seg){
> -			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
> -								  uintptr_t)),
> -			.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
> -			.lkey = mlx5_rx_mb2mr(rxq_data, buf),
> +			.addr = rte_cpu_to_be_64(addr),
> +			.byte_count = rte_cpu_to_be_32(byte_count),
> +			.lkey = mlx5_rx_addr2mr(rxq_data, addr),
>  		};
>  	}
>  	rxq_data->rq_db = rwq.dbrec;
>  	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
>  	rxq_data->cq_ci = 0;
> -	rxq_data->rq_ci = 0;
> +	rxq_data->strd_ci = 0;
>  	rxq_data->rq_pi = 0;
>  	rxq_data->zip = (struct rxq_zip){
>  		.ai = 0,
> @@ -806,7 +1024,7 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev,
> uint16_t idx)
>  	rxq_data->cqn = cq_info.cqn;
>  	rxq_data->cq_arm_sn = 0;
>  	/* Update doorbell counter. */
> -	rxq_data->rq_ci = (1 << rxq_data->elts_n) >> rxq_data->sges_n;
> +	rxq_data->rq_ci = wqe_n >> rxq_data->sges_n;
>  	rte_wmb();
>  	*rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci);
>  	DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data-
> >port_id, @@ -929,12 +1147,192 @@ mlx5_rxq_ibv_releasable(struct
> mlx5_rxq_ibv *rxq_ibv)  }
> 
>  /**
> + * Callback function to initialize mbufs for Multi-Packet RQ.
> + */
> +static inline void
> +mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg
> __rte_unused,
> +		    void *_m, unsigned int i __rte_unused) {
> +	struct mlx5_mprq_buf *buf = _m;
> +
> +	memset(_m, 0, sizeof(*buf));
> +	buf->mp = mp;
> +	rte_atomic16_set(&buf->refcnt, 1);
> +}
> +
> +/**
> + * Free mempool of Multi-Packet RQ.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + *
> + * @return
> + *   0 on success, negative errno value on failure.
> + */
> +int
> +mlx5_mprq_free_mp(struct rte_eth_dev *dev) {
> +	struct priv *priv = dev->data->dev_private;
> +	struct rte_mempool *mp = priv->mprq_mp;
> +	unsigned int i;
> +
> +	if (mp == NULL)
> +		return 0;
> +	DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet
> RQ",
> +		dev->data->port_id, mp->name);
> +	/*
> +	 * If a buffer in the pool has been externally attached to a mbuf and it
> +	 * is still in use by application, destroying the Rx qeueue can spoil
> +	 * the packet. It is unlikely to happen but if application dynamically
> +	 * creates and destroys with holding Rx packets, this can happen.
> +	 *
> +	 * TODO: It is unavoidable for now because the mempool for Multi-
> Packet
> +	 * RQ isn't provided by application but managed by PMD.

This should be documented as part of the application restrictions when using mprq. 

> +	 */
> +	if (!rte_mempool_full(mp)) {
> +		DRV_LOG(ERR,
> +			"port %u mempool for Multi-Packet RQ is still in use",
> +			dev->data->port_id);
> +		rte_errno = EBUSY;
> +		return -rte_errno;
> +	}
> +	rte_mempool_free(mp);
> +	/* Unset mempool for each Rx queue. */
> +	for (i = 0; i != priv->rxqs_n; ++i) {
> +		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
> +
> +		rxq_ctrl->rxq.mprq_mp = NULL;
> +		mlx5_rxq_release(dev, i);

Is the release here just to decrement the refcnt that the get incremented?

> +	}
> +	return 0;
> +}
> +
> +/**
> + * Allocate a mempool for Multi-Packet RQ. All configured Rx queues
> +share the
> + * mempool. If already allocated, reuse it if there're enough elements.
> + * Otherwise, resize it.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + *
> + * @return
> + *   0 on success, negative errno value on failure.
> + */
> +int
> +mlx5_mprq_alloc_mp(struct rte_eth_dev *dev) {
> +	struct priv *priv = dev->data->dev_private;
> +	struct rte_mempool *mp = priv->mprq_mp;
> +	char name[RTE_MEMPOOL_NAMESIZE];
> +	unsigned int desc = 0;
> +	unsigned int buf_len;
> +	unsigned int obj_num;
> +	unsigned int obj_size;
> +	unsigned int i;
> +
> +	if (!mlx5_mprq_enabled(dev))
> +		return 0;
> +	/* Count the total number of descriptors configured. */
> +	for (i = 0; i != priv->rxqs_n; ++i) {
> +		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
> +
> +		desc += 1 << rxq_ctrl->rxq.elts_n;
> +		mlx5_rxq_release(dev, i);
> +	}
> +	buf_len = (1 << MLX5_MPRQ_STRIDE_SZ_N) * (1 <<
> MLX5_MPRQ_STRIDE_NUM_N);
> +	obj_size = buf_len + sizeof(struct mlx5_mprq_buf);
> +	/*
> +	 * Received packets can be either memcpy'd or externally
> referenced. In
> +	 * case that the packet is attached to an mbuf as an external buffer,
> as
> +	 * it isn't possible to predict how the buffers will be queued by
> +	 * application, there's no option to exactly pre-allocate needed
> buffers
> +	 * in advance but to speculatively prepares enough buffers.
> +	 *
> +	 * In the data path, if this Mempool is depleted, PMD will try to
> memcpy
> +	 * received packets to buffers provided by application (rxq->mp) until
> +	 * this Mempool gets available again.
> +	 */
> +	desc *= 4;
> +	obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * priv->rxqs_n;
> +	/* Check a mempool is already allocated and if it can be resued. */
> +	if (mp != NULL && mp->size >= obj_num) {
> +		DRV_LOG(DEBUG, "port %u mempool %s is being reused",
> +			dev->data->port_id, mp->name);
> +		/* Reuse. */
> +		return 0;
> +	} else if (mp != NULL) {
> +		DRV_LOG(DEBUG, "port %u mempool %s should be resized,
> freeing it",
> +			dev->data->port_id, mp->name);
> +		/*
> +		 * If failed to free, which means it may be still in use, no way
> +		 * but to keep using the existing one. On buffer underrun,
> +		 * packets will be memcpy'd instead of external buffer
> +		 * attachment.
> +		 */
> +		if (mlx5_mprq_free_mp(dev))
> +			return -rte_errno;
> +	}
> +	snprintf(name, sizeof(name), "%s-mprq", dev->device->name);
> +	mp = rte_mempool_create(name, obj_num, obj_size,
> MLX5_MPRQ_MP_CACHE_SZ,
> +				0, NULL, NULL, mlx5_mprq_buf_init, NULL,
> +				dev->device->numa_node,
> +				MEMPOOL_F_NO_PHYS_CONTIG);
> +	if (mp == NULL) {
> +		DRV_LOG(ERR,
> +			"port %u failed to allocate a mempool for"
> +			" Multi-Packet RQ, count=%u, size=%u",
> +			dev->data->port_id, obj_num, obj_size);
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	priv->mprq_mp = mp;
> +	DRV_LOG(INFO, "port %u Multi-Packet RQ is configured",
> +		dev->data->port_id);
> +	/* Set mempool for each Rx queue. */
> +	for (i = 0; i != priv->rxqs_n; ++i) {
> +		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
> +
> +		rxq_ctrl->rxq.mprq_mp = mp;
> +		mlx5_rxq_release(dev, i);
> +	}
> +	return 0;
> +}
> +
> +/**
> + * Configure Rx queue as Multi-Packet RQ.
> + *
> + * @param rxq_ctrl
> + *   Pointer to RX queue structure.
> + *
> + * @return
> + *   0 on success, negative errno value on failure.
> + */
> +static int
> +rxq_configure_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) {
> +	struct priv *priv = rxq_ctrl->priv;
> +	struct rte_eth_dev *dev = eth_dev(priv);
> +	struct mlx5_dev_config *config = &priv->config;
> +
> +	assert(rxq_ctrl->rxq.sges_n == 0);
> +	rxq_ctrl->rxq.strd_sz_n =
> +		MLX5_MPRQ_STRIDE_SZ_N -
> MLX5_MPRQ_MIN_STRIDE_SZ_N;

Why the subtraction? To save bits in the rxq_data context? Is it worth the effort? 
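
For what it's worth, with the current constants the stored value is 11 - 6 = 5 and the datapath recovers the size as 1 << (rxq->strd_sz_n + MLX5_MPRQ_MIN_STRIDE_SZ_N) = 2048 bytes, so the offset encoding is what lets strd_sz_n fit in the 3-bit field declared in mlx5_rxq_data.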

> +	rxq_ctrl->rxq.strd_num_n =
> +		MLX5_MPRQ_STRIDE_NUM_N -
> MLX5_MPRQ_MIN_STRIDE_NUM_N;

Same question. 

> +	rxq_ctrl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT;
> +	rxq_ctrl->rxq.mprq_max_memcpy_len = config-
> >mprq_max_memcpy_len;
> +	DRV_LOG(DEBUG, "port %u Rx queue %u: Multi-Packet RQ is
> enabled",
> +		dev->data->port_id, rxq_ctrl->idx);
> +	return 0;
> +}
> +
> +/**
>   * Create a DPDK Rx queue.
>   *
>   * @param dev
>   *   Pointer to Ethernet device.
>   * @param idx
> - *   TX queue index.
> + *   RX queue index.
>   * @param desc
>   *   Number of descriptors to configure in queue.
>   * @param socket
> @@ -956,8 +1354,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>  	 * Always allocate extra slots, even if eventually
>  	 * the vector Rx will not be used.
>  	 */
> -	const uint16_t desc_n =
> +	uint16_t desc_n =
>  		desc + config->rx_vec_en *
> MLX5_VPMD_DESCS_PER_LOOP;
> +	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
> 
>  	tmpl = rte_calloc_socket("RXQ", 1,
>  				 sizeof(*tmpl) +
> @@ -972,13 +1371,36 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>  		/* rte_errno is already set. */
>  		goto error;
>  	}
> +	tmpl->priv = priv;
> +	tmpl->idx = idx;
>  	tmpl->socket = socket;
>  	if (dev->data->dev_conf.intr_conf.rxq)
>  		tmpl->irq = 1;
> -	/* Enable scattered packets support for this queue if necessary. */
> +	/*
> +	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
> +	 * following conditions are met:
> +	 *  - MPRQ is enabled.
> +	 *  - The number of descs is more than the number of strides.
> +	 *  - max_rx_pkt_len is less than the size of a stride sparing
> headroom.
> +	 *
> +	 *  Otherwise, enable Rx scatter if necessary.
> +	 */
>  	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
> -	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
> -	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
> +	if (mprq_en &&
> +	    desc >= (1U << MLX5_MPRQ_STRIDE_NUM_N) &&
> +	    dev->data->dev_conf.rxmode.max_rx_pkt_len <=
> +	    (1U << MLX5_MPRQ_STRIDE_SZ_N) - RTE_PKTMBUF_HEADROOM)
> {

As I said, I think it is better to adjust the stride size to match max_rx_pkt_len + headroom.
An application which enables Striding RQ and follows the documented conditions expects it to work.
Currently it cannot change the stride size.

> +		int ret;
> +
> +		/* TODO: Rx scatter isn't supported yet. */
> +		tmpl->rxq.sges_n = 0;
> +		/* Trim the number of descs needed. */
> +		desc >>= MLX5_MPRQ_STRIDE_NUM_N;
> +		ret = rxq_configure_mprq(tmpl);
> +		if (ret)
> +			goto error;
> +	} else if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
> +		   (mb_len - RTE_PKTMBUF_HEADROOM)) {
>  		tmpl->rxq.sges_n = 0;
>  	} else if (conf->offloads & DEV_RX_OFFLOAD_SCATTER) {
>  		unsigned int size =
> @@ -1054,13 +1476,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev,
> uint16_t idx, uint16_t desc,
>  	tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf &&
>  		(!!(dev->data->dev_conf.rxmode.mq_mode &
> ETH_MQ_RX_RSS));
>  	tmpl->rxq.port_id = dev->data->port_id;
> -	tmpl->priv = priv;

Why is this removed? Is it related to this patch?

>  	tmpl->rxq.mp = mp;
>  	tmpl->rxq.stats.idx = idx;
>  	tmpl->rxq.elts_n = log2above(desc);
>  	tmpl->rxq.elts =
>  		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
> -	tmpl->idx = idx;

Same question. 

>  	rte_atomic32_inc(&tmpl->refcnt);
>  	DRV_LOG(DEBUG, "port %u Rx queue %u: refcnt %d", dev->data-
> >port_id,
>  		idx, rte_atomic32_read(&tmpl->refcnt));
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 299408b31..fecff9dd6 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -47,6 +47,9 @@ static __rte_always_inline void  rxq_cq_to_mbuf(struct
> mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
>  	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
> 
> +static __rte_always_inline void
> +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx);
> +
>  uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
>  	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
> }; @@ -1920,7 +1923,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf
> **pkts, uint16_t pkts_n)
> 
>  	while (pkts_n) {
>  		unsigned int idx = rq_ci & wqe_cnt;
> -		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq-
> >wqes)[idx];
> +		volatile struct mlx5_wqe_data_seg *wqe =
> +			&((volatile struct mlx5_wqe_data_seg *)rxq-
> >wqes)[idx];
>  		struct rte_mbuf *rep = (*rxq->elts)[idx];
>  		uint32_t rss_hash_res = 0;
> 
> @@ -2023,6 +2027,207 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf
> **pkts, uint16_t pkts_n)
>  	return i;
>  }
> 
> +void
> +mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) {
> +	struct mlx5_mprq_buf *buf = opaque;
> +
> +	if (rte_atomic16_read(&buf->refcnt) == 1) {
> +		rte_mempool_put(buf->mp, buf);
> +	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> +		rte_atomic16_set(&buf->refcnt, 1);
> +		rte_mempool_put(buf->mp, buf);
> +	}
> +}
> +
> +void
> +mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) {
> +	mlx5_mprq_buf_free_cb(NULL, buf);
> +}
> +
> +static inline void
> +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx) {
> +	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
> +	volatile struct mlx5_wqe_data_seg *wqe =
> +		&((volatile struct mlx5_wqe_mprq *)rxq-
> >wqes)[rq_idx].dseg;
> +	void *addr;
> +
> +	/* Replace mbuf. */
> +	(*rxq->mprq_bufs)[rq_idx] = rep;

Do we need to check that rep is not NULL, and adjust accordingly (i.e. not update the doorbell register)?
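
Something along these lines is what I have in mind, only a sketch of the guard (the caller side would also need to handle the skipped replacement):

	struct mlx5_mprq_buf *rep = rxq->mprq_repl;

	if (unlikely(rep == NULL)) {
		/* No spare buffer: keep the current buffer/WQE in place so
		 * the doorbell is not updated for a stale address. */
		return;
	}
	/* Replace mbuf. */
	(*rxq->mprq_bufs)[rq_idx] = rep;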

> +	/* Replace WQE. */
> +	addr = mlx5_mprq_buf_addr(rep);
> +	wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
> +	/* If there's only one MR, no need to replace LKey in WQE. */
> +	if (unlikely(!IS_SINGLE_MR(rxq->mr_ctrl.cache_bh.len)))
> +		wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
> +	/* Stash a mbuf for next replacement. */
> +	if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
> +		rxq->mprq_repl = rep;
> +	else
> +		rxq->mprq_repl = NULL;
> +}
> +
> +/**
> + * DPDK callback for RX with Multi-Packet RQ support.
> + *
> + * @param dpdk_rxq
> + *   Generic pointer to RX queue structure.
> + * @param[out] pkts
> + *   Array to store received packets.
> + * @param pkts_n
> + *   Maximum number of packets in array.
> + *
> + * @return
> + *   Number of packets successfully received (<= pkts_n).
> + */
> +uint16_t
> +mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t
> +pkts_n) {
> +	struct mlx5_rxq_data *rxq = dpdk_rxq;
> +	const unsigned int strd_n =
> +		1 << (rxq->strd_num_n +
> MLX5_MPRQ_MIN_STRIDE_NUM_N);
> +	const unsigned int strd_sz =
> +		1 << (rxq->strd_sz_n + MLX5_MPRQ_MIN_STRIDE_SZ_N);
> +	const unsigned int strd_shift =
> +		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;

About this strd_shift:
On one hand we don't use it; on the other hand it isn't expected to have a performance impact and the code is already written, so I guess it is OK to keep.

> +	const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
> +	const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
> +	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci &
> cq_mask];
> +	unsigned int i = 0;
> +	uint16_t rq_ci = rxq->rq_ci;
> +	uint16_t strd_idx = rxq->strd_ci;
> +	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
> +
> +	while (i < pkts_n) {
> +		struct rte_mbuf *pkt;
> +		void *addr;
> +		int ret;
> +		unsigned int len;
> +		uint16_t consumed_strd;
> +		uint32_t offset;
> +		uint32_t byte_cnt;
> +		uint32_t rss_hash_res = 0;
> +
> +		if (strd_idx == strd_n) {
> +			/* Replace WQE only if the buffer is still in use. */
> +			if (rte_atomic16_read(&buf->refcnt) > 1) {
> +				mprq_buf_replace(rxq, rq_ci & wq_mask);
> +				/* Release the old buffer. */
> +				mlx5_mprq_buf_free(buf);
> +			} else if (unlikely(rxq->mprq_repl == NULL)) {
> +				struct mlx5_mprq_buf *rep;
> +
> +				/*
> +				 * Currently, the MPRQ mempool is out of
> buffer
> +				 * and doing memcpy regardless of the size of
> Rx
> +				 * packet. Retry allocation to get back to
> +				 * normal.
> +				 */
> +				if (!rte_mempool_get(rxq->mprq_mp,
> +						     (void **)&rep))

Perhaps it is worth having a counter for that (the MPRQ mempool being out of buffers), maybe reported as part of the extended counters.
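
One possible shape, e.g. bumped where the pool is found empty, whether it ends up as a soft counter or an xstat (the field name below is made up for illustration):

#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Hypothetical counter for MPRQ pool exhaustion. */
				++rxq->stats.mprq_nombuf;
#endif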

> +					rxq->mprq_repl = rep;
> +			}
> +			/* Advance to the next WQE. */
> +			strd_idx = 0;
> +			++rq_ci;
> +			buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
> +		}
> +		cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
> +		ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &rss_hash_res);
> +		if (!ret)
> +			break;
> +		if (unlikely(ret == -1)) {
> +			/* RX error, packet is likely too large. */
> +			++rxq->stats.idropped;
> +			continue;
> +		}
> +		byte_cnt = ret;
> +		offset = strd_idx * strd_sz + strd_shift;
> +		consumed_strd = (byte_cnt &
> MLX5_MPRQ_STRIDE_NUM_MASK) >>
> +				MLX5_MPRQ_STRIDE_NUM_SHIFT;

Can more than one stride be consumed per packet?
If not, it is worth verifying it (assert or otherwise).
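
If the expectation is exactly one stride per regular packet (scatter is unsupported here), the check could be as simple as the following, assuming filler CQEs are the only multi-stride case:

	assert((byte_cnt & MLX5_MPRQ_FILLER_MASK) || consumed_strd == 1);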

> +		strd_idx += consumed_strd;
> +		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
> +			continue;
> +		pkt = rte_pktmbuf_alloc(rxq->mp);
> +		if (unlikely(pkt == NULL)) {
> +			++rxq->stats.rx_nombuf;
> +			break;
> +		}
> +		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >>
> MLX5_MPRQ_LEN_SHIFT;
> +		assert((int)len >= (rxq->crc_present << 2));
> +		if (rxq->crc_present)
> +			len -= ETHER_CRC_LEN;
> +		/* Initialize the offload flag. */
> +		pkt->ol_flags = 0;
> +		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf), offset);
> +		/*
> +		 * Memcpy packets to the target mbuf if:
> +		 * - The size of packet is smaller than
> MLX5_MPRQ_MEMCPY_LEN.
> +		 * - Out of buffer in the Mempool for Multi-Packet RQ.
> +		 */
> +		if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl
> == NULL) {
> +			rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr,
> len);
> +		} else {
> +			rte_iova_t buf_iova;
> +			struct rte_mbuf_ext_shared_info *shinfo;
> +			uint16_t buf_len = consumed_strd * strd_sz;
> +
> +			/* Increment the refcnt of the whole chunk. */
> +			rte_atomic16_add_return(&buf->refcnt, 1);
> +			assert((uint16_t)rte_atomic16_read(&buf->refcnt)
> <=
> +			       strd_n + 1);
> +			addr = RTE_PTR_SUB(addr,
> RTE_PKTMBUF_HEADROOM);
> +			/*
> +			 * MLX5 device doesn't use iova but it is necessary in
> a
> +			 * case where the Rx packet is transmitted via a
> +			 * different PMD.
> +			 */
> +			buf_iova = rte_mempool_virt2iova(buf) +
> +				   RTE_PTR_DIFF(addr, buf);

Is it possible for the mlx5_mprq_buf to be allocated on two different, non-contiguous physical pages?


> +			shinfo = rte_pktmbuf_ext_shinfo_init_helper(addr,
> +					&buf_len, mlx5_mprq_buf_free_cb,
> buf);

How do you make sure you can spare sizeof(*shinfo) at the end of the buffer?

From your prerequisite for Striding RQ:
+	/*
+	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
+	 * following conditions are met:
+	 *  - MPRQ is enabled.
+	 *  - The number of descs is more than the number of strides.
+	 *  - max_rx_pkt_len is less than the size of a stride sparing headroom.
+	 *
+	 *  Otherwise, enable Rx scatter if necessary.
+	 */
 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
-	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
-	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
+	if (mprq_en &&
+	    desc >= (1U << MLX5_MPRQ_STRIDE_NUM_N) &&
+	    dev->data->dev_conf.rxmode.max_rx_pkt_len <=
+	    (1U << MLX5_MPRQ_STRIDE_SZ_N) - RTE_PKTMBUF_HEADROOM) {
+		int ret;


So we make sure the max packet + headroom fits in a single stride. Should we also account for the shinfo?
Meaning the stride layout would be:
{
	Buffer for payload
	shinfo
	Headroom (of next stride)
}
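
If the shinfo is also carved out of the stride, the admission check in mlx5_rxq_new() would presumably have to reserve room for it as well, roughly (sketch of the adjusted condition only):

	if (mprq_en &&
	    desc >= (1U << MLX5_MPRQ_STRIDE_NUM_N) &&
	    dev->data->dev_conf.rxmode.max_rx_pkt_len <=
	    (1U << MLX5_MPRQ_STRIDE_SZ_N) - RTE_PKTMBUF_HEADROOM -
	    sizeof(struct rte_mbuf_ext_shared_info)) {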

> +			/*
> +			 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags
> when
> +			 * attaching the stride to mbuf and more offload flags
> +			 * will be added below by calling rxq_cq_to_mbuf().
> +			 * Other fields will be overwritten.
> +			 */
> +			rte_pktmbuf_attach_extbuf(pkt, addr, buf_iova,
> buf_len,
> +						  shinfo);
> +			rte_pktmbuf_reset_headroom(pkt);
> +			assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
> +		}
> +		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
> +		PKT_LEN(pkt) = len;
> +		DATA_LEN(pkt) = len;
> +		PORT(pkt) = rxq->port_id;
> +#ifdef MLX5_PMD_SOFT_COUNTERS
> +		/* Increment bytes counter. */
> +		rxq->stats.ibytes += PKT_LEN(pkt);
> +#endif
> +		/* Return packet. */
> +		*(pkts++) = pkt;
> +		++i;
> +	}
> +	/* Update the consumer index. */
> +	rxq->rq_pi += i;

rq_ci is advanced per WQE. But rq_pi is advanced per packet. Why? 

> +	rxq->strd_ci = strd_idx;
> +	rte_io_wmb();
> +	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
> +	if (rq_ci != rxq->rq_ci) {
> +		rxq->rq_ci = rq_ci;
> +		rte_io_wmb();
> +		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
> +	}
> +#ifdef MLX5_PMD_SOFT_COUNTERS
> +	/* Increment packets counter. */
> +	rxq->stats.ipackets += i;
> +#endif
> +	return i;
> +}
> +
>  /**
>   * Dummy DPDK callback for TX.
>   *
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 74581cf9b..c3f2621f3 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -64,6 +64,16 @@ struct rxq_zip {
>  	uint32_t cqe_cnt; /* Number of CQEs. */  };
> 
> +/* Multi-Packet RQ buffer header. */
> +struct mlx5_mprq_buf {
> +	struct rte_mempool *mp;
> +	rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
> +	uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first
> packet.
> +*/ } __rte_cache_aligned;
> +
> +/* Get pointer to the first stride. */
> +#define mlx5_mprq_buf_addr(ptr) ((ptr) + 1)
> +
>  /* RX queue descriptor. */
>  struct mlx5_rxq_data {
>  	unsigned int csum:1; /* Enable checksum offloading. */ @@ -75,19
> +85,30 @@ struct mlx5_rxq_data {
>  	unsigned int elts_n:4; /* Log 2 of Mbufs. */
>  	unsigned int rss_hash:1; /* RSS hash result is enabled. */
>  	unsigned int mark:1; /* Marked flow available on the queue. */
> -	unsigned int :15; /* Remaining bits. */
> +	unsigned int strd_sz_n:3; /* Log 2 of stride size. */
> +	unsigned int strd_num_n:4; /* Log 2 of the number of stride. */
> +	unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */
> +	unsigned int :8; /* Remaining bits. */
>  	volatile uint32_t *rq_db;
>  	volatile uint32_t *cq_db;
>  	uint16_t port_id;
>  	uint16_t rq_ci;
> +	uint16_t strd_ci; /* Stride index in a WQE for Multi-Packet RQ. */
>  	uint16_t rq_pi;
>  	uint16_t cq_ci;
>  	struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
> -	volatile struct mlx5_wqe_data_seg(*wqes)[];
> +	uint16_t mprq_max_memcpy_len; /* Maximum size of packet to
> memcpy. */
> +	volatile void *wqes;
>  	volatile struct mlx5_cqe(*cqes)[];
>  	struct rxq_zip zip; /* Compressed context. */
> -	struct rte_mbuf *(*elts)[];
> +	RTE_STD_C11
> +	union  {
> +		struct rte_mbuf *(*elts)[];
> +		struct mlx5_mprq_buf *(*mprq_bufs)[];
> +	};
>  	struct rte_mempool *mp;
> +	struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ.
> */
> +	struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish.
> */
>  	struct mlx5_rxq_stats stats;
>  	uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */
>  	struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */ @@
> -206,6 +227,11 @@ struct mlx5_txq_ctrl {  extern uint8_t
> rss_hash_default_key[];  extern const size_t rss_hash_default_key_len;
> 
> +int mlx5_check_mprq_support(struct rte_eth_dev *dev); int
> +mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq); int
> +mlx5_mprq_enabled(struct rte_eth_dev *dev); int
> +mlx5_mprq_free_mp(struct rte_eth_dev *dev); int
> +mlx5_mprq_alloc_mp(struct rte_eth_dev *dev);
>  void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl);  int
> mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
>  			unsigned int socket, const struct rte_eth_rxconf
> *conf, @@ -229,6 +255,7 @@ int mlx5_rxq_release(struct rte_eth_dev
> *dev, uint16_t idx);  int mlx5_rxq_releasable(struct rte_eth_dev *dev,
> uint16_t idx);  int mlx5_rxq_verify(struct rte_eth_dev *dev);  int
> rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl);
> +int rxq_alloc_mprq_buf(struct mlx5_rxq_ctrl *rxq_ctrl);
>  struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev
> *dev,
>  						  const uint16_t *queues,
>  						  uint32_t queues_n);
> @@ -292,6 +319,10 @@ uint16_t mlx5_tx_burst_mpw_inline(void
> *dpdk_txq, struct rte_mbuf **pkts,  uint16_t mlx5_tx_burst_empw(void
> *dpdk_txq, struct rte_mbuf **pkts,
>  			    uint16_t pkts_n);
>  uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t
> pkts_n);
> +void mlx5_mprq_buf_free_cb(void *addr, void *opaque); void
> +mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf); uint16_t
> +mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts,
> +			    uint16_t pkts_n);
>  uint16_t removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
>  			  uint16_t pkts_n);
>  uint16_t removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, diff --
> git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
> index 215a9e592..06efcc200 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.c
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
> @@ -275,6 +275,8 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data
> *rxq)
>  	struct mlx5_rxq_ctrl *ctrl =
>  		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
> 
> +	if (mlx5_mprq_enabled(eth_dev(ctrl->priv)))
> +		return -ENOTSUP;
>  	if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)
>  		return -ENOTSUP;
>  	return 1;
> @@ -297,6 +299,8 @@ mlx5_check_vec_rx_support(struct rte_eth_dev
> *dev)
> 
>  	if (!priv->config.rx_vec_en)
>  		return -ENOTSUP;
> +	if (mlx5_mprq_enabled(dev))
> +		return -ENOTSUP;
>  	/* All the configured queues should support. */
>  	for (i = 0; i < priv->rxqs_n; ++i) {
>  		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; diff --git
> a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
> index 76678a820..f58047a06 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
> @@ -87,7 +87,8 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data
> *rxq, uint16_t n)
>  	const uint16_t q_mask = q_n - 1;
>  	uint16_t elts_idx = rxq->rq_ci & q_mask;
>  	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
> -	volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx];
> +	volatile struct mlx5_wqe_data_seg *wq =
> +		&((volatile struct mlx5_wqe_data_seg *)rxq-
> >wqes)[elts_idx];
>  	unsigned int i;
> 
>  	assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH); diff --git
> a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c index
> 36b7c9e2f..3e7c0a90f 100644
> --- a/drivers/net/mlx5/mlx5_trigger.c
> +++ b/drivers/net/mlx5/mlx5_trigger.c
> @@ -102,6 +102,9 @@ mlx5_rxq_start(struct rte_eth_dev *dev)
>  	unsigned int i;
>  	int ret = 0;
> 
> +	/* Allocate/reuse/resize mempool for Multi-Packet RQ. */
> +	if (mlx5_mprq_alloc_mp(dev))
> +		goto error;
>  	for (i = 0; i != priv->rxqs_n; ++i) {
>  		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i);
>  		struct rte_mempool *mp;
> @@ -109,7 +112,8 @@ mlx5_rxq_start(struct rte_eth_dev *dev)
>  		if (!rxq_ctrl)
>  			continue;
>  		/* Pre-register Rx mempool. */
> -		mp = rxq_ctrl->rxq.mp;
> +		mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
> +		     rxq_ctrl->rxq.mprq_mp : rxq_ctrl->rxq.mp;
>  		DRV_LOG(DEBUG,
>  			"port %u Rx queue %u registering"
>  			" mp %s having %u chunks",
> --
> 2.11.0


