[PATCH v3 04/9] net/mlx5: add per-queue packet pacing infrastructure

Slava Ovsiienko viacheslavo at nvidia.com
Fri Mar 20 13:51:55 CET 2026


Hi,

> -----Original Message-----
> From: Vincent Jardin <vjardin at free.fr>
> Sent: Friday, March 13, 2026 12:01 AM
> To: dev at dpdk.org
> Cc: Raslan Darawsheh <rasland at nvidia.com>; NBU-Contact-Thomas Monjalon
> (EXTERNAL) <thomas at monjalon.net>; andrew.rybchenko at oktetlabs.ru;
> Dariusz Sosnowski <dsosnowski at nvidia.com>; Slava Ovsiienko
> <viacheslavo at nvidia.com>; Bing Zhao <bingz at nvidia.com>; Ori Kam
> <orika at nvidia.com>; Suanming Mou <suanmingm at nvidia.com>; Matan Azrad
> <matan at nvidia.com>; stephen at networkplumber.org; Vincent Jardin
> <vjardin at free.fr>
> Subject: [PATCH v3 04/9] net/mlx5: add per-queue packet pacing infrastructure
> 
> Add mlx5_txq_rate_limit structure and alloc/free helpers for per-queue data-
> rate packet pacing. Each Tx queue can now hold its own PP (Packet Pacing)
> index allocated via mlx5dv_pp_alloc() with MLX5_DATA_RATE mode.
> 
> mlx5_txq_alloc_pp_rate_limit() converts Mbps to kbps for the PRM rate_limit
> field and allocates a PP index from the HW rate table.
> mlx5_txq_free_pp_rate_limit() releases it.
> 
> PP allocation uses shared mode (flags=0) so that the kernel mlx5 driver can
> reuse a single HW rate table entry for all PP contexts with identical parameters
> (rate, burst, packet size). This avoids exhausting the rate table (typically 128
> entries on ConnectX-6 Dx) when many queues share the same rate. Each queue
> still gets its own PP handle for proper cleanup.
> 
> The existing Clock Queue path (sh->txpp.pp / sh->txpp.pp_id) is untouched — it
> uses MLX5_WQE_RATE for per-packet scheduling with a dedicated index, while
> per-queue rate limiting uses MLX5_DATA_RATE.
> 
> PP index cleanup is added to mlx5_txq_release() to prevent leaks when queues
> are destroyed.
> 
> Supported hardware:
> - ConnectX-6 Dx: per-SQ rate via packet_pacing_rate_limit_index
> - ConnectX-7/8: same mechanism, plus wait-on-time coexistence
> - BlueField-2/3: same PP allocation support
> 
> Not supported:
> - ConnectX-5: packet_pacing exists but MLX5_DATA_RATE mode may
>   not be available on all firmware versions
> - ConnectX-4 Lx and earlier: no packet_pacing capability
> 
> Signed-off-by: Vincent Jardin <vjardin at free.fr>
> ---
>  drivers/net/mlx5/mlx5.h      | 11 ++++++
>  drivers/net/mlx5/mlx5_tx.h   |  1 +
>  drivers/net/mlx5/mlx5_txpp.c | 73
> ++++++++++++++++++++++++++++++++++++
>  drivers/net/mlx5/mlx5_txq.c  |  1 +
>  4 files changed, 86 insertions(+)
> 
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> b83dda5652..c48c3072d1 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -1296,6 +1296,13 @@ struct mlx5_txpp_ts {
>  	RTE_ATOMIC(uint64_t) ts;
>  };
> 
> +/* Per-queue rate limit tracking. */
> +struct mlx5_txq_rate_limit {
> +	void *pp;		/* Packet pacing context from dv_alloc_pp. */
> +	uint16_t pp_id;		/* Packet pacing index. */
> +	uint32_t rate_mbps;	/* Current rate in Mbps, 0 = disabled. */
> +};
> +
>  /* Tx packet pacing structure. */
>  struct mlx5_dev_txpp {
>  	pthread_mutex_t mutex; /* Pacing create/destroy mutex. */ @@ -
> 2634,6 +2641,10 @@ int mlx5_txpp_xstats_get_names(struct rte_eth_dev
> *dev,  void mlx5_txpp_interrupt_handler(void *cb_arg);  int
> mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev);  void
> mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev);
> +int mlx5_txq_alloc_pp_rate_limit(struct mlx5_dev_ctx_shared *sh,
> +				 struct mlx5_txq_rate_limit *rl,
> +				 uint32_t rate_mbps);
> +void mlx5_txq_free_pp_rate_limit(struct mlx5_txq_rate_limit *rl);
> 
>  /* mlx5_rxtx.c */
> 
> diff --git a/drivers/net/mlx5/mlx5_tx.h b/drivers/net/mlx5/mlx5_tx.h index
> 0134a2e003..b1b3653247 100644
> --- a/drivers/net/mlx5/mlx5_tx.h
> +++ b/drivers/net/mlx5/mlx5_tx.h
> @@ -192,6 +192,7 @@ struct mlx5_txq_ctrl {
>  	uint16_t dump_file_n; /* Number of dump files. */
>  	struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
>  	uint32_t hairpin_status; /* Hairpin binding status. */
> +	struct mlx5_txq_rate_limit rl; /* Per-queue rate limit. */

Could we use the "rate_limit" naming instead of "rl"?
"rl" is a bit out of line with the current naming style (please see how the other struct members are named).

>  	struct mlx5_txq_data txq; /* Data path structure. */
>  	/* Must be the last field in the structure, contains elts[]. */  }; diff --git
> a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c index
> 0e99b58bde..0a883b0a94 100644
> --- a/drivers/net/mlx5/mlx5_txpp.c
> +++ b/drivers/net/mlx5/mlx5_txpp.c
> @@ -128,6 +128,79 @@ mlx5_txpp_alloc_pp_index(struct
> mlx5_dev_ctx_shared *sh)  #endif  }
> 
> +/* Free a per-queue packet pacing index. */ void
> +mlx5_txq_free_pp_rate_limit(struct mlx5_txq_rate_limit *rl) { #ifdef
> +HAVE_MLX5DV_PP_ALLOC
> +	if (rl->pp) {
> +		mlx5_glue->dv_free_pp(rl->pp);
> +		rl->pp = NULL;
> +		rl->pp_id = 0;
> +		rl->rate_mbps = 0;
> +	}
> +#else
> +	RTE_SET_USED(rl);
> +#endif
> +}
> +
> +/* Allocate a per-queue packet pacing index for data-rate limiting. */
> +int mlx5_txq_alloc_pp_rate_limit(struct mlx5_dev_ctx_shared *sh,
> +			     struct mlx5_txq_rate_limit *rl,
> +			     uint32_t rate_mbps)
> +{
> +#ifdef HAVE_MLX5DV_PP_ALLOC
> +	uint32_t pp[MLX5_ST_SZ_DW(set_pp_rate_limit_context)];
> +	uint64_t rate_kbps;
> +	struct mlx5_hca_qos_attr *qos = &sh->cdev->config.hca_attr.qos;
> +
> +	MLX5_ASSERT(rate_mbps > 0);

Should we also validate rate_mbps in non-debug builds? MLX5_ASSERT is compiled out when assertions are disabled.

> +	rate_kbps = (uint64_t)rate_mbps * 1000;
> +	if (qos->packet_pacing_min_rate && rate_kbps < qos-
> >packet_pacing_min_rate) {
> +		DRV_LOG(ERR, "Rate %u Mbps below HW minimum (%u
> kbps).",
> +			rate_mbps, qos->packet_pacing_min_rate);
> +		rte_errno = ERANGE;
> +		return -ERANGE;
> +	}
> +	if (qos->packet_pacing_max_rate && rate_kbps > qos-
> >packet_pacing_max_rate) {
> +		DRV_LOG(ERR, "Rate %u Mbps exceeds HW maximum (%u
> kbps).",
> +			rate_mbps, qos->packet_pacing_max_rate);
> +		rte_errno = ERANGE;
> +		return -ERANGE;
> +	}
> +	memset(&pp, 0, sizeof(pp));
> +	MLX5_SET(set_pp_rate_limit_context, &pp, rate_limit,
> (uint32_t)rate_kbps);
> +	MLX5_SET(set_pp_rate_limit_context, &pp, rate_mode,
> MLX5_DATA_RATE);
> +	rl->pp = mlx5_glue->dv_alloc_pp(sh->cdev->ctx, sizeof(pp), &pp, 0);
> +	if (rl->pp == NULL) {
> +		DRV_LOG(ERR, "Failed to allocate PP index for rate %u Mbps.",
> +			rate_mbps);
> +		rte_errno = errno;
> +		return -errno;
> +	}
> +	rl->pp_id = ((struct mlx5dv_pp *)rl->pp)->index;
> +	if (!rl->pp_id) {
> +		DRV_LOG(ERR, "Zero PP index allocated for rate %u Mbps.",
> +			rate_mbps);
> +		mlx5_txq_free_pp_rate_limit(rl);
> +		rte_errno = ENOTSUP;
> +		return -ENOTSUP;
> +	}
> +	rl->rate_mbps = rate_mbps;
> +	DRV_LOG(DEBUG, "Allocated PP index %u for rate %u Mbps.",
> +		rl->pp_id, rate_mbps);
> +	return 0;
> +#else
> +	RTE_SET_USED(sh);
> +	RTE_SET_USED(rl);
> +	RTE_SET_USED(rate_mbps);
> +	DRV_LOG(ERR, "Per-queue rate limit requires rdma-core PP support.");
> +	rte_errno = ENOTSUP;
> +	return -ENOTSUP;
> +#endif
> +}
> +
>  static void
>  mlx5_txpp_destroy_send_queue(struct mlx5_txpp_wq *wq)  { diff --git
> a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 9275efb58e..fa9bb48fd4 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -1338,6 +1338,7 @@ mlx5_txq_release(struct rte_eth_dev *dev, uint16_t
> idx)
>  	txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
>  	if (rte_atomic_fetch_sub_explicit(&txq_ctrl->refcnt, 1,
> rte_memory_order_relaxed) - 1 > 1)
>  		return 1;
> +	mlx5_txq_free_pp_rate_limit(&txq_ctrl->rl);

This might be a problematic teardown order.
It would be better to release the rate_limit object AFTER the queue is destroyed (i.e. in txq_obj_release).

>  	if (txq_ctrl->obj) {
>  		priv->obj_ops.txq_obj_release(txq_ctrl->obj);
>  		LIST_REMOVE(txq_ctrl->obj, next);
> --
> 2.43.0

With best regards,
Slava


More information about the dev mailing list