[PATCH v3 4/9] net/mlx5: add per-queue packet pacing infrastructure
Vincent Jardin
vjardin at free.fr
Thu Mar 12 23:01:15 CET 2026
Add mlx5_txq_rate_limit structure and alloc/free helpers for
per-queue data-rate packet pacing. Each Tx queue can now hold
its own PP (Packet Pacing) index allocated via mlx5dv_pp_alloc()
with MLX5_DATA_RATE mode.
mlx5_txq_alloc_pp_rate_limit() converts Mbps to kbps for the PRM
rate_limit field and allocates a PP index from the HW rate table.
mlx5_txq_free_pp_rate_limit() releases it.
PP allocation uses shared mode (flags=0) so that the kernel mlx5
driver can reuse a single HW rate table entry for all PP contexts
with identical parameters (rate, burst, packet size). This avoids
exhausting the rate table (typically 128 entries on ConnectX-6 Dx)
when many queues share the same rate. Each queue still gets its
own PP handle for proper cleanup.
The existing Clock Queue path (sh->txpp.pp / sh->txpp.pp_id) is
untouched — it uses MLX5_WQE_RATE for per-packet scheduling with
a dedicated index, while per-queue rate limiting uses MLX5_DATA_RATE.
PP index cleanup is added to mlx5_txq_release() to prevent leaks
when queues are destroyed.
Supported hardware:
- ConnectX-6 Dx: per-SQ rate via packet_pacing_rate_limit_index
- ConnectX-7/8: same mechanism, plus wait-on-time coexistence
- BlueField-2/3: same PP allocation support
Not supported:
- ConnectX-5: packet_pacing exists but MLX5_DATA_RATE mode may
not be available on all firmware versions
- ConnectX-4 Lx and earlier: no packet_pacing capability
Signed-off-by: Vincent Jardin <vjardin at free.fr>
---
drivers/net/mlx5/mlx5.h | 11 ++++++
drivers/net/mlx5/mlx5_tx.h | 1 +
drivers/net/mlx5/mlx5_txpp.c | 73 ++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_txq.c | 1 +
4 files changed, 86 insertions(+)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index b83dda5652..c48c3072d1 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1296,6 +1296,13 @@ struct mlx5_txpp_ts {
RTE_ATOMIC(uint64_t) ts;
};
+/* Per-queue rate limit tracking (one instance embedded in each Tx queue ctrl). */
+struct mlx5_txq_rate_limit {
+ void *pp; /* mlx5dv_pp context from dv_alloc_pp(); NULL when no limit is set. */
+ uint16_t pp_id; /* HW packet pacing index to program into the SQ. */
+ uint32_t rate_mbps; /* Current rate in Mbps, 0 = disabled. */
+};
+
/* Tx packet pacing structure. */
struct mlx5_dev_txpp {
pthread_mutex_t mutex; /* Pacing create/destroy mutex. */
@@ -2634,6 +2641,10 @@ int mlx5_txpp_xstats_get_names(struct rte_eth_dev *dev,
void mlx5_txpp_interrupt_handler(void *cb_arg);
int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev);
void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev);
+int mlx5_txq_alloc_pp_rate_limit(struct mlx5_dev_ctx_shared *sh,
+ struct mlx5_txq_rate_limit *rl,
+ uint32_t rate_mbps);
+void mlx5_txq_free_pp_rate_limit(struct mlx5_txq_rate_limit *rl);
/* mlx5_rxtx.c */
diff --git a/drivers/net/mlx5/mlx5_tx.h b/drivers/net/mlx5/mlx5_tx.h
index 0134a2e003..b1b3653247 100644
--- a/drivers/net/mlx5/mlx5_tx.h
+++ b/drivers/net/mlx5/mlx5_tx.h
@@ -192,6 +192,7 @@ struct mlx5_txq_ctrl {
uint16_t dump_file_n; /* Number of dump files. */
struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
uint32_t hairpin_status; /* Hairpin binding status. */
+ struct mlx5_txq_rate_limit rl; /* Per-queue rate limit. */
struct mlx5_txq_data txq; /* Data path structure. */
/* Must be the last field in the structure, contains elts[]. */
};
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 0e99b58bde..0a883b0a94 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -128,6 +128,79 @@ mlx5_txpp_alloc_pp_index(struct mlx5_dev_ctx_shared *sh)
#endif
}
+/* Free a per-queue packet pacing index.
+ *
+ * Releases the PP context allocated by mlx5_txq_alloc_pp_rate_limit()
+ * and resets the tracker so it can be reused. Safe to call when no
+ * index is allocated (rl->pp == NULL) - the call is then a no-op,
+ * which lets mlx5_txq_release() invoke it unconditionally.
+ */
+void
+mlx5_txq_free_pp_rate_limit(struct mlx5_txq_rate_limit *rl)
+{
+#ifdef HAVE_MLX5DV_PP_ALLOC
+ if (rl->pp) {
+ mlx5_glue->dv_free_pp(rl->pp);
+ /* Clear all state: pp_id/rate are only valid while pp is held. */
+ rl->pp = NULL;
+ rl->pp_id = 0;
+ rl->rate_mbps = 0;
+ }
+#else
+ /* Built without rdma-core PP support - nothing to release. */
+ RTE_SET_USED(rl);
+#endif
+}
+
+/* Allocate a per-queue packet pacing index for data-rate limiting.
+ *
+ * Converts rate_mbps to kbps (the PRM rate_limit unit), validates it
+ * against the HW QoS min/max rates when the capability reports them,
+ * and allocates a PP index in shared mode (flags = 0) so that queues
+ * with identical parameters reuse a single HW rate table entry.
+ *
+ * @param sh
+ *   Shared device context (provides cdev ctx and HCA QoS attributes).
+ * @param rl
+ *   Per-queue rate limit tracker to fill; must not hold a live index.
+ * @param rate_mbps
+ *   Requested rate in Mbps, must be non-zero.
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_txq_alloc_pp_rate_limit(struct mlx5_dev_ctx_shared *sh,
+ struct mlx5_txq_rate_limit *rl,
+ uint32_t rate_mbps)
+{
+#ifdef HAVE_MLX5DV_PP_ALLOC
+ uint32_t pp[MLX5_ST_SZ_DW(set_pp_rate_limit_context)];
+ uint64_t rate_kbps;
+ struct mlx5_hca_qos_attr *qos = &sh->cdev->config.hca_attr.qos;
+
+ MLX5_ASSERT(rate_mbps > 0);
+ rate_kbps = (uint64_t)rate_mbps * 1000;
+ /*
+ * The PRM rate_limit field is 32 bits wide. Reject values that
+ * would silently wrap when the max-rate capability is unreported.
+ */
+ if (rate_kbps > UINT32_MAX) {
+ DRV_LOG(ERR, "Rate %u Mbps overflows 32-bit kbps rate field.",
+ rate_mbps);
+ rte_errno = ERANGE;
+ return -ERANGE;
+ }
+ if (qos->packet_pacing_min_rate && rate_kbps < qos->packet_pacing_min_rate) {
+ DRV_LOG(ERR, "Rate %u Mbps below HW minimum (%u kbps).",
+ rate_mbps, qos->packet_pacing_min_rate);
+ rte_errno = ERANGE;
+ return -ERANGE;
+ }
+ if (qos->packet_pacing_max_rate && rate_kbps > qos->packet_pacing_max_rate) {
+ DRV_LOG(ERR, "Rate %u Mbps exceeds HW maximum (%u kbps).",
+ rate_mbps, qos->packet_pacing_max_rate);
+ rte_errno = ERANGE;
+ return -ERANGE;
+ }
+ memset(&pp, 0, sizeof(pp));
+ MLX5_SET(set_pp_rate_limit_context, &pp, rate_limit, (uint32_t)rate_kbps);
+ MLX5_SET(set_pp_rate_limit_context, &pp, rate_mode, MLX5_DATA_RATE);
+ /* flags = 0 requests a shared entry (kernel dedups identical rates). */
+ rl->pp = mlx5_glue->dv_alloc_pp(sh->cdev->ctx, sizeof(pp), &pp, 0);
+ if (rl->pp == NULL) {
+ DRV_LOG(ERR, "Failed to allocate PP index for rate %u Mbps.",
+ rate_mbps);
+ /*
+ * Ensure a failure is reported even if the provider did not
+ * set errno - returning -0 would look like success.
+ */
+ rte_errno = errno ? errno : ENOSPC;
+ return -rte_errno;
+ }
+ rl->pp_id = ((struct mlx5dv_pp *)rl->pp)->index;
+ if (!rl->pp_id) {
+ /* Index 0 is reserved and cannot be attached to an SQ. */
+ DRV_LOG(ERR, "Zero PP index allocated for rate %u Mbps.",
+ rate_mbps);
+ mlx5_txq_free_pp_rate_limit(rl);
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ rl->rate_mbps = rate_mbps;
+ DRV_LOG(DEBUG, "Allocated PP index %u for rate %u Mbps.",
+ rl->pp_id, rate_mbps);
+ return 0;
+#else
+ RTE_SET_USED(sh);
+ RTE_SET_USED(rl);
+ RTE_SET_USED(rate_mbps);
+ DRV_LOG(ERR, "Per-queue rate limit requires rdma-core PP support.");
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+#endif
+}
+
static void
mlx5_txpp_destroy_send_queue(struct mlx5_txpp_wq *wq)
{
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9275efb58e..fa9bb48fd4 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -1338,6 +1338,7 @@ mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
if (rte_atomic_fetch_sub_explicit(&txq_ctrl->refcnt, 1, rte_memory_order_relaxed) - 1 > 1)
return 1;
+ mlx5_txq_free_pp_rate_limit(&txq_ctrl->rl);
if (txq_ctrl->obj) {
priv->obj_ops.txq_obj_release(txq_ctrl->obj);
LIST_REMOVE(txq_ctrl->obj, next);
--
2.43.0
More information about the dev
mailing list