[PATCH v3] net/mlx5: fix age checking crash

Rongwei Liu rongweil at nvidia.com
Thu Oct 9 11:18:10 CEST 2025


When aging is configured, there is a background thread
which queries all the counters in the pool.

Meantime, per queue flow insertion/deletion/update changes
the counter pool too. It introduces a race condition between
resetting counters's in_used and age_idx fields during flow deletion
and reading them in the background thread.

To resolve it, all key members of counter's struct
are placed in a single uint32_t and they are accessed atomically.

To avoid the occasional timestamp equalization with age_idx,
query_gen_when_free is moved out of the union. The total memory
size is kept the same.

Fixes: 04a4de756e14 ("net/mlx5: support flow age action with HWS")
Cc: michaelba at nvidia.com
Cc: stable at dpdk.org
Signed-off-by: Rongwei Liu <rongweil at nvidia.com>
Acked-by: Dariusz Sosnowski <dsosnowski at nvidia.com>

v3: fix windows compilation error.
v2: fix clang compilation error.
---
 drivers/net/mlx5/mlx5_flow_hw.c |   5 +-
 drivers/net/mlx5/mlx5_hws_cnt.c |  10 +--
 drivers/net/mlx5/mlx5_hws_cnt.h | 133 ++++++++++++++++++++------------
 3 files changed, 90 insertions(+), 58 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_flow_hw.c b/drivers/net/mlx5/mlx5_flow_hw.c
index 9a0aa1827e..491a78a0de 100644
--- a/drivers/net/mlx5/mlx5_flow_hw.c
+++ b/drivers/net/mlx5/mlx5_flow_hw.c
@@ -3232,7 +3232,7 @@ flow_hw_shared_action_construct(struct rte_eth_dev *dev, uint32_t queue,
 			return -1;
 		if (action_flags & MLX5_FLOW_ACTION_COUNT) {
 			cnt_queue = mlx5_hws_cnt_get_queue(priv, &queue);
-			if (mlx5_hws_cnt_pool_get(priv->hws_cpool, cnt_queue, &age_cnt, idx) < 0)
+			if (mlx5_hws_cnt_pool_get(priv->hws_cpool, cnt_queue, &age_cnt, idx, 0) < 0)
 				return -1;
 			flow->flags |= MLX5_FLOW_HW_FLOW_FLAG_CNT_ID;
 			flow->cnt_id = age_cnt;
@@ -3668,7 +3668,8 @@ flow_hw_actions_construct(struct rte_eth_dev *dev,
 			/* Fall-through. */
 		case RTE_FLOW_ACTION_TYPE_COUNT:
 			cnt_queue = mlx5_hws_cnt_get_queue(priv, &queue);
-			ret = mlx5_hws_cnt_pool_get(priv->hws_cpool, cnt_queue, &cnt_id, age_idx);
+			ret = mlx5_hws_cnt_pool_get(priv->hws_cpool, cnt_queue, &cnt_id,
+						    age_idx, 0);
 			if (ret != 0) {
 				rte_flow_error_set(error, -ret, RTE_FLOW_ERROR_TYPE_ACTION,
 						action, "Failed to allocate flow counter");
diff --git a/drivers/net/mlx5/mlx5_hws_cnt.c b/drivers/net/mlx5/mlx5_hws_cnt.c
index 5c738f38ca..fb01fce4e5 100644
--- a/drivers/net/mlx5/mlx5_hws_cnt.c
+++ b/drivers/net/mlx5/mlx5_hws_cnt.c
@@ -63,8 +63,8 @@ __mlx5_hws_cnt_svc(struct mlx5_dev_ctx_shared *sh,
 	uint32_t ret __rte_unused;
 
 	reset_cnt_num = rte_ring_count(reset_list);
-	cpool->query_gen++;
 	mlx5_aso_cnt_query(sh, cpool);
+	rte_atomic_fetch_add_explicit(&cpool->query_gen, 1, rte_memory_order_release);
 	zcdr.n1 = 0;
 	zcdu.n1 = 0;
 	ret = rte_ring_enqueue_zc_burst_elem_start(reuse_list,
@@ -134,14 +134,14 @@ mlx5_hws_aging_check(struct mlx5_priv *priv, struct mlx5_hws_cnt_pool *cpool)
 	uint32_t nb_alloc_cnts = mlx5_hws_cnt_pool_get_size(cpool);
 	uint16_t expected1 = HWS_AGE_CANDIDATE;
 	uint16_t expected2 = HWS_AGE_CANDIDATE_INSIDE_RING;
-	uint32_t i;
+	uint32_t i, age_idx, in_use;
 
 	cpool->time_of_last_age_check = curr_time;
 	for (i = 0; i < nb_alloc_cnts; ++i) {
-		uint32_t age_idx = cpool->pool[i].age_idx;
 		uint64_t hits;
 
-		if (!cpool->pool[i].in_used || age_idx == 0)
+		mlx5_hws_cnt_get_all(&cpool->pool[i], &in_use, NULL, &age_idx);
+		if (!in_use || age_idx == 0)
 			continue;
 		param = mlx5_ipool_get(age_info->ages_ipool, age_idx);
 		if (unlikely(param == NULL)) {
@@ -767,7 +767,7 @@ mlx5_hws_cnt_pool_create(struct rte_eth_dev *dev,
 	 * because they already have init value no need
 	 * to wait for query.
 	 */
-	cpool->query_gen = 1;
+	rte_atomic_store_explicit(&cpool->query_gen, 1, rte_memory_order_relaxed);
 	ret = mlx5_hws_cnt_pool_action_create(priv, cpool);
 	if (ret != 0) {
 		rte_flow_error_set(error, -ret,
diff --git a/drivers/net/mlx5/mlx5_hws_cnt.h b/drivers/net/mlx5/mlx5_hws_cnt.h
index 38a9c19449..7af7e71ee0 100644
--- a/drivers/net/mlx5/mlx5_hws_cnt.h
+++ b/drivers/net/mlx5/mlx5_hws_cnt.h
@@ -42,33 +42,36 @@ struct mlx5_hws_cnt_dcs_mng {
 	struct mlx5_hws_cnt_dcs dcs[MLX5_HWS_CNT_DCS_NUM];
 };
 
-struct mlx5_hws_cnt {
-	struct flow_counter_stats reset;
-	bool in_used; /* Indicator whether this counter in used or in pool. */
-	union {
-		struct {
-			uint32_t share:1;
-			/*
-			 * share will be set to 1 when this counter is used as
-			 * indirect action.
-			 */
-			uint32_t age_idx:24;
-			/*
-			 * When this counter uses for aging, it save the index
-			 * of AGE parameter. For pure counter (without aging)
-			 * this index is zero.
-			 */
-		};
-		/* This struct is only meaningful when user own this counter. */
-		uint32_t query_gen_when_free;
+union mlx5_hws_cnt_state {
+	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(uint32_t) data;
+	struct {
+		uint32_t in_used:1;
+		/* Indicator whether this counter in used or in pool. */
+		uint32_t share:1;
+		/*
+		 * share will be set to 1 when this counter is used as
+		 * indirect action.
+		 */
+		uint32_t age_idx:24;
 		/*
-		 * When PMD own this counter (user put back counter to PMD
-		 * counter pool, i.e), this field recorded value of counter
-		 * pools query generation at time user release the counter.
+		 * When this counter uses for aging, it stores the index
+		 * of AGE parameter. Otherwise, this index is zero.
 		 */
 	};
 };
 
+struct mlx5_hws_cnt {
+	struct flow_counter_stats reset;
+	union mlx5_hws_cnt_state cnt_state;
+	/* This struct is only meaningful when user own this counter. */
+	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(uint32_t) query_gen_when_free;
+	/*
+	 * When PMD own this counter (user put back counter to PMD
+	 * counter pool, i.e), this field recorded value of counter
+	 * pools query generation at time user release the counter.
+	 */
+};
+
 struct mlx5_hws_cnt_raw_data_mng {
 	struct flow_counter_stats *raw;
 	struct mlx5_pmd_mr mr;
@@ -197,6 +200,42 @@ mlx5_hws_cnt_id_valid(cnt_id_t cnt_id)
 		MLX5_INDIRECT_ACTION_TYPE_COUNT ? true : false;
 }
 
+static __rte_always_inline void
+mlx5_hws_cnt_set_age_idx(struct mlx5_hws_cnt *cnt, uint32_t value)
+{
+	union mlx5_hws_cnt_state cnt_state;
+
+	cnt_state.data = rte_atomic_load_explicit(&cnt->cnt_state.data, rte_memory_order_acquire);
+	cnt_state.age_idx = value;
+	rte_atomic_store_explicit(&cnt->cnt_state.data, cnt_state.data, rte_memory_order_release);
+}
+
+static __rte_always_inline void
+mlx5_hws_cnt_set_all(struct mlx5_hws_cnt *cnt, uint32_t in_used, uint32_t share, uint32_t age_idx)
+{
+	union mlx5_hws_cnt_state cnt_state;
+
+	cnt_state.in_used = !!in_used;
+	cnt_state.share = !!share;
+	cnt_state.age_idx = age_idx;
+	rte_atomic_store_explicit(&cnt->cnt_state.data, cnt_state.data, rte_memory_order_relaxed);
+}
+
+static __rte_always_inline void
+mlx5_hws_cnt_get_all(struct mlx5_hws_cnt *cnt, uint32_t *in_used, uint32_t *share,
+		     uint32_t *age_idx)
+{
+	union mlx5_hws_cnt_state cnt_state;
+
+	cnt_state.data = rte_atomic_load_explicit(&cnt->cnt_state.data, rte_memory_order_acquire);
+	if (in_used != NULL)
+		*in_used = cnt_state.in_used;
+	if (share != NULL)
+		*share = cnt_state.share;
+	if (age_idx != NULL)
+		*age_idx = cnt_state.age_idx;
+}
+
 /**
  * Generate Counter id from internal index.
  *
@@ -424,9 +463,10 @@ mlx5_hws_cnt_pool_put(struct mlx5_hws_cnt_pool *cpool, uint32_t *queue,
 
 	hpool = mlx5_hws_cnt_host_pool(cpool);
 	iidx = mlx5_hws_cnt_iidx(hpool, *cnt_id);
-	hpool->pool[iidx].in_used = false;
-	hpool->pool[iidx].query_gen_when_free =
-		rte_atomic_load_explicit(&hpool->query_gen, rte_memory_order_relaxed);
+	mlx5_hws_cnt_set_all(&hpool->pool[iidx], 0, 0, 0);
+	rte_atomic_store_explicit(&hpool->pool[iidx].query_gen_when_free,
+		rte_atomic_load_explicit(&hpool->query_gen, rte_memory_order_relaxed),
+		rte_memory_order_relaxed);
 	if (likely(queue != NULL) && cpool->cfg.host_cpool == NULL)
 		qcache = hpool->cache->qcache[*queue];
 	if (unlikely(qcache == NULL)) {
@@ -480,7 +520,7 @@ mlx5_hws_cnt_pool_put(struct mlx5_hws_cnt_pool *cpool, uint32_t *queue,
  */
 static __rte_always_inline int
 mlx5_hws_cnt_pool_get(struct mlx5_hws_cnt_pool *cpool, uint32_t *queue,
-		      cnt_id_t *cnt_id, uint32_t age_idx)
+		      cnt_id_t *cnt_id, uint32_t age_idx, uint32_t shared)
 {
 	unsigned int ret;
 	struct rte_ring_zc_data zcdc = {0};
@@ -508,10 +548,7 @@ mlx5_hws_cnt_pool_get(struct mlx5_hws_cnt_pool *cpool, uint32_t *queue,
 		__hws_cnt_query_raw(cpool, *cnt_id,
 				    &cpool->pool[iidx].reset.hits,
 				    &cpool->pool[iidx].reset.bytes);
-		cpool->pool[iidx].share = 0;
-		MLX5_ASSERT(!cpool->pool[iidx].in_used);
-		cpool->pool[iidx].in_used = true;
-		cpool->pool[iidx].age_idx = age_idx;
+		mlx5_hws_cnt_set_all(&cpool->pool[iidx], 1, shared, age_idx);
 		return 0;
 	}
 	ret = rte_ring_dequeue_zc_burst_elem_start(qcache, sizeof(cnt_id_t), 1,
@@ -530,8 +567,10 @@ mlx5_hws_cnt_pool_get(struct mlx5_hws_cnt_pool *cpool, uint32_t *queue,
 	/* get one from local cache. */
 	*cnt_id = (*(cnt_id_t *)zcdc.ptr1);
 	iidx = mlx5_hws_cnt_iidx(cpool, *cnt_id);
-	query_gen = cpool->pool[iidx].query_gen_when_free;
-	if (cpool->query_gen == query_gen) { /* counter is waiting to reset. */
+	query_gen = rte_atomic_load_explicit(&cpool->pool[iidx].query_gen_when_free,
+			rte_memory_order_relaxed);
+	/* counter is waiting to reset. */
+	if (rte_atomic_load_explicit(&cpool->query_gen, rte_memory_order_relaxed) == query_gen) {
 		rte_ring_dequeue_zc_elem_finish(qcache, 0);
 		/* write-back counter to reset list. */
 		mlx5_hws_cnt_pool_cache_flush(cpool, *queue);
@@ -549,10 +588,7 @@ mlx5_hws_cnt_pool_get(struct mlx5_hws_cnt_pool *cpool, uint32_t *queue,
 	__hws_cnt_query_raw(cpool, *cnt_id, &cpool->pool[iidx].reset.hits,
 			    &cpool->pool[iidx].reset.bytes);
 	rte_ring_dequeue_zc_elem_finish(qcache, 1);
-	cpool->pool[iidx].share = 0;
-	MLX5_ASSERT(!cpool->pool[iidx].in_used);
-	cpool->pool[iidx].in_used = true;
-	cpool->pool[iidx].age_idx = age_idx;
+	mlx5_hws_cnt_set_all(&cpool->pool[iidx], 1, shared, age_idx);
 	return 0;
 }
 
@@ -611,24 +647,15 @@ mlx5_hws_cnt_shared_get(struct mlx5_hws_cnt_pool *cpool, cnt_id_t *cnt_id,
 			uint32_t age_idx)
 {
 	struct mlx5_hws_cnt_pool *hpool = mlx5_hws_cnt_host_pool(cpool);
-	uint32_t iidx;
-	int ret;
 
-	ret = mlx5_hws_cnt_pool_get(hpool, NULL, cnt_id, age_idx);
-	if (ret != 0)
-		return ret;
-	iidx = mlx5_hws_cnt_iidx(hpool, *cnt_id);
-	hpool->pool[iidx].share = 1;
-	return 0;
+	return mlx5_hws_cnt_pool_get(hpool, NULL, cnt_id, age_idx, 1);
 }
 
 static __rte_always_inline void
 mlx5_hws_cnt_shared_put(struct mlx5_hws_cnt_pool *cpool, cnt_id_t *cnt_id)
 {
 	struct mlx5_hws_cnt_pool *hpool = mlx5_hws_cnt_host_pool(cpool);
-	uint32_t iidx = mlx5_hws_cnt_iidx(hpool, *cnt_id);
 
-	hpool->pool[iidx].share = 0;
 	mlx5_hws_cnt_pool_put(hpool, NULL, cnt_id);
 }
 
@@ -637,8 +664,10 @@ mlx5_hws_cnt_is_shared(struct mlx5_hws_cnt_pool *cpool, cnt_id_t cnt_id)
 {
 	struct mlx5_hws_cnt_pool *hpool = mlx5_hws_cnt_host_pool(cpool);
 	uint32_t iidx = mlx5_hws_cnt_iidx(hpool, cnt_id);
+	uint32_t share;
 
-	return hpool->pool[iidx].share ? true : false;
+	mlx5_hws_cnt_get_all(&hpool->pool[iidx], NULL, &share, NULL);
+	return !!share;
 }
 
 static __rte_always_inline void
@@ -648,8 +677,8 @@ mlx5_hws_cnt_age_set(struct mlx5_hws_cnt_pool *cpool, cnt_id_t cnt_id,
 	struct mlx5_hws_cnt_pool *hpool = mlx5_hws_cnt_host_pool(cpool);
 	uint32_t iidx = mlx5_hws_cnt_iidx(hpool, cnt_id);
 
-	MLX5_ASSERT(hpool->pool[iidx].share);
-	hpool->pool[iidx].age_idx = age_idx;
+	MLX5_ASSERT(hpool->pool[iidx].cnt_state.share);
+	mlx5_hws_cnt_set_age_idx(&hpool->pool[iidx], age_idx);
 }
 
 static __rte_always_inline uint32_t
@@ -657,9 +686,11 @@ mlx5_hws_cnt_age_get(struct mlx5_hws_cnt_pool *cpool, cnt_id_t cnt_id)
 {
 	struct mlx5_hws_cnt_pool *hpool = mlx5_hws_cnt_host_pool(cpool);
 	uint32_t iidx = mlx5_hws_cnt_iidx(hpool, cnt_id);
+	uint32_t age_idx, share;
 
-	MLX5_ASSERT(hpool->pool[iidx].share);
-	return hpool->pool[iidx].age_idx;
+	mlx5_hws_cnt_get_all(&hpool->pool[iidx], NULL, &share, &age_idx);
+	MLX5_ASSERT(share);
+	return age_idx;
 }
 
 static __rte_always_inline cnt_id_t
-- 
2.27.0



More information about the dev mailing list