[PATCH v2 2/3] ring: use GCC builtin as alternative to rte_atomic32

Konstantin Ananyev konstantin.ananyev at huawei.com
Sat Jun 6 16:02:52 CEST 2026


> This patch replaces use of the deprecated rte_atomic32 code with
> GCC builtin atomic operations.
> 
> Although it would be preferable to use the C11 version on all architectures,
> doing so incurs a performance loss:
> 
> Measured on i9-13900H, two physical cores MP/MC bulk n=128, 10 runs:
>   with C11 builtin:           5.86 cycles/elem
>   with __sync builtin:        5.36 cycles/elem  (-9.4%)
> 
> The C11 __atomic_compare_exchange_n builtin writes the observed value back
> through its 'expected' pointer on failure. On x86 this forces GCC
> to emit extra instructions on the critical path between the CAS
> and the success test.
> 
> __sync_bool_compare_and_swap returns a plain bool with no pointer
> writeback, allowing GCC to emit tighter code.
> 
> Signed-off-by: Stephen Hemminger <stephen at networkplumber.org>
> ---
>  lib/ring/meson.build                          |  2 +-
>  lib/ring/rte_ring_elem_pvt.h                  |  2 +-
>  ..._ring_generic_pvt.h => rte_ring_gcc_pvt.h} | 33 +++++++++++--------
>  3 files changed, 21 insertions(+), 16 deletions(-)
>  rename lib/ring/{rte_ring_generic_pvt.h => rte_ring_gcc_pvt.h} (88%)
> 
> diff --git a/lib/ring/meson.build b/lib/ring/meson.build
> index 21f2c12989..2ba160b178 100644
> --- a/lib/ring/meson.build
> +++ b/lib/ring/meson.build
> @@ -9,7 +9,7 @@ indirect_headers += files (
>          'rte_ring_elem.h',
>          'rte_ring_elem_pvt.h',
>          'rte_ring_c11_pvt.h',
> -        'rte_ring_generic_pvt.h',
> +        'rte_ring_gcc_pvt.h',
>          'rte_ring_hts.h',
>          'rte_ring_hts_elem_pvt.h',
>          'rte_ring_peek.h',
> diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
> index a0fdec9812..9a0170c4f0 100644
> --- a/lib/ring/rte_ring_elem_pvt.h
> +++ b/lib/ring/rte_ring_elem_pvt.h
> @@ -309,7 +309,7 @@ __rte_ring_dequeue_elems(struct rte_ring *r, uint32_t cons_head,
>  #ifdef RTE_USE_C11_MEM_MODEL
>  #include "rte_ring_c11_pvt.h"
>  #else
> -#include "rte_ring_generic_pvt.h"
> +#include "rte_ring_gcc_pvt.h"
>  #endif
> 
>  /**
> diff --git a/lib/ring/rte_ring_generic_pvt.h b/lib/ring/rte_ring_gcc_pvt.h
> similarity index 88%
> rename from lib/ring/rte_ring_generic_pvt.h
> rename to lib/ring/rte_ring_gcc_pvt.h
> index c044b0824f..68ab1355e8 100644
> --- a/lib/ring/rte_ring_generic_pvt.h
> +++ b/lib/ring/rte_ring_gcc_pvt.h
> @@ -7,11 +7,11 @@
>   * Used as BSD-3 Licensed with permission from Kip Macy.
>   */
> 
> -#ifndef _RTE_RING_GENERIC_PVT_H_
> -#define _RTE_RING_GENERIC_PVT_H_
> +#ifndef _RTE_RING_GCC_PVT_H_
> +#define _RTE_RING_GCC_PVT_H_
> 
>  /**
> - * @file rte_ring_generic_pvt.h
> + * @file rte_ring_gcc_pvt.h
>   * It is not recommended to include this file directly,
>   * include <rte_ring.h> instead.
>   * Contains internal helper functions for MP/SP and MC/SC ring modes.
> @@ -25,10 +25,8 @@ static __rte_always_inline void
>  __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
>  		uint32_t new_val, uint32_t single, uint32_t enqueue)
>  {
> -	if (enqueue)
> -		rte_smp_wmb();
> -	else
> -		rte_smp_rmb();
> +	RTE_SET_USED(enqueue);
> +
>  	/*
>  	 * If there are other enqueues/dequeues in progress that preceded us,
>  	 * we need to wait for them to complete
> @@ -37,7 +35,12 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
>  		rte_wait_until_equal_32((volatile uint32_t *)(uintptr_t)&ht->tail, old_val,
>  			rte_memory_order_relaxed);
> 
> -	ht->tail = new_val;
> +	/*
> +	 * R0: Establishes a synchronizing edge with load-acquire of tail at A1.
> +	 * Ensures that this thread's memory effects on the ring elements array
> +	 * are observed by a different thread of the other type.
> +	 */
> +	__atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE);
>  }
> 
>  /**
> @@ -73,7 +76,7 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
>  		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
>  {
>  	unsigned int max = n;
> -	int success;
> +	bool success;
> 
>  	do {
>  		/* Reset n to the initial burst count */
> @@ -81,10 +84,10 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
> 
>  		*old_head = d->head;
> 
> -		/* add rmb barrier to avoid load/load reorder in weak
> +		/* add fence to avoid load/load reorder in weak
>  		 * memory model. It is noop on x86
>  		 */
> -		rte_smp_rmb();
> +		__atomic_thread_fence(__ATOMIC_ACQUIRE);
> 
>  		/*
>  		 *  The subtraction is done between two unsigned 32bits value
> @@ -103,10 +106,12 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
>  			return 0;
> 
>  		*new_head = *old_head + n;
> -		success = rte_atomic32_cmpset(
> +
> +		success = __sync_bool_compare_and_swap(
>  				(uint32_t *)(uintptr_t)&d->head,
>  				*old_head, *new_head);
> -	} while (unlikely(success == 0));
> +	} while (unlikely(!success));
> +
>  	return n;
>  }
> 
> @@ -169,4 +174,4 @@ __rte_ring_headtail_move_head_st(struct rte_ring_headtail *d,
>  	return n;
>  }
> 
> -#endif /* _RTE_RING_GENERIC_PVT_H_ */
> +#endif /* _RTE_RING_GCC_PVT_H_ */
> --

Acked-by: Konstantin Ananyev <konstantin.ananyev at huawei.com>
Tested-by: Konstantin Ananyev <konstantin.ananyev at huawei.com>

> 2.53.0



More information about the dev mailing list