[PATCH v2 2/3] ring: use GCC builtin as alternative to rte_atomic32
Konstantin Ananyev
konstantin.ananyev at huawei.com
Sat Jun 6 16:02:52 CEST 2026
> This patch replaces use of the deprecated rte_atomic32 code with
> GCC builtin atomic operations.
>
> Although it would be preferable to use C11 version on all architectures,
> there is a performance loss if we do it that way:
>
> Measured on i9-13900H, two physical cores MP/MC bulk n=128, 10 runs:
> with C11 builtin: 5.86 cycles/elem
> with __sync builtin: 5.36 cycles/elem (-9.4%)
>
> The C11 __atomic_compare_exchange_n builtin writes the actual value back
> to its expected pointer on failure. On x86 this forces GCC
> to emit extra instructions on the critical path between the CAS
> and the success-test.
>
> __sync_bool_compare_and_swap returns a plain bool with no pointer
> writeback, allowing GCC to emit tighter code.
>
> Signed-off-by: Stephen Hemminger <stephen at networkplumber.org>
> ---
> lib/ring/meson.build | 2 +-
> lib/ring/rte_ring_elem_pvt.h | 2 +-
> ..._ring_generic_pvt.h => rte_ring_gcc_pvt.h} | 33 +++++++++++--------
> 3 files changed, 21 insertions(+), 16 deletions(-)
> rename lib/ring/{rte_ring_generic_pvt.h => rte_ring_gcc_pvt.h} (88%)
>
> diff --git a/lib/ring/meson.build b/lib/ring/meson.build
> index 21f2c12989..2ba160b178 100644
> --- a/lib/ring/meson.build
> +++ b/lib/ring/meson.build
> @@ -9,7 +9,7 @@ indirect_headers += files (
> 'rte_ring_elem.h',
> 'rte_ring_elem_pvt.h',
> 'rte_ring_c11_pvt.h',
> - 'rte_ring_generic_pvt.h',
> + 'rte_ring_gcc_pvt.h',
> 'rte_ring_hts.h',
> 'rte_ring_hts_elem_pvt.h',
> 'rte_ring_peek.h',
> diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
> index a0fdec9812..9a0170c4f0 100644
> --- a/lib/ring/rte_ring_elem_pvt.h
> +++ b/lib/ring/rte_ring_elem_pvt.h
> @@ -309,7 +309,7 @@ __rte_ring_dequeue_elems(struct rte_ring *r, uint32_t cons_head,
> #ifdef RTE_USE_C11_MEM_MODEL
> #include "rte_ring_c11_pvt.h"
> #else
> -#include "rte_ring_generic_pvt.h"
> +#include "rte_ring_gcc_pvt.h"
> #endif
>
> /**
> diff --git a/lib/ring/rte_ring_generic_pvt.h b/lib/ring/rte_ring_gcc_pvt.h
> similarity index 88%
> rename from lib/ring/rte_ring_generic_pvt.h
> rename to lib/ring/rte_ring_gcc_pvt.h
> index c044b0824f..68ab1355e8 100644
> --- a/lib/ring/rte_ring_generic_pvt.h
> +++ b/lib/ring/rte_ring_gcc_pvt.h
> @@ -7,11 +7,11 @@
> * Used as BSD-3 Licensed with permission from Kip Macy.
> */
>
> -#ifndef _RTE_RING_GENERIC_PVT_H_
> -#define _RTE_RING_GENERIC_PVT_H_
> +#ifndef _RTE_RING_GCC_PVT_H_
> +#define _RTE_RING_GCC_PVT_H_
>
> /**
> - * @file rte_ring_generic_pvt.h
> + * @file rte_ring_gcc_pvt.h
> * It is not recommended to include this file directly,
> * include <rte_ring.h> instead.
> * Contains internal helper functions for MP/SP and MC/SC ring modes.
> @@ -25,10 +25,8 @@ static __rte_always_inline void
> __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
> uint32_t new_val, uint32_t single, uint32_t enqueue)
> {
> - if (enqueue)
> - rte_smp_wmb();
> - else
> - rte_smp_rmb();
> + RTE_SET_USED(enqueue);
> +
> /*
> * If there are other enqueues/dequeues in progress that preceded us,
> * we need to wait for them to complete
> @@ -37,7 +35,12 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
> rte_wait_until_equal_32((volatile uint32_t *)(uintptr_t)&ht->tail,
> old_val,
> rte_memory_order_relaxed);
>
> - ht->tail = new_val;
> + /*
> + * R0: Establishes a synchronizing edge with load-acquire of tail at A1.
> + * Ensures that memory effects by this thread on ring elements array
> + * is observed by a different thread of the other type.
> + */
> + __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE);
> }
>
> /**
> @@ -73,7 +76,7 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
> uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
> {
> unsigned int max = n;
> - int success;
> + bool success;
>
> do {
> /* Reset n to the initial burst count */
> @@ -81,10 +84,10 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
>
> *old_head = d->head;
>
> - /* add rmb barrier to avoid load/load reorder in weak
> + /* add fence to avoid load/load reorder in weak
> * memory model. It is noop on x86
> */
> - rte_smp_rmb();
> + __atomic_thread_fence(__ATOMIC_ACQUIRE);
>
> /*
> * The subtraction is done between two unsigned 32bits value
> @@ -103,10 +106,12 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
> return 0;
>
> *new_head = *old_head + n;
> - success = rte_atomic32_cmpset(
> +
> + success = __sync_bool_compare_and_swap(
> (uint32_t *)(uintptr_t)&d->head,
> *old_head, *new_head);
> - } while (unlikely(success == 0));
> + } while (unlikely(!success));
> +
> return n;
> }
>
> @@ -169,4 +174,4 @@ __rte_ring_headtail_move_head_st(struct rte_ring_headtail *d,
> return n;
> }
>
> -#endif /* _RTE_RING_GENERIC_PVT_H_ */
> +#endif /* _RTE_RING_GCC_PVT_H_ */
> --
Acked-by: Konstantin Ananyev <konstantin.ananyev at huawei.com>
Tested-by: Konstantin Ananyev <konstantin.ananyev at huawei.com>
> 2.53.0
More information about the dev mailing list