|SUCCESS| mempool: improve cache behaviour and performance
dpdklab at iol.unh.edu
Wed Apr 8 22:53:00 CEST 2026
Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/162945
_AI Code Review_
=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-08
Reviewing commit 904381 from Morten Brorup:
Subject: mempool: improve cache behaviour and performance
This patch refactors the mempool cache to eliminate unexpected behaviour
and reduce the cache miss rate by changing the flush/refill thresholds and
fixing the cache size calculations.
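As context for the hunks below, here is how I read the new policy, as a
simplified standalone model (the names model_cache, model_put and
backend_enqueue are mine, not DPDK's; the real logic lives in
rte_mempool_do_generic_put()/_get()): the cache holds at most size
objects, a put that would overflow flushes exactly size / 2 objects to
the backend, and a get that misses refills exactly size / 2 objects,
replacing the old 1.5 * size flush threshold and the old
(size + remaining) refill that required the 2x objs[] array.

/* Simplified model of the patched put path; illustrative sketch only. */
#include <string.h>

struct model_cache {
	unsigned int size;	/* capacity; <= RTE_MEMPOOL_CACHE_MAX_SIZE */
	unsigned int len;	/* current fill level; <= size */
	void *objs[512];	/* stands in for objs[RTE_MEMPOOL_CACHE_MAX_SIZE] */
};

static void
model_put(struct model_cache *c, void * const *table, unsigned int n,
		void (*backend_enqueue)(void * const *, unsigned int))
{
	if (c->len + n <= c->size) {
		/* fast path: the objects fit in the cache as-is */
	} else if (n <= c->size / 2) {
		/* flush the top size/2 objects to make room */
		backend_enqueue(&c->objs[c->len - c->size / 2], c->size / 2);
		c->len -= c->size / 2;
	} else {
		/* request too big for the cache: bypass it entirely */
		backend_enqueue(table, n);
		return;
	}
	memcpy(&c->objs[c->len], table, n * sizeof(void *));
	c->len += n;
}

/* The get path mirrors this: a miss refills exactly size/2 objects from
 * the backend instead of the previous size + remaining. */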
> diff --git a/drivers/net/intel/common/tx.h b/drivers/net/intel/common/tx.h
> index 283bd58d5d..c5114f756d 100644
> --- a/drivers/net/intel/common/tx.h
> +++ b/drivers/net/intel/common/tx.h
> @@ -284,43 +284,11 @@ ci_tx_free_bufs_vec(struct ci_tx_queue *txq, ci_desc_done_fn desc_done, bool ctx
> txq->fast_free_mp :
> (txq->fast_free_mp = txep[0].mbuf->pool);
>
> - if (mp != NULL && (n & 31) == 0) {
The removed condition took the cached fast path only when n was a
multiple of 32. Does the new code still bypass the cache for requests
that are not multiples of 32 anywhere it shouldn't?
> - void **cache_objs;
> - struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, rte_lcore_id());
[ ... ]
> + if (mp != NULL) {
> + rte_mbuf_raw_free_bulk(mp, (void *)txep, n);
> goto done;
> }
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> index 9af275cd9d..e5eb56552f 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> @@ -148,14 +148,19 @@ idpf_singleq_rearm(struct idpf_rx_queue *rxq)
> /* Can this be satisfied from the cache? */
> if (cache->len < IDPF_RXQ_REARM_THRESH) {
> /* No. Backfill the cache first, and then fill from it */
> - uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
> - cache->len);
>
> - /* How many do we require i.e. number to fill the cache + the request */
> + /* Backfill would exceed the cache bounce buffer limit? */
> + __rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> + if (unlikely(IDPF_RXQ_REARM_THRESH > cache->size / 2)) {
> + idpf_singleq_rearm_common(rxq);
> + return;
> + }
> +
> + /* Backfill the cache from the backend; fetch (size / 2) objects. */
> int ret = rte_mempool_ops_dequeue_bulk
> - (rxq->mp, &cache->objs[cache->len], req);
> + (rxq->mp, &cache->objs[cache->len], cache->size / 2);
Does this code leave the cache in an invalid state on failure?
In idpf_singleq_rearm()->rte_mempool_ops_dequeue_bulk(), when ret != 0,
cache->len is not updated but objects may have been partially written to
cache->objs[].
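One defensive reading, for what it's worth (a sketch of the reasoning,
not a verdict): as long as cache->len is only advanced when ret == 0,
slots above cache->len are never consumed, so a partial write into
cache->objs[] would simply be overwritten by the next successful refill.
That holds only if the driver's dequeue_bulk is strictly all-or-nothing,
i.e. never reports failure after handing out objects that must not be
reused; that is the part worth confirming for all mempool drivers.

	/* Annotated failure path (assumption: consumers only ever read
	 * cache->objs[0 .. cache->len - 1]): */
	int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
			&cache->objs[cache->len], cache->size / 2);
	if (ret == 0) {
		cache->len += cache->size / 2;	/* publish only on success */
	} else {
		/* cache->len unchanged: any partially written slots sit
		 * above the published length and are never read. Unsafe
		 * only if the backend considers those objects gone. */
	}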
> if (ret == 0) {
> - cache->len += req;
> + cache->len += cache->size / 2;
> } else {
> if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> rxq->nb_rx_desc) {
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 3042d94c14..805b52cc58 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
[ ... ]
> @@ -757,13 +752,8 @@ rte_mempool_free(struct rte_mempool *mp)
> static void
> mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
> {
> - /* Check that cache have enough space for flush threshold */
> - RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
> - RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
> - RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
> -
> cache->size = size;
> - cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
> + cache->flushthresh = size; /* Obsolete; for API/ABI compatibility purposes only */
> cache->len = 0;
> }
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 2e54fc4466..dafe98d1c2 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -89,7 +89,7 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
> */
> struct __rte_cache_aligned rte_mempool_cache {
> uint32_t size; /**< Size of the cache */
> - uint32_t flushthresh; /**< Threshold before we flush excess elements */
> + uint32_t flushthresh; /**< Obsolete; for API/ABI compatibility purposes only */
> uint32_t len; /**< Current cache count */
[ ... ]
> - /**
> - * Cache objects
> - *
> - * Cache is allocated to this size to allow it to overflow in certain
> - * cases to avoid needless emptying of cache.
> - */
> - alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
> + /** Cache objects */
> + alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
> + /** Unused; for ABI compatibility purposes only */
> + void *unused_objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
> + /* Note: Remember to add an RTE_CACHE_GUARD here, if removing unused_objs. */
> };
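Since objs[] shrinks from RTE_MEMPOOL_CACHE_MAX_SIZE * 2 entries to
RTE_MEMPOOL_CACHE_MAX_SIZE plus an unused_objs[] pad, it might be worth
pinning the layout at build time so the pad cannot be dropped or resized
by accident. A hypothetical check (not in this patch), reusing the
RTE_BUILD_BUG_ON/RTE_SIZEOF_FIELD pattern of the check this patch
removes from mempool_cache_init():

	/* Hypothetical: the new objs[] + unused_objs[] pair must occupy
	 * exactly the space of the old objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2],
	 * or the ABI silently changes. */
	RTE_BUILD_BUG_ON(RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) +
		RTE_SIZEOF_FIELD(struct rte_mempool_cache, unused_objs) !=
		sizeof(void *) * RTE_MEMPOOL_CACHE_MAX_SIZE * 2);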
[ ... ]
> @@ -1390,24 +1393,26 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>
> - __rte_assume(cache->flushthresh <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> - __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> - __rte_assume(cache->len <= cache->flushthresh);
> - if (likely(cache->len + n <= cache->flushthresh)) {
> + __rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> + __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> + __rte_assume(cache->len <= cache->size);
> + if (likely(cache->len + n <= cache->size)) {
> /* Sufficient room in the cache for the objects. */
> cache_objs = &cache->objs[cache->len];
> cache->len += n;
> - } else if (n <= cache->flushthresh) {
> + } else if (n <= cache->size / 2) {
> /*
> - * The cache is big enough for the objects, but - as detected by
> - * the comparison above - has insufficient room for them.
> - * Flush the cache to make room for the objects.
> + * The number of objects is within the cache bounce buffer limit,
> + * but - as detected by the comparison above - the cache has
> + * insufficient room for them.
> + * Flush the cache to the backend to make room for the objects;
> + * flush (size / 2) objects.
> */
> - cache_objs = &cache->objs[0];
> - rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> - cache->len = n;
> + cache_objs = &cache->objs[cache->len - cache->size / 2];
> + cache->len = cache->len - cache->size / 2 + n;
Does this code overflow objs[] when (cache->len - cache->size / 2 + n)
exceeds RTE_MEMPOOL_CACHE_MAX_SIZE?
In rte_mempool_do_generic_put(), the condition (cache->len + n <=
cache->size) is false, and (n <= cache->size / 2) is true. Does this code
guarantee that (cache->len - cache->size / 2 + n) will not exceed the objs
array bounds? What if cache->len is close to cache->size?
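To make the bounds question concrete, here is a standalone brute-force
check of this branch's arithmetic under the patch's own assumed
invariants (cache->len <= cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE,
per the __rte_assume()s above); it also covers the companion question of
whether the (cache->len - cache->size / 2) index can underflow. This is
a sketch to aid review, not a substitute for confirming the invariants
hold on every path:

/* Brute-force check; 512 stands in for RTE_MEMPOOL_CACHE_MAX_SIZE.
 * Compile with any C compiler and run. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	for (unsigned int size = 1; size <= 512; size++)
		for (unsigned int len = 0; len <= size; len++)
			for (unsigned int n = 1; n <= size / 2; n++) {
				if (len + n <= size)
					continue;	/* fast path, no flush */
				/* the else-if branch under review */
				assert(len >= size / 2);		/* index cannot underflow */
				assert(len - size / 2 + n <= size);	/* new len stays inside objs[] */
			}
	printf("put-path bounds hold\n");
	return 0;
}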
> + rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->size / 2);
> } else {
> - /* The request itself is too big for the cache. */
> + /* The request itself is too big. */
> goto driver_enqueue_stats_incremented;
> }
>
[ ... ]
> @@ -1524,7 +1529,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> /* The cache is a stack, so copy will be in reverse order. */
> cache_objs = &cache->objs[cache->len];
>
> - __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> + __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> if (likely(n <= cache->len)) {
> /* The entire request can be satisfied from the cache. */
> RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> @@ -1548,13 +1553,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> for (index = 0; index < len; index++)
> *obj_table++ = *--cache_objs;
>
> - /* Dequeue below would overflow mem allocated for cache? */
> - if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> + /* Dequeue below would exceed the cache bounce buffer limit? */
> + __rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> + if (unlikely(remaining > cache->size / 2))
> goto driver_dequeue;
>
> - /* Fill the cache from the backend; fetch size + remaining objects. */
> - ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> - cache->size + remaining);
> + /* Fill the cache from the backend; fetch (size / 2) objects. */
> + ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache->size / 2);
> if (unlikely(ret < 0)) {
> /*
> * We are buffer constrained, and not able to fetch all that.
> @@ -1568,10 +1573,11 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>
> - __rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> - __rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> - cache_objs = &cache->objs[cache->size + remaining];
> - cache->len = cache->size;
> + __rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> + __rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> + __rte_assume(remaining <= cache->size / 2);
> + cache_objs = &cache->objs[cache->size / 2];
> + cache->len = cache->size / 2 - remaining;
Does this code underflow cache->len when remaining exceeds cache->size / 2?
In rte_mempool_do_generic_get(), after successfully fetching
(cache->size / 2) objects, the code sets (cache->len = cache->size / 2 -
remaining). Can remaining be larger than cache->size / 2, causing
cache->len to become negative (wrapping around for unsigned)?
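For this branch the load-bearing piece is the guard a few lines up: once
(remaining > cache->size / 2) takes the driver_dequeue path, every
remaining value reaching this assignment satisfies
remaining <= cache->size / 2, so the subtraction cannot wrap, unless
that guard is later removed or reordered. A trivial standalone check of
the same arithmetic (illustrative only; 512 stands in for
RTE_MEMPOOL_CACHE_MAX_SIZE):

#include <assert.h>

int main(void)
{
	for (unsigned int size = 1; size <= 512; size++)
		for (unsigned int remaining = 0; remaining <= 512; remaining++) {
			if (remaining > size / 2)
				continue;	/* guard: goto driver_dequeue */
			assert(size / 2 >= remaining);	/* cache->len cannot wrap */
		}
	return 0;
}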
> for (index = 0; index < remaining; index++)
> *obj_table++ = *--cache_objs;
>