|SUCCESS| [v3] mempool: improve cache behaviour and performance

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Fri Apr 10 00:48:49 CEST 2026


Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/162964

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-09

git sha: 20260409110558.961159-1-mb at smartsharesystems.com
Author: Morten Brorup <mb at smartsharesystems.com>
Subject: [PATCH v3] mempool: improve cache behaviour and performance

This patch refactors the mempool cache to eliminate some unexpected
behaviour and reduce the mempool cache miss rate.

> diff --git a/drivers/net/intel/common/tx.h b/drivers/net/intel/common/tx.h
> index 283bd58d5d..eeb0980d40 100644
> --- a/drivers/net/intel/common/tx.h
> +++ b/drivers/net/intel/common/tx.h

[ ... ]

> @@ -284,43 +284,13 @@ ci_tx_free_bufs_vec(struct ci_tx_queue *txq, ci_desc_done_fn desc_done, bool ctx
>  			txq->fast_free_mp :
>  			(txq->fast_free_mp = txep[0].mbuf->pool);
>  
> -	if (mp != NULL && (n & 31) == 0) {
> -		void **cache_objs;
> -		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, rte_lcore_id());
> -
> -		if (cache == NULL)
> -			goto normal;
> -
> -		cache_objs = &cache->objs[cache->len];
> -
> -		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> -			goto done;
> -		}
> -
> -		/* The cache follows the following algorithm
> -		 *   1. Add the objects to the cache
> -		 *   2. Anything greater than the cache min value (if it
> -		 *   crosses the cache flush threshold) is flushed to the ring.
> -		 */
> -		/* Add elements back into the cache */
> -		uint32_t copied = 0;
> -		/* n is multiple of 32 */
> -		while (copied < n) {
> -			memcpy(&cache_objs[copied], &txep[copied], 32 * sizeof(void *));
> -			copied += 32;
> -		}
> -		cache->len += n;
> -
> -		if (cache->len >= cache->flushthresh) {
> -			rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> -					cache->len - cache->size);
> -			cache->len = cache->size;
> -		}
> +	if (mp != NULL) {
> +		static_assert(sizeof(*txep) == sizeof(struct rte_mbuf *),
> +				"txep array is not similar to an array of rte_mbuf pointers");
> +		rte_mbuf_raw_free_bulk(mp, (void *)txep, n);
>  		goto done;
>  	}

Does this change preserve the mbuf history marking mentioned in the
commit log?  With the open-coded cache handling removed, the tracing
would only occur if rte_mbuf_raw_free_bulk() performs it internally.
Can you confirm that the history marking still happens when this path
is taken in ci_tx_free_bufs_vec()?

> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> index 9af275cd9d..59a6c22e98 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c
> @@ -148,14 +148,20 @@ idpf_singleq_rearm(struct idpf_rx_queue *rxq)
>  	/* Can this be satisfied from the cache? */
>  	if (cache->len < IDPF_RXQ_REARM_THRESH) {
>  		/* No. Backfill the cache first, and then fill from it */
> -		uint32_t req = IDPF_RXQ_REARM_THRESH + (cache->size -
> -							cache->len);
>  
> -		/* How many do we require i.e. number to fill the cache + the request */
> +		/* Backfill would exceed the cache bounce buffer limit? */
> +		__rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +		if (unlikely(IDPF_RXQ_REARM_THRESH > cache->size / 2)) {
> +			idpf_singleq_rearm_common(rxq);
> +			return;
> +		}
> +
> +		/* Backfill the cache from the backend; fetch (size / 2) objects. */
> +		__rte_assume(cache->len < cache->size / 2);
>  		int ret = rte_mempool_ops_dequeue_bulk
> -				(rxq->mp, &cache->objs[cache->len], req);
> +				(rxq->mp, &cache->objs[cache->len], cache->size / 2);
>  		if (ret == 0) {
> -			cache->len += req;
> +			cache->len += cache->size / 2;

Does this code potentially overflow cache->objs[] if cache->len is near
cache->size when this path is taken?  The new logic fetches (size / 2)
objects, but the original code fetched exactly what was needed.  Can the
cache be in a state where cache->len + cache->size / 2 exceeds the
actual array bounds?

> @@ -585,8 +608,8 @@ idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq)
>  							 dma_addr0);
>  				}
>  			}
> -		rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> -				   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> +			rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> +					   IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);

This indentation change appears unrelated to the stated objectives of
the patch.  Should this whitespace adjustment be separated into a
cleanup commit?

> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 3042d94c14..805b52cc58 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c

[ ... ]

> @@ -850,7 +840,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
>  
>  	/* asked cache too big */
>  	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
> -	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
> +	    cache_size > n) {
>  		rte_errno = EINVAL;
>  		return NULL;
>  	}

The old check used CALC_CACHE_FLUSHTHRESH(cache_size), which expands to
(cache_size * 3) / 2.  Does this change now allow larger cache_size
values through validation that would previously have been rejected?  In
particular, can an application now request a cache_size between
(n * 2/3) and n that was rejected before?  Does this affect backward
compatibility?

> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 2e54fc4466..aa2d51bbd5 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h

[ ... ]

> @@ -1377,7 +1384,7 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>   *   A pointer to a mempool cache structure. May be NULL if not needed.
>   */
>  static __rte_always_inline void
> -rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> +rte_mempool_do_generic_put(struct rte_mempool *mp, void * const * __rte_restrict obj_table,

Does adding __rte_restrict to obj_table affect compilation with older
compilers?  The restrict qualifier places aliasing obligations on the
caller; does this modification guarantee that obj_table never aliases
cache->objs[] when the memcpy operations occur?

> @@ -1390,24 +1397,27 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
>  	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>  
> -	__rte_assume(cache->flushthresh <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> -	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> -	__rte_assume(cache->len <= cache->flushthresh);
> -	if (likely(cache->len + n <= cache->flushthresh)) {
> +	__rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> +	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> +	__rte_assume(cache->len <= cache->size);
> +	if (likely(cache->len + n <= cache->size)) {
>  		/* Sufficient room in the cache for the objects. */
>  		cache_objs = &cache->objs[cache->len];
>  		cache->len += n;
> -	} else if (n <= cache->flushthresh) {
> +	} else if (n <= cache->size / 2) {
>  		/*
> -		 * The cache is big enough for the objects, but - as detected by
> -		 * the comparison above - has insufficient room for them.
> -		 * Flush the cache to make room for the objects.
> +		 * The number of objects is within the cache bounce buffer limit,
> +		 * but - as detected by the comparison above - the cache has
> +		 * insufficient room for them.
> +		 * Flush the cache to the backend to make room for the objects;
> +		 * flush (size / 2) objects.
>  		 */
> -		cache_objs = &cache->objs[0];
> -		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> -		cache->len = n;
> +		__rte_assume(cache->len > cache->size / 2);
> +		cache_objs = &cache->objs[cache->len - cache->size / 2];
> +		cache->len = cache->len - cache->size / 2 + n;
> +		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->size / 2);

After this flush operation, does cache->len correctly represent the
number of valid objects in cache->objs[]?  The calculation sets
cache->len before copying the n objects, but if cache->len is near
cache->size beforehand, can this result in cache->len > cache->size
after the flush?

> @@ -1524,7 +1534,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  	/* The cache is a stack, so copy will be in reverse order. */
>  	cache_objs = &cache->objs[cache->len];
>  
> -	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> +	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE);
>  	if (likely(n <= cache->len)) {
>  		/* The entire request can be satisfied from the cache. */
>  		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> @@ -1548,13 +1558,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  	for (index = 0; index < len; index++)
>  		*obj_table++ = *--cache_objs;
>  
> -	/* Dequeue below would overflow mem allocated for cache? */
> -	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> +	/* Dequeue below would exceed the cache bounce buffer limit? */
> +	__rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +	if (unlikely(remaining > cache->size / 2))
>  		goto driver_dequeue;
>  
> -	/* Fill the cache from the backend; fetch size + remaining objects. */
> -	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> -			cache->size + remaining);
> +	/* Fill the cache from the backend; fetch (size / 2) objects. */
> +	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache->size / 2);

Does this dequeue operation correctly account for the remaining objects
still in cache?  The old code fetched cache->size + remaining, but the
new code fetches only cache->size / 2.  Does this satisfy the original
request when cache->len is nonzero and remaining > 0?

> @@ -1568,10 +1578,11 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
>  	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>  
> -	__rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> -	__rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> -	cache_objs = &cache->objs[cache->size + remaining];
> -	cache->len = cache->size;
> +	__rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +	__rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +	__rte_assume(remaining <= cache->size / 2);
> +	cache_objs = &cache->objs[cache->size / 2];
> +	cache->len = cache->size / 2 - remaining;

After copying the remaining objects from the freshly fetched batch, does
cache->len accurately reflect the number of valid objects left in
cache->objs[]?  If remaining is large, can (cache->size / 2 - remaining)
become negative, which — since cache->len is stored as an unsigned
type — would wrap around to a very large value?


More information about the test-report mailing list