[RFC PATCH v3 2/2] mempool: de-inline get/put unlikely code paths

Morten Brørup mb at smartsharesystems.com
Fri Mar 13 16:27:53 CET 2026


> From: Andrew Rybchenko [mailto:andrew.rybchenko at oktetlabs.ru]
> Sent: Tuesday, 17 February 2026 07.37
> 
> On 2/16/26 6:23 PM, Morten Brørup wrote:
> > De-inline unlikely code paths, for smaller footprint.
> 
> The idea is interesting and makes sense to me. But could you share
> performance figures to know the impact.
> 
> >
> > Signed-off-by: Morten Brørup <mb at smartsharesystems.com>
> > ---
> > v3:
> > * New functions are called from inline code, so make them
> experimental
> >    instead of internal.
> > v2:
> > * Removed review functions.
> > * Changed #if 0 to #if AVOID_RTE_MEMCPY.
> > ---
> >   lib/mempool/rte_mempool.c | 112 ++++++++++++++++++++
> >   lib/mempool/rte_mempool.h | 212 ++++++++++++++++++++---------------
> ---
> >   2 files changed, 223 insertions(+), 101 deletions(-)
> >
> > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > index 3042d94c14..30dce3a2fd 100644
> > --- a/lib/mempool/rte_mempool.c
> > +++ b/lib/mempool/rte_mempool.c
> > @@ -1016,6 +1016,118 @@ rte_mempool_create(const char *name, unsigned
> n, unsigned elt_size,
> >   	return NULL;
> >   }
> >
> > +/* internal */
> > +RTE_EXPORT_EXPERIMENTAL_SYMBOL(_rte_mempool_do_generic_put_more,
> 26.03)
> > +void
> > +_rte_mempool_do_generic_put_more(struct rte_mempool *mp, void *
> const *obj_table,
> > +		unsigned int n, struct rte_mempool_cache *cache)
> > +{
> 
> I'd add comments which explain why stats are not updated by the
> function. It is the drawback of the solution when at least
> comments should be added to make it clear. Stats update would
> be very easy to lose in the case of future changes.
> 
> > +	__rte_assume(cache->flushthresh <= RTE_MEMPOOL_CACHE_MAX_SIZE *
> 2);
> > +	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> > +	__rte_assume(cache->len <= cache->flushthresh);
> > +	__rte_assume(cache->len + n > cache->flushthresh);
> > +	if (likely(n <= cache->flushthresh)) {
> > +		uint32_t len;
> > +		void **cache_objs;
> > +
> > +		/*
> > +		 * The cache is big enough for the objects, but - as
> detected by
> > +		 * rte_mempool_do_generic_put() - has insufficient room for
> them.
> > +		 * Flush the cache to make room for the objects.
> > +		 */
> > +		len = cache->len;
> > +		cache_objs = &cache->objs[0];
> > +		cache->len = n;
> > +		rte_mempool_ops_enqueue_bulk(mp, cache_objs, len);
> > +
> > +		/* Add the objects to the cache. */
> > +#ifdef AVOID_RTE_MEMCPY /* Simple alternative to rte_memcpy(). */
> 
> I'd not mix introduction of AVOID_RTE_MEMCPY and other goals of the
> patch. If AVOID_RTE_MEMCPY is really useful, it could be added
> separately and appropriately motivated.
> 
> > +		for (uint32_t index = 0; index < n; index++)
> > +			*cache_objs++ = *obj_table++;
> > +#else
> > +		rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> > +#endif
> > +
> > +		return;
> > +	}
> > +
> > +	/* The request itself is too big for the cache. Push objects
> directly to the backend. */
> > +	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> > +}
> > +
> > +/* internal */
> > +RTE_EXPORT_EXPERIMENTAL_SYMBOL(_rte_mempool_do_generic_get_more,
> 26.03)
> > +int
> > +_rte_mempool_do_generic_get_more(struct rte_mempool *mp, void
> **obj_table,
> > +		unsigned int n, struct rte_mempool_cache *cache)
> > +{
> > +	int ret;
> > +	unsigned int remaining;
> > +	uint32_t index, len;
> > +	void **cache_objs;
> > +
> > +	/* Use the cache as much as we have to return hot objects first.
> */
> > +	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> > +	len = cache->len;
> > +	remaining = n - len;
> > +	cache_objs = &cache->objs[len];
> > +	cache->len = 0;
> > +	for (index = 0; index < len; index++)
> > +		*obj_table++ = *--cache_objs;
> > +
> > +	/* Dequeue below would overflow mem allocated for cache? */
> > +	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> > +		goto driver_dequeue;
> > +
> > +	/* Fill the cache from the backend; fetch size + remaining
> objects. */
> > +	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> > +			cache->size + remaining);
> > +	if (unlikely(ret < 0)) {
> > +		/*
> > +		 * We are buffer constrained, and not able to fetch all
> that.
> > +		 * Do not fill the cache, just satisfy the remaining part
> of
> > +		 * the request directly from the backend.
> > +		 */
> > +		goto driver_dequeue;
> > +	}
> > +
> > +	/* Satisfy the remaining part of the request from the filled
> cache. */
> > +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> > +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> > +
> > +	__rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > +	__rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > +	cache_objs = &cache->objs[cache->size + remaining];
> > +	cache->len = cache->size;
> > +	for (index = 0; index < remaining; index++)
> > +		*obj_table++ = *--cache_objs;
> > +
> > +	return 0;
> > +
> > +driver_dequeue:
> > +
> > +	/* Get remaining objects directly from the backend. */
> > +	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> > +
> > +	if (unlikely(ret < 0)) {
> > +		cache->len = n - remaining;
> > +		/*
> > +		 * No further action is required to roll the first part
> > +		 * of the request back into the cache, as objects in
> > +		 * the cache are intact.
> > +		 */
> > +
> > +		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> > +		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> > +	} else {
> > +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> > +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> > +		__rte_assume(ret == 0);
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> >   /* Return the number of entries in the mempool */
> >   RTE_EXPORT_SYMBOL(rte_mempool_avail_count)
> >   unsigned int
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 7989d7a475..c6df285194 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -1370,8 +1370,31 @@ rte_mempool_cache_flush(struct
> rte_mempool_cache *cache,
> >   	cache->len = 0;
> >   }
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * @internal
> > + * Put several objects back in the mempool, more than the cache has
> room for; used internally.
> > + *
> > + * @param mp
> > + *   A pointer to the mempool structure.
> > + * @param obj_table
> > + *   A pointer to a table of void * pointers (objects).
> > + * @param n
> > + *   The number of objects to store back in the mempool, must be
> strictly
> > + *   positive.
> > + * @param cache
> > + *   A pointer to a mempool cache structure.
> > + */
> > +__rte_experimental
> > +void
> > +_rte_mempool_do_generic_put_more(struct rte_mempool *mp, void *
> const *obj_table,
> > +		unsigned int n, struct rte_mempool_cache *cache);
> > +
> >   /**
> >    * @internal Put several objects back in the mempool; used
> internally.
> > + *
> >    * @param mp
> >    *   A pointer to the mempool structure.
> >    * @param obj_table
> > @@ -1388,9 +1411,16 @@ rte_mempool_do_generic_put(struct rte_mempool
> *mp, void * const *obj_table,
> >   {
> >   	void **cache_objs;
> >
> > -	/* No cache provided? */
> > -	if (unlikely(cache == NULL))
> > -		goto driver_enqueue;
> > +	if (unlikely(cache == NULL)) {
> 
> The patch summary says it de-inlines unlikely code, but you still have
> it here. Maybe it would be better to be consistent and handle this case
> in the de-inlined code as well.
> 
> > +		/* No cache. Push objects directly to the backend. */
> > +		/* Increment stats now, adding in mempool always succeeds.
> */
> > +		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> > +		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> > +
> > +		rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> > +
> > +		return;
> > +	}
> >
> >   	/* Increment stats now, adding in mempool always succeeds. */
> >   	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> > @@ -1403,35 +1433,43 @@ rte_mempool_do_generic_put(struct rte_mempool
> *mp, void * const *obj_table,
> >   		/* Sufficient room in the cache for the objects. */
> >   		cache_objs = &cache->objs[cache->len];
> >   		cache->len += n;
> > -	} else if (n <= cache->flushthresh) {
> > +
> > +cache_enqueue:
> > +#ifdef AVOID_RTE_MEMCPY /* Simple alternative to rte_memcpy(). */
> >   		/*
> > -		 * The cache is big enough for the objects, but - as
> detected by
> > -		 * the comparison above - has insufficient room for them.
> > -		 * Flush the cache to make room for the objects.
> > +		 * Add the objects to the cache.
> > +		 * If the request size is known at build time,
> > +		 * the compiler unrolls the fixed length copy loop.
> >   		 */
> > -		cache_objs = &cache->objs[0];
> > -		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> > -		cache->len = n;
> > -	} else {
> > -		/* The request itself is too big for the cache. */
> > -		goto driver_enqueue_stats_incremented;
> > -	}
> > -
> > -	/* Add the objects to the cache. */
> > -	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> > +		for (uint32_t index = 0; index < n; index++)
> > +			*cache_objs++ = *obj_table++;
> > +#else
> > +		/* Add the objects to the cache. */
> > +		rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> > +#endif
> >
> > -	return;
> > +		return;
> > +	}
> >
> > -driver_enqueue:
> > +	if (__rte_constant(n) && likely(n <= cache->flushthresh)) {
> > +		uint32_t len;
> >
> > -	/* increment stat now, adding in mempool always success */
> > -	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> > -	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> > +		/*
> > +		 * The cache is big enough for the objects, but - as
> detected
> > +		 * above - has insufficient room for them.
> > +		 * Flush the cache to make room for the objects.
> > +		 */
> > +		len = cache->len;
> > +		cache_objs = &cache->objs[0];
> > +		cache->len = n;
> > +		rte_mempool_ops_enqueue_bulk(mp, cache_objs, len);
> >
> > -driver_enqueue_stats_incremented:
> > +		/* Add the objects to the cache. */
> > +		goto cache_enqueue;
> > +	}
> >
> > -	/* push objects to the backend */
> > -	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> > +	/* Insufficient room in the cache for the objects. */
> > +	_rte_mempool_do_generic_put_more(mp, obj_table, n, cache);
> >   }
> >
> >
> > @@ -1498,8 +1536,33 @@ rte_mempool_put(struct rte_mempool *mp, void
> *obj)
> >   	rte_mempool_put_bulk(mp, &obj, 1);
> >   }
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * @internal
> > + * Get several objects from the mempool, more than held in the
> cache; used internally.
> > + *
> > + * @param mp
> > + *   A pointer to the mempool structure.
> > + * @param obj_table
> > + *   A pointer to a table of void * pointers (objects).
> > + * @param n
> > + *   The number of objects to get, must be strictly positive.
> > + * @param cache
> > + *   A pointer to a mempool cache structure.
> > + * @return
> > + *   - 0: Success.
> > + *   - <0: Error; code of driver dequeue function.
> > + */
> > +__rte_experimental
> > +int
> > +_rte_mempool_do_generic_get_more(struct rte_mempool *mp, void
> **obj_table,
> > +		unsigned int n, struct rte_mempool_cache *cache);
> > +
> >   /**
> >    * @internal Get several objects from the mempool; used internally.
> > + *
> >    * @param mp
> >    *   A pointer to the mempool structure.
> >    * @param obj_table
> > @@ -1516,26 +1579,36 @@ static __rte_always_inline int
> >   rte_mempool_do_generic_get(struct rte_mempool *mp, void
> **obj_table,
> >   			   unsigned int n, struct rte_mempool_cache *cache)
> >   {
> > -	int ret;
> > -	unsigned int remaining;
> > -	uint32_t index, len;
> > -	void **cache_objs;
> > -
> > -	/* No cache provided? */
> >   	if (unlikely(cache == NULL)) {
> 
> The patch summary says it de-inlines unlikely code, but you still have
> it here.
> 
> > -		remaining = n;
> > -		goto driver_dequeue;
> > -	}
> > +		int ret;
> > +
> > +		/* No cache. Get objects directly from the backend. */
> > +		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
> > +
> > +		if (unlikely(ret < 0)) {
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> > +		} else {
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> > +			__rte_assume(ret == 0);
> > +		}
> >
> > -	/* The cache is a stack, so copy will be in reverse order. */
> > -	cache_objs = &cache->objs[cache->len];
> > +		return ret;
> > +	}
> >
> >   	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> >   	if (likely(n <= cache->len)) {
> > +		uint32_t index;
> > +		void **cache_objs;
> > +
> >   		/* The entire request can be satisfied from the cache. */
> >   		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> >   		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> >
> > +		/* The cache is a stack, so copy will be in reverse order.
> */
> > +		cache_objs = &cache->objs[cache->len];
> > +
> >   		/*
> >   		 * If the request size is known at build time,
> >   		 * the compiler unrolls the fixed length copy loop.
> > @@ -1547,71 +1620,8 @@ rte_mempool_do_generic_get(struct rte_mempool
> *mp, void **obj_table,
> >   		return 0;
> >   	}
> >
> > -	/* Use the cache as much as we have to return hot objects first.
> */
> > -	len = cache->len;
> > -	remaining = n - len;
> > -	cache->len = 0;
> > -	for (index = 0; index < len; index++)
> > -		*obj_table++ = *--cache_objs;
> > -
> > -	/* Dequeue below would overflow mem allocated for cache? */
> > -	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> > -		goto driver_dequeue;
> > -
> > -	/* Fill the cache from the backend; fetch size + remaining
> objects. */
> > -	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> > -			cache->size + remaining);
> > -	if (unlikely(ret < 0)) {
> > -		/*
> > -		 * We are buffer constrained, and not able to fetch all
> that.
> > -		 * Do not fill the cache, just satisfy the remaining part
> of
> > -		 * the request directly from the backend.
> > -		 */
> > -		goto driver_dequeue;
> > -	}
> > -
> > -	/* Satisfy the remaining part of the request from the filled
> cache. */
> > -	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> > -	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> > -
> > -	__rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > -	__rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > -	cache_objs = &cache->objs[cache->size + remaining];
> > -	cache->len = cache->size;
> > -	for (index = 0; index < remaining; index++)
> > -		*obj_table++ = *--cache_objs;
> > -
> > -	return 0;
> > -
> > -driver_dequeue:
> > -
> > -	/* Get remaining objects directly from the backend. */
> > -	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> > -
> > -	if (unlikely(ret < 0)) {
> > -		if (likely(cache != NULL)) {
> > -			cache->len = n - remaining;
> > -			/*
> > -			 * No further action is required to roll the first
> part
> > -			 * of the request back into the cache, as objects in
> > -			 * the cache are intact.
> > -			 */
> > -		}
> > -
> > -		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> > -		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> > -	} else {
> > -		if (likely(cache != NULL)) {
> > -			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk,
> 1);
> > -			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs,
> n);
> > -		} else {
> > -			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> > -			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> > -		}
> > -		__rte_assume(ret == 0);
> > -	}
> > -
> > -	return ret;
> > +	/* The entire request cannot be satisfied from the cache. */
> > +	return _rte_mempool_do_generic_get_more(mp, obj_table, n, cache);
> >   }
> >
> >   /**

Good feedback, Andrew.
I'll mark as changes requested, and follow up with a new version later.

-Morten



More information about the dev mailing list