[PATCH 1/1] mempool: implement index-based per core cache
    Dharmik Thakkar 
    Dharmik.Thakkar at arm.com
       
    Thu Jan 13 06:17:23 CET 2022
    
    
  
Hi Konstatin,
Thank you for your comments and the test report!
> On Jan 10, 2022, at 8:26 PM, Ananyev, Konstantin <konstantin.ananyev at intel.com> wrote:
> 
> 
> 
> 
>> Current mempool per core cache implementation stores pointers to mbufs
>> On 64b architectures, each pointer consumes 8B
>> This patch replaces it with index-based implementation,
>> where in each buffer is addressed by (pool base address + index)
>> It reduces the amount of memory/cache required for per core cache
>> 
>> L3Fwd performance testing reveals minor improvements in the cache
>> performance (L1 and L2 misses reduced by 0.60%)
>> with no change in throughput
> 
> I feel really sceptical about that patch and the whole idea in general:
> - From what I read above there is no real performance improvement observed.
>  (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>  see below for more details). 
Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
Similar optimizations can be implemented for x86 platforms which should close the performance gap
and in my understanding should give better performance for a bulk size of 32.
> - Space utilization difference looks neglectable too.
Sorry, I did not understand this point.
> - The change introduces a new build time config option with a major limitation:
>   All memzones in a pool have to be within the same 4GB boundary. 
>   To address it properly, extra changes will be required in init(/populate) part of the code.
I agree to the above mentioned challenges and I am currently working on resolving these issues.
>   All that will complicate mempool code, will make it more error prone
>   and harder to maintain.
> But, as there is no real gain in return - no point to add such extra complexity at all.
> 
> Konstantin
> 
> CSX 2.1 GHz
> ==========
> 
> echo 'mempool_perf_autotest' | ./dpdk-test -n 4 --lcores='6-13' --no-pci
> 
> params :                                                                                                  rate_persec  	
>                                                                                                                 (normal/index-based/diff %)
> (with cache)
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 740989337.00/504116019.00/-31.97
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756495155.00/615002931.00/-18.70
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1483499110.00/1007248997.00/-32.10
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1512439807.00/1229927218.00/-18.68
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5933668757.00/4029048421.00/-32.10
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6049234942.00/4921111344.00/-18.65
> 
> (with user-owned cache)
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 630600499.00/504312627.00/-20.03
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756259225.00/615042252.00/-18.67
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1262052966.00/1007039283.00/-20.21
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1517853081.00/1230818508.00/-18.91
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 :5054529533.00/4028052273.00/-20.31
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6059340592.00/4912893129.00/-18.92
> 
>> 
>> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli at arm.com>
>> Signed-off-by: Dharmik Thakkar <dharmik.thakkar at arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang at arm.com>
>> ---
>> lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
>> lib/mempool/rte_mempool_ops_default.c |   7 ++
>> 2 files changed, 119 insertions(+), 2 deletions(-)
>> 
>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>> index 1e7a3c15273c..4fabd3b1920b 100644
>> --- a/lib/mempool/rte_mempool.h
>> +++ b/lib/mempool/rte_mempool.h
>> @@ -50,6 +50,10 @@
>> #include <rte_memcpy.h>
>> #include <rte_common.h>
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#include <rte_vect.h>
>> +#endif
>> +
>> #include "rte_mempool_trace_fp.h"
>> 
>> #ifdef __cplusplus
>> @@ -239,6 +243,9 @@ struct rte_mempool {
>> 	int32_t ops_index;
>> 
>> 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	void *pool_base_value; /**< Base value to calculate indices */
>> +#endif
>> 
>> 	uint32_t populated_size;         /**< Number of populated objects. */
>> 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
>> @@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>> 	if (cache == NULL || cache->len == 0)
>> 		return;
>> 	rte_mempool_trace_cache_flush(cache, mp);
>> +
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	unsigned int i;
>> +	unsigned int cache_len = cache->len;
>> +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
>> +	void *base_value = mp->pool_base_value;
>> +	uint32_t *cache_objs = (uint32_t *) cache->objs;
>> +	for (i = 0; i < cache_len; i++)
>> +		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
>> +	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>> +#else
>> 	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
>> +#endif
>> 	cache->len = 0;
>> }
>> 
>> @@ -1334,8 +1353,13 @@ static __rte_always_inline void
>> rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> 			   unsigned int n, struct rte_mempool_cache *cache)
>> {
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	uint32_t *cache_objs;
>> +	void *base_value;
>> +	uint32_t i;
>> +#else
>> 	void **cache_objs;
>> -
>> +#endif
>> 	/* increment stat now, adding in mempool always success */
>> 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
>> 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
>> @@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> 	if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
>> 		goto ring_enqueue;
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	cache_objs = (uint32_t *) cache->objs;
>> +	cache_objs = &cache_objs[cache->len];
>> +	base_value = mp->pool_base_value;
>> +#else
>> 	cache_objs = &cache->objs[cache->len];
>> +#endif
>> 
>> 	/*
>> 	 * The cache follows the following algorithm
>> @@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> 	 */
>> 
>> 	/* Add elements back into the cache */
>> +
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#if defined __ARM_NEON
>> +	uint64x2_t v_obj_table;
>> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
>> +	uint32x2_t v_cache_objs;
>> +
>> +	for (i = 0; i < (n & ~0x1); i += 2) {
>> +		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
>> +		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
>> +		vst1_u32(cache_objs + i, v_cache_objs);
>> +	}
>> +	if (n & 0x1) {
>> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
>> +	}
>> +#else
>> +	for (i = 0; i < n; i++) {
>> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
>> +	}
>> +#endif
>> +#else
>> 	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
>> +#endif
>> 
>> 	cache->len += n;
>> 
>> 	if (cache->len >= cache->flushthresh) {
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>> +				cache->len - cache->size);
>> +#else
>> 		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
>> 				cache->len - cache->size);
>> +#endif
>> 		cache->len = cache->size;
>> 	}
>> 
>> @@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> {
>> 	int ret;
>> 	uint32_t index, len;
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	uint32_t i;
>> +	uint32_t *cache_objs;
>> +#else
>> 	void **cache_objs;
>> -
>> +#endif
>> 	/* No cache provided or cannot be satisfied from cache */
>> 	if (unlikely(cache == NULL || n >= cache->size))
>> 		goto ring_dequeue;
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	void *base_value = mp->pool_base_value;
>> +	cache_objs = (uint32_t *) cache->objs;
>> +#else
>> 	cache_objs = cache->objs;
>> +#endif
>> 
>> 	/* Can this be satisfied from the cache? */
>> 	if (cache->len < n) {
>> @@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> 		uint32_t req = n + (cache->size - cache->len);
>> 
>> 		/* How many do we require i.e. number to fill the cache + the request */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>> +		ret = rte_mempool_ops_dequeue_bulk(mp,
>> +			temp_objs, req);
>> +#else
>> 		ret = rte_mempool_ops_dequeue_bulk(mp,
>> 			&cache->objs[cache->len], req);
>> +#endif
>> 		if (unlikely(ret < 0)) {
>> 			/*
>> 			 * In the off chance that we are buffer constrained,
>> @@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> 			goto ring_dequeue;
>> 		}
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		len = cache->len;
>> +		for (i = 0; i < req; ++i, ++len) {
>> +			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
>> +								base_value);
>> +		}
>> +#endif
>> 		cache->len += req;
>> 	}
>> 
>> 	/* Now fill in the response ... */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#if defined __ARM_NEON
>> +	uint64x2_t v_obj_table;
>> +	uint64x2_t v_cache_objs;
>> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
>> +
>> +	for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
>> +						len -= 4, obj_table += 4) {
>> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
>> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
>> +		vst1q_u64((uint64_t *)obj_table, v_obj_table);
>> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
>> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
>> +		vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
>> +	}
>> +	switch (n & 0x3) {
>> +	case 3:
>> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> +								/* fallthrough */
>> +	case 2:
>> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> +								/* fallthrough */
>> +	case 1:
>> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> +	}
>> +#else
>> +	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
>> +		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
>> +#endif
>> +#else
>> 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
>> 		*obj_table = cache_objs[len];
>> +#endif
>> 
>> 	cache->len -= n;
>> 
>> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
>> index 22fccf9d7619..3543cad9d4ce 100644
>> --- a/lib/mempool/rte_mempool_ops_default.c
>> +++ b/lib/mempool/rte_mempool_ops_default.c
>> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
>> 		obj = va + off;
>> 		obj_cb(mp, obj_cb_arg, obj,
>> 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		/* Store pool base value to calculate indices for index-based
>> +		 * lcore cache implementation
>> +		 */
>> +		if (i == 0)
>> +			mp->pool_base_value = obj;
>> +#endif
>> 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>> 		off += mp->elt_size + mp->trailer_size;
>> 	}
>> --
>> 2.25.1
> 
    
    
More information about the dev
mailing list