[dpdk-dev] [PATCH] net/mlx5: prefetch CQEs for a faster decompression

Raslan Darawsheh rasland at mellanox.com
Wed Mar 25 17:14:33 CET 2020


Hi,

> -----Original Message-----
> From: Alexander Kozyrev <akozyrev at mellanox.com>
> Sent: Tuesday, March 24, 2020 4:46 PM
> To: dev at dpdk.org
> Cc: Raslan Darawsheh <rasland at mellanox.com>; Matan Azrad
> <matan at mellanox.com>; Slava Ovsiienko <viacheslavo at mellanox.com>
> Subject: [PATCH] net/mlx5: prefetch CQEs for a faster decompression
> 
> Invalidation of consumed CQEs incurs a performance penalty
> due to many cache misses caused by non-sequential CQE access.
> Prefetch CQEs to get better data locality and speed up the
> decompression of CQEs. Prefetching reduces the CPI rate of the
> rxq_cq_decompress_v() function from 1 to 0.85 in my environment,
> resulting in a 2% boost in Mpps for the 64B frames single-core test.
> 
> Signed-off-by: Alexander Kozyrev <akozyrev at mellanox.com>
> Acked-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 5 +++--
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 6 +++---
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 6 ++++--
>  3 files changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> index aa43cab084..90548ea22d 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> @@ -155,8 +155,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq,
> volatile struct mlx5_cqe *cq,
>  		const vector unsigned long shmax = {64, 64};
>  #endif
> 
> -		if (!(pos & 0x7) && pos + 8 < mcqe_n)
> -			rte_prefetch0((void *)(cq + pos + 8));
> +		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
> +			if (likely(pos + i < mcqe_n))
> +				rte_prefetch0((void *)(cq + pos + i));
> 
>  		/* A.1 load mCQEs into a 128bit register. */
>  		mcqe1 = (vector unsigned char)vec_vsx_ld(0,
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> index 6d952df787..44f662e1c1 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> @@ -145,9 +145,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq,
> volatile struct mlx5_cqe *cq,
>  				    -1UL << ((mcqe_n - pos) *
>  					     sizeof(uint16_t) * 8) : 0);
>  #endif
> -
> -		if (!(pos & 0x7) && pos + 8 < mcqe_n)
> -			rte_prefetch0((void *)(cq + pos + 8));
> +		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
> +			if (likely(pos + i < mcqe_n))
> +				rte_prefetch0((void *)(cq + pos + i));
>  		__asm__ volatile (
>  		/* A.1 load mCQEs into a 128bit register. */
>  		"ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t"
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> index 406f23f595..9db9003acd 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> @@ -133,8 +133,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq,
> volatile struct mlx5_cqe *cq,
>  		__m128i byte_cnt, invalid_mask;
>  #endif
> 
> -		if (!(pos & 0x7) && pos + 8 < mcqe_n)
> -			rte_prefetch0((void *)(cq + pos + 8));
> +		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
> +			if (likely(pos + i < mcqe_n))
> +				rte_prefetch0((void *)(cq + pos + i));
> +
>  		/* A.1 load mCQEs into a 128bit register. */
>  		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
>  		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
> --
> 2.18.2


Patch applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh


More information about the dev mailing list