|SUCCESS| [TEST,v9] eal/x86: optimize memcpy of small sizes

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Thu May 21 17:25:51 CEST 2026

Previous message (by thread): compilation|SUCCESS| pw(164215) sid(38190) job(PER_PATCH_BUILD18865)[v4] ring: fix zero-copy burst API documentation
Next message (by thread): |Testing |SUCCESS| pw(164215) sid(38190) job(PER_PATCH_BUILD18865-20260521231242) [v4] ring: fix zero-copy burst API documentation
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/164227

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-05-21

Commit: 98CBD80474FA8B44BF855DF32C47DC35F6588C
Author: Morten Brorup <mb at smartsharesystems.com>
Subject: [TEST PATCH v9] eal/x86: optimize memcpy of small sizes

This patch consolidates the implementation for copying up to 64 bytes
into one instance in the common copy function, adds __rte_restrict
qualifiers, and implements the missing rte_mov48() function.

> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..8ed8c55010 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h

[ ... ]

> @@ -103,10 +95,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  
>  /**
>   * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
>  {
>  	__m128i xmm0;
>  
> @@ -116,10 +108,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  
>  /**
>   * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
>  {
>  #if defined RTE_MEMCPY_AVX
>  	__m256i ymm0;
> @@ -132,12 +124,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  #endif
>  }
>  
> +/**
> + * Copy 48 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
> +{
> +#if defined RTE_MEMCPY_AVX
> +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +#endif
> +}

Does this code need the unnecessary casts to uint8_t* and const uint8_t*,
given that dst and src are already declared with those types? The casts
add visual noise without adding type safety.

> +
>  /**
>   * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> +rte_mov64(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
>  {
>  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;

[ ... ]

> @@ -232,45 +241,22 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  	}
>  }
>  
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> +		size_t n)
>  {
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
>  
> -	/**
> -	 * Copy less than 16 bytes
> -	 */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				  (const uint8_t *)src - 16 + n);
> -		return ret;
> -	}
> -	if (__rte_constant(n) && n == 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				  (const uint8_t *)src - 32 + n);
> -		return ret;
> -	}
>  	if (n <= 512) {
>  		if (n >= 256) {
>  			n -= 256;

[ ... ]

> @@ -696,44 +648,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  
>  #endif /* __AVX512F__ */
>  
> +/**
> + * Copy bytes from one vector register size aligned location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> +rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> +		size_t n)
>  {
>  	void *ret = dst;
>  
> -	/* Copy size < 16 bytes */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
> -	/* Copy 16 <= size <= 32 bytes */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> -
> -		return ret;
> -	}
> -
> -	/* Copy 32 < size <= 64 bytes */
> -	if (__rte_constant(n) && n == 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				(const uint8_t *)src - 32 + n);
> -
> -		return ret;
> -	}
> -
>  	/* Copy 64 bytes blocks */
>  	for (; n > 64; n -= 64) {
>  		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> @@ -749,20 +674,49 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  }
>  
>  static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
>  {
> +	/* Fast way when copy size doesn't exceed 64 bytes. */
> +	if (n < 16)
> +		return rte_mov15_or_less(dst, src, n);
> +	if (n <= 32) {
> +		if (__rte_constant(n) && n == 32) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return dst; /* avoid (harmless) duplicate copy */
> +		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		return dst;
> +	}
> +	if (n <= 64) {
> +		if (__rte_constant(n) && n == 64) {
> +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +#if defined RTE_MEMCPY_AVX
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> +#else /* SSE implementation */
> +		rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +		rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +		if (n > 48)
> +			rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +#endif
> +		return dst;
> +	}

In the SSE path for 33-64 bytes, when n is exactly 48, does this code
copy the third 16-byte block twice? The conditional copies it when n > 48
evaluates to false, but the final overlapping copy at dst - 16 + n would
also include those bytes. For n=48, dst - 16 + 48 equals dst + 32, which
overlaps with the third block at dst + 32.

Would it be clearer to use n >= 48 or potentially utilize rte_mov48()
when applicable?

> +
> +	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
>  	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> -		return rte_memcpy_aligned(dst, src, n);
> +		return rte_memcpy_aligned_more_than_64(dst, src, n);
>  	else
> -		return rte_memcpy_generic(dst, src, n);
> +		return rte_memcpy_generic_more_than_64(dst, src, n);
>  }

Previous message (by thread): compilation|SUCCESS| pw(164215) sid(38190) job(PER_PATCH_BUILD18865)[v4] ring: fix zero-copy burst API documentation
Next message (by thread): |Testing |SUCCESS| pw(164215) sid(38190) job(PER_PATCH_BUILD18865-20260521231242) [v4] ring: fix zero-copy burst API documentation
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

More information about the test-report mailing list