|SUCCESS| [v11] eal/x86: optimize memcpy of small sizes

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Thu May 21 22:52:51 CEST 2026
Previous message (by thread): |WARNING| pw164241 [PATCH] [v11] eal/x86: optimize memcpy of small s
Next message (by thread): |SUCCESS| dpdk-next-net-intel| 2d9ed20978| Intel-compilation
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/164241

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-05-21

## Review of Patch: eal/x86: optimize memcpy of small sizes

git sha: 20260521185631.116046-1-mb at smartsharesystems.com
Author: Morten Brorup <mb at smartsharesystems.com>
Subject: [PATCH v11] eal/x86: optimize memcpy of small sizes

This patch consolidates the implementation for copying up to 64 bytes into
a single location in the common copy function, avoiding duplicate code and
improving performance. It also adds __rte_restrict qualifiers and
implements the missing rte_mov48() function.

> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..8ed8c55010 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h

[ ... ]

> @@ -103,10 +95,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  
>  /**
>   * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
>  {
>  	__m128i xmm0;
>  
> @@ -116,10 +108,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  
>  /**
>   * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> +rte_mov32(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
>  {
>  #if defined RTE_MEMCPY_AVX
>  	__m256i ymm0;
> @@ -132,12 +124,29 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  #endif
>  }
>  
> +/**
> + * Copy 48 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
> +{
> +#if defined RTE_MEMCPY_AVX
> +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);

Does this code need explicit casting when the parameters are already uint8_t
pointers? The cast appears redundant.

> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);

Same question regarding the casts here.

> +#endif
> +}

[ ... ]

> @@ -351,10 +337,10 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  
>  /**
>   * Copy 128-byte blocks from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
>   */
>  static __rte_always_inline void
> -rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
> +rte_mov128blocks(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src, size_t n)
>  {
>  	__m256i ymm0, ymm1, ymm2, ymm3;
>  

[ ... ]

> @@ -696,44 +648,17 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  
>  #endif /* __AVX512F__ */
>  
> +/**
> + * Copy bytes from one vector register size aligned location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> +rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> +		size_t n)
>  {

Note the indentation of the second parameter line. Does this line violate the
alignment style required in the coding standards? Parameters on continuation
lines should typically align with the opening parenthesis.

>  	void *ret = dst;
>  
> -	/* Copy size < 16 bytes */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}

[ ... ]

> @@ -749,20 +674,49 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  }
>  
>  static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
>  {
> +	/* Fast way when copy size doesn't exceed 64 bytes. */
> +	if (n < 16)
> +		return rte_mov15_or_less(dst, src, n);
> +	if (n <= 32) {
> +		if (__rte_constant(n) && n == 32) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return dst; /* avoid (harmless) duplicate copy */
> +		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		return dst;
> +	}
> +	if (n <= 64) {
> +		if (__rte_constant(n) && n == 64) {
> +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +#if defined RTE_MEMCPY_AVX
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);

Does this calculation `dst - 32 + n` overflow when n is in the range (32, 64]
and dst points to a location near the end of the address space? While unlikely
in practice, pointer arithmetic overflow is undefined behavior.

> +#else /* SSE implementation */
> +		rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +		rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +		if (n > 48)
> +			rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);

When n is exactly 48, does this code miss copying 16 bytes? The condition is
n > 48, so for n=48 the third copy is skipped, but then the final overlapping
copy below would be:

> +		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);

For n=48, this becomes dst+32 and src+32, which would only copy the last 16
bytes at [32,48). Does this miss bytes [16,32)?

> +#endif
> +		return dst;
> +	}
> +
> +	/* Implementation for size > 64 bytes depends on alignment with vector register size. */
> +	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> +		return rte_memcpy_aligned_more_than_64(dst, src, n);
> +	else
> +		return rte_memcpy_generic_more_than_64(dst, src, n);
>  }

The coding style guide discourages unnecessary else after return. Can the else
keyword be removed here?

Overall, the consolidation approach looks reasonable. The main concern is
verifying correct coverage for all copy sizes, especially around the 48-byte
boundary in the SSE path.
Previous message (by thread): |WARNING| pw164241 [PATCH] [v11] eal/x86: optimize memcpy of small s
Next message (by thread): |SUCCESS| dpdk-next-net-intel| 2d9ed20978| Intel-compilation
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the test-report mailing list