|SUCCESS| [v9] eal/x86: optimize memcpy of small sizes

dpdklab at iol.unh.edu dpdklab at iol.unh.edu
Thu Apr 30 01:29:46 CEST 2026


Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/163367

_AI Code Review_

=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-29

I'll review this DPDK patch for correctness, coding style, and guideline compliance.

git sha: Not available (patch file format)
Author: Morten Brorup <mb at smartsharesystems.com>
Subject: [PATCH v9] eal/x86: optimize memcpy of small sizes

This patch consolidates small size memcpy implementations, adds
__rte_restrict to function parameters, implements rte_mov48(), and
optimizes the copy path for sizes up to 64 bytes by handling them before
alignment checks.


> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..8ed8c55010 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h

[ ... ]

> @@ -40,9 +35,6 @@ extern "C" {
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> - * @note This is implemented as a macro, so it's address should not be taken
> - * and care is needed as parameter expressions may be evaluated multiple times.
> - *
>   * @param dst
>   *   Pointer to the destination of the data.
>   * @param src
> @@ -53,15 +45,15 @@ extern "C" {
>   *   Pointer to the destination data.
>   */
>  static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n);
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);

Does this forward declaration need to remain? It appears immediately before
the actual function implementation at the end of the file, which may make
this declaration redundant.

[ ... ]

> +/**
> + * Copy 48 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
> +{
> +#if defined RTE_MEMCPY_AVX
> +	rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +	rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +#endif
> +}

The casts to (uint8_t *) and (const uint8_t *) are unnecessary since dst
and src are already declared as uint8_t * types. Does this follow the
project's style regarding redundant casts?

[ ... ]

> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
>  static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> +		size_t n)

Alignment: The parameter continuation line is indented with two tabs.
According to DPDK coding style, continuation lines should align with the
opening parenthesis of the function, or use a single tab plus spaces if
that's the project convention. Can you verify the correct alignment for
multi-line function declarations?

This alignment issue appears in multiple new function declarations:
rte_memcpy_generic_more_than_64() (appears three times), and
rte_memcpy_aligned_more_than_64().

[ ... ]

>  static __rte_always_inline void *
> -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> +rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> +		size_t n)
>  {
>  	void *ret = dst;
>  
> -	/* Copy size < 16 bytes */
> -	if (n < 16) {
> -		return rte_mov15_or_less(dst, src, n);
> -	}
> -
> -	/* Copy 16 <= size <= 32 bytes */
> -	if (__rte_constant(n) && n == 32) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		if (__rte_constant(n) && n == 16)
> -			return ret; /* avoid (harmless) duplicate copy */
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> -
> -		return ret;
> -	}
> -
> -	/* Copy 32 < size <= 64 bytes */
> -	if (__rte_constant(n) && n == 64) {
> -		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -		return ret;
> -	}
> -	if (n <= 64) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov32((uint8_t *)dst - 32 + n,
> -				(const uint8_t *)src - 32 + n);
> -
> -		return ret;
> -	}
> -
>  	/* Copy 64 bytes blocks */
>  	for (; n > 64; n -= 64) {
>  		rte_mov64((uint8_t *)dst, (const uint8_t *)src);

In rte_memcpy_aligned_more_than_64(), the function comment states "Use
with n > 64", but the loop condition is "n > 64". Does this correctly
handle the case where n == 65? After copying one 64-byte block, n becomes
1, and the function continues to copy the final "rte_mov64((uint8_t *)dst,
(const uint8_t *)src)". Does this access beyond the source buffer?

[ ... ]

>  static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
>  {
> +	/* Fast way when copy size doesn't exceed 64 bytes. */
> +	if (n < 16)
> +		return rte_mov15_or_less(dst, src, n);
> +	if (n <= 32) {
> +		if (__rte_constant(n) && n == 32) {
> +			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return dst; /* avoid (harmless) duplicate copy */
> +		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +		return dst;
> +	}
> +	if (n <= 64) {
> +		if (__rte_constant(n) && n == 64) {
> +			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +#if defined RTE_MEMCPY_AVX
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> +#else /* SSE implementation */
> +		rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +		rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +		if (n > 48)
> +			rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +#endif
> +		return dst;
> +	}

Coding style: Single-statement if blocks should not use braces per the
coding standard, however for consistency with existing code in the file,
the braces on all the if blocks in rte_memcpy() appear appropriate.

Does the expression "dst + 0 * 16" add clarity, or would "dst" or "dst +
0" be clearer? The "0 * 16" pattern appears to be used for visual
alignment with the following lines.


More information about the test-report mailing list