|SUCCESS| [v8] eal/x86: optimize memcpy of small sizes
dpdklab at iol.unh.edu
Wed Apr 29 23:34:16 CEST 2026
Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/163339
_AI Code Review_
=== Patch Review: series.patch (via Claude) ===
Review date: 2026-04-29
# Review of DPDK Patch: eal/x86: optimize memcpy of small sizes
Message-ID: 20260429093628.215514-1-mb at smartsharesystems.com
Author: Morten Brorup <mb at smartsharesystems.com>
Subject: [PATCH v8] eal/x86: optimize memcpy of small sizes
This patch consolidates the implementation for copying up to 64 bytes,
adds __rte_restrict qualifiers, and adds the missing rte_mov48() function.
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..e0645db2f1 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
[ ... ]
> @@ -95,10 +95,10 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>
> /**
> * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
> */
> static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> +rte_mov16(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
> {
> __m128i xmm0;
>
> @@ -106,10 +106,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> _mm_storeu_si128((__m128i *)dst, xmm0);
> }
>
> +/**
> + * Copy 48 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
> +{
> +#if defined RTE_MEMCPY_AVX
> + rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +#else /* SSE implementation */
> + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +#endif
> +}
The casts to (uint8_t *) and (const uint8_t *) in rte_mov48() are
unnecessary: the parameters are already declared uint8_t *__rte_restrict
and const uint8_t *__rte_restrict, so dst and src can be used directly.
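A minimal sketch of the suggested cleanup (same behavior, casts dropped):

    /* Sketch: parameters are already uint8_t pointers, so the bodies
     * can use them directly. */
    static __rte_always_inline void
    rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
    {
    #if defined RTE_MEMCPY_AVX
        rte_mov32(dst, src);
        rte_mov16(dst + 32, src + 32);
    #else /* SSE implementation */
        rte_mov16(dst + 0 * 16, src + 0 * 16);
        rte_mov16(dst + 1 * 16, src + 1 * 16);
        rte_mov16(dst + 2 * 16, src + 2 * 16);
    #endif
    }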
[ ... ]
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with 16 <= n <= 32.
> + */
> +static __rte_always_inline void *
> +rte_mov16_to_32(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
> +{
> + if (__rte_constant(n) && n == 32) {
> + rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> + return dst;
> + }
> + rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> + if (__rte_constant(n) && n == 16)
> + return dst; /* avoid (harmless) duplicate copy */
> + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> + return dst;
> +}
Does this code handle n < 16 correctly? The comment restricts the function
to 16 <= n <= 32, but if it were called with n < 16, the expression
"(uint8_t *)dst - 16 + n" would form a pointer before the start of the
destination buffer (undefined behavior in C), and the trailing rte_mov16()
would read and write outside the caller's buffers.
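One way to make that contract checkable is a debug assertion. A trimmed
sketch (hypothetical _checked variant living inside rte_memcpy.h; the
__rte_constant shortcuts are omitted for brevity) using RTE_ASSERT from
rte_debug.h, which expands to nothing unless the build enables
RTE_ENABLE_ASSERT, so the release fast path is untouched:

    #include <rte_debug.h>

    static __rte_always_inline void *
    rte_mov16_to_32_checked(void *__rte_restrict dst,
            const void *__rte_restrict src, size_t n)
    {
        /* Document and enforce the 16 <= n <= 32 contract;
         * compiles away in normal builds. */
        RTE_ASSERT(n >= 16 && n <= 32);
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return dst;
    }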
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with 32 <= n <= 64.
> + */
> +static __rte_always_inline void *
> +rte_mov32_to_64(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
> +{
> + if (__rte_constant(n) && n == 64) {
> + rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> + return dst;
> + }
> +#if defined RTE_MEMCPY_AVX
> + rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> + if (__rte_constant(n) && n == 32)
> + return dst; /* avoid (harmless) duplicate copy */
> + rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> +#else /* SSE implementation */
> + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> + if (__rte_constant(n) && n == 32)
> + return dst; /* avoid (harmless) duplicate copy */
> + if (n > 48)
> + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +#endif
> + return dst;
> +}
Does this code handle n < 32 correctly? The comment restricts the function
to 32 <= n <= 64, but a call with n < 32 would make "dst - 32 + n" (AVX
path) or "dst - 16 + n" (SSE path) point before the start of the buffer,
which is undefined behavior.
In the SSE path, the boundary n == 48 deserves a second look. Tracing the
offsets: the first two rte_mov16() calls cover bytes 0-31, "n > 48" is
false so the copy at offset 32 is skipped, and the final rte_mov16() at
offset n - 16 = 32 covers bytes 32-47. Coverage is therefore complete, and
the condition merely avoids a redundant 16-byte copy at exactly n == 48.
A brief comment documenting this boundary case would make the intent
clearer.
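The offsets can also be checked mechanically. Below is a standalone model
of the SSE path (scalar memcpy standing in for rte_mov16; not DPDK code)
that verifies full coverage for every n in [32, 64], including n == 48:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Scalar stand-in for rte_mov16(). */
    static void mov16(uint8_t *dst, const uint8_t *src)
    {
        memcpy(dst, src, 16);
    }

    /* Model of the SSE path above, with n known only at run time. */
    static void mov32_to_64_sse_model(uint8_t *dst, const uint8_t *src, size_t n)
    {
        mov16(dst + 0 * 16, src + 0 * 16);
        mov16(dst + 1 * 16, src + 1 * 16);
        if (n > 48)
            mov16(dst + 2 * 16, src + 2 * 16);
        mov16(dst - 16 + n, src - 16 + n);
    }

    int main(void)
    {
        uint8_t src[64], dst[64];
        size_t n;
        int i;

        for (n = 32; n <= 64; n++) {
            for (i = 0; i < 64; i++) {
                src[i] = (uint8_t)i;  /* known pattern */
                dst[i] = 0xff;        /* poison destination */
            }
            mov32_to_64_sse_model(dst, src, n);
            assert(memcmp(dst, src, n) == 0);  /* all n bytes copied */
        }
        puts("all sizes 32..64 fully copied, including n == 48");
        return 0;
    }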
[ ... ]
> @@ -749,20 +722,25 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
> }
>
> static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
> {
> + /* Common implementation for size <= 64 bytes. */
> + if (n < 16)
> + return rte_mov15_or_less(dst, src, n);
> + if (n <= 32)
> + return rte_mov16_to_32(dst, src, n);
> + if (n <= 64)
> + return rte_mov32_to_64(dst, src, n);
> +
> + /* Implementation for size > 64 bytes depends on alignment with vector register size. */
> if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> - return rte_memcpy_aligned(dst, src, n);
> + return rte_memcpy_aligned_more_than_64(dst, src, n);
> else
> - return rte_memcpy_generic(dst, src, n);
> + return rte_memcpy_generic_more_than_64(dst, src, n);
> }
Can the conditional branches in rte_memcpy() add overhead on the critical
path? When n is not a compile-time constant, the function now performs up
to three comparisons before dispatching to the appropriate handler; in the
original code, was some of this dispatch folded into different control
flow?
The consolidation is a clear readability win, but it would be useful to
know whether the extra branching measurably affects any size range when n
is only known at run time.
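Worth noting: all of these helpers are __rte_always_inline, so when n is a
compile-time constant the comparisons fold away entirely. A sketch of a
typical caller (struct name and size are hypothetical):

    #include <rte_memcpy.h>

    struct flow_key { uint8_t bytes[48]; };  /* hypothetical 48-byte key */

    static inline void
    copy_key(struct flow_key *dst, const struct flow_key *src)
    {
        /* n == sizeof(*src) == 48 is a compile-time constant, so after
         * inlining the n < 16, n <= 32 and n <= 64 comparisons are
         * folded away and only the 48-byte path survives. */
        rte_memcpy(dst, src, sizeof(*src));
    }

For runtime-variable n the question stands and would need a
micro-benchmark across the size ranges; the branch order (n < 16 first)
at least favors the smallest copies.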