|SUCCESS| [v9] eal/x86: optimize memcpy of small sizes
dpdklab at iol.unh.edu
dpdklab at iol.unh.edu
Fri May 8 10:28:25 CEST 2026
Test-Label: ai-code-review
Test-Status: SUCCESS
https://dpdk.org/patch/163367
_AI Code Review_
=== Patch Review: series.patch (via Claude) ===
Review date: 2026-05-08
git sha: series.patch (patch file)
Author: Morten Brorup <mb at smartsharesystems.com>
Subject: [PATCH v9] eal/x86: optimize memcpy of small sizes
This patch optimizes the x86 memcpy implementation for small sizes by
consolidating the code for copying up to 64 bytes into a single common
location before checking alignment requirements. It also adds __rte_restrict
to source and destination addresses and implements the missing rte_mov48()
function.
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 46d34b8081..8ed8c55010 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
[ ... ]
> @@ -53,15 +45,15 @@ extern "C" {
> * Pointer to the destination data.
> */
> static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n);
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n);
>
> /**
> * Copy bytes from one location to another,
> - * locations should not overlap.
> + * locations must not overlap.
> * Use with n <= 15.
> */
> static __rte_always_inline void *
> -rte_mov15_or_less(void *dst, const void *src, size_t n)
> +rte_mov15_or_less(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
> {
> /**
> * Use the following structs to avoid violating C standard
[ ... ]
> +/**
> + * Copy 48 bytes from one location to another,
> + * locations must not overlap.
> + */
> +static __rte_always_inline void
> +rte_mov48(uint8_t *__rte_restrict dst, const uint8_t *__rte_restrict src)
> +{
> +#if defined RTE_MEMCPY_AVX
> + rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +#else /* SSE implementation */
> + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> +#endif
> +}
Does rte_mov48() need the explicit casts on dst and src? They are already
uint8_t *__rte_restrict parameters. The unnecessary casts appear in both the
AVX and SSE paths.
[ ... ]
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
> static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> + size_t n)
> {
> void *ret = dst;
> size_t dstofss;
> size_t bits;
>
> - /**
> - * Copy less than 16 bytes
> - */
> - if (n < 16) {
> - return rte_mov15_or_less(dst, src, n);
> - }
> -
> /**
> * Fast way when copy size doesn't exceed 512 bytes
> */
> - if (__rte_constant(n) && n == 32) {
> - rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> - return ret;
> - }
> - if (n <= 32) {
> - rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> - if (__rte_constant(n) && n == 16)
> - return ret; /* avoid (harmless) duplicate copy */
> - rte_mov16((uint8_t *)dst - 16 + n,
> - (const uint8_t *)src - 16 + n);
> - return ret;
> - }
> - if (__rte_constant(n) && n == 64) {
> - rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> - return ret;
> - }
> - if (n <= 64) {
> - rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> - rte_mov32((uint8_t *)dst - 32 + n,
> - (const uint8_t *)src - 32 + n);
> - return ret;
> - }
> if (n <= 512) {
Does rte_memcpy_generic_more_than_64() verify that n > 64 on entry? The
function comment states "Use with n > 64" but there are no runtime assertions
or checks. Can this code path be called with n <= 64?
[ ... ]
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
> static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> + size_t n)
> {
> void *ret = dst;
> size_t dstofss;
> size_t bits;
>
> - /**
> - * Copy less than 16 bytes
> - */
> - if (n < 16) {
> - return rte_mov15_or_less(dst, src, n);
> - }
> -
> /**
> * Fast way when copy size doesn't exceed 256 bytes
> */
> - if (__rte_constant(n) && n == 32) {
> - rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> - return ret;
> - }
> - if (n <= 32) {
> - rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> - if (__rte_constant(n) && n == 16)
> - return ret; /* avoid (harmless) duplicate copy */
> - rte_mov16((uint8_t *)dst - 16 + n,
> - (const uint8_t *)src - 16 + n);
> - return ret;
> - }
> - if (n <= 64) {
> - rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> - rte_mov32((uint8_t *)dst - 32 + n,
> - (const uint8_t *)src - 32 + n);
> - return ret;
> - }
> if (n <= 256) {
Same question applies here to the AVX implementation. Does the function
verify n > 64 on entry? Can this be called incorrectly?
[ ... ]
> +/**
> + * Copy bytes from one location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
> static __rte_always_inline void *
> -rte_memcpy_generic(void *dst, const void *src, size_t n)
> +rte_memcpy_generic_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> + size_t n)
> {
> __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> void *ret = dst;
> size_t dstofss;
> size_t srcofs;
>
> - /**
> - * Copy less than 16 bytes
> - */
> - if (n < 16) {
> - return rte_mov15_or_less(dst, src, n);
> - }
> -
> /**
> * Fast way when copy size doesn't exceed 512 bytes
> */
> - if (n <= 32) {
> - rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> - if (__rte_constant(n) && n == 16)
> - return ret; /* avoid (harmless) duplicate copy */
> - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> - return ret;
> - }
> - if (n <= 64) {
> - rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> - if (n > 48)
> - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> - return ret;
> - }
> if (n <= 128) {
Same question for the SSE implementation. Does the function verify n > 64?
[ ... ]
> +/**
> + * Copy bytes from one vector register size aligned location to another,
> + * locations must not overlap.
> + * Use with n > 64.
> + */
> static __rte_always_inline void *
> -rte_memcpy_aligned(void *dst, const void *src, size_t n)
> +rte_memcpy_aligned_more_than_64(void *__rte_restrict dst, const void *__rte_restrict src,
> + size_t n)
> {
> void *ret = dst;
>
> - /* Copy size < 16 bytes */
> - if (n < 16) {
> - return rte_mov15_or_less(dst, src, n);
> - }
> -
> - /* Copy 16 <= size <= 32 bytes */
> - if (__rte_constant(n) && n == 32) {
> - rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> - return ret;
> - }
> - if (n <= 32) {
> - rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> - if (__rte_constant(n) && n == 16)
> - return ret; /* avoid (harmless) duplicate copy */
> - rte_mov16((uint8_t *)dst - 16 + n,
> - (const uint8_t *)src - 16 + n);
> -
> - return ret;
> - }
> -
> - /* Copy 32 < size <= 64 bytes */
> - if (__rte_constant(n) && n == 64) {
> - rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> - return ret;
> - }
> - if (n <= 64) {
> - rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> - rte_mov32((uint8_t *)dst - 32 + n,
> - (const uint8_t *)src - 32 + n);
> -
> - return ret;
> - }
> -
> /* Copy 64 bytes blocks */
> for (; n > 64; n -= 64) {
Same verification question applies to rte_memcpy_aligned_more_than_64(). The
function comment states "Use with n > 64" but does the code verify this
precondition?
[ ... ]
> static __rte_always_inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *__rte_restrict dst, const void *__rte_restrict src, size_t n)
> {
> + /* Fast way when copy size doesn't exceed 64 bytes. */
> + if (n < 16)
> + return rte_mov15_or_less(dst, src, n);
> + if (n <= 32) {
> + if (__rte_constant(n) && n == 32) {
> + rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> + return dst;
> + }
> + rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> + if (__rte_constant(n) && n == 16)
> + return dst; /* avoid (harmless) duplicate copy */
> + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> + return dst;
> + }
> + if (n <= 64) {
> + if (__rte_constant(n) && n == 64) {
> + rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> + return dst;
> + }
> +#if defined RTE_MEMCPY_AVX
> + rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> + rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> +#else /* SSE implementation */
> + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> + if (n > 48)
> + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +#endif
> + return dst;
> + }
> +
> + /* Implementation for size > 64 bytes depends on alignment with vector register size. */
Does this code handle the case when n == 64 correctly? Looking at the new
code structure, when n == 64 and __rte_constant(n) is false
More information about the test-report
mailing list