[PATCH v3] eal/x86: optimize memcpy of small sizes

Konstantin Ananyev konstantin.ananyev at huawei.com
Fri Nov 28 15:02:35 CET 2025


> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with n <= 16.
> + *
> + * Note: Copying uninitialized memory is perfectly acceptable.
> + * Using e.g. memcpy(dst, src, 8) instead of
> + * *(unaligned_uint64_t *)dst = *(const unaligned_uint64_t *)src
> + * avoids compiler warnings that the source data may be uninitialized
> + * [-Wmaybe-uninitialized].
> + *
> + * Note: Using "n & X" generates 3-byte "test" instructions,
> + * instead of "n >= X", which would generate 4-byte "cmp" instructions.
> + */
> +static __rte_always_inline void *
> +rte_mov16_or_less(void *dst, const void *src, size_t n)
> +{
> +	/* Faster way when size is known at build time. */
> +	if (__rte_constant(n)) {
> +		if (n == 2)
> +			return memcpy(dst, src, 2);
> +		if (n == 4)
> +			return memcpy(dst, src, 4);
> +		if (n == 6) /* 4 + 2 */
> +			return memcpy(dst, src, 6);
> +		if (n == 8)
> +			return memcpy(dst, src, 8);
> +		if (n == 10) /* 8 + 2 */
> +			return memcpy(dst, src, 10);
> +		if (n == 12) /* 8 + 4 */
> +			return memcpy(dst, src, 12);
> +		if (n == 16) {
> +			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +			return dst;
> +		}
> +	}
> +
> +	if (n & 0x18) { /* n >= 8 */

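Probably 'n & 0x8'? (Note, though, that a run-time n == 16 has only bit 0x10 set, so with a 0x8 mask it would not take this branch.)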
 
> +		/* copy 8 ~ 16 bytes */
> +		memcpy(dst, src, 8);
> +		memcpy((uint8_t *)dst - 8 + n, (const uint8_t *)src - 8 + n, 8);
> +	} else if (n & 0x4) {
> +		/* copy 4 ~ 7 bytes */
> +		memcpy(dst, src, 4);
> +		memcpy((uint8_t *)dst - 4 + n, (const uint8_t *)src - 4 + n, 4);
> +	} else if (n & 0x2) {
> +		/* copy 2 ~ 3 bytes */
> +		memcpy(dst, src, 2);
> +		memcpy((uint8_t *)dst - 2 + n, (const uint8_t *)src - 2 + n, 2);
> +	} else if (n & 0x1) {
> +		/* copy 1 byte */
> +		memcpy(dst, src, 1);
> +	}
> +	return dst;
> +}


More information about the dev mailing list