<span style="font-family:SimSun;">On RISC-V with gcc 14.2 and glibc 2.39, the test data for copies smaller than 64 bytes is as follows. </span><br>
<span style="font-family:SimSun;">The impact appears to be quite significant.</span><br>
<br>
<span style="font-family:SimSun;">================================= 16B aligned =================================</span><br>
<span style="font-family:SimSun;"> 1 0 - 1(-80.05%) 1 - 1(-10.75%) 2 - 11(-85.80%) 2 - 11(-79.70%) </span><br>
<span style="font-family:SimSun;"> 2 0 - 1(-74.77%) 2 - 8(-68.29%) 3 - 12(-74.64%) 3 - 12(-74.01%) </span><br>
<span style="font-family:SimSun;"> 3 0 - 1(-78.86%) 2 - 8(-69.46%) 3 - 12(-75.36%) 3 - 13(-76.22%) </span><br>
<span style="font-family:SimSun;"> 4 0 - 1(-86.02%) 2 - 8(-74.27%) 2 - 12(-79.61%) 3 - 13(-79.16%) </span><br>
<span style="font-family:SimSun;"> 5 0 - 1(-86.29%) 2 - 8(-74.46%) 2 - 12(-79.00%) 3 - 13(-79.73%) </span><br>
<span style="font-family:SimSun;"> 6 0 - 1(-86.22%) 2 - 8(-73.82%) 2 - 12(-79.18%) 3 - 13(-79.23%) </span><br>
<span style="font-family:SimSun;"> 7 0 - 1(-89.68%) 2 - 8(-73.81%) 2 - 12(-79.54%) 3 - 13(-78.50%) </span><br>
<span style="font-family:SimSun;"> 8 0 - 1(-90.05%) 2 - 8(-75.88%) 3 - 12(-75.96%) 3 - 13(-77.93%) </span><br>
<span style="font-family:SimSun;"> 9 0 - 1(-89.85%) 2 - 8(-76.17%) 3 - 12(-76.74%) 3 - 13(-77.49%) </span><br>
<span style="font-family:SimSun;"> 12 0 - 1(-91.32%) 2 - 8(-76.92%) 3 - 12(-75.98%) 3 - 13(-77.69%) </span><br>
<span style="font-family:SimSun;"> 15 0 - 1(-91.46%) 2 - 8(-77.27%) 3 - 12(-76.36%) 3 - 13(-78.41%) </span><br>
<span style="font-family:SimSun;"> 16 0 - 1(-89.70%) 2 - 8(-74.81%) 3 - 12(-75.35%) 3 - 12(-77.52%) </span><br>
<span style="font-family:SimSun;"> 17 0 - 1(-81.57%) 3 - 8(-60.92%) 4 - 12(-66.96%) 5 - 13(-64.20%) </span><br>
<span style="font-family:SimSun;"> 31 0 - 1(-87.58%) 3 - 8(-62.66%) 4 - 12(-68.48%) 5 - 13(-65.12%) </span><br>
<span style="font-family:SimSun;"> 32 0 - 1(-84.06%) 3 - 8(-67.48%) 4 - 12(-68.33%) 4 - 13(-65.48%) </span><br>
<span style="font-family:SimSun;"> 33 0 - 1(-74.64%) 4 - 8(-50.45%) 6 - 12(-51.16%) 7 - 13(-45.94%) </span><br>
<span style="font-family:SimSun;"> 63 0 - 1(-79.33%) 5 - 9(-47.70%) 6 - 13(-49.47%) 9 - 13(-32.40%) </span><br>
<span style="font-family:SimSun;"><br>
</span><br>
<span style="font-family:SimSun;">================================== Unaligned ==================================</span><br>
<span style="font-family:SimSun;"> 1 0 - 1(-80.49%) 1 - 1(-15.31%) 2 - 11(-85.77%) 2 - 12(-80.65%) </span><br>
<span style="font-family:SimSun;"> 2 0 - 1(-78.18%) 2 - 8(-72.49%) 3 - 12(-75.34%) 3 - 12(-74.68%) </span><br>
<span style="font-family:SimSun;"> 3 0 - 1(-79.49%) 2 - 8(-73.40%) 3 - 12(-75.05%) 3 - 14(-76.72%) </span><br>
<span style="font-family:SimSun;"> 4 0 - 1(-86.27%) 2 - 8(-74.48%) 2 - 12(-79.56%) 3 - 13(-79.13%) </span><br>
<span style="font-family:SimSun;"> 5 0 - 1(-86.59%) 2 - 8(-74.54%) 2 - 12(-79.04%) 3 - 12(-77.99%) </span><br>
<span style="font-family:SimSun;"> 6 0 - 1(-87.06%) 2 - 8(-74.01%) 2 - 12(-79.09%) 3 - 12(-78.04%) </span><br>
<span style="font-family:SimSun;"> 7 0 - 1(-90.86%) 2 - 8(-74.09%) 2 - 12(-79.86%) 3 - 12(-78.32%) </span><br>
<span style="font-family:SimSun;"> 8 0 - 1(-89.78%) 2 - 8(-77.01%) 3 - 12(-76.51%) 3 - 14(-79.29%) </span><br>
<span style="font-family:SimSun;"> 9 0 - 1(-89.19%) 2 - 8(-75.99%) 3 - 12(-76.25%) 3 - 14(-79.28%) </span><br>
<span style="font-family:SimSun;"> 12 0 - 1(-89.11%) 2 - 8(-74.25%) 3 - 12(-74.19%) 3 - 14(-77.68%) </span><br>
<span style="font-family:SimSun;"> 15 0 - 1(-90.02%) 2 - 8(-75.39%) 3 - 12(-74.73%) 3 - 15(-78.67%) </span><br>
<span style="font-family:SimSun;"> 16 0 - 1(-80.96%) 2 - 8(-74.49%) 3 - 14(-78.17%) 3 - 14(-77.11%) </span><br>
<span style="font-family:SimSun;"> 17 0 - 1(-66.29%) 3 - 10(-66.85%) 4 - 15(-72.42%) 6 - 14(-61.18%) </span><br>
<span style="font-family:SimSun;"> 31 0 - 1(-86.84%) 3 - 9(-63.75%) 4 - 13(-65.88%) 5 - 15(-64.67%) </span><br>
<span style="font-family:SimSun;"> 32 0 - 1(-87.37%) 3 - 8(-61.34%) 4 - 12(-65.09%) 6 - 15(-64.04%) </span><br>
<span style="font-family:SimSun;"> 33 0 - 2(-84.23%) 5 - 10(-46.98%) 6 - 14(-57.24%) 8 - 16(-53.88%) </span><br>
<span style="font-family:SimSun;"> 63 0 - 2(-81.59%) 5 - 11(-52.12%) 7 - 16(-54.06%) 10 - 20(-50.01%) </span><br>
<div style="font-family:SimSun;white-space:nowrap;">
<br>
</div>
<br>
<br>
<br>
<blockquote name="replyContent" class="ReferenceQuote" style="font-family:SimSun;padding-left:5px;margin-left:5px;border-left:2px solid #B6B6B6;margin-right:0px;">
-----原始邮件-----<br>
<b>发件人:</b><span id="rc_from">"Stephen Hemminger" &lt;stephen@networkplumber.org&gt;</span><br>
<b>发送时间:</b><span id="rc_senttime">2025-10-09 16:17:02 (星期四)</span><br>
<b>收件人:</b> "Sun Yuechi" &lt;sunyuechi@iscas.ac.cn&gt;<br>
<b>抄送:</b> dev &lt;dev@dpdk.org&gt;, "Stanisław Kardach" &lt;stanislaw.kardach@gmail.com&gt;, "Bruce Richardson" &lt;bruce.richardson@intel.com&gt;<br>
<b>主题:</b> Re: [PATCH] eal/riscv: optimize memcpy for small copies under 64 bytes<br>
<br>
<div dir="auto">
How does this compare to glibc/gcc memcpy? I would like to see rte_memcpy go away
</div>
<br>
<div class="gmail_quote gmail_quote_container">
<div dir="ltr" class="gmail_attr">
On Thu, Oct 9, 2025, 08:32 Sun Yuechi &lt;<a href="mailto:sunyuechi@iscas.ac.cn">sunyuechi@iscas.ac.cn</a>&gt; wrote:<br>
</div>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid #CCCCCC;padding-left:1ex;">
Improve rte_memcpy implementation on RISC-V platform for sizes under<br>
64 bytes, based on the ARM implementation.<br>
<br>
Enhanced handling for cases smaller than 64 bytes shows very significant<br>
performance benefits, while the impact is minimal after 64 bytes.<br>
<br>
This optimization is disabled by default as a conservative measure,<br>
since future glibc versions may include similar improvements that<br>
could conflict with this implementation.<br>
<br>
Use RTE_ARCH_RISCV_MEMCPY to enable this optimization.<br>
<br>
Signed-off-by: Sun Yuechi &lt;<a href="mailto:sunyuechi@iscas.ac.cn" target="_blank" rel="noreferrer">sunyuechi@iscas.ac.cn</a>&gt;<br>
---<br>
config/riscv/meson.build | 5 ++<br>
lib/eal/riscv/include/rte_memcpy.h | 122 +++++++++++++++++++++++++++++<br>
2 files changed, 127 insertions(+)<br>
<br>
diff --git a/config/riscv/meson.build b/config/riscv/meson.build<br>
index f93ea3e145..73fd0ab4da 100644<br>
--- a/config/riscv/meson.build<br>
+++ b/config/riscv/meson.build<br>
@@ -20,6 +20,11 @@ dpdk_conf.set('RTE_FORCE_INTRINSICS', 1)<br>
<br>
# common flags to all riscv builds, with lowest priority<br>
flags_common = [<br>
+ # Accelerate rte_memcpy for copies smaller than 64 bytes. Be sure to run<br>
+ # the unit test (memcpy_perf_autotest) to verify performance improvements.<br>
+ # Refer to notes in source file (lib/eal/riscv/include/rte_memcpy.h) for<br>
+ # more details.<br>
+ ['RTE_ARCH_RISCV_MEMCPY', false],<br>
['RTE_ARCH_RISCV', true],<br>
['RTE_CACHE_LINE_SIZE', 64],<br>
# Manually set wall time clock frequency for the target. If 0, then it is<br>
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h<br>
index d8a942c5d2..ae6e79e2fc 100644<br>
--- a/lib/eal/riscv/include/rte_memcpy.h<br>
+++ b/lib/eal/riscv/include/rte_memcpy.h<br>
@@ -2,6 +2,7 @@<br>
* Copyright(c) 2022 StarFive<br>
* Copyright(c) 2022 SiFive<br>
* Copyright(c) 2022 Semihalf<br>
+ * Copyright(c) 2025 ISCAS<br>
*/<br>
<br>
#ifndef RTE_MEMCPY_RISCV_H<br>
@@ -14,6 +15,125 @@<br>
<br>
#include "generic/rte_memcpy.h"<br>
<br>
+#ifdef RTE_ARCH_RISCV_MEMCPY<br>
+<br>
+#ifdef __cplusplus<br>
+extern "C" {<br>
+#endif<br>
+<br>
+/*<br>
+ * This implementation is improved from eal/arm/include/rte_memcpy_64.h,<br>
+ * targeting only cases of < 64 bytes.<br>
+ * Currently shows significant performance improvement over various glibc versions,<br>
+ * but is disabled by default due to uncertainty about potential performance<br>
+ * degradation in future versions.<br>
+ * You can use memcpy_perf_autotest to test the performance.<br>
+ */<br>
+<br>
+static __rte_always_inline<br>
+void rte_mov16(uint8_t *dst, const uint8_t *src)<br>
+{<br>
+ __uint128_t *dst128 = (__uint128_t *)dst;<br>
+ const __uint128_t *src128 = (const __uint128_t *)src;<br>
+ *dst128 = *src128;<br>
+}<br>
+<br>
+static __rte_always_inline<br>
+void rte_mov32(uint8_t *dst, const uint8_t *src)<br>
+{<br>
+ __uint128_t *dst128 = (__uint128_t *)dst;<br>
+ const __uint128_t *src128 = (const __uint128_t *)src;<br>
+ const __uint128_t x0 = src128[0], x1 = src128[1];<br>
+ dst128[0] = x0;<br>
+ dst128[1] = x1;<br>
+}<br>
+<br>
+static __rte_always_inline<br>
+void rte_mov48(uint8_t *dst, const uint8_t *src)<br>
+{<br>
+ __uint128_t *dst128 = (__uint128_t *)dst;<br>
+ const __uint128_t *src128 = (const __uint128_t *)src;<br>
+ const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];<br>
+ dst128[0] = x0;<br>
+ dst128[1] = x1;<br>
+ dst128[2] = x2;<br>
+}<br>
+<br>
+static __rte_always_inline void<br>
+rte_mov64(uint8_t *dst, const uint8_t *src)<br>
+{<br>
+ memcpy(dst, src, 64);<br>
+}<br>
+<br>
+static __rte_always_inline void<br>
+rte_mov128(uint8_t *dst, const uint8_t *src)<br>
+{<br>
+ memcpy(dst, src, 128);<br>
+}<br>
+<br>
+static __rte_always_inline void<br>
+rte_mov256(uint8_t *dst, const uint8_t *src)<br>
+{<br>
+ memcpy(dst, src, 256);<br>
+}<br>
+<br>
+static __rte_always_inline void<br>
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)<br>
+{<br>
+ if (n & 0x08) {<br>
+ /* copy 8 ~ 15 bytes */<br>
+ *(uint64_t *)dst = *(const uint64_t *)src;<br>
+ *(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);<br>
+ } else if (n & 0x04) {<br>
+ /* copy 4 ~ 7 bytes */<br>
+ *(uint32_t *)dst = *(const uint32_t *)src;<br>
+ *(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);<br>
+ } else if (n & 0x02) {<br>
+ /* copy 2 ~ 3 bytes */<br>
+ *(uint16_t *)dst = *(const uint16_t *)src;<br>
+ *(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);<br>
+ } else if (n & 0x01) {<br>
+ /* copy 1 byte */<br>
+ *dst = *src;<br>
+ }<br>
+}<br>
+<br>
+static __rte_always_inline void<br>
+rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)<br>
+{<br>
+ if (n == 16) {<br>
+ rte_mov16(dst, src);<br>
+ } else if (n <= 32) {<br>
+ rte_mov16(dst, src);<br>
+ rte_mov16(dst - 16 + n, src - 16 + n);<br>
+ } else if (n <= 48) {<br>
+ rte_mov32(dst, src);<br>
+ rte_mov16(dst - 16 + n, src - 16 + n);<br>
+ } else {<br>
+ rte_mov48(dst, src);<br>
+ rte_mov16(dst - 16 + n, src - 16 + n);<br>
+ }<br>
+}<br>
+<br>
+static __rte_always_inline void *<br>
+rte_memcpy(void *dst, const void *src, size_t n)<br>
+{<br>
+ if (n >= 64)<br>
+ return memcpy(dst, src, n);<br>
+ if (n < 16) {<br>
+ rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);<br>
+ return dst;<br>
+ }<br>
+ rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);<br>
+ return dst;<br>
+}<br>
+<br>
+#ifdef __cplusplus<br>
+}<br>
+#endif<br>
+<br>
+#else /* RTE_ARCH_RISCV_MEMCPY */<br>
+<br>
#ifdef __cplusplus<br>
extern "C" {<br>
#endif<br>
@@ -60,4 +180,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src)<br>
}<br>
#endif<br>
<br>
+#endif /* RTE_ARCH_RISCV_MEMCPY */<br>
+<br>
#endif /* RTE_MEMCPY_RISCV_H */<br>
-- <br>
2.51.0<br>
<br>
</blockquote>
</div>
</blockquote>