[PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used

Mattias Rönnblom mattias.ronnblom at ericsson.com
Fri Sep 20 12:27:16 CEST 2024


In builds where use_cc_memcpy is set to true, the vhost user PMD
suffers a large performance drop for small packets on Intel P-cores,
at least when built with GCC and (to a much lesser extent) clang.

This patch addresses that issue by using a custom memcpy()-based
packet copying routine in the vhost virtio code.

Performance results from a Raptor Lake @ 3.2 GHz:

GCC 12.3.0
64-byte packets
Core  Mode              Mpps
E     RTE memcpy        9.5
E     cc memcpy         9.7
E     cc memcpy+pktcpy  9.0

P     RTE memcpy        16.4
P     cc memcpy         13.5
P     cc memcpy+pktcpy  16.2

GCC 12.3.0
1500-byte packets
Core  Mode              Mpps
P     RTE memcpy        5.8
P     cc memcpy         5.9
P     cc memcpy+pktcpy  5.9

clang 15.0.7
64-byte packets
Core  Mode              Mpps
P     RTE memcpy        13.3
P     cc memcpy         12.9
P     cc memcpy+pktcpy  13.9

"RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true
and "pktcpy" is when this patch is applied.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom at ericsson.com>
---
 lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 370402d849..63571587a8 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -231,6 +231,39 @@ vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t
 	return nr_copies;
 }
 
+/* The code generated by GCC (and to a lesser extent, clang) with just
+ * a straight memcpy() to copy packets is less than optimal on Intel
+ * P-cores, for small packets. Hence the need for this specialized
+ * memcpy() in builds where use_cc_memcpy is set to true.
+ */
+#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
+static __rte_always_inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{
+	void *dst = __builtin_assume_aligned(in_dst, 16);
+	const void *src = __builtin_assume_aligned(in_src, 16);
+
+	if (len <= 256) {
+		size_t left;
+
+		for (left = len; left >= 32; left -= 32) {
+			memcpy(dst, src, 32);
+			dst = RTE_PTR_ADD(dst, 32);
+			src = RTE_PTR_ADD(src, 32);
+		}
+
+		memcpy(dst, src, left);
+	} else
+		memcpy(dst, src, len);
+}
+#else
+static __rte_always_inline void
+pktcpy(void *dst, const void *src, size_t len)
+{
+	rte_memcpy(dst, src, len);
+}
+#endif
+
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	__rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++) {
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 					   elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++)
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 
 	vq->batch_copy_nb_elems = 0;
 }
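Not part of the patch, but for context: a minimal standalone sketch of
the same technique (illustrative names; not the actual vhost code).
Copying in fixed 32-byte chunks lets the compiler lower each memcpy()
call into inline vector loads and stores rather than a call into libc,
which is where the small-packet gain comes from:

#include <stddef.h>
#include <string.h>

/* Copy 'len' bytes in fixed 32-byte chunks. Each fixed-size memcpy()
 * can be lowered by the compiler to inline (e.g., 32-byte AVX) loads
 * and stores; only the sub-32-byte tail is a variable-size copy.
 */
static inline void
chunked_copy(void *restrict dst, const void *restrict src, size_t len)
{
	size_t left;

	for (left = len; left >= 32; left -= 32) {
		memcpy(dst, src, 32);
		dst = (char *)dst + 32;
		src = (const char *)src + 32;
	}

	memcpy(dst, src, left);
}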
-- 
2.43.0