[PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used
Mattias Rönnblom
mattias.ronnblom at ericsson.com
Fri Sep 20 12:27:16 CEST 2024
In builds where use_cc_memcpy is set to true, the vhost user PMD
suffers a large performance drop on Intel P-cores for small packets,
at least when built with GCC and (to a much lesser extent) clang.

This patch addresses that issue by introducing a custom
memcpy()-based packet copying routine in virtio_net.c.
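
The routine copies the bulk of each small packet in fixed 32-byte
chunks: a constant-size memcpy() lets the compiler expand the copy
inline into plain vector loads and stores, whereas a variable-size
memcpy() typically compiles to a call into libc. A minimal standalone
sketch of that effect (illustration only, not part of the patch):

    #include <string.h>

    /* With optimization enabled on x86_64, the compiler typically
     * expands this constant-size memcpy() inline into a few vector
     * loads and stores; no libc call, no runtime size dispatch.
     */
    void
    copy32(void *restrict dst, const void *restrict src)
    {
            memcpy(dst, src, 32);
    }

    /* A variable-size memcpy(), by contrast, is normally emitted as
     * a call to the generic libc memcpy(), which must branch on the
     * size at run time.
     */
    void
    copy_var(void *restrict dst, const void *restrict src, size_t n)
    {
            memcpy(dst, src, n);
    }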
Performance results from a Raptor Lake @ 3.2 GHz:

GCC 12.3.0, 64-byte packets:

Core  Mode              Mpps
E     RTE memcpy         9.5
E     cc memcpy          9.7
E     cc memcpy+pktcpy   9.0
P     RTE memcpy        16.4
P     cc memcpy         13.5
P     cc memcpy+pktcpy  16.2

GCC 12.3.0, 1500-byte packets:

Core  Mode              Mpps
P     RTE memcpy         5.8
P     cc memcpy          5.9
P     cc memcpy+pktcpy   5.9

clang 15.0.7, 64-byte packets:

Core  Mode              Mpps
P     RTE memcpy        13.3
P     cc memcpy         12.9
P     cc memcpy+pktcpy  13.9

"RTE memcpy" denotes a build with use_cc_memcpy=false, "cc memcpy" a
build with use_cc_memcpy=true, and "cc memcpy+pktcpy" a
use_cc_memcpy=true build with this patch applied.
Signed-off-by: Mattias Rönnblom <mattias.ronnblom at ericsson.com>
---
lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
1 file changed, 35 insertions(+), 2 deletions(-)
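
A note for reviewers, not part of the patch: __builtin_assume_aligned()
is a GCC/clang built-in that returns its first argument and lets the
optimizer assume the stated alignment, which is what may allow the
fixed-size copies in pktcpy() below to use aligned vector
instructions. In isolation:

    #include <string.h>

    /* The built-in returns the same pointer it is given; the
     * optimizer may then assume 16-byte alignment and emit aligned
     * 16-byte loads/stores for the constant-size copy.
     */
    void
    copy32_aligned(void *restrict in_dst, const void *restrict in_src)
    {
            void *dst = __builtin_assume_aligned(in_dst, 16);
            const void *src = __builtin_assume_aligned(in_src, 16);

            memcpy(dst, src, 32);
    }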
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 370402d849..63571587a8 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -231,6 +231,39 @@ vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t
 	return nr_copies;
 }
 
+/* The code generated by GCC (and, to a lesser extent, clang) for a
+ * plain memcpy() packet copy is less than optimal on Intel P-cores
+ * for small packets; hence the need for this specialized memcpy()
+ * in builds where use_cc_memcpy is set to true.
+ */
+#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
+static __rte_always_inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{
+	void *dst = __builtin_assume_aligned(in_dst, 16);
+	const void *src = __builtin_assume_aligned(in_src, 16);
+
+	if (len <= 256) {
+		size_t left;
+
+		for (left = len; left >= 32; left -= 32) {
+			memcpy(dst, src, 32);
+			dst = RTE_PTR_ADD(dst, 32);
+			src = RTE_PTR_ADD(src, 32);
+		}
+
+		memcpy(dst, src, left);
+	} else
+		memcpy(dst, src, len);
+}
+#else
+static __rte_always_inline void
+pktcpy(void *dst, const void *src, size_t len)
+{
+	rte_memcpy(dst, src, len);
+}
+#endif
+
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	__rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++) {
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 				elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++)
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 
 	vq->batch_copy_nb_elems = 0;
 }
--
2.43.0