[PATCH v6 7/7] vhost: optimize memcpy routines when cc memcpy is used

Morten Brørup mb at smartsharesystems.com
Wed Oct 9 23:25:25 CEST 2024


> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{

A comment describing why batch_copy_elem.dst and src are guaranteed to point to 16-byte-aligned data would be nice.
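
Something along these lines, for example (the wording is mine, and
whether the 16-byte alignment actually holds for every producer of
batch_copy_elem is for you to confirm):

	/*
	 * batch_copy_elem.dst and .src are assumed to point into buffers
	 * that are at least 16-byte aligned, so the compiler may use
	 * aligned vector loads/stores for the constant-size memcpy()
	 * calls below.
	 */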

> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);
> +
> +	if (len <= 256) {
> +		size_t left;
> +
> +		for (left = len; left >= 32; left -= 32) {
> +			memcpy(dst, src, 32);
> +			dst = RTE_PTR_ADD(dst, 32);
> +			src = RTE_PTR_ADD(src, 32);
> +		}
> +
> +		memcpy(dst, src, left);
> +	} else
> +		memcpy(dst, src, len);
> +}
> +#else
> +static __rte_always_inline void
> +pktcpy(void *dst, const void *src, size_t len)
> +{
> +	rte_memcpy(dst, src, len);
> +}
> +#endif
> +
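
As a side note: to sanity-check the tail and length-threshold handling,
a quick standalone test along these lines can be used (all names and
buffer sizes below are mine; pktcpy() is copied from the patch with
__rte_always_inline and RTE_PTR_ADD replaced by plain C, and
__builtin_assume_aligned assumes GCC/clang). Filling dst and ref with a
sentinel before each copy also catches writes past len:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Local copy of the patch's pktcpy(), with the DPDK-specific macros
 * replaced by plain C so the test compiles standalone. */
static inline void
pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
{
	void *dst = __builtin_assume_aligned(in_dst, 16);
	const void *src = __builtin_assume_aligned(in_src, 16);

	if (len <= 256) {
		size_t left;

		for (left = len; left >= 32; left -= 32) {
			memcpy(dst, src, 32); /* constant size -> inlined */
			dst = (uint8_t *)dst + 32;
			src = (const uint8_t *)src + 32;
		}

		memcpy(dst, src, left); /* 0..31 byte tail */
	} else
		memcpy(dst, src, len);
}

int
main(void)
{
	/* 16-byte aligned buffers, matching pktcpy()'s assumption. */
	static uint8_t src[512] __attribute__((aligned(16)));
	static uint8_t dst[512] __attribute__((aligned(16)));
	static uint8_t ref[512] __attribute__((aligned(16)));
	size_t len, i;

	for (i = 0; i < sizeof(src); i++)
		src[i] = (uint8_t)i;

	/* Cover the empty tail, the chunked loop, and the > 256 path. */
	for (len = 0; len <= 300; len++) {
		memset(dst, 0xaa, sizeof(dst));
		memset(ref, 0xaa, sizeof(ref));
		pktcpy(dst, src, len);
		memcpy(ref, src, len);
		/* Comparing the whole buffers verifies both the copied
		 * region and that nothing beyond len was touched. */
		assert(memcmp(dst, ref, sizeof(dst)) == 0);
	}
	return 0;
}
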
>  static inline void
>  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  	__rte_shared_locks_required(&vq->iotlb_lock)
> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++) {
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>  		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>  					   elem[i].len);
>  		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++)
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> 
>  	vq->batch_copy_nb_elems = 0;
>  }
> --
> 2.43.0

Anyway,
Acked-by: Morten Brørup <mb at smartsharesystems.com>


