[dpdk-dev] [PATCH v9 5/9] net/virtio: add vectorized packed ring Rx path

Maxime Coquelin maxime.coquelin at redhat.com
Fri Apr 24 13:51:45 CEST 2020



On 4/24/20 11:24 AM, Marvin Liu wrote:
> Optimize packed ring Rx path with SIMD instructions. Solution of
> optimization is pretty like vhost, is that split path into batch and
> single functions. Batch function is further optimized by AVX512
> instructions. Also pad desc extra structure to 16 bytes aligned, thus
> four elements will be saved in one batch.
> 
> Signed-off-by: Marvin Liu <yong.liu at intel.com>
> 
> diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
> index c9edb84ee..102b1deab 100644
> --- a/drivers/net/virtio/Makefile
> +++ b/drivers/net/virtio/Makefile
> @@ -36,6 +36,41 @@ else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
>  endif
>  
> +ifneq ($(FORCE_DISABLE_AVX512), y)
> +	CC_AVX512_SUPPORT=\
> +	$(shell $(CC) -march=native -dM -E - </dev/null 2>&1 | \
> +	sed '/./{H;$$!d} ; x ; /AVX512F/!d; /AVX512BW/!d; /AVX512VL/!d' | \
> +	grep -q AVX512 && echo 1)
> +endif
> +
> +ifeq ($(CC_AVX512_SUPPORT), 1)
> +CFLAGS += -DCC_AVX512_SUPPORT
> +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_packed_avx.c
> +
> +ifeq ($(RTE_TOOLCHAIN), gcc)
> +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
> +CFLAGS += -DVIRTIO_GCC_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), clang)
> +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1)
> +CFLAGS += -DVIRTIO_CLANG_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), icc)
> +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1)
> +CFLAGS += -DVIRTIO_ICC_UNROLL_PRAGMA
> +endif
> +endif
> +
> +CFLAGS_virtio_rxtx_packed_avx.o += -mavx512f -mavx512bw -mavx512vl
> +ifeq ($(shell test $(GCC_VERSION) -ge 100 && echo 1), 1)
> +CFLAGS_virtio_rxtx_packed_avx.o += -Wno-zero-length-bounds
> +endif
> +endif
> +
>  ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
> diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
> index 15150eea1..8e68c3039 100644
> --- a/drivers/net/virtio/meson.build
> +++ b/drivers/net/virtio/meson.build
> @@ -9,6 +9,20 @@ sources += files('virtio_ethdev.c',
>  deps += ['kvargs', 'bus_pci']
>  
>  if arch_subdir == 'x86'
> +	if '-mno-avx512f' not in machine_args
> +		if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
> +			cflags += ['-mavx512f', '-mavx512bw', '-mavx512vl']
> +			cflags += ['-DCC_AVX512_SUPPORT']
> +			if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
> +				cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
> +			elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
> +				cflags += '-DVHOST_CLANG_UNROLL_PRAGMA'
> +			elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0'))
> +				cflags += '-DVHOST_ICC_UNROLL_PRAGMA'
> +			endif
> +			sources += files('virtio_rxtx_packed_avx.c')
> +		endif
> +	endif
>  	sources += files('virtio_rxtx_simple_sse.c')
>  elif arch_subdir == 'ppc'
>  	sources += files('virtio_rxtx_simple_altivec.c')
> diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> index febaf17a8..5c112cac7 100644
> --- a/drivers/net/virtio/virtio_ethdev.h
> +++ b/drivers/net/virtio/virtio_ethdev.h
> @@ -105,6 +105,9 @@ uint16_t virtio_xmit_pkts_inorder(void *tx_queue, struct rte_mbuf **tx_pkts,
>  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		uint16_t nb_pkts);
>  
> +uint16_t virtio_recv_pkts_packed_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +		uint16_t nb_pkts);
> +
>  int eth_virtio_dev_init(struct rte_eth_dev *eth_dev);
>  
>  void virtio_interrupt_handler(void *param);
> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index 84f4cf946..c9b6e7844 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -2329,3 +2329,11 @@ virtio_xmit_pkts_inorder(void *tx_queue,
>  
>  	return nb_tx;
>  }
> +
> +__rte_weak uint16_t
> +virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
> +			    struct rte_mbuf **rx_pkts __rte_unused,
> +			    uint16_t nb_pkts __rte_unused)
> +{
> +	return 0;
> +}
> diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> new file mode 100644
> index 000000000..8a7b459eb
> --- /dev/null
> +++ b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> @@ -0,0 +1,374 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <rte_net.h>
> +
> +#include "virtio_logs.h"
> +#include "virtio_ethdev.h"
> +#include "virtio_pci.h"
> +#include "virtqueue.h"
> +
> +#define BYTE_SIZE 8
> +/* flag bits offset in packed ring desc higher 64bits */
> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> +
> +#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
> +	FLAGS_BITS_OFFSET)
> +
> +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
> +	sizeof(struct vring_packed_desc))
> +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
> +
> +#ifdef VIRTIO_GCC_UNROLL_PRAGMA
> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
> +	for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
> +	for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifdef VIRTIO_ICC_UNROLL_PRAGMA
> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
> +	for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifndef virtio_for_each_try_unroll
> +#define virtio_for_each_try_unroll(iter, val, num) \
> +	for (iter = val; iter < num; iter++)
> +#endif
> +
> +static inline void
> +virtio_update_batch_stats(struct virtnet_stats *stats,
> +			  uint16_t pkt_len1,
> +			  uint16_t pkt_len2,
> +			  uint16_t pkt_len3,
> +			  uint16_t pkt_len4)
> +{
> +	stats->bytes += pkt_len1;
> +	stats->bytes += pkt_len2;
> +	stats->bytes += pkt_len3;
> +	stats->bytes += pkt_len4;
> +}
> +
> +/* Optionally fill offload information in structure */
> +static inline int
> +virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
> +{
> +	struct rte_net_hdr_lens hdr_lens;
> +	uint32_t hdrlen, ptype;
> +	int l4_supported = 0;
> +
> +	/* nothing to do */
> +	if (hdr->flags == 0)
> +		return 0;

IIUC, the only difference from the non-vectorized version is that GSO
support is removed here.
Since gso_type is in the same cacheline as flags in virtio_net_hdr, I
don't think the performance gain from skipping that check is worth the
added maintenance effort due to code duplication.

Please prove me wrong; otherwise, please move virtio_rx_offload() into a
header and use it here. Alternatively, if it really impacts performance,
put all the shared code in a dedicated function that can be re-used
by both implementations.

> +
> +	/* GSO not support in vec path, skip check */
> +	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
> +
> +	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
> +	m->packet_type = ptype;
> +	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
> +	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
> +	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
> +		l4_supported = 1;
> +
> +	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> +		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
> +		if (hdr->csum_start <= hdrlen && l4_supported) {
> +			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
> +		} else {
> +			/* Unknown proto or tunnel, do sw cksum. We can assume
> +			 * the cksum field is in the first segment since the
> +			 * buffers we provided to the host are large enough.
> +			 * In case of SCTP, this will be wrong since it's a CRC
> +			 * but there's nothing we can do.
> +			 */
> +			uint16_t csum = 0, off;
> +
> +			rte_raw_cksum_mbuf(m, hdr->csum_start,
> +				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
> +				&csum);
> +			if (likely(csum != 0xffff))
> +				csum = ~csum;
> +			off = hdr->csum_offset + hdr->csum_start;
> +			if (rte_pktmbuf_data_len(m) >= off + 1)
> +				*rte_pktmbuf_mtod_offset(m, uint16_t *,
> +					off) = csum;
> +		}
> +	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
> +		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
> +	}
> +
> +	return 0;
> +}

Otherwise, the patch looks okay to me.

Thanks,
Maxime



More information about the dev mailing list